In [1]:
import numpy as np
import scipy 
import scipy.stats
from scipy import stats
import seaborn as sns
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import itertools
from itertools import groupby
import pickle
import os
import math
from sympy import S, symbols
from collections import Counter

In [2]:
# A host of Scikit-learn models
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve, train_test_split
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, accuracy_score, precision_score, average_precision_score, \
classification_report, recall_score, confusion_matrix, f1_score
from sklearn.model_selection import KFold 
from sklearn.base import clone

from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [3]:
def read_from_pickle(directory, filename):
    df = pd.DataFrame()
    filepath = directory + filename
    with open(filepath, 'rb') as file:
        df = pickle.load(file)
            
    return df

In [4]:
directory = '/Users/dianaow/Documents/formula-1-race-data/dataframes/'

In [5]:
rs = 12
njobs = 4

### Step 1: Initiate tuned sklearn models

In [28]:
all_tuned_classifiers = read_from_pickle(directory, "all_tuned_classifiers.pickle")
print("Total number of tuned classifiers to test:", len(all_tuned_classifiers))

('Total number of tuned classifiers to test:', 16)


In [7]:
skfold = StratifiedKFold(n_splits=10, random_state=rs, shuffle=True)

In [8]:
# Utilize only the Extra Trees Classifiers
ExtCs = all_tuned_classifiers[0:4]

In [9]:
def get_models():
    """Generate a library of base learners."""

    return all_tuned_classifiers

In [10]:
base_learners = get_models()

### Step 2: Define a meta-learner

In [11]:
meta_learner = ['XGBoost', XGBClassifier(seed = rs, n_estimators=1000, learning_rate=0.1, max_depth=3, min_child_weight=1, gamma=0.0, \
                     subsample=0.5, colsample_bytree=0.5, reg_alpha= 1e-05)]

### Step 3: Get train-test set

In [12]:
def scale_data(X_train, X_test, scaler):
    
    if scaler == 'StandardScaler':
        SS = StandardScaler()
        Xs_train = SS.fit_transform(X_train)
        Xs_test = SS.fit_transform(X_test)
    elif scaler == 'MinMaxScaler':
        mm = MinMaxScaler()
        Xs_train = mm.fit_transform(X_train)
        Xs_test = mm.fit_transform(X_test)  
    elif scaler == False:
        Xs_train = X_train
        Xs_test = X_test
        
    return Xs_train, Xs_test

def build_train_test_set(df, df_test, train_yr, test_yr, index_list, target_var, target_var_list, scaler, \
                         name=None, races_curr_same_cat=None, races_same_cat=None, \
                         zV_nzV_check=False, VIF_check=False, multilabel=False, print_stats=False):
    
    """
       Capabilites of this function:
       1) Splits dataset to a train and test set and allows for a single or multiple target variables. 
       2) 2 methods of train-test split
           Approach 1: Train-test split by year (set all_races=None)
               - This is not a viable options if model includes features that are only known pre-race. eg drivers' selected tyre sets and qualifying position)
           Approach 2: Train-test split by races
       3) Check for multi-collinearity and features with Zero Variance (important for linear/ordinal regression)
       
       Notes:
       - If multilabel=True, input 'target_var' and 'target_var_list' as the same variables.
    """
    
    if zV_nzV_check==True:
        non_zero_nz_Var = zV_nzV_filter(df, index_list, target_var_list)
        df = df[non_zero_nz_Var]
        df_test = df_test[non_zero_nz_Var]

    if VIF_check==True:
        features_vif_below10 = VIF_filter(df, index_list, target_var_list)
        df = df[features_vif_below10]
        df_test = df_test[features_vif_below10]
        
    # =============================================================================================
    # BUILD TRAIN AND TEST SET
    train_set = df[df['year'].isin(train_yr)].reset_index(drop=True)
    
    if name != None:

        train_set = train_set[train_set['name'].isin(races_same_cat['name'])]
        target_ibdex = races_curr_same_cat.index(name)
        races_before = races_curr_same_cat[:target_ibdex]
        addto_train_set = df[(df['year'].isin(test_yr)) & (df['name'].isin(races_before))].reset_index(drop=True)
        train_set = train_set.append(addto_train_set)
            
        test_set = df_test[df_test['name'] == name].reset_index(drop=True)
    
    elif (name == None) & (races_same_cat == None) & (races_curr_same_cat == None):
        test_set = df_test
        
    # Separate index, features and target variable
    arr = []
    arr = list(index_list)
    arr_tar_tvar = list(target_var_list)
    arr.extend(arr_tar_tvar)
    learning_columns = np.setdiff1d(train_set.columns, np.array(arr))
    X_train = train_set.loc[:, learning_columns]
    X_test = test_set.loc[:, learning_columns]

    if multilabel==True:
        Y_train = train_set[target_var_list]
        Y_test = test_set[target_var_list]
        # Ensure target variables are in integers as alogrithms will throw an error otherwise
        Y_train = Y_train.apply(lambda x: x.astype(int)) # Dataframe
        Y_test = Y_test.apply(lambda x: x.astype(int))
    else:
        Y_train = train_set[target_var]
        Y_test = test_set[target_var]
        Y_train = Y_train.astype(int) # Series
        Y_test = Y_test.astype(int)
        
    # =============================================================================================
    # Standard scale the dataset after train-test split
    Xs_train, Xs_test = scale_data(X_train, X_test, scaler)
        
    # =============================================================================================
    # PRINT STATISTICS (because multilabel option will generate a dataframe, whereas a single target variable results in a series,
    # to print statistics, different methods apply...)
    
    if (multilabel==True) & (print_stats==True):
        print '=================================='
        print 'Race:', name
        print '=================================='

        print 'Train set:' + str(len(Y_train))
        print '---------------------------------'
        print 'Shape of Train Set: ', Xs_train.shape
        print Y_train.apply(pd.value_counts).fillna(0)

        print '=================================='
        print 'Test set: ' + str(len(Y_test))
        print '---------------------------------'
        print 'Shape of Test Set: ', Xs_test.shape
        test_vc = Y_test.apply(pd.value_counts).fillna(0)
        print test_vc

    elif (multilabel==False) & (print_stats==True):
        print '=================================='
        print 'Race:', name
        print '=================================='

        print 'Train set:' + str(Y_train.count())
        print '---------------------------------'
        print 'Shape of Train Set: ', Xs_train.shape
        print Y_train.value_counts() 

        print '=================================='
        print 'Test set: ' + str(Y_test.count())
        print '---------------------------------'
        print 'Shape of Test Set: ', Xs_test.shape
        print Y_test.value_counts() 

    return train_set, test_set, Xs_train, Xs_test, np.array(Y_train).ravel(), np.array(Y_test).ravel()

### Step 4: Train the base learners on a training set (xtrain_base)

In [13]:
def train_base_learners(base_learners, inp, out, verbose=True):
    """Train all base learners in the library."""

    for m in base_learners:
        m[1].fit(inp, out)

###  Step 5: Generate base learner predictions (Input: xpred_base, output: P_base)
- P_base is a matrix of predictions (n_samles, n_base_learners)

In [14]:
def predict_base_learners(pred_base_learners, inp, verbose=True):
    """Generate a prediction matrix."""
    P = np.zeros((inp.shape[0], len(pred_base_learners)))

    for i,m in enumerate(pred_base_learners):
        p = m[1].predict_proba(inp)
        # With two classes, need only predictions for one class
        P[:, i] = p[:, 1]

    return P

### Step 6: Use the base predictions to train the meta learner (Input: P_pred, ypred_base)

In [15]:
def ensemble_predict(base_learners, meta_learner, inp, verbose=True):
    """Generate predictions from the ensemble."""
    P_pred = predict_base_learners(base_learners, inp, verbose=verbose)
    return P_pred, meta_learner.predict(P_pred)

## Training base learners with cross-validation
- Stacking: Fitting an ensemble with cross-validation 

In [16]:
def stacking(base_learners, meta_learner, df, df_test, train_yr, test_yr, index_list, target_var_list, \
             scaler, generator, races_curr_same_cat, races_same_cat):
    
    """Simple training routine for stacking."""

    P_pred_all = pd.DataFrame()
    p_all =  pd.DataFrame()
    results_all = pd.DataFrame()
    
    for race in races_curr_same_cat:
        # Step 3: 
        train_set, test_set, Xs_train, Xs_test, Y_train, Y_test  = \
        build_train_test_set(df, df_test, train_yr, test_yr, index_list, target_var_list, target_var_list, scaler, \
                             race, races_curr_same_cat, races_same_cat, \
                             zV_nzV_check=False, VIF_check=False, multilabel=False, print_stats=False)

        # Step 4: Train final base learners for test time
        #print("Fitting final base learners...")
        train_base_learners(base_learners, Xs_train, Y_train, verbose=False)
        #print("done")

        # Step 5: Generate predictions for training meta learners
        # Outer loop:
        #print("Generating cross-validated predictions...")
        cv_preds, cv_y = [], []
        for i, (train_idx, test_idx) in enumerate(generator.split(Xs_train, Y_train)):

            fold_xtrain, fold_ytrain = Xs_train[train_idx, :], Y_train[train_idx]
            fold_xtest, fold_ytest = Xs_train[test_idx, :], Y_train[test_idx]

            # Inner loop: step 4 and 5
            fold_base_learners = [[model[0], clone(model[1])] for model in base_learners]
            train_base_learners(fold_base_learners, fold_xtrain, fold_ytrain, verbose=False)

            fold_P_base = predict_base_learners(fold_base_learners, fold_xtest, verbose=False)

            cv_preds.append(fold_P_base)
            cv_y.append(fold_ytest)
            #print("Fold %i done" % (i + 1))

        #print("CV-predictions done")

        # Be careful to get rows in the right order
        cv_preds = np.vstack(cv_preds)
        cv_y = np.hstack(cv_y)

        # Step 5: Train meta learner
        #print("Fitting meta learner...")
        meta_learner[1].fit(cv_preds, cv_y)
        #print("done")
        
        # Step 6: Use the base predictions to train the meta learner
        P_pred, p = ensemble_predict(base_learners, meta_learner[1], Xs_test, verbose=False)
        
        P_pred_all = pd.concat([P_pred_all, pd.DataFrame(P_pred)])
        p_all = pd.concat([p_all, pd.DataFrame(p, columns=[race])], axis=1)

        # Step 7: Evaluate prediction results
        results_stats  = calc_classification_stats(race, target_var_list, meta_learner[0], "", Y_test, p, average=None)
        results_all = pd.concat([results_all, results_stats])
        
    return results_all, P_pred_all, p_all

In [17]:
def calc_classification_stats(name, target_var, classifier, sampler, y_test, y_pred, average):

    # Initiate lists for storing results
    recall = []
    precision = []
    baseline_accuracy = []
    test_accuracy = []
    f1 = []
    avg_precision = []
    
    #recall.append(recall_score(y_test, y_pred, average=None))
    #precision.append(precision_score(y_test, y_pred, average=None))
    baseline_accuracy.append(float(pd.Series(y_test).value_counts().max()) / pd.Series(y_test).count())
    test_accuracy.append(accuracy_score(y_test, y_pred))
    avg_precision.append(average_precision_score(y_test, y_pred))
    f1.append(f1_score(y_test, y_pred, average=None)) 

    results_stats = pd.DataFrame({'Index': str(name), 'Target Var': str(target_var), 'Method': classifier, 'Resampler':sampler,\
                                  'Avg Precision': avg_precision, 'F1 Score': f1, 'Test accuracy': test_accuracy, \
                                  'Baseline accuracy': baseline_accuracy})

    results_stats = results_stats[['Index', 'Target Var', 'Method', 'Resampler', "Baseline accuracy", "Test accuracy", 'F1 Score', 'Avg Precision']]

    return results_stats  

In [18]:
xl = pd.ExcelFile("/Users/dianaow/Downloads/F1_Tyre_data.xlsx")
xl.sheet_names
pirelli = xl.parse("Sheet7")
races_dict = pirelli[['year', 'name']].to_dict('list')

df_races = read_from_pickle(directory, "df_races.pickle")
races15 = list(df_races[df_races['year'] == 2015].name.unique())
races = list(df_races[df_races['year'] == 2017].name.unique())

index_list = ['year', 'driverRef', 'name']
target_var_list = ['statusId']

In [19]:
status_dataset_train = read_from_pickle(directory, "status_dataset_train.pickle")
status_dataset_test = read_from_pickle(directory, "status_dataset_test.pickle")

#### 1) XGBoost as the meta-leaner with all 16 classifiers (4 from each type of classifier: Extra Trees, Random Forest, Gradient Boosting, MLP) as the base-learners

In [38]:
base_learners

[['ExtraTrees0',
  ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
             max_depth=None, max_features=11, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=2, min_samples_split=14,
             min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
             oob_score=False, random_state=12, verbose=0, warm_start=False)],
 ['ExtraTrees1',
  ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
             max_depth=None, max_features=11, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=2, min_samples_split=10,
             min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
             oob_score=False, random_state=12, verbose=0, warm_start=False)],
 ['ExtraTrees2',
  ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
             max_depth=None, max_features=11, ma

In [21]:
results_all[results_all['Test accuracy'] > results_all['Baseline accuracy']]

Unnamed: 0,Index,Target Var,Method,Resampler,Baseline accuracy,Test accuracy,F1 Score,Avg Precision
0,Chinese Grand Prix,['statusId'],XGBoost,,0.789474,0.842105,"[0.571428571429, 0.903225806452]",0.869298
0,Bahrain Grand Prix,['statusId'],XGBoost,,0.65,0.7,"[0.4, 0.8]",0.701584


In [22]:
results_all

Unnamed: 0,Index,Target Var,Method,Resampler,Baseline accuracy,Test accuracy,F1 Score,Avg Precision
0,Australian Grand Prix,['statusId'],XGBoost,,0.631579,0.631579,"[0.222222222222, 0.758620689655]",0.645769
0,Chinese Grand Prix,['statusId'],XGBoost,,0.789474,0.842105,"[0.571428571429, 0.903225806452]",0.869298
0,Bahrain Grand Prix,['statusId'],XGBoost,,0.65,0.7,"[0.4, 0.8]",0.701584
0,Russian Grand Prix,['statusId'],XGBoost,,0.8,0.65,"[0.0, 0.787878787879]",0.771324
0,Spanish Grand Prix,['statusId'],XGBoost,,0.8,0.65,"[0.222222222222, 0.774193548387]",0.8
0,Monaco Grand Prix,['statusId'],XGBoost,,0.684211,0.578947,"[0.0, 0.733333333333]",0.652774
0,Canadian Grand Prix,['statusId'],XGBoost,,0.75,0.7,"[0.4, 0.8]",0.79
0,Azerbaijan Grand Prix,['statusId'],XGBoost,,0.65,0.6,"[0.333333333333, 0.714285714286]",0.662821
0,Austrian Grand Prix,['statusId'],XGBoost,,0.8,0.8,"[0.333333333333, 0.882352941176]",0.83125
0,British Grand Prix,['statusId'],XGBoost,,0.85,0.7,"[0.0, 0.823529411765]",0.828201


#### 2) XGBoost as the meta-leaner with a one selected classifier from the group of various classifiers  as the base-learners

In [32]:
classifiers_to_filter = ['ExtraTree1', 'RandomForest0','GradientBoosting1', 'MLP0']
sel_clfs = [x for x in all_tuned_classifiers if x[0] in classifiers_to_filter]

In [34]:
results_selclfs, P_pred_selclfs, p_selclfs = stacking(sel_clfs, meta_learner, status_dataset_train, status_dataset_test,\
                                            [2016], [2017], index_list, target_var_list, \
                                            'StandardScaler', skfold, races, races_dict)

In [35]:
results_selclfs[results_selclfs['Test accuracy'] > results_selclfs['Baseline accuracy']]

Unnamed: 0,Index,Target Var,Method,Resampler,Baseline accuracy,Test accuracy,F1 Score,Avg Precision
0,Chinese Grand Prix,['statusId'],XGBoost,,0.789474,0.842105,"[0.4, 0.909090909091]",0.833333
0,United States Grand Prix,['statusId'],XGBoost,,0.789474,0.842105,"[0.4, 0.909090909091]",0.833333


In [37]:
results_selclfs

Unnamed: 0,Index,Target Var,Method,Resampler,Baseline accuracy,Test accuracy,F1 Score,Avg Precision
0,Australian Grand Prix,['statusId'],XGBoost,,0.631579,0.526316,"[0.181818181818, 0.666666666667]",0.607895
0,Chinese Grand Prix,['statusId'],XGBoost,,0.789474,0.842105,"[0.4, 0.909090909091]",0.833333
0,Bahrain Grand Prix,['statusId'],XGBoost,,0.65,0.65,"[0.222222222222, 0.774193548387]",0.665385
0,Russian Grand Prix,['statusId'],XGBoost,,0.8,0.5,"[0.166666666667, 0.642857142857]",0.771875
0,Spanish Grand Prix,['statusId'],XGBoost,,0.8,0.75,"[0.0, 0.857142857143]",0.790132
0,Monaco Grand Prix,['statusId'],XGBoost,,0.684211,0.473684,"[0.0, 0.642857142857]",0.625911
0,Canadian Grand Prix,['statusId'],XGBoost,,0.75,0.6,"[0.333333333333, 0.714285714286]",0.762821
0,Azerbaijan Grand Prix,['statusId'],XGBoost,,0.65,0.55,"[0.181818181818, 0.689655172414]",0.630769
0,Austrian Grand Prix,['statusId'],XGBoost,,0.8,0.65,"[0.222222222222, 0.774193548387]",0.8
0,British Grand Prix,['statusId'],XGBoost,,0.85,0.8,"[0.333333333333, 0.882352941176]",0.878547


### Evaluation of results:
- 2 out of 20 races in 2017 season achieved predicion accuracy above baseline using XGBoost Classifier as the meta-learner with Ensemble Stacking.
- Narrowing down base-learners did not improve on prediction results