## Classification of Tyre Use Per Stint Per Driver for each race in 2017 season

In [1]:
import numpy as np
import scipy 
import scipy.stats
from scipy import stats
import seaborn as sns
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import itertools
from itertools import groupby
import pickle
import os
import math
from sympy import S, symbols
from collections import Counter

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

plt.style.use('fivethirtyeight')
#sns.mpl.rcParams['figure.figsize'] = (16, 10)

# Directory to store pickled dataframes
directory = '/Users/dianaow/Documents/formula-1-race-data/dataframes/'

In [2]:
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn import cross_validation
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, classification_report, recall_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.under_sampling import TomekLinks
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier



In [3]:
palette = {"Ultra soft":"#9b59b6", 
           "Super soft":"#ff5745", 
           "Soft":"#edee3d", 
           "Medium":"#cccccc", 
           "Hard":"#ff9932",
           "Intermediate":"#00f164",
           "Wet":"#85c8fb",
           "None": "#fabebe"}

plt.style.use("dark_background")

In [4]:
def read_from_pickle(directory, filename):
    df = pd.DataFrame()
    filepath = directory + filename
    with open(filepath, 'rb') as file:
        df = pickle.load(file)
            
    return df

In [6]:
X_k = read_from_pickle(directory, "X_k.pickle")
clusters = read_from_pickle(directory, "clusters.pickle")
df_tyres = read_from_pickle(directory, "df_tyres.pickle")
drivers_2016 = read_from_pickle(directory, "drivers_2016.pickle")
drivers_2016t = read_from_pickle(directory, "drivers_2016t.pickle")

In [7]:
clusters16 = read_from_pickle(directory, "clusters16.pickle")

In [8]:
xl = pd.ExcelFile("./formula-1-race-data/Selected_Sets.xlsx", encoding='utf-8')
xl.sheet_names
selected_sets = xl.parse("Sheet10")

In [9]:
xl = pd.ExcelFile("./formula-1-race-data/Track_Information.xlsx", encoding='utf-8')
xl.sheet_names
track_info = xl.parse("Copy of Sheet12")

In [10]:
xl = pd.ExcelFile("/Users/dianaow/Downloads/F1_Tyre_data.xlsx")
xl.sheet_names
pirelli = xl.parse("Sheet7")

### Features of the classification model:
- Count of each tyre type selected for the race (teams select the number of tyres for their driver based on Pirelli's allocated mandatory tyre set)
- Track Information
    - 1) Average track temperature (from 2014 to 2017)
    - 2) Range of Track Temperature(from 2014 to 2017)
    - 3) Tyre Stress Rate
    - 4) Downforce Rate
    - 5) Asphalt Abrasion
    - 6) Number of race laps
    - 7) Circuit length


- Cluster label based on the track's 2016 pit strategy records
- Driver-specific multiplier for each tyre type (to account for the fact that different drivers wear out their tyres at different rates. Measured by calculating the total stint length each driver spent on each tyre type at each race track (only include completed stints), divided by the summed lengths of each stint)

Data Sources: 
- https://racingspot.pirelli.com/global/en-ww/infographics
- https://docs.google.com/spreadsheets/d/15A9gs4X3vIPpCnooKS49fH6BdBUwLIIcz4XE3SiDzpw/edit?usp=sharing
      - Copied over tyre information on Pirelli infographic to google spreadsheet

#### Pre-processing before performing Classification

In [11]:
# SS, S, M = 1
# S, M, H = 2
# SS, S, US = 3

for i,row in pirelli.iterrows(): 
    if (pirelli.loc[i,'Super Soft'] == 'Super Soft') & (pirelli.loc[i,'Soft'] == 'Soft') & (pirelli.loc[i,'Medium'] == "Medium"):  
        pirelli.loc[i,'mandatory combi'] = float(1)
    elif (pirelli.loc[i,'Soft'] == "Soft") & (pirelli.loc[i,'Medium'] == "Medium") & (pirelli.loc[i,'Hard'] == "Hard"):
        pirelli.loc[i,'mandatory combi'] = float(2)
    elif (pirelli.loc[i,'Super Soft'] == 'Super Soft') & (pirelli.loc[i,'Soft'] == 'Soft') & (pirelli.loc[i,'Ultra Soft'] == 'Ultra Soft'):
        pirelli.loc[i,'mandatory combi'] = float(3)
    elif (pirelli.loc[i,'Super Soft'] == 'Super Soft') & (pirelli.loc[i,'Soft'] == 'Soft') & (pirelli.loc[i,'Medium'] != "Medium"): 
        pirelli.loc[i,'mandatory combi'] = float(4)
    elif (pirelli.loc[i,'Super Soft'] != 'Super Soft') & (pirelli.loc[i,'Soft'] == 'Soft') & (pirelli.loc[i,'Medium'] != "Medium"): 
        pirelli.loc[i,'mandatory combi'] = float(5)
    elif (pirelli.loc[i,'Super Soft'] != 'Soft') & (pirelli.loc[i,'Medium'] == "Medium") & (pirelli.loc[i,'Hard'] == "Hard"): 
        pirelli.loc[i,'mandatory combi'] = float(6)
    else:
        pirelli.loc[i,'mandatory combi'] = float(7)

In [12]:
pirelli.sort_values(['name', 'year'])

Unnamed: 0,year,name,Super Soft,Soft,Medium,Hard,Ultra Soft,mandatory combi
39,2015,Abu Dhabi Grand Prix,Super Soft,Soft,,,,4.0
20,2016,Abu Dhabi Grand Prix,Super Soft,Soft,,,Ultra Soft,3.0
59,2017,Abu Dhabi Grand Prix,Super Soft,Soft,,,Ultra Soft,3.0
21,2015,Australian Grand Prix,,Soft,Medium,,,7.0
0,2016,Australian Grand Prix,Super Soft,Soft,Medium,,,1.0
40,2017,Australian Grand Prix,Super Soft,Soft,,,Ultra Soft,3.0
28,2015,Austrian Grand Prix,Super Soft,Soft,,,,4.0
8,2016,Austrian Grand Prix,Super Soft,Soft,,,Ultra Soft,3.0
48,2017,Austrian Grand Prix,Super Soft,Soft,,,Ultra Soft,3.0
7,2016,Azerbaijan Grand Prix,Super Soft,Soft,Medium,,,1.0


In [106]:
drivers_2016pt = pd.pivot_table(drivers_2016t, values='ratio', index=['driverRef'], columns=['tyre']).reset_index()
drivers_2016pt.rename(columns={"Hard": "H_m", "Intermediate": "I_m", "Medium":"M_m", "Soft": "S_m", "Super soft": "SS_m", "Ultra soft":"US_m", "Wet": "W_m"}, inplace=True)

In [107]:
track_new_agg = read_from_pickle(directory, "track_new_agg.pickle")
track_new_agg.drop(['Year'], axis=1, inplace=True)
track_new_agg.rename(columns={"Race": "name"}, inplace=True)

In [108]:
# Merge df containing driver-specific tyre degradation multiplier
df_final_1 = pd.merge(selected_sets, drivers_2016pt.reset_index(drop=True), on=['driverRef'], how='left')

# Merge df containing track-specific information(tyre wear rating etc.)
df_final_1 = pd.merge(df_final_1, track_new_agg.reset_index(drop=True), on=['name'], how='left')

In [109]:
XY = pd.merge(df_final_1, df_tyres[['year', 'name', 'driverRef', 'first set', 'second set', 'third set', 'fourth set']], on=['year', 'name', 'driverRef'], how='left')

In [19]:
# Null values because pirelli did not publish information of selected tyres for these races, althoug they did release the mandatory tyre set info for the race.
# I will instead inpute these values with the average values of races assigned the same mandatory tyre sets. 
# Each race has mandatory 3 tyre sets, so if there are more than 3 avg values, only 3 can be selected. Base it off the previos' year's tyre choice

In [110]:
XY = pd.merge(XY, pirelli[['year', 'name', 'mandatory combi']], on=['year', 'name'], how='left')
XY.groupby("mandatory combi").agg({'Medium':'mean', 'Soft':'mean', 'Super Soft':'mean', 'Ultra soft':'mean', 'Hard':'mean'})

Unnamed: 0_level_0,Super Soft,Medium,Soft,Ultra soft,Hard
mandatory combi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.0,7.563291,1.588608,3.848101,0.0,0.0
2.0,0.0,3.845455,7.163636,0.758621,1.738636
3.0,2.922222,0.0,1.974074,8.103704,0.0


In [111]:
# Find out which are the races with null values for driver selected sets
XY.drop_duplicates(subset=['year','name'], keep='first', inplace=False).sort_values("name")

Unnamed: 0,year,name,driverRef,Medium,Soft,Super Soft,Ultra soft,Hard,H_m,I_m,...,Tyre stress,Downforce,Lateral,Asphalt Abrasion,Number of laps,first set,second set,third set,fourth set,mandatory combi
798,2016,Abu Dhabi Grand Prix,hamilton,0.0,4.0,2.0,7.0,0.0,0.97651,0.833021,...,2.0,3.0,3.0,2.0,55.0,Ultra soft,Soft,Soft,,3.0
360,2017,Abu Dhabi Grand Prix,hamilton,0.0,2.0,2.0,9.0,0.0,0.97651,0.833021,...,2.0,3.0,3.0,2.0,55.0,Ultra soft,Super soft,,,3.0
776,2016,Australian Grand Prix,hamilton,1.0,6.0,6.0,0.0,0.0,0.97651,0.833021,...,1.0,4.0,1.0,3.0,58.0,Super soft,Medium,,,1.0
340,2017,Australian Grand Prix,hamilton,,,,,,0.97651,0.833021,...,1.0,4.0,1.0,3.0,58.0,Ultra soft,Soft,,,3.0
600,2016,Austrian Grand Prix,hamilton,0.0,2.0,3.0,8.0,0.0,0.97651,0.833021,...,2.0,3.0,3.0,1.0,71.0,Ultra soft,Soft,Soft,,3.0
200,2017,Austrian Grand Prix,hamilton,0.0,2.0,3.0,8.0,0.0,0.97651,0.833021,...,2.0,3.0,3.0,1.0,71.0,Super soft,Ultra soft,,,3.0
622,2016,Azerbaijan Grand Prix,hamilton,1.0,4.0,8.0,0.0,0.0,0.97651,0.833021,...,3.0,2.0,2.0,1.0,51.0,Super soft,Soft,,,1.0
220,2017,Azerbaijan Grand Prix,hamilton,1.0,4.0,8.0,0.0,0.0,0.97651,0.833021,...,3.0,2.0,2.0,1.0,51.0,Super soft,Soft,Super soft,,1.0
754,2016,Bahrain Grand Prix,hamilton,1.0,6.0,6.0,0.0,0.0,0.97651,0.833021,...,3.0,3.0,3.0,5.0,57.0,Super soft,Medium,Super soft,Soft,1.0
320,2017,Bahrain Grand Prix,hamilton,,,,,,0.97651,0.833021,...,3.0,3.0,3.0,5.0,57.0,Super soft,Soft,Soft,,1.0


In [None]:
for i, row in XY1.iterrows():
    if (XY1.loc[i,"clusters(pit strategy)"] == 1) & (XY1.loc[i,"name"] == "Australian Grand Prix") & (XY1.loc[i,"year"] == 2017):
        XY1.loc[i,"Super Soft"] =3 
        XY1.loc[i,"Ultra soft"] =8
        XY1.loc[i,"Soft"] =2
    elif (XY1.loc[i,"clusters(pit strategy)"] == 1) & (XY1.loc[i,"name"] == "Bahrain Grand Prix")  & (XY1.loc[i,"year"] == 2017):
        XY1.loc[i,"Super Soft"] =7
        XY1.loc[i,"Medium"] =2
        XY1.loc[i,"Soft"] = 4
    elif (XY1.loc[i,"clusters(pit strategy)"] == 1) & (XY1.loc[i,"name"] == "Italian Grand Prix")  & (XY1.loc[i,"year"] == 2016):
        XY1.loc[i,"Super Soft"] =7
        XY1.loc[i,"Medium"] =2
        XY1.loc[i,"Soft"] = 4
    elif (XY1.loc[i,"clusters(pit strategy)"] == 1) & (XY1.loc[i,"name"] == "Mexican Grand Prix")  & (XY1.loc[i,"year"] == 2016):
        XY1.loc[i,"Super Soft"] =7
        XY1.loc[i,"Medium"] =2
        XY1.loc[i,"Soft"] = 4
    elif (XY1.loc[i,"clusters(pit strategy)"] == 1) & (XY1.loc[i,"name"] == "Russian Grand Prix")  & (XY1.loc[i,"year"] == 2017):
        XY1.loc[i,"Super Soft"] =3 
        XY1.loc[i,"Ultra soft"] =8
        XY1.loc[i,"Soft"] =2
    elif (XY1.loc[i,"clusters(pit strategy)"] == 1) & (XY1.loc[i,"name"] == "Spanish Grand Prix")  & (XY1.loc[i,"year"] == 2017):
        XY1.loc[i,"Medium"] = 5
        XY1.loc[i,"Hard"] =1
        XY1.loc[i,"Soft"] = 7

In [112]:
def fillna_tyres(XY1):   
            
    # Null values in first set belong to drivers who DNS or did not finish the stint
    # Inpute first sets with "None" indicating they failed to finish the stint
    XY1['first set'] = XY1['first set'].fillna("None")
    XY1['second set'] = XY1['second set'].fillna("None")
    XY1['third set'] = XY1['third set'].fillna("None")
    XY1['fourth set'] = XY1['fourth set'].fillna("None")

    # Fill null values for the tyre columns with 0. Null values indicate the tyre was not part of the mandatory set
    tyrecols = ["Medium", "Soft", "Super Soft", "Ultra soft", "Hard"]
    for i in tyrecols:
        XY1[i] = XY1[i].fillna(0)  
        
    # New drivers in 2017 (stroll, gasly, hartley) do not have any tyre degration multiplier. 
    # Inpute their multipler as 1.
    XY1["H_m"] = XY1["H_m"].fillna(1)
    XY1["I_m"] = XY1["I_m"].fillna(1)
    XY1["M_m"] = XY1["M_m"].fillna(1)
    XY1["S_m"] = XY1["S_m"].fillna(1)
    XY1["SS_m"] = XY1["SS_m"].fillna(1)
    XY1["US_m"] = XY1["US_m"].fillna(1)
    XY1["W_m"] = XY1["W_m"].fillna(1)
    
    return XY1

In [113]:
XY = fillna_tyres(XY)

In [114]:
XY.isnull().sum()

year                 0
name                 0
driverRef            0
Medium               0
Soft                 0
Super Soft           0
Ultra soft           0
Hard                 0
H_m                  0
I_m                  0
M_m                  0
S_m                  0
SS_m                 0
US_m                 0
W_m                  0
Track Temp(mean)     0
Track Temp(range)    0
Tyre stress          0
Downforce            0
Lateral              0
Asphalt Abrasion     0
Number of laps       0
first set            0
second set           0
third set            0
fourth set           0
mandatory combi      0
dtype: int64

## Model Building
### Divide model by races allocated Combination1 (SS, S, M tyre set), Combination2 (S, M, H tyre set), Combination3 (SS, S, US tyre set)
### Further divide model by each stint. Assume that the maximum possible number of stints in a race is 4. In total, there are 12 separate classification models

In [115]:
# SS, S, M = 1
# S, M, H = 2
# SS, S, US = 3
XY.replace("Super Soft", "Super soft", inplace=True)

XY1 = XY.drop(['second set', 'third set', 'fourth set'], axis=1)
XY1 = XY1.drop(XY1[XY1['first set'] == 'None'].index)
XY1_c1 = XY1[XY1['mandatory combi'] == 1].reset_index()
XY1_c1 = XY1_c1.drop(["I_m", "US_m", "W_m", "H_m"], axis=1)
XY1_c2 = XY1[XY1['mandatory combi'] == 2].reset_index()
XY1_c2 = XY1_c2.drop(["I_m", "US_m", "W_m", "SS_m"], axis=1)
XY1_c3 = XY1[XY1['mandatory combi'] == 3].reset_index()
XY1_c3 = XY1_c3.drop(["I_m", "M_m", "W_m", "H_m"], axis=1)

XY2 = XY.drop(['first set', 'third set', 'fourth set'], axis=1)
XY2 = XY2.drop(XY2[XY2['second set'] == 'None'].index)
XY2_c1 = XY2[XY2['mandatory combi'] == 1].reset_index()
XY2_c1 = XY2_c1.drop(["I_m", "US_m", "W_m", "H_m"], axis=1)
XY2_c2 = XY2[XY2['mandatory combi'] == 2].reset_index()
XY2_c2 = XY2_c2.drop(["I_m", "US_m", "W_m", "SS_m"], axis=1)
XY2_c3 = XY2[XY2['mandatory combi'] == 3].reset_index()
XY2_c3 = XY2_c3.drop(["I_m", "M_m", "W_m", "H_m"], axis=1)

XY3 = XY.drop(['second set', 'first set', 'fourth set'], axis=1)
XY3 = XY3.drop(XY3[XY3['third set'] == 'None'].index)
XY3_c1 = XY3[XY3['mandatory combi'] == 1].reset_index()
XY3_c1 = XY3_c1.drop(["I_m", "US_m", "W_m", "H_m"], axis=1)
XY3_c2 = XY3[XY3['mandatory combi'] == 2].reset_index()
XY3_c2 = XY3_c2.drop(["I_m", "US_m", "W_m", "SS_m"], axis=1)
XY3_c3 = XY3[XY3['mandatory combi'] == 3].reset_index()
XY3_c3 = XY3_c3.drop(["I_m", "M_m", "W_m", "H_m"], axis=1)

XY4 = XY.drop(['second set', 'first set', 'third set'], axis=1)
XY4 = XY4.drop(XY4[XY4['fourth set'] == 'None'].index)
XY4_c1 = XY4[XY4['mandatory combi'] == 1].reset_index()
XY4_c1 = XY4_c1.drop(["I_m", "US_m", "W_m", "H_m"], axis=1)
XY4_c2 = XY4[XY4['mandatory combi'] == 2].reset_index()
XY4_c2 = XY4_c2.drop(["I_m", "US_m", "W_m", "SS_m"], axis=1)
XY4_c3 = XY4[XY4['mandatory combi'] == 3].reset_index()
XY4_c3 = XY4_c3.drop(["I_m", "M_m", "W_m", "H_m"], axis=1)

### Classification of tyres to use for races assigned the mandatory tyre set of Super soft, Soft, Medium

#### For Combination 1, what tyre type is each driver going to use for Stint 1?

In [117]:
XY1_c1_16 = XY1_c1[XY1_c1['year'] == 2016].reset_index(drop=True)
X1_c1 = XY1_c1_16.drop(['year', 'name', 'driverRef', 'first set', 'mandatory combi'], axis=1)
Y1_c1 = XY1_c1_16['first set']

XY1_c1_17 = XY1_c1[XY1_c1['year'] == 2017].reset_index(drop=True)
X1_c1_17 = XY1_c1_17.drop(['year', 'name', 'driverRef', 'first set', 'mandatory combi'], axis=1)
Y1_c1_17 = XY1_c1_17['first set']

In [124]:
XY1_c1_16.columns

Index([            u'index',              u'year',              u'name',
               u'driverRef',            u'Medium',              u'Soft',
              u'Super Soft',        u'Ultra soft',              u'Hard',
                     u'M_m',               u'S_m',              u'SS_m',
        u'Track Temp(mean)', u'Track Temp(range)',       u'Tyre stress',
               u'Downforce',           u'Lateral',  u'Asphalt Abrasion',
          u'Number of laps',         u'first set',   u'mandatory combi'],
      dtype='object')

In [118]:
SS = StandardScaler()
X1_c1 = SS.fit_transform(X1_c1)
X1_c1_17 = SS.fit_transform(X1_c1_17)

In [119]:
XY1_c1['first set'].value_counts()

Super soft    263
Soft          121
Medium          6
Name: first set, dtype: int64

In [120]:
# Train multinomial logistic regression model
def logreg(dftest, X_train, X_test, Y_train, Y_test):
    mul_lr = LogisticRegression(multi_class='multinomial', solver='newton-cg').fit(X_train, Y_train)

    print "Multinomial Logistic regression Train Accuracy :: ", accuracy_score(Y_train, mul_lr.predict(X_train))
    print "Multinomial Logistic regression Test Accuracy :: ", accuracy_score(Y_test, mul_lr.predict(X_test))
    
    p = pd.DataFrame(mul_lr.predict(X_test)).reset_index(drop=True)
    results = pd.concat([dftest, p], axis=1)
    
    return results

In [125]:
XY1_c1_res = logreg(XY1_c1_17, X1_c1, X1_c1_17, Y1_c1, Y1_c1_17)
XY1_c1_res

Multinomial Logistic regression Train Accuracy ::  0.693617021277
Multinomial Logistic regression Test Accuracy ::  0.677419354839


Unnamed: 0,index,year,name,driverRef,Medium,Soft,Super Soft,Ultra soft,Hard,M_m,...,Track Temp(mean),Track Temp(range),Tyre stress,Downforce,Lateral,Asphalt Abrasion,Number of laps,first set,mandatory combi,0
0,0,2017,Brazilian Grand Prix,vettel,1.0,3.0,9.0,0.0,0.0,1.111487,...,30.400000,5.400000,3.0,4.0,4.0,3.0,71.0,Super soft,1.0,Super soft
1,1,2017,Brazilian Grand Prix,bottas,1.0,4.0,8.0,0.0,0.0,1.252126,...,30.400000,5.400000,3.0,4.0,4.0,3.0,71.0,Super soft,1.0,Super soft
2,2,2017,Brazilian Grand Prix,raikkonen,1.0,3.0,9.0,0.0,0.0,1.009876,...,30.400000,5.400000,3.0,4.0,4.0,3.0,71.0,Super soft,1.0,Super soft
3,3,2017,Brazilian Grand Prix,hamilton,1.0,4.0,8.0,0.0,0.0,0.851359,...,30.400000,5.400000,3.0,4.0,4.0,3.0,71.0,Soft,1.0,Soft
4,4,2017,Brazilian Grand Prix,verstappen,1.0,3.0,9.0,0.0,0.0,1.020470,...,30.400000,5.400000,3.0,4.0,4.0,3.0,71.0,Super soft,1.0,Super soft
5,5,2017,Brazilian Grand Prix,ricciardo,1.0,4.0,8.0,0.0,0.0,1.045744,...,30.400000,5.400000,3.0,4.0,4.0,3.0,71.0,Soft,1.0,Super soft
6,7,2017,Brazilian Grand Prix,alonso,1.0,3.0,9.0,0.0,0.0,0.677418,...,30.400000,5.400000,3.0,4.0,4.0,3.0,71.0,Super soft,1.0,Soft
7,8,2017,Brazilian Grand Prix,perez,1.0,4.0,8.0,0.0,0.0,1.179297,...,30.400000,5.400000,3.0,4.0,4.0,3.0,71.0,Super soft,1.0,Super soft
8,9,2017,Brazilian Grand Prix,hulkenberg,1.0,2.0,10.0,0.0,0.0,1.037634,...,30.400000,5.400000,3.0,4.0,4.0,3.0,71.0,Super soft,1.0,Super soft
9,10,2017,Brazilian Grand Prix,sainz,1.0,2.0,10.0,0.0,0.0,1.131192,...,30.400000,5.400000,3.0,4.0,4.0,3.0,71.0,Super soft,1.0,Super soft


#### For Combination 1, what tyre type is each driver going to use for Stint 2?

In [126]:
XY2_c1['second set'].value_counts()

Soft          246
Medium         61
Super soft     57
Name: second set, dtype: int64

In [127]:
XY2_c1_16 = XY2_c1[XY2_c1['year'] == 2016].reset_index(drop=True)
X2_c1 = XY2_c1_16.drop(['year', 'name', 'driverRef', 'second set', 'mandatory combi'], axis=1)
Y2_c1 = XY2_c1_16['second set']

XY2_c1_17 = XY2_c1[XY2_c1['year'] == 2017].reset_index(drop=True)
X2_c1_17 = XY2_c1_17.drop(['year', 'name', 'driverRef', 'second set', 'mandatory combi'], axis=1)
Y2_c1_17 = XY2_c1_17['second set']

X2_c1 = SS.fit_transform(X2_c1)
X2_c1_17 = SS.fit_transform(X2_c1_17)

In [128]:
XY2_c1_res = logreg(XY2_c1_17, X2_c1, X2_c1_17, Y2_c1, Y2_c1_17)
XY2_c1_res

Multinomial Logistic regression Train Accuracy ::  0.696428571429
Multinomial Logistic regression Test Accuracy ::  0.55


Unnamed: 0,index,year,name,driverRef,Medium,Soft,Super Soft,Ultra soft,Hard,M_m,...,Track Temp(mean),Track Temp(range),Tyre stress,Downforce,Lateral,Asphalt Abrasion,Number of laps,second set,mandatory combi,0
0,0,2017,Brazilian Grand Prix,vettel,1.0,3.0,9.0,0.0,0.0,1.111487,...,30.400000,5.400000,3.0,4.0,4.0,3.0,71.0,Soft,1.0,Medium
1,1,2017,Brazilian Grand Prix,bottas,1.0,4.0,8.0,0.0,0.0,1.252126,...,30.400000,5.400000,3.0,4.0,4.0,3.0,71.0,Soft,1.0,Soft
2,2,2017,Brazilian Grand Prix,raikkonen,1.0,3.0,9.0,0.0,0.0,1.009876,...,30.400000,5.400000,3.0,4.0,4.0,3.0,71.0,Soft,1.0,Medium
3,3,2017,Brazilian Grand Prix,hamilton,1.0,4.0,8.0,0.0,0.0,0.851359,...,30.400000,5.400000,3.0,4.0,4.0,3.0,71.0,Super soft,1.0,Soft
4,4,2017,Brazilian Grand Prix,verstappen,1.0,3.0,9.0,0.0,0.0,1.020470,...,30.400000,5.400000,3.0,4.0,4.0,3.0,71.0,Soft,1.0,Soft
5,5,2017,Brazilian Grand Prix,ricciardo,1.0,4.0,8.0,0.0,0.0,1.045744,...,30.400000,5.400000,3.0,4.0,4.0,3.0,71.0,Soft,1.0,Soft
6,7,2017,Brazilian Grand Prix,alonso,1.0,3.0,9.0,0.0,0.0,0.677418,...,30.400000,5.400000,3.0,4.0,4.0,3.0,71.0,Soft,1.0,Medium
7,8,2017,Brazilian Grand Prix,perez,1.0,4.0,8.0,0.0,0.0,1.179297,...,30.400000,5.400000,3.0,4.0,4.0,3.0,71.0,Soft,1.0,Medium
8,9,2017,Brazilian Grand Prix,hulkenberg,1.0,2.0,10.0,0.0,0.0,1.037634,...,30.400000,5.400000,3.0,4.0,4.0,3.0,71.0,Soft,1.0,Medium
9,10,2017,Brazilian Grand Prix,sainz,1.0,2.0,10.0,0.0,0.0,1.131192,...,30.400000,5.400000,3.0,4.0,4.0,3.0,71.0,Soft,1.0,Medium


#### For Combination 1, what tyre type is each driver going to use for Stint 3?

In [137]:
XY3_c1['third set'].value_counts()

Soft          85
Super soft    72
Medium        59
Name: third set, dtype: int64

In [138]:
XY3_c1_16 = XY3_c1[XY3_c1['year'] == 2016].reset_index(drop=True)
X3_c1 = XY3_c1_16.drop(['year', 'name', 'driverRef', 'third set', 'mandatory combi'], axis=1)
Y3_c1 = XY3_c1_16['third set']

XY3_c1_17 = XY3_c1[XY3_c1['year'] == 2017].reset_index(drop=True)
X3_c1_17 = XY3_c1_17.drop(['year', 'name', 'driverRef', 'third set', 'mandatory combi'], axis=1)
Y3_c1_17 = XY3_c1_17['third set']

X3_c1 = SS.fit_transform(X3_c1)
X3_c1_17 = SS.fit_transform(X3_c1_17)

XY3_c1_res = logreg(XY3_c1_17, X3_c1, X3_c1_17, Y3_c1, Y3_c1_17)
XY3_c1_res

Multinomial Logistic regression Train Accuracy ::  0.590361445783
Multinomial Logistic regression Test Accuracy ::  0.16


Unnamed: 0,index,year,name,driverRef,Medium,Soft,Super Soft,Ultra soft,Hard,M_m,...,Track Temp(mean),Track Temp(range),Tyre stress,Downforce,Lateral,Asphalt Abrasion,Number of laps,third set,mandatory combi,0
0,4,2017,Brazilian Grand Prix,verstappen,1.0,3.0,9.0,0.0,0.0,1.02047,...,30.4,5.4,3.0,4.0,4.0,3.0,71.0,Super soft,1.0,Medium
1,5,2017,Brazilian Grand Prix,ricciardo,1.0,4.0,8.0,0.0,0.0,1.045744,...,30.4,5.4,3.0,4.0,4.0,3.0,71.0,Super soft,1.0,Soft
2,15,2017,Brazilian Grand Prix,stroll,1.0,2.0,10.0,0.0,0.0,1.0,...,30.4,5.4,3.0,4.0,4.0,3.0,71.0,Super soft,1.0,Medium
3,69,2017,Japanese Grand Prix,stroll,1.0,4.0,8.0,0.0,0.0,1.0,...,29.216667,3.366667,5.0,3.0,5.0,3.0,53.0,Super soft,1.0,Medium
4,71,2017,Japanese Grand Prix,vandoorne,1.0,2.0,10.0,0.0,0.0,1.0,...,29.216667,3.366667,5.0,3.0,5.0,3.0,53.0,Super soft,1.0,Medium
5,73,2017,Japanese Grand Prix,gasly,1.0,3.0,9.0,0.0,0.0,1.0,...,29.216667,3.366667,5.0,3.0,5.0,3.0,53.0,Super soft,1.0,Medium
6,78,2017,Japanese Grand Prix,wehrlein,1.0,3.0,9.0,0.0,0.0,1.014619,...,29.216667,3.366667,5.0,3.0,5.0,3.0,53.0,Soft,1.0,Medium
7,94,2017,Malaysian Grand Prix,grosjean,1.0,5.0,7.0,0.0,0.0,1.107727,...,45.35,8.966667,4.0,3.0,4.0,3.0,56.0,Super soft,1.0,Soft
8,96,2017,Malaysian Grand Prix,hulkenberg,1.0,3.0,9.0,0.0,0.0,1.037634,...,45.35,8.966667,4.0,3.0,4.0,3.0,56.0,Super soft,1.0,Soft
9,123,2017,Italian Grand Prix,verstappen,1.0,2.0,10.0,0.0,0.0,1.02047,...,39.116667,4.766667,5.0,1.0,2.0,3.0,53.0,Super soft,1.0,Soft


#### Try classifiers and sampling techniques without tuning hyperparameters to see which method is most promising

In [131]:
njobs = 4
rs=12

In [132]:
# create a dummy class for no sampling
class NoSampling(object):
    
    def fit_sample(self, X, y):
        return X, y

# try out a bunch of classifiers to see which is worth pursuing further
classifiers = [
    ['Logistic Regression', LogisticRegressionCV(multi_class='multinomial', solver='newton-cg', penalty='l2', max_iter= 500, Cs=10, cv=5, verbose=1, n_jobs=njobs, random_state=rs)],
    ['Random Forest', RandomForestClassifier(random_state=rs)],
    ['KNN', KNeighborsClassifier(n_jobs=njobs)],
    ['SVM (linear kernel)', SVC(kernel = 'linear', probability=True, verbose=1, random_state=rs)],
    ['SVM (rbf kernel)', SVC(kernel = 'rbf', probability=True, verbose=1, random_state=rs)]
]

# list out the sampling techniques that we are comparing
samplers = [
    ['No Sampling', NoSampling()],
    ['Random Oversampler', RandomOverSampler(random_state=rs)],
    ['SMOTE', SMOTE(ratio='minority', k_neighbors=3, random_state=rs)],
    ['SMOTE Tomek', SMOTETomek(random_state=rs)],
    ['SMOTE ENN', SMOTEENN(random_state=rs)]
]

In [34]:
def sampling_classifier_loop(train_X, test_X, train_y, test_y):

    # initialise dataframe for capturing coefficients of each feature
    l_classifier = []
    l_resampler = []
    l_recall = []
    l_precision = []

    # loop through all the classifiers except the logreg with class weights
    for classifier in classifiers[1:4]:
        #print '------------------------'
        #print classifier[0].upper()
        #print '------------------------'

        # loop through all the types of samplers
        for sampler in samplers:

            #print 
            #print sampler[0]
            #print '------------------------'

            # perform sampling
            train_X_sm, train_y_sm = sampler[1].fit_sample(train_X, train_y)

            # return dataframe of sampling results
            #print 'Ratio of training set:'
            #print pd.Series(train_y_sm).value_counts() /  pd.Series(train_y_sm).value_counts().sum()

            # fit model on train set
            classifier[1].fit(train_X_sm, train_y_sm)

            # use model to predict y
            y_pred = classifier[1].predict(test_X)

            # display confusion matrix & classification report
            #print 'Confusion Matrix:'
            #print confusion_matrix(test_y, y_pred)
            #print classification_report(test_y, y_pred)

            # save
            l_recall.append(recall_score(test_y, y_pred, average=None))
            l_precision.append(precision_score(test_y, y_pred, average=None))
            l_classifier.append(classifier[0])
            l_resampler.append(sampler[0])

    return l_recall, l_precision, l_classifier, l_resampler

In [133]:
# identify the combination of classifier and resampler that obtains the best recall
def results(l_recall, l_precision, l_classifier, l_resampler, Rcols, Pcols):
    df_results = pd.DataFrame({'Classifier': l_classifier, 'Resampler':l_resampler, 
                              'Recall':l_recall, 'Precision': l_precision})
    df_results = df_results[['Classifier', 'Resampler', 'Recall', 'Precision']]
    df_results[Rcols] = pd.DataFrame(df_results.Recall.values.tolist(), index= df_results.index)
    df_results[Pcols] = pd.DataFrame(df_results.Precision.values.tolist(), index= df_results.index)
    df_results = df_results.drop(['Recall', 'Precision'], axis=1)
    
    return df_results

In [134]:
# Combi 1 stint 1:
s1c1_recall, s1c1_precision, s1c1_classifier, s1c1_resampler = sampling_classifier_loop(X1_c1, X1_c1_17, Y1_c1, Y1_c1_17)

Rcols = ['recall: M', 'recall: Soft', 'recall: SS']
Pcols = ['precision: M', 'precision: Soft', 'precision: SS']
df_results_s1c1 = results(s1c1_recall, s1c1_precision, s1c1_classifier, s1c1_resampler, Rcols, Pcols)
df_results_s1c1

# SVM Linear Kernel

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

Unnamed: 0,Classifier,Resampler,recall: M,recall: Soft,recall: SS,precision: M,precision: Soft,precision: SS
0,Random Forest,No Sampling,0.448276,0.642857,,0.224138,0.835052,
1,Random Forest,Random Oversampler,0.0,0.482759,0.579365,0.0,0.215385,0.829545
2,Random Forest,SMOTE,0.0,0.310345,0.539683,0.0,0.173077,0.8
3,Random Forest,SMOTE Tomek,0.0,0.517241,0.380952,0.0,0.189873,0.8
4,Random Forest,SMOTE ENN,0.0,0.655172,0.357143,0.0,0.223529,0.882353
5,KNN,No Sampling,0.0,0.275862,0.777778,0.0,0.235294,0.823529
6,KNN,Random Oversampler,0.0,0.37931,0.650794,0.0,0.22,0.828283
7,KNN,SMOTE,0.0,0.172414,0.68254,0.0,0.192308,0.834951
8,KNN,SMOTE Tomek,0.0,0.206897,0.404762,0.0,0.12766,0.796875
9,KNN,SMOTE ENN,0.0,0.310345,0.396825,0.0,0.2,0.847458


In [139]:
# Combi 1 stint 2:
s2c1_recall, s2c1_precision, s2c1_classifier, s2c1_resampler = sampling_classifier_loop(X2_c1, X2_c1_17, Y2_c1, Y2_c1_17)
df_results_s2c1 = results(s2c1_recall, s2c1_precision, s2c1_classifier, s2c1_resampler, Rcols, Pcols)
df_results_s2c1

# KNN + SMOTE gave the best results

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

Unnamed: 0,Classifier,Resampler,recall: M,recall: Soft,recall: SS,precision: M,precision: Soft,precision: SS
0,Random Forest,No Sampling,0.0,0.716981,0.030303,0.0,0.730769,0.142857
1,Random Forest,Random Oversampler,1.0,0.716981,0.060606,0.025,0.826087,0.25
2,Random Forest,SMOTE,0.0,0.537736,0.424242,0.0,0.77027,0.304348
3,Random Forest,SMOTE Tomek,0.0,0.330189,0.424242,0.0,0.777778,0.291667
4,Random Forest,SMOTE ENN,0.0,0.207547,0.636364,0.0,0.916667,0.308824
5,KNN,No Sampling,0.0,0.924528,0.0,0.0,0.771654,0.0
6,KNN,Random Oversampler,0.0,0.54717,0.212121,0.0,0.753247,0.205882
7,KNN,SMOTE,0.0,0.603774,0.393939,0.0,0.780488,0.265306
8,KNN,SMOTE Tomek,0.0,0.415094,0.424242,0.0,0.745763,0.237288
9,KNN,SMOTE ENN,0.0,0.330189,0.606061,0.0,0.813953,0.263158


In [572]:
# Combi 1 stint 3:
s3c1_recall, s3c1_precision, s3c1_classifier, s3c1_resampler = sampling_classifier_loop(X3_c1, X3_c1_17, Y3_c1, Y3_c1_17)
df_results_s3c1 = results(s3c1_recall, s3c1_precision, s3c1_classifier, s3c1_resampler, Rcols, Pcols)
df_results_s3c1

# KNN + SMOTE gave the best results

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

Unnamed: 0,Classifier,Resampler,recall: M,recall: None,recall: Soft,recall: SS,precision: M,precision: None,precision: Soft,precision: SS
0,Random Forest,No Sampling,0.0,0.318182,0.181818,0.205128,0.0,0.648148,0.04878,0.347826
1,Random Forest,Random Oversampler,0.0,0.254545,0.272727,0.051282,0.0,0.651163,0.056604,0.125
2,Random Forest,SMOTE,0.0,0.290909,0.090909,0.179487,0.0,0.581818,0.038462,0.170732
3,Random Forest,SMOTE Tomek,0.0,0.163636,0.090909,0.461538,0.0,0.72,0.111111,0.268657
4,Random Forest,SMOTE ENN,0.0,0.0,0.0,0.717949,0.0,0.0,0.0,0.269231
5,KNN,No Sampling,0.0,0.463636,0.545455,0.076923,0.0,0.69863,0.146341,0.428571
6,KNN,Random Oversampler,0.0,0.381818,0.090909,0.282051,0.0,0.7,0.043478,0.275
7,KNN,SMOTE,0.0,0.354545,0.363636,0.461538,0.0,0.78,0.142857,0.346154
8,KNN,SMOTE Tomek,0.0,0.327273,0.090909,0.282051,0.0,0.705882,0.04,0.275
9,KNN,SMOTE ENN,0.0,0.0,0.0,0.871795,0.0,0.0,0.0,0.295652


### Print prediction results

In [60]:
def print_pred_results(dftest, X_train, X_test, Y_train, Y_test, sampler, classifier):
    
    cXs_train_sm, cy_train_sm = sampler.fit_sample(X_train, Y_train)

    classifier.fit(cXs_train_sm, cy_train_sm)
    y_pred = classifier.predict(X_test)

    # Merge test prediction results back to test set
    p = pd.DataFrame(y_pred).reset_index(drop=True)
    results = pd.concat([dftest, p], axis=1)
    
    print "Train Accuracy :: ", accuracy_score(cy_train_sm, classifier.predict(cXs_train_sm))
    print "Test Accuracy :: ", accuracy_score(Y_test, y_pred)

    print 'Confusion Matrix:'
    print confusion_matrix(Y_test, y_pred)
    print classification_report(Y_test, y_pred) 
    
    return results

In [61]:
smote = SMOTE(ratio='minority', k=3, random_state=rs)
smoteen = SMOTEENN( random_state=rs)
smotetomek = SMOTETomek(random_state=rs)
knn = KNeighborsClassifier(n_jobs=njobs)
logreg = LogisticRegressionCV(multi_class='multinomial', solver='newton-cg', penalty='l2', max_iter= 500, Cs=10, cv=5, verbose=1, n_jobs=njobs, random_state=rs)

In [62]:
results_2017_c1_s1 = print_pred_results(XY1_c1_17, X1_c1, X1_c1_17, Y1_c1, Y1_c1_17, smotetomek, )
results_2017_c1_s1

Train Accuracy ::  0.856368563686
Test Accuracy ::  0.574193548387
Confusion Matrix:
[[ 0  0  0]
 [ 1 13 15]
 [ 9 41 76]]
             precision    recall  f1-score   support

     Medium       0.00      0.00      0.00         0
       Soft       0.24      0.45      0.31        29
 Super soft       0.84      0.60      0.70       126

avg / total       0.72      0.57      0.63       155



  'recall', 'true', average, warn_for)


Unnamed: 0,index,year,name,driverRef,Medium,Soft,Super Soft,Ultra soft,Hard,clusters(pit strategy),track info clusters,M_m,S_m,SS_m,first set,mandatory combi,0
0,0,2017,Brazilian Grand Prix,vettel,1.0,3.0,9.0,0.0,0.0,0,2,1.111487,1.176995,0.811017,Super soft,1.0,Soft
1,1,2017,Brazilian Grand Prix,bottas,1.0,4.0,8.0,0.0,0.0,0,2,1.252126,1.054087,0.942624,Super soft,1.0,Super soft
2,2,2017,Brazilian Grand Prix,raikkonen,1.0,3.0,9.0,0.0,0.0,0,2,1.009876,1.124674,0.901477,Super soft,1.0,Soft
3,3,2017,Brazilian Grand Prix,hamilton,1.0,4.0,8.0,0.0,0.0,0,2,0.851359,0.882515,1.063797,Soft,1.0,Medium
4,4,2017,Brazilian Grand Prix,verstappen,1.0,3.0,9.0,0.0,0.0,0,2,1.020470,0.911364,0.946199,Super soft,1.0,Super soft
5,5,2017,Brazilian Grand Prix,ricciardo,1.0,4.0,8.0,0.0,0.0,0,2,1.045744,1.010958,1.172284,Soft,1.0,Soft
6,7,2017,Brazilian Grand Prix,alonso,1.0,3.0,9.0,0.0,0.0,0,2,0.677418,1.018829,1.069176,Super soft,1.0,Soft
7,8,2017,Brazilian Grand Prix,perez,1.0,4.0,8.0,0.0,0.0,0,2,1.179297,1.215491,1.082196,Super soft,1.0,Soft
8,9,2017,Brazilian Grand Prix,hulkenberg,1.0,2.0,10.0,0.0,0.0,0,2,1.037634,0.961426,1.072898,Super soft,1.0,Super soft
9,10,2017,Brazilian Grand Prix,sainz,1.0,2.0,10.0,0.0,0.0,0,2,1.131192,0.985030,1.032151,Super soft,1.0,Super soft


In [140]:
results_2017_c1_s2 = print_pred_results(XY2_c1_17, X2_c1, X2_c1_17, Y2_c1, Y2_c1_17, smote, knn)
results_2017_c1_s2



Train Accuracy ::  0.773529411765
Test Accuracy ::  0.55
Confusion Matrix:
[[ 0  1  0]
 [ 6 64 36]
 [ 3 17 13]]
             precision    recall  f1-score   support

     Medium       0.00      0.00      0.00         1
       Soft       0.78      0.60      0.68       106
 Super soft       0.27      0.39      0.32        33

avg / total       0.65      0.55      0.59       140



Unnamed: 0,index,year,name,driverRef,Medium,Soft,Super Soft,Ultra soft,Hard,M_m,...,Track Temp(mean),Track Temp(range),Tyre stress,Downforce,Lateral,Asphalt Abrasion,Number of laps,second set,mandatory combi,0
0,0,2017,Brazilian Grand Prix,vettel,1.0,3.0,9.0,0.0,0.0,1.111487,...,30.400000,5.400000,3.0,4.0,4.0,3.0,71.0,Soft,1.0,Soft
1,1,2017,Brazilian Grand Prix,bottas,1.0,4.0,8.0,0.0,0.0,1.252126,...,30.400000,5.400000,3.0,4.0,4.0,3.0,71.0,Soft,1.0,Soft
2,2,2017,Brazilian Grand Prix,raikkonen,1.0,3.0,9.0,0.0,0.0,1.009876,...,30.400000,5.400000,3.0,4.0,4.0,3.0,71.0,Soft,1.0,Super soft
3,3,2017,Brazilian Grand Prix,hamilton,1.0,4.0,8.0,0.0,0.0,0.851359,...,30.400000,5.400000,3.0,4.0,4.0,3.0,71.0,Super soft,1.0,Super soft
4,4,2017,Brazilian Grand Prix,verstappen,1.0,3.0,9.0,0.0,0.0,1.020470,...,30.400000,5.400000,3.0,4.0,4.0,3.0,71.0,Soft,1.0,Super soft
5,5,2017,Brazilian Grand Prix,ricciardo,1.0,4.0,8.0,0.0,0.0,1.045744,...,30.400000,5.400000,3.0,4.0,4.0,3.0,71.0,Soft,1.0,Soft
6,7,2017,Brazilian Grand Prix,alonso,1.0,3.0,9.0,0.0,0.0,0.677418,...,30.400000,5.400000,3.0,4.0,4.0,3.0,71.0,Soft,1.0,Medium
7,8,2017,Brazilian Grand Prix,perez,1.0,4.0,8.0,0.0,0.0,1.179297,...,30.400000,5.400000,3.0,4.0,4.0,3.0,71.0,Soft,1.0,Medium
8,9,2017,Brazilian Grand Prix,hulkenberg,1.0,2.0,10.0,0.0,0.0,1.037634,...,30.400000,5.400000,3.0,4.0,4.0,3.0,71.0,Soft,1.0,Soft
9,10,2017,Brazilian Grand Prix,sainz,1.0,2.0,10.0,0.0,0.0,1.131192,...,30.400000,5.400000,3.0,4.0,4.0,3.0,71.0,Soft,1.0,Soft


In [141]:
results_2017_c1_s3 = print_pred_results(XY3_c1_17, X3_c1, X3_c1_17, Y3_c1, Y3_c1_17, smote, knn)
results_2017_c1_s3



Train Accuracy ::  0.705314009662
Test Accuracy ::  0.36
Confusion Matrix:
[[ 0  0  0]
 [ 5  6  0]
 [ 9 18 12]]
             precision    recall  f1-score   support

     Medium       0.00      0.00      0.00         0
       Soft       0.25      0.55      0.34        11
 Super soft       1.00      0.31      0.47        39

avg / total       0.83      0.36      0.44        50



Unnamed: 0,index,year,name,driverRef,Medium,Soft,Super Soft,Ultra soft,Hard,M_m,...,Track Temp(mean),Track Temp(range),Tyre stress,Downforce,Lateral,Asphalt Abrasion,Number of laps,third set,mandatory combi,0
0,4,2017,Brazilian Grand Prix,verstappen,1.0,3.0,9.0,0.0,0.0,1.02047,...,30.4,5.4,3.0,4.0,4.0,3.0,71.0,Super soft,1.0,Soft
1,5,2017,Brazilian Grand Prix,ricciardo,1.0,4.0,8.0,0.0,0.0,1.045744,...,30.4,5.4,3.0,4.0,4.0,3.0,71.0,Super soft,1.0,Soft
2,15,2017,Brazilian Grand Prix,stroll,1.0,2.0,10.0,0.0,0.0,1.0,...,30.4,5.4,3.0,4.0,4.0,3.0,71.0,Super soft,1.0,Soft
3,69,2017,Japanese Grand Prix,stroll,1.0,4.0,8.0,0.0,0.0,1.0,...,29.216667,3.366667,5.0,3.0,5.0,3.0,53.0,Super soft,1.0,Medium
4,71,2017,Japanese Grand Prix,vandoorne,1.0,2.0,10.0,0.0,0.0,1.0,...,29.216667,3.366667,5.0,3.0,5.0,3.0,53.0,Super soft,1.0,Medium
5,73,2017,Japanese Grand Prix,gasly,1.0,3.0,9.0,0.0,0.0,1.0,...,29.216667,3.366667,5.0,3.0,5.0,3.0,53.0,Super soft,1.0,Medium
6,78,2017,Japanese Grand Prix,wehrlein,1.0,3.0,9.0,0.0,0.0,1.014619,...,29.216667,3.366667,5.0,3.0,5.0,3.0,53.0,Soft,1.0,Medium
7,94,2017,Malaysian Grand Prix,grosjean,1.0,5.0,7.0,0.0,0.0,1.107727,...,45.35,8.966667,4.0,3.0,4.0,3.0,56.0,Super soft,1.0,Soft
8,96,2017,Malaysian Grand Prix,hulkenberg,1.0,3.0,9.0,0.0,0.0,1.037634,...,45.35,8.966667,4.0,3.0,4.0,3.0,56.0,Super soft,1.0,Soft
9,123,2017,Italian Grand Prix,verstappen,1.0,2.0,10.0,0.0,0.0,1.02047,...,39.116667,4.766667,5.0,1.0,2.0,3.0,53.0,Super soft,1.0,Medium


### Findings: Classfication results indicate overfit as training accuracy is much higher than that of test accuracy. Furthermore , precision and recall scores are not ideal. Perhaps, the feature selection could be fine-tuned further to enable better classification results