## Create dataset containing selected features 

In [1]:
%run F1_classes_func.ipynb

### Import datasets

#### 1) Dataframe of Race finish statuses and position numbers

In [2]:
df_results = pd.read_csv('./formula-1-race-data/results.csv')
df_results.head()

Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,positionText,positionOrder,points,laps,time,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId
0,1,18,1,1,22.0,1,1.0,1,1,10.0,58,34:50.6,5690616.0,39.0,2.0,01:27.5,218.3,1
1,2,18,2,2,3.0,5,2.0,2,2,8.0,58,5.478,5696094.0,41.0,3.0,01:27.7,217.586,1
2,3,18,3,3,7.0,7,3.0,3,3,6.0,58,8.163,5698779.0,41.0,5.0,01:28.1,216.719,1
3,4,18,4,4,5.0,11,4.0,4,4,5.0,58,17.181,5707797.0,58.0,7.0,01:28.6,215.464,1
4,5,18,5,1,23.0,3,5.0,5,5,4.0,58,18.014,5708630.0,43.0,1.0,01:27.4,218.385,1


#### 2) Dataframe containing qualifying information of each driver at each race

In [3]:
df_qualifying = pd.read_csv('./formula-1-race-data/qualifying.csv')
df_qualifying.head()

Unnamed: 0,qualifyId,raceId,driverId,constructorId,number,position,q1,q2,q3
0,1,18,1,1,22,1,1:26.572,1:25.187,1:26.714
1,2,18,9,2,4,2,1:26.103,1:25.315,1:26.869
2,3,18,5,1,23,3,1:25.664,1:25.452,1:27.079
3,4,18,13,6,2,4,1:25.994,1:25.691,1:27.178
4,5,18,2,2,3,5,1:25.960,1:25.518,1:27.236


#### 3) Dataframe containing pitstop information at each race

In [4]:
df_pitStops = pd.read_csv('./formula-1-race-data/pitStops.csv')
df_pitStops.head()

Unnamed: 0,raceId,driverId,stop,lap,time,duration,milliseconds
0,841,153,1,1,17:05:23,26.898,26898
1,841,30,1,1,17:05:52,25.021,25021
2,841,17,1,11,17:20:48,23.426,23426
3,841,4,1,12,17:22:34,23.251,23251
4,841,13,1,13,17:24:10,23.842,23842


#### 4) Dataframe containing weather labels and SC appearance labels (Overtaking figures and SC Laps can be ignored)

In [5]:
xl = pd.ExcelFile("./formula-1-race-data/cliptheapex.xlsx", encoding='utf-8')
xl.sheet_names
weather = xl.parse("Sheet16")
weather['name'] = weather['name'] + " Grand Prix"
weather.replace("Dy", "Dry", inplace=True)
weather.head()

Unnamed: 0,Race,name,SC Laps,weather,overtakes,year
0,1,Australian Grand Prix,12.0,Dry,12.0,2008
1,2,Malaysian Grand Prix,,Dry,5.0,2008
2,3,Bahrain Grand Prix,,Dry,13.0,2008
3,4,Spanish Grand Prix,9.0,Dry,2.0,2008
4,5,Turkish Grand Prix,2.0,Dry,11.0,2008


#### 5) Dataframe containing overtaking information of each driver at each race

In [6]:
df_overtake = pd.read_csv(directory+"df_overtake.csv")
df_overtake.head()

Unnamed: 0.1,Unnamed: 0,lap,name,driverRef,since pit,until pit,position,year,clear lap?,raceId
0,0,4,Australian Grand Prix,hamilton,3.0,12,5.0,2016,overtaker,948.0
1,1,5,Australian Grand Prix,ricciardo,4.0,7,8.0,2016,overtaker,948.0
2,2,5,Australian Grand Prix,bottas,4.0,12,16.0,2016,overtaker,948.0
3,3,9,Australian Grand Prix,bottas,8.0,8,14.0,2016,overtaker,948.0
4,4,10,Australian Grand Prix,ricciardo,9.0,2,6.0,2016,overtaker,948.0


#### 6) Dataframe containing cluster label of each driver at each race. 

In [7]:
# Cluster label is derived by heriachal clustering of each driver's tyre strategy 
tyre_strategy_clusters = read_from_pickle(directory, "tyre_strategy_clusters.pickle")
tyre_strategy_clusters.head()

Unnamed: 0,year,driverRef,name,clusters
0,2015,button,Australian Grand Prix,6
1,2015,ericsson,Australian Grand Prix,9
2,2015,hamilton,Australian Grand Prix,6
3,2015,hulkenberg,Australian Grand Prix,4
4,2015,massa,Australian Grand Prix,6


#### 7) Dataframe containing number an type of tyre sets selected by each driver at each race. 

In [8]:
xl = pd.ExcelFile("./formula-1-race-data/Selected_Sets.xlsx", encoding='utf-8')
xl.sheet_names
selected_sets = xl.parse("Selected_Sets_Copy")
selected_sets.head()

Unnamed: 0,year,name,driverRef,Medium,Soft,Super Soft,Ultra soft,Hard
0,2017,Brazilian Grand Prix,vettel,1,3,9,0,0
1,2017,Brazilian Grand Prix,bottas,1,4,8,0,0
2,2017,Brazilian Grand Prix,raikkonen,1,3,9,0,0
3,2017,Brazilian Grand Prix,hamilton,1,4,8,0,0
4,2017,Brazilian Grand Prix,verstappen,1,3,9,0,0


#### Import dataframes of drivers, races and teams identification information

In [9]:
df_drivers = pd.read_csv('./formula-1-race-data/drivers.csv')
df_races = pd.read_csv('./formula-1-race-data/races.csv')
df_constructors = pd.read_csv('./formula-1-race-data/constructors.csv')

# Cleaning:
# Some drivers have the same surnames, resolve this by replacing driverRef of the non-current driver with the full name
df_results.replace("max_verstappen", "verstappen", inplace=True)
df_results.replace("jolyon_palmer", "palmer", inplace=True)
df_results.replace("kevin_magnussen", "magnussen", inplace=True)
df_results.replace("brandon_hartley", "hartley", inplace=True)

df_drivers.replace("max_verstappen", "verstappen", inplace=True)
df_drivers.replace("jolyon_palmer", "palmer", inplace=True)
df_drivers.replace("kevin_magnussen", "magnussen", inplace=True)
df_drivers.replace("brandon_hartley", "hartley", inplace=True)

df_drivers.loc[75, "driverRef"] = 'jan_magnussen'
df_drivers.loc[49, "driverRef"] = 'jos_verstappen'
df_drivers.loc[155, "driverRef"] = 'jonathan_palmer'
df_drivers.loc[155, "driverRef"] = 'jonathan_palmer'
df_drivers.loc[813, "driverRef"] = 'di resta'

df_races.loc[942, "name"] = "Azerbaijan Grand Prix"

#### DATASET TEMPLATE/FRAME

In [10]:
df_tyres = read_from_pickle(directory, "df_tyres.pickle")
template = df_tyres[['year', 'name', 'driverRef']]
template.head()

Unnamed: 0,year,name,driverRef
0,2016,Abu Dhabi Grand Prix,hamilton
1,2016,Australian Grand Prix,hamilton
2,2016,Austrian Grand Prix,hamilton
3,2016,Azerbaijan Grand Prix,hamilton
4,2016,Bahrain Grand Prix,hamilton


### Functions to pre-process data

In [37]:
class DataPreprocess():
    
    """
    This class contains functions to calculate statistics along rows or columns.
    """
    def __init__(self, results, year_range, calc_method):
        self.results = results
        self.year_range = year_range
        self.calc_method = calc_method

    def remove_outliers(self, df, groupby_fields, agg_method, col, threshold):

        g = df.groupby(groupby_fields)[col].agg(agg_method).reset_index()

        # Filter out the outliers
        return g[g[col] < g[col].quantile([0, threshold])[threshold]]

    def calc_stats_wrapper(self, function, df, col, groupby_field, agg_method):

        g_all = pd.DataFrame()

        if (self.calc_method == "rolling_value"):

            ranges = [range(self.year_range[idx]-3, self.year_range[idx]) for idx,value in enumerate(self.year_range)]

            for r in ranges:
                g = function(df, r, col, groupby_field, agg_method)
                g['year'] = r[-1] + 1
                g_all = pd.concat([g_all, g])
                
                results = self.results[self.results['year'] == r[-1]+1]
                drivers = list(results.driverRef.unique())
                if groupby_field[0]=='driverRef':
                    g_all = g_all[g_all['driverRef'].isin(drivers)]

            return g_all

        elif (self.calc_method == 'one_year'):

            for r in self.year_range:
                try:
                    g = function(df, [r], col, groupby_field, agg_method)
                    g['year'] = r
                    g_all = pd.concat([g_all, g])
                except:
                    pass
            return g_all

        raise ValueError("Only rolling_value and one_year are available options for calc_method")

        
    def calc_proportion(self, df, yr_range, col, groupby_field, agg_method):
        """
        A multi-purpose function to find proportion of an element amongst a group.
        Eg. if agg_method = 'sum', then the function calculates proportion of an element summmed over a specified range of years.
        """  
        
        df = df[df['year'].isin([yr_range[-1]])] 
        g = df.groupby(groupby_field)[col].agg([agg_method]).reset_index()
        
        # Because we are finding the proportion amongst the drivers participating in a season, filter the drivers accordingly.
        results = self.results[self.results['year'] == yr_range[-1]+1]
        drivers = list(results.driverRef.unique())
        df = df[df['driverRef'].isin(drivers)]
        
        if len(groupby_field) > 1:

            df_overall = df.groupby(groupby_field[1:])[col].agg([agg_method]).reset_index()
            df_overall.rename(columns={agg_method: agg_method+' (overall)'}, inplace=True)
            df_new = pd.merge(g, df_overall, on=groupby_field[1:], how='left')

            df_new['proportion'] = (df_new[agg_method] / df_new[agg_method+' (overall)'])
            df_new.drop([agg_method, agg_method +' (overall)'], axis=1, inplace=True)

            return df_new

        elif len(groupby_field) == 1:

            total = float(df[col].agg([agg_method])[agg_method])

            for i, row in g.iterrows():
                g.loc[i, 'proportion'] = float(g.loc[i, agg_method]) / total
            g.drop([agg_method], axis=1, inplace=True)

            return g
        
        
    def calc_avg(self, df, yr_range, col, groupby_field, agg_method):
        """
        Functions to calculate average count of an element within a group over a specified range of years.
        """    
        df = df[df['year'].isin(yr_range)] 
        g = df.groupby(groupby_field)[col].agg([agg_method]).reset_index()
        return g
        
        
    def calc_rate(self, df, yr_range, col, groupby_fields, agg_method):
        """
        Function to calculate percentage of an element occurring over a specified range of years
        """     
        df = df[df['year'].isin(yr_range)] 

        g = pd.DataFrame(df.groupby(groupby_fields)[col].value_counts())
        g.rename(columns={col:agg_method}, inplace=True)
        g = g.reset_index()

        g_overall = pd.DataFrame(df.groupby(groupby_fields)[col].agg(agg_method).rename("total")).reset_index()

        g = pd.merge(g, g_overall, on=groupby_fields, how='left')
        g['percentage'] = (g[agg_method] / g['total']).apply(lambda x: round(x,2))

        gPT = pd.pivot_table(g, index=groupby_fields, columns=[col], values='percentage').reset_index()
        gPT.fillna(0, inplace=True)

        return gPT

### Class to create features

In [15]:
class CreateFeatures():
    
    """
    This class contains functions to pre-process datasets, calculate statistics and create new features.
    """
    def __init__(self, year_range, calc_method):
        self.calc_method = calc_method
        self.year_range = year_range
        
    def calc_indiv_stats(self, df_qualifying, df_results, weather, df_overtake, df_pitStops, df_races, df_drivers):
        

        # Feature: Qualifying position
        qual = self.preprocess_results(df_qualifying, df_races, df_drivers)
        qual = qual[['year', 'name', 'driverRef', 'position']]
        
        df_results_new = self.preprocess_results(df_results, df_races, df_drivers)
        results = self.categorize_finish_pos_status(df_results_new)
        
        PP = DataPreprocess(results, self.year_range, self.calc_method)
        
        # Feature: Race finish status
        status = PP.calc_stats_wrapper(PP.calc_rate, results, 'statusId', ['driverRef', 'name'], 'count')
        
        # Feature: DNF reason category
        pos = PP.calc_stats_wrapper(PP.calc_rate, results, 'position', ['driverRef', 'name'], 'count')
    
        weather = self.SC_binary_label(weather)
    
        # Feature: Safety car
        sc = PP.calc_stats_wrapper(PP.calc_rate, weather, 'SC', ['name'], 'count')

        # Feature: Wet weather rate
        ww = PP.calc_stats_wrapper(PP.calc_rate, weather, 'weather', ['name'], 'count')

        pitStops = self.preprocess_pitstops(df_pitStops, qual)
        pS_notouliers = PP.remove_outliers(pitStops, ['driverRef', 'name', 'year'], 'sum', "milliseconds", 0.95)
        
        overtake = self.preprocess_overtaking(df_overtake, qual)

        # Feature: Average number of overtakes per driver for the past 3 season
        o_avg = PP.calc_stats_wrapper(PP.calc_avg, overtake, 'count', ['driverRef'], 'mean')
        o_avg = o_avg.rename(columns={'mean': 'overtaking (avg)'})
        
        # Feature: Proportion of overtakes amongst drivers for the past 3 season
        o_d = PP.calc_stats_wrapper(PP.calc_proportion, overtake, 'count', ['driverRef'], 'sum')
        o_d = o_d.rename(columns={'proportion': 'overtaking prop(driver)'})

        # Feature: Proportion of overtakes amongst races for the past 3 season 
        o_r = PP.calc_stats_wrapper(PP.calc_proportion, overtake, 'count', ['name'], 'sum')
        o_r = o_r.rename(columns={'proportion': 'overtaking prop(track)'})

        # Feature: Average number of overtakes per driver for the past 3 season
        pS_avg = PP.calc_stats_wrapper(PP.calc_avg, pS_notouliers, 'milliseconds', ['driverRef'], 'mean')
        pS_avg = pS_avg.rename(columns={'mean': 'pitStop timing (avg)'})
        
        # Feature: Proportion of pitStop timings amongst drivers for the past 3 season
        pS_d = PP.calc_stats_wrapper(PP.calc_proportion, pS_notouliers, 'milliseconds', ['driverRef'], 'sum')
        pS_d = pS_d.rename(columns={'proportion': 'pitStop timing prop(driver)'})

        # Feature: Proportion of pitStop timings amongst races for the past 3 season
        pS_r = PP.calc_stats_wrapper(PP.calc_proportion, pS_notouliers, 'milliseconds', ['name'], 'sum')
        pS_r = pS_r.rename(columns={'proportion': 'pitStop timing prop(race)'})
        
        # Target Variable: StatusId
        target_var = self.extract_target_variable(results)

        return results, weather, pitStops, overtake, qual, status, pos, sc, ww, \
               o_avg, o_d, o_r, pS_avg, pS_d, pS_r, target_var
   
    
    def preprocess_results(self, data, df_races, df_drivers):
        
        results = pd.merge(data, df_drivers[['driverId', 'driverRef']], on=['driverId'], how='left')
        results = pd.merge(results, df_races[['raceId', 'year', 'name']], on=['raceId'], how='left')
        #results = pd.merge(results, df_constructors[['constructorId', 'constructorRef']], on=['constructorId'], how='left')

        return results
    
    def preprocess_qualifying_pos(self, data, df_races, df_drivers):

        qual_pos = data[['raceId', 'driverId', 'position']]
        qual_pos = self.preprocess_results(qual_pos, df_races, df_drivers)
        qual_pos = qual_pos[qual_pos['year'].isin(self.year_range)]
        qual_pos.drop(['raceId', 'driverId'], axis=1, inplace=True)
        
        return qual_pos
    
    def categorize_finish_pos_status(self, data):
    
        # Feature: Finish position
        results = data.copy()
        results['position'] = results['position'].replace(range(1,4) ,"Podium")
        results['position'] = results['position'].replace(range(5,10) , "Pos 4 to 10")
        results['position'] = results['position'].replace(np.nan , "Did not finish")
        mask = ~results['position'].isin(['Podium',"Pos 4 to 10", "Did not finish"])
        results['position'] = results['position'].mask(mask, "Pos > 10")

        # Feature: Reason category for not finishing race
        results['statusId'] = results['statusId'].replace([1,11,12,13,14] ,"Finished")
        results['statusId'] = results['statusId'].replace([3,4] , "Accident / Collision")
        mask = ~results['statusId'].isin(['Finished',"Accident / Collision"])
        results['statusId'] = results['statusId'].mask(mask, "Technical Failure")

        return results  
    
    def SC_binary_label(self, data):
        
        # Feature: SC Appearance in race
        data['SC Laps'].fillna(0, inplace=True)
        data['SC'] = np.where(data['SC Laps'] > 0, "SC", "No SC")
        
        return data
            
    def preprocess_overtaking(self, data, qual):
    
        df_overtake = data[data['clear lap?'] == 'overtaker']
        g = df_overtake.groupby(['year','driverRef', 'name'])['clear lap?'].agg(['count']).reset_index()
        g = pd.merge(qual[['year', 'name', 'driverRef']], g, on=['year', 'name', 'driverRef'], how='left').fillna(0)
        
        return g
    
    def preprocess_pitstops(self, data, qual):
    
        pitStops = self.preprocess_results(data, df_races, df_drivers)
        g = pd.merge(qual[['year', 'name', 'driverRef']], pitStops, on=['year', 'name', 'driverRef'], how='left').fillna(0)
        g = g.sort_values('stop', ascending=False).groupby(['year', 'name', 'driverRef']).first().reset_index()  

        return g
    
    def extract_target_variable(self, data):
        
        status = data[['year', 'name', 'driverRef', 'statusId']]
        status.replace('Finished', 1, inplace=True)
        status.replace('Accident / Collision', 0, inplace=True)
        status.replace('Technical Failure', 0, inplace=True)

        return status

    

### Class to create dataset

In [18]:
class CreateDataset():
    def __init__(self, add_qual_pos=True, add_status=True, add_finish_pos=True, add_safety_car=True, add_weather=True, 
                 add_overtaking=True, add_pitStop=True, add_tyre_sets=True, add_clusters=True):

        self.add_qual_pos = add_qual_pos
        self.add_status = add_status
        self.add_finish_pos = add_finish_pos
        self.add_safety_car = add_safety_car
        self.add_weather = add_weather
        self.add_overtaking = add_overtaking
        self.add_pitStop = add_pitStop
        self.add_tyre_sets = add_tyre_sets
        self.add_clusters = add_clusters

    def merge_all_stats(self, template, qual, status, pos, sc, ww, o_avg, o_d, o_r, pS_avg, pS_d, pS_r,
                        target_var, tyre_sets, tyre_strategy_clusters):

        # Template to merge all feature to
        df = template.copy()
        
        # Merge dataframe containing target variable
        df = pd.merge(df, target_var, on=['year', 'name', 'driverRef'], how='left')
        
        # Feature: Qualifying position
        if self.add_qual_pos==True:   
            df = pd.merge(df, qual, on=['year', 'name', 'driverRef'], how='left')
        
        # Feature: Finishing position
        if self.add_finish_pos==True:   
            pos = pos.drop(['Pos > 10'], axis=1)
            df = pd.merge(df, pos, on=['year','name', 'driverRef'], how='left')

        # Feature: DNF reason category
        if self.add_status==True:
            status = status.drop(['Technical Failure'], axis=1)
            df = pd.merge(df, status, on=['year','name', 'driverRef'], how='left')
          
        # Feature: Safety Car
        if self.add_safety_car==True:
            sc = sc.drop(['No SC'], axis=1)
            df = pd.merge(df, sc, on=['year','name'], how='left')
           
        # Feature: Wet weather rate
        if self.add_weather==True:
            ww = ww.drop(['Varied'], axis=1)
            df = pd.merge(df, ww, on=['year','name'], how='left') 
            
        # Feature: Overtaking
        if self.add_overtaking==True:
            df = pd.merge(df, o_avg, on=['year', 'driverRef'], how='left') 
            df = pd.merge(df, o_d, on=['year', 'driverRef'], how='left') 
            df = pd.merge(df, o_r, on=['year', 'name'], how='left') 
     
        # Feature: Pitstop Timings
        if self.add_pitStop==True:
            df = pd.merge(df, pS_d, on=['year', 'driverRef'], how='left')   
            df = pd.merge(df, pS_r, on=['year', 'name'], how='left')  

        # Feature: Selected Tyre Sets as ordinal categorical vaues
        if self.add_tyre_sets == True:
            df = pd.merge(df, selected_sets, on=['year', 'name', 'driverRef'], how='left') 

        # Feature: Tyre Strategy Clusters
        if self.add_clusters==True:
            df = pd.merge(df, tyre_strategy_clusters, on=['year', 'name', 'driverRef'], how='left')
        
        return df
    
    def handling_missing_values(self, df):
     
        df = self.miscellaneous_cleaning(df)
        
        imputer = Imputer(missing_values='NaN', strategy='median', axis=0)
        df_new = pd.DataFrame(imputer.fit_transform(df.drop(['year', 'name', 'driverRef', 'statusId'], axis=1)))
        df_new = pd.concat([df[['year', 'name', 'driverRef', 'statusId']], df_new], axis=1)
        df_new.columns = df.columns 
        
        return df_new

    def miscellaneous_cleaning(self, df):
        
        # Null values for drivers who did not set a qual time or participate in qualifying
        if self.add_qual_pos==True: 
            df['position'].fillna(22, inplace=True)
            
        # Null values for drivers who did not set a qual time or participate in qualifying
        if self.add_qual_pos==True: 
            df['position'].fillna(22, inplace=True)
            
        # Null values for drivers who did not set a qual time or participate in qualifying
        if self.add_clusters==True:
            df['clusters'].fillna(21, inplace=True)
              
        return df

#### Dataset 1
- Values of features are aggregated over the passt three seasons
- ALL features

In [38]:
cf = CreateFeatures([2015, 2016, 2017], 'rolling_value')
results, weather, pitStops, overtake, qual, status, pos, sc, ww, o_avg, o_d, o_r, pS_avg, pS_d, pS_r, target_var =\
cf.calc_indiv_stats(df_qualifying, df_results, weather, df_overtake, df_pitStops, df_races, df_drivers)

In [40]:
cd = CreateDataset()
dataset = cd.merge_all_stats(template, qual, status, pos, sc, ww, o_avg, o_d, o_r, pS_avg, pS_d, pS_r, target_var, 
                            selected_sets, tyre_strategy_clusters)
dataset.isnull().sum()

year                             0
name                             0
driverRef                        0
statusId                         4
position                        13
Did not finish                 346
Podium                         346
Pos 4 to 10                    346
Accident / Collision           346
Finished                       346
SC                              42
Dry                             42
Wet                             42
overtaking (avg)               283
overtaking prop(driver)        283
overtaking prop(track)          42
pitStop timing prop(driver)    283
pitStop timing prop(race)       42
Medium                         376
Soft                           376
Super Soft                     376
Ultra soft                     376
Hard                           376
clusters                        98
dtype: int64

In [41]:
dataset_new = cd.handling_missing_values(dataset)
dataset_new.isnull().sum()

year                           0
name                           0
driverRef                      0
statusId                       4
position                       0
Did not finish                 0
Podium                         0
Pos 4 to 10                    0
Accident / Collision           0
Finished                       0
SC                             0
Dry                            0
Wet                            0
overtaking (avg)               0
overtaking prop(driver)        0
overtaking prop(track)         0
pitStop timing prop(driver)    0
pitStop timing prop(race)      0
Medium                         0
Soft                           0
Super Soft                     0
Ultra soft                     0
Hard                           0
clusters                       0
dtype: int64

In [42]:
# There are 4 rows with null values for statusId. 
# They belong to hartley, a driver who joined toro rosso only in the lasst 4 races. Seems that the Eargast API results were not updated to reflect the changes.
# Check the official results and inpute the statusId accordingly.
dataset_new[dataset_new['statusId'].isnull()]

Unnamed: 0,year,name,driverRef,statusId,position,Did not finish,Podium,Pos 4 to 10,Accident / Collision,Finished,...,overtaking prop(driver),overtaking prop(track),pitStop timing prop(driver),pitStop timing prop(race),Medium,Soft,Super Soft,Ultra soft,Hard,clusters
1130,2017,Abu Dhabi Grand Prix,hartley,,22.0,0.0,0.0,0.33,0.0,1.0,...,0.061224,0.0,0.057475,0.063397,0.0,1.0,3.0,9.0,0.0,21.0
1131,2017,Brazilian Grand Prix,hartley,,22.0,0.0,0.0,0.33,0.0,1.0,...,0.061224,0.0,0.057475,0.032301,1.0,4.0,8.0,0.0,0.0,21.0
1132,2017,Mexican Grand Prix,hartley,,22.0,0.0,0.0,0.33,0.0,1.0,...,0.061224,0.069841,0.057475,0.064512,0.0,1.0,2.0,10.0,0.0,21.0
1133,2017,United States Grand Prix,hartley,,22.0,0.0,0.0,0.33,0.0,1.0,...,0.061224,0.069841,0.057475,0.069947,0.0,2.0,3.0,8.0,0.0,21.0


In [43]:
dataset_new.loc[1130, 'statusId'] = 1
dataset_new.loc[1131, 'statusId'] = 0
dataset_new.loc[1132, 'statusId'] = 0
dataset_new.loc[1133, 'statusId'] = 1

In [44]:
dataset_new.to_csv(directory+'dataset.csv', index=False)

#### Dataset 2
- Values of features are aggregated over the passt three seasons
- Clusters removed

In [24]:
cd_2 = CreateDataset(add_clusters=False)
dataset_2 = cd_2.merge_all_stats(template, qual, status, pos, sc, ww, o_avg, o_d, o_r, pS_avg, pS_d, pS_r,
                                 target_var, selected_sets, tyre_strategy_clusters)
dataset_2 = cd_2.handling_missing_values(dataset_2)
dataset_2.loc[1130, 'statusId'] = 1
dataset_2.loc[1131, 'statusId'] = 0
dataset_2.loc[1132, 'statusId'] = 0
dataset_2.loc[1133, 'statusId'] = 1
print dataset_2.columns

dataset_2.to_csv(directory+'dataset_2.csv', index=False)

Index([                       u'year',                        u'name',
                         u'driverRef',                    u'statusId',
                          u'position',              u'Did not finish',
                            u'Podium',                 u'Pos 4 to 10',
              u'Accident / Collision',                    u'Finished',
                                u'SC',                         u'Dry',
                               u'Wet',            u'overtaking (avg)',
           u'overtaking prop(driver)',      u'overtaking prop(track)',
       u'pitStop timing prop(driver)',   u'pitStop timing prop(race)',
                            u'Medium',                        u'Soft',
                        u'Super Soft',                  u'Ultra soft',
                              u'Hard'],
      dtype='object')


#### Dataset 3
- Values of features are aggregated over the passt three seasons
- Remove features that showed in visualization to have no clear relationship with target variable

In [25]:
cd_3 = CreateDataset(add_overtaking=False)

dataset_3 = cd_3.merge_all_stats(template, qual, status, pos, sc, ww, o_avg, o_d, o_r, pS_avg, pS_d, pS_r,
                                 target_var, selected_sets, tyre_strategy_clusters)
dataset_3 = cd_3.handling_missing_values(dataset_3)
dataset_3.loc[1130, 'statusId'] = 1
dataset_3.loc[1131, 'statusId'] = 0
dataset_3.loc[1132, 'statusId'] = 0
dataset_3.loc[1133, 'statusId'] = 1
print dataset_3.columns

dataset_3.to_csv(directory+'dataset_3.csv', index=False)

Index([                       u'year',                        u'name',
                         u'driverRef',                    u'statusId',
                          u'position',              u'Did not finish',
                            u'Podium',                 u'Pos 4 to 10',
              u'Accident / Collision',                    u'Finished',
                                u'SC',                         u'Dry',
                               u'Wet', u'pitStop timing prop(driver)',
         u'pitStop timing prop(race)',                      u'Medium',
                              u'Soft',                  u'Super Soft',
                        u'Ultra soft',                        u'Hard',
                          u'clusters'],
      dtype='object')


#### Dataset
- Values of features are NOT aggregated

In [30]:
cf_noagg = CreateFeatures([2015, 2016, 2017], 'one_year')
results1, weather1, pitStops1, overtake1, qual1, status1, pos1, sc1, ww1, o_avg1, o_d1, o_r1, pS_avg1, pS_d1, pS_r1, target_var1  =\
cf_noagg.calc_indiv_stats(df_qualifying, df_results, weather, df_overtake, df_pitStops, df_races, df_drivers)

In [31]:
cd_noagg = CreateDataset()
dataset_noagg = cd_noagg.merge_all_stats(template, qual1, status1, pos1, sc1, ww1,
                                         o_avg1, o_d1, o_r1, pS_avg1, pS_d1, pS_r1,
                                         target_var1, selected_sets, tyre_strategy_clusters)

In [32]:
dataset_noagg_new = cd.handling_missing_values(dataset_noagg)
dataset_noagg_new.loc[1130, 'statusId'] = 1
dataset_noagg_new.loc[1131, 'statusId'] = 0
dataset_noagg_new.loc[1132, 'statusId'] = 0
dataset_noagg_new.loc[1133, 'statusId'] = 1

dataset_noagg_new.to_csv(directory+'dataset_noagg_new.csv', index=False)