# Race data set creation

## Imports

In [1]:
from google.colab import drive
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


## Read in F1 datasets

In [3]:
dataframes = {
    'circuits_df': pd.read_csv('/content/drive/My Drive/200_Final_Project/f1db_csv/circuits.csv'),
    'constructor_standings_df': pd.read_csv('/content/drive/My Drive/200_Final_Project/f1db_csv/constructor_standings.csv'),
    'constructors_df': pd.read_csv('/content/drive/My Drive/200_Final_Project/f1db_csv/constructors.csv'),
    'driver_standings_df': pd.read_csv('/content/drive/My Drive/200_Final_Project/f1db_csv/driver_standings.csv'),
    'drivers_df': pd.read_csv('/content/drive/My Drive/200_Final_Project/f1db_csv/drivers.csv'),
    'races_df': pd.read_csv('/content/drive/My Drive/200_Final_Project/f1db_csv/races.csv'),
    'results_df': pd.read_csv('/content/drive/My Drive/200_Final_Project/f1db_csv/results.csv'),
    'status_df': pd.read_csv('/content/drive/My Drive/200_Final_Project/f1db_csv/status.csv')
}

In [4]:
circuits_df = dataframes['circuits_df']
constructor_standings_df = dataframes['constructor_standings_df']
constructors_df = dataframes['constructors_df']
driver_standings_df = dataframes['driver_standings_df']
drivers_df = dataframes['drivers_df']
races_df = dataframes['races_df']
results_df = dataframes['results_df']
status_df = dataframes['status_df']

## Merge dataframes to create race dataset

In [5]:
race_circuits = pd.merge(races_df, circuits_df, on='circuitId', how='left', suffixes=('_race', '_circuits'))
res_race_circuits = pd.merge(results_df, race_circuits, on='raceId', how='left', suffixes=('_results', '_race'))
merge_driver_df = pd.merge(res_race_circuits, drivers_df, on='driverId', how='left', suffixes=('_race', '_drivers'))
merge_status_df = pd.merge(merge_driver_df, status_df, on='statusId', how='left', suffixes=('_drivers', '_status'))
merge_constructors_df = pd.merge(merge_status_df, constructors_df, on='constructorId', how='left', suffixes=('_status', '_constructors'))
merge_drivers_standing_df = pd.merge(merge_constructors_df, driver_standings_df, on=['raceId', 'driverId'], how='left', suffixes=('_constructors', '_drivers'))
merge_constructors_standing_df = pd.merge(merge_drivers_standing_df, constructor_standings_df, on=['raceId', 'constructorId'], how='left', suffixes=('_driver', '_constructor'))

In [6]:
filtered_df = merge_constructors_standing_df[(merge_constructors_standing_df['year'] >= 2018) & (merge_constructors_standing_df['year'] <= 2023)].reset_index()

In [7]:
filtered_df.head()

Unnamed: 0,index,resultId,raceId,driverId,constructorId,number_race,grid,position_constructors,positionText_constructors,positionOrder,points_constructors,laps,time_results,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId,year,round,circuitId,name_race,date,time_race,url_race,fp1_date,fp1_time,fp2_date,fp2_time,fp3_date,fp3_time,quali_date,quali_time,sprint_date,sprint_time,circuitRef,name_circuits,location,country,lat,lng,alt,url_circuits,driverRef,number_drivers,code,forename,surname,dob,nationality_status,url_status,status,constructorRef,name,nationality_constructors,url_constructors,driverStandingsId,points_drivers,position_drivers,positionText_drivers,wins_driver,constructorStandingsId,points,position,positionText,wins_constructor
0,23777,23782,989,20,6,5,3,1,1,1,25.0,58,1:29:33.283,5373283,53,4,1:26.469,220.782,1,2018,1,1,Australian Grand Prix,2018-03-25,05:10:00,http://en.wikipedia.org/wiki/2018_Australian_G...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,http://en.wikipedia.org/wiki/Melbourne_Grand_P...,vettel,5,VET,Sebastian,Vettel,1987-07-03,German,http://en.wikipedia.org/wiki/Sebastian_Vettel,Finished,ferrari,Ferrari,Italian,http://en.wikipedia.org/wiki/Scuderia_Ferrari,68609.0,25.0,1.0,1,1.0,26933.0,40.0,1.0,1,1.0
1,23778,23783,989,1,131,44,1,2,2,2,18.0,58,+5.036,5378319,50,3,1:26.444,220.845,1,2018,1,1,Australian Grand Prix,2018-03-25,05:10:00,http://en.wikipedia.org/wiki/2018_Australian_G...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,http://en.wikipedia.org/wiki/Melbourne_Grand_P...,hamilton,44,HAM,Lewis,Hamilton,1985-01-07,British,http://en.wikipedia.org/wiki/Lewis_Hamilton,Finished,mercedes,Mercedes,German,http://en.wikipedia.org/wiki/Mercedes-Benz_in_...,68610.0,18.0,2.0,2,0.0,26934.0,22.0,2.0,2,0.0
2,23779,23784,989,8,6,7,2,3,3,3,15.0,58,+6.309,5379592,57,2,1:26.373,221.027,1,2018,1,1,Australian Grand Prix,2018-03-25,05:10:00,http://en.wikipedia.org/wiki/2018_Australian_G...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,http://en.wikipedia.org/wiki/Melbourne_Grand_P...,raikkonen,7,RAI,Kimi,Räikkönen,1979-10-17,Finnish,http://en.wikipedia.org/wiki/Kimi_R%C3%A4ikk%C...,Finished,ferrari,Ferrari,Italian,http://en.wikipedia.org/wiki/Scuderia_Ferrari,68611.0,15.0,3.0,3,0.0,26933.0,40.0,1.0,1,1.0
3,23780,23785,989,817,9,3,8,4,4,4,12.0,58,+7.069,5380352,54,1,1:25.945,222.128,1,2018,1,1,Australian Grand Prix,2018-03-25,05:10:00,http://en.wikipedia.org/wiki/2018_Australian_G...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,http://en.wikipedia.org/wiki/Melbourne_Grand_P...,ricciardo,3,RIC,Daniel,Ricciardo,1989-07-01,Australian,http://en.wikipedia.org/wiki/Daniel_Ricciardo,Finished,red_bull,Red Bull,Austrian,http://en.wikipedia.org/wiki/Red_Bull_Racing,68612.0,12.0,4.0,4,0.0,26935.0,20.0,3.0,3,0.0
4,23781,23786,989,4,1,14,10,5,5,5,10.0,58,+27.886,5401169,57,7,1:26.978,219.489,1,2018,1,1,Australian Grand Prix,2018-03-25,05:10:00,http://en.wikipedia.org/wiki/2018_Australian_G...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,http://en.wikipedia.org/wiki/Melbourne_Grand_P...,alonso,14,ALO,Fernando,Alonso,1981-07-29,Spanish,http://en.wikipedia.org/wiki/Fernando_Alonso,Finished,mclaren,McLaren,British,http://en.wikipedia.org/wiki/McLaren,68613.0,10.0,5.0,5,0.0,26936.0,12.0,4.0,4,0.0


In [8]:
cols_to_keep = [
    'driverId', 'forename', 'surname', 'name',
    'raceId', 'date', 'time_race', 'location', 'country', 'lat', 'lng',
    'name_circuits', 'circuitRef',  'round',
    'grid', 'position_constructors', 'laps', 'fastestLap', 'milliseconds','points_constructors',
    'status', 'points_drivers', 'position_drivers', 'points', 'position', 'positionOrder'
]

race_df = filtered_df[cols_to_keep]

In [9]:
rename_dict = {
    'forename': 'first_name',
    'surname': 'last_name',
    'name': 'team_name',
    'raceId': 'race_id',
    'date': 'race_date',
    'time_race': 'race_time_utc',
    'name_circuits': 'circuit_name',
    'circuitRef': 'circuit_ref',
    'grid': 'starting_race_position',
    'position_constructors': 'team_position_after_race',
    'points_constructors': 'race_points',
    'points_drivers': 'driver_current_points',
    'position_drivers': 'driver_current_position',
    'milliseconds': 'race_time',
    'fastestLap': 'fastest_lap',
    'points': 'constructor_current_points',
    'position': 'constructor_position',
    'positionOrder': 'ending_race_position'
}

race_df = race_df.rename(columns=rename_dict)

In [10]:
race_df.head()

Unnamed: 0,driverId,first_name,last_name,team_name,race_id,race_date,race_time_utc,location,country,lat,lng,circuit_name,circuit_ref,round,starting_race_position,team_position_after_race,laps,fastest_lap,race_time,race_points,status,driver_current_points,driver_current_position,constructor_current_points,constructor_position,ending_race_position
0,20,Sebastian,Vettel,Ferrari,989,2018-03-25,05:10:00,Melbourne,Australia,-37.8497,144.968,Albert Park Grand Prix Circuit,albert_park,1,3,1,58,53,5373283,25.0,Finished,25.0,1.0,40.0,1.0,1
1,1,Lewis,Hamilton,Mercedes,989,2018-03-25,05:10:00,Melbourne,Australia,-37.8497,144.968,Albert Park Grand Prix Circuit,albert_park,1,1,2,58,50,5378319,18.0,Finished,18.0,2.0,22.0,2.0,2
2,8,Kimi,Räikkönen,Ferrari,989,2018-03-25,05:10:00,Melbourne,Australia,-37.8497,144.968,Albert Park Grand Prix Circuit,albert_park,1,2,3,58,57,5379592,15.0,Finished,15.0,3.0,40.0,1.0,3
3,817,Daniel,Ricciardo,Red Bull,989,2018-03-25,05:10:00,Melbourne,Australia,-37.8497,144.968,Albert Park Grand Prix Circuit,albert_park,1,8,4,58,54,5380352,12.0,Finished,12.0,4.0,20.0,3.0,4
4,4,Fernando,Alonso,McLaren,989,2018-03-25,05:10:00,Melbourne,Australia,-37.8497,144.968,Albert Park Grand Prix Circuit,albert_park,1,10,5,58,57,5401169,10.0,Finished,10.0,5.0,12.0,4.0,5


## Check data quality & perform data preprocessing

In [11]:
race_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 26 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   driverId                    2500 non-null   int64  
 1   first_name                  2500 non-null   object 
 2   last_name                   2500 non-null   object 
 3   team_name                   2500 non-null   object 
 4   race_id                     2500 non-null   int64  
 5   race_date                   2500 non-null   object 
 6   race_time_utc               2500 non-null   object 
 7   location                    2500 non-null   object 
 8   country                     2500 non-null   object 
 9   lat                         2500 non-null   float64
 10  lng                         2500 non-null   float64
 11  circuit_name                2500 non-null   object 
 12  circuit_ref                 2500 non-null   object 
 13  round                       2500 

In [12]:
# Check missing data
total = race_df.isnull().sum().sort_values(ascending=False)
percent = (race_df.isnull().sum()/race_df.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
print(missing_data)

                            Total  Percent
driverId                        0      0.0
first_name                      0      0.0
constructor_position            0      0.0
constructor_current_points      0      0.0
driver_current_position         0      0.0
driver_current_points           0      0.0
status                          0      0.0
race_points                     0      0.0
race_time                       0      0.0
fastest_lap                     0      0.0
laps                            0      0.0
team_position_after_race        0      0.0
starting_race_position          0      0.0
round                           0      0.0
circuit_ref                     0      0.0
circuit_name                    0      0.0
lng                             0      0.0
lat                             0      0.0
country                         0      0.0
location                        0      0.0
race_time_utc                   0      0.0
race_date                       0      0.0
race_id    

In [13]:
# Check for duplicates
race_df.nunique()

driverId                        37
first_name                      37
last_name                       37
team_name                       15
race_id                        125
race_date                      125
race_time_utc                   24
location                        31
country                         27
lat                             31
lng                             31
circuit_name                    31
circuit_ref                     31
round                           22
starting_race_position          21
team_position_after_race        21
laps                            78
fastest_lap                     78
race_time                     1380
race_points                     23
status                          53
driver_current_points          323
driver_current_position         23
constructor_current_points     368
constructor_position            10
ending_race_position            20
dtype: int64

In [14]:
# Checking for data entry errors
print(race_df.team_name.unique())
print(race_df.team_name.nunique())

['Ferrari' 'Mercedes' 'Red Bull' 'McLaren' 'Renault' 'Force India'
 'Sauber' 'Williams' 'Toro Rosso' 'Haas F1 Team' 'Alfa Romeo'
 'Racing Point' 'AlphaTauri' 'Aston Martin' 'Alpine F1 Team']
15


We want to keep it coherent so we will remove the "F1 Team" portions from these entries.

In [15]:
race_df['team_name'] = race_df['team_name'].replace({'Alpine F1 Team': 'Alpine', 'Haas F1 Team': 'Haas'})

In [16]:
print(race_df.status.unique())
print(race_df.status.nunique())

['Finished' '+1 Lap' 'Wheel' 'Engine' 'Steering' 'Brakes' 'Puncture'
 'Electrical' 'Collision damage' 'Accident' 'Collision' '+2 Laps'
 '+3 Laps' 'Gearbox' 'Oil leak' 'Turbo' 'Exhaust' 'Fuel pressure'
 'Hydraulics' 'Power Unit' 'Tyre' 'Retired' 'Power loss' 'Suspension'
 'Disqualified' 'Mechanical' 'Battery' 'Overheating' 'Damage'
 'Out of fuel' 'Transmission' 'Spun off' 'Water pressure' 'Withdrew'
 'Electronics' '+5 Laps' 'Debris' 'Radiator' 'Illness' 'Wheel nut'
 'Driveshaft' 'Rear wing' 'Cooling system' 'Water pump' 'Fuel leak'
 'Front wing' 'Water leak' 'Vibrations' 'Fuel pump' 'Undertray' '+6 Laps'
 'Differential' 'Technical']
53


In [17]:
race_df.shape

(2500, 26)

If the driver did not start at all, then we want to remove them from the analysis. Therefore, we will remove rows with 'Disqualified,' 'Illness,' and 'Withdrew' statuses.

In [18]:
race_df = race_df[~race_df['status'].isin(['Disqualified', 'Illness', 'Withdrew'])]

In [19]:
race_df.shape

(2484, 26)

If a driver has driven below 38 races, minimum of 2 seasons, we are excluding them from the analysis. We want to analyze drivers that have had the opportunity to race many times in our designed conditions.

In [20]:
driver_count = race_df.last_name.value_counts()

In [21]:
race_df = race_df[race_df['last_name'].isin(driver_count[driver_count >= 38].index)]

In [22]:
race_df.shape

(2294, 26)

If a team has less than 38 entries, we are excluding them from the analysis. We want to analyze teams that have had the opportunity to race as many times in our designed conditions.

In [23]:
team_count = race_df.team_name.value_counts()

In [24]:
race_df = race_df[race_df['team_name'].isin(team_count[team_count >= 38].index)]

In [25]:
race_df.shape

(2273, 26)

Removing Drivers in reserve conditions, just Hulkenberg pretty much

In [26]:
race_df.groupby(['team_name', 'last_name'])['last_name'].size().reset_index(name='count').head()

Unnamed: 0,team_name,last_name,count
0,Alfa Romeo,Bottas,44
1,Alfa Romeo,Giovinazzi,60
2,Alfa Romeo,Räikkönen,58
3,Alfa Romeo,Zhou,44
4,AlphaTauri,Gasly,61


In [27]:
condition =  ((race_df['team_name'] == 'Racing Point') | (race_df['team_name'] == 'Aston Martin')) & (race_df['last_name'] == 'Hülkenberg')

In [28]:
race_df = race_df[~condition]

In [29]:
race_df.shape

(2268, 26)

The status column shows us whether the driver finished the race or not and the reason why. We want to capture whether the driver finished/did not finish regardless of the reasons in a separate column called F/DNF where 0 is for finished and 1 is for did not finish.

In [30]:
finished_statuses = ['Finished', '+1 Lap', '+2 Laps', '+3 Laps', '+4 Laps', '+5 Laps', '+6 Laps']

# Create a new column 'F/DNF'
race_df['F/DNF'] = race_df['status'].apply(lambda x: 0 if x in finished_statuses else 1)

In [31]:
race_df.head()

Unnamed: 0,driverId,first_name,last_name,team_name,race_id,race_date,race_time_utc,location,country,lat,lng,circuit_name,circuit_ref,round,starting_race_position,team_position_after_race,laps,fastest_lap,race_time,race_points,status,driver_current_points,driver_current_position,constructor_current_points,constructor_position,ending_race_position,F/DNF
0,20,Sebastian,Vettel,Ferrari,989,2018-03-25,05:10:00,Melbourne,Australia,-37.8497,144.968,Albert Park Grand Prix Circuit,albert_park,1,3,1,58,53,5373283,25.0,Finished,25.0,1.0,40.0,1.0,1,0
1,1,Lewis,Hamilton,Mercedes,989,2018-03-25,05:10:00,Melbourne,Australia,-37.8497,144.968,Albert Park Grand Prix Circuit,albert_park,1,1,2,58,50,5378319,18.0,Finished,18.0,2.0,22.0,2.0,2,0
2,8,Kimi,Räikkönen,Ferrari,989,2018-03-25,05:10:00,Melbourne,Australia,-37.8497,144.968,Albert Park Grand Prix Circuit,albert_park,1,2,3,58,57,5379592,15.0,Finished,15.0,3.0,40.0,1.0,3,0
3,817,Daniel,Ricciardo,Red Bull,989,2018-03-25,05:10:00,Melbourne,Australia,-37.8497,144.968,Albert Park Grand Prix Circuit,albert_park,1,8,4,58,54,5380352,12.0,Finished,12.0,4.0,20.0,3.0,4,0
4,4,Fernando,Alonso,McLaren,989,2018-03-25,05:10:00,Melbourne,Australia,-37.8497,144.968,Albert Park Grand Prix Circuit,albert_park,1,10,5,58,57,5401169,10.0,Finished,10.0,5.0,12.0,4.0,5,0


## Statistics Summary

In [32]:
race_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
driverId,2268.0,676.766314,321.881173,1.0,815.0,832.0,844.0,855.0
race_id,2268.0,1054.809965,37.336729,989.0,1023.0,1055.0,1086.0,1120.0
lat,2268.0,33.442025,21.117179,-37.8497,26.0325,41.57,47.2197,52.3888
lng,2268.0,14.379475,58.061402,-115.173,2.26111,9.28111,49.8533,144.968
round,2268.0,10.969136,6.127589,1.0,6.0,11.0,16.0,22.0
starting_race_position,2268.0,9.611993,5.69697,0.0,5.0,9.0,14.0,20.0
laps,2268.0,54.573192,17.391428,0.0,51.0,57.0,67.0,87.0
race_points,2268.0,5.513889,7.404079,0.0,0.0,1.0,10.0,26.0
driver_current_points,2268.0,61.655864,81.593348,0.0,6.0,28.0,84.0,575.0
driver_current_position,2268.0,9.900353,5.61551,1.0,5.0,10.0,15.0,22.0


## Save dataframe as a parquet file to be read in another Colab for analysis

In [33]:
race_df.to_parquet('race_df.parquet', compression = 'BROTLI')

In [34]:
!cp race_df.parquet /content/drive/MyDrive/200_Final_Project/ColabSharedFolder/