In [1]:
# import packages
import os

# for data analysis
import numpy as np
import pandas as pd
import yaml

In [2]:
def yaml_to_df(data_path):
    # Load the YAML file
    with open(data_path, 'r') as file:
        data = yaml.load(file, Loader=yaml.FullLoader)

    # Convert the YAML data to a Pandas DataFrame
    df = pd.DataFrame.from_dict(data)
    return df

In [3]:
data_path = 'data/2024/races/01-bahrain/'
file = 'race-results.yml'

df = yaml_to_df(data_path+file)

In [4]:
## Get all the relevant data

years = ['2015','2016','2017','2018','2019','2020','2021','2022','2023','2024']
file_names = ['race-results.yml',
             'free-practice-1-results.yml',
             'free-practice-2-results.yml',
             'free-practice-3-results.yml',
             'qualifying-results.yml',]

f1 = pd.DataFrame()

for year in years: # loop through each year
    race_path = 'data/'+year+'/races'
    # get every race that year
    races = os.listdir(race_path)
    
    for race in races: # loop through every race that year
        round_num = race.split('-')[0]
        circuitId = race.split('-', maxsplit=1)[1]
        
        # race-results
        race_df = yaml_to_df(race_path+'/'+race+'/race-results.yml')
        
        # track data
        race_df['round'] = int(round_num)
        race_df['year'] = year
        race_df['circuitId'] = circuitId
        
        # set the gap of the fastest driver to 0.0
        race_df.loc[0, 'gap'] = '+0.000'
        
        # free-practice-1-results
        try:
            df = yaml_to_df(race_path+'/'+race+'/free-practice-1-results.yml')[['driverId','position','gap']]
            df.rename(columns={'position': 'fp1_position', 'gap': 'fp1_gap'}, inplace=True)
            
            # set the gap of the fastest driver to 0.0
            df.loc[0, 'fp1_gap'] = '+0.000'
        
            race_df = pd.merge(race_df, df, on='driverId', how='left')
        except:
            pass
        
        # free-practice-2-results
        try:
            df = yaml_to_df(race_path+'/'+race+'/free-practice-2-results.yml')[['driverId','position','gap']]
            df.rename(columns={'position': 'fp2_position', 'gap': 'fp2_gap'}, inplace=True)
            
            # set the gap of the fastest driver to 0.0
            df.loc[0, 'fp2_gap'] = '+0.000'
        
            race_df = pd.merge(race_df, df, on='driverId', how='left')
        except:
            pass
        
        # free-practice-3-results
        try:
            df = yaml_to_df(race_path+'/'+race+'/free-practice-3-results.yml')[['driverId','position','gap']]
            df.rename(columns={'position': 'fp3_position', 'gap': 'fp3_gap'}, inplace=True)
            
            # set the gap of the fastest driver to 0.0
            df.loc[0, 'fp3_gap'] = '+0.000'
        
            race_df = pd.merge(race_df, df, on='driverId', how='left')
        except:
            pass
        
        # qualifying-results
        try:
            df = yaml_to_df(race_path+'/'+race+'/qualifying-results.yml')[['driverId','position','gap','q1','q2','q3']]
            df.rename(columns={'position': 'qual_position', 'gap': 'qual_gap'}, inplace=True)
            
            # set the gap of the fastest driver to 0.0
            df.loc[0, 'qual_gap'] = '+0.000'
        
            race_df = pd.merge(race_df, df, on='driverId', how='left')
        except:
            pass
        
        # get fastest lap
        try:
            fastest_driver = yaml_to_df(race_path+'/'+race+'/fastest-laps.yml')['driverId'][0]
        
            race_df['fastest_lap'] = np.where(race_df['driverId'] == fastest_driver, 1, 0)
        except:
            race_df['fastest_lap'] = 0
            pass
        
        # get driver of the day
        try:
            driver_of_the_day = yaml_to_df(race_path+'/'+race+'/driver-of-the-day-results.yml')['driverId'][0]
        
            race_df['driver_of_the_day'] = np.where(race_df['driverId'] == fastest_driver, 1, 0)
        except:
            race_df['driver_of_the_day'] = 0
            pass
        
        # add this race to the master dataframe
        f1 = pd.concat([f1, race_df], axis=0)
        
    ### END loop through all races in current year
    
### END loop through each year
        
f1.info()  

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4222 entries, 0 to 19
Data columns (total 31 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   position              4222 non-null   object 
 1   driverNumber          4222 non-null   int64  
 2   driverId              4222 non-null   object 
 3   constructorId         4222 non-null   object 
 4   engineManufacturerId  4222 non-null   object 
 5   tyreManufacturerId    4222 non-null   object 
 6   laps                  4192 non-null   float64
 7   time                  2212 non-null   object 
 8   timePenalty           192 non-null    object 
 9   gap                   3512 non-null   object 
 10  interval              1998 non-null   object 
 11  reasonRetired         703 non-null    object 
 12  points                2090 non-null   float64
 13  gridPosition          4210 non-null   object 
 14  round                 4222 non-null   int64  
 15  year                  4

In [5]:
f1.tail(20)

Unnamed: 0,position,driverNumber,driverId,constructorId,engineManufacturerId,tyreManufacturerId,laps,time,timePenalty,gap,...,fp3_position,fp3_gap,qual_position,qual_gap,q1,q2,q3,fastest_lap,driver_of_the_day,grandSlam
0,1,4,lando-norris,mclaren,mercedes,pirelli,58.0,1:26:33.291,,+0.000,...,2.0,0.193,1,0.0,1:23.682,1:23.098,1:22.595,0,0,
1,2,55,carlos-sainz-jr,ferrari,ferrari,pirelli,58.0,1:26:39.123,,+5.832,...,5.0,0.438,3,0.229,1:23.487,1:22.985,1:22.824,0,0,
2,3,16,charles-leclerc,ferrari,ferrari,pirelli,58.0,1:27:05.219,,+31.928,...,9.0,0.665,14,,1:23.302,1:23.833,,0,0,
3,4,44,lewis-hamilton,mercedes,mercedes,pirelli,58.0,1:27:09.774,,+36.483,...,3.0,0.39,18,,1:23.887,,,0,0,
4,5,63,george-russell,mercedes,mercedes,pirelli,58.0,1:27:10.829,,+37.538,...,6.0,0.642,7,0.537,1:23.678,1:23.283,1:23.132,0,0,
5,6,1,max-verstappen,red-bull,honda-rbpt,pirelli,58.0,1:27:23.138,,+49.847,...,4.0,0.411,5,0.35,1:23.516,1:22.998,1:22.945,0,0,
6,7,10,pierre-gasly,alpine,renault,pirelli,58.0,1:27:45.851,,+1:12.560,...,13.0,0.975,6,0.389,1:23.548,1:23.086,1:22.984,0,0,
7,8,27,nico-hulkenberg,haas,ferrari,pirelli,58.0,1:27:48.845,,+1:15.554,...,7.0,0.66,4,0.291,1:23.722,1:23.040,1:22.886,0,0,
8,9,14,fernando-alonso,aston-martin,mercedes,pirelli,58.0,1:27:55.664,,+1:22.373,...,15.0,1.02,8,0.601,1:23.794,1:23.268,1:23.196,0,0,
9,10,81,oscar-piastri,mclaren,mercedes,pirelli,58.0,1:27:57.112,,+1:23.821,...,1.0,0.0,2,0.209,1:23.640,1:23.199,1:22.804,0,0,


In [6]:
f1.to_csv('data/f1_master.csv', index=False)