# Cleaning and Transforming Data for F1 Machine Learning Model

## Imports

In [2]:
import fastf1
import pandas as pd
fastf1.Cache.enable_cache("cache")

## References
- https://medium.com/@nityachintan/how-i-built-an-f1-race-prediction-app-as-my-first-machine-learning-project-7e2e9cc89826
- https://docs.fastf1.dev/


## Getting Data needed for possible use
- (Current Drivers only) Driver - Experience, Name, Form, Consistency
- Team - Team points trend, DNF Rate,
- Qualifying - Grid Position, Sector Times, Tyre used
- Race - Average lap time, pit strategy, stint lengths
- Context - Weather and Round Number


In [70]:
quali_2024_results = pd.read_csv("Data/2024_Qualifying_Results.csv")
quali_2024_weather = pd.read_csv("Data/2024_Qualifying_Weather.csv")

race_2024_laps = pd.read_csv("Data/2024_Race_Laps.csv")
race_2024_results = pd.read_csv("Data/2024_Race_Results.csv")
race_2024_weather = pd.read_csv("Data/2024_Race_Weather.csv")

quali_2025_results = pd.read_csv("Data/2025_Qualifying_Results.csv")
quali_2025_weather = pd.read_csv("Data/2025_Qualifying_Weather.csv")

race_2025_laps = pd.read_csv("Data/2025_Race_Laps.csv")
race_2025_results = pd.read_csv("Data/2025_Race_Results.csv")
race_2025_weather = pd.read_csv("Data/2025_Race_Weather.csv")

## Data Cleaning 

### Quali Results Data
- 2024 and 2025 contain same structure data so any cleaning will be applied to both datasets

In [90]:
quali_2024_results.isnull().sum()

DriverId         0
TeamId           0
FullName         0
CountryCode      0
Position         0
Q1               5
Q2             121
Q3             244
RoundNumber      0
dtype: int64

In [71]:
# Columns to drop from qualifying that are redundant or all null values
quali_columns_to_drop = {
    'DriverNumber', 'BroadcastName', 'Abbreviation','TeamName',
       'TeamColor','FirstName', 'LastName','HeadshotUrl','Points', 
       "ClassifiedPosition", "GridPosition", "Time", "Status", "Laps"
}

quali_2024_results = quali_2024_results.drop(columns=quali_columns_to_drop)

quali_2025_results = quali_2025_results.drop(columns=quali_columns_to_drop)



In [72]:
# Changing dtype of the q1, 2 and 3 to represent their values 
quali_sessions = ["Q1", "Q2", "Q3"]

quali_2024_results[quali_sessions] = quali_2024_results[quali_sessions].apply(pd.to_timedelta)

quali_2025_results[quali_sessions] = quali_2025_results[quali_sessions].apply(pd.to_timedelta)


In [73]:
# Adding a prefix before the round number so can be delimited when the two datasets are joined
quali_2025_results["RoundNumber"] = "2025_" + quali_2025_results["RoundNumber"].astype("string")

quali_2024_results["RoundNumber"] = "2024_" + quali_2024_results["RoundNumber"].astype("string")



In [77]:
quali_2025_results["DriverId"] = quali_2025_results["DriverId"].astype("string")

quali_2024_results["DriverId"] = quali_2024_results["DriverId"].astype("string")

In [None]:
# Fixing Null values in 2025 Country Codes for Qualifying
country_codes = {'albon': 'THA',
        'leclerc': 'MON' ,
        'tsunoda': 'JPN' ,
        'stroll' :'CAN' ,
        'russell': 'GBR' ,
        'piastri' :'AUS' ,
        'perez' :'MEX' ,
        'norris' :'GBR' ,
        'alonso' :'ESP' ,
        'max_verstappen': 'NED' ,
        'hulkenberg' :'GER' ,
        'hamilton' :'GBR' ,
        'gasly' :'FRA' ,
        'bottas' :'FIN' ,
        'zhou' :'CHN' ,
        'ocon' :'FRA' ,
        'sainz' :'ESP' ,
        'kevin_magnussen' :'DEN', 
        'ricciardo' :'AUS' ,
        'sargeant' :'USA' ,
        'colapinto' :'ARG' ,
        'lawson': 'NZL' ,
        'bearman': 'GBR' ,
        'doohan' :'AUS' }

In [None]:
quali_results_merge = [quali_2025_results, quali_2024_results]

quali_results = pd.concat(quali_results_merge)

In [87]:
quali_2025_results[quali_2025_results["CountryCode"].isnull()]

Unnamed: 0,DriverId,TeamId,FullName,CountryCode,Position,Q1,Q2,Q3,RoundNumber
0,norris,mclaren,Lando Norris,,1.0,0 days 00:01:15.912000,0 days 00:01:15.415000,0 days 00:01:15.096000,2025_1
1,piastri,mclaren,Oscar Piastri,,2.0,0 days 00:01:16.062000,0 days 00:01:15.468000,0 days 00:01:15.180000,2025_1
2,max_verstappen,red_bull,Max Verstappen,,3.0,0 days 00:01:16.018000,0 days 00:01:15.565000,0 days 00:01:15.481000,2025_1
3,russell,mercedes,George Russell,,4.0,0 days 00:01:15.971000,0 days 00:01:15.798000,0 days 00:01:15.546000,2025_1
4,tsunoda,rb,Yuki Tsunoda,,5.0,0 days 00:01:16.225000,0 days 00:01:16.009000,0 days 00:01:15.670000,2025_1
...,...,...,...,...,...,...,...,...,...
395,bortoleto,sauber,Gabriel Bortoleto,,16.0,0 days 00:01:17.412000,NaT,NaT,2025_20
396,albon,williams,Alexander Albon,,17.0,0 days 00:01:17.490000,NaT,NaT,2025_20
397,gasly,alpine,Pierre Gasly,,18.0,0 days 00:01:17.546000,NaT,NaT,2025_20
398,stroll,aston_martin,Lance Stroll,,19.0,0 days 00:01:17.606000,NaT,NaT,2025_20


In [92]:
quali_2025_results[["DriverId","CountryCode"]].value_counts()

Series([], Name: count, dtype: int64)

In [97]:
x=quali_2024_results[["DriverId","CountryCode"]]

x.drop_duplicates().to_dict()

{'DriverId': {0: 'max_verstappen',
  1: 'leclerc',
  2: 'russell',
  3: 'sainz',
  4: 'perez',
  5: 'alonso',
  6: 'norris',
  7: 'piastri',
  8: 'hamilton',
  9: 'hulkenberg',
  10: 'tsunoda',
  11: 'stroll',
  12: 'albon',
  13: 'ricciardo',
  14: 'kevin_magnussen',
  15: 'bottas',
  16: 'zhou',
  17: 'sargeant',
  18: 'ocon',
  19: 'gasly',
  30: 'bearman',
  316: 'colapinto',
  373: 'lawson',
  478: 'doohan'},
 'CountryCode': {0: 'NED',
  1: 'MON',
  2: 'GBR',
  3: 'ESP',
  4: 'MEX',
  5: 'ESP',
  6: 'GBR',
  7: 'AUS',
  8: 'GBR',
  9: 'GER',
  10: 'JPN',
  11: 'CAN',
  12: 'THA',
  13: 'AUS',
  14: 'DEN',
  15: 'FIN',
  16: 'CHN',
  17: 'USA',
  18: 'FRA',
  19: 'FRA',
  30: 'GBR',
  316: 'ARG',
  373: 'NZL',
  478: 'AUS'}}

### Qualifying Weather Data 
- Will leave as is for now will probabily take average of each per session or round 

In [25]:
quali_2024_weather

Unnamed: 0,Time,AirTemp,Humidity,Pressure,Rainfall,TrackTemp,WindDirection,WindSpeed,RoundNumber
0,0 days 00:00:15.662000,18.2,48.0,1018.8,False,22.2,53,2.7,1
1,0 days 00:01:15.661000,18.1,48.0,1018.8,False,22.2,51,2.9,1
2,0 days 00:02:15.648000,18.1,48.0,1018.8,False,22.2,57,2.1,1
3,0 days 00:03:15.646000,18.1,48.0,1018.8,False,22.2,48,2.4,1
4,0 days 00:04:15.662000,18.1,48.0,1018.8,False,22.2,45,2.3,1
...,...,...,...,...,...,...,...,...,...
1982,0 days 01:16:38.816000,25.7,63.0,1017.6,False,28.8,288,0.7,24
1983,0 days 01:17:38.817000,25.7,63.0,1017.7,False,28.8,292,1.5,24
1984,0 days 01:18:38.816000,25.7,63.0,1017.6,False,28.8,254,1.8,24
1985,0 days 01:19:38.818000,25.7,63.0,1017.8,False,28.8,0,1.1,24
