# Cleaning and Transforming Data for F1 Machine Learning Model 

### Imports

In [202]:
import fastf1
import pandas as pd
fastf1.Cache.enable_cache("cache")

## References
- https://medium.com/@nityachintan/how-i-built-an-f1-race-prediction-app-as-my-first-machine-learning-project-7e2e9cc89826
- https://docs.fastf1.dev/


## Possible Data Requirements
- (Current Drivers only) Driver - Experience, Name, Form, Consistency
- Team - Team points trend, DNF rate
- Qualifying - Grid Position, Sector Times, Tyre used
- Race - Average lap time, pit strategy, stint lengths
- Context - Weather and Round Number


### Qualifying Data

In [203]:
# Raw qualifying exports: one results file and one weather file per season
quali_2024_results, quali_2024_weather = (
    pd.read_csv("Raw_Data/2024_Qualifying_Results.csv"),
    pd.read_csv("Raw_Data/2024_Qualifying_Weather.csv"),
)

quali_2025_results, quali_2025_weather = (
    pd.read_csv("Raw_Data/2025_Qualifying_Results.csv"),
    pd.read_csv("Raw_Data/2025_Qualifying_Weather.csv"),
)


## Data Cleaning 

### Quali Results Data

In [204]:
# Missing-value count per column for the 2024 qualifying results
quali_2024_results.isna().sum()

DriverNumber            0
BroadcastName           0
Abbreviation            0
DriverId                0
TeamName                0
TeamColor               0
TeamId                  0
FirstName               0
LastName                0
FullName                0
HeadshotUrl            19
CountryCode             0
Position                0
ClassifiedPosition    479
GridPosition          479
Q1                      5
Q2                    121
Q3                    244
Time                  479
Status                479
Points                479
Laps                  479
RoundNumber             0
dtype: int64

In [205]:
# Missing-value count per column for the 2025 qualifying results
quali_2025_results.isna().sum()

DriverNumber            0
BroadcastName           0
Abbreviation            0
DriverId                0
TeamName                0
TeamColor               0
TeamId                  0
FirstName               0
LastName                0
FullName                0
HeadshotUrl            14
CountryCode           400
Position                0
ClassifiedPosition    400
GridPosition          400
Q1                      7
Q2                    108
Q3                    203
Time                  400
Status                400
Points                400
Laps                  400
RoundNumber             0
dtype: int64

### Dropping Null/Redundant Columns 

In [206]:
# Qualifying columns removed before modelling: redundant identifiers and
# columns that are entirely null in the qualifying exports
quali_columns_to_drop = {
    "DriverNumber",
    "BroadcastName",
    "Abbreviation",
    "TeamColor",
    "FirstName",
    "LastName",
    "HeadshotUrl",
    "Points",
    "ClassifiedPosition",
    "GridPosition",
    "Time",
    "Status",
    "Laps",
}

# NOTE: re-running this cell raises KeyError because the columns are already gone
quali_2024_results = quali_2024_results.drop(quali_columns_to_drop, axis="columns")
quali_2025_results = quali_2025_results.drop(quali_columns_to_drop, axis="columns")



### Q1, Q2 and Q3 Columns

In [207]:
# Convert the Q1, Q2 and Q3 lap-time columns to timedeltas so they can be
# compared and aggregated as durations
quali_sessions = ["Q1", "Q2", "Q3"]

for season_results in (quali_2024_results, quali_2025_results):
    season_results[quali_sessions] = season_results[quali_sessions].apply(pd.to_timedelta)


### Round Number Column

In [208]:
# Prefix each round number with its season so rounds stay distinguishable once
# the two seasons are combined.
# NOTE: not idempotent - re-running this cell prepends the prefix a second time.
for season_prefix, season_results in (
    ("2025_", quali_2025_results),
    ("2024_", quali_2024_results),
):
    season_results["RoundNumber"] = season_prefix + season_results["RoundNumber"].astype("string")



### Driver ID Column

In [209]:
# Store DriverId with pandas' dedicated string dtype in both seasons
for season_results in (quali_2025_results, quali_2024_results):
    season_results["DriverId"] = season_results["DriverId"].astype("string")

### Country Code Column

In [210]:
# The 2025 qualifying export has no country codes, so they are rebuilt from a
# DriverId -> country code lookup
country_codes = {
    "albon": "THA",
    "leclerc": "MON",
    "tsunoda": "JPN",
    "stroll": "CAN",
    "russell": "GBR",
    "piastri": "AUS",
    "perez": "MEX",
    "norris": "GBR",
    "alonso": "ESP",
    "max_verstappen": "NED",
    "hulkenberg": "GER",
    "hamilton": "GBR",
    "gasly": "FRA",
    "bottas": "FIN",
    "zhou": "CHN",
    "ocon": "FRA",
    "sainz": "ESP",
    "kevin_magnussen": "DEN",
    "ricciardo": "AUS",
    "sargeant": "USA",
    "colapinto": "ARG",
    "lawson": "NZL",
    "bearman": "GBR",
    "doohan": "AUS",
    "bortoleto": "BRA",
    "hadjar": "FRA",
    "antonelli": "ITA",
}

# Look up each row's country code by DriverId; this replaces the whole column,
# and drivers missing from the lookup end up with a missing value
quali_2025_results["CountryCode"] = quali_2025_results["DriverId"].map(country_codes)

### Merging Qualifying Results Data

In [211]:
# Stack both seasons' qualifying results row-wise (original row labels are kept)
merge_quali = [quali_2024_results, quali_2025_results]

quali_data = pd.concat(objs=merge_quali, axis="index")

In [212]:
# DriverIds of the drivers on the current grid
cur_drivers = [
    "max_verstappen", "leclerc", "russell", "sainz",
    "alonso", "norris", "piastri",
    "hamilton", "hulkenberg", "tsunoda", "stroll", "albon",
    "ocon", "gasly", "bearman", "colapinto", "lawson",
    "hadjar", "bortoleto", "antonelli",
]

# Trim whitespace and lower-case the ids so they compare cleanly against the list
quali_data["DriverId"] = quali_data["DriverId"].str.strip().str.lower().astype("string")

# Keep only the rows that belong to current drivers
is_current_driver = quali_data["DriverId"].isin(cur_drivers)
quali_data = quali_data[is_current_driver]


### Qualifying Weather Data

### Round Number Column

In [213]:
# Prefix each weather round number with its season so it lines up with the
# prefixed qualifying results.
# NOTE: not idempotent - re-running this cell prepends the prefix a second time.
for season_prefix, season_weather in (
    ("2025_", quali_2025_weather),
    ("2024_", quali_2024_weather),
):
    season_weather["RoundNumber"] = season_prefix + season_weather["RoundNumber"].astype("string")


### Joining Dataframes

In [214]:
# Stack both seasons' qualifying weather samples row-wise
quali_weather = [quali_2024_weather, quali_2025_weather]

quali_weather_data = pd.concat(objs=quali_weather, axis="index")



### Aggregating Weather data 

In [215]:
# Summarise the weather samples of every round: mean/min/max/median for the
# continuous measures, and the total of the Rainfall column.
# NOTE(review): WindDirection looks like an angle; an arithmetic mean of angles
# can be misleading around the 0/360 wrap - confirm before using it as a feature.
summary_stats = ["mean", "min", "max", "median"]

weather_agg_spec = {
    measure: summary_stats
    for measure in ("AirTemp", "Humidity", "Pressure", "WindDirection", "WindSpeed")
}
weather_agg_spec["Rainfall"] = ["sum"]

quali_weather_aggregation = quali_weather_data.groupby("RoundNumber").agg(weather_agg_spec)

### Creating New Weather Dataframe

In [216]:
# Function to take the aggregated data column and create new column names
def create_new_weather_data(data, new_col_name):
    """Flatten one measure's aggregated statistics into a frame with labelled columns.

    Args:
        data: Aggregated statistics for a single measure. The index is moved
            into a regular column, and any columns named "mean", "median",
            "min" or "max" are renamed.
        new_col_name: Measure name appended to each statistic prefix, e.g.
            "AirTemp" gives "MeanAirTemp", "MedianAirTemp", "MinAirTemp" and
            "MaxAirTemp".

    Returns:
        A new DataFrame with the former index as a column and the statistic
        columns renamed. The input is left unchanged.
    """
    stat_prefixes = (
        ("mean", "Mean"),
        ("median", "Median"),
        ("min", "Min"),
        ("max", "Max"),
    )
    renamed_columns = {stat: prefix + new_col_name for stat, prefix in stat_prefixes}

    return pd.DataFrame(data).reset_index().rename(columns=renamed_columns)


def classify_rain(data, light_max=30, moderate_max=60):
    """Turn a round's rainfall total into a coarse rain category.

    The bands are contiguous: the original chained comparisons left totals
    strictly between 30 and 31 unmatched, so they fell through to
    "Heavy Rain".

    Args:
        data: Row or mapping exposing the rainfall total under the key "sum".
        light_max: Largest total still classified as "Light Rain".
        moderate_max: Largest total still classified as "Moderate Rain".

    Returns:
        "No Rain" for a total of exactly 0, "Light Rain" for totals in
        (0, light_max], "Moderate Rain" for totals in
        (light_max, moderate_max], and "Heavy Rain" for anything else.
    """
    rain_total = data["sum"]

    if rain_total == 0:
        return "No Rain"
    if 0 < rain_total <= light_max:
        return "Light Rain"
    if light_max < rain_total <= moderate_max:
        return "Moderate Rain"
    # Everything that matched no band, as in the original final else-branch
    return "Heavy Rain"
    

In [217]:
# One flat frame per measure, with statistic columns labelled by measure name
air_temp, humidity, pressure, wind_direction, wind_speed = (
    create_new_weather_data(quali_weather_aggregation[measure], measure)
    for measure in ("AirTemp", "Humidity", "Pressure", "WindDirection", "WindSpeed")
)

# Rainfall only has a total per round; derive a categorical label from it
rainfall = pd.DataFrame(quali_weather_aggregation["Rainfall"]).reset_index()
rainfall["RainAmount"] = rainfall.apply(classify_rain, axis="columns")

In [218]:
# Combine every per-measure frame into one weather frame keyed by RoundNumber.
# NOTE: this rebinds quali_weather_data from the raw samples to the per-round
# summary, so the aggregation cell cannot be re-run after this one.
quali_weather_data = air_temp
for weather_part in (
    humidity,
    pressure,
    wind_direction,
    wind_speed,
    rainfall[["RoundNumber", "RainAmount"]],
):
    quali_weather_data = quali_weather_data.merge(weather_part, on="RoundNumber", how="left")



In [219]:
# Attach each round's weather summary to the qualifying results of that round
full_quali_data = pd.merge(
    quali_data,
    quali_weather_data,
    how="left",
    on="RoundNumber",
)

In [220]:
# Persist the cleaned qualifying data without the row index
full_quali_data.to_csv(
    path_or_buf="Cleaned_Data/Cleaned_Quali_Data.csv",
    index=False,
)

## Race Data

In [221]:
# Raw race exports: laps, results and weather for each season
race_2024_laps, race_2024_results, race_2024_weather = (
    pd.read_csv("Raw_Data/2024_Race_Laps.csv"),
    pd.read_csv("Raw_Data/2024_Race_Results.csv"),
    pd.read_csv("Raw_Data/2024_Race_Weather.csv"),
)

race_2025_laps, race_2025_results, race_2025_weather = (
    pd.read_csv("Raw_Data/2025_Race_Laps.csv"),
    pd.read_csv("Raw_Data/2025_Race_Results.csv"),
    pd.read_csv("Raw_Data/2025_Race_Weather.csv"),
)

### Round Number Column

In [222]:
# Prefix each lap's round number with its season so rounds stay distinguishable
# once the two seasons are combined.
# NOTE: not idempotent - re-running this cell prepends the prefix a second time.
for season_prefix, season_laps in (
    ("2025_", race_2025_laps),
    ("2024_", race_2024_laps),
):
    season_laps["RoundNumber"] = season_prefix + season_laps["RoundNumber"].astype("string")

### Merging Race lap data

In [223]:
# Stack both seasons' lap data row-wise, 2025 first (original row labels are kept)
race_lap_data_merge = [race_2025_laps, race_2024_laps]
race_lap_data = pd.concat(objs=race_lap_data_merge, axis="index")

In [224]:
# (rows, columns) of the combined 2024 + 2025 lap data
race_lap_data.shape

(48938, 32)

In [225]:
# Missing-value count per column for the combined lap data
race_lap_data.isna().sum()

Time                      0
Driver                    0
DriverNumber              0
LapTime                 767
LapNumber                 0
Stint                   446
PitOutTime            47367
PitInTime             47382
Sector1Time             976
Sector2Time              81
Sector3Time             105
Sector1SessionTime     1082
Sector2SessionTime       81
Sector3SessionTime      105
SpeedI1                7694
SpeedI2                 103
SpeedFL                1635
SpeedST                3953
IsPersonalBest           32
Compound                503
TyreLife                526
FreshTyre                 0
Team                      0
LapStartTime              0
LapStartDate          48938
TrackStatus               0
Position                 57
Deleted                   0
DeletedReason         48371
FastF1Generated           0
IsAccurate                0
RoundNumber               0
dtype: int64

### Creating DriverId Column

In [226]:
# Three-letter driver abbreviation (the lap data's "Driver" column) -> DriverId
driver_id_map = {
    "VER": "max_verstappen",
    "GAS": "gasly",
    "ANT": "antonelli",
    "ALO": "alonso",
    "LEC": "leclerc",
    "STR": "stroll",
    "TSU": "tsunoda",
    "ALB": "albon",
    "HUL": "hulkenberg",
    "LAW": "lawson",
    "OCO": "ocon",
    "NOR": "norris",
    "HAM": "hamilton",
    "BOR": "bortoleto",
    "SAI": "sainz",
    "HAD": "hadjar",
    "RUS": "russell",
    "PIA": "piastri",
    "BEA": "bearman",
    "COL": "colapinto",
}

# Abbreviations that are absent from the map become missing values
mapped_driver_ids = race_lap_data["Driver"].map(driver_id_map)
race_lap_data["DriverId"] = mapped_driver_ids.str.strip().astype("string")

### Dropping Non Current Driver Data

In [227]:
# Creating a list of all current drivers
cur_drivers_laps = ['max_verstappen','leclerc','russell',  'sainz',
                'alonso', 'norris','piastri',
                'hamilton','hulkenberg','tsunoda', 'stroll','albon',
                'ocon',  'gasly','bearman', 'colapinto', 'lawson',
                'hadjar', 'bortoleto', 'antonelli']

# Normalizing the columns data
race_lap_data["DriverId"] = race_lap_data["DriverId"].str.strip().str.lower().astype("string")

# Filtering the df to only show data on current drivers.
# .copy() makes the filtered result an independent frame, so the in-place drop
# and column assignments that follow do not act on a slice of the concatenated
# data (which triggers SettingWithCopyWarning).
race_lap_data = race_lap_data[race_lap_data["DriverId"].isin(cur_drivers_laps)].copy()

### Dropping Redundant/Null Columns

In [228]:
# Lap columns that are redundant or not needed for modelling
cols_to_drop_laps = [
    "Time",
    "Sector1SessionTime",
    "Sector2SessionTime",
    "Sector3SessionTime",
    "LapStartTime",
    "LapStartDate",
    "DeletedReason",
    "IsAccurate",
    "FastF1Generated",
    "IsPersonalBest",
]

In [229]:
# Rebind to the result instead of dropping in place: race_lap_data is a
# filtered frame at this point, and mutating it with inplace=True triggers
# SettingWithCopyWarning.
race_lap_data = race_lap_data.drop(columns=cols_to_drop_laps)

### Time Data Columns

In [230]:
# Lap, pit and sector columns are converted to timedeltas so they can be
# aggregated as durations
time_columns = [
    "LapTime",
    "PitOutTime",
    "PitInTime",
    "Sector1Time",
    "Sector2Time",
    "Sector3Time",
]

for time_column in time_columns:
    race_lap_data[time_column] = pd.to_timedelta(race_lap_data[time_column])

### Aggregating Time Data to find outliers

In [231]:
# Per-round summary of lap and sector times, used to look for outliers
sector_stats = ["min", "max", "mean"]

lap_time_agg_spec = {"LapTime": ["min", "max", "mean", "median"]}
for sector_column in ("Sector1Time", "Sector2Time", "Sector3Time"):
    lap_time_agg_spec[sector_column] = sector_stats

agg_time_data = race_lap_data.groupby("RoundNumber").agg(lap_time_agg_spec)

In [232]:
# Lap-time summary per round. RoundNumber is a string, so the rounds are listed
# in lexicographic order (2024_1, 2024_10, 2024_11, ...) rather than numeric order.
agg_time_data["LapTime"].reset_index()

Unnamed: 0,RoundNumber,min,max,mean,median
0,2024_1,0 days 00:01:32.608000,0 days 00:02:10.911000,0 days 00:01:37.861020176,0 days 00:01:36.941000
1,2024_10,0 days 00:01:17.115000,0 days 00:01:42.656000,0 days 00:01:21.193407608,0 days 00:01:20.428500
2,2024_11,0 days 00:01:07.694000,0 days 00:02:00.257000,0 days 00:01:12.021772126,0 days 00:01:11.045000
3,2024_12,0 days 00:01:28.293000,0 days 00:02:08.459000,0 days 00:01:36.299475535,0 days 00:01:32.817000
4,2024_13,0 days 00:01:20.305000,0 days 00:01:44.868000,0 days 00:01:24.911900,0 days 00:01:24.282500
5,2024_14,0 days 00:01:45.563000,0 days 00:02:04.840000,0 days 00:01:49.848686688,0 days 00:01:49.187000
6,2024_15,0 days 00:01:13.817000,0 days 00:01:37.706000,0 days 00:01:16.577511488,0 days 00:01:16.151000
7,2024_16,0 days 00:01:21.432000,0 days 00:02:00.178000,0 days 00:01:25.656902013,0 days 00:01:24.801000
8,2024_17,0 days 00:01:45.255000,0 days 00:02:29.162000,0 days 00:01:49.951171957,0 days 00:01:48.895000
9,2024_18,0 days 00:01:34.925000,0 days 00:02:13.278000,0 days 00:01:38.989240418,0 days 00:01:38.416000
