# Cleaning and Transforming Data for F1 Machine Learning Model 

### Imports

In [2]:
from pathlib import Path

import fastf1
import pandas as pd
# FastF1 refuses to enable its cache when the target directory is missing, so the
# directory is created first; this keeps Restart & Run All working on a fresh clone.
CACHE_DIR = Path("cache")
CACHE_DIR.mkdir(parents=True, exist_ok=True)
fastf1.Cache.enable_cache(str(CACHE_DIR))

## References
- https://medium.com/@nityachintan/how-i-built-an-f1-race-prediction-app-as-my-first-machine-learning-project-7e2e9cc89826
- https://docs.fastf1.dev/


## Possible Data Requirements
- (Current Drivers only) Driver - Experience, Name, Form, Consistency
- Team - Team points trend, DNF Rate
- Qualifying - Grid Position, Sector Times, Tyre used
- Race - Average lap time, pit strategy, stint lengths
- Context - Weather and Round Number


### Qualifying Data

In [3]:
# Raw qualifying exports: one results file and one weather file per season
quali_2024_results, quali_2024_weather = (
    pd.read_csv("Raw_Data/2024_Qualifying_Results.csv"),
    pd.read_csv("Raw_Data/2024_Qualifying_Weather.csv"),
)

quali_2025_results, quali_2025_weather = (
    pd.read_csv("Raw_Data/2025_Qualifying_Results.csv"),
    pd.read_csv("Raw_Data/2025_Qualifying_Weather.csv"),
)


# Data Cleaning 

## Quali Results Data

In [4]:
# Number of missing values in each column of the 2024 qualifying results
quali_2024_results.isna().sum()

DriverNumber            0
BroadcastName           0
Abbreviation            0
DriverId                0
TeamName                0
TeamColor               0
TeamId                  0
FirstName               0
LastName                0
FullName                0
HeadshotUrl            19
CountryCode             0
Position                0
ClassifiedPosition    479
GridPosition          479
Q1                      5
Q2                    121
Q3                    244
Time                  479
Status                479
Points                479
Laps                  479
RoundNumber             0
dtype: int64

In [5]:
# Number of missing values in each column of the 2025 qualifying results
quali_2025_results.isna().sum()

DriverNumber            0
BroadcastName           0
Abbreviation            0
DriverId                0
TeamName                0
TeamColor               0
TeamId                  0
FirstName               0
LastName                0
FullName                0
HeadshotUrl            14
CountryCode           400
Position                0
ClassifiedPosition    400
GridPosition          400
Q1                      7
Q2                    108
Q3                    203
Time                  400
Status                400
Points                400
Laps                  400
RoundNumber             0
dtype: int64

### Dropping Null/Redundant Columns 

In [4]:
# Qualifying columns removed before modelling: driver/team presentation fields that
# duplicate the id columns, plus the result fields that the null counts show as empty
quali_columns_to_drop = {
    "DriverNumber",
    "BroadcastName",
    "Abbreviation",
    "TeamColor",
    "FirstName",
    "LastName",
    "HeadshotUrl",
    "Points",
    "ClassifiedPosition",
    "GridPosition",
    "Time",
    "Status",
    "Laps",
}

# The same set is dropped from both seasons so the frames keep identical columns
quali_2024_results, quali_2025_results = [
    season_results.drop(columns=quali_columns_to_drop)
    for season_results in (quali_2024_results, quali_2025_results)
]



### Q1, Q2 and Q3 Columns

In [5]:
# The Q1/Q2/Q3 lap times are converted to timedeltas so they behave as durations
quali_sessions = ["Q1", "Q2", "Q3"]

for season_results in (quali_2024_results, quali_2025_results):
    season_results[quali_sessions] = season_results[quali_sessions].apply(pd.to_timedelta)


### Year Column

In [6]:
# Tagging every row with its season so the rounds of 2024 and 2025 stay
# distinguishable once the two datasets are joined
for season, season_results in (("2025", quali_2025_results), ("2024", quali_2024_results)):
    season_results["Year"] = season



### Driver ID Column

In [8]:
# Store the driver ids with pandas' dedicated string dtype in both seasons
for season_results in (quali_2025_results, quali_2024_results):
    season_results["DriverId"] = season_results["DriverId"].astype("string")

### Country Code Column

In [9]:
# The 2025 qualifying export has missing CountryCode values, so the codes are
# rebuilt from a hand-written lookup keyed by DriverId
country_codes = {
    "albon": "THA",
    "leclerc": "MON",
    "tsunoda": "JPN",
    "stroll": "CAN",
    "russell": "GBR",
    "piastri": "AUS",
    "perez": "MEX",
    "norris": "GBR",
    "alonso": "ESP",
    "max_verstappen": "NED",
    "hulkenberg": "GER",
    "hamilton": "GBR",
    "gasly": "FRA",
    "bottas": "FIN",
    "zhou": "CHN",
    "ocon": "FRA",
    "sainz": "ESP",
    "kevin_magnussen": "DEN",
    "ricciardo": "AUS",
    "sargeant": "USA",
    "colapinto": "ARG",
    "lawson": "NZL",
    "bearman": "GBR",
    "doohan": "AUS",
    "bortoleto": "BRA",
    "hadjar": "FRA",
    "antonelli": "ITA",
}

# The whole column is replaced by the lookup result; a DriverId that is not a key
# of the dictionary ends up with a missing country code
driver_ids_2025 = quali_2025_results["DriverId"]
quali_2025_results["CountryCode"] = driver_ids_2025.map(country_codes)

### Merging Qualifying Results Data

In [10]:
# Stack both seasons into one frame. ignore_index=True gives every row a unique
# label instead of repeating each season's own row numbers, matching how the race
# data is concatenated later in this notebook.
merge_quali = [quali_2024_results, quali_2025_results]

quali_data = pd.concat(merge_quali, ignore_index=True)

## Qualifying Weather Data

### Year Column

In [14]:
# Tagging every weather sample with its season so the rounds of 2024 and 2025 stay
# distinguishable once the two datasets are joined
for season, season_weather in (("2025", quali_2025_weather), ("2024", quali_2024_weather)):
    season_weather["Year"] = season


### Joining Dataframes

In [15]:
# Stack the weather samples of both seasons. ignore_index=True gives every sample a
# unique row label instead of repeating each season's own row numbers.
quali_weather = [quali_2024_weather, quali_2025_weather]

quali_weather_data = pd.concat(quali_weather, ignore_index=True)



### Aggregating Weather data 

In [16]:
# Aggregating all data per round number to get min, max and mean of each round 
quali_weather_aggregation = (
    quali_weather_data.groupby("RoundNumber").agg({
        "AirTemp": ["mean","min","max","median"],
        "Humidity": ["mean","min","max","median"],
        'Pressure': ["mean","min","max","median"],
        'WindDirection' : ["mean","min","max","median"],
        "WindSpeed" : ["mean","min","max","median"],
        "Rainfall": ["sum"]
    })
)

### Creating New Weather Dataframe

In [17]:
# Function to take the aggregated data column and create new column names
def create_new_weather_data(data, new_col_name):

    new_data = pd.DataFrame(data).reset_index()

    new_data.rename(columns={"mean":"Mean" + new_col_name,"median":"Median"+new_col_name, "min":"Min"+new_col_name, "max":"Max"+new_col_name},inplace=True)

    return new_data


def classify_rain(data):
    if data["sum"] == 0:
        return "No Rain"
    elif 0 < data["sum"] <= 30:
        return "Light Rain"
    elif 31 <= data["sum"] <= 60:
        return "Moderate Rain"
    else:
        return "Heavy Rain"
    

In [18]:
# Creating new dataframes with each aggregation 
air_temp = create_new_weather_data(quali_weather_aggregation["AirTemp"], "AirTemp")
humidity = create_new_weather_data(quali_weather_aggregation["Humidity"], "Humidity")
pressure = create_new_weather_data(quali_weather_aggregation["Pressure"], "Pressure")
wind_direction = create_new_weather_data(quali_weather_aggregation["WindDirection"], "WindDirection")
wind_speed = create_new_weather_data(quali_weather_aggregation["WindSpeed"],"WindSpeed")

rainfall = pd.DataFrame(quali_weather_aggregation["Rainfall"]).reset_index()
rainfall["RainAmount"] = rainfall.apply(classify_rain,axis=1)

In [19]:
# Merging all aggregations to one df 
quali_weather_data = air_temp.merge(humidity, on="RoundNumber",how="left")
quali_weather_data = quali_weather_data.merge(pressure, on="RoundNumber",how="left")
quali_weather_data = quali_weather_data.merge(wind_direction, on="RoundNumber",how="left")
quali_weather_data = quali_weather_data.merge(wind_speed, on="RoundNumber",how="left")
quali_weather_data = quali_weather_data.merge(rainfall[["RoundNumber","RainAmount"]], on="RoundNumber",how="left")



In [20]:
# Merging all data into one dataframe for qualifying
full_quali_data = quali_data.merge(quali_weather_data, on="RoundNumber",how="left")

In [21]:
# Persist the cleaned qualifying data without the dataframe index
quali_output_path = "Cleaned_Data/Cleaned_Quali_Data.csv"
full_quali_data.to_csv(quali_output_path, index=False)

## Cleaning Race Data

In [100]:
# Raw race exports: laps, results and weather for each season
race_2024_laps, race_2024_results, race_2024_weather = (
    pd.read_csv("Raw_Data/2024_Race_Laps.csv"),
    pd.read_csv("Raw_Data/2024_Race_Results.csv"),
    pd.read_csv("Raw_Data/2024_Race_Weather.csv"),
)

race_2025_laps, race_2025_results, race_2025_weather = (
    pd.read_csv("Raw_Data/2025_Race_Laps.csv"),
    pd.read_csv("Raw_Data/2025_Race_Results.csv"),
    pd.read_csv("Raw_Data/2025_Race_Weather.csv"),
)

### Creating Year Column

In [101]:
# Tag every 2024 lap with its season
race_2024_laps = race_2024_laps.assign(Year=2024)

In [102]:
# Tag every 2025 lap with its season
race_2025_laps = race_2025_laps.assign(Year=2025)

### Creating DriverId Column

In [None]:
# Lookup from the three-letter driver abbreviation used in the lap data to a driver id
driver_id_map = {
    "VER": "max_verstappen",
    "GAS": "gasly",
    "ANT": "antonelli",
    "ALO": "alonso",
    "LEC": "leclerc",
    "STR": "stroll",
    "TSU": "tsunoda",
    "ALB": "albon",
    "HUL": "hulkenberg",
    "LAW": "lawson",
    "OCO": "ocon",
    "NOR": "norris",
    "HAM": "hamilton",
    "BOR": "bortoleto",
    "SAI": "sainz",
    "HAD": "hadjar",
    "RUS": "russell",
    "PIA": "piastri",
    "BEA": "bearman",
    "COL": "colapinto",
    "DOO": "doohan",
    "PER": "perez",
    "ZHO": "zhou",
    "MAG": "kevin_magnussen",
    "RIC": "ricciardo",
    "BOT": "bottas",
    "SAR": "sargeant",
}

# Each lap's Driver abbreviation is looked up in the map; the result is stripped of
# surrounding whitespace and stored with the string dtype. An abbreviation that is
# not a key of the map produces a missing DriverID.
for season_laps in (race_2024_laps, race_2025_laps):
    mapped_ids = season_laps["Driver"].map(driver_id_map)
    season_laps["DriverID"] = mapped_ids.str.strip().astype("string")

### Dropping Redundant/Null Columns

In [105]:
# Lap columns that are removed from both seasons before the merge with the weather data
cols_to_drop_laps = [
    "DeletedReason",
    "IsAccurate",
    "FastF1Generated",
    "IsPersonalBest",
    "LapStartDate",
]

In [106]:
# Remove the unwanted lap columns from both seasons, modifying each frame in place
for season_laps in (race_2024_laps, race_2025_laps):
    season_laps.drop(columns=cols_to_drop_laps, inplace=True)

### Time Data Columns

In [108]:
# Every duration-like lap column is converted to a timedelta in both seasons
time_columns = [
    "Time",
    "LapTime",
    "PitOutTime",
    "PitInTime",
    "Sector1Time",
    "Sector2Time",
    "Sector3Time",
    "Sector1SessionTime",
    "Sector2SessionTime",
    "Sector3SessionTime",
    "LapStartTime",
]

for season_laps in (race_2025_laps, race_2024_laps):
    season_laps[time_columns] = season_laps[time_columns].apply(pd.to_timedelta)


## Race Weather Data

### Creating Year Column

In [109]:
# Tag every race weather sample with its season
for season, season_weather in ((2024, race_2024_weather), (2025, race_2025_weather)):
    season_weather["Year"] = season

### Weather Time Columns

In [110]:
# The weather timestamp becomes a timedelta so it can be matched against the
# timedelta "Time" column of the lap data
for season_weather in (race_2024_weather, race_2025_weather):
    season_weather["Time"] = pd.to_timedelta(season_weather["Time"])

### Merging Race Weather Data with Race Lap Data

In [None]:
# Round numbers present in the 2024 lap data, in ascending order
rounds_2024 = sorted(race_2024_laps["RoundNumber"].unique())

# One merged frame per round: the laps and the weather samples of that round are
# each sorted by Time (merge_asof requires sorted keys), then every lap is paired
# with the latest weather sample whose Time is less than or equal to the lap's Time
# (direction="backward").
merged_data_2024 = [
    pd.merge_asof(
        race_2024_laps[race_2024_laps["RoundNumber"] == round_number].sort_values("Time"),
        race_2024_weather[race_2024_weather["RoundNumber"] == round_number].sort_values("Time"),
        on="Time",
        direction="backward",
    )
    for round_number in rounds_2024
]

# All rounds are stacked into a single 2024 dataset with a fresh index
race_2024_data = pd.concat(merged_data_2024, ignore_index=True)



In [None]:
# The merge duplicated RoundNumber and Year (suffixes _x for laps, _y for weather):
# the weather copies are removed and the lap copies get their plain names back
cols_drop = ["RoundNumber_y", "Year_y"]
cols_rename = {"RoundNumber_x": "RoundNumber", "Year_x": "Year"}

race_2024_data = race_2024_data.drop(columns=cols_drop).rename(columns=cols_rename)

In [None]:
# Round numbers present in the 2025 lap data, in ascending order
rounds_2025 = sorted(race_2025_laps["RoundNumber"].unique())

# One merged frame per round: the laps and the weather samples of that round are
# each sorted by Time (merge_asof requires sorted keys), then every lap is paired
# with the latest weather sample whose Time is less than or equal to the lap's Time
# (direction="backward").
merged_data_2025 = [
    pd.merge_asof(
        race_2025_laps[race_2025_laps["RoundNumber"] == round_number].sort_values("Time"),
        race_2025_weather[race_2025_weather["RoundNumber"] == round_number].sort_values("Time"),
        on="Time",
        direction="backward",
    )
    for round_number in rounds_2025
]

# All rounds are stacked into a single 2025 dataset with a fresh index
race_2025_data = pd.concat(merged_data_2025, ignore_index=True)

In [114]:
# The merge duplicated RoundNumber and Year (suffixes _x for laps, _y for weather):
# the weather copies are removed and the lap copies get their plain names back
cols_drop = ["RoundNumber_y", "Year_y"]
cols_rename = {"RoundNumber_x": "RoundNumber", "Year_x": "Year"}

race_2025_data = race_2025_data.drop(columns=cols_drop).rename(columns=cols_rename)

### Joining 2025 and 2024 Datasets

In [115]:
# Stack the two merged seasons (2024 first, then 2025) under a fresh index
merger = [race_2024_data, race_2025_data]
race_data = pd.concat(objs=merger, ignore_index=True)

### Writing Data to CSV

In [118]:
# Persist the cleaned race data without the dataframe index
race_output_path = "Cleaned_Data/Cleaned_Race_Data.csv"
race_data.to_csv(race_output_path, index=False)

## Inspection of Cleaned Race Data

In [120]:
# Reload the file that was just written to inspect what was actually saved.
# NOTE(review): read_csv is called without dtype/converter arguments, so the
# timedelta columns presumably come back as plain strings - confirm before reusing
# this frame for further processing.
cleaned_race_path = "Cleaned_Data/Cleaned_Race_Data.csv"
race_data = pd.read_csv(cleaned_race_path)

In [122]:
# Display the column labels of the reloaded race data to check the final schema
race_data.columns

Index(['Time', 'Driver', 'DriverNumber', 'LapTime', 'LapNumber', 'Stint',
       'PitOutTime', 'PitInTime', 'Sector1Time', 'Sector2Time', 'Sector3Time',
       'Sector1SessionTime', 'Sector2SessionTime', 'Sector3SessionTime',
       'SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST', 'Compound', 'TyreLife',
       'FreshTyre', 'Team', 'LapStartTime', 'TrackStatus', 'Position',
       'Deleted', 'RoundNumber', 'Year', 'DriverID', 'AirTemp', 'Humidity',
       'Pressure', 'Rainfall', 'TrackTemp', 'WindDirection', 'WindSpeed'],
      dtype='object')