In [46]:
import fastf1
import fastf1.plotting

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report

In [47]:
df = pd.read_csv('racing-data.csv')
df.head()

Unnamed: 0,DriverNumber,DriverId,Abbreviation,FullName,TeamName,Position,GridPosition,Q1_x,Q2_x,Q3_x,Year,RaceName,Q1_y,Q2_y,Q3_y
0,44,hamilton,HAM,Lewis Hamilton,Mercedes,1.0,2.0,,,,2021,1,0 days 00:01:30.617000,0 days 00:01:30.085000,0 days 00:01:29.385000
1,33,max_verstappen,VER,Max Verstappen,Red Bull Racing,2.0,1.0,,,,2021,1,0 days 00:01:30.499000,0 days 00:01:30.318000,0 days 00:01:28.997000
2,77,bottas,BOT,Valtteri Bottas,Mercedes,3.0,3.0,,,,2021,1,0 days 00:01:31.200000,0 days 00:01:30.186000,0 days 00:01:29.586000
3,4,norris,NOR,Lando Norris,McLaren,4.0,7.0,,,,2021,1,0 days 00:01:30.902000,0 days 00:01:30.099000,0 days 00:01:29.974000
4,11,perez,PER,Sergio Perez,Red Bull Racing,5.0,0.0,,,,2021,1,0 days 00:01:31.165000,0 days 00:01:30.659000,


In [48]:
def get_race_data(year, race_name, session_type="R"):
    try:
        session = fastf1.get_session(year, race_name, session_type)
        session.load()
        race_data = session.results.loc[:, [
            'DriverNumber', 'DriverId', 'Abbreviation', 'FullName',
            'TeamName', 'Position', 'GridPosition', 'Q1', 'Q2', 'Q3'
        ]]
        race_data['Year'] = year
        race_data['RaceName'] = race_name
        return race_data
    except Exception as e:
        print(f"Error loading data for {race_name} in {year}: {e}")
        return pd.DataFrame()  # Return an empty DataFrame in case of an error

In [49]:
def get_qualifying_data(year, race_name, session_type="Q"):
    try:
        session = fastf1.get_session(year, race_name, session_type)
        session.load()
        qualifying_data = session.results.loc[:, [
            'DriverNumber', 'DriverId', 'Abbreviation', 'FullName',
            'TeamName', 'Q1', 'Q2', 'Q3'
        ]]
        qualifying_data['Year'] = year
        qualifying_data['RaceName'] = race_name
        return qualifying_data
    except Exception as e:
        print(f"Error loading data for {race_name} in {year}: {e}")
        return pd.DataFrame()  # Return an empty DataFrame in case of an error

In [50]:
def collect_data(years, race_names):
    combined_data = []
    for year in years:
        for race_name in race_names:
            race_data = get_race_data(year, race_name)
            qualifying_data = get_qualifying_data(year, race_name)
            
            if not race_data.empty and not qualifying_data.empty:
                # Merge race and qualifying data on common columns
                combined = pd.merge(race_data, 
                                    qualifying_data, 
                                    on = ['DriverId', 'Year', 'RaceName', 'DriverNumber', 'Abbreviation', 'FullName', 'TeamName', 'RaceName'], 
                                    how = 'inner')
                combined_data.append(combined)
            print("DONE GETTING RACE", race_name)
        print("Completed collecting race data for: ", year)
    
    if combined_data:
        return pd.concat(combined_data, ignore_index=True)
    else:
        return pd.DataFrame()  # Return an empty DataFrame if no data was collected


In [51]:
# Updated prepare_features function
def prepare_features(data):
    # Convert qualifying times (Q1_y, Q2_y, Q3_y) to seconds and fill NaNs with high values (indicating no time set)
    for session in ['Q1_y', 'Q2_y', 'Q3_y']:
        data[session] = pd.to_timedelta(data[session], errors='coerce').dt.total_seconds()
        data[session] = data[session].fillna(9999)
    
    # Encode categorical features
    data['DriverId'] = data['DriverId'].astype('category').cat.codes
    data['TeamName'] = data['TeamName'].astype('category').cat.codes
    
    # Feature for average qualifying time
    data['AvgQualifyingTime'] = data[['Q1_y', 'Q2_y', 'Q3_y']].mean(axis=1)
    
    return data


In [52]:
# Prepare features
prepared_data = prepare_features(df)
print(prepared_data.head())

   DriverNumber  DriverId Abbreviation         FullName  TeamName  Position  \
0            44         6          HAM   Lewis Hamilton         8       1.0   
1            33        13          VER   Max Verstappen         9       2.0   
2            77         2          BOT  Valtteri Bottas         8       3.0   
3             4        16          NOR     Lando Norris         7       4.0   
4            11        18          PER     Sergio Perez         9       5.0   

   GridPosition  Q1_x  Q2_x  Q3_x  Year  RaceName    Q1_y    Q2_y      Q3_y  \
0           2.0   NaN   NaN   NaN  2021         1  90.617  90.085    89.385   
1           1.0   NaN   NaN   NaN  2021         1  90.499  90.318    88.997   
2           3.0   NaN   NaN   NaN  2021         1  91.200  90.186    89.586   
3           7.0   NaN   NaN   NaN  2021         1  90.902  90.099    89.974   
4           0.0   NaN   NaN   NaN  2021         1  91.165  90.659  9999.000   

   AvgQualifyingTime  
0             90.029  
1   

In [55]:
# Assuming prepared_data is your DataFrame after running prepare_features
prepared_data['IsWinner'] = (prepared_data['Position'] == 1).astype(int)

# Filter out extreme values in AvgQualifyingTime
valid_data = prepared_data[prepared_data['AvgQualifyingTime'] < 9999]

# Handle NaNs in GridPosition (e.g., filling NaNs with median grid position)
valid_data['GridPosition'].fillna(valid_data['GridPosition'].median(), inplace=True)

# Define features and target
features = ['GridPosition', 'AvgQualifyingTime', 'Position']
X = valid_data[features]

# Fill NaNs in the 'Position' column with the mean of the column
X['Position'].fillna(X['Position'].mean(), inplace=True)

# Alternatively, you can use the median if it's more appropriate for your data:
# X['Position'].fillna(X['Position'].median(), inplace=True)

y = valid_data['IsWinner']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Train a Neural Network classifier
model = MLPClassifier(hidden_layer_sizes=(200, 200, 100), max_iter=1000, random_state=1)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(classification_report(y_test, y_pred))


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  valid_data['GridPosition'].fillna(valid_data['GridPosition'].median(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_data['GridPosition'].fillna(valid_data['GridPosition'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using

Accuracy: 0.95
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       247
           1       0.00      0.00      0.00        14

    accuracy                           0.95       261
   macro avg       0.47      0.50      0.49       261
weighted avg       0.90      0.95      0.92       261



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [80]:
# Step 1: Collect data for given year and race.
race_to_predict = collect_data([2024], ["Canadian Grand Prix"])

core           INFO 	Loading data for Canadian Grand Prix - Race [v3.3.8]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '4', '63', '44', '81', '14', '18', '3', '10', '31', '27', '20', '77', '22', '24', '55', '23', '11', '16', '2']
core           INFO 	Loading data for Canadian Grand Prix - Q

DONE GETTING RACE Canadian Grand Prix
Completed collecting race data for:  2024


In [81]:
# Step 2: Prepare the Features
prepared_target_data = prepare_features(race_to_predict)

# Ensure the target features match those used during training
X_target_race = prepared_target_data[features]

# Step 3: Make Predictions
win_probabilities = model.predict_proba(X_target_race)[:, 1]  # Probability of being the winner
prepared_target_data['WinProbability'] = win_probabilities

# Step 4: Interpret the Predictions
# Sort by 'WinProbability' in descending order and get the top 5
top_5_drivers = prepared_target_data.nlargest(5, 'WinProbability')

print("Top 5 most likely winners for", race_to_predict["RaceName"].iloc[0], race_to_predict["Year"].iloc[0])
print(top_5_drivers[['FullName', 'TeamName', 'GridPosition', 'AvgQualifyingTime', 'WinProbability']])


Top 5 most likely winners for Canadian Grand Prix 2024
         FullName  TeamName  GridPosition  AvgQualifyingTime  WinProbability
0  Max Verstappen         8           2.0          72.303000    7.824467e-13
1    Lando Norris         5           3.0          72.393667    6.778935e-13
3  Lewis Hamilton         6           7.0          72.370000    5.973940e-13
2  George Russell         6           1.0          72.251667    4.816926e-13
4   Oscar Piastri         5           4.0          72.490667    3.871709e-13
