In [136]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import multiprocessing
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
# from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

In [137]:
data = pd.read_csv('../../reference/f1_cleaned.csv')
data = data.rename(columns={'driver_name' : 'team_name'})
data.drop(['code'], axis=1, inplace=True)
data.dtypes

team_name         object
driver_nat        object
circuitRef        object
year             float64
round            float64
starting_pos     float64
finishing_pos    float64
laps             float64
quali_mean       float64
driver_age       float64
driver_dnf         int64
car_dnf            int64
dtype: object

In [138]:
#train test split
#we are not using a random split here, training with pre 2024 data and trying to predict the races that occured in 2024

train = data[data.year<2024].copy()
test = data[data.year==2024].copy()

#testing set
y_test = test.pop('finishing_pos')
x_test = test

#training set
y_train = train.pop('finishing_pos')
x_train = train

In [139]:
print("Selected columns:", x_train.columns.tolist())
print("Selected columns:", x_test.columns.tolist())

Selected columns: ['team_name', 'driver_nat', 'circuitRef', 'year', 'round', 'starting_pos', 'laps', 'quali_mean', 'driver_age', 'driver_dnf', 'car_dnf']
Selected columns: ['team_name', 'driver_nat', 'circuitRef', 'year', 'round', 'starting_pos', 'laps', 'quali_mean', 'driver_age', 'driver_dnf', 'car_dnf']


In [140]:
#encoding vars and scaling data

cat_feat = ['team_name',
            'driver_nat', 
            'circuitRef']
x_num_feat = ['year',
              'starting_pos', 
              'laps', 
              'driver_dnf', 
              'car_dnf',
              'round',
              'quali_mean',
              'driver_age']

#scale y later if needed for a distance model

ct = ColumnTransformer(transformers=[
    ('encoder', OneHotEncoder(handle_unknown='ignore', drop='first'), cat_feat), #avoid dummy var trap with OHE
    ('scx', StandardScaler(), x_num_feat)
])

In [141]:
#Basic LR model

model = Pipeline(steps=[
    ('preprocessor', ct),
    ('regressor', LinearRegression())
])

model.fit(x_train, y_train)
y_pred = model.predict(x_test)

r2 = r2_score(y_test, y_pred)
print('R2;', r2)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', rmse)


# Add predictions and actuals back to `test`
test['pred_lr'] = y_pred
test['finishing_pos'] = y_test  # reattach actuals for ranking

# Convert actual and predicted to ranks within each race
test['actual_rank'] = test.groupby('circuitRef')['finishing_pos'].rank(method='min')
test['lr_rank'] = test.groupby('circuitRef')['pred_lr'].rank(method='min')

# Calculate MAE and R² on ranks
mae = mean_absolute_error(test['actual_rank'], test['lr_rank'])
r2 = r2_score(test['actual_rank'], test['lr_rank'])
rsme = np.sqrt((mean_squared_error(test['actual_rank'], test['lr_rank'])))

n = x_test.shape[0]  # number of samples
p = x_test.shape[1]  # number of features

print("Ranked MAE:", mae)
print("Ranked R²:", r2)
print("Ranked RSME:", rsme)

adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
print('Adjusted R²:', adjusted_r2)

R2; 0.6458855299950413
Root Mean Squared Error: 3.425204804831232
Ranked MAE: 2.44258872651357
Ranked R²: 0.6666574579820879
Ranked RSME: 3.3232275480497737
Adjusted R²: 0.6558580235754601




# Backward Elimination

In [112]:

import statsmodels.api as sm

# 1) Build statsmodels design matrix
X_be = pd.get_dummies(x_train.drop(columns=['year']), drop_first=True)
X_be = sm.add_constant(X_be).astype(float)
y_num = y_train.astype(float)

x_test_be = pd.get_dummies(x_test.drop(columns=['year']), drop_first=True)
x_test_be = sm.add_constant(x_test_be).astype(float)

In [113]:
selected_cols = X_be.columns.tolist()
print("Selected columns:", selected_cols)
print("Selected columns:", x_test_be.columns.tolist())

Selected columns: ['const', 'round', 'starting_pos', 'laps', 'quali_mean', 'driver_age', 'driver_dnf', 'car_dnf', 'team_name_Aston Martin', 'team_name_Caterham', 'team_name_Ferrari', 'team_name_Haas F1 Team', 'team_name_Manor Marussia', 'team_name_McLaren', 'team_name_Mercedes', 'team_name_RB F1 Team', 'team_name_Red Bull', 'team_name_Sauber', 'team_name_Williams', 'driver_nat_Australian', 'driver_nat_Belgian', 'driver_nat_Brazilian', 'driver_nat_British', 'driver_nat_Canadian', 'driver_nat_Chinese', 'driver_nat_Danish', 'driver_nat_Dutch', 'driver_nat_Finnish', 'driver_nat_French', 'driver_nat_German', 'driver_nat_Indonesian', 'driver_nat_Italian', 'driver_nat_Japanese', 'driver_nat_Mexican', 'driver_nat_Monegasque', 'driver_nat_New Zealander', 'driver_nat_Polish', 'driver_nat_Russian', 'driver_nat_Spanish', 'driver_nat_Swedish', 'driver_nat_Thai', 'driver_nat_Venezuelan', 'circuitRef_americas', 'circuitRef_bahrain', 'circuitRef_baku', 'circuitRef_catalunya', 'circuitRef_hockenheimrin

In [114]:

import statsmodels.api as sm

# 1) Build statsmodels design matrix
X_be = pd.get_dummies(x_train.drop(columns=['year']), drop_first=True)
X_be = sm.add_constant(X_be).astype(float)
y_num = y_train.astype(float)

x_test_be = pd.get_dummies(x_test.drop(columns=['year']), drop_first=True)
x_test_be = sm.add_constant(x_test_be).astype(float)

# 2) Map each original feature to its columns
feature_groups = {}
for f in x_num_feat:
    feature_groups[f] = [f]
for f in cat_feat:
    feature_groups[f] = [c for c in X_be.columns if c.startswith(f + '_')]

# 3) Iteratively drop groups
SL = 0.05
X_curr = X_be.copy()
x_test_be = x_test_be.copy()
step = 1

while True:
    full_mod = sm.OLS(y_num, X_curr).fit()
    pvals = {}
    for feat, cols in feature_groups.items():
        # skip already-removed
        if any(c not in X_curr.columns for c in cols):
            continue
        X_red = X_curr.drop(columns=cols)
        red_mod = sm.OLS(y_num, X_red).fit()
        anova = sm.stats.anova_lm(red_mod, full_mod)
        pvals[feat] = anova['Pr(>F)'][1]
    worst_feat, worst_p = max(pvals.items(), key=lambda x: x[1])
    if worst_p > SL:
        # drop group and remove from dict
        print(f"Step {step}: drop {worst_feat!r} (p={worst_p:.3f})")
        X_curr = X_curr.drop(columns=feature_groups[worst_feat])

        x_test_be = x_test_be.drop(columns=feature_groups[worst_feat])

        del feature_groups[worst_feat]
        step += 1
    else:
        print("No more features with p > 0.05; stopping.")
        break

selected_cols = X_curr.columns.tolist()
print("Selected columns:", selected_cols)
print("Selected columns:", x_test_be.columns.tolist())

Step 1: drop 'driver_age' (p=0.394)
Step 2: drop 'round' (p=0.333)
No more features with p > 0.05; stopping.
Selected columns: ['const', 'starting_pos', 'laps', 'quali_mean', 'driver_dnf', 'car_dnf', 'team_name_Aston Martin', 'team_name_Caterham', 'team_name_Ferrari', 'team_name_Haas F1 Team', 'team_name_Manor Marussia', 'team_name_McLaren', 'team_name_Mercedes', 'team_name_RB F1 Team', 'team_name_Red Bull', 'team_name_Sauber', 'team_name_Williams', 'driver_nat_Australian', 'driver_nat_Belgian', 'driver_nat_Brazilian', 'driver_nat_British', 'driver_nat_Canadian', 'driver_nat_Chinese', 'driver_nat_Danish', 'driver_nat_Dutch', 'driver_nat_Finnish', 'driver_nat_French', 'driver_nat_German', 'driver_nat_Indonesian', 'driver_nat_Italian', 'driver_nat_Japanese', 'driver_nat_Mexican', 'driver_nat_Monegasque', 'driver_nat_New Zealander', 'driver_nat_Polish', 'driver_nat_Russian', 'driver_nat_Spanish', 'driver_nat_Swedish', 'driver_nat_Thai', 'driver_nat_Venezuelan', 'circuitRef_americas', 'cir

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- actual_rank
- driver_nat_Argentinian 
- finishing_pos
- lr_rank
- pred_lr
Feature names seen at fit time, yet now missing:
- circuitRef_hockenheimring
- circuitRef_istanbul
- circuitRef_mugello
- circuitRef_nurburgring
- circuitRef_portimao
- ...


## Visualization

In [None]:
#df created to compare results 

comparison_df = pd.DataFrame({
    'Team': test['team_name'],              
    'Circut': test['circuitRef'],          
    'Actual Pos': y_test,                         
    'Predicted Pos': y_pred                       
})

In [None]:
test = test.copy()
test['y_pred'] = y_pred
test['y_true'] = y_test.values


test['actual_rank'] = test.groupby('circuitRef')['y_true'].rank(method='min')
test['predicted_rank'] = test.groupby('circuitRef')['y_pred'].rank(method='min')


plt.figure(figsize=(12, 8))
sns.scatterplot(x='actual_rank', y='predicted_rank', hue='team_name', data=test)

#regression line
plt.plot([1, test['actual_rank'].max()], [1, test['actual_rank'].max()], 'r--', label='Perfect Prediction')

plt.xlabel('Actual Race Rank')
plt.ylabel('Predicted Race Rank')
plt.title('Predicted vs Actual Rankings by Team')
plt.legend(title='Team', prop={'size': 8}, bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# starting vs final positions in train and test

In [None]:
#adding team colors
team_colors = {
    'Red Bull': '#1E41FF',
    'Mercedes': '#00D2BE',
    'Ferrari': '#DC0000',
    'McLaren': '#FF8700',
    'Aston Martin': '#006F62',
    'Alpine F1 Team': '#0090FF',
    'Williams': '#005AFF',
    'RB F1 Team': '#6692FF',
    'Haas F1 Team': '#B6BABD',
    'Sauber': '#52E252',
    #fallback color
    'Other': '#888888'
}
palette = {team: color for team, color in team_colors.items() if team in test['team_name'].unique()}

#getting unique races
races = test['circuitRef'].unique()
n_races = len(races)

cols = 3
rows = math.ceil(n_races / cols)

plt.figure(figsize=(6 * cols, 5 * rows))

for idx, race in enumerate(races):
    ax = plt.subplot(rows, cols, idx + 1)
    
    race_data = test[test['circuitRef'] == race]
    
    sns.scatterplot(
        x='actual_rank', 
        y='predicted_rank', 
        hue='team_name', 
        data=race_data, 
        ax=ax,
        palette=palette,
        legend=False
    )
    
    #regression line
    ax.plot([1, race_data['actual_rank'].max()], [1, race_data['actual_rank'].max()], 'r--')
    
    #scale
    max_rank = int(max(race_data['actual_rank'].max(), race_data['predicted_rank'].max())) + 1
    ax.set_xticks(range(1, max_rank + 1))
    ax.set_yticks(range(1, max_rank + 1))
    
    ax.set_xlim(0.8, max_rank + 0.2)
    ax.set_ylim(0.8, max_rank + 0.2)

    ax.set_title(f'{race.capitalize()}')
    ax.set_xlabel('Actual Rank')
    ax.set_ylabel('Predicted Rank')
    ax.grid(True)

plt.tight_layout()
plt.suptitle('Race-by-Race: Actual vs Predicted Driver Rankings by Team', fontsize=16, y=1.02)
plt.legend(title='Team', bbox_to_anchor=(1.05, 1), loc='upper left', prop={'size': 8})
plt.show()

In [None]:
def position_to_points(pos):
    points_map = {1: 25, 2: 18, 3: 15, 4: 12, 5: 10,
                  6: 8, 7: 6, 8: 4, 9: 2, 10: 1}
    return points_map.get(pos, 0)

test['actual_points'] = test['actual_rank'].apply(position_to_points)
test['predicted_points'] = test['predicted_rank'].apply(position_to_points)

team_points = test.groupby(['team_name', 'circuitRef']).agg({
    'actual_points': 'sum',
    'predicted_points': 'sum'
}).reset_index()


season_totals = team_points.groupby('team_name').agg({
    'actual_points': 'sum',
    'predicted_points': 'sum'
}).reset_index()

#sorting by actual
season_totals = season_totals.sort_values(by='actual_points', ascending=False)


teams = season_totals['team_name']
x = np.arange(len(teams))
width = 0.35

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(x - width/2, season_totals['actual_points'], width, label='Actual')
ax.bar(x + width/2, season_totals['predicted_points'], width, label='Predicted')

ax.set_ylabel('Total Points')
ax.set_title('Constructor Championship: Actual vs Predicted Points')
ax.set_xticks(x)
ax.set_xticklabels(teams, rotation=45, ha='right')
ax.legend()

plt.tight_layout()
plt.show()

print(season_totals)