# 1. Data Preparation & Time-Based Split

In [40]:
from google.colab import files
import io
import pandas as pd

# Upload the driver-season feature file from the local machine.
uploaded = files.upload()

# Colab does not overwrite an existing file: a re-upload is saved under a
# suffixed name such as "<name> (5).csv". Reading the fixed filename from disk
# would therefore silently load an older copy. Prefer the bytes that were just
# uploaded; fall back to the on-disk file only when no matching upload exists.
DATA_STEM = 'f1_driver_features_2014_2023'
uploaded_matches = [
    fname for fname in uploaded
    if fname.startswith(DATA_STEM) and fname.endswith('.csv')
]
if uploaded_matches:
    df = pd.read_csv(io.BytesIO(uploaded[uploaded_matches[-1]]))
else:
    df = pd.read_csv(DATA_STEM + '.csv')

# define feature columns
feature_cols = [
    'Total_Points', 'Wins', 'Podiums', 'Top10s',
    'Races_Entered', 'Avg_Position', 'Position_STD'
]

# time-based split: no shuffling, no leakage
train_years = list(range(2014, 2020))  # 2014–2019
val_years   = [2020, 2021]
test_years  = [2022, 2023]

# Impute missing Position_STD with the mean of the TRAINING years only.
# Using the mean over every year would let validation/test statistics leak
# into the features the models are trained on.
train_mask = df['Year'].isin(train_years)
position_std_fill = df.loc[train_mask, 'Position_STD'].mean()
df['Position_STD'] = df['Position_STD'].fillna(position_std_fill)

# .copy() so later column additions do not touch (or warn about) `df`
train_df = df[train_mask].copy()
val_df   = df[df['Year'].isin(val_years)].copy()
test_df  = df[df['Year'].isin(test_years)].copy()

X_train = train_df[feature_cols]
y_train = train_df['Champion']

X_val   = val_df[feature_cols]
y_val   = val_df['Champion']

X_test  = test_df[feature_cols]
y_test  = test_df['Champion']

# sanity check: which seasons landed in which split, and how many rows
print("train years:", sorted(train_df['Year'].unique()), "shape:", X_train.shape)
print("val years:  ", sorted(val_df['Year'].unique()), "shape:", X_val.shape)
print("test years: ", sorted(test_df['Year'].unique()), "shape:", X_test.shape)


Saving f1_driver_features_2014_2023.csv to f1_driver_features_2014_2023 (5).csv
train years: [np.int64(2014), np.int64(2015), np.int64(2016), np.int64(2017), np.int64(2018), np.int64(2019)] shape: (135, 7)
val years:   [np.int64(2020), np.int64(2021)] shape: (44, 7)
test years:  [np.int64(2022), np.int64(2023)] shape: (44, 7)


# 2. Define Models (Logistic Regression, Decision Tree, Random Forests, SVM) with Scaling & Class Weights

In [41]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Candidate classifiers. Every estimator uses balanced class weights and a
# fixed seed; the two scale-sensitive ones are wrapped in a scaling pipeline.
def _scaled(estimator):
    """Return a Pipeline that standardizes the features before `estimator`."""
    return Pipeline([('scale', StandardScaler()), ('clf', estimator)])


_shared_kwargs = {'class_weight': 'balanced', 'random_state': 42}

models = {}
models["Logistic Regression"] = _scaled(
    LogisticRegression(max_iter=1000, **_shared_kwargs)
)
models["Decision Tree"] = DecisionTreeClassifier(**_shared_kwargs)
models["Random Forest"] = RandomForestClassifier(**_shared_kwargs)
models["Support Vector Machine (SVM)"] = _scaled(
    SVC(kernel='rbf', probability=True, **_shared_kwargs)
)



# 3. Train & Choose Best Model

In [42]:
from sklearn.metrics import classification_report, f1_score

# Fitted estimators and their validation F1 for the champion class, by name.
best_models = {}
val_scores = {}

for name, model in models.items():
    print(f"\n=== Training {name} ===")
    model.fit(X_train, y_train)

    y_val_pred = model.predict(X_val)
    # Score the positive (champion) class directly instead of looking up
    # report['1']: that string key only exists when the labels are stored as
    # ints, and it required building the whole report a second time.
    # zero_division=0 yields 0.0 (no warning) when no champion is predicted.
    f1_champion = f1_score(y_val, y_val_pred, pos_label=1, zero_division=0)
    val_scores[name] = f1_champion
    best_models[name] = model

    print(classification_report(y_val, y_val_pred, digits=4))
    print(f"F1 score for champion class (1): {f1_champion:.4f}")

# select best model based on F1 for champion class
# NOTE: on ties, max() keeps the first entry in `models` insertion order.
best_name = max(val_scores, key=val_scores.get)
best_model = best_models[best_name]

print(f"Best model selected: {best_name}")



=== Training Logistic Regression ===
              precision    recall  f1-score   support

           0     1.0000    0.9762    0.9880        42
           1     0.6667    1.0000    0.8000         2

    accuracy                         0.9773        44
   macro avg     0.8333    0.9881    0.8940        44
weighted avg     0.9848    0.9773    0.9794        44

F1 score for champion class (1): 0.8000

=== Training Decision Tree ===
              precision    recall  f1-score   support

           0     1.0000    0.9762    0.9880        42
           1     0.6667    1.0000    0.8000         2

    accuracy                         0.9773        44
   macro avg     0.8333    0.9881    0.8940        44
weighted avg     0.9848    0.9773    0.9794        44

F1 score for champion class (1): 0.8000

=== Training Random Forest ===
              precision    recall  f1-score   support

           0     1.0000    0.9762    0.9880        42
           1     0.6667    1.0000    0.8000         2



# 4. Final Classification Evaluation on Test Set

In [43]:
from sklearn.metrics import accuracy_score, classification_report

# Evaluate the selected model once on the held-out test seasons.
y_test_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred, digits=4)

print("=== Test Set Classification ===")
print("Accuracy:", test_accuracy)
print(test_report)



=== Test Set Classification ===
Accuracy: 1.0
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000        42
           1     1.0000    1.0000    1.0000         2

    accuracy                         1.0000        44
   macro avg     1.0000    1.0000    1.0000        44
weighted avg     1.0000    1.0000    1.0000        44



# 5. Ranking Evaluation & Champion Prediction

In [44]:
# Attach the model's probability for class column 1 to a fresh copy of the
# test frame (assign returns a new DataFrame, leaving the old one untouched).
champion_proba = best_model.predict_proba(X_test)[:, 1]
test_df = test_df.assign(pred_proba=champion_proba)

def ranking_metrics(df_season, top_k=3):
    """Rank one season by predicted probability and locate the true champion.

    Parameters
    ----------
    df_season : pandas.DataFrame
        Rows of a single season; must contain 'pred_proba' and 'Champion'
        (1 marks the champion row).
    top_k : int, default 3
        Cut-off used for the hit@k indicator.

    Returns
    -------
    rank : int or None
        1-based position of the first champion row in the ranking, or None
        when the season has no row with Champion == 1 (or is empty).
    hit1 : int
        1 if the champion is ranked first, else 0.
    hitk : int
        1 if the champion is within the first `top_k` rows, else 0.
    ranked : pandas.DataFrame
        `df_season` sorted by 'pred_proba' in descending order.
    """
    ranked = df_season.sort_values('pred_proba', ascending=False)
    is_champion = (ranked['Champion'] == 1).to_numpy()

    # argmax() over all-zero labels returns 0, which would falsely report the
    # champion at rank 1; on an empty season it raises. Treat both as a miss.
    if not is_champion.any():
        return None, 0, 0, ranked

    champion_idx = int(is_champion.argmax())  # position of the first '1'
    hit1 = int(champion_idx == 0)
    hitk = int(champion_idx < top_k)
    rank = champion_idx + 1
    return rank, hit1, hitk, ranked

# Collect one summary row per test season, then report the averages.
results = []
for year, grp in test_df.groupby('Year'):
    rank, hit1, hit3, ranked = ranking_metrics(grp, top_k=3)
    season_row = {'Year': year, 'Champion_Rank': rank}
    season_row['Hit@1'] = hit1
    season_row['Hit@3'] = hit3
    results.append(season_row)

ranking_df = pd.DataFrame(results).sort_values('Year')
overall_hit1 = ranking_df['Hit@1'].mean()
overall_hit3 = ranking_df['Hit@3'].mean()

print("=== Champion Rank per Test Season ===")
print(ranking_df)
print("\nOverall Hit@1:", overall_hit1)
print("Overall Hit@3:", overall_hit3)

=== Champion Rank per Test Season ===
   Year  Champion_Rank  Hit@1  Hit@3
0  2022              1      1      1
1  2023              1      1      1

Overall Hit@1: 1.0
Overall Hit@3: 1.0


# 6. Predicted vs. True Champion

In [45]:
# One row per test season: the top-ranked driver vs. the labelled champion.
predicted_champions = []

for year, grp in test_df.groupby('Year'):
    ranked = grp.sort_values('pred_proba', ascending=False)
    predicted_driver = ranked.iloc[0]['Driver']
    # A season without any Champion == 1 row would make .iloc[0] raise
    # IndexError; record None so the comparison below is simply False.
    champion_rows = ranked.loc[ranked['Champion'] == 1, 'Driver']
    true_driver = champion_rows.iloc[0] if not champion_rows.empty else None
    predicted_champions.append({
        'Year': year,
        'Predicted Champion': predicted_driver,
        'True Champion': true_driver,
        'Correct?': predicted_driver == true_driver
    })

predicted_df = pd.DataFrame(predicted_champions).sort_values('Year')
print("=== Predicted vs True Champions (Test Years) ===")
print(predicted_df)

=== Predicted vs True Champions (Test Years) ===
   Year Predicted Champion True Champion  Correct?
0  2022         Verstappen    Verstappen      True
1  2023         Verstappen    Verstappen      True


# 7. 🥉 Top 3 Ranked Drivers for Each Season

In [46]:
# For each test season, keep the three drivers with the highest predicted
# probability, in ranking order.
top3_list = []

for year, grp in test_df.groupby('Year'):
    ranked = grp.sort_values('pred_proba', ascending=False).head(3)
    podium = ranked['Driver'].tolist()
    season_top3 = {'Year': year}
    season_top3['1st'] = podium[0]
    season_top3['2nd'] = podium[1]
    season_top3['3rd'] = podium[2]
    top3_list.append(season_top3)

top3_df = pd.DataFrame(top3_list).sort_values('Year')
top3_df

Unnamed: 0,Year,1st,2nd,3rd
0,2022,Verstappen,Leclerc,Pérez
1,2023,Verstappen,Pérez,Hamilton
