# F1  Winner **Prediction** - Using Ergast API

**Author**: Esma Yildirim

**Date**: 05.04.2025

**Description**: Predicts F1 Grand Prix winners using real-time stats (Ergast API)

**Links**:  
- [GitHub Repo](https://github.com/frauvate/f1-2025-winner-prediction/) | [Ergast API](https://ergast.com/mrd/)

Importing Libraries

In [32]:
import requests
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Creating Data Frame

Trace Results Through API

In [2]:
def get_race_results(season):
    url = f"http://ergast.com/api/f1/{season}/results.json?limit=1000"
    response = requests.get(url)
    data = response.json()

    races = data['MRData']['RaceTable']['Races']
    results = []
    for race in races:
        round_num = race['round']
        race_name = race['raceName']
        for result in race['Results']:
            results.append({
                "season": season,
                "round": int(round_num),
                "raceName": race_name,
                "driver": result['Driver']['driverId'],
                "constructor": result['Constructor']['constructorId'],
                "grid": int(result['grid']),
                "position": int(result['position']),
                "status": result['status']
            })
    return results

Create Data Frame Using the Results of Last 4 Years

In [48]:
# Get all results between 2021-2024
all_results = []
for year in range(2021, 2025):
    all_results.extend(get_race_results(year))

df_results = pd.DataFrame(all_results)
print(df_results.head())

   season  round            raceName          driver constructor  grid  \
0    2021      1  Bahrain Grand Prix        hamilton    mercedes     2   
1    2021      1  Bahrain Grand Prix  max_verstappen    red_bull     1   
2    2021      1  Bahrain Grand Prix          bottas    mercedes     3   
3    2021      1  Bahrain Grand Prix          norris     mclaren     7   
4    2021      1  Bahrain Grand Prix           perez    red_bull     0   

   position    status  
0         1  Finished  
1         2  Finished  
2         3  Finished  
3         4  Finished  
4         5  Finished  


Tag the Winner

In [None]:
df_results["is_winner"] = df_results["position"] == 1

In [None]:
df_features = df_results.copy() # Make sure all race-driver combinations are unique

Calculating Past Wins

In [None]:
df_features["past_wins"] = 0

for season in df_features["season"].unique():
    for driver in df_features[df_features["season"] == season]["driver"].unique():
        driver_races = df_features[(df_features["season"] == season) & (df_features["driver"] == driver)]
        wins = 0
        for idx in driver_races.index:
            df_features.loc[idx, "past_wins"] = wins
            if df_features.loc[idx, "is_winner"]:
                wins += 1

Select Relevant Columns

In [None]:
df_ml = df_features[["season", "round", "driver", "constructor", "grid", "past_wins", "is_winner"]]

One-hot Encoding

In [None]:
df_ml_encoded = pd.get_dummies(df_ml, columns=["constructor", "driver"])

Seperate Tags and Features

In [None]:
X = df_ml_encoded.drop(columns=["is_winner"])
y = df_ml_encoded["is_winner"]

# Model Training

Split into Train and Test Sets

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Train the Random Forest Classifier

In [None]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

Predict and Evaluate

In [None]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# Simulate and Predict

Simulate 2025 Race Data

In [None]:
prediction_df = df_results[df_results["season"] == 2024].copy()
prediction_df["season"] = 2025

One-Hot Encoding and Calculating Past Wins

In [None]:
# This mirrors the steps performed for the training data
prediction_df["is_winner"] = prediction_df["position"] == 1
prediction_df["past_wins"] = 0
# Calculate past_wins for 2025 using 2024 data, assuming same driver performance
for driver in prediction_df["driver"].unique():
    driver_races = prediction_df[prediction_df["driver"] == driver]
    wins = 0
    for idx in driver_races.index:
        prediction_df.loc[idx, "past_wins"] = wins
        if prediction_df.loc[idx, "is_winner"]:
            wins += 1

prediction_df_ml = prediction_df[["season", "round", "driver", "constructor", "grid", "past_wins", "is_winner"]]
prediction_X = pd.get_dummies(prediction_df_ml, columns=["constructor", "driver"])

Handle Missing Values

In [None]:
# Align the columns in prediction_X with those in X used for training
# This ensures that prediction_X contains the same columns (and order) as X
# Get missing columns from X that are not in prediction_X
missing_cols = set(X.columns) - set(prediction_X.columns)
# Add missing columns to prediction_X with 0 values
for col in missing_cols:
    prediction_X[col] = 0
# Reorder columns in prediction_X to match the order in X
prediction_X = prediction_X[X.columns]

Predict

In [None]:
prediction_df["win_probability"] = model.predict_proba(prediction_X)[:, 1]

Calculate Average Win Probability

In [None]:
avg_probs = prediction_df.groupby("driver")["win_probability"].mean().sort_values(ascending=False)

Driver With the Highest Probability

In [None]:
top_driver_id = avg_probs.idxmax()
top_probability = avg_probs.max()

Get Top 5 Drivers

In [None]:
top_5_drivers = avg_probs.head(5).reset_index()
top_5_drivers.columns = ["Driver ID", "Win Probability"]

top_5_drivers["Driver"] = top_5_drivers["Driver ID"].str.replace('_', ' ').str.title()

# Format as a markdown table
from IPython.display import Markdown, display
display(Markdown("### 🏆 Top 5 Predicted 2025 Winners"))
display(Markdown(top_5_drivers[["Driver", "Win Probability"]].to_markdown(index=False, floatfmt=".1%")))

Final Winner Prediction

In [47]:
driver_name = df_results[df_results["driver"] == top_driver_id]["driver"].iloc[0].replace('_', ' ').title()

print(f"\n🏆 2025 Winner Prediction: {driver_name}")
print(f"🔮 Average Probability of Winning: {top_probability:.2%}")


🏆 2025 Winner Prediction: Max Verstappen
🔮 Average Probability of Winning: 78.60%
