# F1 What-If Simulator: Data Exploration and Training

This notebook connects to the OpenF1 API to explore Formula 1 data and prepare it for model training.

In [None]:
# Imports
import pandas as pd
import httpx
import asyncio
from typing import Dict, List, Optional

In [None]:
# API Base URL
OPENF1_BASE_URL = "https://api.openf1.org/v1"

## Find a Race Session

Our goal is to programmatically find the `session_key` for one specific, complete race event. We will use the 2023 Bahrain Grand Prix as our target.

In [None]:
async def fetch_session_key(
    year: int = 2023,
    meeting_name: str = "Bahrain",
    session_name: str = "Race",
) -> Optional[str]:
    """
    Fetch the session_key of one session of one meeting from the OpenF1 API.

    With the default arguments this looks up the Race session of the
    Bahrain Grand Prix 2023, which is what the rest of the notebook uses.

    Args:
        year: Season passed as the ``year`` filter of the meetings endpoint.
        meeting_name: Text that must appear (case-insensitively) in the
            meeting's ``meeting_name`` field.
        session_name: Session name that must equal (case-insensitively) the
            session's ``session_name`` field.

    Returns:
        Optional[str]: The ``session_key`` value of the first matching session,
        or None when nothing matches or any request/parsing step fails.
        NOTE(review): the value is returned exactly as the API delivers it;
        OpenF1 appears to use integer keys, so the ``str`` annotation is kept
        only for compatibility -- confirm.
    """
    meeting_query = meeting_name.lower()
    session_query = session_name.lower()

    async with httpx.AsyncClient() as client:
        try:
            # Step 1: Find the meeting_key for the requested meeting
            print(f"Step 1: Finding {meeting_name} {year} meeting...")
            meetings_response = await client.get(
                f"{OPENF1_BASE_URL}/meetings",
                params={"year": year},
            )
            meetings_response.raise_for_status()

            # Substring match instead of equality: a record named e.g.
            # "Bahrain Grand Prix" can never equal "bahrain", which made the
            # lookup return None. `or ''` guards against a null meeting_name.
            # NOTE(review): OpenF1's documented meeting names follow the
            # "<Country> Grand Prix" pattern -- confirm against a live response.
            target_meeting = next(
                (
                    meeting
                    for meeting in meetings_response.json()
                    if meeting_query in (meeting.get('meeting_name') or '').lower()
                ),
                None,
            )

            if target_meeting is None:
                print(f"Error: {meeting_name} meeting not found in {year}")
                return None

            meeting_key = target_meeting['meeting_key']
            print(f"Found {meeting_name} meeting with key: {meeting_key}")

            # Step 2: Find the requested session for this meeting
            print(f"\nStep 2: Finding {session_name} session...")
            sessions_response = await client.get(
                f"{OPENF1_BASE_URL}/sessions",
                params={"meeting_key": meeting_key},
            )
            sessions_response.raise_for_status()

            # Session names are compared for equality so that a session whose
            # name merely contains the query is not picked up by accident.
            target_session = next(
                (
                    session
                    for session in sessions_response.json()
                    if (session.get('session_name') or '').lower() == session_query
                ),
                None,
            )

            if target_session is None:
                print(f"Error: {session_name} session not found for {meeting_name} {year}")
                return None

            session_key = target_session['session_key']
            print(f"Found {session_name} session with key: {session_key}")

            return session_key

        except httpx.HTTPStatusError as e:
            # Non-2xx response reported by raise_for_status()
            print(f"HTTP error occurred: {e}")
            return None
        except Exception as e:
            # Notebook-level boundary: report transport/JSON/key errors and
            # let the caller continue with None instead of a traceback.
            print(f"Error occurred: {e}")
            return None

# Execute the function. This uses `await` at the top level of the cell, so it
# relies on the notebook kernel running cells inside an event loop.
# `session_key` is None when the lookup failed; later cells check for that.
session_key = await fetch_session_key()
print(f"\nFinal session_key: {session_key}")

## Fetch Race Data

Now that we have the race session_key, let's fetch the essential data for our simulation.

In [None]:
async def fetch_race_data(session_key: str) -> Dict[str, pd.DataFrame]:
    """
    Fetch race data from OpenF1 API and return as DataFrames.

    The laps, pit and drivers endpoints are queried one after another with the
    same ``session_key`` filter, and each JSON payload is loaded into its own
    DataFrame.

    Args:
        session_key (str): The session key for the race

    Returns:
        Dict[str, pd.DataFrame]: Dictionary containing 'laps', 'pit', and
        'drivers' DataFrames. An empty dictionary is returned when any request
        or parsing step fails, so callers should read it with ``.get``.
    """
    # (result key, endpoint path, progress message) for every dataset we need.
    # Driving the requests from one table keeps the three fetches identical.
    datasets = (
        ('laps', 'laps', "\nFetching lap data..."),
        ('pit', 'pit', "Fetching pit stop data..."),
        ('drivers', 'drivers', "Fetching driver information..."),
    )

    # The lap payload covers the whole session, so allow more time than
    # httpx's default timeout before giving up on a slow response.
    async with httpx.AsyncClient(timeout=30.0) as client:
        try:
            print(f"Fetching data for session_key: {session_key}")

            frames: Dict[str, pd.DataFrame] = {}
            for result_key, endpoint, progress_message in datasets:
                print(progress_message)
                response = await client.get(
                    f"{OPENF1_BASE_URL}/{endpoint}",
                    # Let httpx encode the query string instead of
                    # interpolating the key into the URL by hand.
                    params={"session_key": session_key},
                )
                response.raise_for_status()
                frames[result_key] = pd.DataFrame(response.json())

            print("\nData fetching completed successfully!")

            return frames

        except httpx.HTTPStatusError as e:
            # Non-2xx response reported by raise_for_status()
            print(f"HTTP error occurred: {e}")
            return {}
        except Exception as e:
            # Notebook-level boundary: report the failure and hand back an
            # empty result rather than a partially filled one.
            print(f"Error occurred: {e}")
            return {}

# Execute the function
if session_key:
    race_data = await fetch_race_data(session_key)
    
    # Extract DataFrames
    laps_df = race_data.get('laps', pd.DataFrame())
    pit_df = race_data.get('pit', pd.DataFrame())
    drivers_df = race_data.get('drivers', pd.DataFrame())
else:
    print("Cannot fetch race data without a valid session_key")

## Initial Data Review

Let's examine the structure, columns, and data types of our three DataFrames.

In [None]:
# Review Lap Data: preview rows, column dtypes/non-null counts, and shape
print("=" * 50)
print("LAP DATA REVIEW")
print("=" * 50)
print("\nFirst 5 rows:")
print(laps_df.head())
print("\nDataFrame Info:")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would add a stray "None" line after the report.
laps_df.info()
print(f"\nShape: {laps_df.shape}")
print(f"Columns: {list(laps_df.columns)}")

In [None]:
# Review Pit Stop Data: preview rows, column dtypes/non-null counts, and shape
print("=" * 50)
print("PIT STOP DATA REVIEW")
print("=" * 50)
print("\nFirst 5 rows:")
print(pit_df.head())
print("\nDataFrame Info:")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would add a stray "None" line after the report.
pit_df.info()
print(f"\nShape: {pit_df.shape}")
print(f"Columns: {list(pit_df.columns)}")

In [None]:
# Review Driver Data: preview rows, column dtypes/non-null counts, and shape
print("=" * 50)
print("DRIVER DATA REVIEW")
print("=" * 50)
print("\nFirst 5 rows:")
print(drivers_df.head())
print("\nDataFrame Info:")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would add a stray "None" line after the report.
drivers_df.info()
print(f"\nShape: {drivers_df.shape}")
print(f"Columns: {list(drivers_df.columns)}")

## Summary

We have successfully:
1. Connected to the OpenF1 API
2. Found the session_key for the Bahrain Grand Prix 2023 Race
3. Fetched lap data, pit stop data, and driver information
4. Reviewed the structure and content of our datasets

This data will serve as the foundation for our F1 simulation model training.

## Feature Engineering

Now we will prepare our features for the machine learning model. Our initial model will be simple, focusing on the most important predictors of lap times.

In [None]:
# Feature Engineering
# Builds the model inputs from laps_df:
#   X_clean - lap_number, driver_number and one-hot tyre compound columns
#   y_clean - the lap time target, restricted to rows where it is present
print("Starting feature engineering...")
print(f"Initial laps DataFrame shape: {laps_df.shape}")

# Select features and target
feature_columns = ['lap_number', 'driver_number', 'tyre_compound']
target_column = 'lap_time'

# Check if required columns exist
# NOTE(review): the OpenF1 laps endpoint appears to expose the lap time as
# `lap_duration` and to keep tyre compounds on a separate stints endpoint.
# If this check fails, the column names above (and the data that is fetched)
# probably need adjusting -- confirm against the "Columns" output of the lap
# data review cell.
required_columns = feature_columns + [target_column]
missing_columns = [col for col in required_columns if col not in laps_df.columns]
if missing_columns:
    print(f"Warning: Missing columns: {missing_columns}")
    print(f"Available columns: {list(laps_df.columns)}")
    # Stop here with an explicit message: carrying on would only fail a few
    # lines further down with a less helpful KeyError from the column
    # selection (this also covers an empty laps_df after a failed download).
    raise KeyError(f"laps_df is missing required columns: {missing_columns}")

print("All required columns found!")

# Create feature DataFrame with selected columns
X = laps_df[feature_columns].copy()
y = laps_df[target_column].copy()

print(f"\nFeature DataFrame shape: {X.shape}")
print(f"Target Series shape: {y.shape}")

# One-hot encode tyre_compound (one indicator column per compound value)
print("\nOne-hot encoding tyre_compound...")
tyre_dummies = pd.get_dummies(X['tyre_compound'], prefix='tyre')
print(f"Tyre compound categories: {list(tyre_dummies.columns)}")

# Create final feature DataFrame: numeric columns followed by the indicators
X_final = pd.concat([X[['lap_number', 'driver_number']], tyre_dummies], axis=1)

print(f"\nFinal feature DataFrame shape: {X_final.shape}")
print(f"Final feature columns: {list(X_final.columns)}")

# Handle missing values
print("\nHandling missing values...")
print(f"Missing values in target: {y.isnull().sum()}")
print(f"Missing values in features: {X_final.isnull().sum().sum()}")

# Drop rows where lap_time is null. Only the target is filtered; feature
# columns keep whatever missing values they have.
valid_indices = y.notna()
X_clean = X_final[valid_indices].copy()
y_clean = y[valid_indices].copy()

print(f"\nAfter cleaning - X shape: {X_clean.shape}, y shape: {y_clean.shape}")
print(f"Data loss: {len(y) - len(y_clean)} rows dropped due to missing lap_time values")

# Display sample of final data
print("\nSample of final features:")
print(X_clean.head())
print("\nSample of target values:")
print(y_clean.head())

## Model Training

Now we will train a baseline machine learning model to predict lap times. We'll use LightGBM, which is excellent for tabular data and provides good performance with minimal hyperparameter tuning.

In [None]:
# Model Training
# Fits a LightGBM regressor on an 80/20 random split of X_clean / y_clean.
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

print("Starting model training...")
print(f"Training data shape: {X_clean.shape}")

# Split the data into training and testing sets
# (fixed random_state so the split is the same on every run)
X_train, X_test, y_train, y_test = train_test_split(
    X_clean, y_clean,
    test_size=0.2,
    random_state=42
)

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

# Initialize the model with library defaults apart from the seed
model = LGBMRegressor(
    random_state=42,
    verbose=-1  # Suppress verbose output
)

print("\nTraining LightGBM model...")
# Train the model
model.fit(X_train, y_train)

print("Model training completed!")
print(f"Number of features used: {model.n_features_in_}")
# The original f-string had a stray ")" after the closing brace, which was
# printed literally after the list of names.
print(f"Feature names: {list(model.feature_name_)}")

## Model Evaluation & Saving

Now we will evaluate the model's performance and save it for use in our API. We'll calculate the Root Mean Squared Error (RMSE) to assess prediction accuracy.

In [None]:
# Model Evaluation & Saving
# Scores the trained model on the held-out split, writes it to disk with
# joblib, and checks that the reloaded copy predicts the same value.
import joblib
import os

from sklearn.metrics import r2_score

print("Evaluating model performance...")

# Make predictions on test set
y_pred = model.predict(X_test)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error (RMSE): {rmse:.4f} seconds")

# Calculate additional metrics
mae = np.mean(np.abs(y_test - y_pred))
print(f"Mean Absolute Error (MAE): {mae:.4f} seconds")

# Calculate R-squared
r2 = r2_score(y_test, y_pred)
print(f"R-squared (R²): {r2:.4f}")

# Display some sample predictions
# y_test keeps its original row labels, so the table is indexed by them.
print("\nSample predictions vs actual:")
comparison_df = pd.DataFrame({
    'Actual': y_test.head(10),
    'Predicted': y_pred[:10],
    'Difference': y_test.head(10) - y_pred[:10]
})
print(comparison_df)

# Save the model
# The path is relative, so it resolves against the kernel's working directory.
print("\nSaving the trained model...")
model_path = "app/models/lap_time_predictor.joblib"

# Create the models directory if it doesn't exist
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Save the model
joblib.dump(model, model_path)
print(f"Model saved successfully to: {model_path}")

# Verify the model can be loaded
# joblib.load unpickles the file; that is acceptable here only because the
# file was written by this very cell.
print("\nVerifying model can be loaded...")
loaded_model = joblib.load(model_path)
test_prediction = loaded_model.predict(X_test.head(1))
print(f"Test prediction from loaded model: {test_prediction[0]:.4f} seconds")

# Only report success when the reloaded model reproduces the in-memory
# model's prediction for the same row; printing alone verifies nothing.
if not np.allclose(test_prediction, y_pred[:1]):
    raise RuntimeError(
        "Reloaded model prediction differs from the in-memory model: "
        f"{test_prediction[0]} != {y_pred[0]}"
    )
print("Model loading verification successful!")

## Final Summary

We have successfully:
1. ✅ Connected to the OpenF1 API and fetched race data
2. ✅ Engineered features for lap time prediction
3. ✅ Trained a LightGBM regression model
4. ✅ Evaluated model performance with RMSE, MAE, and R² metrics
5. ✅ Saved the trained model for API use

The baseline model is now ready to be used by our F1 What-If Simulator API for predicting lap times based on lap number, driver number, and tyre compound.