In [2]:

import fastf1
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
import xgboost as xgb
import matplotlib.pyplot as plt
from tqdm import tqdm
import os

# --- Cache Setup ---
# Ensures the cache directory exists, making the script robust.
CACHE_PATH = 'f1_US_GP'
print(f"Ensuring cache directory '{CACHE_PATH}' exists...")
os.makedirs(CACHE_PATH, exist_ok=True)
fastf1.Cache.enable_cache(CACHE_PATH)


def get_driver_track_history(driver_number, circuit_key, all_results_df):
    """Calculates the average finishing position for a driver at a specific track from past years."""
    driver_history = all_results_df[(all_results_df['CircuitKey'] == circuit_key) & (all_results_df['DriverNumber'] == driver_number)]
    return driver_history['Position'].mean() if not driver_history.empty else np.nan

def get_driver_recent_form(driver_number, current_year, current_round, all_results_df, n_races=5):
    """Calculates the average finishing position for a driver in their last n races."""
    races_before = all_results_df[((all_results_df['Year'] == current_year) & (all_results_df['Round'] < current_round)) | (all_results_df['Year'] < current_year)]
    driver_races = races_before[races_before['DriverNumber'] == driver_number].sort_values(by=['Year', 'Round'], ascending=False).head(n_races)
    return driver_races['Position'].mean() if not driver_races.empty else np.nan

# --- 1. Build Historical Dataset ---
print("\nBuilding historical race dataset for training (using 2023-2024 data)...")
all_race_features = []
historical_results_base = []
TRAINING_YEARS = range(2023, 2025)

for year in tqdm(TRAINING_YEARS, desc="Processing Years"):
    try:
        schedule = fastf1.get_event_schedule(year, include_testing=False)
        rounds_in_year = schedule['RoundNumber'].max()
    except ValueError:
        print(f"Warning: Could not load event schedule for {year}. Falling back to a max of 24 rounds.")
        rounds_in_year = 24
        
    for rnd in tqdm(range(1, int(rounds_in_year) + 1), desc=f"Processing Rounds in {year}", leave=False):
        try:
            race_session = fastf1.get_session(year, rnd, 'Race')
            race_session.load(telemetry=False, weather=False, messages=False, laps=False)
            if race_session.results is None: continue

            qual_session = fastf1.get_session(year, rnd, 'Qualifying')
            qual_session.load(telemetry=False, weather=False, messages=False, laps=False)
            if qual_session.results is None: continue
            qual_results = qual_session.results

            circuit_key = race_session.session_info['Meeting']['Circuit']['Key']
            
            race_results_df = race_session.results[['DriverNumber', 'Position', 'TeamName']].copy()
            race_results_df['Year'], race_results_df['Round'], race_results_df['CircuitKey'] = year, rnd, circuit_key
            historical_results_base.append(race_results_df)
            current_hist_df = pd.concat(historical_results_base, ignore_index=True)

            for _, race_result in race_session.results.iterrows():
                driver_num = race_result['DriverNumber']
                qual_result = qual_results[qual_results['DriverNumber'] == driver_num].iloc[0] if not qual_results[qual_results['DriverNumber'] == driver_num].empty else None
                
                if qual_result is None or pd.isna(race_result.get('Position')) or pd.isna(race_result.get('GridPosition')):
                    continue
                
                qual_time_obj = qual_result.get('Q3') or qual_result.get('Q2') or qual_result.get('Q1')
                if pd.isna(qual_time_obj):
                    continue

                features = {
                    'Year': year, 'Round': rnd, 'DriverNumber': driver_num,
                    'GridPosition': race_result['GridPosition'],
                    'QualifyingTime': qual_time_obj.total_seconds(),
                    'TrackHistory': get_driver_track_history(driver_num, circuit_key, current_hist_df[current_hist_df['Year'] < year]),
                    'RecentForm': get_driver_recent_form(driver_num, year, rnd, current_hist_df),
                    'FinishingPosition': race_result['Position']
                }
                all_race_features.append(features)
        except Exception:
            # Catches errors for non-existent rounds (e.g., trying R25) and gracefully skips them.
            continue

# --- 2. Prepare Training Data ---
if not all_race_features:
    print("\nFATAL ERROR: No valid training data could be created. This indicates a persistent connection issue.")
else:
    print(f"\nSuccessfully collected {len(all_race_features)} valid driver records.")
    training_df = pd.DataFrame(all_race_features)
    training_df['FinishingPosition'] = pd.to_numeric(training_df['FinishingPosition'], errors='coerce').fillna(20)
    training_df['RaceID'] = training_df['Year'].astype(str) + '-' + training_df['Round'].astype(str)
    training_df.sort_values(by=['Year', 'Round'], inplace=True)

    groups = training_df.groupby('RaceID').size().to_numpy()
    feature_cols = ['GridPosition', 'QualifyingTime', 'TrackHistory', 'RecentForm']
    X_train = training_df[feature_cols]
    y_train = training_df['FinishingPosition']

    imputer = SimpleImputer(strategy="median")
    X_train_imputed = imputer.fit_transform(X_train)

    # --- 3. Train the Model ---
    print("Training the XGBRanker model...")
    xgb_ranker = xgb.XGBRanker(
        objective='rank:pairwise', learning_rate=0.1, max_depth=4,
        n_estimators=400, subsample=0.8, colsample_bytree=0.8,
        random_state=42, eval_metric='ndcg@5'
    )
    xgb_ranker.fit(X_train_imputed, y_train, group=groups, verbose=False)
    print("Model training complete.")

    # --- 4. Prediction Section ---
    print("\n-----------------------------------------------------------------")
    print("The model has been successfully trained on historical data and is ready for future predictions.")
    print("You can now integrate the 2025 prediction logic when the season begins.")
    print("-----------------------------------------------------------------")

Ensuring cache directory 'f1_US_GP' exists...

Building historical race dataset for training (using 2023-2024 data)...


Processing Years:   0%|          | 0/2 [00:00<?, ?it/s]core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
core           INFO 	Finished loading data for 20 drivers: ['1', '11', '14', '55', '44', '18', '63', '77', '10', '23', '22', '2', '20', '21', '27', '24', '4', '31', '16', '81']
core           INFO 	Loading data for Bahrain Grand Prix - Qualifying [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
core           INFO 	Finished loading data for 20 drivers: ['1', '11', '16', '55', '14', '63', '44', '18', '31', '27', '4', '77', '24', '22', '23', '2', '20', '81', '21', '10']
core           INFO 	Loading data for Saudi Arabian Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
core           INFO 	Finish


Successfully collected 449 valid driver records.
Training the XGBRanker model...
Model training complete.

-----------------------------------------------------------------
The model has been successfully trained on historical data and is ready for future predictions.
You can now integrate the 2025 prediction logic when the season begins.
-----------------------------------------------------------------


In [3]:

# --- 4. PREDICTION LOGIC FOR SINGAPORE 2025 ---
print("\nStarting prediction process for the 2025 Singapore Grand Prix...")
PRED_YEAR = 2025
PRED_ROUND = 19

try:
    # Step 1: Load all race results from the ongoing 2025 season to build history
    all_history_df = pd.concat(historical_results_base, ignore_index=True) # Start with 2023-24 history
    print(f"Loading race results from the {PRED_YEAR} season (Rounds 1 to {PRED_ROUND-1})...")
    for rnd in tqdm(range(1, PRED_ROUND), desc=f"Loading {PRED_YEAR} Season Data"):
        try:
            race_session = fastf1.get_session(PRED_YEAR, rnd, 'R')
            race_session.load(telemetry=False, weather=False, messages=False, laps=False)
            if race_session.results is None: continue
            
            race_results = race_session.results[['DriverNumber', 'Position', 'TeamName']].copy()
            race_results['Year'], race_results['Round'] = PRED_YEAR, rnd
            race_results['CircuitKey'] = race_session.session_info['Meeting']['Circuit']['Key']
            all_history_df = pd.concat([all_history_df, race_results], ignore_index=True)
        except Exception:
            continue

    # Step 2: Load the actual qualifying results for the Singapore 2025 race
    print(f"Loading Qualifying results for the {PRED_YEAR} US GP...")
    qualifying_session = fastf1.get_session(PRED_YEAR, PRED_ROUND, 'Qualifying')
    qualifying_session.load(telemetry=False)
    qual_results = qualifying_session.results
    sgp_circuit_key = qualifying_session.session_info['Meeting']['Circuit']['Key']

    # Step 3: Build the feature set for each driver for this specific race
    prediction_features = []
    for _, driver_data in qual_results.iterrows():
        driver_num = driver_data['DriverNumber']
        qual_time_obj = driver_data.get('Q3') or driver_data.get('Q2') or driver_data.get('Q1')
        if pd.isna(qual_time_obj): continue

        features = {
            'DriverNumber': driver_num, 'Abbreviation': driver_data['Abbreviation'],
            'GridPosition': driver_data['Position'], 
            'QualifyingTime': qual_time_obj.total_seconds(),
            'TrackHistory': get_driver_track_history(driver_num, sgp_circuit_key, all_history_df[all_history_df['Year'] < PRED_YEAR]),
            'RecentForm': get_driver_recent_form(driver_num, PRED_YEAR, PRED_ROUND, all_history_df),
        }
        prediction_features.append(features)

    predict_df = pd.DataFrame(prediction_features)
    X_predict = predict_df[feature_cols]
    X_predict_imputed = imputer.transform(X_predict)

    # --- 5. Make and Display Final Predictions ---
    predicted_scores = xgb_ranker.predict(X_predict_imputed)
    predict_df['PredictedScore'] = predicted_scores
    
    # Monte Carlo Simulation for Probabilities
    simulations = 10000
    podium_counts = {dn: 0 for dn in predict_df['DriverNumber']}
    noise_std_dev = np.std(y_train - xgb_ranker.predict(X_train_imputed))

    print("Running Monte Carlo simulation for podium probabilities...")
    for _ in tqdm(range(simulations)):
        noise = np.random.normal(0, noise_std_dev, len(predict_df))
        sim_scores = predict_df['PredictedScore'] + noise
        sim_podium = predict_df.iloc[sim_scores.argsort()[:3]]['DriverNumber']
        for dn in sim_podium: podium_counts[dn] += 1
    
    predict_df['PodiumProb'] = predict_df['DriverNumber'].map(lambda dn: (podium_counts.get(dn, 0) / simulations) * 100)
    
    print("\n🏆 Predicted Top 3 for 2025 US Grand Prix 🏆")
    predict_df.sort_values('PredictedScore', ascending=True, inplace=True)
    for i in range(3):
        driver = predict_df.iloc[i]
        print(f"🥇 P{i+1}: {driver['Abbreviation']} with {driver['PodiumProb']:.1f}% chance of a podium finish.")

except Exception as e:
    print("\n-----------------------------------------------------------------")
    print("COULD NOT GENERATE 2025 PREDICTION.")
    print(f"An error occurred: {e}")
    print("Please ensure the 2025 US GP qualifying has occurred and data is available.")
    print("-----------------------------------------------------------------")



Starting prediction process for the 2025 Singapore Grand Prix...
Loading race results from the 2025 season (Rounds 1 to 18)...


Loading 2025 Season Data:   0%|          | 0/18 [00:00<?, ?it/s]core           INFO 	Loading data for Australian Grand Prix - Race [v3.6.1]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
core           INFO 	Finished loading data for 20 drivers: ['4', '1', '63', '12', '23', '18', '27', '16', '81', '44', '10', '22', '31', '87', '30', '5', '14', '55', '7', '6']
Loading 2025 Season Data:   6%|▌         | 1/18 [00:01<00:30,  1.80s/it]core           INFO 	Loading data for Chinese Grand Prix - Race [v3.6.1]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...
req            INFO 	Data has been written to cache!
req    

Loading Qualifying results for the 2025 US GP...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to

Running Monte Carlo simulation for podium probabilities...


100%|██████████| 10000/10000 [00:01<00:00, 5321.95it/s]


🏆 Predicted Top 3 for 2025 US Grand Prix 🏆
🥇 P1: NOR with 54.0% chance of a podium finish.
🥇 P2: VER with 40.4% chance of a podium finish.
🥇 P3: LEC with 37.5% chance of a podium finish.



