In [12]:
import pandas as pd
import numpy as np

In [13]:
import matplotlib.pyplot as plt
import seaborn as sns

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

In [16]:
from sklearn.metrics import mean_absolute_error, accuracy_score


In [18]:
import streamlit as st

In [19]:
from datetime import datetime

In [21]:
import pandas as pd
import os

In [22]:
data_folder = r'C:\Users\dunit\Desktop\VisionF1\data'

In [23]:
def load_dataset(filename):
    try:
        path = os.path.join(data_folder, filename)
        df = pd.read_csv(path)
        print(f"Successfully loaded {filename}")
        return df
    except FileNotFoundError:
        print(f"Error: {filename} not found in {data_folder}")
        return None
    except Exception as e:
        print(f"Error loading {filename}: {str(e)}")
        return None


In [24]:
results = load_dataset('results.csv')
races = load_dataset('races.csv')
drivers = load_dataset('drivers.csv')
qualifying = load_dataset('qualifying.csv')
driver_standings = load_dataset('driver_standings.csv')
constructors = load_dataset('constructors.csv')
circuits = load_dataset('circuits.csv')
constructor_standings = load_dataset('constructor_standings.csv')

Successfully loaded results.csv
Successfully loaded races.csv
Successfully loaded drivers.csv
Successfully loaded qualifying.csv
Successfully loaded driver_standings.csv
Successfully loaded constructors.csv
Successfully loaded circuits.csv
Successfully loaded constructor_standings.csv


In [25]:
loaded_datasets = {
    'Results': results,
    'Races': races,
    'Drivers': drivers,
    'Qualifying': qualifying,
    'Driver Standings': driver_standings,
    'Constructors': constructors,
    'Circuits': circuits,
    'Constructor Standings': constructor_standings
}

In [26]:

print("\nDataset Loading Summary:")
for name, df in loaded_datasets.items():
    status = "Loaded" if df is not None else "Failed to load"
    print(f"{name}: {status}")
    if df is not None:
        print(f"  Shape: {df.shape}")
        print(f"  Columns: {list(df.columns)}\n")


Dataset Loading Summary:
Results: Loaded
  Shape: (26759, 18)
  Columns: ['resultId', 'raceId', 'driverId', 'constructorId', 'number', 'grid', 'position', 'positionText', 'positionOrder', 'points', 'laps', 'time', 'milliseconds', 'fastestLap', 'rank', 'fastestLapTime', 'fastestLapSpeed', 'statusId']

Races: Loaded
  Shape: (1125, 18)
  Columns: ['raceId', 'year', 'round', 'circuitId', 'name', 'date', 'time', 'url', 'fp1_date', 'fp1_time', 'fp2_date', 'fp2_time', 'fp3_date', 'fp3_time', 'quali_date', 'quali_time', 'sprint_date', 'sprint_time']

Drivers: Loaded
  Shape: (861, 9)
  Columns: ['driverId', 'driverRef', 'number', 'code', 'forename', 'surname', 'dob', 'nationality', 'url']

Qualifying: Loaded
  Shape: (10494, 9)
  Columns: ['qualifyId', 'raceId', 'driverId', 'constructorId', 'number', 'position', 'q1', 'q2', 'q3']

Driver Standings: Loaded
  Shape: (34863, 7)
  Columns: ['driverStandingsId', 'raceId', 'driverId', 'points', 'position', 'positionText', 'wins']

Constructors: Lo

In [27]:
import pandas as pd
import numpy as np
from datetime import datetime


In [28]:
races['date'] = pd.to_datetime(races['date'])

In [29]:
# Clean results data - convert position to numeric and handle DNF/DNS/etc.
results['position'] = pd.to_numeric(results['position'], errors='coerce')
results = results.dropna(subset=['position'])

In [30]:

# Create driver names
drivers['driver_name'] = drivers['forename'] + ' ' + drivers['surname']


In [31]:
# Prepare constructor names
constructors = constructors.rename(columns={'name': 'constructor_name'})

In [32]:
circuits = circuits.rename(columns={'name': 'circuit_name'})

In [33]:
def merge_f1_data(results, races, drivers, constructors, circuits, qualifying, driver_standings):
    # Merge basic race results
    df = results.merge(races, on='raceId')
    df = df.merge(drivers[['driverId', 'driver_name']], on='driverId')
    df = df.merge(constructors[['constructorId', 'constructor_name']], on='constructorId')
    df = df.merge(circuits[['circuitId', 'circuit_name', 'country']], on='circuitId')
    
    # Add qualifying data (latest qualifying position for each race-driver)
    qualifying_latest = qualifying.sort_values(['raceId', 'driverId', 'qualifyId']).drop_duplicates(
        ['raceId', 'driverId'], keep='last')
    df = df.merge(qualifying_latest[['raceId', 'driverId', 'position']].rename(
        columns={'position': 'qualifying_position'}), on=['raceId', 'driverId'], how='left')
    
    # Add driver standings (points before the race)
    df = df.merge(driver_standings.rename(columns={
        'points': 'points_before_race',
        'position': 'standing_position_before_race',
        'wins': 'wins_before_race'
    }), on=['raceId', 'driverId'], how='left')
    
    return df

In [34]:
df = merge_f1_data(results, races, drivers, constructors, circuits, qualifying, driver_standings)

In [35]:
# Filter recent data (last 10 years)
df = df[df['year'] >= 2015]


In [36]:
# Create target variables
df['podium'] = (df['positionOrder'] <= 3).astype(int)
df['points_finish'] = (df['positionOrder'] <= 10).astype(int)


In [37]:

print("Merged dataset shape:", df.shape)
print("Columns:", df.columns.tolist())

Merged dataset shape: (3578, 47)
Columns: ['resultId', 'raceId', 'driverId', 'constructorId', 'number', 'grid', 'position', 'positionText_x', 'positionOrder', 'points', 'laps', 'time_x', 'milliseconds', 'fastestLap', 'rank', 'fastestLapTime', 'fastestLapSpeed', 'statusId', 'year', 'round', 'circuitId', 'name', 'date', 'time_y', 'url', 'fp1_date', 'fp1_time', 'fp2_date', 'fp2_time', 'fp3_date', 'fp3_time', 'quali_date', 'quali_time', 'sprint_date', 'sprint_time', 'driver_name', 'constructor_name', 'circuit_name', 'country', 'qualifying_position', 'driverStandingsId', 'points_before_race', 'standing_position_before_race', 'positionText_y', 'wins_before_race', 'podium', 'points_finish']


In [38]:

def create_rolling_features(df, group_col, window=5):
    df = df.sort_values(['year', 'round'])
    
    # Rolling average position
    df[f'{group_col}_rolling_position'] = df.groupby(group_col)['positionOrder'].transform(
        lambda x: x.rolling(window, min_periods=1).mean())
    
    # Rolling podium percentage
    df[f'{group_col}_rolling_podium_pct'] = df.groupby(group_col)['podium'].transform(
        lambda x: x.rolling(window, min_periods=1).mean())
    
    # Rolling points percentage
    df[f'{group_col}_rolling_points_pct'] = df.groupby(group_col)['points_finish'].transform(
        lambda x: x.rolling(window, min_periods=1).mean())
    
    return df

In [39]:
# Create features for drivers and constructors
df = create_rolling_features(df, 'driverId')
df = create_rolling_features(df, 'constructorId')

In [40]:

circuit_features = df.groupby(['driverId', 'circuitId']).agg(
    circuit_avg_position=('positionOrder', 'mean'),
    circuit_podium_pct=('podium', 'mean'),
    circuit_points_pct=('points_finish', 'mean')
).reset_index()

In [41]:
df = df.merge(circuit_features, on=['driverId', 'circuitId'], how='left')

In [42]:

for col in ['circuit_avg_position', 'circuit_podium_pct', 'circuit_points_pct']:
    df[col] = df.groupby('driverId')[col].transform(lambda x: x.fillna(x.mean()))

In [43]:

df['season_position_avg'] = df.groupby(['year', 'driverId'])['positionOrder'].transform(
    lambda x: x.expanding().mean())
df['season_podium_pct'] = df.groupby(['year', 'driverId'])['podium'].transform(
    lambda x: x.expanding().mean())
df['season_points_pct'] = df.groupby(['year', 'driverId'])['points_finish'].transform(
    lambda x: x.expanding().mean())

In [44]:
features = [
    'grid', 'qualifying_position',
    'driverId_rolling_position', 'driverId_rolling_podium_pct', 'driverId_rolling_points_pct',
    'constructorId_rolling_position', 'constructorId_rolling_podium_pct', 'constructorId_rolling_points_pct',
    'circuit_avg_position', 'circuit_podium_pct', 'circuit_points_pct',
    'season_position_avg', 'season_podium_pct', 'season_points_pct',
    'points_before_race', 'standing_position_before_race', 'wins_before_race',
    'constructorId'  # Will be encoded
]

In [45]:
target_position = 'positionOrder'
target_podium = 'podium'
target_points = 'points_finish'


In [46]:

X = df[features].copy()
y_position = df[target_position]
y_podium = df[target_podium]
y_points = df[target_points]


In [47]:

X = pd.get_dummies(X, columns=['constructorId'], drop_first=True)

In [48]:

train_mask = df['year'] < 2023
X_train, X_test = X[train_mask], X[~train_mask]
y_position_train, y_position_test = y_position[train_mask], y_position[~train_mask]
y_podium_train, y_podium_test = y_podium[train_mask], y_podium[~train_mask]
y_points_train, y_points_test = y_points[train_mask], y_points[~train_mask]

In [50]:
!pip install xgboost


Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.5/150.0 MB 932.9 kB/s eta 0:02:41
   ---------------------------------------- 1.0/150.0 MB 1.4 MB/s eta 0:01:50
   ---------------------------------------- 1.3/150.0 MB 1.4 MB/s eta 0:01:47
   ---------------------------------------- 1.8/150.0 MB 1.5 MB/s eta 0:01:41
    --------------------------------------- 2.6/150.0 MB 1.9 MB/s eta 0:01:19
    ------------------------------

In [51]:
from xgboost import XGBRegressor, XGBClassifier
from sklearn.metrics import mean_absolute_error, accuracy_score


In [52]:
# Train position prediction model
position_model = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    random_state=42
)
position_model.fit(X_train, y_position_train)


In [53]:
# Train podium prediction model
podium_model = XGBClassifier(
    objective='binary:logistic',
    n_estimators=100,
    random_state=42
)
podium_model.fit(X_train, y_podium_train)


In [54]:
# Train points prediction model
points_model = XGBClassifier(
    objective='binary:logistic',
    n_estimators=100,
    random_state=42
)
points_model.fit(X_train, y_points_train)

In [55]:
# Evaluate models
def evaluate_model(model, X_test, y_test, is_classifier=False):
    y_pred = model.predict(X_test)
    if is_classifier:
        return accuracy_score(y_test, y_pred.round())
    else:
        return mean_absolute_error(y_test, y_pred)


In [56]:
print(f"Position MAE: {evaluate_model(position_model, X_test, y_position_test)}")
print(f"Podium Accuracy: {evaluate_model(podium_model, X_test, y_podium_test, True)}")
print(f"Points Accuracy: {evaluate_model(points_model, X_test, y_points_test, True)}")

Position MAE: 1.8735667405105134
Podium Accuracy: 0.9498164014687882
Points Accuracy: 0.9008567931456548


In [57]:
# Current 2025 drivers and teams (updated)
current_drivers_info = {
    'Pierre Gasly': {'constructor': 'Alpine', 'driverId': -1},
    'Franco Colapinto': {'constructor': 'Alpine', 'driverId': -2},
    'Fernando Alonso': {'constructor': 'Aston Martin', 'driverId': -3},
    'Lance Stroll': {'constructor': 'Aston Martin', 'driverId': -4},
    'Charles Leclerc': {'constructor': 'Ferrari', 'driverId': -5},
    'Lewis Hamilton': {'constructor': 'Ferrari', 'driverId': -6},
    'Oliver Bearman': {'constructor': 'Haas', 'driverId': -7},
    'Esteban Ocon': {'constructor': 'Haas', 'driverId': -8},
    'Oscar Piastri': {'constructor': 'McLaren', 'driverId': -9},
    'Lando Norris': {'constructor': 'McLaren', 'driverId': -10},
    'George Russell': {'constructor': 'Mercedes', 'driverId': -11},
    'Kimi Antonelli': {'constructor': 'Mercedes', 'driverId': -12},
    'Liam Lawson': {'constructor': 'Racing Bulls', 'driverId': -13},
    'Isack Hadjar': {'constructor': 'Racing Bulls', 'driverId': -14},
    'Max Verstappen': {'constructor': 'Red Bull', 'driverId': -15},
    'Yuki Tsunoda': {'constructor': 'Red Bull', 'driverId': -16},
    'Nico Hulkenberg': {'constructor': 'Sauber', 'driverId': -17},
    'Gabriel Bortoleto': {'constructor': 'Sauber', 'driverId': -18},
    'Alex Albon': {'constructor': 'Williams', 'driverId': -19},
    'Carlos Sainz Jr': {'constructor': 'Williams', 'driverId': -20}
}


In [58]:
# Map constructor names to IDs
constructor_name_to_id = {row['constructor_name']: row['constructorId'] 
                         for _, row in constructors.iterrows()}


In [59]:
# 2025 races
races_2025 = {
    'Belgian GP': {'circuitId': 1, 'round': 13, 'date': '2025-07-27'},
    'Hungarian GP': {'circuitId': 2, 'round': 14, 'date': '2025-08-03'},
    'Dutch GP': {'circuitId': 3, 'round': 15, 'date': '2025-08-31'},
    'Italian GP': {'circuitId': 4, 'round': 16, 'date': '2025-09-07'},
    'Azerbaijan GP': {'circuitId': 5, 'round': 17, 'date': '2025-09-21'},
    'Singapore GP': {'circuitId': 6, 'round': 18, 'date': '2025-10-05'},
    'United States GP': {'circuitId': 7, 'round': 19, 'date': '2025-10-19'},
    'Mexican GP': {'circuitId': 8, 'round': 20, 'date': '2025-10-26'},
    'São Paulo GP': {'circuitId': 9, 'round': 21, 'date': '2025-11-09'},
    'Las Vegas GP': {'circuitId': 10, 'round': 22, 'date': '2025-11-22'},
    'Qatar GP': {'circuitId': 11, 'round': 23, 'date': '2025-11-30'},
    'Abu Dhabi GP': {'circuitId': 12, 'round': 24, 'date': '2025-12-07'}
}


In [60]:
def prepare_prediction_data(driver_name, race_name, df, current_drivers_info, races_2025, constructor_name_to_id):
    driver_info = current_drivers_info[driver_name]
    constructor_name = driver_info['constructor']
    constructor_id = constructor_name_to_id.get(constructor_name, -1)
    circuit_id = races_2025[race_name]['circuitId']
    
    # Get driver's historical data (most recent if available)
    driver_history = df[df['driver_name'] == driver_name].sort_values(['year', 'round'])
    
    # If no history (new driver), use average values
    if len(driver_history) == 0:
        features = {
            'grid': 10,
            'qualifying_position': 10,
            'driverId_rolling_position': df['positionOrder'].mean(),
            'driverId_rolling_podium_pct': df['podium'].mean(),
            'driverId_rolling_points_pct': df['points_finish'].mean(),
            'constructorId_rolling_position': df[df['constructorId'] == constructor_id]['positionOrder'].mean(),
            'constructorId_rolling_podium_pct': df[df['constructorId'] == constructor_id]['podium'].mean(),
            'constructorId_rolling_points_pct': df[df['constructorId'] == constructor_id]['points_finish'].mean(),
            'circuit_avg_position': df['positionOrder'].mean(),
            'circuit_podium_pct': df['podium'].mean(),
            'circuit_points_pct': df['points_finish'].mean(),
            'season_position_avg': df['positionOrder'].mean(),
            'season_podium_pct': df['podium'].mean(),
            'season_points_pct': df['points_finish'].mean(),
            'points_before_race': 0,
            'standing_position_before_race': 20,
            'wins_before_race': 0,
            'constructorId': constructor_id
        }
    else:
        # Get rolling averages from last 5 races
        last_race = driver_history.iloc[-1]
        features = {
            'grid': last_race['grid'],
            'qualifying_position': last_race.get('qualifying_position', 10),
            'driverId_rolling_position': last_race['driverId_rolling_position'],
            'driverId_rolling_podium_pct': last_race['driverId_rolling_podium_pct'],
            'driverId_rolling_points_pct': last_race['driverId_rolling_points_pct'],
            'constructorId_rolling_position': last_race['constructorId_rolling_position'],
            'constructorId_rolling_podium_pct': last_race['constructorId_rolling_podium_pct'],
            'constructorId_rolling_points_pct': last_race['constructorId_rolling_points_pct'],
            'season_position_avg': last_race['season_position_avg'],
            'season_podium_pct': last_race['season_podium_pct'],
            'season_points_pct': last_race['season_points_pct'],
            'points_before_race': last_race.get('points_before_race', 0),
            'standing_position_before_race': last_race.get('standing_position_before_race', 20),
            'wins_before_race': last_race.get('wins_before_race', 0),
            'constructorId': constructor_id
        }
        
        # Get circuit-specific features if available
        circuit_history = driver_history[driver_history['circuitId'] == circuit_id]
        if len(circuit_history) > 0:
            features.update({
                'circuit_avg_position': circuit_history['positionOrder'].mean(),
                'circuit_podium_pct': circuit_history['podium'].mean(),
                'circuit_points_pct': circuit_history['points_finish'].mean()
            })
    
    # Create DataFrame
    features_df = pd.DataFrame([features])
    
    # One-hot encode constructorId
    for c in constructors['constructorId'].unique():
        features_df[f'constructorId_{c}'] = (features_df['constructorId'] == c).astype(int)
    features_df = features_df.drop('constructorId', axis=1)
    
    # Ensure all expected columns are present
    expected_columns = X_train.columns
    for col in expected_columns:
        if col not in features_df.columns:
            features_df[col] = 0
    
    return features_df[expected_columns]

In [61]:

sample_driver = "Lewis Hamilton"
sample_race = "Belgian GP"
sample_input = prepare_prediction_data(sample_driver, sample_race, df, current_drivers_info, races_2025, constructor_name_to_id)

position_pred = position_model.predict(sample_input)[0]
podium_prob = podium_model.predict_proba(sample_input)[0][1]
points_prob = points_model.predict_proba(sample_input)[0][1]

print(f"Prediction for {sample_driver} at {sample_race}:")
print(f"  Predicted position: {round(position_pred)}")
print(f"  Podium probability: {podium_prob:.1%}")
print(f"  Points probability: {points_prob:.1%}")

Prediction for Lewis Hamilton at Belgian GP:
  Predicted position: 8
  Podium probability: 96.0%
  Points probability: 99.9%


In [62]:
import joblib
joblib.dump(position_model, 'position_model.pkl')
joblib.dump(podium_model, 'podium_model.pkl')
joblib.dump(points_model, 'points_model.pkl')

['points_model.pkl']

In [63]:
sample_driver = "Lando Norris"
sample_race = "Belgian GP"
sample_input = prepare_prediction_data(sample_driver, sample_race, df, current_drivers_info, races_2025, constructor_name_to_id)

position_pred = position_model.predict(sample_input)[0]
podium_prob = podium_model.predict_proba(sample_input)[0][1]
points_prob = points_model.predict_proba(sample_input)[0][1]

print(f"Prediction for {sample_driver} at {sample_race}:")
print(f"  Predicted position: {round(position_pred)}")
print(f"  Podium probability: {podium_prob:.1%}")
print(f"  Points probability: {points_prob:.1%}")

Prediction for Lando Norris at Belgian GP:
  Predicted position: 4
  Podium probability: 89.7%
  Points probability: 100.0%


In [64]:
sample_driver = "Oscar Piastri"
sample_race = "Belgian GP"
sample_input = prepare_prediction_data(sample_driver, sample_race, df, current_drivers_info, races_2025, constructor_name_to_id)

position_pred = position_model.predict(sample_input)[0]
podium_prob = podium_model.predict_proba(sample_input)[0][1]
points_prob = points_model.predict_proba(sample_input)[0][1]

print(f"Prediction for {sample_driver} at {sample_race}:")
print(f"  Predicted position: {round(position_pred)}")
print(f"  Podium probability: {podium_prob:.1%}")
print(f"  Points probability: {points_prob:.1%}")

Prediction for Oscar Piastri at Belgian GP:
  Predicted position: 7
  Podium probability: 3.7%
  Points probability: 100.0%


In [65]:
sample_driver = "Max Verstappen"
sample_race = "Belgian GP"
sample_input = prepare_prediction_data(sample_driver, sample_race, df, current_drivers_info, races_2025, constructor_name_to_id)

position_pred = position_model.predict(sample_input)[0]
podium_prob = podium_model.predict_proba(sample_input)[0][1]
points_prob = points_model.predict_proba(sample_input)[0][1]

print(f"Prediction for {sample_driver} at {sample_race}:")
print(f"  Predicted position: {round(position_pred)}")
print(f"  Podium probability: {podium_prob:.1%}")
print(f"  Points probability: {points_prob:.1%}")

Prediction for Max Verstappen at Belgian GP:
  Predicted position: 3
  Podium probability: 30.5%
  Points probability: 100.0%


In [1]:
pip install streamlit pandas numpy xgboost scikit-learn pillow

Note: you may need to restart the kernel to use updated packages.
