In [5]:
# =====================================================
# Cell 1 — Robust paths for local project layout
# Works whether you run the notebook from repo root or /server
# =====================================================
import os, json
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.model_selection import GroupShuffleSplit
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
import joblib

# Directories probed when looking for the project root, in priority order.
_CWD = Path.cwd().resolve()
_candidates = [
    _CWD,                 # the working directory itself
    _CWD.parent,          # one level up (e.g. notebook started from /server)
    _CWD.parent.parent,   # two levels up
]

def _find_project_root():
    """Return the first directory in ``_candidates`` that looks like the project root.

    Two passes are made over the module-level ``_candidates`` list:

    1. a directory containing ``data/constructors.csv``;
    2. failing that, a directory whose ``data`` folder exists and holds both
       ``results.csv`` and ``races.csv``.

    Raises:
        FileNotFoundError: when no candidate satisfies either check; the
            message lists every directory that was probed.
    """
    # Pass 1: primary marker file.
    primary = next(
        (root for root in _candidates if (root / "data" / "constructors.csv").exists()),
        None,
    )
    if primary is not None:
        return primary

    # Pass 2: fall back to a data folder holding two other expected tables.
    for root in _candidates:
        data_dir = root / "data"
        markers = (data_dir, data_dir / "results.csv", data_dir / "races.csv")
        if all(marker.exists() for marker in markers):
            return root

    searched = [str(p) for p in _candidates]
    raise FileNotFoundError(
        "Could not locate project root with a ./data folder containing the F1 CSVs.\n"
        f"Checked: {searched}"
    )

PROJECT_ROOT = _find_project_root()

# Canonical locations, all derived from the detected project root.
DATA_DIR       = PROJECT_ROOT / "data"                # input CSVs are read from here
ARTIFACTS_DIR  = PROJECT_ROOT / "artifacts"           # model pickle + schema contract
HELPER_DIR     = PROJECT_ROOT / "server" / "helper"   # JSON helper files written by later cells
MODEL_PATH     = ARTIFACTS_DIR / "finish_regressor_xgb.pkl"
SCHEMA_PATH    = ARTIFACTS_DIR / "schema_contract.json"

# Output folders are created up front so later cells can write without checks.
for _out_dir in (ARTIFACTS_DIR, HELPER_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

# The banner glyph was mis-encoded (UTF-8 bytes shown as cp1252); restored to the compass emoji.
print("🧭 Using directories:")
for _label, _location in (
    ("PROJECT_ROOT", PROJECT_ROOT),
    ("DATA_DIR", DATA_DIR),
    ("ARTIFACTS_DIR", ARTIFACTS_DIR),
    ("HELPER_DIR", HELPER_DIR),
):
    # Labels are left-padded to 13 characters so the colons line up.
    print(f"  {_label:<13}:", _location)

# Fail fast if any of the tables loaded by the next cell is absent.
_required = ["constructors.csv", "drivers.csv", "races.csv", "circuits.csv", "results.csv", "pit_stops.csv"]
missing = [f for f in _required if not (DATA_DIR / f).exists()]
if missing:
    raise FileNotFoundError(f"Missing CSVs in {DATA_DIR}: {missing}\n"
                            "Make sure your Kaggle dataset is extracted into the project's ./data folder.")


ðŸ§­ Using directories:
  PROJECT_ROOT : /Users/girithchoudhary/Documents/morro/f1/f1-prediction
  DATA_DIR     : /Users/girithchoudhary/Documents/morro/f1/f1-prediction/data
  ARTIFACTS_DIR: /Users/girithchoudhary/Documents/morro/f1/f1-prediction/artifacts
  HELPER_DIR   : /Users/girithchoudhary/Documents/morro/f1/f1-prediction/server/helper


In [6]:
# =====================================================
# 2. Load core CSVs from ./data
# =====================================================
# One DataFrame per table, read in the same order as the names on the left.
constructors, drivers, races, circuits, results, pit_stops = (
    pd.read_csv(DATA_DIR / f"{table}.csv")
    for table in ("constructors", "drivers", "races", "circuits", "results", "pit_stops")
)


In [8]:
# --- circuit_laps.json (lap counts derived from results.csv) ---
# Race length per race = highest `laps` value among result rows with positionOrder > 0.
# NOTE(review): the filter is intended to keep classified finishers only; confirm that
# positionOrder can actually be <= 0 in this dataset, otherwise it keeps every row.
race_laps = (
    results[results['positionOrder'] > 0]
    .groupby('raceId', as_index=False)
    .agg(race_laps=('laps', 'max'))
)

# Attach each race's lap count to its circuit, plus the circuit's name and country.
_laps_by_circuit = (
    races[['raceId', 'circuitId']]
    .merge(race_laps, on='raceId', how='left')
    .merge(circuits[['circuitId', 'name', 'country']], on='circuitId', how='left')
)

# Median race_laps per circuit → avgLaps; circuits with no lap data fall back
# to the median of the per-circuit values.
circuit_meta = (
    _laps_by_circuit
    .groupby(['circuitId', 'name', 'country'], as_index=False)
    .agg(avgLaps=('race_laps', 'median'))
    .rename(columns={'name': 'name_circuit'})
    .assign(avgLaps=lambda d: d['avgLaps'].fillna(d['avgLaps'].median()))
)

# Persist as a list of records for the API.
with open(HELPER_DIR / "circuit_laps.json", "w") as fh:
    json.dump(circuit_meta.to_dict(orient='records'), fh, indent=2)


In [12]:
# --- overtake_index.json ---
# Per-circuit index in [0, 1]: mean absolute grid-to-finish position change,
# averaged per race, then per circuit, then min-max scaled across circuits.
# A dedicated variable name is used to avoid re-using 'res'.
res_movement = results.merge(races[['raceId', 'circuitId', 'year']], on='raceId', how='left')

# Keep rows with a positive grid slot and a positive finishing order.
# .copy() makes the filtered frame independent, so the column added below is
# not written onto a slice of the merged frame (avoids SettingWithCopyWarning).
res_movement = res_movement[
    (res_movement['grid'] > 0) & (res_movement['positionOrder'] > 0)
].copy()
res_movement['pos_gain'] = res_movement['grid'] - res_movement['positionOrder']

# Vectorized |pos_gain| followed by a built-in groupby mean; this replaces a
# Python lambda passed to groupby.apply and the in-place rename that followed it.
race_movement = (
    res_movement
    .assign(abs_movement=res_movement['pos_gain'].abs())
    .groupby(['raceId', 'circuitId'], as_index=False)['abs_movement']
    .mean()
)

circ_movement = race_movement.groupby('circuitId', as_index=False)['abs_movement'].mean()
vmin, vmax = circ_movement['abs_movement'].min(), circ_movement['abs_movement'].max()
# The 1e-9 term keeps the division defined when every circuit has the same value.
circ_movement['overtakeIndex'] = (circ_movement['abs_movement'] - vmin) / (vmax - vmin + 1e-9)
overtake_index = circ_movement[['circuitId', 'overtakeIndex']]

# Save helper file for the API.
with open(HELPER_DIR / "overtake_index.json", "w") as f:
    json.dump(overtake_index.to_dict(orient='records'), f, indent=2)



In [13]:
# =====================================================
# 4. Car Performance Index (constructor pace proxy)
# =====================================================
# carPerformanceIndex = a constructor's summed points for a year divided by the
# highest constructor total of that same year.
# NOTE(review): totals are summed over every result row of the year, so the
# value attached to a race also reflects that race and later ones; confirm this is intended.

# Fresh base frame built from `results`; missing points count as 0.
res_points = (
    results
    .merge(races[['raceId', 'year']], on='raceId', how='left')
    .loc[:, ['year', 'constructorId', 'points']]
    .assign(points=lambda d: d['points'].fillna(0))
)

# Total points per (year, constructor).
_season_totals = (
    res_points
    .groupby(['year', 'constructorId'], as_index=False)['points']
    .sum()
    .rename(columns={'points': 'constructor_points'})
)

# Highest constructor total within each year.
season_max = (
    _season_totals
    .groupby('year', as_index=False)['constructor_points']
    .max()
    .rename(columns={'constructor_points': 'season_max_points'})
)

# Ratio to the yearly maximum; the 1e-9 term avoids dividing by zero when the
# maximum itself is 0. Only the columns merged later are kept.
season_cons_pts = (
    _season_totals
    .merge(season_max, on='year', how='left')
    .assign(
        carPerformanceIndex=lambda d: d['constructor_points'] / (d['season_max_points'] + 1e-9)
    )
    [['year', 'constructorId', 'carPerformanceIndex']]
)
season_cons_pts.head()



Unnamed: 0,year,constructorId,carPerformanceIndex
0,1950,6,0.235955
1,1950,51,1.0
2,1950,87,0.0
3,1950,105,0.123596
4,1950,107,0.0


In [14]:
# =====================================================
# 5. Pit features (count, durations, stints)
# =====================================================
# Work on a copy of pit_stops with missing durations set to 0 and cast to float.
ps = pit_stops.assign(milliseconds=pit_stops['milliseconds'].fillna(0).astype(float))

# One row per (raceId, driverId) summarising that driver's stops in that race.
_pit_summary_spec = {
    'pit_count': ('stop', 'count'),
    'pit_total_duration': ('milliseconds', 'sum'),
    'pit_avg_duration': ('milliseconds', 'mean'),
    'first_pit_lap': ('lap', 'min'),
    'last_pit_lap': ('lap', 'max'),
}
agg = ps.groupby(['raceId', 'driverId'], as_index=False).agg(**_pit_summary_spec)

def proxy_tire_score(nstops):
    """Map a pit-stop count to a fixed proxy tire score.

    Args:
        nstops: number of pit stops; may be missing (NaN/None).

    Returns:
        1.5 for a missing value or 0 stops, 2.0 for 1 stop, 2.4 for 2 stops,
        and 2.7 for any other value.
    """
    if pd.isna(nstops):
        return 1.5
    for stops, score in ((0, 1.5), (1, 2.0), (2, 2.4)):
        if nstops == stops:
            return score
    return 2.7
# Derive the proxy tire score for each (race, driver) row from its stop count.
agg['avgTireScore'] = agg['pit_count'].map(proxy_tire_score)


In [15]:
# =====================================================
# 6. Build training dataset
# =====================================================
# Start from one row per result and left-join race, circuit, overtake,
# pit-stop and constructor features onto it.
Y = (
    results
    .merge(races[['raceId', 'year', 'round', 'circuitId']], on='raceId', how='left')
    .merge(circuits[['circuitId', 'country']], on='circuitId', how='left')
    .merge(overtake_index, on='circuitId', how='left')
    .merge(agg, on=['raceId', 'driverId'], how='left')
    .merge(season_cons_pts, on=['year', 'constructorId'], how='left')
)

# Rows with no pit-stop record get 0 for the pit columns and 1.8 for the tire score.
_missing_defaults = {
    'pit_count': 0,
    'pit_total_duration': 0,
    'pit_avg_duration': 0,
    'first_pit_lap': 0,
    'last_pit_lap': 0,
    'avgTireScore': 1.8,
}
for _col, _default in _missing_defaults.items():
    Y[_col] = Y[_col].fillna(_default)

# The remaining gaps are filled with the column median.
Y['circuit_overtake_difficulty'] = Y['overtakeIndex'].fillna(Y['overtakeIndex'].median())
Y['carPerformanceIndex'] = Y['carPerformanceIndex'].fillna(Y['carPerformanceIndex'].median())

TARGET = 'positionOrder'
FEATURES = [
    'grid', 'pit_count', 'pit_total_duration', 'pit_avg_duration',
    'first_pit_lap', 'last_pit_lap', 'circuit_overtake_difficulty',
    'round', 'circuitId', 'country', 'carPerformanceIndex', 'avgTireScore',
]

# Keep rows with a positive target; the index of Y is preserved on purpose
# (the split cell uses it to look up each row's year).
dataset = Y.loc[Y[TARGET] > 0, FEATURES + [TARGET]].copy()


In [16]:
# =====================================================
# 7. Train/Validation Split
# =====================================================
# Split by season: all rows of a given year land on the same side of the split.
groups = Y['year'].loc[dataset.index]
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, valid_idx = next(gss.split(dataset, dataset[TARGET], groups))

# Positional indices from the splitter -> two frames with a fresh 0..n-1 index.
train_df, valid_df = (
    dataset.iloc[rows].reset_index(drop=True) for rows in (train_idx, valid_idx)
)

X_train = train_df.drop(columns=[TARGET])
y_train = train_df[TARGET]
X_valid = valid_df.drop(columns=[TARGET])
y_valid = valid_df[TARGET]


In [17]:
# =====================================================
# 8. XGBRegressor Pipeline (unified)
# =====================================================
# Column roles: 'country' is one-hot encoded, everything else is passed
# through unchanged (including circuitId, which is treated as a plain number).
cat_cols = ['country']
numeric_cols = [
    'grid', 'pit_count', 'pit_total_duration', 'pit_avg_duration',
    'first_pit_lap', 'last_pit_lap', 'circuit_overtake_difficulty',
    'round', 'circuitId', 'carPerformanceIndex', 'avgTireScore',
]

preprocess = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_cols),
        # Categories unseen at fit time are ignored rather than raising at predict time.
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)

_xgb_params = dict(
    n_estimators=900,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    random_state=42,
    tree_method="hist",
)
xgb = XGBRegressor(**_xgb_params)

# Preprocessing and model are bundled so the saved artifact accepts raw feature frames.
pipe = Pipeline(steps=[("prep", preprocess), ("model", xgb)])

pipe.fit(X_train, y_train)


In [21]:
# =====================================================
# 9. Evaluate (version-agnostic)
# =====================================================
# Predictions on both splits; only the validation ones are scored below.
pred_train = pipe.predict(X_train)
pred_valid = pipe.predict(X_valid)

mae = mean_absolute_error(y_valid, pred_valid)
# RMSE is taken as sqrt(MSE) so the 'squared' keyword is not needed.
mse = mean_squared_error(y_valid, pred_valid)
rmse = np.sqrt(mse)

print(f"Validation RMSE: {rmse:.3f} | MAE: {mae:.3f}")


Validation RMSE: 5.844 | MAE: 4.644


In [22]:
# Reference baselines for the validation MAE.
# mean_absolute_error and np come from the imports in the first cell; the
# duplicate mid-notebook imports that used to sit here were removed.

# Baseline 1: "finish = grid" (predict the starting slot as the finishing position).
# NOTE(review): rows with grid == 0 are included here; confirm whether 0 encodes
# a special start (e.g. pit lane) that should be excluded from this baseline.
baseline_grid_mae = mean_absolute_error(y_valid, X_valid['grid'])
print("Baseline (finish=grid) MAE:", baseline_grid_mae)

# Baseline 2: only car pace + overtake + circuit (very rough).
# Hand-picked linear score: start from 21 and subtract weighted feature values,
# then clamp the result to the range 1..20.
pseudo = (
    21
    - 10 * X_valid['carPerformanceIndex'].fillna(0.5)
    - 3  * X_valid['circuit_overtake_difficulty'].fillna(0.5)
)
pseudo = np.clip(pseudo, 1, 20)
print("Pseudo baseline MAE:", mean_absolute_error(y_valid, pseudo))


Baseline (finish=grid) MAE: 6.525088200705605
Pseudo baseline MAE: 6.701780640667841


In [23]:
# =====================================================
# 10. Save model + schema contract
# =====================================================
# Persist the fitted pipeline (preprocessing + model) as a single pickle.
joblib.dump(pipe, MODEL_PATH)

# The contract records the feature list and target name used for training.
schema_contract = {
    "feature_order_api": FEATURES,
    "target": TARGET,
    "version": "v1.0.0",
}
# SCHEMA_PATH is the constant defined with the other paths in the first cell
# (ARTIFACTS_DIR / "schema_contract.json"); use it instead of rebuilding the path.
with open(SCHEMA_PATH, "w") as f:
    json.dump(schema_contract, f, indent=2)

print("Saved model:", MODEL_PATH)


Saved model: /Users/girithchoudhary/Documents/morro/f1/f1-prediction/artifacts/finish_regressor_xgb.pkl
