In [231]:
# =====================================================
# Cell 1 â€” Robust paths for local project layout
# Works whether you run the notebook from repo root or /server
# =====================================================
import os, json
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.model_selection import GroupShuffleSplit
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
import joblib

# Candidate roots to search (current dir, parent, grandparent)
_CWD = Path.cwd().resolve()
_candidates = [_CWD, _CWD.parent, _CWD.parent.parent]

def _find_project_root():
    for base in _candidates:
        if (base / "data" / "constructors.csv").exists():
            return base
    # last resort: look for a 'data' dir containing a few expected files
    for base in _candidates:
        d = base / "data"
        if d.exists() and (d / "results.csv").exists() and (d / "races.csv").exists():
            return base
    raise FileNotFoundError(
        "Could not locate project root with a ./data folder containing the F1 CSVs.\n"
        f"Checked: {[str(p) for p in _candidates]}"
    )

PROJECT_ROOT = _find_project_root()

# Define canonical paths relative to project root
DATA_DIR       = PROJECT_ROOT / "data"               # <-- your CSVs live here
ARTIFACTS_DIR  = PROJECT_ROOT / "artifacts"
HELPER_DIR     = PROJECT_ROOT / "server" / "helper"
MODEL_PATH     = ARTIFACTS_DIR / "finish_regressor_xgb.pkl"
SCHEMA_PATH    = ARTIFACTS_DIR / "schema_contract.json"

# Create output dirs if needed
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)
HELPER_DIR.mkdir(parents=True, exist_ok=True)

print("ðŸ§­ Using directories:")
print("  PROJECT_ROOT :", PROJECT_ROOT)
print("  DATA_DIR     :", DATA_DIR)
print("  ARTIFACTS_DIR:", ARTIFACTS_DIR)
print("  HELPER_DIR   :", HELPER_DIR)

# Quick guard: ensure a few core CSVs exist
_required = ["constructors.csv", "drivers.csv", "races.csv", "circuits.csv", "results.csv", "pit_stops.csv"]
missing = [f for f in _required if not (DATA_DIR / f).exists()]
if missing:
    raise FileNotFoundError(f"Missing CSVs in {DATA_DIR}: {missing}\n"
                            "Make sure your Kaggle dataset is extracted into the project's ./data folder.")


ðŸ§­ Using directories:
  PROJECT_ROOT : /Users/girithchoudhary/Documents/morro/f1/f1-prediction
  DATA_DIR     : /Users/girithchoudhary/Documents/morro/f1/f1-prediction/data
  ARTIFACTS_DIR: /Users/girithchoudhary/Documents/morro/f1/f1-prediction/artifacts
  HELPER_DIR   : /Users/girithchoudhary/Documents/morro/f1/f1-prediction/server/helper


In [232]:
# =====================================================
# 2. Load core CSVs from ./data
# =====================================================
constructors = pd.read_csv(DATA_DIR / "constructors.csv")
drivers      = pd.read_csv(DATA_DIR / "drivers.csv")
races        = pd.read_csv(DATA_DIR / "races.csv")
circuits     = pd.read_csv(DATA_DIR / "circuits.csv")
results      = pd.read_csv(DATA_DIR / "results.csv")
pit_stops    = pd.read_csv(DATA_DIR / "pit_stops.csv")


In [233]:
# --- circuit_laps.json (robust; derives laps from results.csv) ---
# For each race, compute the laps as the maximum laps completed by any classified finisher.
race_laps = (
    results[results['positionOrder'] > 0]           # only classified finishes
        .groupby('raceId', as_index=False)['laps']
        .max()
        .rename(columns={'laps': 'race_laps'})
)

# Join to races to get circuitId, then to circuits for name/country
circuit_meta = (
    races[['raceId', 'circuitId']]
        .merge(race_laps, on='raceId', how='left')
        .merge(circuits[['circuitId', 'name', 'country']], on='circuitId', how='left')
)

# Median race_laps per circuit â†’ avgLaps
circuit_meta = (
    circuit_meta
        .groupby(['circuitId', 'name', 'country'], as_index=False)['race_laps']
        .median()
        .rename(columns={'race_laps': 'avgLaps', 'name': 'name_circuit'})
)

# Fill any missing with overall median as a fallback
circuit_meta['avgLaps'] = circuit_meta['avgLaps'].fillna(circuit_meta['avgLaps'].median())

# Save for API
with open(HELPER_DIR / "circuit_laps.json", "w") as f:
    json.dump(circuit_meta.to_dict(orient='records'), f, indent=2)


In [234]:
# --- overtake_index.json ---
# use a dedicated variable name to avoid re-using 'res'
res_movement = results.merge(races[['raceId','circuitId','year']], on='raceId', how='left')
res_movement = res_movement[(res_movement['grid'] > 0) & (res_movement['positionOrder'] > 0)]
res_movement['pos_gain'] = res_movement['grid'] - res_movement['positionOrder']

race_movement = (res_movement.groupby(['raceId','circuitId'], as_index=False)['pos_gain']
                   .apply(lambda s: float(np.mean(np.abs(s)))))
race_movement.rename(columns={'pos_gain':'abs_movement'}, inplace=True)

circ_movement = race_movement.groupby('circuitId', as_index=False)['abs_movement'].mean()
vmin, vmax = circ_movement['abs_movement'].min(), circ_movement['abs_movement'].max()
circ_movement['overtakeIndex'] = (circ_movement['abs_movement'] - vmin) / (vmax - vmin + 1e-9)
overtake_index = circ_movement[['circuitId','overtakeIndex']]

# save helper file
with open(HELPER_DIR / "overtake_index.json", "w") as f:
    json.dump(overtake_index.to_dict(orient='records'), f, indent=2)



In [235]:
# =====================================================
# Car Performance Index â€” Qualifying *time*-based (constructor/year)
# =====================================================
qualifying = pd.read_csv(DATA_DIR / "qualifying.csv")

def _to_ms(x):
    if pd.isna(x): 
        return np.nan
    s = str(x).strip()
    try:
        if ":" in s:
            m, rest = s.split(":")
            return (int(m) * 60.0 + float(rest)) * 1000.0
        return float(s) * 1000.0
    except:
        return np.nan

# Convert q1/q2/q3 to milliseconds and take the best per driver
for col in ["q1", "q2", "q3"]:
    qualifying[col + "_ms"] = qualifying[col].map(_to_ms)
qualifying["bestQ_ms"] = qualifying[["q1_ms", "q2_ms", "q3_ms"]].min(axis=1)

# Attach constructorId to each qualifying row (qualifying doesnâ€™t have it)
drv_cons = results[["raceId", "driverId", "constructorId"]].drop_duplicates()
q = (qualifying
     .merge(drv_cons, on=["raceId", "driverId"], how="left")
     .merge(races[["raceId", "year"]], on="raceId", how="left"))

# Ensure a single constructorId column after merges
if "constructorId" in q.columns:
    pass  # already good (comes from qualifying.csv)
else:
    # If duplicates exist, prefer qualifying's (usually _x), then fill from results' (_y)
    cand_x = "constructorId_x" if "constructorId_x" in q.columns else None
    cand_y = "constructorId_y" if "constructorId_y" in q.columns else None

    if cand_x or cand_y:
        q["constructorId"] = np.nan
        if cand_x:
            q["constructorId"] = q[cand_x]
        if cand_y:
            q["constructorId"] = q["constructorId"].fillna(q[cand_y])

        # clean up extras
        drop_cols = [c for c in [cand_x, cand_y] if c]
        q.drop(columns=drop_cols, inplace=True)
    else:
        raise RuntimeError("constructorId not found after merges; check input files/merges.")


# For each race & team, keep the team's best qualifying time (fastest driver of that team)
# IMPORTANT: include 'year' in the group keys so we don't lose it
team_best = (q.dropna(subset=["bestQ_ms"])
               .groupby(["raceId", "year", "constructorId"], as_index=False)["bestQ_ms"]
               .min())

# For each season, derive a constructor pace index from median of race-best times
cons_season = (team_best
               .groupby(["year", "constructorId"], as_index=False)["bestQ_ms"]
               .median()
               .rename(columns={"bestQ_ms": "med_bestQ_ms"}))

# Season-wise min-max to [0,1], where 1.0 = fastest in that season
season_minmax = cons_season.groupby("year")["med_bestQ_ms"].agg(["min", "max"]).reset_index()
cons_season = cons_season.merge(season_minmax, on="year", how="left")
rng = (cons_season["max"] - cons_season["min"]).replace(0, 1.0)
cons_season["carPerformanceIndex"] = 1.0 - ((cons_season["med_bestQ_ms"] - cons_season["min"]) / rng)

# Definitive CPI table for downstream merges
season_cons_pts = cons_season[["year", "constructorId", "carPerformanceIndex"]].copy()



In [236]:
# =====================================================
# 5. Pit features (count, durations, stints)
# =====================================================
ps = pit_stops.copy()
ps['milliseconds'] = ps['milliseconds'].fillna(0).astype(float)
agg = (ps.groupby(['raceId','driverId'], as_index=False)
         .agg(pit_count=('stop','count'),
              pit_total_duration=('milliseconds','sum'),
              pit_avg_duration=('milliseconds','mean'),
              first_pit_lap=('lap','min'),
              last_pit_lap=('lap','max')))

def proxy_tire_score(nstops):
    if pd.isna(nstops) or nstops == 0: return 1.5
    if nstops == 1: return 2.0
    if nstops == 2: return 2.4
    return 2.7
agg['avgTireScore'] = agg['pit_count'].apply(proxy_tire_score)

# --- ðŸ‘‡ ADD CHANGE 4 here ---
# Tire strategy aggressiveness proxy
agg['stints'] = (agg['pit_count'].fillna(0) + 1).clip(1, 5)
agg['tire_aggr_index'] = agg['stints'] / agg['pit_total_duration'].replace(0, np.nan)
agg['tire_aggr_index'] = agg['tire_aggr_index'].fillna(agg['tire_aggr_index'].median())
# --- ðŸ‘† END CHANGE 4 ---


In [237]:
# =====================================================
# 6. Build training dataset
# =====================================================
Y = results.merge(races[['raceId','year','round','circuitId']], on='raceId', how='left')
Y = Y.merge(circuits[['circuitId','country']], on='circuitId', how='left')
Y = Y.merge(overtake_index, on='circuitId', how='left')
Y = Y.merge(agg, on=['raceId','driverId'], how='left')
Y = Y.merge(season_cons_pts, on=['year','constructorId'], how='left')

# Fill missing
for c in ['pit_count','pit_total_duration','pit_avg_duration','first_pit_lap','last_pit_lap','avgTireScore']:
    Y[c] = Y[c].fillna(0 if c!='avgTireScore' else 1.8)
Y['circuit_overtake_difficulty'] = Y['overtakeIndex'].fillna(Y['overtakeIndex'].median())
Y['carPerformanceIndex'] = Y['carPerformanceIndex'].fillna(Y['carPerformanceIndex'].median())

# --- ðŸ‘‡ ADD CHANGE 3 here ---
# first_stop_delta = normalized pit timing (how early the first stop is vs race length)
Y = Y.merge(circuit_meta[['circuitId','avgLaps']], on='circuitId', how='left')
Y['first_stop_delta'] = np.where(
    (Y['avgLaps'].notna()) & (Y['first_pit_lap'] > 0),
    Y['first_pit_lap'] / Y['avgLaps'],
    0.0
)
# --- ðŸ‘† END CHANGE 3 ---

rounds_per_year = Y.groupby('year', as_index=False)['round'].max().rename(columns={'round':'round_max'})
Y = Y.merge(rounds_per_year, on='year', how='left')
Y['season_progress'] = (Y['round'] - 1) / (Y['round_max'] - 1 + 1e-9)

TARGET = 'positionOrder'
FEATURES = [
 'grid','pit_count','pit_total_duration','pit_avg_duration',
 'first_pit_lap','last_pit_lap','circuit_overtake_difficulty',
 'round','circuitId','country','carPerformanceIndex','avgTireScore',
 'season_progress','first_stop_delta', 'tire_aggr_index'   # ðŸ‘ˆ add new feature here
]
dataset = Y[Y[TARGET] > 0][FEATURES + [TARGET]].copy()


In [238]:
# =====================================================
# 6b. Tire strategy features (per driver/race)
# =====================================================
pit_stops  = pd.read_csv(DATA_DIR / "pit_stops.csv")   # raceId, driverId, lap, duration, etc.
lap_times  = pd.read_csv(DATA_DIR / "lap_times.csv")   # raceId, driverId, lap, position, time
# Some Kaggle dumps include 'compound' in other tables; if you donâ€™t have it, skip compound shares.

# Stint count = (#pit_stops + 1)
stints = (pit_stops.groupby(["raceId","driverId"], as_index=False)["stop"]
                   .count().rename(columns={"stop":"pitStops"}))
stints["tireStints"] = stints["pitStops"] + 1

# Average pit duration (ms)
pit_stops["duration_ms"] = pd.to_numeric(pit_stops["milliseconds"], errors="coerce")
pit_agg = (pit_stops.groupby(["raceId","driverId"], as_index=False)["duration_ms"]
                   .mean().rename(columns={"duration_ms":"avgPitMs"}))

# Optional: Compound shares per race (if you have compound per lap or per stint)
# If not available, set zeros; your server can default to 0
tire_feats = stints.merge(pit_agg, on=["raceId","driverId"], how="outer")
tire_feats["avgPitMs"] = tire_feats["avgPitMs"].fillna(0)
tire_feats["tireStints"] = tire_feats["tireStints"].fillna(1)


In [239]:
# =====================================================
# Target cleanup & outliers
# =====================================================
Y = Y.copy()

# Keep only classified finishers with real positions (1..20)
Y = Y[(Y["positionOrder"].notna()) & (Y["positionOrder"] > 0)]
Y["finish_pos"] = Y["positionOrder"].clip(1, 20)

# Remove grid==0 entries (no proper start position)
Y = Y[Y["grid"] > 0]

# (Optional) drop DNS/DNF by status if you joined 'status.csv'
# status = pd.read_csv(DATA_DIR / "status.csv")
# Y = Y.merge(status[['statusId','status']], on='statusId', how='left')
# Y = Y[~Y['status'].str.contains("DNF|DSQ|DNS", na=False)]

# Light winsorize on pit durations
if "avgPitMs" in Y.columns:
    Y["avgPitMs"] = Y["avgPitMs"].clip(
        lower=Y["avgPitMs"].quantile(0.01),
        upper=Y["avgPitMs"].quantile(0.99)
    )

# ðŸš¦ Filter chaotic races: require at least 16 classified finishers in the race
finishers = (results.assign(classified=(results['positionOrder'] > 0).astype(int))
                    .groupby('raceId', as_index=False)['classified'].sum()
                    .rename(columns={'classified': 'n_finishers'}))
Y = Y.merge(finishers, on='raceId', how='left')
Y = Y[Y['n_finishers'] >= 16]


# (If you also filter to modern era, do it AFTER this)
# Y = Y[Y['year'] >= 2014]


In [240]:
# Restrict to modern era (Hybrid + DRS era)
Y = Y[Y['year'] >= 2014]


In [241]:
# =====================================================
# 9. Train/valid split (grouped by year)
# =====================================================
from sklearn.model_selection import GroupShuffleSplit

feature_cols = [
    "grid","pit_count","pit_total_duration","pit_avg_duration",
    "first_pit_lap","last_pit_lap",
    "circuit_overtake_difficulty","round","circuitId","country",
    "carPerformanceIndex","tireStints","avgPitMs",
    "first_stop_delta", "tire_aggr_index"  # <--- ADD THESE
]

# Keep only columns that actually exist
feature_cols = [c for c in feature_cols if c in Y.columns]
X = Y[feature_cols].copy()
y = Y["finish_pos"].astype(float)
groups = Y["raceId"]

gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, valid_idx = next(gss.split(X, y, groups))

X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]


In [242]:
# =====================================================
# 10. Model: ColumnTransformer + xgb.train (MAE + early stopping on old xgboost)
# =====================================================
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from xgboost import DMatrix, train as xgb_train
import xgboost as xgb
import numpy as np

numeric_features = [c for c in feature_cols if c not in ["country"]]
categorical_features = [c for c in feature_cols if c in ["country"]]

preprocess = ColumnTransformer(
    transformers=[
        ("num","passthrough", numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ],
    remainder="drop"
)

# Fit preprocess, transform splits
prep_fitted = preprocess.fit(X_train, y_train)
X_train_t = prep_fitted.transform(X_train)
X_valid_t = prep_fitted.transform(X_valid)

dtrain = DMatrix(X_train_t, label=y_train.values)
dvalid = DMatrix(X_valid_t, label=y_valid.values)

# monotonicity: higher grid -> worse finish
mono = [1 if col == "grid" else 0 for col in numeric_features]
monotone_str = "(" + ",".join(str(v) for v in mono) + ")"

params = {
    "objective": "reg:absoluteerror",  # MAE
    "eta": 0.03,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "min_child_weight": 4,
    "alpha": 1.0,
    "lambda": 3.0,
    "tree_method": "hist",
    "seed": 42,
    "monotone_constraints": monotone_str,
    "eval_metric": "mae",
}

watchlist = [(dtrain, "train"), (dvalid, "valid")]
booster = xgb_train(
    params,
    dtrain,
    num_boost_round=2000,
    evals=watchlist,
    early_stopping_rounds=100,
    verbose_eval=False
)

# A tiny wrapper so we can reuse in the rest of the notebook like a sklearn estimator
class BoosterWrapper:
    def __init__(self, booster, preprocessor):
        self.booster = booster
        self.preprocessor = preprocessor
    def predict(self, X):
        Xt = self.preprocessor.transform(X)
        return self.booster.predict(DMatrix(Xt))

# 'pipe' compatible object with .predict(X)
pipe = BoosterWrapper(booster, prep_fitted)


In [243]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# RAW features only â€” the wrapper/pipeline will transform internally
pred_valid_raw = pipe.predict(X_valid)
pred_valid = np.clip(pred_valid_raw, 1, 20)

mae  = mean_absolute_error(y_valid, pred_valid)
rmse = np.sqrt(mean_squared_error(y_valid, pred_valid))
print(f"Validation MAE: {mae:.3f} | RMSE: {rmse:.3f}")

baseline_grid = np.clip(X_valid["grid"].values, 1, 20)
print("Baseline (finishâ‰ˆgrid) MAE:", mean_absolute_error(y_valid, baseline_grid))


Validation MAE: 2.748 | RMSE: 3.880
Baseline (finishâ‰ˆgrid) MAE: 3.5421286031042127


In [244]:
# =====================================================
# Feature importances (works for Pipeline, XGBRegressor, or Booster)
# =====================================================
import numpy as np

# 1) Identify model (Pipeline or raw Booster)
mdl = None
prep = None
try:
    # If you trained a Pipeline: pipe = Pipeline([("prep", preprocess), ("model", xgb)])
    mdl = pipe.named_steps["model"]
    prep = pipe.named_steps.get("prep", None)
except Exception:
    # If you stored the raw model in `pipe`
    mdl = pipe
    try:
        prep = preprocess  # your ColumnTransformer, if you kept it in a variable named 'preprocess'
    except NameError:
        prep = None

# 2) Get underlying Booster
try:
    booster = mdl.get_booster()     # XGBRegressor -> Booster
except Exception:
    booster = getattr(mdl, "booster", None) or mdl  # sometimes it's already a Booster

# 3) Importance dict (try several types)
imp = booster.get_score(importance_type="gain")
if not imp:
    imp = booster.get_score(importance_type="weight")
if not imp:
    imp = booster.get_score(importance_type="cover")
if not imp:
    raise ValueError("Booster returned empty importance dict. Make sure the model is fitted.")

# 4) Feature names
feat_names = None
if prep is not None:
    try:
        feat_names = list(prep.get_feature_names_out())
    except Exception:
        pass

if feat_names is None:
    # Infer count from keys (f0..fN-1), else fall back to length of dict
    try:
        max_idx = max(int(k[1:]) for k in imp.keys() if str(k).startswith("f") and str(k[1:]).isdigit())
        n_feats = max_idx + 1
    except Exception:
        # last resort: if X_train is in scope and has a column count, use that
        if "X_train" in globals():
            n_feats = X_train.shape[1]
        else:
            n_feats = len(imp)
    feat_names = [f"f{i}" for i in range(n_feats)]

# 5) Map importance to a dense array aligned to feature indices
scores = np.zeros(len(feat_names), dtype=float)
for k, v in imp.items():
    if k.startswith("f") and k[1:].isdigit():
        idx = int(k[1:])
        if 0 <= idx < len(scores):
            scores[idx] = float(v)
    else:
        # sometimes keys are real feature names (rare with Booster)
        try:
            idx = feat_names.index(k)
            scores[idx] = float(v)
        except ValueError:
            pass

# Normalize for readability (optional)
total = scores.sum()
if total > 0:
    scores = scores / total

pairs = list(zip(feat_names, scores))
pairs.sort(key=lambda x: x[1], reverse=True)
top = pairs[:20]

print("\nTop features:")
for n, w in top:
    print(f"{n:40s} {w:.4f}")



Top features:
num__grid                                0.2041
num__pit_count                           0.0621
num__last_pit_lap                        0.0545
num__carPerformanceIndex                 0.0471
num__tire_aggr_index                     0.0306
num__first_stop_delta                    0.0273
num__first_pit_lap                       0.0263
num__pit_avg_duration                    0.0231
cat__country_Belgium                     0.0215
cat__country_Netherlands                 0.0203
cat__country_Australia                   0.0201
num__pit_total_duration                  0.0195
cat__country_Qatar                       0.0194
cat__country_Mexico                      0.0192
num__circuitId                           0.0192
cat__country_Spain                       0.0190
cat__country_Singapore                   0.0183
cat__country_China                       0.0182
cat__country_Germany                     0.0179
num__circuit_overtake_difficulty         0.0178


In [245]:
from sklearn.metrics import mean_absolute_error

# Baseline 1: "finish = grid"
baseline_grid_mae = mean_absolute_error(y_valid, X_valid['grid'])
print("Baseline (finish=grid) MAE:", baseline_grid_mae)

# Baseline 2: only car pace + overtake + circuit (very rough)
import numpy as np
pseudo = (
    21
    - 10 * X_valid['carPerformanceIndex'].fillna(0.5)
    - 3  * X_valid['circuit_overtake_difficulty'].fillna(0.5)
)
pseudo = np.clip(pseudo, 1, 20)
print("Pseudo baseline MAE:", mean_absolute_error(y_valid, pseudo))


Baseline (finish=grid) MAE: 3.549889135254989
Pseudo baseline MAE: 5.3046575982538355


In [246]:
# =====================================================
# 12. Save artifacts & helpers
# =====================================================
from joblib import dump
import json

ARTIFACTS_DIR = Path("./artifacts")
ARTIFACTS_DIR.mkdir(exist_ok=True, parents=True)

# pack a tiny schema alongside the model (what server must provide)
serve_schema = {
    "numeric": numeric_features,       # <--- Changed from numeric_cols
    "categorical": categorical_features # <--- Changed from cat_cols
}

with open(ARTIFACTS_DIR / "serve_schema.json", "w") as f:
    json.dump(serve_schema, f, indent=2)

# save model (keep same filename if server expects it)
dump(pipe, ARTIFACTS_DIR / "finish_regressor_xgb_v2.pkl")
dump(pipe, ARTIFACTS_DIR / "finish_regressor_xgb.pkl")  # overwrite current for deployment
print("Saved model + serve_schema.json")

Saved model + serve_schema.json
