In [11]:
from pathlib import Path
import numpy as np
import pandas as pd

def _phase(progress):
    if progress < 0.33:
        return "early"
    if progress < 0.66:
        return "middle"
    return "late"

def load_default_df():
    candidates = [
        Path("fastf1_lap_dataset.csv"),
        Path("models/fastf1_lap_dataset.csv"),
        Path("driver_lap_dataset.csv"),
        Path("models/driver_lap_dataset.csv"),
    ]
    csv_path = next((p for p in candidates if p.exists()), None)
    if csv_path is None:
        raise FileNotFoundError("No lap dataset found (fastf1_lap_dataset.csv / driver_lap_dataset.csv).")
    df = pd.read_csv(csv_path)
    required = [
        "safety_car_this_lap",
        "virtual_sc_this_lap",
        "total_race_laps",
        "lap_number",
        "circuit_id",
        "year",
        "session_key",
    ]
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")
    return df

MAX_LEN_BUCKET = 12

df = load_default_df()
laps = df.copy()
laps = laps.sort_values(["session_key", "lap_number"])
laps["lap_number"] = pd.to_numeric(laps["lap_number"], errors="coerce").astype("Int64")
laps["total_race_laps"] = pd.to_numeric(laps["total_race_laps"], errors="coerce").fillna(1)
laps["race_progress"] = laps["lap_number"] / laps["total_race_laps"].replace(0, np.nan).fillna(1.0)
laps["phase"] = laps["race_progress"].apply(_phase)
laps["safety_car_this_lap"] = laps["safety_car_this_lap"].astype(bool)
laps["virtual_sc_this_lap"] = laps["virtual_sc_this_lap"].astype(bool)
laps["circuit_id"] = laps["circuit_id"].fillna("unknown").astype(str)
laps["year_key"] = laps["year"].apply(lambda v: str(int(v)) if pd.notna(v) else "unknown")

laps["state"] = np.where(
    laps["safety_car_this_lap"],
    "sc",
    np.where(laps["virtual_sc_this_lap"], "vsc", "green"),
)
prev_state = laps.groupby("session_key")["state"].shift(1)
laps["state_change"] = prev_state.ne(laps["state"]).fillna(True)
laps["stint_id"] = laps.groupby("session_key")["state_change"].cumsum()
laps["stint_len"] = laps.groupby(["session_key", "stint_id"]).cumcount() + 1
laps["stint_len"] = np.where(laps["state"] == "green", 0, laps["stint_len"])
laps["stint_bucket"] = np.where(
    laps["state"] == "green",
    0,
    np.minimum(laps["stint_len"], MAX_LEN_BUCKET),
)
laps["next_state"] = laps.groupby("session_key")["state"].shift(-1)

model_df = laps[laps["next_state"].notna()].copy()
model_df.head()


Unnamed: 0,driver_id,team_id,circuit_id,total_race_laps,year,session_name,grid_position,current_position,gap_to_leader_s,gap_to_ahead_s,...,wind_direction,race_progress,phase,year_key,state,state_change,stint_id,stint_len,stint_bucket,next_state
0,ALO,mclaren,yas_marina,55,2018,Race,15,14.0,26.062,2.134,...,272,0.018182,early,2018,sc,True,1,1,1,sc
54,BOT,mercedes,yas_marina,55,2018,Race,2,2.0,8.319,8.319,...,272,0.018182,early,2018,sc,False,1,2,2,sc
109,ERI,sauber,yas_marina,55,2018,Race,12,12.0,22.747,2.308,...,272,0.018182,early,2018,sc,False,1,3,3,sc
134,GAS,toro_rosso,yas_marina,55,2018,Race,17,13.0,23.928,1.181,...,272,0.018182,early,2018,sc,False,1,4,4,sc
181,GRO,haas_f1_team,yas_marina,55,2018,Race,7,7.0,15.236,1.04,...,272,0.018182,early,2018,sc,False,1,5,5,sc


In [12]:
import numpy as np
import pandas as pd
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

INCLUDE_YEAR = True

def build_features(frame, include_year=True):
    data = {
        "state": frame["state"].astype(str).fillna("green"),
        "stint_bucket": frame["stint_bucket"].astype(float).fillna(0),
        "race_progress": frame["race_progress"].astype(float).fillna(0.0),
        "lap_number": frame["lap_number"].astype(float).fillna(0.0),
        "circuit_id": frame["circuit_id"].astype(str).fillna("unknown"),
        "phase": frame["phase"].astype(str).fillna("early"),
    }
    if include_year:
        data["year"] = frame["year_key"].astype(str).fillna("unknown")
    return pd.DataFrame(data)

def build_pipeline(include_year=True):
    categorical = ["state", "circuit_id", "phase"]
    if include_year:
        categorical.append("year")
    numeric = ["stint_bucket", "race_progress", "lap_number"]

    preprocess = ColumnTransformer(
        [
            ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
            ("num", "passthrough", numeric),
        ]
    )
    model = LogisticRegression(max_iter=2000, multi_class="multinomial")
    return Pipeline([("preprocess", preprocess), ("model", model)])

class SafetyCarGenerativeModel:
    """Multinomial Markov transition model with next_state entrypoint."""

    def __init__(
        self,
        pipeline=None,
        include_year=True,
        max_len_bucket=12,
        rng=None,
    ):
        self.include_year = include_year
        self.max_len_bucket = int(max_len_bucket)
        self.rng = rng or np.random.default_rng()

        if pipeline is None:
            model_path = Path("models/safety_car_model.joblib")
            if not model_path.exists():
                raise FileNotFoundError("Missing models/safety_car_model.joblib. Run the training cell to export.")
            bundle = joblib.load(model_path)
            pipeline = bundle["pipeline"]
            self.include_year = bool(bundle.get("include_year", include_year))
            self.max_len_bucket = int(bundle.get("max_len_bucket", self.max_len_bucket))

        self.pipeline = pipeline

    def transition_probs(self, state, stint_len, circuit_id, year, progress, lap_number):
        stint_bucket = 0 if state == "green" else min(int(stint_len), self.max_len_bucket)
        phase = _phase(progress)
        row = {
            "state": state,
            "stint_bucket": float(stint_bucket),
            "race_progress": float(progress),
            "lap_number": float(lap_number),
            "circuit_id": str(circuit_id) if circuit_id is not None else "unknown",
            "phase": phase,
            "year": str(int(year)) if (self.include_year and year is not None and not pd.isna(year)) else "unknown",
        }
        X = pd.DataFrame([row])
        probs = self.pipeline.predict_proba(X)[0]
        class_map = dict(zip(self.pipeline.classes_, probs))
        return {
            "green": float(class_map.get("green", 0.0)),
            "vsc": float(class_map.get("vsc", 0.0)),
            "sc": float(class_map.get("sc", 0.0)),
        }

    def next_state(self, state, stint_len, circuit_id, year, progress, rng=None, lap_number=0):
        if rng is None:
            rng = self.rng or np.random.default_rng()
        probs = self.transition_probs(state, stint_len, circuit_id, year, progress, lap_number)
        r = rng.random()
        if r < probs["green"]:
            next_state = "green"
        elif r < probs["green"] + probs["vsc"]:
            next_state = "vsc"
        else:
            next_state = "sc"

        if next_state == state and state in ("vsc", "sc"):
            next_len = int(stint_len) + 1
        elif next_state in ("vsc", "sc"):
            next_len = 1
        else:
            next_len = 0
        return next_state, next_len



In [13]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import GroupShuffleSplit

X = build_features(model_df, include_year=INCLUDE_YEAR)
y = model_df["next_state"].astype(str).to_numpy()
groups = model_df["session_key"].astype(str).fillna("unknown")

gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups=groups))

X_train, y_train = X.iloc[train_idx], y[train_idx]
X_test, y_test = X.iloc[test_idx], y[test_idx]

pipeline = build_pipeline(include_year=INCLUDE_YEAR)
pipeline.fit(X_train, y_train)

test_probs = pipeline.predict_proba(X_test)
test_pred = pipeline.predict(X_test)

print(f"Test log loss: {log_loss(y_test, test_probs, labels=pipeline.classes_):.6f}")
print(f"Test accuracy: {accuracy_score(y_test, test_pred):.6f}")

sc_gen_model = SafetyCarGenerativeModel(
    pipeline=pipeline,
    include_year=INCLUDE_YEAR,
    max_len_bucket=MAX_LEN_BUCKET,
)




Test log loss: 0.052816
Test accuracy: 0.986508


In [14]:
import joblib
from pathlib import Path

model_path = Path("models/safety_car_model.joblib")
model_path.parent.mkdir(parents=True, exist_ok=True)
bundle = {
    "pipeline": pipeline,
    "include_year": INCLUDE_YEAR,
    "max_len_bucket": MAX_LEN_BUCKET,
}
joblib.dump(bundle, model_path)


['models/safety_car_model.joblib']

In [15]:
from IPython.display import display

full_X = build_features(model_df, include_year=INCLUDE_YEAR)
full_probs = pipeline.predict_proba(full_X)
classes = list(pipeline.classes_)

pred_df = model_df[["circuit_id", "next_state"]].copy()
for idx, cls in enumerate(classes):
    pred_df[f"pred_{cls}"] = full_probs[:, idx]

for cls in ["green", "vsc", "sc"]:
    col = f"pred_{cls}"
    if col not in pred_df.columns:
        pred_df[col] = 0.0

observed = pred_df["next_state"].value_counts(normalize=True)
pred_means = pred_df[[f"pred_{c}" for c in classes]].mean()
print("Observed next-state rates:")
print(observed)
print("\nMean predicted probabilities:")
print(pred_means)

pred_circuit = pred_df.groupby("circuit_id")[[f"pred_{c}" for c in classes]].mean()
obs_circuit = (
    pred_df.groupby("circuit_id")["next_state"]
    .value_counts(normalize=True)
    .unstack(fill_value=0.0)
)

compare = pred_circuit.join(obs_circuit, how="inner", rsuffix="_obs")
compare["sc_error"] = compare.get("pred_sc", 0.0) - compare.get("sc", 0.0)
compare["vsc_error"] = compare.get("pred_vsc", 0.0) - compare.get("vsc", 0.0)

print("\nLargest absolute SC errors:")
display(compare.reindex(compare["sc_error"].abs().sort_values(ascending=False).head(10).index))

print("\nLargest absolute VSC errors:")
display(compare.reindex(compare["vsc_error"].abs().sort_values(ascending=False).head(10).index))



Observed next-state rates:
next_state
green    0.97191
sc       0.02809
Name: proportion, dtype: float64

Mean predicted probabilities:
pred_green    0.972074
pred_sc       0.027926
dtype: float64

Largest absolute SC errors:


Unnamed: 0_level_0,pred_green,pred_sc,green,sc,sc_error,vsc_error
circuit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
miami,0.97576,0.02424,0.971201,0.028799,-0.004559,0.0
shanghai,0.981601,0.018399,0.977046,0.022954,-0.004555,0.0
istanbul,0.978624,0.021376,0.982448,0.017552,0.003824,0.0
singapore,0.963807,0.036193,0.967518,0.032482,0.003711,0.0
silverstone,0.972947,0.027053,0.969458,0.030542,-0.003489,0.0
mexico_city,0.970684,0.029316,0.967473,0.032527,-0.003211,0.0
le_castellet,0.968583,0.031417,0.971479,0.028521,0.002896,0.0
jeddah,0.944959,0.055041,0.947524,0.052476,0.002565,0.0
sochi,0.972199,0.027801,0.97407,0.02593,0.001871,0.0
baku,0.958415,0.041585,0.9599,0.0401,0.001485,0.0



Largest absolute VSC errors:


Unnamed: 0_level_0,pred_green,pred_sc,green,sc,sc_error,vsc_error
circuit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
austin,0.97713,0.02287,0.977215,0.022785,8.4e-05,0.0
singapore,0.963807,0.036193,0.967518,0.032482,0.003711,0.0
mugello,0.936098,0.063902,0.936306,0.063694,0.000208,0.0
nürburgring,0.959438,0.040562,0.959725,0.040275,0.000287,0.0
portimão,0.990845,0.009155,0.992098,0.007902,0.001253,0.0
sakhir,0.970847,0.029153,0.970168,0.029832,-0.000679,0.0
shanghai,0.981601,0.018399,0.977046,0.022954,-0.004555,0.0
silverstone,0.972947,0.027053,0.969458,0.030542,-0.003489,0.0
sochi,0.972199,0.027801,0.97407,0.02593,0.001871,0.0
montréal,0.972527,0.027473,0.972434,0.027566,-9.3e-05,0.0
