# Monte Carlo race simulation

Loads the trained XGBoost lap-time model plus overtaking, DNF, and safety-car models, then runs repeated race simulations with aggregated results.

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import joblib

import ipynb.fs.defs.overtaking_model as overtaking_model
import ipynb.fs.defs.dnf_model as dnf_model
import ipynb.fs.defs.safety_car_model as safety_car_model


In [7]:
# Load bundled model and artifacts
bundle_path = Path("models/xgboost_laptime_bundle.joblib")
bundle = joblib.load(bundle_path)
model = bundle["model"]
driver_skill_map = bundle["driver_skill_map"]
circuit_cat = pd.CategoricalDtype(categories=bundle["circuit_categories"])
compound_cat = pd.CategoricalDtype(categories=bundle["compound_categories"])

# Load dataset for grid building and base lap times
csv_candidates = [Path("fastf1_lap_dataset.csv"), Path("models/fastf1_lap_dataset.csv")]
csv_path = next((p for p in csv_candidates if p.exists()), None)
if csv_path is None:
    raise FileNotFoundError("fastf1_lap_dataset.csv not found")
df = pd.read_csv(csv_path)

# Filter to clean laps for baseline pace
race_df = df[
    (~df["safety_car_this_lap"]) & (~df["virtual_sc_this_lap"]) & df["lap_time_s"].notna()
].copy()
circuit_median_map = race_df.groupby("circuit_id")["lap_time_s"].median().to_dict()

# Build a simple grid from the last session (order by first lap position)
last_session_key = df["session_key"].iloc[-1000]
grid_source = df[df["session_key"] == last_session_key]
first_lap_rows = grid_source[grid_source["lap_number"] == grid_source["lap_number"].min()]
grid_drivers = (
    first_lap_rows.sort_values("current_position")["driver_id"].drop_duplicates().tolist()
)
print(f"Grid built from session {last_session_key}: {grid_drivers}")


Grid built from session 2025_united_states_grand_prix_race: ['VER', 'LEC', 'NOR', 'HAM', 'PIA', 'RUS', 'ANT', 'BEA', 'SAI', 'TSU', 'HUL', 'ALO', 'LAW', 'GAS', 'OCO', 'BOR', 'COL', 'STR', 'HAD', 'ALB']


In [8]:
# Initialize models with shared RNG
master_rng = np.random.default_rng(0)

ot_model = overtaking_model.OvertakingModel(
    df=df,
    driver_skill_map=driver_skill_map,
    include_year=True,
    rng=master_rng,
)
hazard_model = dnf_model.DNFModel(include_year=True, rng=master_rng)
sc_gen_model = safety_car_model.SafetyCarGenerativeModel(include_year=True, rng=master_rng)


  "        laps['safety_car_this_lap'] = laps['safety_car_this_lap'].astype(bool)\n",
  "        laps['virtual_sc_this_lap'] = laps['virtual_sc_this_lap'].astype(bool)\n",


In [9]:
def format_seconds(x):
    if x is None or pd.isna(x):
        return "--"
    minutes = int(x // 60)
    seconds = x - minutes * 60
    return f"{minutes:02d}:{seconds:06.3f}"


def simulate_race(
    circuit_id,
    grid_drivers,
    total_laps=50,
    year=2025,
    global_strategy=None,
    driver_strategies=None,
    safety_car_laps=None,
    rain_laps=None,
    pit_loss=20.0,
    rng=None,
):
    """Simulate a race and return (race_log_df, safety_car_lap_set)."""

    rng = rng or np.random.default_rng()

    if global_strategy is None:
        raise ValueError("global_strategy must be provided, e.g. [(20, 'MEDIUM'), (40, 'SOFT')]")
    if driver_strategies is None:
        driver_strategies = {}

    if safety_car_laps is None:
        auto_sc_laps = set()
        state, stint_len = 'green', 0
        for lap in range(1, total_laps + 1):
            if state == 'sc':
                auto_sc_laps.add(lap)
            progress = lap / total_laps
            state, stint_len = sc_gen_model.next_state(state, stint_len, circuit_id, year, progress, rng)
        safety_car_laps = auto_sc_laps
    else:
        safety_car_laps = set(safety_car_laps)

    if rain_laps is None:
        rain_laps = set()
    else:
        rain_laps = set(rain_laps)

    base_lap = circuit_median_map.get(circuit_id)
    if base_lap is None:
        raise ValueError(f"No circuit_median_lap available for circuit_id={circuit_id!r}")

    grid_pos_map = {drv: idx + 1 for idx, drv in enumerate(grid_drivers)}

    drivers_state = []
    for idx, drv in enumerate(grid_drivers):
        strat = driver_strategies.get(drv, global_strategy)
        stops_map = {int(lap): compound for lap, compound in strat}
        drivers_state.append(
            {
                "driver_id": drv,
                "grid_position": idx + 1,
                "position": idx + 1,
                "cumul_time": float(idx * 0.3),
                "laps_on_current_tyre": 1,
                "tyre_compound": "SOFT",
                "gap_to_ahead": 0.0,
                "stops": stops_map,
                "history": [],
                "dnf": False,
            }
        )

    race_log = []

    for lap in range(1, total_laps + 1):
        prev_positions = {s["driver_id"]: s["position"] for s in drivers_state}
        drivers_by_pos = sorted(
            [s for s in drivers_state if not s.get("dnf", False)],
            key=lambda s: s["position"],
        )

        for idx, s in enumerate(drivers_by_pos):
            if idx == 0:
                s["gap_to_ahead"] = 0.0
            else:
                ahead = drivers_by_pos[idx - 1]
                s["gap_to_ahead"] = s["cumul_time"] - ahead["cumul_time"]

        rows = []
        laps_on_tyre_for_update = []
        for s in drivers_by_pos:
            laps_on_current_tyre_next = s["laps_on_current_tyre"] + 1
            race_progress = lap / total_laps
            rain_flag = 1 if lap in rain_laps else 0
            rows.append(
                {
                    "circuit_id": circuit_id,
                    "laps_on_current_tyre": laps_on_current_tyre_next,
                    "tyre_compound": s["tyre_compound"],
                    "race_progress": race_progress,
                    "rainfall": rain_flag,
                    "current_position": s["position"],
                    "gap_to_ahead_s": s["gap_to_ahead"],
                    "year": year,
                    "driver_skill": driver_skill_map.get(s["driver_id"], 0.0),
                }
            )
            laps_on_tyre_for_update.append(laps_on_current_tyre_next)

        scenario_df = pd.DataFrame(rows)
        scenario_df["circuit_id"] = scenario_df["circuit_id"].astype(circuit_cat)
        scenario_df["tyre_compound"] = scenario_df["tyre_compound"].astype(compound_cat)

        pred_deltas = model.predict(scenario_df)
        pred_deltas = np.asarray(pred_deltas, dtype=float)
        lap_times = base_lap + pred_deltas

        safety_car_active = lap in safety_car_laps
        if safety_car_active:
            lap_times = np.asarray(lap_times, dtype=float)
            leader_time = lap_times[0] * 1.35
            sc_lap_times = [leader_time]
            for idx in range(1, len(drivers_by_pos)):
                candidate = float(lap_times[idx])
                start_gap = float(drivers_by_pos[idx]["gap_to_ahead"])
                gap_end = start_gap + (candidate - sc_lap_times[idx - 1])
                if gap_end < 0.0:
                    candidate = candidate + abs(gap_end) + 0.3
                sc_lap_times.append(candidate)
            lap_times = np.array(sc_lap_times)
            pred_deltas = lap_times - float(base_lap)
            overtake_attempts = np.zeros(len(drivers_by_pos), dtype=bool)
        else:
            lap_times, pred_deltas, overtake_attempts = ot_model.apply_overtakes_for_lap(
                circuit_id=circuit_id,
                drivers_by_pos=drivers_by_pos,
                lap_times=lap_times,
                pred_deltas=pred_deltas,
                base_lap=base_lap,
                year=year,
                close_gap_threshold=1.0,
                fail_gap=0.3,
                rng=rng,
            )

        drivers_by_pos, dnfs_this_lap = hazard_model.apply_dnfs_for_lap(
            circuit_id=circuit_id,
            drivers_by_pos=drivers_by_pos,
            lap_number=lap,
            year=year,
            rng=rng,
        )

        attempts_this_lap = {
            drivers_by_pos[i]["driver_id"]: bool(overtake_attempts[i])
            for i in range(len(drivers_by_pos))
        }
        dnfs_map_this_lap = {
            drivers_by_pos[i]["driver_id"]: bool(dnfs_this_lap[i])
            for i in range(len(drivers_by_pos))
        }

        for idx, s in enumerate(drivers_by_pos):
            lap_time = float(lap_times[idx])
            delta = float(pred_deltas[idx])
            laps_on_current_tyre_next = int(laps_on_tyre_for_update[idx])

            compound_this_lap = s["tyre_compound"]
            pit_compound = s["stops"].get(lap)
            pitted = False
            if pit_compound is not None:
                lap_time += pit_loss
                pitted = True

            dnf_now = dnfs_map_this_lap.get(s["driver_id"], False)
            s["dnf"] = bool(s.get("dnf", False) or dnf_now)

            if not s["dnf"]:
                s["laps_on_current_tyre"] = laps_on_current_tyre_next
                s["cumul_time"] += lap_time

            s["history"].append(
                {
                    "lap": lap,
                    "lap_time": lap_time if not dnf_now else None,
                    "delta": delta if not dnf_now else None,
                    "tyre_compound": compound_this_lap,
                    "pitted": pitted,
                    "overtake_attempt": attempts_this_lap.get(s["driver_id"], False),
                    "dnf": dnf_now,
                }
            )

            if s["dnf"]:
                continue
            if pit_compound is not None:
                s["tyre_compound"] = pit_compound
                s["laps_on_current_tyre"] = 1

        drivers_state = sorted(
            drivers_state,
            key=lambda s: (s.get("dnf", False), s["cumul_time"], s["grid_position"]),
        )
        for pos, s in enumerate(drivers_state, start=1):
            s["position"] = pos

        leader_time = drivers_state[0]["cumul_time"]
        for s in drivers_state:
            last_lap = s["history"][-1]
            gap_to_leader = s["cumul_time"] - leader_time
            pitted = last_lap["pitted"]
            attempted = last_lap["overtake_attempt"]
            dnf_now = last_lap.get("dnf", False)
            lap_time = last_lap["lap_time"]
            delta = last_lap["delta"]

            race_log.append(
                {
                    "lap": lap,
                    "position": s["position"],
                    "driver_id": s["driver_id"],
                    "lap_time": lap_time,
                    "delta": delta,
                    "tyre_compound": last_lap["tyre_compound"],
                    "pitted": pitted,
                    "gap_to_leader": gap_to_leader,
                    "cumul_time": s["cumul_time"],
                    "overtake_attempt": attempted,
                    "dnf": dnf_now or s.get("dnf", False),
                    "pos_change_lap": prev_positions[s["driver_id"]] - s["position"],
                    "pos_change_total": grid_pos_map[s["driver_id"]] - s["position"],
                    "safety_car": safety_car_active,
                }
            )

    return pd.DataFrame(race_log), safety_car_laps


In [20]:
# Monte Carlo execution
num_runs = 20
race_length = 50
global_strategy = [(20, "MEDIUM"), (40, "SOFT")]

results = []
summary_rows = []

driver_strategies = {
     "VER": [
         (15, "SOFT"),
         (25, "HARD"),
         (26, "HARD"),
         (27, "HARD"),
         (28, "HARD"),
         (29, "HARD"),
         (30, "HARD"),
         (31, "HARD"),
         (32, "HARD"),
         (33, "HARD"),
         (34, "HARD"),
         (35, "HARD"),
         (45, "HARD"),
     ]
}

circuits = df["circuit_id"].dropna().unique().tolist()
years_by_circuit = df.groupby("circuit_id")["year"].unique().to_dict()

for run in range(num_runs):
    run_rng = np.random.default_rng(master_rng.integers(0, 1_000_000_000))
    circuit_id = run_rng.choice(circuits)
    year = int(run_rng.choice(years_by_circuit.get(circuit_id, [2025])))

    race_log, sc_laps = simulate_race(
        circuit_id=circuit_id,
        grid_drivers=grid_drivers,
        total_laps=race_length,
        year=year,
        global_strategy=global_strategy,
        driver_strategies=driver_strategies,
        safety_car_laps=None,
        rain_laps=None,
        pit_loss=20.0,
        rng=run_rng,
    )

    race_log["run"] = run
    race_log["circuit_id"] = circuit_id
    race_log["year"] = year
    results.append(race_log)

    last_lap = race_log["lap"].max()
    final_class = race_log[race_log["lap"] == last_lap].sort_values("position")
    for _, row in final_class.iterrows():
        summary_rows.append(
            {
                "run": run,
                "circuit_id": circuit_id,
                "year": year,
                "driver_id": row["driver_id"],
                "finish_pos": row["position"],
                "dnf": bool(row["dnf"]),
                "sc_laps": len(sc_laps),
            }
        )

all_logs = pd.concat(results, ignore_index=True)
summary_df = pd.DataFrame(summary_rows)

# Aggregate overview
overview = (
    summary_df.groupby("driver_id")
    .agg(
        runs=("run", "nunique"),
        wins=("finish_pos", lambda s: (s == 1).sum()),
        podiums=("finish_pos", lambda s: (s <= 3).sum()),
        avg_finish=("finish_pos", "mean"),
        dnfs=("dnf", "sum"),
    )
    .sort_values(["wins", "podiums"], ascending=[False, False])
)

print("Overview per driver:\n", overview)
print("\nSample final classification from last run:\n", summary_df[summary_df["run"] == (num_runs - 1)].sort_values("finish_pos"))


Overview per driver:
            runs  wins  podiums  avg_finish  dnfs
driver_id                                       
LEC          20     7        9        5.80     0
ANT          20     4        6        6.30     0
VER          20     3        3       16.50     0
HAM          20     2        5        7.10     0
NOR          20     1        6        7.00     1
RUS          20     1        5        8.05     0
LAW          20     1        2       10.85     0
BOR          20     1        1       13.45     0
ALB          20     0        4       10.55     0
PIA          20     0        4       10.15     5
ALO          20     0        3        9.90     1
BEA          20     0        3       10.90     1
SAI          20     0        3       11.60     3
HAD          20     0        2       13.35     2
HUL          20     0        2        9.55     0
GAS          20     0        1       11.05     1
STR          20     0        1       10.85     0
COL          20     0        0       13.30     

In [23]:
# Strategy comparison (side-by-side Monte Carlo) with per-driver overrides
strategy_a_global = [(20, 'MEDIUM'), (40, 'SOFT')]
strategy_b_global = [(20, 'MEDIUM'), (40, 'SOFT')]

# Optional per-driver overrides; leave empty to fall back to global strategy
strategy_a_driver = {}  
strategy_b_driver = {
    "VER": [(20, 'HARD')],
}

num_runs_compare = 200
summary_comp = []

for run in range(num_runs_compare):
    run_rng = np.random.default_rng(master_rng.integers(0, 1_000_000_000))
    circuit_id = run_rng.choice(circuits)
    year = int(run_rng.choice(years_by_circuit.get(circuit_id, [2025])))

    configs = [
        ('A', strategy_a_global, strategy_a_driver),
        ('B', strategy_b_global, strategy_b_driver),
    ]

    for label, glob_strat, driver_strats in configs:
        rng_run = np.random.default_rng(run_rng.integers(0, 1_000_000_000))
        race_log, sc_laps = simulate_race(
            circuit_id=circuit_id,
            grid_drivers=grid_drivers,
            total_laps=race_length,
            year=year,
            global_strategy=glob_strat,
            driver_strategies=driver_strats,
            safety_car_laps=None,
            rain_laps=None,
            pit_loss=20.0,
            rng=rng_run,
        )
        last_lap = race_log['lap'].max()
        final_class = race_log[race_log['lap'] == last_lap].sort_values('position')
        for _, row in final_class.iterrows():
            summary_comp.append({
                'run': run,
                'strategy': label,
                'circuit_id': circuit_id,
                'year': year,
                'driver_id': row['driver_id'],
                'finish_pos': row['position'],
                'dnf': bool(row['dnf']),
                'sc_laps': len(sc_laps),
            })

summary_comp_df = pd.DataFrame(summary_comp)

wins = summary_comp_df[summary_comp_df['finish_pos'] == 1].groupby('strategy')['driver_id'].count()
avg_finish = summary_comp_df.groupby(['driver_id', 'strategy'])['finish_pos'].mean().unstack()
avg_finish['delta_B_minus_A'] = avg_finish.get('B', np.nan) - avg_finish.get('A', np.nan)

print('Wins per strategy:', wins)
print('Average finish per driver (A vs B, lower is better): ', avg_finish.sort_values('delta_B_minus_A'))


Wins per strategy: strategy
A    200
B    200
Name: driver_id, dtype: int64
Average finish per driver (A vs B, lower is better):  strategy        A       B  delta_B_minus_A
driver_id                                 
VER         6.110   3.965           -2.145
OCO        12.445  11.985           -0.460
PIA         8.600   8.310           -0.290
HAD        12.425  12.175           -0.250
TSU        11.805  11.650           -0.155
STR        11.815  11.680           -0.135
COL        14.845  14.780           -0.065
LAW        12.660  12.610           -0.050
LEC         7.320   7.325            0.005
SAI        10.295  10.315            0.020
ALB        11.420  11.525            0.105
BEA        11.045  11.180            0.135
HAM         8.610   8.750            0.140
GAS        11.325  11.485            0.160
HUL        11.995  12.300            0.305
BOR        13.445  13.815            0.370
RUS         8.170   8.550            0.380
ANT         8.140   8.685            0.545
ALO       