# Monte Carlo race simulation

Loads the trained XGBoost lap-time model plus overtaking, DNF, and safety-car models, then runs repeated race simulations with aggregated results.

In [34]:
from pathlib import Path
import numpy as np
import pandas as pd
import joblib

import ipynb.fs.defs.overtaking_model as overtaking_model
import ipynb.fs.defs.dnf_model as dnf_model
import ipynb.fs.defs.safety_car_model as safety_car_model

import ipynb.fs.defs.weather_model as weather_model


In [35]:
# Load bundled model and artifacts
bundle_path = Path("models/xgboost_laptime_bundle.joblib")
bundle = joblib.load(bundle_path)
model = bundle["model"]
driver_skill_map = bundle["driver_skill_map"]
circuit_cat = pd.CategoricalDtype(categories=bundle["circuit_categories"])
compound_cat = pd.CategoricalDtype(categories=bundle["compound_categories"])

# Override threshold from bundle when available
PACK_THRESHOLD = float(bundle.get("pack_threshold", 2.0))

# Load dataset for grid building and base lap times
csv_candidates = [Path("fastf1_lap_dataset.csv"), Path("models/fastf1_lap_dataset.csv")]
csv_path = next((p for p in csv_candidates if p.exists()), None)
if csv_path is None:
    raise FileNotFoundError("fastf1_lap_dataset.csv not found")
df = pd.read_csv(csv_path)

# Filter to clean laps for baseline pace
race_df = df[
    (~df["safety_car_this_lap"]) & (~df["virtual_sc_this_lap"]) & df["lap_time_s"].notna()
].copy()
circuit_median_map = race_df.groupby("circuit_id")["lap_time_s"].median().to_dict()

# Build a simple grid from the last session (order by first lap position)
last_session_key = df["session_key"].iloc[-1000]
grid_source = df[df["session_key"] == last_session_key]
first_lap_rows = grid_source[grid_source["lap_number"] == grid_source["lap_number"].min()]
grid_drivers = (
    first_lap_rows.sort_values("current_position")["driver_id"].drop_duplicates().tolist()
)
print(f"Grid built from session {last_session_key}: {grid_drivers}")


Grid built from session 2025_united_states_grand_prix_race: ['VER', 'LEC', 'NOR', 'HAM', 'PIA', 'RUS', 'ANT', 'BEA', 'SAI', 'TSU', 'HUL', 'ALO', 'LAW', 'GAS', 'OCO', 'BOR', 'COL', 'STR', 'HAD', 'ALB']


In [36]:
# Initialize shared RNG and helper models
master_rng = np.random.default_rng(12345)

ot_model = overtaking_model.OvertakingModel(
    df=df,
    driver_skill_map=driver_skill_map,
    include_year=True,
    rng=master_rng,
)
hazard_model = dnf_model.DNFModel(include_year=True, rng=master_rng)
sc_gen_model = safety_car_model.SafetyCarGenerativeModel(include_year=True, rng=master_rng)
weather_gen_model = weather_model.WeatherModel(rng=master_rng)


  "        laps['safety_car_this_lap'] = laps['safety_car_this_lap'].astype(bool)\n",
  "        laps['virtual_sc_this_lap'] = laps['virtual_sc_this_lap'].astype(bool)\n",


In [37]:
def simulate_race(
    circuit_id,
    grid_drivers,
    total_laps=50,
    year=2025,
    global_strategy=None,
    driver_strategies=None,
    safety_car_laps=None,
    rain_laps=None,
    pit_loss=20.0,
    rng=None,
):
    """Simulate a race and return (race_log_df, safety_car_lap_set)."""

    rng = rng or np.random.default_rng()

    if global_strategy is None:
        raise ValueError("global_strategy must be provided, e.g. [(20, 'MEDIUM'), (40, 'SOFT')]")
    if driver_strategies is None:
        driver_strategies = {}

    if safety_car_laps is None:
        auto_sc_laps = set()
        state, stint_len = 'green', 0
        for lap in range(1, total_laps + 1):
            if state == 'sc':
                auto_sc_laps.add(lap)
            progress = lap / total_laps
            state, stint_len = sc_gen_model.next_state(state, stint_len, circuit_id, year, progress, rng)
        safety_car_laps = auto_sc_laps
    else:
        safety_car_laps = set(safety_car_laps)

    if rain_laps is None:
        rain_laps = set()
    else:
        rain_laps = set(rain_laps)

    base_lap = circuit_median_map.get(circuit_id)
    if base_lap is None:
        raise ValueError(f"No circuit_median_lap available for circuit_id={circuit_id!r}")

    grid_pos_map = {drv: idx + 1 for idx, drv in enumerate(grid_drivers)}

    drivers_state = []
    for idx, drv in enumerate(grid_drivers):
        strat = driver_strategies.get(drv, global_strategy)
        stops_map = {int(lap): compound for lap, compound in strat}
        if 0 not in stops_map:
            raise ValueError(f"Strategy for {drv} must include lap 0 entry for starting tyre")
        starting_tyre = stops_map.pop(0)
        drivers_state.append(
            {
                "driver_id": drv,
                "grid_position": idx + 1,
                "position": idx + 1,
                "cumul_time": float(idx * 0.3),
                "laps_on_current_tyre": 1,
                "tyre_compound": starting_tyre,
                "gap_to_ahead": 0.0,
                "stops": stops_map,
                "history": [],
                "dnf": False,
            }
        )

    race_log = []

    weather = weather_gen_model.generate_race_weather(circuit_id, year, total_laps, rng)

    for lap in range(1, total_laps + 1):
        prev_positions = {s["driver_id"]: s["position"] for s in drivers_state}
        drivers_by_pos = sorted(
            [s for s in drivers_state if not s.get("dnf", False)],
            key=lambda s: s["position"],
        )

        for idx, s in enumerate(drivers_by_pos):
            if idx == 0:
                s["gap_to_ahead"] = 0.0
            else:
                ahead = drivers_by_pos[idx - 1]
                s["gap_to_ahead"] = s["cumul_time"] - ahead["cumul_time"]

        behind_gaps = []
        for idx, s in enumerate(drivers_by_pos):
            if idx < len(drivers_by_pos) - 1:
                behind_gaps.append(float(drivers_by_pos[idx + 1]["gap_to_ahead"]))
            else:
                behind_gaps.append(np.inf)

        rows = []
        laps_on_tyre_for_update = []
        for idx, s in enumerate(drivers_by_pos):
            tyre_age_feature = s["laps_on_current_tyre"]
            laps_on_current_tyre_next = tyre_age_feature + 1
            race_progress = lap / total_laps
            rain_flag = 1 if lap in rain_laps else 0
            gap_ahead = float(s["gap_to_ahead"])
            gap_behind = float(behind_gaps[idx]) if idx < len(behind_gaps) else np.inf
            pack_density = int((gap_ahead <= PACK_THRESHOLD) + (gap_behind <= PACK_THRESHOLD))

            rows.append(
                {
                    "circuit_id": circuit_id,
                    "laps_on_current_tyre": tyre_age_feature,
                    "tyre_compound": s["tyre_compound"],
                    "race_progress": race_progress,
                    "rainfall": float(weather.get("rainfall", [rain_flag] * total_laps)[lap - 1]) if "rainfall" in weather else rain_flag,
                    "gap_to_ahead_s": gap_ahead,
                    "pack_density": pack_density,
                    "track_temperature": float(weather.get("track_temperature", [0.0] * total_laps)[lap - 1]) if "track_temperature" in weather else 0.0,
                    "air_temperature": float(weather.get("air_temperature", [0.0] * total_laps)[lap - 1]) if "air_temperature" in weather else 0.0,
                    "humidity": float(weather.get("humidity", [0.0] * total_laps)[lap - 1]) if "humidity" in weather else 0.0,
                    "pressure": float(weather.get("pressure", [0.0] * total_laps)[lap - 1]) if "pressure" in weather else 0.0,
                    "wind_speed": float(weather.get("wind_speed", [0.0] * total_laps)[lap - 1]) if "wind_speed" in weather else 0.0,
                    "wind_direction": float(weather.get("wind_direction", [0.0] * total_laps)[lap - 1]) if "wind_direction" in weather else 0.0,
                    "year": year,
                    "driver_skill": driver_skill_map.get(s["driver_id"], 0.0),
                }
            )
            laps_on_tyre_for_update.append(laps_on_current_tyre_next)

        scenario_df = pd.DataFrame(rows)
        scenario_df["circuit_id"] = scenario_df["circuit_id"].astype(circuit_cat)
        scenario_df["tyre_compound"] = scenario_df["tyre_compound"].astype(compound_cat)

        pred_deltas = model.predict(scenario_df)
        pred_deltas = np.asarray(pred_deltas, dtype=float)
        lap_times = base_lap + pred_deltas

        safety_car_active = lap in safety_car_laps
        if safety_car_active:
            lap_times = np.asarray(lap_times, dtype=float)
            leader_time = lap_times[0] * 1.35
            sc_lap_times = [leader_time]
            for idx in range(1, len(drivers_by_pos)):
                candidate = float(lap_times[idx])
                start_gap = float(drivers_by_pos[idx]["gap_to_ahead"])
                gap_end = start_gap + (candidate - sc_lap_times[idx - 1])
                if gap_end < 0.0:
                    candidate = candidate + abs(gap_end) + 0.3
                sc_lap_times.append(candidate)
            lap_times = np.array(sc_lap_times)
            pred_deltas = lap_times - float(base_lap)
            overtake_attempts = np.zeros(len(drivers_by_pos), dtype=bool)
        else:
            lap_times, pred_deltas, overtake_attempts = ot_model.apply_overtakes_for_lap(
                circuit_id=circuit_id,
                drivers_by_pos=drivers_by_pos,
                lap_times=lap_times,
                pred_deltas=pred_deltas,
                base_lap=base_lap,
                year=year,
                close_gap_threshold=1.0,
                fail_gap=0.3,
                rng=rng,
            )

        drivers_by_pos, dnfs_this_lap = hazard_model.apply_dnfs_for_lap(
            circuit_id=circuit_id,
            drivers_by_pos=drivers_by_pos,
            lap_number=lap,
            year=year,
            rng=rng,
        )

        attempts_this_lap = {
            drivers_by_pos[i]["driver_id"]: bool(overtake_attempts[i])
            for i in range(len(drivers_by_pos))
        }
        dnfs_map_this_lap = {
            drivers_by_pos[i]["driver_id"]: bool(dnfs_this_lap[i])
            for i in range(len(drivers_by_pos))
        }

        for idx, s in enumerate(drivers_by_pos):
            lap_time = float(lap_times[idx])
            delta = float(pred_deltas[idx])
            laps_on_current_tyre_next = int(laps_on_tyre_for_update[idx])

            compound_this_lap = s["tyre_compound"]
            pit_compound = s["stops"].get(lap)
            pitted = False
            if pit_compound is not None:
                lap_time += pit_loss
                pitted = True

            dnf_now = dnfs_map_this_lap.get(s["driver_id"], False)
            s["dnf"] = bool(s.get("dnf", False) or dnf_now)

            if not s["dnf"]:
                s["laps_on_current_tyre"] = laps_on_current_tyre_next
                s["cumul_time"] += lap_time

            s["history"].append(
                {
                    "lap": lap,
                    "lap_time": lap_time if not dnf_now else None,
                    "delta": delta if not dnf_now else None,
                    "tyre_compound": compound_this_lap,
                    "pitted": pitted,
                    "overtake_attempt": attempts_this_lap.get(s["driver_id"], False),
                    "dnf": dnf_now,
                }
            )

            if s["dnf"]:
                continue
            if pit_compound is not None:
                s["tyre_compound"] = pit_compound
                s["laps_on_current_tyre"] = 1

        drivers_state = sorted(
            drivers_state,
            key=lambda s: (s.get("dnf", False), s["cumul_time"], s["grid_position"]),
        )
        for pos, s in enumerate(drivers_state, start=1):
            s["position"] = pos

        leader_time = drivers_state[0]["cumul_time"]
        for s in drivers_state:
            last_lap = s["history"][-1]
            gap_to_leader = s["cumul_time"] - leader_time
            pitted = last_lap["pitted"]
            attempted = last_lap["overtake_attempt"]
            dnf_now = last_lap.get("dnf", False)
            lap_time = last_lap["lap_time"]
            delta = last_lap["delta"]

            race_log.append(
                {
                    "lap": lap,
                    "position": s["position"],
                    "driver_id": s["driver_id"],
                    "lap_time": lap_time,
                    "delta": delta,
                    "tyre_compound": last_lap["tyre_compound"],
                    "pitted": pitted,
                    "gap_to_leader": gap_to_leader,
                    "cumul_time": s["cumul_time"],
                    "overtake_attempt": attempted,
                    "dnf": dnf_now or s.get("dnf", False),
                    "pos_change_lap": prev_positions[s["driver_id"]] - s["position"],
                    "pos_change_total": grid_pos_map[s["driver_id"]] - s["position"],
                    "safety_car": safety_car_active,
                }
            )

    return pd.DataFrame(race_log), safety_car_laps


# Single race monte carlo simulation

In [38]:
# Monte Carlo execution
num_runs = 20
race_length = 50
global_strategy = [(0, "SOFT"), (20, "MEDIUM"), (40, "SOFT")]
results = []
summary_rows = []
master_rng = np.random.default_rng(12345)
driver_strategies = {
     "VER": [
         (0, "SOFT"),
         (15, "SOFT"),
         (25, "HARD"),
         (26, "HARD"),
         (27, "HARD"),
         (28, "HARD"),
         (29, "HARD"),
         (30, "HARD"),
         (31, "HARD"),
         (32, "HARD"),
         (33, "HARD"),
         (34, "HARD"),
         (35, "HARD"),
         (45, "HARD"),
     ]
}
circuits = df["circuit_id"].dropna().unique().tolist()
years_by_circuit = df.groupby("circuit_id")["year"].unique().to_dict()
for run in range(num_runs):
    run_rng = np.random.default_rng(master_rng.integers(0, 1_000_000_000))
    circuit_id = run_rng.choice(circuits)
    year = int(run_rng.choice(years_by_circuit.get(circuit_id, [2025])))
    race_log, sc_laps = simulate_race(
        circuit_id=circuit_id,
        grid_drivers=grid_drivers,
        total_laps=race_length,
        year=year,
        global_strategy=global_strategy,
        driver_strategies=driver_strategies,
        safety_car_laps=None,
        rain_laps=None,
        pit_loss=20.0,
        rng=run_rng,
    )
    race_log["run"] = run
    race_log["circuit_id"] = circuit_id
    race_log["year"] = year
    results.append(race_log)
    last_lap = race_log["lap"].max()
    final_class = race_log[race_log["lap"] == last_lap].sort_values("position")
    for _, row in final_class.iterrows():
        summary_rows.append(
            {
                "run": run,
                "circuit_id": circuit_id,
                "year": year,
                "driver_id": row["driver_id"],
                "finish_pos": row["position"],
                "dnf": bool(row["dnf"]),
                "sc_laps": len(sc_laps),
            }
        )
all_logs = pd.concat(results, ignore_index=True)
summary_df = pd.DataFrame(summary_rows)
# Aggregate overview
overview = (
    summary_df.groupby("driver_id")
    .agg(
        runs=("run", "nunique"),
        wins=("finish_pos", lambda s: (s == 1).sum()),
        podiums=("finish_pos", lambda s: (s <= 3).sum()),
        avg_finish=("finish_pos", "mean"),
        dnfs=("dnf", "sum"),
    )
    .sort_values(["wins", "podiums"], ascending=[False, False])
)
print("Overview per driver:\n", overview)
print("\nSample final classification from last run:\n", summary_df[summary_df["run"] == (num_runs - 1)].sort_values("finish_pos"))


Overview per driver:
            runs  wins  podiums  avg_finish  dnfs
driver_id                                       
HAM          20     7       11        5.15     1
LEC          20     5       12        5.70     1
NOR          20     1        7        7.35     1
RUS          20     1        4        7.55     0
ANT          20     1        3        8.10     1
LAW          20     1        3       10.20     0
ALO          20     1        2       11.15     1
GAS          20     1        2        9.65     0
BOR          20     1        1       13.10     1
SAI          20     1        1        9.70     1
PIA          20     0        7        5.85     0
HAD          20     0        2       11.55     0
OCO          20     0        2       10.75     0
HUL          20     0        1       10.85     0
STR          20     0        1       12.35     0
TSU          20     0        1       11.35     2
ALB          20     0        0       13.10     1
BEA          20     0        0        9.95     

# Compare two strategies with monte carlo


In [43]:
# Strategy comparison (side-by-side Monte Carlo) with per-driver overrides
strategy_a_global = [(0, 'SOFT'), (15, 'MEDIUM'), (35, 'SOFT')]
strategy_b_global = [(0, 'SOFT'), (15, 'MEDIUM'), (35, 'SOFT')]

# Optional per-driver overrides; leave empty to fall back to global strategy

strategy_a_driver = {
     "VER": [
         (0, "SOFT"),
         (15, "SOFT"),
         (25, "HARD"),
         (26, "HARD"),
         (27, "HARD"),
         (28, "HARD"),
         (29, "HARD"),
         (30, "HARD"),
         (31, "HARD"),
         (32, "HARD"),
         (33, "HARD"),
         (34, "HARD"),
         (35, "HARD"),
         (45, "HARD"),
     ]
}
#strategy_a_driver = {}  
strategy_b_driver = {
    "VER": [(0, 'HARD'), (35, 'MEDIUM')],
}

num_runs_compare = 160
race_length = 50
summary_comp = []

for run in range(num_runs_compare):
    run_rng = np.random.default_rng(master_rng.integers(0, 1_000_000_000))
    circuit_id = run_rng.choice(circuits)
    year = int(run_rng.choice(years_by_circuit.get(circuit_id, [2025])))

    configs = [
        ('A', strategy_a_global, strategy_a_driver),
        ('B', strategy_b_global, strategy_b_driver),
    ]

    for label, glob_strat, driver_strats in configs:
        rng_run = np.random.default_rng(run_rng.integers(0, 1_000_000_000))
        race_log, sc_laps = simulate_race(
            circuit_id=circuit_id,
            grid_drivers=grid_drivers,
            total_laps=race_length,
            year=year,
            global_strategy=glob_strat,
            driver_strategies=driver_strats,
            safety_car_laps=None,
            rain_laps=None,
            pit_loss=20.0,
            rng=rng_run,
        )
        last_lap = race_log['lap'].max()
        final_class = race_log[race_log['lap'] == last_lap].sort_values('position')
        for _, row in final_class.iterrows():
            summary_comp.append({
                'run': run,
                'strategy': label,
                'circuit_id': circuit_id,
                'year': year,
                'driver_id': row['driver_id'],
                'finish_pos': row['position'],
                'dnf': bool(row['dnf']),
                'sc_laps': len(sc_laps),
            })

summary_comp_df = pd.DataFrame(summary_comp)

wins = summary_comp_df[summary_comp_df['finish_pos'] == 1].groupby('strategy')['driver_id'].count()
avg_finish = summary_comp_df.groupby(['driver_id', 'strategy'])['finish_pos'].mean().unstack()
avg_finish['delta_B_minus_A'] = avg_finish.get('B', np.nan) - avg_finish.get('A', np.nan)

print('Wins per strategy:', wins)
print('Average finish per driver (A vs B, lower is better): ', avg_finish.sort_values('delta_B_minus_A'))


Wins per strategy: strategy
A    160
B    160
Name: driver_id, dtype: int64
Average finish per driver (A vs B, lower is better):  strategy          A         B  delta_B_minus_A
driver_id                                     
VER        19.29375   2.11875        -17.17500
SAI         9.80000   9.90000          0.10000
HAD        13.38750  13.54375          0.15625
ALB        11.05000  11.28125          0.23125
STR        13.38750  13.78125          0.39375
BOR        13.23750  13.63750          0.40000
ALO        10.79375  11.28125          0.48750
RUS         9.25000   9.84375          0.59375
GAS        11.17500  11.81875          0.64375
OCO        12.16875  13.01250          0.84375
BEA         9.13125  10.03750          0.90625
TSU        10.42500  11.37500          0.95000
LAW        12.21875  13.22500          1.00625
NOR         6.30000   7.46875          1.16875
COL        15.57500  16.75625          1.18125
ANT         7.83750   9.03125          1.19375
HUL        11.08125  12.