In [20]:
from pathlib import Path
import pandas as pd
import numpy as np

# Load lap-level dataset (tries repo root and ./models/)
csv_candidates = [Path("fastf1_lap_dataset.csv"), Path("models/fastf1_lap_dataset.csv")]
csv_path = None
for cand in csv_candidates:
    if cand.exists():
        csv_path = cand
        break

if csv_path is None:
    raise FileNotFoundError("fastf1_lap_dataset.csv not found in working directory or ./models/")

df = pd.read_csv(csv_path)
df = df.drop(columns=["grid_position"], errors="ignore")
df = df[df["is_pit"] == False].copy()

# Basic imputations for overtaking features
df["current_position"] = df["current_position"].fillna(method="ffill")
df["gap_to_ahead_s"] = df["gap_to_ahead_s"].fillna(method="ffill")
df["lap_time_s"] = df["lap_time_s"].fillna(method="ffill").fillna(method="bfill")
df["tyre_compound"] = df["tyre_compound"].fillna(method="ffill")
df["laps_on_current_tyre"] = df["laps_on_current_tyre"].fillna(df["laps_on_current_tyre"].median())

print(f"Loaded {len(df):,} laps from {csv_path}")
df.head()


Loaded 186,420 laps from fastf1_lap_dataset.csv


  df["current_position"] = df["current_position"].fillna(method="ffill")
  df["gap_to_ahead_s"] = df["gap_to_ahead_s"].fillna(method="ffill")
  df["lap_time_s"] = df["lap_time_s"].fillna(method="ffill").fillna(method="bfill")
  df["tyre_compound"] = df["tyre_compound"].fillna(method="ffill")


Unnamed: 0,driver_id,team_id,circuit_id,total_race_laps,year,session_name,current_position,gap_to_leader_s,gap_to_ahead_s,lap_time_s,...,is_pit,session_key,race_name,team_name,virtual_sc_this_lap,humidity,pressure,rainfall,wind_speed,wind_direction
0,ALO,mclaren,yas_marina,55,2018,Race,14.0,26.062,2.134,160.62,...,False,2018_abu_dhabi_grand_prix_race,Abu Dhabi Grand Prix,McLaren,False,38.7,1012.8,False,0.7,272
1,ALO,mclaren,yas_marina,55,2018,Race,14.0,16.996,1.68,160.62,...,False,2018_abu_dhabi_grand_prix_race,Abu Dhabi Grand Prix,McLaren,False,38.2,1012.7,False,1.5,225
2,ALO,mclaren,yas_marina,55,2018,Race,14.0,16.34,0.813,160.62,...,False,2018_abu_dhabi_grand_prix_race,Abu Dhabi Grand Prix,McLaren,False,37.5,1012.7,False,1.1,268
3,ALO,mclaren,yas_marina,55,2018,Race,14.0,8.01,0.432,160.62,...,False,2018_abu_dhabi_grand_prix_race,Abu Dhabi Grand Prix,McLaren,False,37.7,1012.8,False,0.8,267
4,ALO,mclaren,yas_marina,55,2018,Race,14.0,12.847,0.613,108.819,...,False,2018_abu_dhabi_grand_prix_race,Abu Dhabi Grand Prix,McLaren,False,37.5,1012.8,False,1.3,0


# Overtaking model
Copied from `xgboost_laptime.ipynb` so you can tweak the overtake logic separately.
The notebook reuses the same upstream variables (e.g. `driver_skill_map`, `circuit_median_map`, `circuit_cat`, `compound_cat`, `model`) if you want to integrate with lap predictions; they are not redefined here.
Run the dataset import cell first to build `df`, then run the overtaking model cell and the final backtrace cell to sanity-check overtake rates against the raw data.


In [21]:
import random
import numpy as np
import pandas as pd


class OvertakingModel:
    """Encapsulated overtaking model with circuit and circuit-year ease.

    Attributes set after init:
      - overtake_stats, circuit_overtake_ease
      - overtake_stats_by_year, circuit_year_overtake_ease (if include_year)
    """

    def __init__(
        self,
        df,
        driver_skill_map=None,
        gap_threshold=1.0,
        min_opps_prior=200,
        include_year=True,
        min_opps_score=25,
        clip_bounds=(0.05, 0.95),
        blend_weight=0.5,
        rng=None,
    ):
        self.df = df.copy()
        self.driver_skill_map = driver_skill_map or {}
        self.gap_threshold = gap_threshold
        self.min_opps_prior = min_opps_prior
        self.include_year = include_year
        self.min_opps_score = min_opps_score
        self.clip_bounds = clip_bounds
        self.blend_weight = blend_weight
        self.rng = rng or np.random.default_rng()

        self.overtake_stats = None
        self.circuit_overtake_ease = None
        self.overtake_stats_by_year = None
        self.circuit_year_overtake_ease = None

        self._prepare_and_fit()

    def _prepare_and_fit(self):
        tmp = self.df.sort_values(["session_key", "lap_number", "current_position"])

        grp = tmp.groupby(["session_key", "driver_id"])
        tmp["prev_position"] = grp["current_position"].shift(1)
        tmp["prev_gap_to_ahead_s"] = grp["gap_to_ahead_s"].shift(1)

        valid = tmp["prev_position"].notna()
        self.tmp_valid = tmp[valid].copy()

        self.tmp_valid["positions_gained"] = (
            self.tmp_valid["prev_position"] - self.tmp_valid["current_position"]
        ).clip(lower=0)
        self.tmp_valid["overtake_event"] = (self.tmp_valid["positions_gained"] > 0).astype(int)

        (
            self.overtake_stats,
            self.circuit_overtake_ease,
            self.overtake_stats_by_year,
            self.circuit_year_overtake_ease,
        ) = self._build_circuit_overtake_model()

    def _aggregate(self, tmp_frame, group_cols):
        # Count overtakes and opportunities per group
        overtakes_by_group = tmp_frame.groupby(group_cols)["overtake_event"].sum()

        opp_mask = (
            (tmp_frame["prev_position"] > 1)
            & (tmp_frame["prev_gap_to_ahead_s"].notna())
            & (tmp_frame["prev_gap_to_ahead_s"] <= self.gap_threshold)
        )
        opportunities_by_group = (
            tmp_frame[opp_mask].groupby(group_cols)["driver_id"].count()
        )

        stats = pd.concat(
            [overtakes_by_group, opportunities_by_group],
            axis=1,
        ).fillna(0.0)
        stats.columns = ["overtakes", "opportunities"]

        stats["raw_rate"] = np.where(
            stats["opportunities"] > 0,
            stats["overtakes"] / stats["opportunities"],
            0.0,
        )

        global_rate = stats["raw_rate"].replace(0.0, np.nan).mean()
        if np.isnan(global_rate):
            global_rate = 0.05

        prior_w = float(self.min_opps_prior)
        stats["overtake_rate"] = (
            stats["raw_rate"] * stats["opportunities"] + global_rate * prior_w
        ) / (stats["opportunities"] + prior_w)

        stats["enough_data"] = stats["opportunities"] >= self.min_opps_score
        valid_rates = stats.loc[stats["enough_data"], "overtake_rate"]

        if valid_rates.empty:
            stats["overtake_ease"] = 0.5
        else:
            q_low = valid_rates.quantile(0.1)
            q_high = valid_rates.quantile(0.9)
            if q_high <= q_low:
                stats["overtake_ease"] = 0.5
            else:
                norm = (stats["overtake_rate"] - q_low) / (q_high - q_low)
                stats["overtake_ease"] = norm.clip(
                    self.clip_bounds[0], self.clip_bounds[1]
                )

        stats.loc[~stats["enough_data"], "overtake_ease"] = np.nan
        ease_map = stats["overtake_ease"].dropna().to_dict()
        return stats, ease_map

    def _build_circuit_overtake_model(self):
        stats_by_circuit, ease_by_circuit = self._aggregate(
            self.tmp_valid, ["circuit_id"]
        )

        stats_by_circuit_year = ease_by_circuit_year = None
        if self.include_year and "year" in self.tmp_valid.columns:
            stats_by_circuit_year, ease_by_circuit_year = self._aggregate(
                self.tmp_valid, ["circuit_id", "year"]
            )

        return (
            stats_by_circuit,
            ease_by_circuit,
            stats_by_circuit_year,
            ease_by_circuit_year,
        )

    def ease(self, circuit_id, year=None, default_base=0.3):
        base = self.circuit_overtake_ease.get(circuit_id, default_base)
        if year is not None and self.circuit_year_overtake_ease is not None:
            year_ease = self.circuit_year_overtake_ease.get((circuit_id, year))
            if year_ease is not None:
                return float(
                    (1 - self.blend_weight) * float(base)
                    + self.blend_weight * float(year_ease)
                )
        return float(base)

    def overtake_success_probability(
        self,
        attacker_state,
        defender_state,
        circuit_id,
        gap_start,
        year=None,
    ):
        """Compute overtake success probability for attacker vs defender."""
        ease = self.ease(circuit_id, year)

        skill_att = float(self.driver_skill_map.get(attacker_state["driver_id"], 0.0))
        skill_def = float(self.driver_skill_map.get(defender_state["driver_id"], 0.0))
        skill_diff = skill_att - skill_def

        tyre_adv_laps = defender_state["laps_on_current_tyre"] - attacker_state["laps_on_current_tyre"]

        skill_term = 0.15 * np.tanh(skill_diff / 0.5)
        tyre_term = 0.10 * np.tanh(tyre_adv_laps / 10.0)
        gap_term = -0.15 * np.tanh(max(gap_start, 0.0) / 0.7)

        p = 0.2 + 0.6 * ease + skill_term + tyre_term + gap_term
        p = float(np.clip(p, 0.01, 0.95))
        return p

    def apply_overtakes_for_lap(
        self,
        circuit_id,
        drivers_by_pos,
        lap_times,
        pred_deltas,
        base_lap,
        year=None,
        close_gap_threshold=1.0,
        fail_gap=0.3,
        rng=None,
    ):
        """Apply overtaking effects to one lap prediction for ordered drivers."""
        lap_times = np.asarray(lap_times, dtype=float).copy()
        pred_deltas = np.asarray(pred_deltas, dtype=float).copy()
        n = len(drivers_by_pos)

        ease = self.ease(circuit_id, year, default_base=1.0)
        overtake_attempts = np.zeros(n, dtype=bool)

        rng = rng or self.rng or np.random.default_rng()

        for idx in range(1, n):
            follower = drivers_by_pos[idx]
            leader = drivers_by_pos[idx - 1]

            gap_start = float(follower["gap_to_ahead"])  # gap at start of lap
            leader_time = lap_times[idx - 1]
            follower_time = lap_times[idx]
            gap_end_raw = gap_start + (follower_time - leader_time)

            going_to_pass_raw = gap_end_raw < 0.0
            close_enough = gap_start <= close_gap_threshold

            if not going_to_pass_raw and not close_enough:
                continue

            overtake_attempts[idx] = True

            margin = max(0.0, -gap_end_raw)
            # Use data-driven probability helper
            p_success = self.overtake_success_probability(
                attacker_state=follower,
                defender_state=leader,
                circuit_id=circuit_id,
                gap_start=gap_start,
                year=year,
            )
            # Small bump if margin is large
            p_success = float(min(0.99, p_success + 0.15 * min(margin / 0.5, 1.0)))

            success = (rng.random() < p_success) and going_to_pass_raw
            if success:
                continue

            desired_follower_time = leader_time + fail_gap - gap_start
            if desired_follower_time > follower_time:
                lap_times[idx] = desired_follower_time

        pred_deltas = lap_times - float(base_lap)
        return lap_times, pred_deltas, overtake_attempts


# Instantiate the model for notebook use, keeping backward-compatible variables
_driver_skill_map = locals().get("driver_skill_map", {})
overtaking_model = OvertakingModel(
    df=df,
    driver_skill_map=_driver_skill_map,
    include_year=True,
)

overtake_stats = overtaking_model.overtake_stats
circuit_overtake_ease = overtaking_model.circuit_overtake_ease
overtake_stats_by_year = overtaking_model.overtake_stats_by_year
circuit_year_overtake_ease = overtaking_model.circuit_year_overtake_ease


In [22]:
circuit_year_overtake_ease

{('austin', 2018): 0.95,
 ('austin', 2019): 0.95,
 ('austin', 2021): 0.95,
 ('austin', 2022): 0.6139418279243928,
 ('austin', 2023): 0.6191325716029239,
 ('austin', 2024): 0.24270912142060427,
 ('austin', 2025): 0.19792076962320004,
 ('baku', 2018): 0.8008473328650743,
 ('baku', 2019): 0.95,
 ('baku', 2021): 0.843414809603071,
 ('baku', 2022): 0.8169223476856121,
 ('baku', 2023): 0.05,
 ('baku', 2024): 0.4447971074279327,
 ('baku', 2025): 0.1862516652790352,
 ('barcelona', 2018): 0.7542805644609636,
 ('barcelona', 2019): 0.34859427697933065,
 ('barcelona', 2020): 0.8385656632447119,
 ('barcelona', 2021): 0.5733476956351508,
 ('barcelona', 2022): 0.95,
 ('barcelona', 2023): 0.95,
 ('barcelona', 2024): 0.95,
 ('barcelona', 2025): 0.892724055907857,
 ('budapest', 2018): 0.6600524094083049,
 ('budapest', 2019): 0.303638123407437,
 ('budapest', 2020): 0.5128477538031008,
 ('budapest', 2021): 0.3488948389095475,
 ('budapest', 2022): 0.95,
 ('budapest', 2023): 0.7908661134616948,
 ('budapest'

In [23]:
overtake_stats


Unnamed: 0_level_0,overtakes,opportunities,raw_rate,overtake_rate,enough_data,overtake_ease
circuit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
austin,926,1504,0.615691,0.606028,True,0.723963
baku,753,1427,0.52768,0.528379,True,0.48176
barcelona,1164,1466,0.793997,0.762708,True,0.95
budapest,966,1618,0.597033,0.590029,True,0.674058
hockenheim,315,391,0.805627,0.713489,True,0.95
imola,534,1341,0.39821,0.415751,True,0.130453
istanbul,167,255,0.654902,0.601477,True,0.709768
jeddah,362,1059,0.341832,0.372257,True,0.05
las_vegas,452,753,0.600266,0.586225,True,0.662192
le_castellet,410,737,0.556309,0.551411,True,0.553602


In [24]:
from IPython.display import display

# Backtrace overtaking ease against observed rates in the dataset
backtrace = overtake_stats.copy()
backtrace["observed_rate"] = np.where(
    backtrace["opportunities"] > 0,
    backtrace["overtakes"] / backtrace["opportunities"],
    np.nan,
)
backtrace["shrink_diff"] = backtrace["overtake_rate"] - backtrace["observed_rate"]

low_data_circuits = (~backtrace["overtake_ease"].notna()).sum()
print(f"Circuits without enough data (ease NaN): {int(low_data_circuits)}")

total_opps = backtrace["opportunities"].sum()
total_overtakes = backtrace["overtakes"].sum()
overall_observed_rate = (total_overtakes / total_opps) if total_opps > 0 else float("nan")

print(f"Total overtakes: {total_overtakes:.0f} | total opportunities: {total_opps:.0f}")
print(f"Overall observed overtake rate: {overall_observed_rate:.3f}")

print("Top circuits by observed overtake rate:")
display(
    backtrace
    .sort_values("observed_rate", ascending=False)
    .head(10)[["overtakes", "opportunities", "observed_rate", "overtake_rate", "overtake_ease", "shrink_diff"]]
)

print("Circuits with lowest observed overtake rate:")
display(
    backtrace
    .sort_values("observed_rate", ascending=True)
    .head(10)[["overtakes", "opportunities", "observed_rate", "overtake_rate", "overtake_ease", "shrink_diff"]]
)

# Circuit-year breakdowns if available
if overtake_stats_by_year is not None:
    backtrace_year = overtake_stats_by_year.copy()
    backtrace_year["observed_rate"] = np.where(
        backtrace_year["opportunities"] > 0,
        backtrace_year["overtakes"] / backtrace_year["opportunities"],
        np.nan,
    )
    backtrace_year["shrink_diff"] = backtrace_year["overtake_rate"] - backtrace_year["observed_rate"]

    low_data_cy = (~backtrace_year["overtake_ease"].notna()).sum()
    print(f"Circuit/year combos without enough data (ease NaN): {int(low_data_cy)}")

    print("Top circuit/year combos by observed overtake rate:")
    display(
        backtrace_year
        .sort_values("observed_rate", ascending=False)
        .head(10)[["overtakes", "opportunities", "observed_rate", "overtake_rate", "overtake_ease", "shrink_diff"]]
    )

    print("Circuit/year combos with lowest observed overtake rate:")
    display(
        backtrace_year
        .sort_values("observed_rate", ascending=True)
        .head(10)[["overtakes", "opportunities", "observed_rate", "overtake_rate", "overtake_ease", "shrink_diff"]]
    )

Circuits without enough data (ease NaN): 0
Total overtakes: 20453 | total opportunities: 41392
Overall observed overtake rate: 0.494
Top circuits by observed overtake rate:


Unnamed: 0_level_0,overtakes,opportunities,observed_rate,overtake_rate,overtake_ease,shrink_diff
circuit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
yas_marina,121,124,0.975806,0.702692,0.95,-0.273115
hockenheim,315,391,0.805627,0.713489,0.95,-0.092137
barcelona,1164,1466,0.793997,0.762708,0.95,-0.031289
nürburgring,164,211,0.777251,0.658569,0.887849,-0.118682
yas_island,912,1270,0.71811,0.692974,0.95,-0.025136
sakhir,1354,1900,0.712632,0.695558,0.95,-0.017073
sochi,452,662,0.682779,0.648111,0.855229,-0.034668
suzuka,629,958,0.656576,0.635295,0.815253,-0.021281
istanbul,167,255,0.654902,0.601477,0.709768,-0.053425
mugello,63,97,0.649485,0.571286,0.615597,-0.078198


Circuits with lowest observed overtake rate:


Unnamed: 0_level_0,overtakes,opportunities,observed_rate,overtake_rate,overtake_ease,shrink_diff
circuit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
monaco,226,1367,0.165326,0.212299,0.05,0.046973
monte_carlo,192,771,0.249027,0.307592,0.05,0.058565
jeddah,362,1059,0.341832,0.372257,0.05,0.030425
montréal,635,1842,0.344734,0.363209,0.05,0.018475
miami,365,1053,0.346629,0.376434,0.05,0.029806
melbourne,479,1333,0.35934,0.382043,0.05,0.022703
zandvoort,611,1607,0.380212,0.397162,0.072471,0.016951
miami_gardens,105,268,0.391791,0.452291,0.244428,0.0605
imola,534,1341,0.39821,0.415751,0.130453,0.017541
silverstone,852,2139,0.398317,0.409864,0.112091,0.011547


Circuit/year combos without enough data (ease NaN): 0
Top circuit/year combos by observed overtake rate:


Unnamed: 0_level_0,Unnamed: 1_level_0,overtakes,opportunities,observed_rate,overtake_rate,overtake_ease,shrink_diff
circuit_id,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
austin,2018,87,70,1.242857,0.737947,0.95,-0.504911
mexico_city,2019,136,121,1.123967,0.773351,0.95,-0.350616
barcelona,2023,214,191,1.120419,0.834388,0.95,-0.286031
austin,2021,114,107,1.065421,0.736956,0.95,-0.328464
suzuka,2023,125,119,1.05042,0.743716,0.95,-0.306704
sakhir,2021,140,134,1.044776,0.755226,0.95,-0.28955
sochi,2018,80,77,1.038961,0.694027,0.93585,-0.344934
barcelona,2022,144,141,1.021277,0.751453,0.95,-0.269823
shanghai,2019,105,104,1.009615,0.714624,0.95,-0.294992
baku,2019,125,124,1.008065,0.732239,0.95,-0.275825


Circuit/year combos with lowest observed overtake rate:


Unnamed: 0_level_0,Unnamed: 1_level_0,overtakes,opportunities,observed_rate,overtake_rate,overtake_ease,shrink_diff
circuit_id,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
monaco,2024,13,390,0.033333,0.212281,0.05,0.178947
spa_francorchamps,2025,53,383,0.138381,0.28344,0.05,0.145059
imola,2022,61,405,0.150617,0.286356,0.05,0.135739
monte_carlo,2019,66,381,0.173228,0.306791,0.05,0.133563
lusail,2025,78,445,0.175281,0.294954,0.05,0.119673
jeddah,2024,51,289,0.176471,0.333835,0.05,0.157365
monaco,2025,85,456,0.186404,0.300679,0.05,0.114276
montréal,2023,83,433,0.191686,0.308445,0.05,0.116759
são_paulo,2024,96,491,0.195519,0.301368,0.05,0.105849
miami,2024,109,512,0.212891,0.310738,0.05,0.097848


In [25]:
# Backtrace: compare simulated overtakes vs observed overtakes in the dataset

import pandas as pd

def simulate_overtakes_for_dataset(model: OvertakingModel):
    # Reuse tmp_valid prepared in the model (laps with prev info)
    data = model.tmp_valid.copy()
    data = data.sort_values(['session_key', 'lap_number', 'prev_position'])

    simulated_passes = 0
    observed_passes = int(data['overtake_event'].sum())
    total_opps = 0

    rng = np.random.default_rng(123)

    for _, row in data.iterrows():
        gap_start = float(row['prev_gap_to_ahead_s']) if pd.notna(row['prev_gap_to_ahead_s']) else 999.0
        if not (row['prev_position'] > 1 and gap_start <= model.gap_threshold):
            continue
        total_opps += 1
        attacker = {'driver_id': row['driver_id'], 'laps_on_current_tyre': row.get('laps_on_current_tyre', 1)}
        defender = {'driver_id': 'DEF', 'laps_on_current_tyre': row.get('laps_on_current_tyre', 1)}
        p = model.overtake_success_probability(attacker, defender, row['circuit_id'], gap_start, row.get('year'))
        if rng.random() < p:
            simulated_passes += 1

    return observed_passes, simulated_passes, total_opps

obs, sim, opps = simulate_overtakes_for_dataset(overtaking_model)
print(f"Observed overtakes: {obs}")
print(f"Simulated overtakes: {sim}")
print(f"Opportunities considered: {opps}")
if opps:
    print(f"Observed rate: {obs/opps:.3f}, Simulated rate: {sim/opps:.3f}")


Observed overtakes: 20453
Simulated overtakes: 14099
Opportunities considered: 41392
Observed rate: 0.494, Simulated rate: 0.341


In [None]:
# Backtrace: compare simulated overtakes vs observed overtakes in the dataset

import pandas as pd

def simulate_overtakes_for_dataset(model: OvertakingModel):
    data = model.tmp_valid.copy()
    data = data.sort_values(['session_key', 'lap_number', 'prev_position'])

    simulated_passes = 0
    observed_passes = int(data['overtake_event'].sum())
    total_opps = 0

    rng = np.random.default_rng(123)

    for _, row in data.iterrows():
        gap_start = float(row['prev_gap_to_ahead_s']) if pd.notna(row['prev_gap_to_ahead_s']) else 999.0
        if not (row['prev_position'] > 1 and gap_start <= model.gap_threshold):
            continue
        total_opps += 1
        attacker = {'driver_id': row['driver_id'], 'laps_on_current_tyre': row.get('laps_on_current_tyre', 1)}
        defender = {'driver_id': 'DEF', 'laps_on_current_tyre': row.get('laps_on_current_tyre', 1)}
        p = model.overtake_success_probability(attacker, defender, row['circuit_id'], gap_start, row.get('year'))
        if rng.random() < p:
            simulated_passes += 1

    return observed_passes, simulated_passes, total_opps

obs, sim, opps = simulate_overtakes_for_dataset(overtaking_model)
print(f"Observed overtakes: {obs}")
print(f"Simulated overtakes: {sim}")
print(f"Opportunities considered: {opps}")
if opps:
    print(f"Observed rate: {obs/opps:.3f}, Simulated rate: {sim/opps:.3f}")

# Circuit-level comparison
rows = []
for circuit_id, grp in overtaking_model.tmp_valid.groupby('circuit_id'):
    rng = np.random.default_rng(123)
    sim = 0
    opps_c = 0
    obs_c = int(grp['overtake_event'].sum())
    for _, row in grp.iterrows():
        gap_start = float(row['prev_gap_to_ahead_s']) if pd.notna(row['prev_gap_to_ahead_s']) else 999.0
        if not (row['prev_position'] > 1 and gap_start <= overtaking_model.gap_threshold):
            continue
        opps_c += 1
        attacker = {'driver_id': row['driver_id'], 'laps_on_current_tyre': row.get('laps_on_current_tyre', 1)}
        defender = {'driver_id': 'DEF', 'laps_on_current_tyre': row.get('laps_on_current_tyre', 1)}
        p = overtaking_model.overtake_success_probability(attacker, defender, row['circuit_id'], gap_start, row.get('year'))
        if rng.random() < p:
            sim += 1
    if opps_c > 0:
        rows.append({
            'circuit_id': circuit_id,
            'observed': obs_c,
            'simulated': sim,
            'opportunities': opps_c,
            'obs_rate': obs_c/opps_c,
            'sim_rate': sim/opps_c,
            'error_pct': (sim - obs_c)/opps_c * 100.0,
        })

circuit_compare = pd.DataFrame(rows)
if not circuit_compare.empty:
    circuit_compare = circuit_compare.sort_values('error_pct')
    print("\nCircuit-level overtake comparison (sim - obs):")
    print(circuit_compare)

