In [21]:
from pathlib import Path
import pandas as pd
import numpy as np

# Load lap-level dataset (tries repo root and ./models/)
csv_candidates = [Path("fastf1_lap_dataset.csv"), Path("models/fastf1_lap_dataset.csv")]
csv_path = None
for cand in csv_candidates:
    if cand.exists():
        csv_path = cand
        break

if csv_path is None:
    raise FileNotFoundError("fastf1_lap_dataset.csv not found in working directory or ./models/")

df = pd.read_csv(csv_path)
df = df.drop(columns=["grid_position"], errors="ignore")
df = df[df["is_pit"] == False].copy()

# Basic imputations for overtaking features
df["current_position"] = df["current_position"].fillna(method="ffill")
df["gap_to_ahead_s"] = df["gap_to_ahead_s"].fillna(method="ffill")
df["lap_time_s"] = df["lap_time_s"].fillna(method="ffill").fillna(method="bfill")
df["tyre_compound"] = df["tyre_compound"].fillna(method="ffill")
df["laps_on_current_tyre"] = df["laps_on_current_tyre"].fillna(df["laps_on_current_tyre"].median())

print(f"Loaded {len(df):,} laps from {csv_path}")
df.head()

# Overtake opportunity context for stats/backtrace
GAP_THRESHOLD = 1.0
MIN_OPPS_PRIOR = 200
MIN_OPPS_SCORE = 25
CLIP_BOUNDS = (0.05, 0.95)

_tmp = df.sort_values(["session_key", "lap_number", "current_position"]).copy()
_grp = _tmp.groupby(["session_key", "driver_id"])
_tmp["prev_position"] = _grp["current_position"].shift(1)
_tmp["prev_gap_to_ahead_s"] = _grp["gap_to_ahead_s"].shift(1)

_valid = _tmp["prev_position"].notna()
overtake_context_df = _tmp[_valid].copy()

overtake_context_df["positions_gained"] = (
    overtake_context_df["prev_position"] - overtake_context_df["current_position"]
).clip(lower=0)
overtake_context_df["overtake_event"] = (overtake_context_df["positions_gained"] > 0).astype(int)

# Map defender state from the previous lap (start-of-lap conditions)
overtake_context_df["prev_lap_number"] = overtake_context_df["lap_number"].astype(int) - 1
overtake_context_df["prev_position_int"] = overtake_context_df["prev_position"].round().astype("Int64")
overtake_context_df["defender_position"] = (overtake_context_df["prev_position_int"] - 1).astype("Int64")

_prev_state = _tmp[[
    "session_key",
    "lap_number",
    "current_position",
    "driver_id",
    "laps_on_current_tyre",
]].copy()
_prev_state = _prev_state.rename(
    columns={
        "lap_number": "prev_lap_number",
        "current_position": "defender_position",
        "driver_id": "defender_id",
        "laps_on_current_tyre": "defender_laps_on_current_tyre",
    }
)
_prev_state["defender_position"] = _prev_state["defender_position"].round().astype("Int64")

overtake_context_df = overtake_context_df.merge(
    _prev_state,
    on=["session_key", "prev_lap_number", "defender_position"],
    how="left",
)

def build_overtake_stats(tmp_frame, group_cols):
    overtakes_by_group = tmp_frame.groupby(group_cols)["overtake_event"].sum()

    opp_mask = (
        (tmp_frame["prev_position"] > 1)
        & (tmp_frame["prev_gap_to_ahead_s"].notna())
        & (tmp_frame["prev_gap_to_ahead_s"] <= GAP_THRESHOLD)
    )
    opportunities_by_group = (
        tmp_frame[opp_mask].groupby(group_cols)["driver_id"].count()
    )

    stats = pd.concat([overtakes_by_group, opportunities_by_group], axis=1).fillna(0.0)
    stats.columns = ["overtakes", "opportunities"]

    stats["raw_rate"] = np.where(
        stats["opportunities"] > 0,
        stats["overtakes"] / stats["opportunities"],
        0.0,
    )

    global_rate = stats["raw_rate"].replace(0.0, np.nan).mean()
    if np.isnan(global_rate):
        global_rate = 0.05

    stats["overtake_rate"] = (
        stats["raw_rate"] * stats["opportunities"] + global_rate * MIN_OPPS_PRIOR
    ) / (stats["opportunities"] + MIN_OPPS_PRIOR)

    stats["enough_data"] = stats["opportunities"] >= MIN_OPPS_SCORE
    valid_rates = stats.loc[stats["enough_data"], "overtake_rate"]

    if valid_rates.empty:
        stats["overtake_ease"] = 0.5
    else:
        q_low = valid_rates.quantile(0.1)
        q_high = valid_rates.quantile(0.9)
        if q_high <= q_low:
            stats["overtake_ease"] = 0.5
        else:
            norm = (stats["overtake_rate"] - q_low) / (q_high - q_low)
            stats["overtake_ease"] = norm.clip(CLIP_BOUNDS[0], CLIP_BOUNDS[1])

    stats.loc[~stats["enough_data"], "overtake_ease"] = np.nan
    ease_map = stats["overtake_ease"].dropna().to_dict()
    return stats, ease_map


overtake_stats, _ = build_overtake_stats(overtake_context_df, ["circuit_id"])

overtake_stats_by_year = None
if "year" in overtake_context_df.columns:
    overtake_stats_by_year, _ = build_overtake_stats(
        overtake_context_df, ["circuit_id", "year"]
    )




Loaded 186,420 laps from fastf1_lap_dataset.csv


  df["current_position"] = df["current_position"].fillna(method="ffill")
  df["gap_to_ahead_s"] = df["gap_to_ahead_s"].fillna(method="ffill")
  df["lap_time_s"] = df["lap_time_s"].fillna(method="ffill").fillna(method="bfill")
  df["tyre_compound"] = df["tyre_compound"].fillna(method="ffill")


# Overtaking model
Copied from `xgboost_laptime.ipynb` so you can tweak the overtake logic separately.
The notebook reuses the same upstream variables (e.g. `driver_skill_map`, `circuit_median_map`, `circuit_cat`, `compound_cat`, `model`) if you want to integrate with lap predictions; they are not redefined here.
Run the dataset import cell first to build `df`, then run the overtaking model cell and the final backtrace cell to sanity-check overtake rates against the raw data.


In [24]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GroupShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder


class OvertakingModel:
    """Encapsulated overtaking model with a learned probability pipeline."""

    def __init__(
        self,
        df=None,
        driver_skill_map=None,
        gap_threshold=1.0,
        include_year=True,
        auto_fit=True,
        rng=None,
    ):
        self.df = df.copy() if df is not None else None
        self.driver_skill_map = driver_skill_map or {}
        self.gap_threshold = gap_threshold
        self.include_year = include_year
        self.rng = rng or np.random.default_rng()

        self.pipeline = None
        self.base_rate = 0.05
        self.feature_columns = []

        if auto_fit and self.df is not None:
            self.fit(self.df)

    def fit(self, df=None):
        if df is not None:
            self.df = df.copy()
        if self.df is None:
            raise ValueError("No dataframe provided for training.")

        tmp = self.df.sort_values(["session_key", "lap_number", "current_position"]).copy()

        grp = tmp.groupby(["session_key", "driver_id"])
        tmp["prev_position"] = grp["current_position"].shift(1)
        tmp["prev_gap_to_ahead_s"] = grp["gap_to_ahead_s"].shift(1)

        valid = tmp["prev_position"].notna()
        tmp_valid = tmp[valid].copy()

        tmp_valid["positions_gained"] = (
            tmp_valid["prev_position"] - tmp_valid["current_position"]
        ).clip(lower=0)
        tmp_valid["overtake_event"] = (tmp_valid["positions_gained"] > 0).astype(int)

        # Map defender state from the previous lap (start-of-lap conditions)
        tmp_valid["prev_lap_number"] = tmp_valid["lap_number"].astype(int) - 1
        tmp_valid["prev_position_int"] = tmp_valid["prev_position"].round().astype("Int64")
        tmp_valid["defender_position"] = (tmp_valid["prev_position_int"] - 1).astype("Int64")

        prev_state = tmp[[
            "session_key",
            "lap_number",
            "current_position",
            "driver_id",
            "laps_on_current_tyre",
        ]].copy()
        prev_state = prev_state.rename(
            columns={
                "lap_number": "prev_lap_number",
                "current_position": "defender_position",
                "driver_id": "defender_id",
                "laps_on_current_tyre": "defender_laps_on_current_tyre",
            }
        )
        prev_state["defender_position"] = prev_state["defender_position"].round().astype("Int64")

        tmp_valid = tmp_valid.merge(
            prev_state,
            on=["session_key", "prev_lap_number", "defender_position"],
            how="left",
        )

        opp_mask = (
            (tmp_valid["prev_position"] > 1)
            & (tmp_valid["prev_gap_to_ahead_s"].notna())
            & (tmp_valid["prev_gap_to_ahead_s"] <= self.gap_threshold)
        )
        train_df = tmp_valid[opp_mask].copy()
        if train_df.empty:
            self.pipeline = None
            self.base_rate = 0.05
            return self

        train_df["gap_start"] = train_df["prev_gap_to_ahead_s"].astype(float).clip(lower=0.0)
        train_df["defender_laps_on_current_tyre"] = train_df["defender_laps_on_current_tyre"].fillna(
            train_df["laps_on_current_tyre"]
        )
        train_df["tyre_age_diff"] = (
            train_df["defender_laps_on_current_tyre"] - train_df["laps_on_current_tyre"]
        ).astype(float)

        train_df["skill_att"] = train_df["driver_id"].map(self.driver_skill_map).fillna(0.0)
        train_df["skill_def"] = train_df["defender_id"].map(self.driver_skill_map).fillna(0.0)
        train_df["skill_diff"] = (train_df["skill_att"] - train_df["skill_def"]).astype(float)

        X = self._build_feature_frame(train_df)
        y = train_df["overtake_event"].astype(int).to_numpy()
        self.base_rate = float(np.mean(y)) if len(y) else 0.05

        if len(y) < 20 or np.unique(y).size < 2:
            self.pipeline = None
            return self

        groups = train_df["session_key"] if "session_key" in train_df.columns else None
        if groups is not None and groups.nunique() > 1:
            splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
            train_idx, test_idx = next(splitter.split(X, y, groups=groups))
        else:
            idx = np.arange(len(X))
            self.rng.shuffle(idx)
            split = max(1, int(len(idx) * 0.8))
            train_idx, test_idx = idx[:split], idx[split:]

        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        self.pipeline = self._build_pipeline()
        self.pipeline.fit(X_train, y_train)
        self.feature_columns = list(X.columns)

        train_probs = self.pipeline.predict_proba(X_train)[:, 1]
        test_probs = self.pipeline.predict_proba(X_test)[:, 1]

        train_log = log_loss(y_train, train_probs, labels=[0, 1])
        test_log = log_loss(y_test, test_probs, labels=[0, 1])
        train_rmse = np.sqrt(mean_squared_error(y_train, train_probs))
        test_rmse = np.sqrt(mean_squared_error(y_test, test_probs))

        self.train_metrics_ = {"log_loss": float(train_log), "brier_rmse": float(train_rmse)}
        self.test_metrics_ = {"log_loss": float(test_log), "brier_rmse": float(test_rmse)}

        print(f"Overtake train logloss: {train_log:.4f} | train brier RMSE: {train_rmse:.4f}")
        print(f"Overtake test  logloss: {test_log:.4f} | test  brier RMSE: {test_rmse:.4f}")
        return self

    def _build_feature_frame(self, frame: pd.DataFrame) -> pd.DataFrame:
        data = {
            "circuit_id": frame["circuit_id"].astype(str).fillna("unknown"),
            "gap_start": frame["gap_start"].astype(float).fillna(self.gap_threshold),
            "tyre_age_diff": frame["tyre_age_diff"].astype(float).fillna(0.0),
            "skill_diff": frame["skill_diff"].astype(float).fillna(0.0),
        }
        if self.include_year:
            data["year"] = frame["year"].apply(
                lambda v: str(int(v)) if pd.notna(v) else "unknown"
            )
        return pd.DataFrame(data)

    def _build_pipeline(self) -> Pipeline:
        categorical = ["circuit_id"]
        if self.include_year:
            categorical.append("year")
        numeric = ["gap_start", "tyre_age_diff", "skill_diff"]

        preprocess = ColumnTransformer(
            [
                ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
                ("num", "passthrough", numeric),
            ]
        )

        model = LogisticRegression(max_iter=4000, class_weight="balanced")
        return Pipeline([("preprocess", preprocess), ("model", model)])

    def overtake_success_probability(
        self,
        attacker_state,
        defender_state,
        circuit_id,
        gap_start,
        year=None,
    ):
        """Compute overtake success probability for attacker vs defender."""
        if self.pipeline is None:
            return float(np.clip(self.base_rate, 0.01, 0.95))

        def _safe_num(value, default=0.0):
            if value is None or pd.isna(value):
                return default
            return float(value)

        skill_att = float(self.driver_skill_map.get(attacker_state["driver_id"], 0.0))
        skill_def = float(self.driver_skill_map.get(defender_state["driver_id"], 0.0))
        skill_diff = skill_att - skill_def

        att_laps = _safe_num(attacker_state.get("laps_on_current_tyre", 0.0), 0.0)
        def_laps = _safe_num(defender_state.get("laps_on_current_tyre", att_laps), att_laps)
        tyre_adv_laps = def_laps - att_laps

        gap_value = _safe_num(gap_start, self.gap_threshold)

        feature_row = {
            "circuit_id": str(circuit_id) if circuit_id is not None else "unknown",
            "gap_start": float(max(gap_value, 0.0)),
            "tyre_age_diff": float(tyre_adv_laps),
            "skill_diff": float(skill_diff),
        }
        if self.include_year:
            if year is None or pd.isna(year):
                year_value = "unknown"
            else:
                try:
                    year_value = str(int(year))
                except (TypeError, ValueError):
                    year_value = str(year)
            feature_row["year"] = year_value

        X = pd.DataFrame([feature_row])
        prob = float(self.pipeline.predict_proba(X)[0, 1])
        return float(np.clip(prob, 0.01, 0.95))



In [25]:
# Training
_driver_skill_map = locals().get("driver_skill_map", {})
overtaking_model = OvertakingModel(
    df=df,
    driver_skill_map=_driver_skill_map,
    gap_threshold=GAP_THRESHOLD,
    include_year=True,
    auto_fit=False,
)
overtaking_model.fit(df)


Overtake train logloss: 0.5988 | train brier RMSE: 0.4535
Overtake test  logloss: 0.5946 | test  brier RMSE: 0.4513


<__main__.OvertakingModel at 0x126540830>

In [14]:
# Saving
import joblib
from pathlib import Path

model_path = Path("models/overtaking_model.joblib")
model_path.parent.mkdir(parents=True, exist_ok=True)
bundle = {
    "pipeline": overtaking_model.pipeline,
    "include_year": overtaking_model.include_year,
    "gap_threshold": overtaking_model.gap_threshold,
    "base_rate": overtaking_model.base_rate,
    "feature_columns": overtaking_model.feature_columns,
}
joblib.dump(bundle, model_path)



['models/overtaking_model.joblib']

In [15]:
from IPython.display import display

# Backtrace overtaking ease against observed rates in the dataset
backtrace = overtake_stats.copy()
backtrace["observed_rate"] = np.where(
    backtrace["opportunities"] > 0,
    backtrace["overtakes"] / backtrace["opportunities"],
    np.nan,
)
backtrace["shrink_diff"] = backtrace["overtake_rate"] - backtrace["observed_rate"]

low_data_circuits = (~backtrace["overtake_ease"].notna()).sum()
print(f"Circuits without enough data (ease NaN): {int(low_data_circuits)}")

total_opps = backtrace["opportunities"].sum()
total_overtakes = backtrace["overtakes"].sum()
overall_observed_rate = (total_overtakes / total_opps) if total_opps > 0 else float("nan")

print(f"Total overtakes: {total_overtakes:.0f} | total opportunities: {total_opps:.0f}")
print(f"Overall observed overtake rate: {overall_observed_rate:.3f}")

print("Top circuits by observed overtake rate:")
display(
    backtrace
    .sort_values("observed_rate", ascending=False)
    .head(10)[["overtakes", "opportunities", "observed_rate", "overtake_rate", "overtake_ease", "shrink_diff"]]
)

print("Circuits with lowest observed overtake rate:")
display(
    backtrace
    .sort_values("observed_rate", ascending=True)
    .head(10)[["overtakes", "opportunities", "observed_rate", "overtake_rate", "overtake_ease", "shrink_diff"]]
)

# Circuit-year breakdowns if available
if overtake_stats_by_year is not None:
    backtrace_year = overtake_stats_by_year.copy()
    backtrace_year["observed_rate"] = np.where(
        backtrace_year["opportunities"] > 0,
        backtrace_year["overtakes"] / backtrace_year["opportunities"],
        np.nan,
    )
    backtrace_year["shrink_diff"] = backtrace_year["overtake_rate"] - backtrace_year["observed_rate"]

    low_data_cy = (~backtrace_year["overtake_ease"].notna()).sum()
    print(f"Circuit/year combos without enough data (ease NaN): {int(low_data_cy)}")

    print("Top circuit/year combos by observed overtake rate:")
    display(
        backtrace_year
        .sort_values("observed_rate", ascending=False)
        .head(10)[["overtakes", "opportunities", "observed_rate", "overtake_rate", "overtake_ease", "shrink_diff"]]
    )

    print("Circuit/year combos with lowest observed overtake rate:")
    display(
        backtrace_year
        .sort_values("observed_rate", ascending=True)
        .head(10)[["overtakes", "opportunities", "observed_rate", "overtake_rate", "overtake_ease", "shrink_diff"]]
    )

# Backtrace: compare simulated overtakes vs observed overtakes in the dataset
def simulate_overtakes_for_dataset(model: OvertakingModel):
    data = overtake_context_df.copy()
    data = data.sort_values(["session_key", "lap_number", "prev_position"])

    simulated_passes = 0
    observed_passes = int(data["overtake_event"].sum())
    total_opps = 0

    rng = np.random.default_rng(123)

    for _, row in data.iterrows():
        gap_start = float(row["prev_gap_to_ahead_s"]) if pd.notna(row["prev_gap_to_ahead_s"]) else 999.0
        if not (row["prev_position"] > 1 and gap_start <= model.gap_threshold):
            continue
        total_opps += 1
        attacker = {"driver_id": row["driver_id"], "laps_on_current_tyre": row.get("laps_on_current_tyre", 1)}
        defender = {"driver_id": row.get("defender_id", "DEF"), "laps_on_current_tyre": row.get("defender_laps_on_current_tyre", row.get("laps_on_current_tyre", 1))}
        p = model.overtake_success_probability(attacker, defender, row["circuit_id"], gap_start, row.get("year"))
        if rng.random() < p:
            simulated_passes += 1

    return observed_passes, simulated_passes, total_opps

obs, sim, opps = simulate_overtakes_for_dataset(overtaking_model)
print(f"Observed overtakes: {obs}")
print(f"Simulated overtakes: {sim}")
print(f"Opportunities considered: {opps}")
if opps:
    print(f"Observed rate: {obs/opps:.3f}, Simulated rate: {sim/opps:.3f}")

# Circuit-level comparison
rows = []
for circuit_id, grp in overtake_context_df.groupby("circuit_id"):
    rng = np.random.default_rng(123)
    sim = 0
    opps_c = 0
    obs_c = int(grp["overtake_event"].sum())
    for _, row in grp.iterrows():
        gap_start = float(row["prev_gap_to_ahead_s"]) if pd.notna(row["prev_gap_to_ahead_s"]) else 999.0
        if not (row["prev_position"] > 1 and gap_start <= overtaking_model.gap_threshold):
            continue
        opps_c += 1
        attacker = {"driver_id": row["driver_id"], "laps_on_current_tyre": row.get("laps_on_current_tyre", 1)}
        defender = {"driver_id": row.get("defender_id", "DEF"), "laps_on_current_tyre": row.get("defender_laps_on_current_tyre", row.get("laps_on_current_tyre", 1))}
        p = overtaking_model.overtake_success_probability(attacker, defender, row["circuit_id"], gap_start, row.get("year"))
        if rng.random() < p:
            sim += 1
    if opps_c > 0:
        rows.append({
            "circuit_id": circuit_id,
            "observed": obs_c,
            "simulated": sim,
            "opportunities": opps_c,
            "obs_rate": obs_c/opps_c,
            "sim_rate": sim/opps_c,
            "error_pct": (sim - obs_c)/opps_c * 100.0,
        })

circuit_compare = pd.DataFrame(rows)
if not circuit_compare.empty:
    circuit_compare = circuit_compare.sort_values("error_pct")
    print("\nCircuit-level overtake comparison (sim - obs):")
    print(circuit_compare)



Circuits without enough data (ease NaN): 0
Total overtakes: 20465 | total opportunities: 41431
Overall observed overtake rate: 0.494
Top circuits by observed overtake rate:


Unnamed: 0_level_0,overtakes,opportunities,observed_rate,overtake_rate,overtake_ease,shrink_diff
circuit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
yas_marina,121,124,0.975806,0.702645,0.95,-0.273161
hockenheim,315,392,0.803571,0.712259,0.95,-0.091313
barcelona,1165,1467,0.794138,0.762842,0.95,-0.031296
nürburgring,165,211,0.781991,0.660966,0.896864,-0.121024
yas_island,912,1270,0.71811,0.692964,0.95,-0.025146
sakhir,1354,1903,0.711508,0.694559,0.95,-0.016949
sochi,452,662,0.682779,0.648094,0.85658,-0.034685
suzuka,629,958,0.656576,0.635282,0.816484,-0.021294
istanbul,167,255,0.654902,0.601444,0.710583,-0.053458
mugello,63,97,0.649485,0.571236,0.616043,-0.078249


Circuits with lowest observed overtake rate:


Unnamed: 0_level_0,overtakes,opportunities,observed_rate,overtake_rate,overtake_ease,shrink_diff
circuit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
monaco,226,1367,0.165326,0.212289,0.05,0.046964
monte_carlo,192,772,0.248705,0.30726,0.05,0.058556
jeddah,363,1059,0.342776,0.37304,0.05,0.030264
montréal,635,1844,0.34436,0.362846,0.05,0.018486
miami,365,1053,0.346629,0.376422,0.05,0.029794
melbourne,480,1336,0.359281,0.381938,0.05,0.022657
zandvoort,611,1607,0.380212,0.397154,0.071234,0.016942
miami_gardens,105,269,0.390335,0.451294,0.240672,0.06096
silverstone,852,2141,0.397945,0.409507,0.109896,0.011563
imola,534,1341,0.39821,0.415741,0.129404,0.017531


Circuit/year combos without enough data (ease NaN): 0
Top circuit/year combos by observed overtake rate:


Unnamed: 0_level_0,Unnamed: 1_level_0,overtakes,opportunities,observed_rate,overtake_rate,overtake_ease,shrink_diff
circuit_id,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
austin,2018,87,70,1.242857,0.737782,0.95,-0.505075
mexico_city,2019,136,121,1.123967,0.773212,0.95,-0.350754
barcelona,2023,214,191,1.120419,0.834274,0.95,-0.286145
austin,2021,114,107,1.065421,0.736812,0.95,-0.328609
suzuka,2023,125,119,1.05042,0.743577,0.95,-0.306843
sakhir,2021,140,134,1.044776,0.755093,0.95,-0.289683
sochi,2018,80,77,1.038961,0.693867,0.936036,-0.345094
barcelona,2022,144,141,1.021277,0.751323,0.95,-0.269953
shanghai,2019,105,104,1.009615,0.714478,0.95,-0.295138
baku,2019,125,124,1.008065,0.732102,0.95,-0.275962


Circuit/year combos with lowest observed overtake rate:


Unnamed: 0_level_0,Unnamed: 1_level_0,overtakes,opportunities,observed_rate,overtake_rate,overtake_ease,shrink_diff
circuit_id,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
monaco,2024,13,390,0.033333,0.212205,0.05,0.178872
spa_francorchamps,2025,53,383,0.138381,0.283364,0.05,0.144983
imola,2022,61,405,0.150617,0.286283,0.05,0.135666
monte_carlo,2019,66,381,0.173228,0.306715,0.05,0.133486
lusail,2025,78,445,0.175281,0.294886,0.05,0.119605
jeddah,2024,51,289,0.176471,0.333745,0.05,0.157274
monaco,2025,85,456,0.186404,0.300612,0.05,0.114208
montréal,2023,83,433,0.191686,0.308375,0.05,0.116689
são_paulo,2024,96,491,0.195519,0.301304,0.05,0.105785
miami,2024,109,512,0.212891,0.310676,0.05,0.097785


Observed overtakes: 20465
Simulated overtakes: 18888
Opportunities considered: 41431
Observed rate: 0.494, Simulated rate: 0.456

Circuit-level overtake comparison (sim - obs):
           circuit_id  observed  simulated  opportunities  obs_rate  sim_rate  \
33         yas_marina       121         75            124  0.975806  0.604839   
2           barcelona      1165        787           1467  0.794138  0.536469   
21        nürburgring       165        119            211  0.781991  0.563981   
27              sochi       452        339            662  0.682779  0.512085   
32         yas_island       912        705           1270  0.718110  0.555118   
23             sakhir      1354       1064           1903  0.711508  0.559117   
4          hockenheim       315        259            392  0.803571  0.660714   
30             suzuka       629        502            958  0.656576  0.524008   
6            istanbul       167        136            255  0.654902  0.533333   
3            