In [1]:
import sys

sys.path.append("../src")
import numpy as np
import pandas as pd

from ranking_sim import (
    process_raw_data,
    softmax_weights,
    simulate_round_based,
    brier_score,
)

In [2]:
# Seed NumPy's legacy global RNG once, up front.
# NOTE(review): presumably the ranking_sim simulations draw from this global
# generator, which is what would make the results below repeatable - confirm.
SEED = 20250618
np.random.seed(SEED)

In [3]:
# Enable IPython's autoreload extension so that edits to imported project
# modules (e.g. ranking_sim) are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
# Folder the raw input file is read from; a relative path, so it resolves
# against the directory the notebook is run from.
INPUT_FOLDER: str = "../data/raw"

In [5]:
# Build the path to the leaderboard file and hand it to the project loader.
# NOTE(review): the .pkl extension suggests a pickle file - make sure it comes
# from a trusted source, since unpickling untrusted data can execute code.
leaderboard_path = f"{INPUT_FOLDER}/llm_and_human_leaderboard_overall.pkl"
df = process_raw_data(leaderboard_path)

# Preview the first rows of the processed frame.
df.head()

Unnamed: 0,id,source,direction,forecast_due_date,market_value_on_due_date,resolution_date,resolved_to,resolved,forecast,imputed,score,model,organization,question_type,horizon,question_id
0,1285,infer,(),2024-07-21,0.0163,2025-01-01,0.0,True,0.5,False,0.25,Claude-2.1 (scratchpad),Anthropic,market,,infer-1285-nan-2024-07-21
1,1286,infer,(),2024-07-21,0.0292,2025-01-01,0.0,True,0.0292,True,0.000853,Claude-2.1 (scratchpad),Anthropic,market,,infer-1286-nan-2024-07-21
2,1342,infer,(),2024-07-21,0.0088,2025-01-01,0.0,True,0.0088,True,7.7e-05,Claude-2.1 (scratchpad),Anthropic,market,,infer-1342-nan-2024-07-21
3,1353,infer,(),2024-07-21,0.4943,2024-12-01,0.0,True,0.4943,True,0.244332,Claude-2.1 (scratchpad),Anthropic,market,,infer-1353-nan-2024-07-21
5,1373,infer,(),2024-07-21,0.7474,2025-01-01,0.0,True,0.7474,True,0.558607,Claude-2.1 (scratchpad),Anthropic,market,,infer-1373-nan-2024-07-21


In [6]:
# Number of distinct entries in the "model" column (a missing value, if any,
# is counted as one entry, matching len(unique())).
df["model"].nunique(dropna=False)

141

In [7]:
# Number of distinct entries in the "question_id" column (a missing value, if
# any, is counted as one entry, matching len(unique())).
df["question_id"].nunique(dropna=False)

473

In [8]:
# MODEL DRIFT scenario: calibrate the change in Brier scores.
#
# Each of the 1000 repetitions simulates two rounds in which only the skill
# temperature varies (-15.0 when round_id == 0, 15.0 otherwise; difficulty
# temperature is left unset), scores the simulated forecasts, and records the
# mean score of the first and last round together with their difference.
sim_records = []
for mm in range(1000):
    df_sim = simulate_round_based(
        df=df,
        n_rounds=2,
        questions_per_round=25,
        models_per_round_mean=40,
        ref_model="Naive Forecaster",
        skill_temperature=lambda round_id: -15.0 if round_id == 0 else 15.0,
        difficulty_temperature=None,
        model_persistence=0.0,
        fixed_models_per_round=False,
    )
    df_sim["score"] = brier_score(df_sim)

    # Mean score per round; groupby orders the rounds by round_id, so the
    # first / last entries are the lowest / highest round ids.
    round_means = df_sim.groupby("round_id")["score"].mean()
    first_round_brier = round_means.iloc[0]
    last_round_brier = round_means.iloc[-1]

    sim_records.append(
        dict(
            simulation_id=mm,
            brier_low_temp=first_round_brier,
            brier_high_temp=last_round_brier,
            brier_delta=last_round_brier - first_round_brier,
        )
    )

# One row per repetition; report the average shift, then preview the table.
res = pd.DataFrame(sim_records)
print(res["brier_delta"].mean())
res.head()

-0.05270874257884196


Unnamed: 0,simulation_id,brier_low_temp,brier_high_temp,brier_delta
0,0,0.271249,0.155436,-0.115813
1,1,0.259052,0.195724,-0.063328
2,2,0.248681,0.179661,-0.06902
3,3,0.251821,0.206085,-0.045736
4,4,0.260123,0.226186,-0.033937


In [9]:
# QUESTION DRIFT scenario: calibrate the change in Brier scores.
#
# Each of the 1000 repetitions simulates two rounds in which only the
# difficulty temperature varies (-10.0 when round_id == 0, 0.0 otherwise; skill
# temperature is left unset), scores the simulated forecasts, and records the
# mean score of the first and last round together with their difference.
sim_records = []
for mm in range(1000):
    df_sim = simulate_round_based(
        df=df,
        n_rounds=2,
        questions_per_round=25,
        models_per_round_mean=40,
        ref_model="Naive Forecaster",
        skill_temperature=None,
        difficulty_temperature=lambda round_id: -10.0 if round_id == 0 else 0.0,
        model_persistence=0.0,
        fixed_models_per_round=False,
    )
    df_sim["score"] = brier_score(df_sim)

    # Mean score per round; groupby orders the rounds by round_id, so the
    # first / last entries are the lowest / highest round ids.
    round_means = df_sim.groupby("round_id")["score"].mean()
    first_round_brier = round_means.iloc[0]
    last_round_brier = round_means.iloc[-1]

    sim_records.append(
        dict(
            simulation_id=mm,
            brier_low_temp=first_round_brier,
            brier_high_temp=last_round_brier,
            brier_delta=last_round_brier - first_round_brier,
        )
    )

# One row per repetition; report the average shift, then preview the table.
res = pd.DataFrame(sim_records)
print(res["brier_delta"].mean())
res.head()

0.07269868518258737


Unnamed: 0,simulation_id,brier_low_temp,brier_high_temp,brier_delta
0,0,0.124651,0.231059,0.106408
1,1,0.141972,0.192513,0.050542
2,2,0.151883,0.189707,0.037823
3,3,0.09748,0.204283,0.106803
4,4,0.141098,0.169035,0.027937
