In [None]:
import sys

sys.path.append("../src")
import numpy as np
import pandas as pd

from ranking_sim import (
    process_raw_data,
    softmax_weights,
    simulate_round_based,
    brier_score,
)

In [2]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell executes, so edits to the project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Relative path of the folder that holds the raw input files read below.
INPUT_FOLDER = "../data/raw"

In [4]:
# Load the leaderboard file through the project's preprocessing helper and
# preview the first rows.
# NOTE(review): the input is a .pkl file — presumably unpickled inside
# process_raw_data; only load pickles that come from a trusted source.
leaderboard_path = f"{INPUT_FOLDER}/llm_and_human_leaderboard_overall.pkl"
df = process_raw_data(leaderboard_path)
df.head()

Unnamed: 0,id,source,direction,forecast_due_date,market_value_on_due_date,resolution_date,resolved_to,resolved,forecast,imputed,score,model,organization,question_type,horizon,question_id
0,1285,infer,(),2024-07-21,0.0163,2025-01-01,0.0,True,0.5,False,0.25,Claude-2.1 (scratchpad),Anthropic,market,,infer-1285-nan-2024-07-21
1,1286,infer,(),2024-07-21,0.0292,2025-01-01,0.0,True,0.0292,True,0.000853,Claude-2.1 (scratchpad),Anthropic,market,,infer-1286-nan-2024-07-21
2,1342,infer,(),2024-07-21,0.0088,2025-01-01,0.0,True,0.0088,True,7.7e-05,Claude-2.1 (scratchpad),Anthropic,market,,infer-1342-nan-2024-07-21
3,1353,infer,(),2024-07-21,0.4943,2024-12-01,0.0,True,0.4943,True,0.244332,Claude-2.1 (scratchpad),Anthropic,market,,infer-1353-nan-2024-07-21
5,1373,infer,(),2024-07-21,0.7474,2025-01-01,0.0,True,0.7474,True,0.558607,Claude-2.1 (scratchpad),Anthropic,market,,infer-1373-nan-2024-07-21


In [5]:
# Number of distinct values in the "model" column (missing values, if any,
# are counted as one distinct value).
df["model"].nunique(dropna=False)

141

In [6]:
# Number of distinct values in the "question_id" column (missing values, if
# any, are counted as one distinct value).
df["question_id"].nunique(dropna=False)

473

In [None]:
# Calibration run for the model-drift scenario: repeat a two-round simulation
# 100 times and record how the mean Brier score changes between the first
# round (skill temperature -10.0) and the last round (skill temperature 10.0).
# NOTE(review): no random seed is set in this cell, so results presumably
# differ from run to run — confirm how simulate_round_based draws randomness.
def _skill_temperature(round_id):
    """Return the skill temperature for a round: -10.0 for round 0, else 10.0."""
    if round_id == 0:
        return -10.0
    return 10.0


def _simulate_brier_shift(simulation_id):
    """Run one simulation and summarise the per-round mean Brier scores.

    Returns a dict with the simulation id, the mean score of the first and
    of the last round (rounds ordered by ``round_id``), and their difference
    (last minus first).
    """
    df_sim = simulate_round_based(
        df=df,
        n_rounds=2,
        questions_per_round=25,
        models_per_round_mean=40,
        ref_model="Naive Forecaster",
        skill_temperature=_skill_temperature,
        difficulty_temperature=None,
        model_persistence=0.0,
        fixed_models_per_round=False,
    )
    df_sim["score"] = brier_score(df_sim)
    # groupby sorts its keys, so position 0 is the lowest round_id and
    # position -1 the highest.
    round_means = df_sim.groupby("round_id")["score"].mean()
    first_round_mean = round_means.iloc[0]
    last_round_mean = round_means.iloc[-1]
    return {
        "simulation_id": simulation_id,
        "brier_low_temp": first_round_mean,
        "brier_high_temp": last_round_mean,
        "brier_delta": last_round_mean - first_round_mean,
    }


# One row per simulation run.
res = pd.DataFrame([_simulate_brier_shift(sim_id) for sim_id in range(100)])

In [7]:
# Sanity check for a model-drift specification: average each model's score,
# negate it to get a "skill" value (higher is better), and turn the skills
# into sampling weights with softmax_weights at a negative and a positive
# temperature. The table is shown with the largest high-temperature weights
# first.
# NOTE(review): this check uses temperatures -5 / 5, while the simulation
# cell above uses -10.0 / 10.0 — confirm which specification is intended.
df_temp = (
    df.groupby("model", as_index=False)["score"]
    .mean()
    .assign(brier_skill=lambda d: -d["score"])
    .assign(
        probs_low_temp=lambda d: softmax_weights(d["brier_skill"], temp=-5),
        probs_high_temp=lambda d: softmax_weights(d["brier_skill"], temp=5),
    )
)
df_temp.sort_values("probs_high_temp", ascending=False)

Unnamed: 0,model,score,brier_skill,probs_low_temp,probs_high_temp
140,Superforecaster median forecast,0.114365,-0.114365,0.004082,0.011087
13,Claude-3-5-Sonnet-20240620 (scratchpad with fr...,0.140280,-0.140280,0.004646,0.009739
16,Claude-3-5-Sonnet-20240620 (scratchpad),0.143347,-0.143347,0.004718,0.009591
14,Claude-3-5-Sonnet-20240620 (scratchpad with ne...,0.144096,-0.144096,0.004736,0.009555
15,Claude-3-5-Sonnet-20240620 (scratchpad with news),0.146248,-0.146248,0.004787,0.009453
...,...,...,...,...,...
139,Random Uniform,0.309020,-0.309020,0.010803,0.004189
0,Always 0,0.321353,-0.321353,0.011490,0.003939
42,GPT-3.5-Turbo-0125 (zero shot with freeze values),0.394058,-0.394058,0.016527,0.002738
43,GPT-3.5-Turbo-0125 (zero shot),0.404725,-0.404725,0.017433,0.002596


In [8]:
# Implied (weight-averaged) Brier score under the low- and high-temperature
# model weights. brier_skill is the negated score, so the trailing * (-1)
# flips the weighted sum back to a positive Brier score.
for weight_col in ("probs_low_temp", "probs_high_temp"):
    print((df_temp[weight_col] * df_temp["brier_skill"]).sum() * (-1))

0.24966806743569947
0.19879008741161733


In [9]:
# Sanity check for a question-drift specification: average the score per
# question and turn those averages into sampling weights with softmax_weights
# at temperature -10 and at temperature 0.0.
# NOTE(review): in the recorded output every probs_high_temp value is the
# same (0.002114) at temp=0.0, so sorting on that column does not rank the
# rows in a meaningful way — consider sorting by probs_low_temp instead.
df_temp = (
    df.groupby("question_id", as_index=False)["score"]
    .mean()
    .assign(
        probs_low_temp=lambda d: softmax_weights(d["score"], temp=-10),
        probs_high_temp=lambda d: softmax_weights(d["score"], temp=0.0),
    )
)
df_temp.sort_values("probs_high_temp", ascending=False)

Unnamed: 0,question_id,score,probs_low_temp,probs_high_temp
0,acled-0c20162bae6257b7fcaa67c009a710e82f0faad0...,0.174340,0.002014,0.002114
311,wikipedia-27e948952d66a749e55f37f9cb6303695576...,0.120002,0.003467,0.002114
323,wikipedia-38ffedf32d4ca14903ace5aea31ec5a49444...,0.064628,0.006032,0.002114
322,wikipedia-38ffedf32d4ca14903ace5aea31ec5a49444...,0.082277,0.005056,0.002114
321,wikipedia-38ffedf32d4ca14903ace5aea31ec5a49444...,0.103605,0.004085,0.002114
...,...,...,...,...
154,dbnomics-meteofrance_TEMPERATURE_celsius.78897...,0.340315,0.000383,0.002114
153,dbnomics-meteofrance_TEMPERATURE_celsius.78897...,0.228063,0.001177,0.002114
152,dbnomics-meteofrance_TEMPERATURE_celsius.78897...,0.220180,0.001273,0.002114
151,dbnomics-meteofrance_TEMPERATURE_celsius.07761...,0.244206,0.001001,0.002114


In [10]:
# Weight-averaged score under the low- and high-temperature question weights,
# negated so that it follows the "skill" sign convention (higher is better);
# the printed values are therefore negative.
for weight_col in ("probs_low_temp", "probs_high_temp"):
    print((df_temp[weight_col] * df_temp["score"]).sum() * (-1))

-0.13559097070749077
-0.21008818308932872


In [11]:
# Distinct "horizon" values that occur on rows whose question_type is "market".
mask = df["question_type"].eq("market")
df.loc[mask, "horizon"].unique()

array([nan])

In [12]:
# Check whether any market-question rows repeat the same
# (source, id, model, forecast) combination.
mask = df["question_type"].eq("market")
key_cols = ["source", "id", "model", "forecast"]
df.loc[mask].duplicated(subset=key_cols).any()

np.False_