In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from collections import defaultdict
from datetime import datetime
import kaleido  # needed for saving plots
import numpy as np
import os
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
from sklearn.model_selection import KFold

from regression import *

In [3]:
results_dir = 'results/benchmark/30Aug/'
os.makedirs(results_dir, exist_ok=True)

In [4]:
def save_plot(fig, folder, filename, extensions=['png', 'svg', 'pdf'], scale=2):
    for ext in extensions:
        fig.write_image(folder + filename + '.' + ext, scale=scale)
    fig.write_html(folder + filename + '.html')

# Prepare data

In [5]:
url = "https://docs.google.com/spreadsheets/d/1etu9rXcME0uUA-S2ANA8bsfQbIZgNu-8NxqFGQdDIzQ/export?format=csv&gid=1305280917#gid=1305280917"
df = pd.read_csv(url)
df

Unnamed: 0,System,Model size (parameters),Dataset size,Date,Open/Closed,Training compute (FLOP),Training compute notes,BBH,GPQA,MMLU,...,SEAL Math,LMSys Elo,LMSys Elo Notes,LMSys Elo 95% CI,BBH Notes,GPQA Notes,MMLU Notes,HELM MMLU Notes,Trust in benchmark results,Trust notes
0,Random chance,,,,,1.000000e+20,,0.2500,0.250,0.2500,...,0.0,0.0,,,,,,,0.0,
1,BLOOM-176B,1.760000e+11,3.900000e+11,2022-11-09,Open,4.120000e+23,,0.4491,,0.3913,...,,,,,,,,,0.0,
2,BloombergGPT,5.000000e+10,7.080000e+11,2023-03-30,Closed,2.120000e+23,,0.4197,,0.3918,...,,,,,,,,,0.0,
3,Camelidae-8x34B,,,2024-01-05,Open,,,,,0.7560,...,,,,,,,,,,
4,ChatGLM-6B,6.000000e+09,,2023-03-01,Open,,,0.1873,,,...,,880.0,,,,,,,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108,XVerse-65B,,,2023-11-05,Open,,,,,,...,,,,,,,,,0.0,
109,XVerse-7B,,,2023-09-26,Open,,,,,,...,,,,,,,,,0.0,
110,Yi-34B,3.400000e+10,3.000000e+12,2023-11-02,Open,6.120000e+23,,0.5430,0.370,0.7635,...,,1111.0,chat,,,"Not sure this is zero-shot CoT, OpenLLM2 Leade...",,,0.0,
111,Yi-6B,6.000000e+09,3.000000e+12,2023-11-02,Open,1.080000e+23,,0.4280,,0.6385,...,,,,,,,,,0.0,


In [6]:
url = "https://docs.google.com/spreadsheets/d/1KYp4h3urj-698IE9bR7n1ctuH1iyCAQ5pTZIqQ_qs9g/export?format=csv"
gsm1k_df = pd.read_csv(url)
gsm1k_df

Unnamed: 0,model,Date,gsm8k,gsm1k,Compute,Speculative Compute,Open weights?
0,claude-2.1,2023-07-11,88.7,89.4,3.800000e+24,,n
1,claude-3-haiku-20240307,2024-03-04,78.5,78.5,,,n
2,claude-3-opus-20240229,2024-03-04,80.2,82.5,,4.000000e+25,n
3,claude-3-sonnet-20240229,2024-03-04,71.9,74.4,,,n
4,codegemma-7b,2024-04-09,47.9,41.6,3.330000e+23,,y
...,...,...,...,...,...,...,...
66,vicuna-33b-v1.3,2023-06-22,37.9,34.1,,,y
67,Xwin-Math-13B-V1.0,2024-03-07,63.1,52.9,,,
68,Xwin-Math-7B-V1.0,2024-03-07,52.9,42.8,,,
69,Yi-34B-Chat,2023-11-02,64.1,56.9,6.100000e+23,,y


In [7]:
# Rename columns
gsm1k_df.rename(columns={'model': 'System', 'gsm8k': 'GSM8k', 'gsm1k': 'GSM1k', 'Compute': 'Training compute (FLOP)', 'Open weights?': 'Open/Closed'}, inplace=True)
gsm1k_df

Unnamed: 0,System,Date,GSM8k,GSM1k,Training compute (FLOP),Speculative Compute,Open/Closed
0,claude-2.1,2023-07-11,88.7,89.4,3.800000e+24,,n
1,claude-3-haiku-20240307,2024-03-04,78.5,78.5,,,n
2,claude-3-opus-20240229,2024-03-04,80.2,82.5,,4.000000e+25,n
3,claude-3-sonnet-20240229,2024-03-04,71.9,74.4,,,n
4,codegemma-7b,2024-04-09,47.9,41.6,3.330000e+23,,y
...,...,...,...,...,...,...,...
66,vicuna-33b-v1.3,2023-06-22,37.9,34.1,,,y
67,Xwin-Math-13B-V1.0,2024-03-07,63.1,52.9,,,
68,Xwin-Math-7B-V1.0,2024-03-07,52.9,42.8,,,
69,Yi-34B-Chat,2023-11-02,64.1,56.9,6.100000e+23,,y


In [8]:
gsm1k_df['Open/Closed'] = gsm1k_df['Open/Closed'].map({'y': 'Open', 'n': 'Closed'})
gsm1k_df

Unnamed: 0,System,Date,GSM8k,GSM1k,Training compute (FLOP),Speculative Compute,Open/Closed
0,claude-2.1,2023-07-11,88.7,89.4,3.800000e+24,,Closed
1,claude-3-haiku-20240307,2024-03-04,78.5,78.5,,,Closed
2,claude-3-opus-20240229,2024-03-04,80.2,82.5,,4.000000e+25,Closed
3,claude-3-sonnet-20240229,2024-03-04,71.9,74.4,,,Closed
4,codegemma-7b,2024-04-09,47.9,41.6,3.330000e+23,,Open
...,...,...,...,...,...,...,...
66,vicuna-33b-v1.3,2023-06-22,37.9,34.1,,,Open
67,Xwin-Math-13B-V1.0,2024-03-07,63.1,52.9,,,
68,Xwin-Math-7B-V1.0,2024-03-07,52.9,42.8,,,
69,Yi-34B-Chat,2023-11-02,64.1,56.9,6.100000e+23,,Open


In [9]:
# Convert percentage to fraction
gsm1k_df['GSM1k'] /= 100
gsm1k_df['GSM8k'] /= 100

In [10]:
# Concatenate gsm1k_df and df
df = pd.concat([gsm1k_df, df], axis=0, join='outer', ignore_index=True)
df

Unnamed: 0,System,Date,GSM8k,GSM1k,Training compute (FLOP),Speculative Compute,Open/Closed,Model size (parameters),Dataset size,Training compute notes,...,SEAL Math,LMSys Elo,LMSys Elo Notes,LMSys Elo 95% CI,BBH Notes,GPQA Notes,MMLU Notes,HELM MMLU Notes,Trust in benchmark results,Trust notes
0,claude-2.1,2023-07-11,0.887,0.894,3.800000e+24,,Closed,,,,...,,,,,,,,,,
1,claude-3-haiku-20240307,2024-03-04,0.785,0.785,,,Closed,,,,...,,,,,,,,,,
2,claude-3-opus-20240229,2024-03-04,0.802,0.825,,4.000000e+25,Closed,,,,...,,,,,,,,,,
3,claude-3-sonnet-20240229,2024-03-04,0.719,0.744,,,Closed,,,,...,,,,,,,,,,
4,codegemma-7b,2024-04-09,0.479,0.416,3.330000e+23,,Open,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179,XVerse-65B,2023-11-05,,,,,Open,,,,...,,,,,,,,,0.0,
180,XVerse-7B,2023-09-26,,,,,Open,,,,...,,,,,,,,,0.0,
181,Yi-34B,2023-11-02,,,6.120000e+23,,Open,3.400000e+10,3.000000e+12,,...,,1111.0,chat,,,"Not sure this is zero-shot CoT, OpenLLM2 Leade...",,,0.0,
182,Yi-6B,2023-11-02,,,1.080000e+23,,Open,6.000000e+09,3.000000e+12,,...,,,,,,,,,0.0,


In [11]:
df['Date'] = pd.to_datetime(df['Date'], format='mixed')
df

Unnamed: 0,System,Date,GSM8k,GSM1k,Training compute (FLOP),Speculative Compute,Open/Closed,Model size (parameters),Dataset size,Training compute notes,...,SEAL Math,LMSys Elo,LMSys Elo Notes,LMSys Elo 95% CI,BBH Notes,GPQA Notes,MMLU Notes,HELM MMLU Notes,Trust in benchmark results,Trust notes
0,claude-2.1,2023-07-11,0.887,0.894,3.800000e+24,,Closed,,,,...,,,,,,,,,,
1,claude-3-haiku-20240307,2024-03-04,0.785,0.785,,,Closed,,,,...,,,,,,,,,,
2,claude-3-opus-20240229,2024-03-04,0.802,0.825,,4.000000e+25,Closed,,,,...,,,,,,,,,,
3,claude-3-sonnet-20240229,2024-03-04,0.719,0.744,,,Closed,,,,...,,,,,,,,,,
4,codegemma-7b,2024-04-09,0.479,0.416,3.330000e+23,,Open,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179,XVerse-65B,2023-11-05,,,,,Open,,,,...,,,,,,,,,0.0,
180,XVerse-7B,2023-09-26,,,,,Open,,,,...,,,,,,,,,0.0,
181,Yi-34B,2023-11-02,,,6.120000e+23,,Open,3.400000e+10,3.000000e+12,,...,,1111.0,chat,,,"Not sure this is zero-shot CoT, OpenLLM2 Leade...",,,0.0,
182,Yi-6B,2023-11-02,,,1.080000e+23,,Open,6.000000e+09,3.000000e+12,,...,,,,,,,,,0.0,


In [12]:
# Filter out finetuned systems

finetuned_systems = [
 'Layer Normalization: Handwriting sequence generation',
 'ULM-FiT',
 'ADP-FAIRSEQ + NGRAMRES',
 'Cross-lingual alignment',
 'UnifiedQA',
 '$\\infty$-former (SM)',
 'FLAN 137B',
 'AlphaFold-Multimer',
 'Masked Autoencoders',
 'Contriever',
 'BERT-RBP',
 'Minerva',
 'BlenderBot 3',
 'PaLM-SayCan',
 'NMST+GPT-2',
 'Decaying Fast Weights Transformer (WT-103)',
 'GPT-2 + Progressive LRD',
 'U-PaLM',
 'Flan-T5 11B',
 'Flan-PaLM 540B',
 'Taiyi-Stable Diffusion',
 'OPT-IML (175B)',
 'SparseOPT-175B',
 'DiT-XL/2',
 'VideoMAE V2',
 'Segment Anything Model',
 'gLM',
 'MOSS-Moon-003',
 'WizardLM-7B',
 'InstructBLIP',
 'Guanaco-65B',
 'WizardCoder-15.5B',
 'Code Llama-34B',
 'Code Llama-7B',
 'TigerBot-70B',
 'MiniGPT4 (Vicuna finetune)',
 'LLaMA-7B (protein-oriented instructions finetuned)',
 'FinGPT-13B',
 'LLaVA 1.5',
 'CogVLM',
 'Volcano 13B',
 'SPHINX (Llama 2 13B)',
 'Orca 2-13B',
 'Llama Guard',
 'FunSearch',
 'Elyza',
 'Code Llama-70B',
 'Swallow'
]

df = df[~df['System'].isin(finetuned_systems)]
df = df[~df['System'].str.contains('Flan')]
df = df[~df['System'].str.contains('FLAN')]

## Merge SEAL Math with GSM1k

In [13]:
for i, row in df.iterrows():
  if pd.notna(row['SEAL Math']):
    df.at[i, 'GSM1k'] = row['SEAL Math']
df

Unnamed: 0,System,Date,GSM8k,GSM1k,Training compute (FLOP),Speculative Compute,Open/Closed,Model size (parameters),Dataset size,Training compute notes,...,SEAL Math,LMSys Elo,LMSys Elo Notes,LMSys Elo 95% CI,BBH Notes,GPQA Notes,MMLU Notes,HELM MMLU Notes,Trust in benchmark results,Trust notes
0,claude-2.1,2023-07-11,0.887,0.894,3.800000e+24,,Closed,,,,...,,,,,,,,,,
1,claude-3-haiku-20240307,2024-03-04,0.785,0.785,,,Closed,,,,...,,,,,,,,,,
2,claude-3-opus-20240229,2024-03-04,0.802,0.825,,4.000000e+25,Closed,,,,...,,,,,,,,,,
3,claude-3-sonnet-20240229,2024-03-04,0.719,0.744,,,Closed,,,,...,,,,,,,,,,
4,codegemma-7b,2024-04-09,0.479,0.416,3.330000e+23,,Open,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179,XVerse-65B,2023-11-05,,,,,Open,,,,...,,,,,,,,,0.0,
180,XVerse-7B,2023-09-26,,,,,Open,,,,...,,,,,,,,,0.0,
181,Yi-34B,2023-11-02,,,6.120000e+23,,Open,3.400000e+10,3.000000e+12,,...,,1111.0,chat,,,"Not sure this is zero-shot CoT, OpenLLM2 Leade...",,,0.0,
182,Yi-6B,2023-11-02,,,1.080000e+23,,Open,6.000000e+09,3.000000e+12,,...,,,,,,,,,0.0,


# Bench-to-bench comparisons for vetting purposes

In [14]:
bench1 = 'MMLU'
bench2 = 'GPQA' 
df[f'{bench1} (log-ratio)'] = -np.log(1/df[bench1] - 1)
df[f'{bench2} (log-ratio)'] = -np.log(1/df[bench2] - 1)

In [15]:
color_map = {'Open': 'blue', 'Closed': 'darkorange'}

# Create the plot
fig = px.scatter(df, x=f'{bench1} (log-ratio)', y=f'{bench2} (log-ratio)', color='Open/Closed',
                 title=f'{bench1} vs. {bench2} for Open and Closed Models',
                 labels={f'{bench1} (log-ratio)': f'{bench1} Score', f'{bench2} (log-ratio)': f'{bench2} Score', 'Date': 'Date', 'System': 'Model'},
                 hover_data=['System', 'Date', f'{bench1}', f'{bench2}'],
                 color_discrete_map=color_map)

# x limits
# fig.update_xaxes(range=[0.6, 1])
fig.update_xaxes(range=[0.5, 2.5])

# Improve the layout
fig.update_layout(
    width=800,
    height=400,
    xaxis_title=f"{bench1} (log-ratio)",
    yaxis_title=f"{bench2} (log-ratio)",
    legend_title="Model accessibility",
    font=dict(size=12),
    hovermode="closest"
)

# Show the plot
fig.show()

In [16]:
mmlu_gqpa_df = df.dropna(subset=[f'{bench1} (log-ratio)', f'{bench2} (log-ratio)'])
largest_diffs = (mmlu_gqpa_df[f'{bench1} (log-ratio)'] - mmlu_gqpa_df[f'{bench2} (log-ratio)']).sort_values(ascending=False).index
for idx in largest_diffs:
    print(df.loc[idx, 'System'], df.loc[idx, f'{bench1} (log-ratio)'] - df.loc[idx, f'{bench2} (log-ratio)'])

Qwen2-72B 2.1204971992590247
gpt-4-0125-preview 2.113778387831932
Reka Core 2.0809363102690654
gpt-4-0613 2.0580314351512046
Gemini 1.5 Pro (May 2024) 1.959302700686132
Jamba 1.5 Large 1.9228035796205765
GPT-4o mini 1.913486154699699
Yi-Large 1.904901465307455
Llama 3.1 405B 1.883741368652179
Llama 3 70B 1.8679247162351522
Claude 3 Opus 1.8673894507275375
Gemma 2 27B 1.863079380161238
Gemini 1.5 Pro (April 2024) 1.8529203795884015
Inflection-2.5 1.8340836029459597
GPT-3.5-turbo-16k 1.7868045487745043
GPT-4o 1.7745095511448155
Llama 3.1 70B 1.76095435175745
Gemini 1.5 Flash Preview 1.7452508804995275
Mistral Large 2 1.7222499353684637
Claude 3 Sonnet 1.7137512038477987
Yi-34B 1.7041817304740077
DBRX-Instruct 1.693728077421836
Claude 3.5 Sonnet 1.6799310038363275
Nemotron-4-340B Instruct 1.6215046373443438
Llama 3 8B 1.4266098982983506
Random chance 0.0


In [17]:
# Fit regression to bench1 vs. bench2
model = fit_ols_regression(df, [f'{bench1} (log-ratio)'], f'{bench2} (log-ratio)')
model.summary()
# Calculate residuals
residuals = pd.Series(model.resid, index=df.dropna(subset=[f'{bench1} (log-ratio)', f'{bench2} (log-ratio)']).index)
# Find the n models with the largest residuals
n = 10  # Number of models with largest residuals to find

# Sort the residuals by absolute value in descending order
largest_residuals = residuals.sort_values(ascending=True)

# Get the indices of the n largest residuals
largest_residual_indices = largest_residuals.head(n).index

# Print the models with the largest residuals
print(f"The {n} models with the largest negative residuals:")
for idx in largest_residual_indices:
    model_name = df.loc[idx, 'System']
    residual_value = residuals[idx]
    bench1_score = df.loc[idx, f'{bench1} (log-ratio)']
    bench2_score = df.loc[idx, f'{bench2} (log-ratio)']
    print(f"{model_name}: Residual = {residual_value:.4f}, {bench1} = {bench1_score:.4f}, {bench2} = {bench2_score:.4f}")


The 10 models with the largest negative residuals:
GPT-3.5-turbo-16k: Residual = -0.3291, MMLU = 0.8473, GPQA = -0.9395
Gemma 2 27B: Residual = -0.2570, MMLU = 1.1093, GPQA = -0.7538
gpt-4-0613: Residual = -0.2480, MMLU = 1.4696, GPQA = -0.5884
Reka Core: Residual = -0.1972, MMLU = 1.5999, GPQA = -0.4811
Qwen2-72B: Residual = -0.1952, MMLU = 1.6732, GPQA = -0.4473
Jamba 1.5 Large: Residual = -0.1599, MMLU = 1.3863, GPQA = -0.5365
gpt-4-0125-preview: Residual = -0.1358, MMLU = 1.7663, GPQA = -0.3475
DBRX-Instruct: Residual = -0.1323, MMLU = 1.0304, GPQA = -0.6633
Inflection-2.5: Residual = -0.0853, MMLU = 1.3615, GPQA = -0.4726
GPT-4o mini: Residual = -0.0770, MMLU = 1.5163, GPQA = -0.3971


# LMSys leaderboard

In [18]:
lmsys_leaderboard_bootstrap_elo_lu = pd.read_csv('https://docs.google.com/spreadsheets/d/12zpanuQ1Vf_ZsZ6yjIUwsN7uGPBv3ChLnEOH-g9yZDA/export?format=csv')
lmsys_leaderboard_bootstrap_elo_lu = lmsys_leaderboard_bootstrap_elo_lu.iloc[:, 1:]  # remove first column which is meaningless
lmsys_leaderboard_bootstrap_elo_lu

Unnamed: 0,gpt-4o-2024-05-13,gpt-4o-mini-2024-07-18,claude-3-5-sonnet-20240620,gemini-advanced-0514,llama-3.1-405b-instruct,gemini-1.5-pro-api-0514,gemini-1.5-pro-api-0409-preview,gpt-4-turbo-2024-04-09,gpt-4-1106-preview,claude-3-opus-20240229,...,mpt-7b-chat,chatglm2-6b,RWKV-4-Raven-14B,alpaca-13b,oasst-pythia-12b,chatglm-6b,fastchat-t5-3b,stablelm-tuned-alpha-7b,dolly-v2-12b,llama-13b
0,1286.563455,1279.608815,1274.564379,1266.819548,1263.930629,1262.800113,1256.751200,1259.245346,1251.302638,1246.760897,...,924.081608,930.543839,918.995447,901.694532,895.192400,883.743569,867.977489,837.073711,818.981457,808.397845
1,1288.428940,1287.494945,1274.828503,1268.867005,1263.724865,1263.094929,1258.320223,1258.824223,1252.959457,1250.215882,...,935.186934,920.658893,920.372902,902.614513,894.466683,882.769816,864.055190,840.495344,834.205668,808.046396
2,1287.656293,1281.667695,1275.934504,1268.043471,1261.739568,1261.318913,1258.466691,1258.400212,1254.262636,1250.576760,...,929.251400,923.474876,922.648498,902.220662,894.705996,882.850605,866.929706,847.122558,827.818429,800.490271
3,1284.869154,1279.653476,1269.833920,1270.960974,1269.859096,1260.875571,1257.971376,1257.228266,1251.218192,1249.384635,...,924.035959,922.679121,927.995020,907.170559,897.610422,886.338026,869.905058,842.907147,822.159108,802.589771
4,1285.265726,1282.100890,1268.621384,1266.144561,1262.945738,1260.335847,1257.264334,1257.701172,1248.990652,1247.745614,...,932.584454,915.986844,925.938795,907.229511,892.749966,885.482519,870.742351,844.870609,836.226871,792.792144
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1285.964492,1276.413219,1272.279060,1265.682928,1259.271852,1259.780934,1257.705533,1255.287783,1253.316087,1249.462305,...,923.560224,923.189484,917.675145,905.453634,895.762404,881.942653,870.194162,848.375467,827.956177,791.777917
96,1285.769193,1280.955978,1272.596400,1265.313367,1265.510474,1259.210643,1257.968203,1255.820464,1249.073128,1246.840205,...,926.643587,917.478913,922.521962,902.473931,895.742491,876.732514,866.833976,841.230148,817.499211,799.746847
97,1283.361066,1279.543617,1269.244543,1263.704350,1262.416331,1258.528302,1255.166021,1256.156088,1249.962949,1247.240214,...,923.772273,912.598287,917.762886,891.914471,893.081979,872.800000,861.065138,833.636823,818.914647,789.664182
98,1286.871546,1280.301802,1271.749040,1270.644211,1260.150249,1261.494653,1257.594055,1259.429122,1253.718693,1249.689318,...,920.938684,935.274025,917.555680,900.601728,889.143356,884.380531,866.582044,840.679533,821.720833,802.801129


In [19]:
# Alphabetical order
lmsys_leaderboard_bootstrap_elo_lu.mean().round().sort_index()

RWKV-4-Raven-14B               922.0
alpaca-13b                     902.0
athene-70b-0725               1246.0
bard-jan-24-gemini-pro        1208.0
chatglm-6b                     880.0
                               ...  
yi-large                      1212.0
yi-large-preview              1240.0
zephyr-7b-alpha               1042.0
zephyr-7b-beta                1053.0
zephyr-orpo-141b-A35b-v0.1    1126.0
Length: 120, dtype: float64

## Trust in evaluations

In [20]:
# Models whose evaluations we have a concrete reason to distrust
list(df[df['Trust in benchmark results'] < 0]['System'])

['DBRX-Instruct',
 'DeepSeek-67B',
 'Falcon 180B',
 'Gemma 2 27B',
 'gpt-4-0125-preview',
 'gpt-4-0613',
 'Llama 3 8B',
 'Mistral-7B',
 'Mixtral8x22B',
 'Qwen2-72B',
 'Reka Core']

In [21]:
# Models whose evaluations we have a concrete reason to trust
list(df[df['Trust in benchmark results'] > 0]['System'])

['Claude 2',
 'Claude 3 Opus',
 'Claude 3 Sonnet',
 'Gemini 1.0 Pro',
 'Gemini 1.5 Pro (April 2024)',
 'GPT-3.5-turbo-16k',
 'GPT-4 (original)',
 'gpt-4-turbo-2024-04-09',
 'Llama 3 70B',
 'Mistral Large']

# Analysis

In [22]:
benchmarks_to_analyze = ['MMLU', 'GPQA', 'GSM1k', 'BBH']
bench_is_accuracy = {'MMLU': True, 'BBH': True, 'GSM1k': True, 'GPQA': True, 'LMSys Elo': False, 'SEAL Coding': False, 'SEAL Math': False}
plot_log_error = True
non_suspects_only = True
trusted_only = False
old_models_only = False
new_models_only = False
show_model_age = False
save = True

color_map = {'Open': 'blue', 'Closed': 'darkorange'}
marker_map = {'Before': 'circle', 'After': 'diamond'}

In [23]:
df['Before November 2023'] = ['Before' if pd.to_datetime(row['Date']) < pd.to_datetime('2023-11-01') else 'After' for i, row in df.iterrows()]

## Date

In [24]:
filtered_dfs = {}

for i, bench in enumerate(benchmarks_to_analyze):
  print(bench)
  filtered_df = df[~(df['System'] == 'Random chance')]
  if old_models_only:
    filtered_df = filtered_df[filtered_df['Before November 2023'] == 'Before']
  elif new_models_only:
    filtered_df = filtered_df[filtered_df['Before November 2023'] == 'After']
  if non_suspects_only:
    if bench == 'GPQA':
      # GPQA was released November 20, 2023
      old_df = filtered_df[filtered_df['Before November 2023'] == 'Before']
      new_df = filtered_df[filtered_df['Before November 2023'] == 'After']
      new_df = new_df[new_df['Trust in benchmark results'] >= 0]
      filtered_df = pd.concat([old_df, new_df])
    elif bench == 'MMLU':
      filtered_df = filtered_df[filtered_df['Trust in benchmark results'] >= 0]
  elif trusted_only:
    filtered_df = filtered_df[filtered_df['Trust in benchmark results'] > 0]

  # Find the max score over time for each category
  filtered_df = filtered_df.sort_values('Date')
  filtered_dfs[bench] = filtered_df

MMLU
GPQA
GSM1k
BBH


In [25]:
max_rows_by_bench = defaultdict(dict)
for i, bench in enumerate(benchmarks_to_analyze):
    print(bench)
    filtered_df = filtered_dfs[bench]

    # Find the max score over time for each category
    filtered_df = filtered_df.sort_values('Date')
    max_rows = defaultdict(list)
    for category in ['Open', 'Closed']:
        category_df = filtered_df[filtered_df['Open/Closed'] == category].dropna(subset=[bench])
        max_score = 0
        for i, row in category_df.iterrows():
            score = -np.log(1 - row[bench])
            if score > max_score:
                # Models released on the same date: only include the maximum of those models
                # Example: Claude 3 Sonnet vs. Opus
                max_score = score
                max_row = {'Date': row['Date'], 'System': row['System'], 'Score': score}
                if len(max_rows[category]) > 0 and max_rows[category][-1]['Date'] == row['Date']:
                    max_rows[category][-1] = max_row
                else:
                    max_rows[category].append(max_row)

        print(f"{category} models:")
        for row in max_rows[category]:
            print(row['System'], row['Date'], row['Score'])
        print()

    max_rows_by_bench[bench] = max_rows

MMLU
Open models:
T5-Small 2019-10-23 00:00:00 0.3106095770954856
GPT-NeoX 20B 2022-02-09 00:00:00 0.4094731295057033
OPT-66B 2022-05-02 00:00:00 0.44613086483417935
BLOOM-176B 2022-11-09 00:00:00 0.49642974348225827
LLaMa-1 65B 2023-02-24 00:00:00 1.0051219455807707
LLaMa-2 70B 2023-07-18 00:00:00 1.1679623668029027
Yi-34B 2023-11-02 00:00:00 1.4418070710501492
Llama 3 70B 2024-04-18 00:00:00 1.7147984280919264
Llama 3.1 405B 2024-07-23 00:00:00 2.0635681925235456

Closed models:
text-davinci-001 2020-05-28 00:00:00 0.5058380822549516
code-davinci-002 2022-03-01 00:00:00 1.1457038962019603
PaLM 540B 2022-04-04 00:00:00 1.2482730632225159
GPT-4 (original) 2023-03-15 00:00:00 1.995100393246085
Claude 3 Opus 2024-03-04 00:00:00 2.0249533563957662
GPT-4o 2024-05-13 00:00:00 2.05572501506252
Claude 3.5 Sonnet 2024-06-20 00:00:00 2.1803674602697964

GPQA
Open models:
Yi-34B 2023-11-02 00:00:00 0.4620354595965587
Llama 3 70B 2024-04-18 00:00:00 0.5327304591540407
Nemotron-4-340B Instruct 202

In [26]:
max_rows_by_bench

defaultdict(dict,
            {'MMLU': defaultdict(list,
                         {'Open': [{'Date': Timestamp('2019-10-23 00:00:00'),
                            'System': 'T5-Small',
                            'Score': 0.3106095770954856},
                           {'Date': Timestamp('2022-02-09 00:00:00'),
                            'System': 'GPT-NeoX 20B',
                            'Score': 0.4094731295057033},
                           {'Date': Timestamp('2022-05-02 00:00:00'),
                            'System': 'OPT-66B',
                            'Score': 0.44613086483417935},
                           {'Date': Timestamp('2022-11-09 00:00:00'),
                            'System': 'BLOOM-176B',
                            'Score': 0.49642974348225827},
                           {'Date': Timestamp('2023-02-24 00:00:00'),
                            'System': 'LLaMa-1 65B',
                            'Score': 1.0051219455807707},
                           {'Date':

### Estimate lag by area between curves

In [27]:
max_dfs_for_auc = defaultdict(dict)
for i, bench in enumerate(benchmarks_to_analyze):
    print(bench)
    max_rows = max_rows_by_bench[bench]

    if max_rows['Open'][-1]['Score'] < max_rows['Closed'][-1]['Score']:
        lead_category = 'Closed'
        lag_category = 'Open'
    else:
        lead_category = 'Open'
        lag_category = 'Closed'

    lead_rows = [row for row in max_rows[lead_category]]
    lag_rows = [row for row in max_rows[lag_category]]
    # Start scores from the random chance baseline
    lead_rows.insert(0, {'Date': lead_rows[0]['Date'], 'System': 'Random chance', 'Score': df[df['System'] == 'Random chance'][bench].values[0]})
    lag_rows.insert(0, {'Date': lag_rows[0]['Date'], 'System': 'Random chance', 'Score': df[df['System'] == 'Random chance'][bench].values[0]})
    # We don't know how long it will take the lag category to catch up from now
    # So truncate the lag category to the last score that surpassed the lead category,
    # and truncate the lead category to that score too
    lag_row_to_truncate_to = None
    for lead_row in lead_rows[::-1]:
        if lag_row_to_truncate_to is not None:
            break
        if lead_row['Score'] < lag_rows[-1]['Score']:
            for i, lag_row in enumerate(lag_rows):
                if lag_row['Score'] > lead_row['Score']:
                    lag_rows = lag_rows[:i+1]
                    lag_row_to_truncate_to = lag_row
                    break

    next_lead_row = [row for row in lead_rows if row['Score'] > lag_row_to_truncate_to['Score']][0]
    lead_rows = [row for row in lead_rows if row['Score'] <= lag_row_to_truncate_to['Score']]
    lead_rows.append({'Date': lead_rows[-1]['Date'], 'System': f'{next_lead_row["System"]} (Truncated)', 'Score': lag_row_to_truncate_to['Score']})

    max_dfs_for_auc[bench][lead_category] = pd.DataFrame(lead_rows)
    max_dfs_for_auc[bench][lag_category] = pd.DataFrame(lag_rows)


MMLU
GPQA
GSM1k
BBH


In [28]:
max_dfs_for_auc['MMLU']['Closed']

Unnamed: 0,Date,System,Score
0,2020-05-28,Random chance,0.25
1,2020-05-28,text-davinci-001,0.505838
2,2022-03-01,code-davinci-002,1.145704
3,2022-04-04,PaLM 540B,1.248273
4,2023-03-15,GPT-4 (original),1.9951
5,2024-03-04,Claude 3 Opus,2.024953
6,2024-05-13,GPT-4o,2.055725
7,2024-05-13,Claude 3.5 Sonnet (Truncated),2.063568


In [29]:
# Plot time on y axis and score on x axis
bench = 'GPQA'
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=max_dfs_for_auc[bench]['Closed']['Score'],
        y=max_dfs_for_auc[bench]['Closed']['Date'], 
        mode='lines+markers',
        marker=dict(color='orange'),
        text=max_dfs_for_auc[bench]['Closed']['System'],
        textposition='bottom center',
        name='Closed',
        line=dict(shape='vh')
    )
)
fig.add_trace(
    go.Scatter(
        x=max_dfs_for_auc[bench]['Open']['Score'],
        y=max_dfs_for_auc[bench]['Open']['Date'],
        mode='lines+markers',
        marker=dict(color='blue'),
        text=max_dfs_for_auc[bench]['Open']['System'],
        textposition='bottom center',
        name='Open',
        line=dict(shape='vh')
    )
)
fig.update_layout(
    width=650,
    height=600,
    title=f"{bench} curves for AUC calculation",
    xaxis_title="Accuracy",
    yaxis_title="Date",
)
fig.show()

In [30]:
# We estimate the mean date lag using integrals
# First, calculate the area under each date(performance) function
integrals = defaultdict(lambda: defaultdict(list))
auc_lags = {}
for bench in benchmarks_to_analyze:
    integrals[bench] = {}
    for category in ['Open', 'Closed']:
        integral = 0
        max_df = max_dfs_for_auc[bench][category]
        for i, row in max_df.iterrows():
            if i == 0:
                continue
            # Measure date in months since epoch
            date_diff = (row['Date'] - datetime(1970, 1, 1)).days/30
            score_diff = row['Score'] - max_df.iloc[i-1]['Score']
            integral += score_diff * date_diff
        integrals[bench][category] = integral
    # Now take the difference between the two integrals and divide by the score range
    # We made the score range the same for each category, so use 'Open' arbitrarily
    score_range = max_dfs_for_auc[bench]['Open'].iloc[-1]['Score'] - max_dfs_for_auc[bench]['Open'].iloc[0]['Score']
    print(f"Score range for {bench}: {score_range:.2f}")
    mean_lag = (integrals[bench]['Open'] - integrals[bench]['Closed']) / score_range
    auc_lags[bench] = mean_lag
    print(f"Lag for {bench}: {mean_lag:.0f} months")

Score range for MMLU: 1.81
Lag for MMLU: 14 months
Score range for GPQA: 0.47
Lag for GPQA: 4 months
Score range for GSM1k: 3.02
Lag for GSM1k: 5 months
Score range for BBH: 0.83
Lag for BBH: 7 months


### Estimate lag by the average of discrete catch-up times

In [31]:
discrete_lags = defaultdict(list)
for i, bench in enumerate(benchmarks_to_analyze):
  print(bench)
  max_rows = max_rows_by_bench[bench]

  noise_tolerance = 0.02 # nats
  already_matched = set()
  open_winner_score = 0
  for closed_row in max_rows['Closed']:
    if closed_row['Score'] <= open_winner_score:
      # Skip if there has already been a better open model
      continue
    for open_row in max_rows['Open']:
      if open_row['Score'] > (closed_row['Score'] - noise_tolerance) and open_row['Date'] not in already_matched:
        open_winner_score = open_row['Score']
        lag_months = (open_row['Date'] - closed_row['Date']).days/365*12
        
        print(f"{open_row['System']} at {open_row['Score']:.2f} matched or exceeded " + 
              f"{closed_row['System']} at {closed_row['Score']:.2f} after " + 
              f"{lag_months:.1f} months")
        # already_matched.add(open_row['Date'])
        discrete_lags[bench].append(lag_months)
        break

  print(f"{bench} mean lag: {np.mean(discrete_lags[bench]):.1f} months\n")

discrete_lags = {bench: np.mean(lags) for bench, lags in discrete_lags.items()}
discrete_lags

MMLU
BLOOM-176B at 0.50 matched or exceeded text-davinci-001 at 0.51 after 29.4 months
LLaMa-2 70B at 1.17 matched or exceeded code-davinci-002 at 1.15 after 16.6 months
Yi-34B at 1.44 matched or exceeded PaLM 540B at 1.25 after 19.0 months
Llama 3.1 405B at 2.06 matched or exceeded GPT-4 (original) at 2.00 after 16.3 months
MMLU mean lag: 20.3 months

GPQA
Yi-34B at 0.46 matched or exceeded gpt-4-0613 at 0.44 after 4.7 months
Llama 3.1 405B at 0.72 matched or exceeded Claude 3 Opus at 0.70 after 4.6 months
GPQA mean lag: 4.7 months

GSM1k
Mixtral-8x22B-instruct-v0.1 at 1.43 matched or exceeded gpt-3.5-turbo at 1.40 after 10.2 months
Llama 3 70B at 2.31 matched or exceeded claude-2.1 at 2.24 after 9.3 months
Llama 3.1 405B at 3.12 matched or exceeded gpt-4-0125-preview at 3.02 after 5.9 months
GSM1k mean lag: 8.4 months

BBH
GPT-NeoX 20B at 0.52 matched or exceeded text-davinci-001 at 0.41 after 20.4 months
Falcon 180B at 0.78 matched or exceeded code-davinci-002 at 0.75 after 18.2 mon

{'MMLU': 20.317808219178083,
 'GPQA': 4.652054794520549,
 'GSM1k': 8.449315068493151,
 'BBH': 14.750684931506848}

### Estimate lag by linear regression

In [32]:
fig = make_subplots(rows=2, cols=2, subplot_titles=benchmarks_to_analyze, vertical_spacing=0.2, horizontal_spacing=0.2)
for i, bench in enumerate(benchmarks_to_analyze):
  for category in ['Open', 'Closed']:
    category_df = pd.DataFrame(max_rows_by_bench[bench][category])

    fig.append_trace(
        go.Scatter(
            x=category_df['Date'],
            y=category_df['Score'],
            mode='markers',
            marker=dict(color=color_map[category]),
            text=category_df['System'],
            name=category,
            legendgroup=category,
            showlegend=True if i == 0 else False
        ),
        row=i//2 + 1, col=i%2 + 1
    )
fig.update_layout(
    width=600,
    height=400,
    margin=dict(l=10, r=10, t=70, b=10),
    font=dict(size=12),
    hovermode="closest",
)
fig.show()

In [33]:
reg_results = defaultdict(dict)
for i, bench in enumerate(benchmarks_to_analyze):
  for category in ['Open', 'Closed']:
    category_df = pd.DataFrame(max_rows_by_bench[bench][category])

    # Ignore the flatter part of the curve close to random chance
    random_chance_level = df[df['System'] == 'Random chance'][bench].values[0]
    thres = -np.log(1 - random_chance_level) + 0.2
    category_df = category_df[category_df['Score'] > thres]

    # Do regression
    category_df['year'] = datetime_to_float_year(category_df['Date'])
    results = fit_ols_regression(category_df, ['Score'], 'year')
    print(f"{bench} {category} R^2 of {results.rsquared:.2f}, equation: {results.params[0]:.2f} + {results.params[1]:.2f} * score")
    reg_results[bench][category] = results

MMLU Open R^2 of 0.97, equation: 2022.17 + 1.17 * score
MMLU Closed R^2 of 0.95, equation: 2019.35 + 2.30 * score
GPQA Open R^2 of 0.73, equation: 2023.77 + 1.11 * score
GPQA Closed R^2 of 0.87, equation: 2023.26 + 1.37 * score
GSM1k Open R^2 of 0.74, equation: 2023.49 + 0.40 * score
GSM1k Closed R^2 of 0.86, equation: 2022.74 + 0.44 * score
BBH Open R^2 of 0.77, equation: 2022.20 + 1.30 * score
BBH Closed R^2 of 0.85, equation: 2021.85 + 1.08 * score


In [34]:
# Find the average of the area between curves
for bench in benchmarks_to_analyze:
    open_slope = reg_results[bench]['Open'].params[1]
    open_intercept = reg_results[bench]['Open'].params[0]
    closed_slope = reg_results[bench]['Closed'].params[1]
    closed_intercept = reg_results[bench]['Closed'].params[0]

    random_chance_level = df[df['System'] == 'Random chance'][bench].values[0]
    min_score = -np.log(1 - random_chance_level)
    # print(f"Possible max scores: Open: {max_rows_by_bench[bench]['Open'][-1]['System']} at {max_rows_by_bench[bench]['Open'][-1]['Score']:.2f}, Closed: {max_rows_by_bench[bench]['Closed'][-1]['System']} at {max_rows_by_bench[bench]['Closed'][-1]['Score']:.2f}")
    max_score = min(max_rows_by_bench[bench]['Open'][-1]['Score'], max_rows_by_bench[bench]['Closed'][-1]['Score'])
    # print(f"Min score: {min_score:.2f}, max score: {max_score:.2f}")

    # Use the formula for the area between two lines
    avg_lag = 0.5 * (open_slope - closed_slope) * (min_score + max_score) + (open_intercept - closed_intercept)
    print(f"{bench}: {avg_lag*12:.0f} months")

MMLU: 18 months
GPQA: 5 months
GSM1k: 8 months
BBH: 7 months


### Plot benchmarks over time with lags

In [35]:
lags = auc_lags

In [36]:
fig = make_subplots(rows=2, cols=2, subplot_titles=benchmarks_to_analyze, vertical_spacing=0.2, horizontal_spacing=0.2)

# Define x limits for each subplot
x_limits = {
  'MMLU': [datetime(2019, 1, 1), datetime(2025, 1, 1)],
  'BBH': [datetime(2019, 1, 1), datetime(2025, 1, 1)],
  'GSM1k': [datetime(2019, 1, 1), datetime(2025, 1, 1)],
  'GPQA': [datetime(2023, 1, 1), datetime(2025, 1, 1)],
  'SEAL Coding': [datetime(2023, 1, 1), datetime(2025, 1, 1)],
  'SEAL Math': [datetime(2023, 1, 1), datetime(2025, 1, 1)],
  'LMSys Elo': [datetime(2023, 1, 1), datetime(2025, 1, 1)],
}

for i, bench in enumerate(benchmarks_to_analyze):
  for category in ['Open', 'Closed']:
    category_df = pd.DataFrame(max_rows_by_bench[bench][category])

    fig.append_trace(
        go.Scatter(
            x=category_df['Date'],
            y=category_df['Score'],
            mode='lines',
            line=dict(color=color_map[category], shape='hv'),
            text=category_df['System'],
            name=category,
            legendgroup=category,
            showlegend=True if i == 0 else False
        ),
        row=i//2 + 1, col=i%2 + 1
    )

  # Annotate with lag
  ref_idx = "" if i == 0 else i+1
  avg_lag = lags[bench]
  fig.add_annotation(
    x=0.05,
    y=1,
    text=f"Mean lag: {avg_lag:.0f} months",
    xref=f"x{ref_idx} domain",
    yref=f"y{ref_idx} domain",
    showarrow=False,
    font=dict(size=10),
    align="left",
    xanchor="left",
    yanchor="top"
  )

  # Update x and y axes for this subplot
  dtick = "M12" if bench in ["GPQA"] else "M24"
  fig.update_xaxes(
    title_text="Model publication date" if i//2 + 1 == 2 else None,
    range=[x_limits[bench][0], x_limits[bench][1]],  # Set x limits
    row=i//2 + 1,
    col=i%2 + 1,
    dtick=dtick,  # Set tick marks to every 12 months
    tickformat="%Y",  # Display only the year
    # ticklabelmode="period",  # Ensure labels are centered on the year
    tickangle=0  # Make tick labels horizontal
  )

  if i%2 + 1 == 1:
    fig.update_yaxes(title_text="Negative log error", row=i//2 + 1, col=i%2 + 1)

# Title
fig.update_layout(title_text="Open LLMs lag on key benchmarks")

# Improve the layout
fig.update_layout(
  width=600,
  height=400,
  legend_title="Model accessibility",
  font=dict(size=12),
  hovermode="closest",
  template="plotly_white",
)

# Margins
fig.update_layout(
  margin=dict(l=10, r=10, t=70, b=10)
)

# Save the plot
if save:
  save_plot(fig, results_dir, f"benchmark_dates_{'_'.join(benchmarks_to_analyze)}")

# Show the plot
fig.show()

## Compute

In [37]:
fig = make_subplots(rows=2, cols=2, subplot_titles=benchmarks_to_analyze, vertical_spacing=0.15)

# Define x limits for each subplot
x_limits = {
  'MMLU': [1e20, 1e26],
  'GSM1k': [1e20, 1e26],
  'GPQA': [1e23, 1e26],
  'BBH': [1e20, 1e26],
  'SEAL Math': [1e23, 1e26],
  'SEAL Coding': [1e23, 1e26],
  'LMSys Elo': [1e22, 1e26],
}

for i, bench in enumerate(benchmarks_to_analyze):
  plot_df = df[~(df['System'] == 'Random chance')]
  if old_models_only:
    plot_df = plot_df[plot_df['Before November 2023'] == 'Before']
  elif new_models_only:
    plot_df = plot_df[plot_df['Before November 2023'] == 'After']
  if non_suspects_only:
    if bench == 'GPQA':
      # GPQA was released November 20, 2023
      old_df = plot_df[plot_df['Before November 2023'] == 'Before']
      new_df = plot_df[plot_df['Before November 2023'] == 'After']
      new_df = new_df[new_df['Trust in benchmark results'] >= 0]
      plot_df = pd.concat([old_df, new_df])
    elif bench == 'MMLU':
      plot_df = plot_df[plot_df['Trust in benchmark results'] >= 0]
  elif trusted_only:
    plot_df = plot_df[plot_df['Trust in benchmark results'] > 0]

  for category in ['Open', 'Closed']:
    category_df = plot_df[plot_df['Open/Closed'] == category]

    if plot_log_error and bench_is_accuracy[bench]:
      y = -np.log(1 - category_df[bench])
    else:
      y = category_df[bench]

    if show_model_age:
      fig.append_trace(
        go.Scatter(
          x=category_df['Training compute (FLOP)'],
          y=y,
          mode='markers',
          marker=dict(color=category_df['Date'].astype(int),),
          text=category_df['System'],
          # name=category,
          # legendgroup=category,
          # showlegend=True if i == 0 else False
          showlegend=False,
        ),
        row=i//2 + 1, col=i%2 + 1
      )
    else:
      fig.append_trace(
        go.Scatter(
          x=category_df['Training compute (FLOP)'],
          y=y,
          mode='markers',
          marker=dict(color=color_map[category]),
          text=category_df['System'],
          name=category,
          legendgroup=category,
          showlegend=True if i == 0 else False
        ),
        row=i//2 + 1, col=i%2 + 1
      )

    # Plot scores as horizontal lines for models that don't have compute
    # for _, row in category_df.iterrows():
    #   if pd.isna(row['Training compute (FLOP)']) and not pd.isna(row[bench]):
    #     fig.append_trace(
    #       go.Scatter(
    #         x=[0, 1e26],
    #         y=[row[bench], row[bench]],
    #         mode='lines',
    #         line=dict(color=color_map[category]),
    #         showlegend=False,
    #       ),
    #       row=i//2 + 1, col=i%2 + 1
    #     )

  # Update x and y axes for this subplot
  fig.update_xaxes(
    title_text="Training compute (FLOP)" if i//2 + 1 == 2 else None,
    type='log',
    range=[np.log10(x_limits[bench][0]), np.log10(x_limits[bench][1])],  # Set x limits
    tickmode='linear',
    dtick=2,  # This sets ticks at every two powers of 10
    row=i//2 + 1,
    col=i%2 + 1
  )

  if i%2 + 1 == 1:
    fig.update_yaxes(title_text="Score", row=i//2 + 1, col=i%2 + 1)

# Improve the layout
fig.update_layout(
  width=600,
  height=400,
  # legend_title="Model accessibility",
  font=dict(size=12),
  hovermode="closest",
)

# Margins
fig.update_layout(
  margin=dict(l=0, r=0, t=20, b=0)
)

# Save the plot
if save:
  save_plot(fig, results_dir, 'benchmark_compute')

# Show the plot
fig.show()

In [38]:
bench = 'MMLU'
reg_df = df[~(df['System'] == 'Random chance')]
if non_suspects_only:
  if bench == 'GPQA':
    # GPQA was released November 20, 2023
    old_df = reg_df[reg_df['Before November 2023'] == 'Before']
    new_df = reg_df[reg_df['Before November 2023'] == 'After']
    new_df = new_df[new_df['Trust in benchmark results'] >= 0]
    reg_df = pd.concat([old_df, new_df])
  elif bench == 'MMLU':
    reg_df = reg_df[reg_df['Trust in benchmark results'] >= 0]
elif trusted_only:
  reg_df = reg_df[reg_df['Trust in benchmark results'] > 0]


In [39]:
random_chance_level = df[df["System"]=="Random chance"][bench].values[0]
# Filter out models that are not far above random chance level
# This is a heuristic to find the changepoint
filtered_reg_df = reg_df[reg_df[bench] > random_chance_level + 0.05]
filtered_reg_df['log_compute'] = np.log10(filtered_reg_df['Training compute (FLOP)'])
filtered_reg_df[bench + '_log_error'] = -np.log(1 - filtered_reg_df[bench])
filtered_reg_df.dropna(subset=['log_compute', bench + '_log_error'], inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [40]:
filtered_reg_df['year'] = datetime_to_float_year(filtered_reg_df['Date'])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [41]:
# Single fit for all data
model = fit_ols_regression(filtered_reg_df, ['log_compute', 'year'], bench + '_log_error')
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.891
Model:,OLS,Adj. R-squared:,0.885
Method:,Least Squares,F-statistic:,162.8
Date:,"Wed, 04 Sep 2024",Prob (F-statistic):,6.03e-20
Time:,13:25:00,Log-Likelihood:,17.206
No. Observations:,43,AIC:,-28.41
Df Residuals:,40,BIC:,-23.13
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-362.2236,65.799,-5.505,0.000,-495.209,-229.238
x1,0.4413,0.031,14.461,0.000,0.380,0.503
x2,0.1743,0.033,5.342,0.000,0.108,0.240

0,1,2,3
Omnibus:,1.433,Durbin-Watson:,1.128
Prob(Omnibus):,0.489,Jarque-Bera (JB):,0.639
Skew:,0.22,Prob(JB):,0.726
Kurtosis:,3.403,Cond. No.,5190000.0


In [42]:
def no_split(df, filter_threshold=None):
  return {'All': df}


def open_closed_split(df, filter_threshold=None):
  open_df = df[df['Open/Closed'] == 'Open']
  closed_df = df[df['Open/Closed'] == 'Closed']
  return {'Open': open_df, 'Closed': closed_df}


def new_old_split(df, date):
  new_df = df[df['Date'] >= date]
  old_df = df[df['Date'] < date]
  return {'Before': old_df, 'After': new_df}


def combined_rsquared(xs, ys, models):
    y_true = np.concatenate(ys)
    y_mean = np.mean(y_true)
    y_pred = []
    for i, model in enumerate(models):
        X = sm.add_constant(xs[i])
        y_pred.append(model.predict(X))
    y_pred = np.concatenate(y_pred)
    
    sst = np.sum((y_true - y_mean)**2)
    ssr = np.sum((y_true - y_pred)**2)
    r_squared = 1 - (ssr / sst)
    return r_squared


def combined_bic(xs, ys, models):
  y = np.concatenate(ys)
  y_pred = []
  total_params = 0
  for i, model in enumerate(models):
    X = sm.add_constant(xs[i])
    y_pred.append(model.predict(X))
    total_params += len(model.params)
  y_pred = np.concatenate(y_pred)
  n = len(y)
  rss = np.sum((y - y_pred)**2)
  ll = -n/2 * (1 + np.log(2*np.pi) + np.log(rss/n))
  bic = -2 * ll + total_params * np.log(n)
  return bic


# K-Fold Cross Validation
def perform_cross_validation(df, filter_fn, features, bench, k=10, random_state=42, filter_threshold=None):
  kf = KFold(n_splits=k, shuffle=True, random_state=random_state)
  folds_mses = []
  for train_index, test_index in kf.split(df):
    train_df, test_df = df.iloc[train_index], df.iloc[test_index]
    train_dfs = filter_fn(train_df, filter_threshold)
    test_dfs = filter_fn(test_df, filter_threshold)

    # Fit the models on the training set
    submodels = {}
    for category, train_df in train_dfs.items():
      model = fit_ols_regression(train_df, features, bench + '_log_error')
      submodels[category] = model

    # Predict on the test set
    residuals = []
    for i, (category, test_df) in enumerate(test_dfs.items()):
      predicted = get_predictions(submodels[category], test_df, features)
      residuals.append(predicted - test_df[bench + '_log_error'])
    residuals = np.concatenate(residuals)
    mse = np.mean(residuals**2)
    folds_mses.append(mse)

  return np.array(folds_mses)


def regression_with_results(df, filter_fn, features, bench, filter_threshold=None):
  dfs = filter_fn(df, filter_threshold)
  submodels = {category: fit_ols_regression(df, features, bench + '_log_error') for category, df in dfs.items()}
  mses = perform_cross_validation(df, filter_fn, features, bench, filter_threshold=filter_threshold)
  bic = combined_bic(
      [dfs[category][features] for category in dfs],
      [dfs[category][bench + '_log_error'] for category in dfs],
      [submodels[category] for category in dfs],
  )
  rsquared = combined_rsquared(
      [dfs[category][features] for category in dfs],
      [dfs[category][bench + '_log_error'] for category in dfs],
      [submodels[category] for category in dfs],
  )
  return {'mses': mses, 'bic': bic, 'rsquared': rsquared, 'submodels': submodels}

In [43]:
# Iterate over a range of dates to split the data on
# Try each month from 2020-01 to 2024-08
best_bic = np.inf
best_date = None
for date in pd.date_range(start='2020-01-01', end='2024-09-01', freq='MS'):
  before_df = filtered_reg_df[filtered_reg_df['Date'] < date]
  after_df = filtered_reg_df[filtered_reg_df['Date'] >= date]
  if len(before_df) == 0 or len(after_df) == 0:
    continue
  before_model = fit_ols_regression(before_df, ['log_compute'], bench + '_log_error')
  after_model = fit_ols_regression(after_df, ['log_compute'], bench + '_log_error')
  # Calculate overall BIC
  bic = combined_bic(
      [before_df['log_compute'], after_df['log_compute']],
      [before_df[bench + '_log_error'], after_df[bench + '_log_error']],
      [before_model, after_model],
  )
  print(f"BIC for {date}: {bic}")
  if bic < best_bic:
    best_bic = bic
    best_date = date
    best_models = [before_model, after_model]

print(f"Best BIC: {best_bic} when splitting on {best_date}")

BIC for 2020-06-01 00:00:00: -3.46603778216347
BIC for 2020-07-01 00:00:00: -3.46603778216347
BIC for 2020-08-01 00:00:00: -3.46603778216347
BIC for 2020-09-01 00:00:00: -3.46603778216347
BIC for 2020-10-01 00:00:00: -3.46603778216347
BIC for 2020-11-01 00:00:00: -3.46603778216347
BIC for 2020-12-01 00:00:00: -3.46603778216347
BIC for 2021-01-01 00:00:00: -3.46603778216347
BIC for 2021-02-01 00:00:00: -3.46603778216347
BIC for 2021-03-01 00:00:00: -3.46603778216347
BIC for 2021-04-01 00:00:00: -3.46603778216347
BIC for 2021-05-01 00:00:00: -3.46603778216347
BIC for 2021-06-01 00:00:00: -3.46603778216347
BIC for 2021-07-01 00:00:00: -3.46603778216347
BIC for 2021-08-01 00:00:00: -3.46603778216347
BIC for 2021-09-01 00:00:00: -3.46603778216347
BIC for 2021-10-01 00:00:00: -3.46603778216347
BIC for 2021-11-01 00:00:00: -3.46603778216347
BIC for 2021-12-01 00:00:00: -3.46603778216347
BIC for 2022-01-01 00:00:00: -3.46603778216347
BIC for 2022-02-01 00:00:00: -3.46603778216347
BIC for 2022-

In [44]:
results = {}
results['no_split'] = regression_with_results(filtered_reg_df, no_split, ['log_compute'], bench)
results['no_split_year'] = regression_with_results(filtered_reg_df, no_split, ['log_compute', 'year'], bench)
results['open_closed_split'] = regression_with_results(filtered_reg_df, open_closed_split, ['log_compute'], bench)
results['open_closed_split_year'] = regression_with_results(filtered_reg_df, open_closed_split, ['log_compute', 'year'], bench)
for split, result in results.items():
  print(f"Results for {split}:")
  print(f"Mean k-fold MSE: {np.mean(result['mses']):.3f} ({np.std(result['mses']):.3f})")
  print(f"BIC: {result['bic']:.4f}")
  print(f"R-squared: {result['rsquared']:.4f}")
  print("--------------------")

Results for no_split:
Mean k-fold MSE: 0.052 (0.029)
BIC: -3.7357
R-squared: 0.8126
--------------------
Results for no_split_year:
Mean k-fold MSE: 0.032 (0.023)
BIC: -23.1280
R-squared: 0.8906
--------------------
Results for open_closed_split:
Mean k-fold MSE: 0.049 (0.036)
BIC: -2.6215
R-squared: 0.8385
--------------------
Results for open_closed_split_year:
Mean k-fold MSE: 0.036 (0.026)
BIC: -13.9646
R-squared: 0.8959
--------------------


In [45]:
fig = go.Figure()

for category, color in [('Open', 'blue'), ('Closed', 'orange')]:
  df_category = filtered_reg_df[filtered_reg_df['Open/Closed'] == category]
  fig.add_trace(
      go.Scatter(
          x=df_category['log_compute'],
          y=df_category[bench + '_log_error'],
          mode='markers',
          name=f'{category} Models',
          marker=dict(color=color, opacity=0.5),
          text=df_category['System']
      ),
  )

fig.add_annotation(
    x=24,
    y=1.537,
    text="DeepSeek V2",
    xref="x",
    yref="y",
    showarrow=True,
    font=dict(size=10),
    align="left",
    xanchor="right",
    ax=-5,
    ay=-5
)
fig.add_annotation(
    x=24.87,
    y=1.527,
    text="PaLM 2",
    xref="x",
    yref="y",
    showarrow=True,
    font=dict(size=10),
    align="left",
    xanchor="left",
    ax=5,
    ay=5,
)
fig.add_annotation(
    x=23.64,
    y=1.25,
    text="Gemma 2 9B",
    xref="x",
    yref="y",
    showarrow=True,
    font=dict(size=10),
    align="left",
    xanchor="right",
    ax=-5,
    ay=-5
)
fig.add_annotation(
    x=24.4,
    y=1.25,
    text="PaLM 540B",
    xref="x",
    yref="y",
    showarrow=True,
    font=dict(size=10),
    align="left",
    xanchor="left",
    ax=5,
    ay=5
)

# Update layout
fig.update_layout(
  width=600,
  height=250,
  title_text=f"{bench} score vs log compute",
  showlegend=True,
  # legend=dict(
  #   orientation="h",
  #   yanchor="bottom",
  #   y=-0.4,
  #   xanchor="left",
  #   x=0,
  # ),
  margin=dict(t=50, b=20, l=40, r=40),
  template='plotly_white'
)

fig.update_xaxes(title_text="Log Compute", range=[21.5, 26.5])
fig.update_yaxes(title_text="Negative log of error rate")

# Show the plot
fig.show()

# Save the plot
if save:
  save_plot(fig, results_dir, f'{bench}_compute')

In [46]:
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=filtered_reg_df['log_compute'],
        y=filtered_reg_df[bench + '_log_error'],
        mode='markers',
        name='Data',
        marker=dict(color='gray', opacity=0.5),
        text=filtered_reg_df['System']
    ),
)

all_model = results['no_split']['submodels']['All']
y_pred = get_predictions(all_model, filtered_reg_df, ['log_compute'])

fig.add_trace(
    go.Scatter(
        x=filtered_reg_df['log_compute'],
        y=y_pred,
        mode='lines',
        name=f'Fit (compute)',
        text=filtered_reg_df['System'],
        marker=dict(color='black'),
    ),
)

# for category, color in [('Open', 'blue'), ('Closed', 'orange')]:
#   category_models = results['open_closed_split_year']['submodels']
#   model_category = category_models[category]
#   category_df = filtered_reg_df[filtered_reg_df['Open/Closed'] == category]
#   y_pred = get_predictions(model_category, category_df, ['log_compute', 'year'])
  
#   fig.add_trace(
#       go.Scatter(
#           x=category_df['log_compute'],
#           y=y_pred,
#           mode='markers',
#           name=f'Fit (compute, age, {category})',
#           text=category_df['System'],
#           marker=dict(color=color, opacity=0.5, symbol='square'),
#       ),
#   )

all_model = results['no_split_year']['submodels']['All']
y_pred = get_predictions(all_model, filtered_reg_df, ['log_compute', 'year'])

fig.add_trace(
  go.Scatter(
    x=filtered_reg_df['log_compute'],
    y=y_pred,
    mode='markers',
    name='Fit (compute, age)',
    text=filtered_reg_df['System'],
    marker=dict(color='green', opacity=0.5),
  ),
)

# Update layout
fig.update_layout(
  width=600,
  height=250,
  title_text=f"{bench} score vs log compute",
  showlegend=True,
  # legend=dict(
  #   orientation="h",
  #   yanchor="bottom",
  #   y=-0.4,
  #   xanchor="left",
  #   x=0,
  # ),
  margin=dict(t=50, b=20, l=40, r=40),
  template='plotly_white'
)

fig.update_xaxes(title_text="Log Compute", range=[22, 26])
fig.update_yaxes(title_text="Negative log of error rate")

# Show the plot
fig.show()

# Save the plot
if save:
  save_plot(fig, results_dir, f'{bench}_regression_models_comparison')

In [47]:
fig = go.Figure()

category = 'Open'
color = 'blue'
category_models = results['open_closed_split_year']['submodels']
model_category = category_models[category]
category_df = filtered_reg_df[filtered_reg_df['Open/Closed'] == category]
y_pred = get_predictions(model_category, category_df, ['log_compute', 'year'])

fig.add_trace(
    go.Scatter(
        x=category_df['log_compute'],
        y=y_pred - category_df[bench + '_log_error'],
        mode='markers',
        name=f'Fit (compute, age, {category})',
        marker=dict(color=color, opacity=0.5),
    ),
)

all_model = results['no_split_year']['submodels']['All']
y_pred = get_predictions(all_model, category_df, ['log_compute', 'year'])

fig.add_trace(
  go.Scatter(
    x=category_df['log_compute'],
    y=y_pred - category_df[bench + '_log_error'],
    mode='markers',
    name='Fit (compute, age)',
    marker=dict(color='green', opacity=0.5),
  ),
)

# Update layout
fig.update_layout(
  width=600,
  height=300,
  title_text=f"{bench} score vs log compute",
  showlegend=True,
  # legend=dict(
  #   orientation="h",
  #   yanchor="bottom",
  #   y=-0.4,
  #   xanchor="left",
  #   x=0,
  # ),
  margin=dict(t=50, b=20, l=40, r=40)
)

fig.update_xaxes(title_text="Log Compute", range=[22, 26])
fig.update_yaxes(title_text="Residuals")

# Show the plot
fig.show()

# Save the plot
if save:
  save_plot(fig, results_dir, f'{bench}_regression_residuals_{category}')

In [48]:
# Calculate the proportion of New models that are Open models
new_df = filtered_reg_df[filtered_reg_df['Date'] >= best_date]
old_df = filtered_reg_df[filtered_reg_df['Date'] < best_date]
open_df = new_df[new_df['Open/Closed'] == 'Open']
proportion_new = len(open_df) / len(new_df)
print(f"Proportion of New models that are Open models: {len(open_df)} out of {len(new_df)} ({proportion_new*100:.0f}%)")

Proportion of New models that are Open models: 11 out of 16 (69%)


In [49]:
# Calculate the average date of new models
# Need to convert the date to a number of days since the epoch
new_avg_age = (new_df['Date'] - new_df['Date'].min()).dt.days.mean()
old_avg_age = (old_df['Date'] - old_df['Date'].min()).dt.days.mean()
# Convert the average age back to date
new_avg_age_date = new_df['Date'].min() + pd.Timedelta(days=new_avg_age)
old_avg_age_date = old_df['Date'].min() + pd.Timedelta(days=old_avg_age)
print(f"Average age of new models: {new_avg_age_date}")
print(f"Average age of old models: {old_avg_age_date}")
print(f"Difference in average age: {new_avg_age_date - old_avg_age_date}")

Average age of new models: 2024-03-11 12:00:00
Average age of old models: 2022-12-25 11:33:20
Difference in average age: 442 days 00:26:40
