In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from collections import defaultdict
from datetime import datetime
import kaleido  # needed for saving plots
import numpy as np
import os
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
from sklearn.model_selection import KFold
from tqdm import tqdm

from regression import *

# Parameters

In [3]:
benchmarks_to_analyze = ['MMLU', 'GPQA', 'GSM1k', 'BBH']
# Llama ablations
exclude_big_llama = False
exclude_all_llamas = False

bench_is_accuracy = {'MMLU': True, 'BBH': True, 'GSM1k': True, 'GPQA': True, 'LMSys Elo': False, 'SEAL Coding': False, 'SEAL Math': False}
plot_log_error = False  # Whether to plot the negative log of error rate instead of accuracy. This only applies to performance vs. compute plots.
non_suspects_only = True  # Whether to only include not-suspicious benchmark scores in the analysis
trusted_only = False  # Whether to only include actively trusted benchmark scores in the analysis (more strict)
old_models_only = False  # Whether to only include models released before November 2023
new_models_only = False  # Whether to only include models released in or after November 2023
show_model_age = False  # Whether to show model age on plots
save = True  # Whether to save plots and results to disk

color_map = {'Open': 'blue', 'Closed': 'darkorange'}
marker_map = {'Before': 'circle', 'After': 'diamond'}  # new vs. old models

In [4]:
results_dir = 'results/benchmark/16Oct/'
os.makedirs(results_dir, exist_ok=True)

In [5]:
rng = np.random.default_rng(seed=42)

In [6]:
def save_plot(fig, folder, filename, extensions=['png', 'svg', 'pdf'], scale=2):
    prefix = ''
    if exclude_all_llamas:
        prefix = 'all_llamas_excluded_'
    elif exclude_big_llama:
        prefix = 'big_llama_excluded_'
        
    for ext in extensions:
        fig.write_image(folder + prefix + filename + '.' + ext, scale=scale)
    fig.write_html(folder + prefix + filename + '.html')

# Prepare data

In [7]:
# data_path = "https://docs.google.com/spreadsheets/d/1etu9rXcME0uUA-S2ANA8bsfQbIZgNu-8NxqFGQdDIzQ/export?format=csv&gid=1305280917#gid=1305280917"
data_path = "data/benchmarks_with_model_accessibility.csv"
df = pd.read_csv(data_path)

pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

df.head(30)

Unnamed: 0,System,Model size (parameters),Active Parameters,Dataset size,Date,Open/Closed,Training compute (FLOP),Training compute notes,BBH,GPQA,Diamond subset?,MMLU,HELM MMLU,SEAL Coding,SEAL Instruction Following,SEAL Math,LMSys Elo,LMSys Elo Notes,LMSys Elo 95% CI,BBH Notes,GPQA Notes,MMLU Notes,HELM MMLU Notes,Trust in benchmark results,Trust notes
0,Random chance,,,,,,1e+20,,0.25,0.25,y,0.25,0.25,0.0,0.0,0.0,0.0,,,,,,,0,
1,BLOOM-176B,176000000000.0,176000000000.0,390000000000.0,2022-11-09,Open,4.12e+23,,0.4491,,,0.3913,,,,,,,,,,,,0,
2,BloombergGPT,50000000000.0,50000000000.0,708000000000.0,2023-03-30,Closed,2.12e+23,,0.4197,,,0.3918,,,,,,,,,,,,0,
3,Camelidae-8x34B,,,,2024-01-05,Open,,,,,,0.756,,,,,,,,,,,,0,
4,ChatGLM-6B,6000000000.0,6000000000.0,,2023-03-01,Open,,,0.1873,,,,,,,,880.0,,,,,,,0,
5,ChatGLM2-12B-base,12000000000.0,12000000000.0,,2023-06-25,Open,,,0.3602,,,,,,,,,,,,,,,0,
6,ChatGLM2-6B-base,6000000000.0,6000000000.0,,2023-06-25,Open,,,0.3368,,,,,,,,924.0,,,,,,,0,
7,ChatGLM3-6B,6000000000.0,6000000000.0,,2023-10-27,Open,5.04e+22,,0.661,,,,,,,,955.0,,,,,,,0,
8,Chinchilla 70B,,70000000000.0,,2022-03-29,Closed,5.76e+23,,,,,0.675,,,,,,,,,,,,0,
9,Claude 2,,,,2023-07-11,Closed,,,,,,0.785,,,,,1132.0,,,,,"Actually CoT, so probably an overestimate. HEL...",,1,Claude 2.1 doesn't perform worse on GSM1k rela...


In [8]:
# gsm1k_data_path = "https://docs.google.com/spreadsheets/d/1KYp4h3urj-698IE9bR7n1ctuH1iyCAQ5pTZIqQ_qs9g/export?format=csv"
gsm1k_data_path = "data/gsm1k_with_model_accessibility.csv"
gsm1k_df = pd.read_csv(gsm1k_data_path)
gsm1k_df

Unnamed: 0,System,Date,GSM8k,GSM1k,Training compute (FLOP),Speculative Compute,Active Parameters,Open/Closed
0,claude-2.1,2023-07-11,0.887,0.894,3.800000e+24,,,Closed
1,claude-3-haiku-20240307,2024-03-04,0.785,0.785,,,,Closed
2,claude-3-opus-20240229,2024-03-04,0.802,0.825,,4.000000e+25,,Closed
3,claude-3-sonnet-20240229,2024-03-04,0.719,0.744,,,,Closed
4,codegemma-7b,2024-04-09,0.479,0.416,3.330000e+23,,7000000000,Open
...,...,...,...,...,...,...,...,...
66,vicuna-33b-v1.3,2023-06-22,0.379,0.341,,,33000000000,Open
67,Xwin-Math-13B-V1.0,2024-03-07,0.631,0.529,,,13000000000,Open
68,Xwin-Math-7B-V1.0,2024-03-07,0.529,0.428,,,7000000000,Open
69,Yi-34B-Chat,2023-11-02,0.641,0.569,6.100000e+23,,34000000000,Open


In [9]:
epoch_gpqa_df = pd.read_csv("data/epoch_gpqa_with_model_accessibility.csv")
epoch_gpqa_df

Unnamed: 0,System,GPQA,Date,Open/Closed,Trust in benchmark results
0,claude-2.0,0.352525,2023-07-11,Closed,0
1,claude-2.1,0.360606,2023-11-21,Closed,0
2,claude-3-5-sonnet-20240620,0.562374,2024-06-20,Closed,0
3,claude-3-haiku-20240307,0.344444,2024-03-07,Closed,0
4,claude-3-opus-20240229,0.478788,2024-02-29,Closed,0
5,claude-3-sonnet-20240229,0.391414,2024-02-29,Closed,0
6,dbrx-instruct,0.30404,2024-03-27,Open,0
7,deepseek-chat,0.415152,2023-11-29,Open,0
8,deepseek-coder,0.430303,2023-11-01,Open,0
9,deepseek-llm-67b-chat,0.211111,2024-01-05,Open,0


In [10]:
# Delete "GPQA" column from df
df = df.drop(columns=['GPQA', 'Diamond subset?', 'GPQA Notes'])
df

Unnamed: 0,System,Model size (parameters),Active Parameters,Dataset size,Date,Open/Closed,Training compute (FLOP),Training compute notes,BBH,MMLU,HELM MMLU,SEAL Coding,SEAL Instruction Following,SEAL Math,LMSys Elo,LMSys Elo Notes,LMSys Elo 95% CI,BBH Notes,MMLU Notes,HELM MMLU Notes,Trust in benchmark results,Trust notes
0,Random chance,,,,,,1.000000e+20,,0.2500,0.2500,0.25,0.0,0.0,0.0,0.0,,,,,,0,
1,BLOOM-176B,1.760000e+11,176000000000,3.900000e+11,2022-11-09,Open,4.120000e+23,,0.4491,0.3913,,,,,,,,,,,0,
2,BloombergGPT,5.000000e+10,50000000000,7.080000e+11,2023-03-30,Closed,2.120000e+23,,0.4197,0.3918,,,,,,,,,,,0,
3,Camelidae-8x34B,,,,2024-01-05,Open,,,,0.7560,,,,,,,,,,,0,
4,ChatGLM-6B,6.000000e+09,6000000000,,2023-03-01,Open,,,0.1873,,,,,,880.0,,,,,,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108,XVerse-65B,,65000000000,,2023-11-05,Open,,,,,,,,,,,,,,,0,
109,XVerse-7B,,7000000000,,2023-09-26,Open,,,,,,,,,,,,,,,0,
110,Yi-34B,3.400000e+10,34000000000,3.000000e+12,2023-11-02,Open,6.120000e+23,,0.5430,0.7635,,,,,1111.0,chat,,,,,0,
111,Yi-6B,6.000000e+09,6000000000,3.000000e+12,2023-11-02,Open,1.080000e+23,,0.4280,0.6385,,,,,,,,,,,0,


In [11]:
# Concatenate dfs
df = pd.concat([gsm1k_df, epoch_gpqa_df, df], axis=0, join='outer', ignore_index=True)
df

Unnamed: 0,System,Date,GSM8k,GSM1k,Training compute (FLOP),Speculative Compute,Active Parameters,Open/Closed,GPQA,Trust in benchmark results,Model size (parameters),Dataset size,Training compute notes,BBH,MMLU,HELM MMLU,SEAL Coding,SEAL Instruction Following,SEAL Math,LMSys Elo,LMSys Elo Notes,LMSys Elo 95% CI,BBH Notes,MMLU Notes,HELM MMLU Notes,Trust notes
0,claude-2.1,2023-07-11,0.887,0.894,3.800000e+24,,,Closed,,,,,,,,,,,,,,,,,,
1,claude-3-haiku-20240307,2024-03-04,0.785,0.785,,,,Closed,,,,,,,,,,,,,,,,,,
2,claude-3-opus-20240229,2024-03-04,0.802,0.825,,4.000000e+25,,Closed,,,,,,,,,,,,,,,,,,
3,claude-3-sonnet-20240229,2024-03-04,0.719,0.744,,,,Closed,,,,,,,,,,,,,,,,,,
4,codegemma-7b,2024-04-09,0.479,0.416,3.330000e+23,,7000000000,Open,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224,XVerse-65B,2023-11-05,,,,,65000000000,Open,,0.0,,,,,,,,,,,,,,,,
225,XVerse-7B,2023-09-26,,,,,7000000000,Open,,0.0,,,,,,,,,,,,,,,,
226,Yi-34B,2023-11-02,,,6.120000e+23,,34000000000,Open,,0.0,3.400000e+10,3.000000e+12,,0.543,0.7635,,,,,1111.0,chat,,,,,
227,Yi-6B,2023-11-02,,,1.080000e+23,,6000000000,Open,,0.0,6.000000e+09,3.000000e+12,,0.428,0.6385,,,,,,,,,,,


In [12]:
df.loc[df['System'] == 'Random chance', 'GPQA'] = 0.25
df.loc[df['System'] == 'Random chance', 'GSM1k'] = 0.0

In [13]:
# Ablate Llamas
if exclude_big_llama:
    df = df[~((df['System'].str.contains('Llama', case=False)) & (df['System'].str.contains('405')))]
if exclude_all_llamas:
    df = df[~df['System'].str.contains('Llama', case=False)]

In [14]:
df['Date'] = pd.to_datetime(df['Date'], format='mixed')
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
df

Unnamed: 0,System,Date,GSM8k,GSM1k,Training compute (FLOP),Speculative Compute,Active Parameters,Open/Closed,GPQA,Trust in benchmark results,Model size (parameters),Dataset size,Training compute notes,BBH,MMLU,HELM MMLU,SEAL Coding,SEAL Instruction Following,SEAL Math,LMSys Elo,LMSys Elo Notes,LMSys Elo 95% CI,BBH Notes,MMLU Notes,HELM MMLU Notes,Trust notes
0,claude-2.1,2023-07-11,0.887,0.894,3.800000e+24,,,Closed,,,,,,,,,,,,,,,,,,
1,claude-3-haiku-20240307,2024-03-04,0.785,0.785,,,,Closed,,,,,,,,,,,,,,,,,,
2,claude-3-opus-20240229,2024-03-04,0.802,0.825,,4.000000e+25,,Closed,,,,,,,,,,,,,,,,,,
3,claude-3-sonnet-20240229,2024-03-04,0.719,0.744,,,,Closed,,,,,,,,,,,,,,,,,,
4,codegemma-7b,2024-04-09,0.479,0.416,3.330000e+23,,7000000000,Open,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224,XVerse-65B,2023-11-05,,,,,65000000000,Open,,0.0,,,,,,,,,,,,,,,,
225,XVerse-7B,2023-09-26,,,,,7000000000,Open,,0.0,,,,,,,,,,,,,,,,
226,Yi-34B,2023-11-02,,,6.120000e+23,,34000000000,Open,,0.0,3.400000e+10,3.000000e+12,,0.543,0.7635,,,,,1111.0,chat,,,,,
227,Yi-6B,2023-11-02,,,1.080000e+23,,6000000000,Open,,0.0,6.000000e+09,3.000000e+12,,0.428,0.6385,,,,,,,,,,,


In [15]:
# Filter out finetuned systems

finetuned_systems = [
 'Layer Normalization: Handwriting sequence generation',
 'ULM-FiT',
 'ADP-FAIRSEQ + NGRAMRES',
 'Cross-lingual alignment',
 'UnifiedQA',
 '$\\infty$-former (SM)',
 'FLAN 137B',
 'AlphaFold-Multimer',
 'Masked Autoencoders',
 'Contriever',
 'BERT-RBP',
 'Minerva',
 'BlenderBot 3',
 'PaLM-SayCan',
 'NMST+GPT-2',
 'Decaying Fast Weights Transformer (WT-103)',
 'GPT-2 + Progressive LRD',
 'U-PaLM',
 'Flan-T5 11B',
 'Flan-PaLM 540B',
 'Taiyi-Stable Diffusion',
 'OPT-IML (175B)',
 'SparseOPT-175B',
 'DiT-XL/2',
 'VideoMAE V2',
 'Segment Anything Model',
 'gLM',
 'MOSS-Moon-003',
 'WizardLM-7B',
 'InstructBLIP',
 'Guanaco-65B',
 'WizardCoder-15.5B',
 'Code Llama-34B',
 'Code Llama-7B',
 'TigerBot-70B',
 'MiniGPT4 (Vicuna finetune)',
 'LLaMA-7B (protein-oriented instructions finetuned)',
 'FinGPT-13B',
 'LLaVA 1.5',
 'CogVLM',
 'Volcano 13B',
 'SPHINX (Llama 2 13B)',
 'Orca 2-13B',
 'Llama Guard',
 'FunSearch',
 'Elyza',
 'Code Llama-70B',
 'Swallow'
]

df = df[~df['System'].isin(finetuned_systems)]
df = df[~df['System'].str.contains('Flan', case=False)]

## Merge SEAL Math with GSM1k

In [16]:
for i, row in df.iterrows():
  if pd.notna(row['SEAL Math']):
    df.at[i, 'GSM1k'] = row['SEAL Math']
df

Unnamed: 0,System,Date,GSM8k,GSM1k,Training compute (FLOP),Speculative Compute,Active Parameters,Open/Closed,GPQA,Trust in benchmark results,Model size (parameters),Dataset size,Training compute notes,BBH,MMLU,HELM MMLU,SEAL Coding,SEAL Instruction Following,SEAL Math,LMSys Elo,LMSys Elo Notes,LMSys Elo 95% CI,BBH Notes,MMLU Notes,HELM MMLU Notes,Trust notes
0,claude-2.1,2023-07-11,0.887,0.894,3.800000e+24,,,Closed,,,,,,,,,,,,,,,,,,
1,claude-3-haiku-20240307,2024-03-04,0.785,0.785,,,,Closed,,,,,,,,,,,,,,,,,,
2,claude-3-opus-20240229,2024-03-04,0.802,0.825,,4.000000e+25,,Closed,,,,,,,,,,,,,,,,,,
3,claude-3-sonnet-20240229,2024-03-04,0.719,0.744,,,,Closed,,,,,,,,,,,,,,,,,,
4,codegemma-7b,2024-04-09,0.479,0.416,3.330000e+23,,7000000000,Open,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224,XVerse-65B,2023-11-05,,,,,65000000000,Open,,0.0,,,,,,,,,,,,,,,,
225,XVerse-7B,2023-09-26,,,,,7000000000,Open,,0.0,,,,,,,,,,,,,,,,
226,Yi-34B,2023-11-02,,,6.120000e+23,,34000000000,Open,,0.0,3.400000e+10,3.000000e+12,,0.543,0.7635,,,,,1111.0,chat,,,,,
227,Yi-6B,2023-11-02,,,1.080000e+23,,6000000000,Open,,0.0,6.000000e+09,3.000000e+12,,0.428,0.6385,,,,,,,,,,,


## Benchmarks vs Active Parameters

In [17]:
def convert_int(x):
    if pd.isna(x):
        return np.nan
    try:
        return int(str(x).replace(',', ''))
    except ValueError:
        return np.nan

df['Active Parameters'] = df['Active Parameters'].apply(convert_int)

In [18]:
color_map = {'Open': 'blue', 'Closed': 'darkorange'}

highlighted_models = ['Llama 3.1 405B', 'GPT-4 (original)', 'Qwen2-72B', 'Mistral Large 2', 
                      'Nemotron-4-340B Base', 'PaLM-2', 'Gemma 2 9B', 'LLaMa-2 7B', 'Gemma 1 2B', 'T5-Base', 'Chinchilla 70B']

# Create the plot
fig = px.scatter(df, x='Active Parameters', y='MMLU', 
                 color='Open/Closed',
                 title='MMLU vs Active Parameters',
                 hover_data=['System', 'MMLU', 'Active Parameters'],
                 color_discrete_map=color_map
                )
fig.update_layout(xaxis_type="log")

for i, row in df.iterrows():
    if row['System'] in highlighted_models:
        if row['System'] in ['GPT-4 (original)', 'Qwen2-72B', 'Gemma 2 9B', 'T5-Base']:
            ay = -20
        elif row['System'] == 'Llama 3.1 405B':
            ay = 20
        else:
            ay = 30

        ax = 20 if row['System'] in ['Llama 3.1 405B', 'Chinchilla 70B', 'Nemotron-4-340B Base'] else -10
        
        fig.add_annotation(
            x=np.log10(row['Active Parameters']),
            y=row['MMLU'],
            text=row["System"],
            showarrow=True,
            arrowhead=2,
            arrowsize=1,
            arrowwidth=1.5,
            visible=True,
            ay=ay,
            ax=ax
        )

# Show the plot
fig.show()

In [19]:
color_map = {'Open': 'blue', 'Closed': 'darkorange'}

highlighted_models = ['Llama 3.1 405B', 'Mistral Large 2', 'Llama 3.1 70B', 'Llama 3 70B']

# Create the plot
fig = px.scatter(df, x='Active Parameters', y='GPQA', 
                 color='Open/Closed',
                 title='GPQA vs Active Parameters',
                 hover_data=['System', 'GPQA', 'Active Parameters'],
                 color_discrete_map=color_map
                )

for i, row in df.iterrows():
    if row['System'] in highlighted_models:
        fig.add_annotation(
            x=np.log10(row['Active Parameters']),
            y=row['GPQA'],
            text=row["System"],
            showarrow=True,
            arrowhead=2,
            arrowsize=1,
            arrowwidth=1,
            visible=True,
            ay=-30,
            ax=-20
        )
fig.update_layout(xaxis_type="log")

# Show the plot
fig.show()

In [20]:
color_map = {'Open': 'blue', 'Closed': 'darkorange'}

highlighted_models = ['Nemotron-4-340B Base','DeepSeek-Coder-V2','Qwen2-72B', 'Mixtral8x22B', 'Gemma 2 27B', 
                      'PaLM-2', 'text-davinci-003']

# Create the plot
fig = px.scatter(df, x='Active Parameters', y='BBH', 
                 color='Open/Closed',
                 title='BBH vs Active Parameters',
                 hover_data=['System', 'BBH', 'Active Parameters'],
                 color_discrete_map=color_map
                )
fig.update_layout(xaxis_type="log")
for i, row in df.iterrows():
    if row['System'] in highlighted_models:
        ay = -40 if row['System'] == 'Nemotron-4-340B Base' else -30
        fig.add_annotation(
            x=np.log10(row['Active Parameters']),
            y=row['BBH'],
            text=row["System"],
            showarrow=True,
            arrowhead=2,
            arrowsize=1,
            arrowwidth=1,
            visible=True,
            ay=ay,
            ax=-30
        )

# Show the plot
fig.show()

In [21]:
color_map = {'Open': 'blue', 'Closed': 'darkorange'}

highlighted_models = ['Llama 3.1 405B', 'gpt-4', 'Llama 3 70B', 'Mixtral-8x22B-instruct-v0.1'] 
fig = px.scatter(df, x='Active Parameters', y='GSM1k', 
                 color='Open/Closed',
                 title='GSM1k vs Active Parameters',
                 hover_data=['System', 'GSM1k', 'Active Parameters'],
                 color_discrete_map=color_map
                )
fig.update_layout(xaxis_type="log")
for i, row in df.iterrows():
    if row['System'] in highlighted_models:
        fig.add_annotation(
            x=np.log10(row['Active Parameters']),
            y=row['GSM1k'],
            text=row["System"],
            showarrow=True,
            arrowhead=2,
            arrowsize=1,
            arrowwidth=1,
            visible=True,
            ay=-30,
            ax=-20
        )

# Show the plot
fig.show()

# Bench-to-bench comparisons for vetting purposes

FIXME: match up the model names again so that this plot works.

In [22]:
bench1 = 'MMLU'
bench2 = 'GPQA' 
df[f'{bench1} (log-ratio)'] = -np.log(1/df[bench1] - 1)
df[f'{bench2} (log-ratio)'] = -np.log(1/df[bench2] - 1)

In [23]:
color_map = {'Open': 'blue', 'Closed': 'darkorange'}

# Create the plot
fig = px.scatter(df, x=f'{bench1} (log-ratio)', y=f'{bench2} (log-ratio)', color='Open/Closed',
                 title=f'{bench1} vs. {bench2} for Open and Closed Models',
                 labels={f'{bench1} (log-ratio)': f'{bench1} Score', f'{bench2} (log-ratio)': f'{bench2} Score', 'Date': 'Date', 'System': 'Model'},
                 hover_data=['System', 'Date', f'{bench1}', f'{bench2}'],
                 color_discrete_map=color_map)

# x limits
# fig.update_xaxes(range=[0.6, 1])
fig.update_xaxes(range=[0.5, 2.5])

# Improve the layout
fig.update_layout(
    width=800,
    height=400,
    xaxis_title=f"{bench1} (log-ratio)",
    yaxis_title=f"{bench2} (log-ratio)",
    legend_title="Model accessibility",
    font=dict(size=12),
    hovermode="closest"
)

# Show the plot
fig.show()

In [24]:
mmlu_gqpa_df = df.dropna(subset=[f'{bench1} (log-ratio)', f'{bench2} (log-ratio)'])
largest_diffs = (mmlu_gqpa_df[f'{bench1} (log-ratio)'] - mmlu_gqpa_df[f'{bench2} (log-ratio)']).sort_values(ascending=False).index
for idx in largest_diffs:
    print(df.loc[idx, 'System'], df.loc[idx, f'{bench1} (log-ratio)'] - df.loc[idx, f'{bench2} (log-ratio)'])

Random chance 0.0


In [25]:
# # Fit regression to bench1 vs. bench2
# model = fit_ols_regression(df, [f'{bench1} (log-ratio)'], f'{bench2} (log-ratio)')
# model.summary()
# # Calculate residuals
# residuals = pd.Series(model.resid, index=df.dropna(subset=[f'{bench1} (log-ratio)', f'{bench2} (log-ratio)']).index)
# # Find the n models with the largest residuals
# n = 10  # Number of models with largest residuals to find

# # Sort the residuals by absolute value in descending order
# largest_residuals = residuals.sort_values(ascending=True)

# # Get the indices of the n largest residuals
# largest_residual_indices = largest_residuals.head(n).index

# # Print the models with the largest residuals
# print(f"The {n} models with the largest negative residuals:")
# for idx in largest_residual_indices:
#     model_name = df.loc[idx, 'System']
#     residual_value = residuals[idx]
#     bench1_score = df.loc[idx, f'{bench1} (log-ratio)']
#     bench2_score = df.loc[idx, f'{bench2} (log-ratio)']
#     print(f"{model_name}: Residual = {residual_value:.4f}, {bench1} = {bench1_score:.4f}, {bench2} = {bench2_score:.4f}")


# LMSys leaderboard

In [26]:
# From 
lmsys_leaderboard_bootstrap_elo_lu = pd.read_csv('https://docs.google.com/spreadsheets/d/12zpanuQ1Vf_ZsZ6yjIUwsN7uGPBv3ChLnEOH-g9yZDA/export?format=csv')
lmsys_leaderboard_bootstrap_elo_lu = lmsys_leaderboard_bootstrap_elo_lu.iloc[:, 1:]  # remove first column which is meaningless
lmsys_leaderboard_bootstrap_elo_lu

Unnamed: 0,gpt-4o-2024-05-13,gpt-4o-mini-2024-07-18,claude-3-5-sonnet-20240620,gemini-advanced-0514,llama-3.1-405b-instruct,gemini-1.5-pro-api-0514,gemini-1.5-pro-api-0409-preview,gpt-4-turbo-2024-04-09,gpt-4-1106-preview,claude-3-opus-20240229,athene-70b-0725,gpt-4-0125-preview,llama-3.1-70b-instruct,yi-large-preview,gemini-1.5-flash-api-0514,gemma-2-27b-it,yi-large,nemotron-4-340b-instruct,bard-jan-24-gemini-pro,glm-4-0520,llama-3-70b-instruct,claude-3-sonnet-20240229,reka-core-20240501,command-r-plus,gemma-2-9b-it,qwen2-72b-instruct,gpt-4-0314,glm-4-0116,qwen-max-0428,claude-3-haiku-20240307,deepseek-coder-v2,llama-3.1-8b-instruct,reka-flash-preview-20240611,gpt-4-0613,qwen1.5-110b-chat,yi-1.5-34b-chat,mistral-large-2402,reka-flash-21b-20240226-online,llama-3-8b-instruct,claude-1,command-r,mistral-medium,reka-flash-21b-20240226,qwen1.5-72b-chat,mixtral-8x22b-instruct-v0.1,claude-2.0,gemini-pro-dev-api,zephyr-orpo-141b-A35b-v0.1,qwen1.5-32b-chat,mistral-next,phi-3-medium-4k-instruct,starling-lm-7b-beta,claude-2.1,gpt-3.5-turbo-0613,mixtral-8x7b-instruct-v0.1,claude-instant-1,yi-34b-chat,gemini-pro,qwen1.5-14b-chat,gpt-3.5-turbo-0314,wizardlm-70b,gpt-3.5-turbo-0125,dbrx-instruct-preview,phi-3-small-8k-instruct,tulu-2-dpo-70b,llama-2-70b-chat,openchat-3.5-0106,vicuna-33b,snowflake-arctic-instruct,starling-lm-7b-alpha,gemma-1.1-7b-it,nous-hermes-2-mixtral-8x7b-dpo,llama2-70b-steerlm-chat,pplx-70b-online,deepseek-llm-67b-chat,openchat-3.5,openhermes-2.5-mistral-7b,mistral-7b-instruct-v0.2,qwen1.5-7b-chat,gpt-3.5-turbo-1106,phi-3-mini-4k-instruct,llama-2-13b-chat,dolphin-2.2.1-mistral-7b,solar-10.7b-instruct-v1.0,phi-3-mini-4k-instruct-june-2024,wizardlm-13b,zephyr-7b-beta,mpt-30b-chat,pplx-7b-online,zephyr-7b-alpha,codellama-34b-instruct,vicuna-13b,codellama-70b-instruct,gemma-7b-it,llama-2-7b-chat,phi-3-mini-128k-instruct,qwen-14b-chat,falcon-180b-chat,guanaco-33b,gemma-1.1-2b-it,stripedhyena-nous-7b,olmo-7b-instruct,mistral-7b-instruct,vicuna-7b,palm-2,gemma-2b-it,qwen1.5-4b-chat,koala-13b,chatglm3-6b,gpt4all-13b-snoozy,mpt-7b-chat,chatglm2-6b,RWKV-4-Raven-14B,alpaca-13b,oasst-pythia-12b,chatglm-6b,fastchat-t5-3b,stablelm-tuned-alpha-7b,dolly-v2-12b,llama-13b
0,1286.563455,1279.608815,1274.564379,1266.819548,1263.930629,1262.800113,1256.751200,1259.245346,1251.302638,1246.760897,1246.975827,1245.403904,1231.261502,1240.579372,1227.633631,1217.333304,1214.194600,1213.808952,1207.248663,1209.357038,1207.080257,1202.432095,1201.256188,1191.611806,1190.367829,1185.927943,1186.314345,1185.426038,1181.411865,1177.081150,1183.789524,1161.297640,1165.446680,1162.079717,1162.169726,1155.032525,1157.033520,1154.975771,1152.449178,1147.384927,1150.788865,1147.545597,1148.235700,1146.857984,1145.494533,1134.056454,1130.061195,1124.566277,1125.848089,1122.868939,1125.179143,1120.464656,1118.266832,1117.700619,1114,1110.580958,1113.647393,1110.270015,1107.595299,1108.959096,1104.118028,1106.121387,1101.942314,1104.556936,1103.714793,1095.159053,1094.081357,1089.516859,1091.639150,1091.191383,1085.533770,1075.749723,1078.874043,1078.454370,1074.880985,1073.584866,1073.645051,1073.257736,1068.678238,1064.406846,1067.189825,1061.298298,1057.464642,1061.934840,1061.155330,1058.210607,1055.467933,1047.889266,1041.020906,1038.351736,1040.433205,1042.904689,1032.235059,1035.848555,1036.705633,1039.397830,1033.082207,1028.477258,1026.970100,1024.312538,1018.190363,1016.452466,1011.145134,1001.019073,1000.320046,995.256003,988.437218,958.395166,961.292969,935.567166,924.081608,930.543839,918.995447,901.694532,895.192400,883.743569,867.977489,837.073711,818.981457,808.397845
1,1288.428940,1287.494945,1274.828503,1268.867005,1263.724865,1263.094929,1258.320223,1258.824223,1252.959457,1250.215882,1255.401278,1247.008432,1243.378408,1241.854920,1231.290454,1220.257022,1217.498630,1213.345389,1208.149339,1209.489786,1209.536015,1203.785052,1203.180498,1191.145427,1189.836713,1189.163734,1189.473124,1184.030035,1186.551048,1181.552980,1180.795775,1170.386280,1168.064371,1164.614780,1162.841517,1161.812256,1158.080792,1158.082067,1155.771638,1152.626911,1150.815794,1150.946367,1152.278263,1150.088000,1149.630338,1130.605580,1130.770938,1121.721557,1126.291162,1126.190343,1123.979127,1120.687725,1123.000312,1119.758044,1114,1114.388438,1112.908520,1108.390880,1109.701183,1107.390717,1106.036885,1105.283598,1103.151582,1105.121552,1108.542304,1093.167265,1093.835343,1094.470570,1094.043259,1096.638123,1085.669618,1083.184865,1088.394615,1081.484753,1082.906614,1078.653575,1080.532743,1073.150662,1067.927409,1068.916223,1066.045766,1064.819131,1065.078744,1066.149150,1066.060671,1059.189647,1054.655136,1045.996247,1050.144587,1055.322338,1044.756744,1047.052509,1042.975007,1042.751925,1037.711333,1037.235678,1038.926405,1036.818103,1039.492831,1024.761184,1016.847924,1019.784560,1010.279884,1004.253308,1003.562116,992.770733,989.239430,960.512670,960.204147,942.278520,935.186934,920.658893,920.372902,902.614513,894.466683,882.769816,864.055190,840.495344,834.205668,808.046396
2,1287.656293,1281.667695,1275.934504,1268.043471,1261.739568,1261.318913,1258.466691,1258.400212,1254.262636,1250.576760,1246.037378,1248.277625,1238.777689,1242.003215,1228.492371,1220.253146,1209.163981,1210.517801,1207.819440,1209.243273,1208.669483,1203.928452,1199.711839,1191.235717,1188.201027,1189.075528,1186.600366,1185.472175,1186.036211,1180.461023,1180.107022,1168.886948,1168.169288,1163.786371,1164.175694,1156.756993,1159.512262,1159.252165,1152.752476,1153.238045,1149.769847,1148.737746,1149.340016,1147.793792,1146.109214,1135.361372,1132.280936,1130.917089,1127.380161,1129.383386,1122.401913,1120.083882,1120.486747,1120.927793,1114,1110.339377,1111.670031,1109.608192,1107.005540,1111.026944,1099.485290,1108.446341,1102.496194,1105.424916,1098.648240,1091.116669,1091.645717,1095.197897,1089.785447,1090.908224,1085.009069,1089.764802,1077.373511,1080.344500,1082.049753,1079.956788,1075.670486,1075.297875,1077.096929,1071.320717,1069.509829,1063.347451,1058.562476,1060.397113,1061.935243,1059.412612,1056.471247,1046.165581,1039.243801,1052.115476,1041.399497,1044.434255,1034.371684,1034.350937,1041.904191,1039.393170,1034.862011,1033.137269,1026.932354,1021.486455,1019.102282,1009.294663,1012.588262,1008.991545,1010.916887,998.212423,988.634205,968.564964,954.194597,932.693311,929.251400,923.474876,922.648498,902.220662,894.705996,882.850605,866.929706,847.122558,827.818429,800.490271
3,1284.869154,1279.653476,1269.833920,1270.960974,1269.859096,1260.875571,1257.971376,1257.228266,1251.218192,1249.384635,1244.765008,1246.027968,1238.600557,1239.962679,1228.630784,1218.799800,1212.911846,1211.234277,1209.285137,1210.573652,1207.450299,1201.099683,1199.440640,1190.454796,1188.274141,1184.662043,1187.350839,1185.008081,1185.153417,1179.772180,1184.085087,1163.342737,1165.334026,1161.692460,1160.507336,1158.821323,1158.486274,1157.530237,1152.332601,1147.531251,1147.717038,1151.031256,1144.683023,1149.042223,1145.919220,1129.836154,1134.054336,1122.036513,1127.344368,1128.076635,1123.985237,1120.106137,1116.721174,1117.081825,1114,1110.979453,1110.942297,1108.229144,1111.314813,1104.285139,1104.194076,1104.471132,1104.301859,1105.187795,1098.942758,1093.993862,1089.437287,1092.581283,1090.792032,1090.558495,1084.514059,1079.838068,1080.717154,1075.359586,1069.906938,1075.373450,1075.906527,1073.173593,1070.713625,1068.415809,1064.130233,1063.596582,1064.008896,1062.536891,1059.283383,1060.498858,1047.110291,1037.626396,1045.918317,1045.004082,1043.742484,1043.028532,1041.195840,1040.482656,1031.530894,1037.116565,1032.324942,1018.413425,1043.596433,1018.820958,1019.447519,1011.435419,1007.873946,1005.351139,1008.274944,981.027352,987.527531,962.717374,953.252610,928.720031,924.035959,922.679121,927.995020,907.170559,897.610422,886.338026,869.905058,842.907147,822.159108,802.589771
4,1285.265726,1282.100890,1268.621384,1266.144561,1262.945738,1260.335847,1257.264334,1257.701172,1248.990652,1247.745614,1241.989530,1244.775551,1231.432171,1238.246970,1227.890128,1221.264818,1214.001984,1209.574150,1207.965499,1206.981239,1206.862517,1200.554215,1199.229945,1190.118195,1189.650062,1188.981683,1188.262552,1183.628132,1183.167295,1179.228117,1176.895123,1170.103369,1167.408527,1161.406929,1159.339837,1155.862101,1156.826559,1155.167481,1151.384750,1150.252054,1150.857093,1145.536270,1150.636927,1147.848009,1149.199917,1129.311417,1132.015835,1133.264348,1126.027003,1125.655502,1118.314039,1118.583090,1118.315578,1116.620928,1114,1110.597214,1114.293543,1107.017358,1107.853728,1109.320549,1101.888021,1105.556206,1103.044421,1100.049960,1086.278982,1095.878779,1091.193824,1091.181549,1087.860187,1088.248529,1080.889541,1086.969971,1079.043105,1078.615681,1075.103461,1071.299177,1077.883659,1073.745500,1070.968598,1064.902141,1063.711598,1061.643682,1057.257394,1059.917499,1057.583164,1059.288105,1056.155742,1049.698794,1042.858053,1047.513699,1041.476461,1043.760525,1069.177812,1039.817852,1036.599989,1033.655240,1031.103429,1037.779820,1033.848920,1020.593736,1016.631851,1014.471014,1011.772623,1004.741483,1005.486372,990.251481,995.010960,964.579535,957.400553,928.635469,932.584454,915.986844,925.938795,907.229511,892.749966,885.482519,870.742351,844.870609,836.226871,792.792144
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1285.964492,1276.413219,1272.279060,1265.682928,1259.271852,1259.780934,1257.705533,1255.287783,1253.316087,1249.462305,1250.118184,1245.375842,1252.647516,1239.915529,1227.620495,1218.651696,1211.545020,1207.682220,1210.141820,1204.587423,1205.735939,1201.079385,1198.207618,1189.860113,1190.357116,1187.135136,1185.059318,1186.587361,1181.296577,1177.659620,1181.427975,1175.960466,1164.753039,1161.005148,1160.537558,1158.381515,1157.696610,1154.418470,1150.211880,1148.857123,1149.222057,1149.353984,1150.141530,1147.971940,1143.913788,1133.137546,1129.704517,1125.623155,1125.412302,1124.062232,1124.193141,1119.683704,1120.310408,1117.017450,1114,1110.330399,1113.583789,1112.798789,1108.909717,1106.307139,1102.120233,1105.447671,1102.680158,1103.339253,1101.438952,1092.058102,1093.515583,1087.079109,1087.922039,1090.029873,1085.585950,1084.922269,1087.452048,1073.109523,1068.763540,1074.278033,1074.264552,1068.543131,1066.694309,1062.199029,1064.185115,1059.596985,1051.677943,1063.111074,1059.135590,1050.939258,1055.174922,1039.611099,1042.267166,1049.230080,1038.356785,1041.763333,1040.619725,1041.200207,1036.057344,1032.375117,1032.456126,1042.605066,1039.644623,1020.208496,1015.742017,1013.278049,1011.219028,999.489159,1004.290663,991.082586,979.422024,964.184412,961.178382,928.156296,923.560224,923.189484,917.675145,905.453634,895.762404,881.942653,870.194162,848.375467,827.956177,791.777917
96,1285.769193,1280.955978,1272.596400,1265.313367,1265.510474,1259.210643,1257.968203,1255.820464,1249.073128,1246.840205,1248.864684,1244.971098,1240.878107,1237.570974,1227.730787,1218.271354,1209.082470,1207.717429,1206.828475,1206.673720,1204.760525,1200.021807,1199.005871,1189.549841,1189.415928,1184.640932,1183.458792,1181.262421,1183.821888,1176.286955,1177.474449,1161.310047,1164.512898,1160.857644,1160.924131,1156.997644,1155.368584,1154.275994,1149.693538,1144.853220,1148.288975,1146.745783,1146.389162,1142.855488,1140.776065,1134.534485,1125.743917,1122.442549,1126.576043,1117.182518,1120.823818,1121.777720,1117.685108,1113.398760,1114,1113.086874,1112.723950,1110.353686,1106.863958,1112.256960,1103.197110,1106.587069,1103.709350,1101.480461,1102.521706,1092.263812,1090.381233,1089.299067,1087.140232,1088.219806,1081.413733,1078.155965,1082.225270,1072.366092,1071.946436,1079.124883,1075.476230,1072.932327,1071.653879,1068.050034,1063.126556,1060.771969,1057.420483,1065.124282,1062.426569,1065.495814,1049.206388,1037.838931,1052.143627,1053.637448,1041.572334,1043.138570,1046.340800,1039.589984,1036.724042,1036.728804,1035.662035,1052.933881,1025.613707,1022.293246,1017.016519,1014.549168,1002.719665,1003.808947,1006.702860,985.348666,988.125134,961.875379,966.552303,934.972419,926.643587,917.478913,922.521962,902.473931,895.742491,876.732514,866.833976,841.230148,817.499211,799.746847
97,1283.361066,1279.543617,1269.244543,1263.704350,1262.416331,1258.528302,1255.166021,1256.156088,1249.962949,1247.240214,1249.646910,1243.158755,1252.269916,1239.496695,1223.607557,1217.316512,1210.017320,1207.626649,1208.772795,1206.297166,1203.939334,1198.290028,1196.505838,1187.960666,1187.209511,1186.438610,1184.738362,1182.063752,1181.444273,1176.290060,1176.667900,1166.197302,1162.802299,1159.493347,1158.606328,1157.819564,1155.261558,1154.182377,1150.078304,1148.498421,1148.776952,1147.107220,1148.301851,1145.374800,1146.689503,1129.528061,1128.921999,1129.620424,1123.344304,1121.770127,1123.869207,1120.533932,1119.927405,1116.135594,1114,1109.574889,1107.355339,1109.017348,1110.160452,1104.311751,1105.146605,1104.287628,1099.036883,1096.249176,1095.174136,1092.086461,1086.812267,1090.077548,1087.700665,1089.557833,1081.418910,1075.821448,1072.473513,1073.559360,1074.939939,1071.456162,1072.278387,1074.323399,1067.928568,1066.441003,1061.275799,1060.728813,1065.066450,1060.804187,1060.983678,1061.216136,1047.494688,1042.569118,1042.877897,1035.095668,1041.454760,1036.700349,1053.289916,1032.363476,1035.374436,1034.376858,1036.146223,1025.716106,1018.320141,1019.712035,1013.910744,1012.707687,1005.623620,999.041388,1004.294641,987.795444,986.152845,959.990412,957.475318,932.008467,923.772273,912.598287,917.762886,891.914471,893.081979,872.800000,861.065138,833.636823,818.914647,789.664182
98,1286.871546,1280.301802,1271.749040,1270.644211,1260.150249,1261.494653,1257.594055,1259.429122,1253.718693,1249.689318,1248.366362,1245.335738,1242.584594,1239.887457,1229.943305,1219.401072,1215.448001,1211.245407,1213.453738,1209.117916,1207.854656,1201.675882,1200.578545,1192.812218,1185.016867,1190.748125,1187.424654,1182.844847,1185.628490,1179.083319,1175.649355,1170.255183,1168.387729,1163.159687,1163.933328,1161.138457,1158.592143,1160.312775,1151.866588,1149.048250,1151.188652,1148.916720,1152.341166,1148.155311,1143.723350,1131.789863,1133.717662,1128.542275,1127.078467,1128.155381,1123.313295,1121.256764,1117.939091,1117.144824,1114,1108.820095,1111.296699,1115.857732,1108.900136,1106.543053,1106.767638,1105.907799,1103.032689,1104.210949,1098.503599,1093.638908,1093.719780,1090.840654,1087.814365,1088.138792,1086.679868,1090.732792,1073.750852,1076.069125,1079.024451,1082.293418,1074.699697,1067.793315,1079.367565,1067.631884,1066.478246,1062.554079,1068.291620,1066.805434,1068.409008,1060.040650,1053.958715,1046.240626,1040.727926,1045.167644,1040.070551,1044.156526,1045.028801,1034.222833,1037.625508,1034.552688,1028.933469,1035.636142,1034.455969,1021.853963,1018.816126,1021.234925,1003.843048,1005.606502,1004.327026,992.801345,992.388475,967.243308,951.229480,934.487378,920.938684,935.274025,917.555680,900.601728,889.143356,884.380531,866.582044,840.679533,821.720833,802.801129


In [27]:
# Alphabetical order
lmsys_leaderboard_bootstrap_elo_lu.mean().round().sort_index()

RWKV-4-Raven-14B               922.0
alpaca-13b                     902.0
athene-70b-0725               1246.0
bard-jan-24-gemini-pro        1208.0
chatglm-6b                     880.0
                               ...  
yi-large                      1212.0
yi-large-preview              1240.0
zephyr-7b-alpha               1042.0
zephyr-7b-beta                1053.0
zephyr-orpo-141b-A35b-v0.1    1126.0
Length: 120, dtype: float64

In [28]:
# Descending order of Elo
lmsys_leaderboard_bootstrap_elo_lu.mean().round().sort_values(ascending=False)

gpt-4o-2024-05-13             1286.0
gpt-4o-mini-2024-07-18        1280.0
claude-3-5-sonnet-20240620    1271.0
gemini-advanced-0514          1266.0
llama-3.1-405b-instruct       1262.0
                               ...  
chatglm-6b                     880.0
fastchat-t5-3b                 869.0
stablelm-tuned-alpha-7b        840.0
dolly-v2-12b                   823.0
llama-13b                      799.0
Length: 120, dtype: float64

## Trust in evaluations

In [29]:
# Models whose evaluations we have a concrete reason to distrust
list(df[df['Trust in benchmark results'] < 0]['System'])

['DBRX-Instruct',
 'DeepSeek-67B',
 'Falcon 180B',
 'Gemma 2 27B',
 'gpt-4-0125-preview',
 'gpt-4-0613',
 'Llama 3 8B',
 'Mistral-7B',
 'Mixtral8x22B',
 'Qwen2-72B',
 'Reka Core']

In [30]:
# Models whose evaluations we have a concrete reason to trust
list(df[df['Trust in benchmark results'] > 0]['System'])

['Claude 2',
 'Claude 3 Opus',
 'Claude 3 Sonnet',
 'Gemini 1.0 Pro',
 'Gemini 1.5 Pro (April 2024)',
 'GPT-3.5-turbo-16k',
 'GPT-4 (original)',
 'gpt-4-turbo-2024-04-09',
 'Llama 3 70B',
 'Mistral Large']

# Analysis

In [31]:
df['Before November 2023'] = ['Before' if pd.to_datetime(row['Date']) < pd.to_datetime('2023-11-01') else 'After' for i, row in df.iterrows()]

## Date

In [32]:
filtered_dfs = {}

for i, bench in enumerate(benchmarks_to_analyze):
  print(bench)
  filtered_df = df[~(df['System'] == 'Random chance')]
  if old_models_only:
    filtered_df = filtered_df[filtered_df['Before November 2023'] == 'Before']
  elif new_models_only:
    filtered_df = filtered_df[filtered_df['Before November 2023'] == 'After']
  if non_suspects_only:
    if bench == 'GPQA':
      # GPQA was released November 20, 2023
      old_df = filtered_df[filtered_df['Before November 2023'] == 'Before']
      new_df = filtered_df[filtered_df['Before November 2023'] == 'After']
      new_df = new_df[new_df['Trust in benchmark results'] >= 0]
      filtered_df = pd.concat([old_df, new_df])
    elif bench == 'MMLU':
      filtered_df = filtered_df[filtered_df['Trust in benchmark results'] >= 0]
  elif trusted_only:
    filtered_df = filtered_df[filtered_df['Trust in benchmark results'] > 0]

  # Find the max score over time for each category
  filtered_df = filtered_df.sort_values('Date')
  filtered_dfs[bench] = filtered_df

MMLU
GPQA
GSM1k
BBH


In [33]:
max_rows_by_bench = defaultdict(dict)
for i, bench in enumerate(benchmarks_to_analyze):
    print(bench)
    filtered_df = filtered_dfs[bench]

    # Find the max score over time for each category
    filtered_df = filtered_df.sort_values('Date')
    max_rows = defaultdict(list)
    for category in ['Open', 'Closed']:
        category_df = filtered_df[filtered_df['Open/Closed'] == category].dropna(subset=[bench])
        max_score = 0
        for i, row in category_df.iterrows():
            score = -np.log(1 - row[bench])
            if score > max_score:
                # Models released on the same date: only include the maximum of those models
                # Example: Claude 3 Sonnet vs. Opus
                max_score = score
                max_row = {'Date': row['Date'], 'System': row['System'], 'Score': score}
                if len(max_rows[category]) > 0 and max_rows[category][-1]['Date'] == row['Date']:
                    max_rows[category][-1] = max_row
                else:
                    max_rows[category].append(max_row)

        print(f"{category} models:")
        for row in max_rows[category]:
            print(row['System'], row['Date'], row['Score'])
        print()

    max_rows_by_bench[bench] = max_rows

MMLU
Open models:
T5-Small 2019-10-23 00:00:00 0.3106095770954856
GPT-NeoX 20B 2022-02-09 00:00:00 0.4094731295057033
OPT-66B 2022-05-02 00:00:00 0.44613086483417935
GLM 130B 2022-10-05 00:00:00 0.5942072327050416
LLaMa-1 65B 2023-02-24 00:00:00 1.0051219455807707
LLaMa-2 70B 2023-07-18 00:00:00 1.1679623668029027
Yi-34B 2023-11-02 00:00:00 1.4418070710501492
Llama 3 70B 2024-04-18 00:00:00 1.7147984280919264
Llama 3.1 405B 2024-07-23 00:00:00 2.0635681925235456

Closed models:
text-davinci-001 2020-05-28 00:00:00 0.5058380822549516
Gopher 280B 2021-12-08 00:00:00 0.916290731874155
code-davinci-002 2022-03-01 00:00:00 1.1457038962019603
PaLM 540B 2022-04-04 00:00:00 1.2482730632225159
GPT-4 (original) 2023-03-15 00:00:00 1.995100393246085
Claude 3 Opus 2024-03-04 00:00:00 2.0249533563957662
GPT-4o 2024-05-13 00:00:00 2.05572501506252
Claude 3.5 Sonnet 2024-06-20 00:00:00 2.1803674602697964

GPQA
Open models:
Llama-2-70b-chat-hf 2023-07-18 00:00:00 0.28835569995451654
deepseek-coder 202

In [34]:
max_rows_by_bench

defaultdict(dict,
            {'MMLU': defaultdict(list,
                         {'Open': [{'Date': Timestamp('2019-10-23 00:00:00'),
                            'System': 'T5-Small',
                            'Score': 0.3106095770954856},
                           {'Date': Timestamp('2022-02-09 00:00:00'),
                            'System': 'GPT-NeoX 20B',
                            'Score': 0.4094731295057033},
                           {'Date': Timestamp('2022-05-02 00:00:00'),
                            'System': 'OPT-66B',
                            'Score': 0.44613086483417935},
                           {'Date': Timestamp('2022-10-05 00:00:00'),
                            'System': 'GLM 130B',
                            'Score': 0.5942072327050416},
                           {'Date': Timestamp('2023-02-24 00:00:00'),
                            'System': 'LLaMa-1 65B',
                            'Score': 1.0051219455807707},
                           {'Date': Ti

### Estimate lag by area between curves

In [35]:
max_dfs_for_auc = defaultdict(dict)
for i, bench in enumerate(benchmarks_to_analyze):
    print(bench)
    max_rows = max_rows_by_bench[bench]

    if max_rows['Open'][-1]['Score'] < max_rows['Closed'][-1]['Score']:
        lead_category = 'Closed'
        lag_category = 'Open'
    else:
        lead_category = 'Open'
        lag_category = 'Closed'

    lead_rows = [row for row in max_rows[lead_category]]
    lag_rows = [row for row in max_rows[lag_category]]
    # Start scores from the random chance baseline
    lead_rows.insert(0, {'Date': lead_rows[0]['Date'], 'System': 'Random chance', 'Score': df[df['System'] == 'Random chance'][bench].values[0]})
    lag_rows.insert(0, {'Date': lag_rows[0]['Date'], 'System': 'Random chance', 'Score': df[df['System'] == 'Random chance'][bench].values[0]})
    # We don't know how long it will take the lag category to catch up from now
    # So truncate the lag category to the last score that surpassed the lead category,
    # and truncate the lead category to that score too
    lag_row_to_truncate_to = None
    for lead_row in lead_rows[::-1]:
        if lag_row_to_truncate_to is not None:
            break
        if lead_row['Score'] < lag_rows[-1]['Score']:
            for i, lag_row in enumerate(lag_rows):
                if lag_row['Score'] > lead_row['Score']:
                    lag_rows = lag_rows[:i+1]
                    lag_row_to_truncate_to = lag_row
                    break

    next_lead_row = [row for row in lead_rows if row['Score'] > lag_row_to_truncate_to['Score']][0]
    lead_rows = [row for row in lead_rows if row['Score'] <= lag_row_to_truncate_to['Score']]
    lead_rows.append({'Date': lead_rows[-1]['Date'], 'System': f'{next_lead_row["System"]} (Truncated)', 'Score': lag_row_to_truncate_to['Score']})

    max_dfs_for_auc[bench][lead_category] = pd.DataFrame(lead_rows)
    max_dfs_for_auc[bench][lag_category] = pd.DataFrame(lag_rows)


MMLU
GPQA
GSM1k
BBH


In [36]:
max_dfs_for_auc['MMLU']['Closed']

Unnamed: 0,Date,System,Score
0,2020-05-28,Random chance,0.25
1,2020-05-28,text-davinci-001,0.505838
2,2021-12-08,Gopher 280B,0.916291
3,2022-03-01,code-davinci-002,1.145704
4,2022-04-04,PaLM 540B,1.248273
5,2023-03-15,GPT-4 (original),1.9951
6,2024-03-04,Claude 3 Opus,2.024953
7,2024-05-13,GPT-4o,2.055725
8,2024-05-13,Claude 3.5 Sonnet (Truncated),2.063568


In [37]:
# Plot time on y axis and score on x axis
bench = 'GPQA'
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=max_dfs_for_auc[bench]['Closed']['Score'],
        y=max_dfs_for_auc[bench]['Closed']['Date'], 
        mode='lines+markers',
        marker=dict(color='orange'),
        text=max_dfs_for_auc[bench]['Closed']['System'],
        textposition='bottom center',
        name='Closed',
        line=dict(shape='vh')
    )
)
fig.add_trace(
    go.Scatter(
        x=max_dfs_for_auc[bench]['Open']['Score'],
        y=max_dfs_for_auc[bench]['Open']['Date'],
        mode='lines+markers',
        marker=dict(color='blue'),
        text=max_dfs_for_auc[bench]['Open']['System'],
        textposition='bottom center',
        name='Open',
        line=dict(shape='vh')
    )
)
fig.update_layout(
    width=650,
    height=600,
    title=f"{bench} curves for AUC calculation",
    xaxis_title="Accuracy",
    yaxis_title="Date",
)
fig.show()

In [38]:
max_dfs_for_auc[bench][category]

Unnamed: 0,Date,System,Score
0,2023-06-13,Random chance,0.25
1,2023-06-13,gpt-4-0613,0.397918
2,2023-07-11,claude-2.0,0.434675
3,2023-11-06,gpt-4-1106-preview,0.553825
4,2024-02-29,claude-3-opus-20240229,0.651598
5,2024-05-13,gpt-4o-2024-05-13,0.673939
6,2024-05-13,claude-3-5-sonnet-20240620 (Truncated),0.711496


In [39]:
# We estimate the mean date lag using integrals
# First, calculate the area under each date(performance) function
integrals = defaultdict(lambda: defaultdict(list))
auc_lags = {}
for bench in benchmarks_to_analyze:
    integrals[bench] = {}
    for category in ['Open', 'Closed']:
        integral = 0
        max_df = max_dfs_for_auc[bench][category]
        for i, row in max_df.iterrows():
            if i == 0:
                continue
            # Measure date in months since epoch
            date_diff = (row['Date'] - datetime(1970, 1, 1)).days/30
            score_diff = row['Score'] - max_df.iloc[i-1]['Score']
            integral += score_diff * date_diff
        integrals[bench][category] = integral
    # Now take the difference between the two integrals and divide by the score range
    # We made the score range the same for each category, so use 'Open' arbitrarily
    score_range = max_dfs_for_auc[bench]['Open'].iloc[-1]['Score'] - max_dfs_for_auc[bench]['Open'].iloc[0]['Score']
    print(f"Score range for {bench}: {score_range:.2f}")
    mean_lag = (integrals[bench]['Open'] - integrals[bench]['Closed']) / score_range
    auc_lags[bench] = mean_lag
    print(f"Lag for {bench}: {mean_lag:.0f} months")

Score range for MMLU: 1.81
Lag for MMLU: 14 months
Score range for GPQA: 0.46
Lag for GPQA: 3 months
Score range for GSM1k: 3.02
Lag for GSM1k: 5 months
Score range for BBH: 0.83
Lag for BBH: 7 months


### Estimate lag by discrete catch-up times

In [40]:
discrete_lags = defaultdict(lambda: defaultdict(list))
for i, bench in enumerate(benchmarks_to_analyze):
  print(bench)
  max_rows = max_rows_by_bench[bench]

  noise_tolerance = 0.02  # nats - 0.02 roughly means that error rates have a relative diff of <2%
  already_matched = set()
  open_winner_score = 0
  for closed_row in max_rows['Closed']:
    if closed_row['Score'] <= open_winner_score + noise_tolerance:
      # Skip if there has already been a better open model
      continue
    for open_row in max_rows['Open']:
      if open_row['Score'] > (closed_row['Score'] - noise_tolerance) and open_row['Date'] not in already_matched:
        open_winner_score = open_row['Score']
        lag_months = (open_row['Date'] - closed_row['Date']).days/365*12
        closed_acc = 100 * (1 - np.exp(-closed_row['Score']))
        open_acc = 100 * (1 - np.exp(-open_row['Score']))
        print(f"{open_row['System']} at {open_acc:.2f}% matched or exceeded " + 
              f"{closed_row['System']} at {closed_acc:.2f}% after " + 
              f"{lag_months:.1f} months")
        # already_matched.add(open_row['Date'])
        discrete_lags[bench]['Closed system'].append(closed_row['System'])
        discrete_lags[bench]['Closed performance level'].append(closed_acc)
        discrete_lags[bench]['Closed date'].append(closed_row['Date'])
        discrete_lags[bench]['Open system'].append(open_row['System'])
        discrete_lags[bench]['Open performance level'].append(open_acc)
        discrete_lags[bench]['Open date'].append(open_row['Date'])
        discrete_lags[bench]['Lag (months)'].append(lag_months)

        break

  print(f"{bench} mean lag: {np.mean(discrete_lags[bench]['Lag (months)']):.1f} months\n")

discrete_lags

MMLU
GLM 130B at 44.80% matched or exceeded text-davinci-001 at 39.70% after 28.3 months
LLaMa-1 65B at 63.40% matched or exceeded Gopher 280B at 60.00% after 14.6 months
LLaMa-2 70B at 68.90% matched or exceeded code-davinci-002 at 68.20% after 16.6 months
Yi-34B at 76.35% matched or exceeded PaLM 540B at 71.30% after 19.0 months
Llama 3.1 405B at 87.30% matched or exceeded GPT-4 (original) at 86.40% after 16.3 months
MMLU mean lag: 18.9 months

GPQA
deepseek-coder at 43.03% matched or exceeded gpt-4-0613 at 32.83% after 4.6 months
Meta-Llama-3.1-405B-Instruct at 50.91% matched or exceeded claude-3-opus-20240229 at 47.88% after 4.8 months
GPQA mean lag: 4.7 months

GSM1k
Mixtral-8x22B-instruct-v0.1 at 76.00% matched or exceeded gpt-3.5-turbo at 75.30% after 10.2 months
Llama 3 70B at 90.12% matched or exceeded claude-2.1 at 89.40% after 9.3 months
Llama 3.1 405B at 95.60% matched or exceeded gpt-4-0125-preview at 95.10% after 5.9 months
GSM1k mean lag: 8.4 months

BBH
GPT-NeoX 20B at 

defaultdict(<function __main__.<lambda>()>,
            {'MMLU': defaultdict(list,
                         {'Closed system': ['text-davinci-001',
                           'Gopher 280B',
                           'code-davinci-002',
                           'PaLM 540B',
                           'GPT-4 (original)'],
                          'Closed performance level': [39.7,
                           60.0,
                           68.2,
                           71.3,
                           86.4],
                          'Closed date': [Timestamp('2020-05-28 00:00:00'),
                           Timestamp('2021-12-08 00:00:00'),
                           Timestamp('2022-03-01 00:00:00'),
                           Timestamp('2022-04-04 00:00:00'),
                           Timestamp('2023-03-15 00:00:00')],
                          'Open system': ['GLM 130B',
                           'LLaMa-1 65B',
                           'LLaMa-2 70B',
                       

In [41]:
# Create the figure with 1x4 subplots
fig = make_subplots(
    rows=1, 
    cols=4, 
    subplot_titles=benchmarks_to_analyze,
    horizontal_spacing=0.1
)

# Loop through each benchmark to plot the discrete_lags
for idx, bench in enumerate(benchmarks_to_analyze):
    for i in range(len(discrete_lags[bench]['Closed performance level'])):
        fig.add_trace(go.Scatter(
            y=[discrete_lags[bench]['Closed performance level'][i], discrete_lags[bench]['Closed performance level'][i]],
            x=[0, discrete_lags[bench]['Lag (months)'][i]],
            mode='lines',
            line=dict(color='rgb(230, 230, 245)', width=2),
            showlegend=False,
        ), row=1, col=idx + 1)
    
    # Plot markers for Closed and Open
    fig.add_trace(go.Scatter(
        x=np.zeros_like(discrete_lags[bench]['Closed performance level']),
        y=discrete_lags[bench]['Closed performance level'],
        mode='markers',
        marker=dict(color=color_map['Closed'], size=10),
        name='Closed (new state-of-the-art)',
        text=discrete_lags[bench]['Closed system'],
        showlegend=(idx == 0),
    ), row=1, col=idx + 1)
    fig.add_trace(go.Scatter(
        x=discrete_lags[bench]['Lag (months)'],
        y=discrete_lags[bench]['Closed performance level'],
        mode='markers',
        marker=dict(color=color_map['Open'], size=10),
        name='Open (similar or better performance)',
        text=discrete_lags[bench]['Open system'],
        showlegend=(idx == 0),
    ), row=1, col=idx + 1)

    if idx == 0:
        fig.update_yaxes(
            title='Accuracy level (%)',
            row=1, col=idx + 1
        )
    else:
        fig.update_yaxes(
            showticklabels=False,
            row=1, col=idx + 1
        )
    fig.update_yaxes(
        range=[0, 100],
        showgrid=False,
        row=1, col=idx + 1
    )

fig.update_xaxes(
    range=[-5, 32],
    title='Lag (months)',
    showgrid=False,
)

fig.update_layout(
    width=600,
    height=400,
    title='Open models have lagged on benchmarks by 5 to 24 months',
    template='plotly_white',
    # showlegend=False,
    legend=dict(
        orientation='h',
        yanchor='bottom',
        y=1.2,
        xanchor='left',
        x=0,
    ),
    margin=dict(l=10, r=10, t=160, b=10),
)

if save:
    save_plot(fig, results_dir, f"benchmark_lags_{'_'.join(benchmarks_to_analyze)}_all")

# Show the plot
fig.show()

In [42]:
all_lags = []
for bench in benchmarks_to_analyze:
    all_lags.extend(discrete_lags[bench]['Lag (months)'])
all_lags = np.array(all_lags)
print(f"Mean lag: {np.mean(all_lags):.0f} months")
print(f"Median lag: {np.median(all_lags):.0f} months")
print(f"95% of lags are less than {np.percentile(all_lags, 95):.0f} months")
print(f"5% of lags are less than {np.percentile(all_lags, 5):.0f} months")


Mean lag: 13 months
Median lag: 15 months
95% of lags are less than 24 months
5% of lags are less than 5 months


In [43]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=all_lags, nbinsx=20, opacity=0.75))
fig.update_layout(
    title='Distribution of lag (months)',
    xaxis_title='Lag (months)',
    yaxis_title='Count',
    template='plotly_white',
)
fig.show()


In [44]:
# Weighted average of compute lag and benchmark lags
compute_lag = 15  # Check this number is up to date with compute_analysis.ipynb
# Compute lag is weighted as much as the benchmark lags combined
weights = np.concatenate([np.ones(len(all_lags)), [compute_lag]])
all_lags_with_compute = np.concatenate([all_lags, [compute_lag]])
weighted_avg_lag = np.sum(all_lags_with_compute * weights) / np.sum(weights)
print(f"Weighted average lag: {weighted_avg_lag:.0f} months")

Weighted average lag: 14 months


### Estimate lag by linear regression

In [45]:
fig = make_subplots(rows=2, cols=2, subplot_titles=benchmarks_to_analyze, vertical_spacing=0.2, horizontal_spacing=0.2)
for i, bench in enumerate(benchmarks_to_analyze):
  for category in ['Open', 'Closed']:
    category_df = pd.DataFrame(max_rows_by_bench[bench][category])

    fig.append_trace(
        go.Scatter(
            x=category_df['Date'],
            y=category_df['Score'],
            mode='markers',
            marker=dict(color=color_map[category]),
            text=category_df['System'],
            name=category,
            legendgroup=category,
            showlegend=True if i == 0 else False
        ),
        row=i//2 + 1, col=i%2 + 1
    )
fig.update_layout(
    width=600,
    height=400,
    margin=dict(l=10, r=10, t=70, b=10),
    font=dict(size=12),
    hovermode="closest",
)
fig.show()

In [46]:
reg_results = defaultdict(dict)
for i, bench in enumerate(benchmarks_to_analyze):
  for category in ['Open', 'Closed']:
    category_df = pd.DataFrame(max_rows_by_bench[bench][category])

    # Ignore the flatter part of the curve close to random chance
    random_chance_level = df[df['System'] == 'Random chance'][bench].values[0]
    thres = -np.log(1 - random_chance_level) + 0.2
    category_df = category_df[category_df['Score'] > thres]

    # Do regression
    category_df['year'] = datetime_to_float_year(category_df['Date'])
    results = fit_ols_regression(category_df, ['Score'], 'year')
    print(f"{bench} {category} R^2 of {results.rsquared:.2f}, equation: {results.params[0]:.2f} + {results.params[1]:.2f} * score")
    reg_results[bench][category] = results

MMLU Open R^2 of 0.98, equation: 2021.97 + 1.29 * score
MMLU Closed R^2 of 0.94, equation: 2019.56 + 2.19 * score
GPQA Open R^2 of 1.00, equation: 2021.09 + 4.88 * score
GPQA Closed R^2 of 0.78, equation: 2023.43 + 1.13 * score
GSM1k Open R^2 of 0.74, equation: 2023.49 + 0.40 * score
GSM1k Closed R^2 of 0.86, equation: 2022.74 + 0.44 * score
BBH Open R^2 of 0.77, equation: 2022.20 + 1.30 * score
BBH Closed R^2 of 0.85, equation: 2021.85 + 1.08 * score


In [47]:
# Find the average of the area between curves
for bench in benchmarks_to_analyze:
    open_slope = reg_results[bench]['Open'].params[1]
    open_intercept = reg_results[bench]['Open'].params[0]
    closed_slope = reg_results[bench]['Closed'].params[1]
    closed_intercept = reg_results[bench]['Closed'].params[0]

    random_chance_level = df[df['System'] == 'Random chance'][bench].values[0]
    min_score = -np.log(1 - random_chance_level)
    # print(f"Possible max scores: Open: {max_rows_by_bench[bench]['Open'][-1]['System']} at {max_rows_by_bench[bench]['Open'][-1]['Score']:.2f}, Closed: {max_rows_by_bench[bench]['Closed'][-1]['System']} at {max_rows_by_bench[bench]['Closed'][-1]['Score']:.2f}")
    max_score = min(max_rows_by_bench[bench]['Open'][-1]['Score'], max_rows_by_bench[bench]['Closed'][-1]['Score'])
    # print(f"Min score: {min_score:.2f}, max score: {max_score:.2f}")

    # Use the formula for the area between two lines
    avg_lag = 0.5 * (open_slope - closed_slope) * (min_score + max_score) + (open_intercept - closed_intercept)
    print(f"{bench}: {avg_lag*12:.0f} months")

MMLU: 16 months
GPQA: -6 months
GSM1k: 8 months
BBH: 7 months


## Sample the lag at fixed performance levels

In [48]:
# performance_levels = [35, 45, 55, 65, 75, 85, 95]
# performance_levels = [30, 40, 50, 60, 70, 80, 90]
performance_levels = list(range(35, 100, 5))

In [49]:
rows_that_passed = defaultdict(lambda: defaultdict(dict))
time_deltas = defaultdict(lambda: defaultdict(list))
for bench in benchmarks_to_analyze:
    for category in ['Open', 'Closed']:
        row_idx = 0
        for pl in performance_levels:
            while row_idx < len(max_rows_by_bench[bench][category]):
                row = max_rows_by_bench[bench][category][row_idx]
                if row['Score'] >= -np.log(1 - pl/100):
                    rows_that_passed[bench][category][pl] = row
                    break
                row_idx += 1
    for pl in performance_levels:
        if pl in rows_that_passed[bench]['Closed'] and pl in rows_that_passed[bench]['Open']:
            time_deltas[bench]['Closed performance level'].append(pl)
            open_row = rows_that_passed[bench]['Open'][pl]
            closed_row = rows_that_passed[bench]['Closed'][pl]
            td = (open_row['Date'] - closed_row['Date']).days/365*12
            time_deltas[bench]['Lag (months)'].append(td)
            time_deltas[bench]['Open system'].append(open_row['System'])
            time_deltas[bench]['Closed system'].append(closed_row['System'])

time_deltas

defaultdict(<function __main__.<lambda>()>,
            {'MMLU': defaultdict(list,
                         {'Closed performance level': [35,
                           40,
                           45,
                           50,
                           55,
                           60,
                           65,
                           70,
                           75,
                           80,
                           85],
                          'Lag (months)': [23.145205479452056,
                           9.895890410958906,
                           14.564383561643837,
                           14.564383561643837,
                           14.564383561643837,
                           14.564383561643837,
                           16.56986301369863,
                           18.96986301369863,
                           7.627397260273973,
                           13.150684931506849,
                           16.306849315068494],
                 

In [50]:
# Create the figure with 1x4 subplots
fig = make_subplots(
    rows=1, 
    cols=4, 
    shared_yaxes=True, 
    subplot_titles=benchmarks_to_analyze
)

# Loop through each benchmark to plot the time_deltas
for idx, bench in enumerate(benchmarks_to_analyze):
    for i in range(len(time_deltas[bench]['Closed performance level'])):
        fig.add_trace(go.Scatter(
            y=[time_deltas[bench]['Closed performance level'][i], time_deltas[bench]['Closed performance level'][i]],
            x=[0, time_deltas[bench]['Lag (months)'][i]],
            mode='lines',
            line=dict(color='rgb(230, 230, 245)', width=2),
            showlegend=False,
        ), row=1, col=idx + 1)
    
    # Plot markers for Closed and Open
    fig.add_trace(go.Scatter(
        y=time_deltas[bench]['Closed performance level'],
        x=np.zeros_like(time_deltas[bench]['Closed performance level']),
        mode='markers',
        marker=dict(color=color_map['Closed'], size=10),
        name='First closed model',
        text=time_deltas[bench]['Closed system'],
        showlegend=(idx == 0),
    ), row=1, col=idx + 1)
    fig.add_trace(go.Scatter(
        y=time_deltas[bench]['Closed performance level'],
        x=time_deltas[bench]['Lag (months)'],
        mode='markers',
        marker=dict(color=color_map['Open'], size=10),
        name='First open model',
        text=time_deltas[bench]['Open system'],
        showlegend=(idx == 0),
    ), row=1, col=idx + 1)

fig.update_yaxes(
    tickvals=performance_levels,
    title='Accuracy threshold (%)',
    showgrid=False,
)
fig.update_xaxes(
    title='Lag (months)',
    showgrid=False,
)

fig.update_layout(
    width=800,
    height=400,
    title='Open models usually take 3 to 12 months longer to pass an accuracy threshold',
    yaxis=dict(tickvals=performance_levels, title=None),
    template='plotly_white',
    # showlegend=False,
    legend=dict(
        orientation='h',
        yanchor='top',
        y=-0.25,
        xanchor='left',
        x=0,
    )
)

if save:
    save_plot(fig, results_dir, f"benchmark_time_deltas_{'_'.join(benchmarks_to_analyze)}_all")

# Show the plot
fig.show()

In [51]:
all_lags = []
for bench in benchmarks_to_analyze:
    all_lags.extend(time_deltas[bench]['Lag (months)'])
all_lags = np.array(all_lags)
print(f"Mean lag: {np.mean(all_lags):.0f} months")
print(f"Median lag: {np.median(all_lags):.0f} months")
print(f"75% of lags are less than {np.percentile(all_lags, 75):.0f} months")
print(f"25% of lags are less than {np.percentile(all_lags, 25):.0f} months")


Mean lag: 8 months
Median lag: 6 months
75% of lags are less than 14 months
25% of lags are less than 1 months


In [52]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=all_lags, nbinsx=20, opacity=0.75))
fig.update_layout(
    title='Distribution of lag (months)',
    xaxis_title='Lag (months)',
    yaxis_title='Count',
    template='plotly_white',
)
fig.show()


In [53]:
# Weighted average of compute lag and benchmark lags
compute_lag = 15  # Check this number is up to date with compute_analysis.ipynb
# Compute lag is weighted as much as the benchmark lags combined
weights = np.concatenate([np.ones(len(all_lags)), [compute_lag]])
all_lags_with_compute = np.concatenate([all_lags, [compute_lag]])
weighted_avg_lag = np.sum(all_lags_with_compute * weights) / np.sum(weights)
print(f"Weighted average lag: {weighted_avg_lag:.0f} months")

Weighted average lag: 10 months


## Plot benchmarks over time

### All together

In [54]:
fig = make_subplots(rows=2, cols=2, subplot_titles=benchmarks_to_analyze, vertical_spacing=0.2, horizontal_spacing=0.2)

# Define x limits for each subplot
x_limits = {
  'MMLU': [datetime(2019, 1, 1), datetime(2025, 1, 1)],
  'BBH': [datetime(2019, 1, 1), datetime(2025, 1, 1)],
  'GSM1k': [datetime(2019, 1, 1), datetime(2025, 1, 1)],
  'GPQA': [datetime(2023, 1, 1), datetime(2025, 1, 1)],
  'SEAL Coding': [datetime(2023, 1, 1), datetime(2025, 1, 1)],
  'SEAL Math': [datetime(2023, 1, 1), datetime(2025, 1, 1)],
  'LMSys Elo': [datetime(2023, 1, 1), datetime(2025, 1, 1)],
}

for i, bench in enumerate(benchmarks_to_analyze):
  for category in ['Open', 'Closed']:
    category_df = pd.DataFrame(max_rows_by_bench[bench][category])

    fig.append_trace(
        go.Scatter(
            x=category_df['Date'],
            y=100 * (1 - np.exp(-category_df['Score'])),
            mode='lines',
            line=dict(color=color_map[category], shape='hv'),
            text=category_df['System'],
            name=category,
            legendgroup=category,
            showlegend=True if i == 0 else False
        ),
        row=i//2 + 1, col=i%2 + 1
    )

  # Update x and y axes for this subplot
  dtick = "M12" if bench in ["GPQA"] else "M24"
  fig.update_xaxes(
    title_text="Model publication date" if i//2 + 1 == 2 else None,
    range=[x_limits[bench][0], x_limits[bench][1]],  # Set x limits
    row=i//2 + 1,
    col=i%2 + 1,
    dtick=dtick,  # Set tick marks to every 12 months
    # tickformat="%Y",  # Display only the year
    # ticklabelmode="period",  # Ensure labels are centered on the year
    tickangle=0  # Make tick labels horizontal
  )

  if i%2 + 1 == 1:
    fig.update_yaxes(title_text="Accuracy (%)", row=i//2 + 1, col=i%2 + 1)

# Title
fig.update_layout(title_text="Open models trail closed models on key benchmarks")

# Improve the layout
fig.update_layout(
  width=600,
  height=400,
  legend_title="Model accessibility",
  font=dict(size=12),
  hovermode="closest",
  template="plotly_white",
)

# Margins
fig.update_layout(
  margin=dict(l=10, r=10, t=70, b=80)
)

footnote = None
if exclude_all_llamas:
    footnote = '*All Llama models excluded'
elif exclude_big_llama:
    footnote = '*Llama 3.1 405B excluded'
    
if footnote:
    fig.add_annotation(
        showarrow=False,
        text=footnote,
        font=dict(size=10), 
        xref="paper",
        x=0,
        yref="paper",
        y=-0.2,
        xanchor="left",
        yanchor="top",
    )

# Save the plot
if save:
  save_plot(fig, results_dir, f"benchmark_dates_{'_'.join(benchmarks_to_analyze)}")

# Show the plot
fig.show()

### One at a time

In [55]:
# Define x limits for each subplot
x_limits = {
  'MMLU': [datetime(2018, 1, 1), datetime(2027, 1, 1)],
  'BBH': [datetime(2018, 1, 1), datetime(2026, 1, 1)],
  'GSM1k': [datetime(2019, 1, 1), datetime(2027, 1, 1)],
  'GPQA': [datetime(2022, 1, 1), datetime(2026, 1, 1)],
  'SEAL Coding': [datetime(2023, 1, 1), datetime(2025, 1, 1)],
  'SEAL Math': [datetime(2023, 1, 1), datetime(2025, 1, 1)],
  'LMSys Elo': [datetime(2023, 1, 1), datetime(2025, 1, 1)],
}

for bench in benchmarks_to_analyze:
    fig = go.Figure()
    for category in ['Open', 'Closed']:
        category_df = pd.DataFrame(max_rows_by_bench[bench][category])

        fig.add_trace(go.Scatter(
            x=category_df['Date'],
            y=100 * (1 - np.exp(-category_df['Score'])),
            mode='lines+markers',
            line=dict(color=color_map[category], shape='hv', width=3),
            text=category_df['System'],
            name=category,
        ))

        # Annotate models where we want to highlight the lag
        for i in range(len(discrete_lags[bench]['Closed performance level'])):
            fig.add_trace(go.Scatter(
                x=[discrete_lags[bench][f'{category} date'][i]],
                y=[discrete_lags[bench][f'{category} performance level'][i]],
                mode='text',
                text=discrete_lags[bench][f'{category} system'][i],
                textposition=('top left' if category == 'Closed' else 'bottom right'),
                showlegend=False,
            ))


    # Update x and y axes for this subplot
    dtick = "M12"
    fig.update_xaxes(
        title_text="Model publication date" if i//2 + 1 == 2 else None,
        range=[x_limits[bench][0], x_limits[bench][1]],  # Set x limits
        dtick=dtick,  # Set tick marks to every 12 months
        # tickformat="%Y",  # Display only the year
        # ticklabelmode="period",  # Ensure labels are centered on the year
        tickangle=0,  # Make tick labels horizontal
    )
    fig.update_yaxes(title_text="Accuracy (%)")

    # Title
    fig.update_layout(title_text=f"{bench} accuracy over time")

    # Improve the layout
    fig.update_layout(
    width=600,
    height=400,
    legend_title="Model accessibility",
    font=dict(size=12),
    hovermode="closest",
    template="plotly_white",
    )

    footnote = None
    if exclude_all_llamas:
        footnote = '*All Llama models excluded'
    elif exclude_big_llama:
        footnote = '*Llama 3.1 405B excluded'
        
    if footnote:
        fig.add_annotation(
            showarrow=False,
            text=footnote,
            font=dict(size=10), 
            xref="paper",
            x=0,
            yref="paper",
            y=-0.2,
            xanchor="left",
            yanchor="top",
        )

    # Save the plot
    if save:
        save_plot(fig, results_dir, f"benchmark_dates_{bench}")

    # Show the plot
    fig.show()

## Pedagogical plot for measuring lag

In [56]:
fig = go.Figure()

# Define x limits for each subplot
x_limits = {
  'MMLU': [datetime(2022, 1, 1), datetime(2026, 1, 1)],
  'BBH': [datetime(2019, 1, 1), datetime(2025, 1, 1)],
  'GSM1k': [datetime(2019, 1, 1), datetime(2025, 1, 1)],
  'GPQA': [datetime(2023, 1, 1), datetime(2025, 1, 1)],
  'SEAL Coding': [datetime(2023, 1, 1), datetime(2025, 1, 1)],
  'SEAL Math': [datetime(2023, 1, 1), datetime(2025, 1, 1)],
  'LMSys Elo': [datetime(2023, 1, 1), datetime(2025, 1, 1)],
}

bench = 'MMLU'

closed_model = 'GPT-4 (original)'
open_model = 'Llama 3.1 405B'
closed_date = pd.to_datetime(df[df['System'] == closed_model]['Date'].values[0])
closed_score = df[df['System'] == closed_model][bench].values[0]
open_date = pd.to_datetime(df[df['System'] == open_model]['Date'].values[0])
open_score = df[df['System'] == open_model][bench].values[0]

# fig.add_trace(go.Scatter(
#     x=[closed_date],
#     y=[100*closed_score],
#     mode='markers+text',
#     marker=dict(size=10, color=color_map['Closed']),
#     showlegend=False,
# ))
# fig.add_trace(go.Scatter(
#     x=[open_date],
#     y=[100*open_score],
#     mode='markers+text',
#     marker=dict(size=10, color=color_map['Open']),
#     showlegend=False,
# ))
line_color = 'rgb(150, 150, 150)'
# Horizontal line at each score level
fig.add_hline(y=100 * closed_score, line_color=line_color, line_width=1)
# fig.add_hline(y=100 * open_score, line_dash="dash", line_color="black", line_width=1)
# Vertical line at each date
fig.add_vline(x=closed_date, line_color=line_color, line_width=1)
fig.add_vline(x=open_date, line_color=line_color, line_width=1)
# Horizontal line shape with arrows near the x-axis, in between the vertical lines
fig.add_shape(
    type='line',
    x0=closed_date,
    y0=25,
    x1=open_date,
    y1=25,
    line=dict(color=line_color, width=2),
)
# Annotate line shape with lag
example_lag = (open_date - closed_date).days / 365 * 12
fig.add_annotation(
    x=closed_date + (open_date - closed_date) / 2,
    y=28,
    text=f'{example_lag:.0f} months',
    xref="x",
    yref="y",
    showarrow=False,
)

for category in ['Open', 'Closed']:
    category_df = pd.DataFrame(max_rows_by_bench[bench][category])

    fig.add_trace(go.Scatter(
        x=category_df['Date'],
        y=100 * (1 - np.exp(-category_df['Score'])),
        mode='lines',
        line=dict(color=color_map[category], shape='hv', width=3),
        text=category_df['System'],
        name=category,
    ))

fig.add_annotation(
    x=closed_date,
    y=100 * closed_score,
    text=closed_model,
    xref="x",
    yref="y",
    showarrow=False,
    align="left",
    xanchor="right",
    yanchor="bottom",
)
fig.add_annotation(
    x=open_date,
    y=100 * open_score,
    text=open_model,
    xref="x",
    yref="y",
    showarrow=False,
    align="left",
    xanchor="left",
    yanchor="bottom",
)

# Update x and y axes for this subplot
dtick = "M12"
fig.update_xaxes(
    title_text="Model publication date" if i//2 + 1 == 2 else None,
    range=[x_limits[bench][0], x_limits[bench][1]],  # Set x limits
    dtick=dtick,  # Set tick marks to every 12 months
    # tickformat="%Y",  # Display only the year
    # ticklabelmode="period",  # Ensure labels are centered on the year
    tickangle=0,  # Make tick labels horizontal
)
fig.update_yaxes(title_text="MMLU accuracy (%)")

# Title
fig.update_layout(title_text="Measuring the lag of open models on benchmarks")

# Improve the layout
fig.update_layout(
  width=600,
  height=400,
  legend_title="Model accessibility",
  font=dict(size=12),
  hovermode="closest",
  template="plotly_white",
)

footnote = None
if exclude_all_llamas:
    footnote = '*All Llama models excluded'
elif exclude_big_llama:
    footnote = '*Llama 3.1 405B excluded'
    
if footnote:
    fig.add_annotation(
        showarrow=False,
        text=footnote,
        font=dict(size=10), 
        xref="paper",
        x=0,
        yref="paper",
        y=-0.2,
        xanchor="left",
        yanchor="top",
    )

# Save the plot
if save:
  save_plot(fig, results_dir, f"lag_example_{bench}")

# Show the plot
fig.show()

## Compute

In [57]:
fig = make_subplots(rows=2, cols=2, subplot_titles=benchmarks_to_analyze, vertical_spacing=0.15)

# Define x limits for each subplot
x_limits = {
  'MMLU': [1e20, 1e26],
  'GSM1k': [1e20, 1e26],
  'GPQA': [1e23, 1e26],
  'BBH': [1e20, 1e26],
  'SEAL Math': [1e23, 1e26],
  'SEAL Coding': [1e23, 1e26],
  'LMSys Elo': [1e22, 1e26],
}

for i, bench in enumerate(benchmarks_to_analyze):
  plot_df = df[~(df['System'] == 'Random chance')]
  if old_models_only:
    plot_df = plot_df[plot_df['Before November 2023'] == 'Before']
  elif new_models_only:
    plot_df = plot_df[plot_df['Before November 2023'] == 'After']
  if non_suspects_only:
    if bench == 'GPQA':
      # GPQA was released November 20, 2023
      old_df = plot_df[plot_df['Before November 2023'] == 'Before']
      new_df = plot_df[plot_df['Before November 2023'] == 'After']
      new_df = new_df[new_df['Trust in benchmark results'] >= 0]
      plot_df = pd.concat([old_df, new_df])
    elif bench == 'MMLU':
      plot_df = plot_df[plot_df['Trust in benchmark results'] >= 0]
  elif trusted_only:
    plot_df = plot_df[plot_df['Trust in benchmark results'] > 0]

  for category in ['Open', 'Closed']:
    category_df = plot_df[plot_df['Open/Closed'] == category]

    if plot_log_error and bench_is_accuracy[bench]:
      y = -np.log(1 - category_df[bench])
    else:
      y = 100*category_df[bench]

    if show_model_age:
      fig.append_trace(
        go.Scatter(
          x=category_df['Training compute (FLOP)'],
          y=y,
          mode='markers',
          marker=dict(color=category_df['Date'].astype(int),),
          text=category_df['System'],
          # name=category,
          # legendgroup=category,
          # showlegend=True if i == 0 else False
          showlegend=False,
        ),
        row=i//2 + 1, col=i%2 + 1
      )
    else:
      fig.append_trace(
        go.Scatter(
          x=category_df['Training compute (FLOP)'],
          y=y,
          mode='markers',
          marker=dict(color=color_map[category]),
          text=category_df['System'],
          name=category,
          legendgroup=category,
          showlegend=True if i == 0 else False
        ),
        row=i//2 + 1, col=i%2 + 1
      )

    # Plot scores as horizontal lines for models that don't have compute
    # for _, row in category_df.iterrows():
    #   if pd.isna(row['Training compute (FLOP)']) and not pd.isna(row[bench]):
    #     fig.append_trace(
    #       go.Scatter(
    #         x=[0, 1e26],
    #         y=[row[bench], row[bench]],
    #         mode='lines',
    #         line=dict(color=color_map[category]),
    #         showlegend=False,
    #       ),
    #       row=i//2 + 1, col=i%2 + 1
    #     )

  # Update x and y axes for this subplot
  fig.update_xaxes(
    title_text="Training compute (FLOP)" if i//2 + 1 == 2 else None,
    type='log',
    range=[np.log10(x_limits[bench][0]), np.log10(x_limits[bench][1])],  # Set x limits
    tickmode='linear',
    dtick=2,  # This sets ticks at every two powers of 10
    row=i//2 + 1,
    col=i%2 + 1
  )

  if i%2 + 1 == 1:
    fig.update_yaxes(title_text="Accuracy (%)", row=i//2 + 1, col=i%2 + 1)

# Improve the layout
fig.update_layout(
  template='plotly_white',
  width=600,
  height=400,
  # legend_title="Model accessibility",
  font=dict(size=12),
  hovermode="closest",
)

# Margins
fig.update_layout(
  margin=dict(l=0, r=0, t=20, b=0)
)

# Save the plot
if save:
  save_plot(fig, results_dir, 'benchmark_compute')

# Show the plot
fig.show()

In [58]:
bench = 'MMLU'

In [59]:
reg_df = df[~(df['System'] == 'Random chance')]
if non_suspects_only:
  if bench == 'GPQA':
    # GPQA was released November 20, 2023
    old_df = reg_df[reg_df['Before November 2023'] == 'Before']
    new_df = reg_df[reg_df['Before November 2023'] == 'After']
    new_df = new_df[new_df['Trust in benchmark results'] >= 0]
    reg_df = pd.concat([old_df, new_df])
  elif bench == 'MMLU':
    reg_df = reg_df[reg_df['Trust in benchmark results'] >= 0]
elif trusted_only:
  reg_df = reg_df[reg_df['Trust in benchmark results'] > 0]

In [60]:
random_chance_level = df.loc[df["System"] == "Random chance", bench].values[0]
# Filter out models that are not far above random chance level
# This is a heuristic to find the changepoint
filtered_reg_df = reg_df.loc[reg_df[bench] > random_chance_level + 0.05].copy()
filtered_reg_df['log_compute'] = np.log10(filtered_reg_df['Training compute (FLOP)'])
filtered_reg_df[bench + '_log_error'] = -np.log(1 - filtered_reg_df[bench])
filtered_reg_df.dropna(subset=['log_compute', bench + '_log_error'], inplace=True)

In [61]:
filtered_reg_df.loc[:, 'year'] = datetime_to_float_year(filtered_reg_df['Date'])

In [62]:
# Single fit for all data
model = fit_ols_regression(filtered_reg_df, ['log_compute', 'year'], bench + '_log_error')
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.879
Model:,OLS,Adj. R-squared:,0.874
Method:,Least Squares,F-statistic:,159.9
Date:,"Wed, 16 Oct 2024",Prob (F-statistic):,6.540000000000001e-21
Time:,17:48:59,Log-Likelihood:,17.859
No. Observations:,47,AIC:,-29.72
Df Residuals:,44,BIC:,-24.17
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-339.4976,61.982,-5.477,0.000,-464.414,-214.582
x1,0.4427,0.031,14.367,0.000,0.381,0.505
x2,0.1631,0.031,5.304,0.000,0.101,0.225

0,1,2,3
Omnibus:,0.864,Durbin-Watson:,1.19
Prob(Omnibus):,0.649,Jarque-Bera (JB):,0.356
Skew:,0.192,Prob(JB):,0.837
Kurtosis:,3.185,Cond. No.,5030000.0


In [63]:
def no_split(df, filter_threshold=None):
  return {'All': df}


def open_closed_split(df, filter_threshold=None):
  open_df = df[df['Open/Closed'] == 'Open']
  closed_df = df[df['Open/Closed'] == 'Closed']
  return {'Open': open_df, 'Closed': closed_df}


def new_old_split(df, date):
  new_df = df[df['Date'] >= date]
  old_df = df[df['Date'] < date]
  return {'Before': old_df, 'After': new_df}


def combined_rsquared(xs, ys, models):
    y_true = np.concatenate(ys)
    y_mean = np.mean(y_true)
    y_pred = []
    for i, model in enumerate(models):
        X = sm.add_constant(xs[i])
        y_pred.append(model.predict(X))
    y_pred = np.concatenate(y_pred)
    
    sst = np.sum((y_true - y_mean)**2)
    ssr = np.sum((y_true - y_pred)**2)
    r_squared = 1 - (ssr / sst)
    return r_squared


def combined_bic(xs, ys, models):
  y = np.concatenate(ys)
  y_pred = []
  total_params = 0
  for i, model in enumerate(models):
    X = sm.add_constant(xs[i])
    y_pred.append(model.predict(X))
    total_params += len(model.params)
  y_pred = np.concatenate(y_pred)
  n = len(y)
  rss = np.sum((y - y_pred)**2)
  ll = -n/2 * (1 + np.log(2*np.pi) + np.log(rss/n))
  bic = -2 * ll + total_params * np.log(n)
  return bic


# K-Fold Cross Validation
def perform_cross_validation(df, filter_fn, features, bench, k=10, random_state=42, filter_threshold=None):
  kf = KFold(n_splits=k, shuffle=True, random_state=random_state)
  folds_mses = []
  for train_index, test_index in kf.split(df):
    train_df, test_df = df.iloc[train_index], df.iloc[test_index]
    train_dfs = filter_fn(train_df, filter_threshold)
    test_dfs = filter_fn(test_df, filter_threshold)

    # Fit the models on the training set
    submodels = {}
    for category, train_df in train_dfs.items():
      model = fit_ols_regression(train_df, features, bench + '_log_error')
      submodels[category] = model

    # Predict on the test set
    residuals = []
    for i, (category, test_df) in enumerate(test_dfs.items()):
      predicted = get_predictions(submodels[category], test_df, features)
      residuals.append(predicted - test_df[bench + '_log_error'])
    residuals = np.concatenate(residuals)
    mse = np.mean(residuals**2)
    folds_mses.append(mse)

  return np.array(folds_mses)


def regression_with_results(df, filter_fn, features, bench, filter_threshold=None):
  dfs = filter_fn(df, filter_threshold)
  submodels = {category: fit_ols_regression(df, features, bench + '_log_error') for category, df in dfs.items()}
  mses = perform_cross_validation(df, filter_fn, features, bench, filter_threshold=filter_threshold)
  bic = combined_bic(
      [dfs[category][features] for category in dfs],
      [dfs[category][bench + '_log_error'] for category in dfs],
      [submodels[category] for category in dfs],
  )
  rsquared = combined_rsquared(
      [dfs[category][features] for category in dfs],
      [dfs[category][bench + '_log_error'] for category in dfs],
      [submodels[category] for category in dfs],
  )
  return {'mses': mses, 'bic': bic, 'rsquared': rsquared, 'submodels': submodels}


def boostrapped_regression_with_results(df, filter_fn, features, bench, filter_threshold=None):
  bootstrap_results = []
  for i in tqdm(range(1000)):
    resampled_df = df.sample(frac=1, replace=True, random_state=rng)
    bootstrap_results.append(regression_with_results(resampled_df, filter_fn, features, bench, filter_threshold))
  return bootstrap_results


In [73]:
bootstrap_results = {}
bootstrap_results['no_split'] = boostrapped_regression_with_results(filtered_reg_df, no_split, ['log_compute'], bench)
bootstrap_results['no_split_year'] = boostrapped_regression_with_results(filtered_reg_df, no_split, ['log_compute', 'year'], bench)
bootstrap_results['open_closed_split'] = boostrapped_regression_with_results(filtered_reg_df, open_closed_split, ['log_compute'], bench)
bootstrap_results['open_closed_split_year'] = boostrapped_regression_with_results(filtered_reg_df, open_closed_split, ['log_compute', 'year'], bench)

100%|██████████| 1000/1000 [00:08<00:00, 120.99it/s]
100%|██████████| 1000/1000 [00:08<00:00, 118.05it/s]
100%|██████████| 1000/1000 [00:19<00:00, 51.78it/s]
100%|██████████| 1000/1000 [00:19<00:00, 51.18it/s]


In [74]:
ci = [5, 95]
for split, result in bootstrap_results.items():
    print(f"Results for {split}:")
    bootstrapped_mses = np.array([r['mses'] for r in result])
    bootstrapped_mean_mses = np.mean(bootstrapped_mses, axis=1)
    print(f"Mean k-fold MSE: {bootstrapped_mean_mses.mean():.3f} ({np.percentile(bootstrapped_mean_mses, ci)})")
    print(f"BIC: {np.mean([r['bic'] for r in result]):.4f} ({np.percentile([r['bic'] for r in result], ci)})")
    print(f"R-squared: {np.mean([r['rsquared'] for r in result]):.4f} ({np.percentile([r['rsquared'] for r in result], ci)})")
    print("--------------------")

Results for no_split:
Mean k-fold MSE: 0.047 ([0.03033778 0.06507836])
BIC: -6.9397 ([-26.06251152   9.45057739])
R-squared: 0.7980 ([0.7128817  0.87572825])
--------------------
Results for no_split_year:
Mean k-fold MSE: 0.029 ([0.01878133 0.04001288])
BIC: -28.6030 ([-47.7132012  -12.58637313])
R-squared: 0.8830 ([0.83336796 0.92562929])
--------------------
Results for open_closed_split:
Mean k-fold MSE: 0.044 ([0.02539281 0.06481748])
BIC: -8.6134 ([-32.83192436  12.05078272])
R-squared: 0.8314 ([0.74367899 0.90694023])
--------------------
Results for open_closed_split_year:
Mean k-fold MSE: 0.032 ([0.01755491 0.04697699])
BIC: -26.6244 ([-49.19208404  -7.54448752])
R-squared: 0.9035 ([0.86248096 0.94016265])
--------------------


In [99]:
ci = [5, 95]
ci_width = ci[1] - ci[0]
results_list = []
splits_names = {
    'no_split': 'Compute only',
    'no_split_year': 'Compute + date',
    'open_closed_split': 'Compute + open/closed',
    'open_closed_split_year': 'Compute + date + open/closed'
}
for split, result in bootstrap_results.items():
    bootstrapped_mses = np.array([r['mses'] for r in result])
    bootstrapped_mean_mses = np.mean(bootstrapped_mses, axis=1)
    mean_mse = round(bootstrapped_mean_mses.mean(), 3)
    mean_mse_ci_lower = round(np.percentile(bootstrapped_mean_mses, ci[0]), 3)
    mean_mse_ci_upper = round(np.percentile(bootstrapped_mean_mses, ci[1]), 3)
    bic_mean = round(np.mean([r['bic'] for r in result]), 0)
    bic_ci_lower = round(np.percentile([r['bic'] for r in result], ci[0]), 0)
    bic_ci_upper = round(np.percentile([r['bic'] for r in result], ci[1]), 0)
    rsquared_mean = round(np.mean([r['rsquared'] for r in result]), 4)
    rsquared_ci_lower = round(np.percentile([r['rsquared'] for r in result], ci[0]), 3)
    rsquared_ci_upper = round(np.percentile([r['rsquared'] for r in result], ci[1]), 3)
    
    results_list.append({
        'Regress on': splits_names[split],
        'Mean k-fold MSE': mean_mse,
        f'Mean k-fold MSE {ci_width}% CI': (mean_mse_ci_lower, mean_mse_ci_upper),
        'BIC Mean': bic_mean,
        f'BIC {ci_width}% CI': (bic_ci_lower, bic_ci_upper),
        'R-squared Mean': rsquared_mean,
        f'R-squared {ci_width}% CI': (rsquared_ci_lower, rsquared_ci_upper)
    })

results_df = pd.DataFrame(results_list)
results_df

Unnamed: 0,Regress on,Mean k-fold MSE,Mean k-fold MSE 90% CI,BIC Mean,BIC 90% CI,R-squared Mean,R-squared 90% CI
0,Compute only,0.047,"(0.03, 0.065)",-7.0,"(-26.0, 9.0)",0.798,"(0.713, 0.876)"
1,Compute + date,0.029,"(0.019, 0.04)",-29.0,"(-48.0, -13.0)",0.883,"(0.833, 0.926)"
2,Compute + open/closed,0.044,"(0.025, 0.065)",-9.0,"(-33.0, 12.0)",0.8314,"(0.744, 0.907)"
3,Compute + date + open/closed,0.032,"(0.018, 0.047)",-27.0,"(-49.0, -8.0)",0.9035,"(0.862, 0.94)"


In [88]:
split1 = 'open_closed_split'
split2 = 'no_split_year'
for metric in ['bic', 'rsquared']:
  diff = np.array([r2[metric] - r1[metric] for r1, r2 in zip(bootstrap_results[split1], bootstrap_results[split2])])
  print(f"{split2} - {split1} {metric}: {diff.mean()} (90% CI: {np.percentile(diff, ci)})")


no_split_year - open_closed_split bic: -19.98952713167976 (90% CI: [-47.32488553   9.58365507])
no_split_year - open_closed_split rsquared: 0.05163173890035034 (90% CI: [-0.0390478   0.14801254])


In [71]:
# Without bootstrapping
results = {}
results['no_split'] = regression_with_results(filtered_reg_df, no_split, ['log_compute'], bench)
results['no_split_year'] = regression_with_results(filtered_reg_df, no_split, ['log_compute', 'year'], bench)
results['open_closed_split'] = regression_with_results(filtered_reg_df, open_closed_split, ['log_compute'], bench)
results['open_closed_split_year'] = regression_with_results(filtered_reg_df, open_closed_split, ['log_compute', 'year'], bench)
for split, result in results.items():
  print(f"Results for {split}:")
  print(f"Mean k-fold MSE: {np.mean(result['mses']):.3f} ({np.std(result['mses']):.3f})")
  print(f"BIC: {result['bic']:.4f}")
  print(f"R-squared: {result['rsquared']:.4f}")
  print("--------------------")

Results for no_split:
Mean k-fold MSE: 0.048 (0.031)
BIC: -4.7856
R-squared: 0.8018
--------------------
Results for no_split_year:
Mean k-fold MSE: 0.030 (0.026)
BIC: -24.1670
R-squared: 0.8791
--------------------
Results for open_closed_split:
Mean k-fold MSE: 0.047 (0.043)
BIC: -3.1751
R-squared: 0.8259
--------------------
Results for open_closed_split_year:
Mean k-fold MSE: 0.039 (0.031)
BIC: -16.2985
R-squared: 0.8882
--------------------


In [67]:
fig = go.Figure()

bench = 'MMLU'

for category, color in [('Open', 'blue'), ('Closed', 'orange')]:
  df_category = filtered_reg_df[filtered_reg_df['Open/Closed'] == category]
  fig.add_trace(
      go.Scatter(
          x=10**df_category['log_compute'],
          y=100*df_category[bench],
          mode='markers',
          name=f'{category}',
          marker=dict(color=color, opacity=0.5),
          text=df_category['System']
      ),
  )

fig.add_annotation(
    x=24,
    y=78.5,
    text="DeepSeek V2",
    xref="x",
    yref="y",
    showarrow=True,
    arrowhead=2,
    font=dict(size=10),
    align="left",
    xanchor="right",
    ax=-10,
    ay=-10
)
fig.add_annotation(
    x=24.87,
    y=78.3,
    text="PaLM 2",
    xref="x",
    yref="y",
    showarrow=True,
    arrowhead=2,
    font=dict(size=10),
    align="left",
    xanchor="left",
    ax=10,
    ay=10,
)
fig.add_annotation(
    x=23.64,
    y=71.3,
    text="Gemma 2 9B",
    xref="x",
    yref="y",
    showarrow=True,
    arrowhead=2,
    font=dict(size=10),
    align="left",
    xanchor="right",
    ax=-10,
    ay=-10
)
fig.add_annotation(
    x=24.4,
    y=71.3,
    text="PaLM 540B",
    xref="x",
    yref="y",
    showarrow=True,
    arrowhead=2,
    font=dict(size=10),
    align="left",
    xanchor="left",
    ax=10,
    ay=10
)
fig.add_shape(
    type="line",
    x0=10**24.0,
    y0=78.5,
    x1=10**24.87,
    y1=78.5,
    line=dict(color="black", width=1)
)
fig.add_annotation(
    x=(24.87 + 24.0) / 2,
    y=78.5 + 3,
    text="7x",
    showarrow=False,
    font=dict(size=10),
    align="center",
)
fig.add_shape(
    type="line",
    x0=10**23.64,
    y0=71.3,
    x1=10**24.4,
    y1=71.3,
    line=dict(color="black", width=1)
)
fig.add_annotation(
    x=(24.4 + 23.64) / 2,
    y=71.3 + 3,
    text="6x",
    showarrow=False,
    font=dict(size=10),
    align="center",
)

# Update layout
fig.update_layout(
  width=600,
  height=300,
  title_text=f"MMLU accuracy vs. training compute",
  showlegend=True,
  legend=dict(
    title="Model accessibility"
  ),
  margin=dict(t=50, b=20, l=40, r=40),
  template='plotly_white'
)

fig.update_xaxes(
    title_text="Training compute (FLOP)" if i//2 + 1 == 2 else None,
    type='log',
    range=[22, 26],  # Set x limits
    tickmode='linear',
    dtick=2,  # This sets ticks at every two powers of 10
)
fig.update_yaxes(
    title_text="Accuracy (%)",
    range=[25, 100],
)

# Show the plot
fig.show()

# Save the plot
if save:
  save_plot(fig, results_dir, f'{bench}_compute')

In [68]:
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=filtered_reg_df['log_compute'],
        y=filtered_reg_df[bench + '_log_error'],
        mode='markers',
        name='Data',
        marker=dict(color='gray', opacity=0.5),
        text=filtered_reg_df['System']
    ),
)

all_model = results['no_split']['submodels']['All']
y_pred = get_predictions(all_model, filtered_reg_df, ['log_compute'])

fig.add_trace(
    go.Scatter(
        x=filtered_reg_df['log_compute'],
        y=y_pred,
        mode='lines',
        name=f'Fit (compute)',
        text=filtered_reg_df['System'],
        marker=dict(color='black'),
    ),
)

# for category, color in [('Open', 'blue'), ('Closed', 'orange')]:
#   category_models = results['open_closed_split_year']['submodels']
#   model_category = category_models[category]
#   category_df = filtered_reg_df[filtered_reg_df['Open/Closed'] == category]
#   y_pred = get_predictions(model_category, category_df, ['log_compute', 'year'])
  
#   fig.add_trace(
#       go.Scatter(
#           x=category_df['log_compute'],
#           y=y_pred,
#           mode='markers',
#           name=f'Fit (compute, age, {category})',
#           text=category_df['System'],
#           marker=dict(color=color, opacity=0.5, symbol='square'),
#       ),
#   )

all_model = results['no_split_year']['submodels']['All']
y_pred = get_predictions(all_model, filtered_reg_df, ['log_compute', 'year'])

fig.add_trace(
  go.Scatter(
    x=filtered_reg_df['log_compute'],
    y=y_pred,
    mode='markers',
    name='Fit (compute, age)',
    text=filtered_reg_df['System'],
    marker=dict(color='green', opacity=0.5),
  ),
)

# Update layout
fig.update_layout(
  width=600,
  height=250,
  title_text=f"{bench} score vs log compute",
  showlegend=True,
  # legend=dict(
  #   orientation="h",
  #   yanchor="bottom",
  #   y=-0.4,
  #   xanchor="left",
  #   x=0,
  # ),
  margin=dict(t=50, b=20, l=40, r=40),
  template='plotly_white'
)

fig.update_xaxes(title_text="Log Compute", range=[22, 26])
fig.update_yaxes(title_text="Negative log of error rate")

# Show the plot
fig.show()

# Save the plot
if save:
  save_plot(fig, results_dir, f'{bench}_regression_models_comparison')

In [69]:
fig = go.Figure()

category = 'Open'
color = 'blue'
category_models = results['open_closed_split_year']['submodels']
model_category = category_models[category]
category_df = filtered_reg_df[filtered_reg_df['Open/Closed'] == category]
y_pred = get_predictions(model_category, category_df, ['log_compute', 'year'])

fig.add_trace(
    go.Scatter(
        x=category_df['log_compute'],
        y=y_pred - category_df[bench + '_log_error'],
        mode='markers',
        name=f'Fit (compute, age, {category})',
        marker=dict(color=color, opacity=0.5),
    ),
)

all_model = results['no_split_year']['submodels']['All']
y_pred = get_predictions(all_model, category_df, ['log_compute', 'year'])

fig.add_trace(
  go.Scatter(
    x=category_df['log_compute'],
    y=y_pred - category_df[bench + '_log_error'],
    mode='markers',
    name='Fit (compute, age)',
    marker=dict(color='green', opacity=0.5),
  ),
)

# Update layout
fig.update_layout(
  width=600,
  height=300,
  title_text=f"{bench} score vs log compute",
  showlegend=True,
  # legend=dict(
  #   orientation="h",
  #   yanchor="bottom",
  #   y=-0.4,
  #   xanchor="left",
  #   x=0,
  # ),
  margin=dict(t=50, b=20, l=40, r=40)
)

fig.update_xaxes(title_text="Log Compute", range=[22, 26])
fig.update_yaxes(title_text="Residuals")

# Show the plot
fig.show()

# Save the plot
if save:
  save_plot(fig, results_dir, f'{bench}_regression_residuals_{category}')

In [70]:
# Calculate the proportion of New models that are Open models
new_df = filtered_reg_df[filtered_reg_df['Date'] >= best_date]
old_df = filtered_reg_df[filtered_reg_df['Date'] < best_date]
open_df = new_df[new_df['Open/Closed'] == 'Open']
proportion_new = len(open_df) / len(new_df)
print(f"Proportion of New models that are Open models: {len(open_df)} out of {len(new_df)} ({proportion_new*100:.0f}%)")

NameError: name 'best_date' is not defined

In [68]:
# Calculate the average date of new models
# Need to convert the date to a number of days since the epoch
new_avg_age = (new_df['Date'] - new_df['Date'].min()).dt.days.mean()
old_avg_age = (old_df['Date'] - old_df['Date'].min()).dt.days.mean()
# Convert the average age back to date
new_avg_age_date = new_df['Date'].min() + pd.Timedelta(days=new_avg_age)
old_avg_age_date = old_df['Date'].min() + pd.Timedelta(days=old_avg_age)
print(f"Average age of new models: {new_avg_age_date}")
print(f"Average age of old models: {old_avg_age_date}")
print(f"Difference in average age: {new_avg_age_date - old_avg_age_date}")

Average age of new models: 2024-03-09 17:20:00
Average age of old models: 2022-12-02 22:20:41.379310336
Difference in average age: 462 days 18:59:18.620689664
