In [64]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [65]:
from collections import defaultdict
from datetime import datetime
import kaleido  # needed for saving plots
import numpy as np
import os
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import KFold
from tqdm import tqdm

from regression import *
from plotting import *

# Parameters

In [66]:
# Cost axis used by the regressions, contour plots and bucket plots below.
cost_var = 'log_params'  # log_compute or log_params
# Benchmarks drawn in the 2x2 overview scatter plots (one subplot per entry).
benchmarks_to_plot = ['MMLU', 'GPQA', 'GSM1k', 'BBH']
# Benchmark whose scores are regressed against release date and cost.
benchmark_to_analyze = 'MMLU'
# NOTE: the filtering cells test these two flags as `if non_suspects_only: ... elif trusted_only: ...`,
# so trusted_only only takes effect when non_suspects_only is False. The non-suspect filter itself
# is only applied to MMLU and GPQA (for GPQA, only to models dated on/after 2023-11-20).
non_suspects_only = True  # Whether to only include not-suspicious benchmark scores in the analysis
trusted_only = False  # Whether to only include actively trusted benchmark scores in the analysis (more strict)
save = True  # Whether to save plots and results to disk
colorscale = 'Viridis'  # Plotly colorscale shared by the contour plots and their data markers

# Per-benchmark flag; presumably marks whether the score is an accuracy in [0, 1] rather than a
# rating/score — it is not read by any cell visible in this part of the notebook (TODO confirm use).
bench_is_accuracy = {'MMLU': True, 'BBH': True, 'GSM1k': True, 'GPQA': True, 'LMSys Elo': False, 'SEAL Coding': False, 'SEAL Math': False}

In [67]:
# Directory passed to save_plot() for every figure below; created up front if it does not exist.
results_dir = 'results/24Oct/'
os.makedirs(results_dir, exist_ok=True)

In [68]:
# Seeded NumPy generator so that any random sampling done with `rng` is repeatable across runs.
rng = np.random.default_rng(seed=42)

# Prepare data

In [69]:
# Main benchmark table, read from a local CSV.
# The commented-out URL is presumably the spreadsheet export this CSV was taken from — TODO confirm.
# data_path = "https://docs.google.com/spreadsheets/d/1etu9rXcME0uUA-S2ANA8bsfQbIZgNu-8NxqFGQdDIzQ/export?format=csv&gid=1305280917#gid=1305280917"
data_path = "data/benchmarks_with_model_accessibility.csv"
df = pd.read_csv(data_path)

# Lift pandas' column-count and width limits so wide frames are displayed without truncating columns.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

df.head(30)

Unnamed: 0,System,Model size (parameters),Active Parameters,Dataset size,Date,Open/Closed,Training compute (FLOP),Training compute notes,BBH,GPQA,MMLU,HELM MMLU,SEAL Coding,SEAL Instruction Following,SEAL Math,LMSys Elo,LMSys Elo Notes,LMSys Elo 95% CI,BBH Notes,GPQA Notes,MMLU Notes,HELM MMLU Notes,Trust in benchmark results,Trust notes
0,BLOOM-176B,176000000000.0,176000000000.0,390000000000.0,2022-11-09,Open,4.12e+23,,0.4491,,0.3913,,,,,,,,,,,,0,
1,BloombergGPT,50000000000.0,50000000000.0,708000000000.0,2023-03-30,Closed,2.12e+23,,0.4197,,0.3918,,,,,,,,,,,,0,
2,Camelidae-8x34B,,,,2024-01-05,Open,,,,,0.756,,,,,,,,,,,,0,
3,ChatGLM-6B,6000000000.0,6000000000.0,,2023-03-01,Open,,,0.1873,,,,,,,880.0,,,,,,,0,
4,ChatGLM2-12B-base,12000000000.0,12000000000.0,,2023-06-25,Open,,,0.3602,,,,,,,,,,,,,,0,
5,ChatGLM2-6B-base,6000000000.0,6000000000.0,,2023-06-25,Open,,,0.3368,,,,,,,924.0,,,,,,,0,
6,ChatGLM3-6B,6000000000.0,6000000000.0,,2023-10-27,Open,5.04e+22,,0.661,,,,,,,955.0,,,,,,,0,
7,Chinchilla 70B,,70000000000.0,,2022-03-29,Closed,5.76e+23,,,,0.675,,,,,,,,,,,,0,
8,Claude 2,,,,2023-07-11,Closed,,,,0.353,0.785,,,,,1132.0,,,,Epoch evaluation,"Actually CoT, so probably an overestimate. HEL...",,0,
9,Claude 2.1,,,,2023-11-21,Closed,,,,0.361,,,,,,,,,,Epoch evaluation,,,0,Doesn't perform worse on GSM1k relative to GSM8k


In [70]:
# GSM8k/GSM1k score table, read from a local CSV.
# The commented-out URL is presumably the spreadsheet export this CSV was taken from — TODO confirm.
# gsm1k_data_path = "https://docs.google.com/spreadsheets/d/1KYp4h3urj-698IE9bR7n1ctuH1iyCAQ5pTZIqQ_qs9g/export?format=csv"
gsm1k_data_path = "data/gsm1k_with_model_accessibility.csv"
gsm1k_df = pd.read_csv(gsm1k_data_path)
gsm1k_df

Unnamed: 0,System,Date,GSM8k,GSM1k,Training compute (FLOP),Speculative Compute,Active Parameters,Open/Closed
0,claude-2.1,2023-07-11,0.887,0.894,3.800000e+24,,,Closed
1,claude-3-haiku-20240307,2024-03-04,0.785,0.785,,,,Closed
2,claude-3-opus-20240229,2024-03-04,0.802,0.825,,4.000000e+25,,Closed
3,claude-3-sonnet-20240229,2024-03-04,0.719,0.744,,,,Closed
4,codegemma-7b,2024-04-09,0.479,0.416,3.330000e+23,,7000000000,Open
...,...,...,...,...,...,...,...,...
66,vicuna-33b-v1.3,2023-06-22,0.379,0.341,,,33000000000,Open
67,Xwin-Math-13B-V1.0,2024-03-07,0.631,0.529,,,13000000000,Open
68,Xwin-Math-7B-V1.0,2024-03-07,0.529,0.428,,,7000000000,Open
69,Yi-34B-Chat,2023-11-02,0.641,0.569,6.100000e+23,,34000000000,Open


In [71]:
# Concatenate dfs
# Rows of the GSM1k table are stacked on top of the main table; the outer join keeps the union of
# columns (NaN where a table lacks one) and the index is renumbered from 0.
# NOTE: this stacks rows, it does not merge them by 'System' — a model present in both tables
# ends up as two separate rows.
df = pd.concat([gsm1k_df, df], axis=0, join='outer', ignore_index=True)
df

Unnamed: 0,System,Date,GSM8k,GSM1k,Training compute (FLOP),Speculative Compute,Active Parameters,Open/Closed,Model size (parameters),Dataset size,Training compute notes,BBH,GPQA,MMLU,HELM MMLU,SEAL Coding,SEAL Instruction Following,SEAL Math,LMSys Elo,LMSys Elo Notes,LMSys Elo 95% CI,BBH Notes,GPQA Notes,MMLU Notes,HELM MMLU Notes,Trust in benchmark results,Trust notes
0,claude-2.1,2023-07-11,0.887,0.894,3.800000e+24,,,Closed,,,,,,,,,,,,,,,,,,,
1,claude-3-haiku-20240307,2024-03-04,0.785,0.785,,,,Closed,,,,,,,,,,,,,,,,,,,
2,claude-3-opus-20240229,2024-03-04,0.802,0.825,,4.000000e+25,,Closed,,,,,,,,,,,,,,,,,,,
3,claude-3-sonnet-20240229,2024-03-04,0.719,0.744,,,,Closed,,,,,,,,,,,,,,,,,,,
4,codegemma-7b,2024-04-09,0.479,0.416,3.330000e+23,,7000000000,Open,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,XVerse-7B,2023-09-26,,,,,7000000000,Open,,,,,,,,,,,,,,,,,,0.0,
199,Yi-1.5-34B,2024-05-10,,,,,,Open,,,,,0.060,,,,,,,,,,Epoch evaluation,,,0.0,
200,Yi-34B,2023-11-02,,,6.120000e+23,,34000000000,Open,3.400000e+10,3.000000e+12,,0.543,0.165,0.7635,,,,,1111.0,chat,,,Epoch evaluation,,,-1.0,MMLU-GPQA performance difference is relatively...
201,Yi-6B,2023-11-02,,,1.080000e+23,,6000000000,Open,6.000000e+09,3.000000e+12,,0.428,,0.6385,,,,,,,,,,,,0.0,


In [72]:
# Give the 'Random chance' baseline row a GSM1k score of 0.0 (no effect if no such row exists).
df.loc[df['System'] == 'Random chance', 'GSM1k'] = 0.0

In [73]:
# Parse release dates into datetimes; format='mixed' infers the format per element, so the two
# source tables may use different date notations.
df['Date'] = pd.to_datetime(df['Date'], format='mixed')
# Same display options as set when the main table was loaded (kept so this cell also works on its own).
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
df

Unnamed: 0,System,Date,GSM8k,GSM1k,Training compute (FLOP),Speculative Compute,Active Parameters,Open/Closed,Model size (parameters),Dataset size,Training compute notes,BBH,GPQA,MMLU,HELM MMLU,SEAL Coding,SEAL Instruction Following,SEAL Math,LMSys Elo,LMSys Elo Notes,LMSys Elo 95% CI,BBH Notes,GPQA Notes,MMLU Notes,HELM MMLU Notes,Trust in benchmark results,Trust notes
0,claude-2.1,2023-07-11,0.887,0.894,3.800000e+24,,,Closed,,,,,,,,,,,,,,,,,,,
1,claude-3-haiku-20240307,2024-03-04,0.785,0.785,,,,Closed,,,,,,,,,,,,,,,,,,,
2,claude-3-opus-20240229,2024-03-04,0.802,0.825,,4.000000e+25,,Closed,,,,,,,,,,,,,,,,,,,
3,claude-3-sonnet-20240229,2024-03-04,0.719,0.744,,,,Closed,,,,,,,,,,,,,,,,,,,
4,codegemma-7b,2024-04-09,0.479,0.416,3.330000e+23,,7000000000,Open,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,XVerse-7B,2023-09-26,,,,,7000000000,Open,,,,,,,,,,,,,,,,,,0.0,
199,Yi-1.5-34B,2024-05-10,,,,,,Open,,,,,0.060,,,,,,,,,,Epoch evaluation,,,0.0,
200,Yi-34B,2023-11-02,,,6.120000e+23,,34000000000,Open,3.400000e+10,3.000000e+12,,0.543,0.165,0.7635,,,,,1111.0,chat,,,Epoch evaluation,,,-1.0,MMLU-GPQA performance difference is relatively...
201,Yi-6B,2023-11-02,,,1.080000e+23,,6000000000,Open,6.000000e+09,3.000000e+12,,0.428,,0.6385,,,,,,,,,,,,0.0,


In [74]:
# Filter out finetuned systems
# Two rules are applied: an explicit deny-list of system names, and a case-insensitive
# substring match on 'Flan' to catch Flan variants that are not listed by name.

finetuned_systems = [
 'Layer Normalization: Handwriting sequence generation',
 'ULM-FiT',
 'ADP-FAIRSEQ + NGRAMRES',
 'Cross-lingual alignment',
 'UnifiedQA',
 '$\\infty$-former (SM)',
 'FLAN 137B',
 'AlphaFold-Multimer',
 'Masked Autoencoders',
 'Contriever',
 'BERT-RBP',
 'Minerva',
 'BlenderBot 3',
 'PaLM-SayCan',
 'NMST+GPT-2',
 'Decaying Fast Weights Transformer (WT-103)',
 'GPT-2 + Progressive LRD',
 'U-PaLM',
 'Flan-T5 11B',
 'Flan-PaLM 540B',
 'Taiyi-Stable Diffusion',
 'OPT-IML (175B)',
 'SparseOPT-175B',
 'DiT-XL/2',
 'VideoMAE V2',
 'Segment Anything Model',
 'gLM',
 'MOSS-Moon-003',
 'WizardLM-7B',
 'InstructBLIP',
 'Guanaco-65B',
 'WizardCoder-15.5B',
 'Code Llama-34B',
 'Code Llama-7B',
 'TigerBot-70B',
 'MiniGPT4 (Vicuna finetune)',
 'LLaMA-7B (protein-oriented instructions finetuned)',
 'FinGPT-13B',
 'LLaVA 1.5',
 'CogVLM',
 'Volcano 13B',
 'SPHINX (Llama 2 13B)',
 'Orca 2-13B',
 'Llama Guard',
 'FunSearch',
 'Elyza',
 'Code Llama-70B',
 'Swallow'
]

df = df[~df['System'].isin(finetuned_systems)]
# na=False: a row with a missing System name counts as "not Flan" and is kept. Without it the
# mask would contain NaN and negating it with `~` fails.
# .copy() makes df an independent frame so later in-place edits do not write into a slice.
df = df[~df['System'].str.contains('Flan', case=False, na=False)].copy()

## Merge SEAL Math with GSM1k

In [75]:
# Where a SEAL Math score is available it overwrites the GSM1k score; rows without a
# SEAL Math score keep their existing GSM1k value. Done as one masked assignment instead
# of a Python-level loop over rows.
has_seal_math = df['SEAL Math'].notna()
df.loc[has_seal_math, 'GSM1k'] = df.loc[has_seal_math, 'SEAL Math']
df

Unnamed: 0,System,Date,GSM8k,GSM1k,Training compute (FLOP),Speculative Compute,Active Parameters,Open/Closed,Model size (parameters),Dataset size,Training compute notes,BBH,GPQA,MMLU,HELM MMLU,SEAL Coding,SEAL Instruction Following,SEAL Math,LMSys Elo,LMSys Elo Notes,LMSys Elo 95% CI,BBH Notes,GPQA Notes,MMLU Notes,HELM MMLU Notes,Trust in benchmark results,Trust notes
0,claude-2.1,2023-07-11,0.887,0.894,3.800000e+24,,,Closed,,,,,,,,,,,,,,,,,,,
1,claude-3-haiku-20240307,2024-03-04,0.785,0.785,,,,Closed,,,,,,,,,,,,,,,,,,,
2,claude-3-opus-20240229,2024-03-04,0.802,0.825,,4.000000e+25,,Closed,,,,,,,,,,,,,,,,,,,
3,claude-3-sonnet-20240229,2024-03-04,0.719,0.744,,,,Closed,,,,,,,,,,,,,,,,,,,
4,codegemma-7b,2024-04-09,0.479,0.416,3.330000e+23,,7000000000,Open,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,XVerse-7B,2023-09-26,,,,,7000000000,Open,,,,,,,,,,,,,,,,,,0.0,
199,Yi-1.5-34B,2024-05-10,,,,,,Open,,,,,0.060,,,,,,,,,,Epoch evaluation,,,0.0,
200,Yi-34B,2023-11-02,,,6.120000e+23,,34000000000,Open,3.400000e+10,3.000000e+12,,0.543,0.165,0.7635,,,,,1111.0,chat,,,Epoch evaluation,,,-1.0,MMLU-GPQA performance difference is relatively...
201,Yi-6B,2023-11-02,,,1.080000e+23,,6000000000,Open,6.000000e+09,3.000000e+12,,0.428,,0.6385,,,,,,,,,,,,0.0,


In [76]:
def convert_int(x):
    """Convert a parameter-count cell to a Python int, or NaN if that is not possible.

    Accepts ints, integer-valued floats (e.g. ``7000000000.0``) and strings that may
    contain thousands separators (``'7,000,000,000'``) or a trailing ``.0``.

    Args:
        x: Raw cell value (number, string, or a missing value).

    Returns:
        The value as an ``int``; ``np.nan`` for missing values, unparseable text,
        and numbers that are not whole (e.g. ``'7.5'``, ``inf``).
    """
    if pd.isna(x):
        return np.nan
    text = str(x).replace(',', '')
    try:
        return int(text)
    except ValueError:
        pass
    # int() rejects text such as '7000000000.0', which is how float cells are rendered by str().
    # Fall back to float parsing so those values are not silently turned into NaN.
    try:
        as_float = float(text)
    except ValueError:
        return np.nan
    # is_integer() is False for inf/nan and for fractional values, which stay NaN.
    return int(as_float) if as_float.is_integer() else np.nan

# Normalise every 'Active Parameters' cell through convert_int (element-wise).
df['Active Parameters'] = df['Active Parameters'].map(convert_int)

## Data visualization

In [77]:
# Overview figure: one scatter subplot per benchmark, score (as %) against training compute.
fig = make_subplots(rows=2, cols=2, subplot_titles=benchmarks_to_plot, vertical_spacing=0.15)

for i, bench in enumerate(benchmarks_to_plot):
    # Position of this benchmark in the 2x2 grid (1-based, row-major).
    subplot_row, subplot_col = i // 2 + 1, i % 2 + 1

    plot_df = df[df['System'] != 'Random chance']
    # NOTE: trusted_only is only consulted when non_suspects_only is False.
    if non_suspects_only:
        if bench == 'GPQA':
            # GPQA was released November 20, 2023
            # Models dated before the release are kept regardless of trust; later ones need
            # trust >= 0. Rows with a missing Date fall into neither half and are dropped.
            old_df = plot_df[plot_df['Date'] < pd.to_datetime('2023-11-20')]
            new_df = plot_df[plot_df['Date'] >= pd.to_datetime('2023-11-20')]
            new_df = new_df[new_df['Trust in benchmark results'] >= 0]
            plot_df = pd.concat([old_df, new_df])
        elif bench == 'MMLU':
            plot_df = plot_df[plot_df['Trust in benchmark results'] >= 0]
    elif trusted_only:
        plot_df = plot_df[plot_df['Trust in benchmark results'] > 0]

    # add_trace(row=, col=) replaces the deprecated Figure.append_trace.
    fig.add_trace(
        go.Scatter(
            x=plot_df['Training compute (FLOP)'],
            y=100 * plot_df[bench],
            mode='markers',
            text=plot_df['System'],
            name=bench,
            showlegend=(i == 0)
        ),
        row=subplot_row, col=subplot_col
    )

    # Update x and y axes for this subplot; only the bottom row carries the x-axis title
    fig.update_xaxes(
        title_text="Training compute (FLOP)" if subplot_row == 2 else None,
        type='log',
        tickmode='linear',
        dtick=2,  # This sets ticks at every two powers of 10
        row=subplot_row,
        col=subplot_col
    )

    # Only the left column carries the y-axis title
    if subplot_col == 1:
        fig.update_yaxes(title_text="Accuracy (%)", row=subplot_row, col=subplot_col)

# Improve the layout and tighten the margins
fig.update_layout(
    template='plotly_white',
    width=600,
    height=400,
    # legend_title="Model accessibility",
    font=dict(size=12),
    hovermode="closest",
    margin=dict(l=0, r=0, t=20, b=0)
)

# Save the plot
if save:
    save_plot(fig, results_dir, 'benchmark_training_compute')

# Show the plot
fig.show()

In [78]:
# Overview figure: one scatter subplot per benchmark, score (as %) against active parameter count.
fig = make_subplots(rows=2, cols=2, subplot_titles=benchmarks_to_plot, vertical_spacing=0.15)

for i, bench in enumerate(benchmarks_to_plot):
    # Position of this benchmark in the 2x2 grid (1-based, row-major).
    subplot_row, subplot_col = i // 2 + 1, i % 2 + 1

    plot_df = df[df['System'] != 'Random chance']
    # NOTE: trusted_only is only consulted when non_suspects_only is False.
    if non_suspects_only:
        if bench == 'GPQA':
            # GPQA was released November 20, 2023
            # Models dated before the release are kept regardless of trust; later ones need
            # trust >= 0. Rows with a missing Date fall into neither half and are dropped.
            old_df = plot_df[plot_df['Date'] < pd.to_datetime('2023-11-20')]
            new_df = plot_df[plot_df['Date'] >= pd.to_datetime('2023-11-20')]
            new_df = new_df[new_df['Trust in benchmark results'] >= 0]
            plot_df = pd.concat([old_df, new_df])
        elif bench == 'MMLU':
            plot_df = plot_df[plot_df['Trust in benchmark results'] >= 0]
    elif trusted_only:
        plot_df = plot_df[plot_df['Trust in benchmark results'] > 0]

    # add_trace(row=, col=) replaces the deprecated Figure.append_trace.
    fig.add_trace(
        go.Scatter(
            x=plot_df['Active Parameters'],
            y=100 * plot_df[bench],
            mode='markers',
            text=plot_df['System'],
            name=bench,
            showlegend=(i == 0)
        ),
        row=subplot_row, col=subplot_col
    )

    # Update x and y axes for this subplot; only the bottom row carries the x-axis title
    fig.update_xaxes(
        title_text="Active Parameters" if subplot_row == 2 else None,
        type='log',
        tickmode='linear',
        dtick=2,  # This sets ticks at every two powers of 10
        row=subplot_row,
        col=subplot_col
    )

    # Only the left column carries the y-axis title
    if subplot_col == 1:
        fig.update_yaxes(title_text="Accuracy (%)", row=subplot_row, col=subplot_col)

# Improve the layout and tighten the margins
fig.update_layout(
    template='plotly_white',
    width=600,
    height=400,
    # legend_title="Model accessibility",
    font=dict(size=12),
    hovermode="closest",
    margin=dict(l=0, r=0, t=20, b=0)
)

# Save the plot
if save:
    save_plot(fig, results_dir, 'benchmark_active_params')

# Show the plot
fig.show()

# Prepare data for regression

In [79]:
# Rows eligible for the regression: everything except the 'Random chance' baseline,
# optionally narrowed by the trust flags for the benchmark under analysis.
reg_df = df[df['System'] != 'Random chance']

if non_suspects_only and benchmark_to_analyze == 'GPQA':
    # GPQA was released November 20, 2023
    # Earlier models are kept as-is; models from the release date on need trust >= 0.
    old_df = reg_df[reg_df['Date'] < pd.to_datetime('2023-11-20')]
    new_df = reg_df[reg_df['Date'] >= pd.to_datetime('2023-11-20')]
    new_df = new_df[new_df['Trust in benchmark results'] >= 0]
    reg_df = pd.concat([old_df, new_df])
elif non_suspects_only and benchmark_to_analyze == 'MMLU':
    reg_df = reg_df[reg_df['Trust in benchmark results'] >= 0]
elif not non_suspects_only and trusted_only:
    # Stricter filter; only reached when non_suspects_only is off.
    reg_df = reg_df[reg_df['Trust in benchmark results'] > 0]

In [80]:
# Score of the 'Random chance' baseline row for the analysed benchmark.
random_chance_level = df.loc[df["System"] == "Random chance", benchmark_to_analyze].values[0]

# Filter out models that are not far above random chance level
# This is a heuristic to find the changepoint
above_chance = reg_df[benchmark_to_analyze] > random_chance_level + 0.05

# Derived columns: log10 of both cost measures, and performance as -ln(error rate).
# Rows lacking the chosen cost variable or the performance value are dropped at the end.
filtered_reg_df = (
    reg_df.loc[above_chance]
    .assign(
        log_compute=lambda d: np.log10(d['Training compute (FLOP)']),
        log_params=lambda d: np.log10(d['Active Parameters']),
        **{benchmark_to_analyze + '_log_error': lambda d: -np.log(1 - d[benchmark_to_analyze])},
    )
    .dropna(subset=[cost_var, benchmark_to_analyze + '_log_error'])
)

In [81]:
# Release date as a single float so it can be used as a regressor (the table output further down
# shows 2022-11-09 mapped to 2022.855). datetime_to_float_year is not defined in this notebook —
# presumably it comes from one of the star-imported project modules (TODO confirm).
filtered_reg_df.loc[:, 'float_date'] = datetime_to_float_year(filtered_reg_df['Date'])

In [82]:
# Human-readable axis/title labels, keyed by the regression column names used in filtered_reg_df.
var_labels = {
    'float_date': 'Year',
    'log_compute': 'Training compute (FLOP)',
    'log_params': 'Active Parameters',
    benchmark_to_analyze + '_log_error': 'Performance (negative log of error rate)'
}

# Fit a model of performance = f(cost, date)

In [83]:
# Single fit for all data
performance_model = fit_ols_regression(filtered_reg_df, ['float_date', cost_var], benchmark_to_analyze + '_log_error')
performance_model.summary()

0,1,2,3
Dep. Variable:,MMLU_log_error,R-squared:,0.891
Model:,OLS,Adj. R-squared:,0.886
Method:,Least Squares,F-statistic:,187.3
Date:,"Thu, 24 Oct 2024",Prob (F-statistic):,7.87e-23
Time:,13:57:27,Log-Likelihood:,20.446
No. Observations:,49,AIC:,-34.89
Df Residuals:,46,BIC:,-29.22
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-337.8448,58.754,-5.750,0.000,-456.111,-219.579
float_date,0.1623,0.029,5.566,0.000,0.104,0.221
log_compute,0.4413,0.029,15.368,0.000,0.384,0.499

0,1,2,3
Omnibus:,1.17,Durbin-Watson:,1.401
Prob(Omnibus):,0.557,Jarque-Bera (JB):,0.502
Skew:,0.185,Prob(JB):,0.778
Kurtosis:,3.33,Cond. No.,5060000.0


In [84]:
# Names of the fitted coefficients (intercept plus one per predictor).
performance_model.params.index

Index(['Intercept', 'float_date', 'log_compute'], dtype='object')

In [85]:
# Sanity check: model output for the rows it was fitted on, in -ln(error rate) units
# (get_predictions is a project helper; one value per row of filtered_reg_df).
get_predictions(performance_model, filtered_reg_df, ['float_date', cost_var])


array([0.86809836, 0.80417934, 0.83301391, 1.10795706, 0.7344945 ,
       1.34588002, 1.28059877, 0.89900253, 1.96231806, 1.14080943,
       0.82425726, 0.80058925, 1.31763294, 1.6783322 , 0.36626722,
       1.40823777, 1.64743564, 1.69486758, 1.62472506, 2.01194255,
       1.66751951, 0.59635087, 0.83924538, 0.96930727, 0.47770478,
       0.79415586, 0.97842154, 1.11682709, 0.67550977, 1.68978575,
       1.90143795, 0.81918711, 0.50983512, 1.85120689, 1.85120689,
       0.44764354, 1.11906349, 0.70587887, 1.50168741, 0.74232561,
       0.67790121, 0.62812   , 0.41936146, 1.10795706, 1.22814821,
       0.7560468 , 0.93330941, 0.77066472, 1.39592097])

In [86]:
# Iso-performance contour: predicted accuracy over a (release date, cost) grid, with the
# observed models overlaid and coloured by their actual accuracy.

# Define the range for the cost variable and float_date
float_date_min, float_date_max = filtered_reg_df['float_date'].min(), filtered_reg_df['float_date'].max()
log_cost_min, log_cost_max = filtered_reg_df[cost_var].min(), filtered_reg_df[cost_var].max()

# Create a grid of values, padded by one year / one order of magnitude beyond the data
log_cost_vals = np.linspace(log_cost_min - 1, log_cost_max + 1, 100)
float_date_vals = np.linspace(float_date_min - 1, float_date_max + 1, 100)
X_grid, Y_grid = np.meshgrid(float_date_vals, log_cost_vals)

# Prepare the grid for prediction
X_pred = pd.DataFrame({
    'float_date': X_grid.ravel(),
    cost_var: Y_grid.ravel()
})

# Generate predictions (in -ln(error rate) units) and restore the grid shape
Z_pred = performance_model.predict(X_pred)
Z_pred = Z_pred.values.reshape(X_grid.shape)

# Extremes of the predicted surface; reused below so the markers share the contour's colour range
min_performance = Z_pred.min()
max_performance = Z_pred.max()

# Plot the contour using Plotly
# Cost is converted back from log10 to linear units (the y-axis is set to log scale below),
# and predictions are converted to accuracy in percent.
fig = go.Figure(data=go.Contour(
    x=float_date_vals,
    y=10**log_cost_vals,
    z=accuracy_from_negative_log_error(Z_pred)*100,
    colorscale=colorscale,
    colorbar=dict(title='Predicted accuracy (%)'),
    contours=dict(
        coloring='heatmap',
        showlabels=True,
        labelfont=dict(size=12, color='white')
    )
))

# Add the actual data points, using the same colorscale as the contour
fig.add_trace(go.Scatter(
    x=filtered_reg_df['float_date'],
    y=10**filtered_reg_df[cost_var],
    mode='markers',
    marker=dict(
        color=accuracy_from_negative_log_error(filtered_reg_df[benchmark_to_analyze + '_log_error'])*100,
        colorscale=colorscale,
        cmin=accuracy_from_negative_log_error(min_performance)*100,
        cmax=accuracy_from_negative_log_error(max_performance)*100,
        line=dict(width=1, color='black')
    ),
    text=filtered_reg_df['System'],
    # show year, cost, and performance on hover
    hovertemplate='%{text}<br>Year: %{x:.2f}<br>cost: %{y:.2f}<br>Performance: %{customdata:.2f}',
    customdata=accuracy_from_negative_log_error(filtered_reg_df[benchmark_to_analyze + '_log_error'])*100,
    name='Data'
))

fig.update_yaxes(type='log')

# NOTE(review): neither trace above references layout.coloraxis, so coloraxis_colorbar likely has
# no visible effect; the colorbar title shown comes from the Contour trace — confirm.
fig.update_layout(
    title=f'{var_labels[cost_var]} efficiency for {benchmark_to_analyze}',
    xaxis_title='Release date',
    yaxis_title=var_labels[cost_var],
    coloraxis_colorbar=dict(title=f'Predicted {benchmark_to_analyze} accuracy (%)'),
    width=600,
    height=400
)

if save:
    save_plot(fig, results_dir, f'{benchmark_to_analyze}_predictions_isoperformance_contour_cost={cost_var}')

fig.show()

In [87]:
# Fitted coefficients; the float_date and cost coefficients are used for the iso-performance slope.
performance_model.params

Intercept     -337.844845
float_date       0.162291
log_compute      0.441317
dtype: float64

In [88]:
# The model is performance = a*float_date + b*cost + c, with cost in log10 units. Holding
# performance fixed gives d(cost)/d(date) = -a/b, so 10**slope is the yearly multiplier on the
# cost needed for the same performance, and its reciprocal is the "x less" factor printed below.
isoperformance_slope = -performance_model.params['float_date'] / performance_model.params[cost_var]
print(f'It costs {1/(10**isoperformance_slope):.1f}x less {var_labels[cost_var]} each year to keep {benchmark_to_analyze} performance fixed.')


It costs 2.3x less Training compute (FLOP) each year to keep MMLU performance fixed.


# Fit a model of date = f(performance, cost)

In [89]:
# Reverse regression: release date as the target, explained by performance and the cost variable.
# Same project helper and call shape as the performance fit (frame, predictor columns, target column).
time_model = fit_ols_regression(filtered_reg_df, [benchmark_to_analyze + '_log_error', cost_var], 'float_date')
time_model.summary()

0,1,2,3
Dep. Variable:,float_date,R-squared:,0.477
Model:,OLS,Adj. R-squared:,0.455
Method:,Least Squares,F-statistic:,21.0
Date:,"Thu, 24 Oct 2024",Prob (F-statistic):,3.32e-07
Time:,13:57:27,Log-Likelihood:,-46.356
No. Observations:,49,AIC:,98.71
Df Residuals:,46,BIC:,104.4
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2041.9042,5.434,375.748,0.000,2030.966,2052.843
MMLU_log_error,2.4800,0.446,5.566,0.000,1.583,3.377
log_compute,-0.8863,0.245,-3.611,0.001,-1.380,-0.392

0,1,2,3
Omnibus:,10.447,Durbin-Watson:,1.477
Prob(Omnibus):,0.005,Jarque-Bera (JB):,10.055
Skew:,-1.041,Prob(JB):,0.00655
Kurtosis:,3.77,Cond. No.,1420.0


In [90]:
# Iso-time contour: predicted release year over a (performance, cost) grid, with the observed
# models overlaid and coloured by their actual release date.

# Define the range for performance and the cost variable
performance_min, performance_max = filtered_reg_df[benchmark_to_analyze + '_log_error'].min(), filtered_reg_df[benchmark_to_analyze + '_log_error'].max()
log_cost_min, log_cost_max = filtered_reg_df[cost_var].min(), filtered_reg_df[cost_var].max()

# Create a grid of values, padded by 0.5 performance units / one order of magnitude beyond the data
performance_vals = np.linspace(performance_min - 0.5, performance_max + 0.5, 100)
log_cost_vals = np.linspace(log_cost_min - 1, log_cost_max + 1, 100)
X_grid, Y_grid = np.meshgrid(performance_vals, log_cost_vals)

# Prepare the grid for prediction
X_pred = pd.DataFrame({
    benchmark_to_analyze + '_log_error': X_grid.ravel(),
    cost_var: Y_grid.ravel()
})

# Generate predictions (fractional years) and restore the grid shape
Z_pred = time_model.predict(X_pred)
Z_pred = Z_pred.values.reshape(X_grid.shape)

# Extremes of the predicted surface; reused below so the markers share the contour's colour range
min_year = Z_pred.min()
max_year = Z_pred.max()

# Plot the contour using Plotly
# Cost is converted back from log10 to linear units (the y-axis is set to log scale below).
fig = go.Figure(data=go.Contour(
    x=performance_vals,
    y=10**log_cost_vals,
    z=Z_pred,
    colorscale=colorscale,
    colorbar=dict(title='Year'),
    contours=dict(
        coloring='heatmap',
        showlabels=True,
        labelfont=dict(size=12, color='white')
    )
))

# Add the actual data points, using the same colorscale as the contour
fig.add_trace(go.Scatter(
    x=filtered_reg_df[benchmark_to_analyze + '_log_error'],
    y=10**filtered_reg_df[cost_var],
    mode='markers',
    marker=dict(
        color=filtered_reg_df['float_date'],
        colorscale=colorscale,
        cmin=min_year,
        cmax=max_year,
        line=dict(width=1, color='black')
    ),
    text=filtered_reg_df['System'],
    name='Data'
))

fig.update_yaxes(type='log')

# NOTE(review): neither trace above references layout.coloraxis, so coloraxis_colorbar likely has
# no visible effect; the colorbar title shown comes from the Contour trace — confirm.
fig.update_layout(
    title=f'Regression Model Predictions for {benchmark_to_analyze}',
    xaxis_title='Performance (negative log of error rate)',
    yaxis_title=var_labels[cost_var],
    coloraxis_colorbar=dict(title=f'Predicted Year'),
    width=800,
    height=400
)

if save:
    save_plot(fig, results_dir, f'{benchmark_to_analyze}_predictions_isotime_contour_cost={cost_var}')

fig.show()

# Put models into (performance, date) buckets and average the cost in each bucket

In [91]:
# Peek at the regression frame, including the derived log_compute / log_params / *_log_error / float_date columns.
filtered_reg_df.head()

Unnamed: 0,System,Date,GSM8k,GSM1k,Training compute (FLOP),Speculative Compute,Active Parameters,Open/Closed,Model size (parameters),Dataset size,Training compute notes,BBH,GPQA,MMLU,HELM MMLU,SEAL Coding,SEAL Instruction Following,SEAL Math,LMSys Elo,LMSys Elo Notes,LMSys Elo 95% CI,BBH Notes,GPQA Notes,MMLU Notes,HELM MMLU Notes,Trust in benchmark results,Trust notes,log_compute,log_params,MMLU_log_error,float_date
71,BLOOM-176B,2022-11-09,,,4.12e+23,,176000000000.0,Open,176000000000.0,390000000000.0,,0.4491,,0.3913,,,,,,,,,,,,0.0,,23.614897,11.245513,0.49643,2022.855237
72,BloombergGPT,2023-03-30,,,2.12e+23,,50000000000.0,Closed,50000000000.0,708000000000.0,,0.4197,,0.3918,,,,,,,,,,,,0.0,,23.326336,10.69897,0.497252,2023.246066
78,Chinchilla 70B,2022-03-29,,,5.76e+23,,70000000000.0,Closed,,,,,,0.675,,,,,,,,,,,,0.0,,23.760422,10.845098,1.12393,2022.243328
87,code-davinci-002,2022-03-01,,,2.58e+24,,,Closed,,,,0.528,,0.682,,,,,,,,,,,,0.0,,24.41162,,1.145704,2022.166667
93,DeepSeek-7B,2023-11-29,,,8.4e+22,,7000000000.0,Open,7000000000.0,2000000000000.0,,,,0.482,,,,,,,,,,,,0.0,,22.924279,9.845098,0.65778,2023.909995


In [92]:
# Bucket edges, in -ln(error rate) units: half-open intervals [low, low + step) that start at
# performance_min and stop before performance_max. Models outside that range land in no bucket.
performance_min = 0.25
performance_max = 2.25
performance_step = 0.5
performance_col = benchmark_to_analyze + '_log_error'
performance_buckets = np.arange(performance_min, performance_max, performance_step)

# Maps (low, high) -> the rows of filtered_reg_df whose performance falls in that interval.
performance_bucket_dfs = {}
for performance_low in performance_buckets:
    performance_high = performance_low + performance_step
    in_bucket = filtered_reg_df[performance_col].between(performance_low, performance_high, inclusive='left')
    bucket_df = filtered_reg_df.loc[in_bucket]
    performance_bucket_dfs[(performance_low, performance_high)] = bucket_df
    print(f'Bucket: {performance_low:.2f} to {performance_high:.2f}')
    print(bucket_df[['System', 'Date', cost_var, performance_col]])

# Display the lowest bucket in full as the cell output.
performance_bucket_dfs[(0.25, 0.75)]


Bucket: 0.25 to 0.75
                   System       Date  log_compute  MMLU_log_error
71             BLOOM-176B 2022-11-09    23.614897        0.496430
72           BloombergGPT 2023-03-30    23.326336        0.497252
93            DeepSeek-7B 2023-11-29    22.924279        0.657780
119              GLM 130B 2022-10-05    23.550228        0.594207
133          GPT-NeoX 20B 2022-02-09    22.753583        0.409473
144           LLaMa-1 13B 2023-02-24    22.892095        0.632993
147            LLaMa-1 7B 2023-02-24    22.623249        0.432323
151            LLaMa-2 7B 2023-07-18    22.924279        0.603306
158               MPT 30B 2023-06-22    23.276462        0.653926
159                MPT 7B 2023-05-05    22.623249        0.368169
167               OPT-66B 2022-05-02    22.853090        0.446131
180      StableLM-3B-4E1T 2023-09-29    22.857332        0.602028
182  StableLM-alpha-7b-v2 2023-08-05    22.799341        0.599657
191      text-davinci-001 2020-05-28    23.498311      

Unnamed: 0,System,Date,GSM8k,GSM1k,Training compute (FLOP),Speculative Compute,Active Parameters,Open/Closed,Model size (parameters),Dataset size,Training compute notes,BBH,GPQA,MMLU,HELM MMLU,SEAL Coding,SEAL Instruction Following,SEAL Math,LMSys Elo,LMSys Elo Notes,LMSys Elo 95% CI,BBH Notes,GPQA Notes,MMLU Notes,HELM MMLU Notes,Trust in benchmark results,Trust notes,log_compute,log_params,MMLU_log_error,float_date
71,BLOOM-176B,2022-11-09,,,4.12e+23,,176000000000.0,Open,176000000000.0,390000000000.0,,0.4491,,0.3913,,,,,,,,,,,,0.0,,23.614897,11.245513,0.49643,2022.855237
72,BloombergGPT,2023-03-30,,,2.12e+23,,50000000000.0,Closed,50000000000.0,708000000000.0,,0.4197,,0.3918,,,,,,,,,,,,0.0,,23.326336,10.69897,0.497252,2023.246066
93,DeepSeek-7B,2023-11-29,,,8.4e+22,,7000000000.0,Open,7000000000.0,2000000000000.0,,,,0.482,,,,,,,,,,,,0.0,,22.924279,9.845098,0.65778,2023.909995
119,GLM 130B,2022-10-05,,,3.55e+23,,130000000000.0,Open,130000000000.0,,,,,0.448,,,,,,,,,,,,0.0,,23.550228,11.113943,0.594207,2022.760952
133,GPT-NeoX 20B,2022-02-09,,,5.67e+22,,20000000000.0,Open,20000000000.0,473000000000.0,,0.4025,,0.336,,,,,,,,,,,,0.0,,22.753583,10.30103,0.409473,2022.105237
144,LLaMa-1 13B,2023-02-24,,,7.8e+22,,13000000000.0,Open,13000000000.0,1000000000000.0,,0.37,,0.469,,,,,799.0,,,,,,,0.0,,22.892095,10.113943,0.632993,2023.146305
147,LLaMa-1 7B,2023-02-24,,,4.2e+22,,7000000000.0,Open,7000000000.0,1000000000000.0,,0.303,,0.351,,,,,,,,,,,,0.0,,22.623249,9.845098,0.432323,2023.146305
151,LLaMa-2 7B,2023-07-18,,,8.4e+22,,7000000000.0,Open,7000000000.0,2000000000000.0,,0.326,,0.453,,,,,1036.0,chat,,,,,,0.0,,22.924279,9.845098,0.603306,2023.546544
158,MPT 30B,2023-06-22,,,1.89e+23,,30000000000.0,Open,30000000000.0,1050000000000.0,,0.38,,0.48,,,,,1045.0,chat,,,,,,0.0,,23.276462,10.477121,0.653926,2023.474163
159,MPT 7B,2023-05-05,,,4.2e+22,,7000000000.0,Open,7000000000.0,1000000000000.0,,0.31,,0.308,,,,,928.0,chat,,,,,,0.0,,22.623249,9.845098,0.368169,2023.344285


In [93]:
# Plot the cost as a function of year for each bucket
# One subplot per performance bucket; subplot titles show the bucket bounds converted to accuracy (%).

fig = make_subplots(
    rows=1,
    cols=len(performance_bucket_dfs),
    subplot_titles=[f'Performance: {accuracy_from_negative_log_error(performance_low)*100:.0f}% ' +
                    f'to {accuracy_from_negative_log_error(performance_high)*100:.0f}%'
                    for performance_low, performance_high in performance_bucket_dfs.keys()]
)

for i, (performance_bucket, bucket_df) in enumerate(performance_bucket_dfs.items()):
    # Work on a copy so the stored bucket frames are not modified.
    bucket_df = bucket_df.copy()
    bucket_df['Year'] = bucket_df['Date'].dt.year
    # add_trace(row=, col=) replaces the deprecated Figure.append_trace.
    fig.add_trace(
        go.Scatter(
            x=bucket_df['Year'],
            y=10**bucket_df[cost_var],  # back from log10 to linear units; axis is log-scaled below
            mode='markers',
            text=bucket_df['System'],
            name=f'{performance_bucket[0]:.2f} to {performance_bucket[1]:.2f}',
            showlegend=False,
        ),
        row=1, col=i + 1
    )
    fig.update_xaxes(title_text='Release year', row=1, col=i + 1)

# Only show whole years
fig.update_xaxes(tickmode='linear', dtick=1)
fig.update_yaxes(type='log')

fig.update_layout(
    title=f'{var_labels[cost_var]} of models over time for {benchmark_to_analyze} performance buckets',
    yaxis_title=var_labels[cost_var],
    width=1200,
    height=400,
    template='plotly_white'
)

if save:
    save_plot(fig, results_dir, f'{benchmark_to_analyze}_{cost_var}_by_year_for_performance_buckets')

fig.show()

In [94]:
# Within each bucket, group by year
# Prints the mean of the cost variable (still in log10 units) per release year for each bucket.
for performance_bucket, bucket_df in performance_bucket_dfs.items():
    # assign() returns a new frame, so the stored bucket frames stay untouched.
    bucket_df = bucket_df.assign(Year=bucket_df['Date'].dt.year)
    mean_cost_by_year = bucket_df.groupby('Year')[cost_var].mean()
    print(f'Bucket: {performance_bucket}')
    print(mean_cost_by_year)


Bucket: (0.25, 0.75)
Year
2020    23.498311
2022    23.192950
2023    22.916291
Name: log_compute, dtype: float64
Bucket: (0.75, 1.25)
Year
2021    23.800029
2022    24.144212
2023    23.486557
2024    23.635484
Name: log_compute, dtype: float64
Bucket: (1.25, 1.75)
Year
2023    24.780939
2024    24.773871
Name: log_compute, dtype: float64
Bucket: (1.75, 2.25)
Year
2023    25.510595
2024    24.990694
Name: log_compute, dtype: float64
