In [1]:
# Reload edited project modules (regression, plotting, utils) automatically before each cell runs.
%load_ext autoreload
%autoreload 2

In [57]:
import os
from collections import defaultdict
from datetime import datetime

import kaleido  # needed for saving plots
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from tqdm import tqdm

from regression import *
from plotting import *
from utils import *

# Parameters

In [3]:
# --- What to analyse ---
cost_var = 'log_params'  # cost axis used in the regressions: 'log_compute' or 'log_params'
benchmark_to_analyze = 'MMLU'
benchmarks_to_plot = ['MMLU', 'GPQA']

# --- Which benchmark scores to keep ---
# `trusted_only` is only consulted by the filtering cells when `non_suspects_only` is False.
non_suspects_only = True  # keep only scores that are not flagged as suspicious
trusted_only = False  # keep only actively trusted scores (more strict)

# --- Output and sampling ---
save = True  # write plots and results to disk
colorscale = 'Viridis'
num_samples = 10_000  # number of samples to draw from distributions

# Whether each benchmark's score is an accuracy (True) or another kind of score (False).
bench_is_accuracy = {
    'MMLU': True,
    'BBH': True,
    'GSM1k': True,
    'GPQA': True,
    'LMSys Elo': False,
    'SEAL Coding': False,
    'SEAL Math': False,
}

In [4]:
# Directory handed to save_plot() for this run's figures; created up front if it does not exist.
# The date in the path is hard-coded, so update it by hand for a new run.
results_dir = 'results/27Nov/'
os.makedirs(results_dir, exist_ok=True)

In [5]:
# Seeded NumPy random generator.
# NOTE(review): `rng` is not referenced by any other cell visible in this section — confirm
# that the sampling helpers (e.g. norm_from_ci) actually draw from it, otherwise the
# sampled confidence intervals are not reproducible.
rng = np.random.default_rng(seed=42)

# Prepare data

In [6]:
# Show every column and do not wrap wide frames when displaying tables.
for display_option in ('display.max_columns', 'display.width'):
    pd.set_option(display_option, None)

# Alternative remote source (Google Sheets CSV export), kept for reference:
# data_path = "https://docs.google.com/spreadsheets/d/1etu9rXcME0uUA-S2ANA8bsfQbIZgNu-8NxqFGQdDIzQ/export?format=csv&gid=1305280917#gid=1305280917"
data_path = "data/benchmarks_with_model_accessibility.csv"
df = pd.read_csv(data_path)

df.head(30)

Unnamed: 0,System,Model size (parameters),Active Parameters,Dataset size,Date,Open/Closed,Training compute (FLOP),Training compute notes,BBH,GPQA,MMLU,HELM MMLU,SEAL Coding,SEAL Instruction Following,SEAL Math,LMSys Elo,LMSys Elo Notes,LMSys Elo 95% CI,BBH Notes,GPQA Notes,MMLU Notes,HELM MMLU Notes,Trust in benchmark results,Trust notes
0,BLOOM-176B,176000000000.0,176000000000.0,390000000000.0,2022-11-09,Open,4.12e+23,,0.4491,,0.3913,,,,,,,,,,,,0,
1,BloombergGPT,50000000000.0,50000000000.0,708000000000.0,2023-03-30,Closed,2.12e+23,,0.4197,,0.3918,,,,,,,,,,,,0,
2,Camelidae-8x34B,,,,2024-01-05,Open,,,,,0.756,,,,,,,,,,,,0,
3,ChatGLM-6B,6000000000.0,6000000000.0,,2023-03-01,Open,,,0.1873,,,,,,,880.0,,,,,,,0,
4,ChatGLM2-12B-base,12000000000.0,12000000000.0,,2023-06-25,Open,,,0.3602,,,,,,,,,,,,,,0,
5,ChatGLM2-6B-base,6000000000.0,6000000000.0,,2023-06-25,Open,,,0.3368,,,,,,,924.0,,,,,,,0,
6,ChatGLM3-6B,6000000000.0,6000000000.0,,2023-10-27,Open,5.04e+22,,0.661,,,,,,,955.0,,,,,,,0,
7,Chinchilla 70B,,70000000000.0,,2022-03-29,Closed,5.76e+23,,,,0.675,,,,,,,,,,,,0,
8,Claude 2,,,,2023-07-11,Closed,,,,0.353,0.785,,,,,1132.0,,,,Epoch evaluation,"Actually CoT, so probably an overestimate. HEL...",,0,
9,Claude 2.1,,,,2023-11-21,Closed,,,,0.361,,,,,,,,,,Epoch evaluation,,,0,Doesn't perform worse on GSM1k relative to GSM8k


In [7]:
# Load the GSM8k/GSM1k results table from the local CSV.
# The commented-out URL is the Google Sheets CSV export, kept for reference.
# gsm1k_data_path = "https://docs.google.com/spreadsheets/d/1KYp4h3urj-698IE9bR7n1ctuH1iyCAQ5pTZIqQ_qs9g/export?format=csv"
gsm1k_data_path = "data/gsm1k_with_model_accessibility.csv"
gsm1k_df = pd.read_csv(gsm1k_data_path)
gsm1k_df

Unnamed: 0,System,Date,GSM8k,GSM1k,Training compute (FLOP),Speculative Compute,Active Parameters,Open/Closed
0,claude-2.1,2023-07-11,0.887,0.894,3.800000e+24,,,Closed
1,claude-3-haiku-20240307,2024-03-04,0.785,0.785,,,,Closed
2,claude-3-opus-20240229,2024-03-04,0.802,0.825,,4.000000e+25,,Closed
3,claude-3-sonnet-20240229,2024-03-04,0.719,0.744,,,,Closed
4,codegemma-7b,2024-04-09,0.479,0.416,3.330000e+23,,7000000000,Open
...,...,...,...,...,...,...,...,...
66,vicuna-33b-v1.3,2023-06-22,0.379,0.341,,,33000000000,Open
67,Xwin-Math-13B-V1.0,2024-03-07,0.631,0.529,,,13000000000,Open
68,Xwin-Math-7B-V1.0,2024-03-07,0.529,0.428,,,7000000000,Open
69,Yi-34B-Chat,2023-11-02,0.641,0.569,6.100000e+23,,34000000000,Open


In [8]:
# Stack the GSM1k rows on top of the main benchmark table. Columns are unioned
# (cells missing from either source become NaN) and the index is renumbered from 0.
# NOTE: this rebinds `df`, so re-running the cell without reloading the data appends
# the GSM1k rows a second time.
df = pd.concat([gsm1k_df, df], ignore_index=True)
df

Unnamed: 0,System,Date,GSM8k,GSM1k,Training compute (FLOP),Speculative Compute,Active Parameters,Open/Closed,Model size (parameters),Dataset size,Training compute notes,BBH,GPQA,MMLU,HELM MMLU,SEAL Coding,SEAL Instruction Following,SEAL Math,LMSys Elo,LMSys Elo Notes,LMSys Elo 95% CI,BBH Notes,GPQA Notes,MMLU Notes,HELM MMLU Notes,Trust in benchmark results,Trust notes
0,claude-2.1,2023-07-11,0.887,0.894,3.800000e+24,,,Closed,,,,,,,,,,,,,,,,,,,
1,claude-3-haiku-20240307,2024-03-04,0.785,0.785,,,,Closed,,,,,,,,,,,,,,,,,,,
2,claude-3-opus-20240229,2024-03-04,0.802,0.825,,4.000000e+25,,Closed,,,,,,,,,,,,,,,,,,,
3,claude-3-sonnet-20240229,2024-03-04,0.719,0.744,,,,Closed,,,,,,,,,,,,,,,,,,,
4,codegemma-7b,2024-04-09,0.479,0.416,3.330000e+23,,7000000000,Open,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,XVerse-7B,2023-09-26,,,,,7000000000,Open,,,,,,,,,,,,,,,,,,0.0,
199,Yi-1.5-34B,2024-05-10,,,,,,Open,,,,,0.060,,,,,,,,,,Epoch evaluation,,,0.0,
200,Yi-34B,2023-11-02,,,6.120000e+23,,34000000000,Open,3.400000e+10,3.000000e+12,,0.543,0.165,0.7635,,,,,1111.0,chat,,,Epoch evaluation,,,-1.0,MMLU-GPQA performance difference is relatively...
201,Yi-6B,2023-11-02,,,1.080000e+23,,6000000000,Open,6.000000e+09,3.000000e+12,,0.428,,0.6385,,,,,,,,,,,,0.0,


In [9]:
# Give the 'Random chance' baseline row an explicit GSM1k score of 0.
df.loc[df['System'] == 'Random chance', 'GSM1k'] = 0.0

In [10]:
# Parse the release dates into datetimes; format='mixed' lets each entry use its own date format.
# The pandas display options are already set in the cell that loads the data, so they are not repeated here.
df['Date'] = pd.to_datetime(df['Date'], format='mixed')
df

Unnamed: 0,System,Date,GSM8k,GSM1k,Training compute (FLOP),Speculative Compute,Active Parameters,Open/Closed,Model size (parameters),Dataset size,Training compute notes,BBH,GPQA,MMLU,HELM MMLU,SEAL Coding,SEAL Instruction Following,SEAL Math,LMSys Elo,LMSys Elo Notes,LMSys Elo 95% CI,BBH Notes,GPQA Notes,MMLU Notes,HELM MMLU Notes,Trust in benchmark results,Trust notes
0,claude-2.1,2023-07-11,0.887,0.894,3.800000e+24,,,Closed,,,,,,,,,,,,,,,,,,,
1,claude-3-haiku-20240307,2024-03-04,0.785,0.785,,,,Closed,,,,,,,,,,,,,,,,,,,
2,claude-3-opus-20240229,2024-03-04,0.802,0.825,,4.000000e+25,,Closed,,,,,,,,,,,,,,,,,,,
3,claude-3-sonnet-20240229,2024-03-04,0.719,0.744,,,,Closed,,,,,,,,,,,,,,,,,,,
4,codegemma-7b,2024-04-09,0.479,0.416,3.330000e+23,,7000000000,Open,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,XVerse-7B,2023-09-26,,,,,7000000000,Open,,,,,,,,,,,,,,,,,,0.0,
199,Yi-1.5-34B,2024-05-10,,,,,,Open,,,,,0.060,,,,,,,,,,Epoch evaluation,,,0.0,
200,Yi-34B,2023-11-02,,,6.120000e+23,,34000000000,Open,3.400000e+10,3.000000e+12,,0.543,0.165,0.7635,,,,,1111.0,chat,,,Epoch evaluation,,,-1.0,MMLU-GPQA performance difference is relatively...
201,Yi-6B,2023-11-02,,,1.080000e+23,,6000000000,Open,6.000000e+09,3.000000e+12,,0.428,,0.6385,,,,,,,,,,,,0.0,


In [11]:
# Names of fine-tuned systems to exclude from the analysis.
# Rows are dropped when their `System` value exactly equals one of these entries.

finetuned_systems = [
 'Layer Normalization: Handwriting sequence generation',
 'ULM-FiT',
 'ADP-FAIRSEQ + NGRAMRES',
 'Cross-lingual alignment',
 'UnifiedQA',
 '$\\infty$-former (SM)',
 'FLAN 137B',
 'AlphaFold-Multimer',
 'Masked Autoencoders',
 'Contriever',
 'BERT-RBP',
 'Minerva',
 'BlenderBot 3',
 'PaLM-SayCan',
 'NMST+GPT-2',
 'Decaying Fast Weights Transformer (WT-103)',
 'GPT-2 + Progressive LRD',
 'U-PaLM',
 'Flan-T5 11B',
 'Flan-PaLM 540B',
 'Taiyi-Stable Diffusion',
 'OPT-IML (175B)',
 'SparseOPT-175B',
 'DiT-XL/2',
 'VideoMAE V2',
 'Segment Anything Model',
 'gLM',
 'MOSS-Moon-003',
 'WizardLM-7B',
 'InstructBLIP',
 'Guanaco-65B',
 'WizardCoder-15.5B',
 'Code Llama-34B',
 'Code Llama-7B',
 'TigerBot-70B',
 'MiniGPT4 (Vicuna finetune)',
 'LLaMA-7B (protein-oriented instructions finetuned)',
 'FinGPT-13B',
 'LLaVA 1.5',
 'CogVLM',
 'Volcano 13B',
 'SPHINX (Llama 2 13B)',
 'Orca 2-13B',
 'Llama Guard',
 'FunSearch',
 'Elyza',
 'Code Llama-70B',
 'Swallow'
]

# Drop the explicitly listed fine-tuned systems, then every system whose name contains
# "flan" (case-insensitive). `na=False` treats a missing `System` as a non-match; without it
# str.contains returns NaN for such rows and the `~` negation of the mask fails.
is_listed_finetune = df['System'].isin(finetuned_systems)
is_flan_variant = df['System'].str.contains('Flan', case=False, na=False)
df = df[~(is_listed_finetune | is_flan_variant)]

## Merge SEAL Math with GSM1k

In [12]:
# Wherever a SEAL Math score is present it replaces the GSM1k value; all other rows keep
# their existing GSM1k value. Done as one vectorized column operation instead of a
# row-by-row iterrows() loop.
has_seal_math = df['SEAL Math'].notna()
df['GSM1k'] = df['GSM1k'].mask(has_seal_math, df['SEAL Math'])
df

Unnamed: 0,System,Date,GSM8k,GSM1k,Training compute (FLOP),Speculative Compute,Active Parameters,Open/Closed,Model size (parameters),Dataset size,Training compute notes,BBH,GPQA,MMLU,HELM MMLU,SEAL Coding,SEAL Instruction Following,SEAL Math,LMSys Elo,LMSys Elo Notes,LMSys Elo 95% CI,BBH Notes,GPQA Notes,MMLU Notes,HELM MMLU Notes,Trust in benchmark results,Trust notes
0,claude-2.1,2023-07-11,0.887,0.894,3.800000e+24,,,Closed,,,,,,,,,,,,,,,,,,,
1,claude-3-haiku-20240307,2024-03-04,0.785,0.785,,,,Closed,,,,,,,,,,,,,,,,,,,
2,claude-3-opus-20240229,2024-03-04,0.802,0.825,,4.000000e+25,,Closed,,,,,,,,,,,,,,,,,,,
3,claude-3-sonnet-20240229,2024-03-04,0.719,0.744,,,,Closed,,,,,,,,,,,,,,,,,,,
4,codegemma-7b,2024-04-09,0.479,0.416,3.330000e+23,,7000000000,Open,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,XVerse-7B,2023-09-26,,,,,7000000000,Open,,,,,,,,,,,,,,,,,,0.0,
199,Yi-1.5-34B,2024-05-10,,,,,,Open,,,,,0.060,,,,,,,,,,Epoch evaluation,,,0.0,
200,Yi-34B,2023-11-02,,,6.120000e+23,,34000000000,Open,3.400000e+10,3.000000e+12,,0.543,0.165,0.7635,,,,,1111.0,chat,,,Epoch evaluation,,,-1.0,MMLU-GPQA performance difference is relatively...
201,Yi-6B,2023-11-02,,,1.080000e+23,,6000000000,Open,6.000000e+09,3.000000e+12,,0.428,,0.6385,,,,,,,,,,,,0.0,


In [13]:
def convert_int(x):
    """Convert a parameter-count cell to an ``int``, or NaN when that is not possible.

    Accepts plain integers, comma-grouped strings such as ``'7,000,000,000'`` and
    float-formatted whole numbers such as ``7000000000.0`` or ``'7e9'``.

    Parameters
    ----------
    x : scalar
        Raw cell value (string, number or missing).

    Returns
    -------
    int or float
        The integer value, or ``np.nan`` for missing input, text that is not a
        number, and numbers that are not whole (e.g. ``'7.5'``).
    """
    if pd.isna(x):
        return np.nan
    text = str(x).replace(',', '')
    try:
        return int(text)
    except ValueError:
        pass
    # int() rejects float-formatted text ('7000000000.0', '7e9'); parse it as a float
    # and accept it only when it denotes a whole, finite number.
    try:
        value = float(text)
    except ValueError:
        return np.nan
    return int(value) if value.is_integer() else np.nan

# Normalize 'Active Parameters' element-wise with convert_int (commas stripped, unparseable values become NaN).
df['Active Parameters'] = df['Active Parameters'].apply(convert_int)

## Data visualization

In [14]:
# Accuracy vs. training compute, one subplot per benchmark in `benchmarks_to_plot`.
# The grid is sized from the number of benchmarks (two columns) rather than a fixed 2x2,
# so no empty panels are drawn and the x-axis title is placed on the lowest populated
# panel of each column (with a fixed 2x2 grid and two benchmarks it was never shown).
n_cols = 2
n_rows = max(1, -(-len(benchmarks_to_plot) // n_cols))  # ceiling division
fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=benchmarks_to_plot, vertical_spacing=0.15)

for i, bench in enumerate(benchmarks_to_plot):
    row, col = i // n_cols + 1, i % n_cols + 1

    plot_df = df[df['System'] != 'Random chance']
    # Trust filtering; `trusted_only` is only consulted when `non_suspects_only` is False.
    if non_suspects_only:
        if bench == 'GPQA':
            # GPQA was released November 20, 2023: models dated before the release are kept
            # regardless of trust rating, later ones only with a non-negative rating.
            old_df = plot_df[plot_df['Date'] < pd.to_datetime('2023-11-20')]
            new_df = plot_df[plot_df['Date'] >= pd.to_datetime('2023-11-20')]
            new_df = new_df[new_df['Trust in benchmark results'] >= 0]
            plot_df = pd.concat([old_df, new_df])
        elif bench == 'MMLU':
            plot_df = plot_df[plot_df['Trust in benchmark results'] >= 0]
    elif trusted_only:
        plot_df = plot_df[plot_df['Trust in benchmark results'] > 0]

    # add_trace(row=, col=) replaces the deprecated append_trace.
    fig.add_trace(
        go.Scatter(
            x=plot_df['Training compute (FLOP)'],
            y=100 * plot_df[bench],
            mode='markers',
            text=plot_df['System'],
            name=bench,
            showlegend=(i == 0)
        ),
        row=row, col=col
    )

    # Label the x axis only where no populated panel sits underneath this one.
    is_lowest_in_column = i + n_cols >= len(benchmarks_to_plot)
    fig.update_xaxes(
        title_text="Training compute (FLOP)" if is_lowest_in_column else None,
        type='log',
        tickmode='linear',
        dtick=2,  # This sets ticks at every two powers of 10
        row=row,
        col=col
    )

    # Only the left-most column carries the y-axis title.
    if col == 1:
        fig.update_yaxes(title_text="Accuracy (%)", row=row, col=col)

# Improve the layout
fig.update_layout(
    template='plotly_white',
    width=600,
    height=400,
    # legend_title="Model accessibility",
    font=dict(size=12),
    hovermode="closest",
)

# Margins
fig.update_layout(
    margin=dict(l=0, r=0, t=20, b=0)
)

# Save the plot
if save:
    save_plot(fig, results_dir, 'benchmark_training_compute')

# Show the plot
fig.show()

In [15]:
# Accuracy vs. active parameter count, one subplot per benchmark in `benchmarks_to_plot`.
# The grid is sized from the number of benchmarks (two columns) rather than a fixed 2x2,
# so no empty panels are drawn.
n_cols = 2
n_rows = max(1, -(-len(benchmarks_to_plot) // n_cols))  # ceiling division
fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=benchmarks_to_plot, vertical_spacing=0.15)

for i, bench in enumerate(benchmarks_to_plot):
    row, col = i // n_cols + 1, i % n_cols + 1

    plot_df = df[df['System'] != 'Random chance']
    # Trust filtering; `trusted_only` is only consulted when `non_suspects_only` is False.
    if non_suspects_only:
        if bench == 'GPQA':
            # GPQA was released November 20, 2023: models dated before the release are kept
            # regardless of trust rating, later ones only with a non-negative rating.
            old_df = plot_df[plot_df['Date'] < pd.to_datetime('2023-11-20')]
            new_df = plot_df[plot_df['Date'] >= pd.to_datetime('2023-11-20')]
            new_df = new_df[new_df['Trust in benchmark results'] >= 0]
            plot_df = pd.concat([old_df, new_df])
        elif bench == 'MMLU':
            plot_df = plot_df[plot_df['Trust in benchmark results'] >= 0]
    elif trusted_only:
        plot_df = plot_df[plot_df['Trust in benchmark results'] > 0]

    # add_trace(row=, col=) replaces the deprecated append_trace.
    fig.add_trace(
        go.Scatter(
            x=plot_df['Active Parameters'],
            y=100 * plot_df[bench],
            mode='markers',
            text=plot_df['System'],
            name=bench,
            showlegend=(i == 0)
        ),
        row=row, col=col
    )

    # Every panel carries the x-axis title.
    fig.update_xaxes(
        title_text="Active Parameters",
        type='log',
        tickmode='linear',
        dtick=2,  # This sets ticks at every two powers of 10
        row=row,
        col=col
    )

    # Only the left-most column carries the y-axis title.
    if col == 1:
        fig.update_yaxes(title_text="Accuracy (%)", row=row, col=col)

# Improve the layout
fig.update_layout(
    template='plotly_white',
    width=600,
    height=400,
    # legend_title="Model accessibility",
    font=dict(size=12),
    hovermode="closest",
)

# Margins
fig.update_layout(
    margin=dict(l=0, r=0, t=20, b=0)
)

# Save the plot
if save:
    save_plot(fig, results_dir, 'benchmark_active_params')

# Show the plot
fig.show()

# Prepare data for regression

In [16]:
# Regression sample: everything except the 'Random chance' baseline row, narrowed by the
# trust switches. `trusted_only` only takes effect when `non_suspects_only` is off.
gpqa_release = pd.to_datetime('2023-11-20')  # GPQA was released November 20, 2023
trust_col = 'Trust in benchmark results'

reg_df = df[df['System'] != 'Random chance']
if non_suspects_only and benchmark_to_analyze == 'GPQA':
    # Models dated before the GPQA release are kept regardless of trust rating;
    # later ones need a non-negative rating.
    old_df = reg_df[reg_df['Date'] < gpqa_release]
    new_df = reg_df[reg_df['Date'] >= gpqa_release]
    new_df = new_df[new_df[trust_col] >= 0]
    reg_df = pd.concat([old_df, new_df])
elif non_suspects_only and benchmark_to_analyze == 'MMLU':
    reg_df = reg_df[reg_df[trust_col] >= 0]
elif not non_suspects_only and trusted_only:
    reg_df = reg_df[reg_df[trust_col] > 0]

In [17]:
# Baseline score of the analysed benchmark, read from the 'Random chance' row.
is_baseline = df["System"] == "Random chance"
random_chance_level = df.loc[is_baseline, benchmark_to_analyze].values[0]

# Keep only models scoring more than 0.05 above the baseline
# (a heuristic to find the changepoint where scores leave the chance level).
log_error_col = benchmark_to_analyze + '_log_error'
above_chance = reg_df[benchmark_to_analyze] > random_chance_level + 0.05
filtered_reg_df = reg_df.loc[above_chance].copy()

# Derived regression variables: base-10 logs of both cost measures and the
# negative natural log of the error rate (1 - score).
filtered_reg_df = filtered_reg_df.assign(**{
    'log_compute': np.log10(filtered_reg_df['Training compute (FLOP)']),
    'log_params': np.log10(filtered_reg_df['Active Parameters']),
    log_error_col: -np.log(1 - filtered_reg_df[benchmark_to_analyze]),
})

# Rows missing the chosen cost variable or the performance metric cannot be used in the fit.
filtered_reg_df.dropna(subset=[cost_var, log_error_col], inplace=True)

In [18]:
# Release date expressed as a float year, used as the time regressor.
filtered_reg_df['float_date'] = datetime_to_float_year(filtered_reg_df['Date'])

In [19]:
# Human-readable axis/legend labels for the regression variables.
var_labels = {
    'float_date': 'Year',
    'log_compute': 'Training compute (FLOP)',
    'log_params': 'Active Parameters',
}
var_labels[f'{benchmark_to_analyze}_log_error'] = 'Performance (negative log of error rate)'

# Fit a model of performance = f(cost, date)

In [20]:
# Single fit for all data: regress the log-error performance metric on release date
# and the chosen cost variable.
performance_model = fit_ols_regression(filtered_reg_df, ['float_date', cost_var], benchmark_to_analyze + '_log_error')
performance_model.summary()

0,1,2,3
Dep. Variable:,MMLU_log_error,R-squared:,0.764
Model:,OLS,Adj. R-squared:,0.753
Method:,Least Squares,F-statistic:,69.62
Date:,"Wed, 27 Nov 2024",Prob (F-statistic):,3.28e-14
Time:,16:29:29,Log-Likelihood:,3.1623
No. Observations:,46,AIC:,-0.3245
Df Residuals:,43,BIC:,5.161
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-699.3934,80.360,-8.703,0.000,-861.455,-537.332
float_date,0.3435,0.040,8.659,0.000,0.263,0.423
log_params,0.5045,0.054,9.282,0.000,0.395,0.614

0,1,2,3
Omnibus:,3.351,Durbin-Watson:,1.504
Prob(Omnibus):,0.187,Jarque-Bera (JB):,2.915
Skew:,-0.054,Prob(JB):,0.233
Kurtosis:,4.228,Cond. No.,4720000.0


In [21]:
# Inspect the names of the fitted coefficients.
performance_model.params.index

Index(['Intercept', 'float_date', 'log_params'], dtype='object')

In [22]:
# Model predictions for the regression rows.
# NOTE(review): get_predictions is a project helper not shown here — presumably in-sample fitted values; confirm.
get_predictions(performance_model, filtered_reg_df, ['float_date', cost_var])


array([1.11757918, 0.97610158, 0.70538761, 1.48422739, 0.77338747,
       1.73323923, 1.69521023, 0.91310469, 0.62322414, 1.02411867,
       1.01881757, 0.90349758, 1.33944771, 0.3834775 , 1.27853433,
       1.59350356, 1.4106542 , 1.88583089, 1.50122986, 0.64669464,
       0.85079622, 0.9993176 , 0.51106503, 0.78417423, 0.9948165 ,
       1.15303403, 0.64854462, 1.62567289, 0.94253099, 0.57906979,
       1.8104132 , 1.8104132 , 0.72435251, 1.15813202, 0.68567076,
       1.43453336, 0.71519808, 0.97504808, 0.53049815, 0.66494318,
       0.27546548, 0.87981142, 1.13419938, 0.84894679, 0.88738664,
       0.71422127])

In [23]:
# Observed ranges of release date and of the chosen (log10) cost variable
float_date_min = filtered_reg_df['float_date'].min()
float_date_max = filtered_reg_df['float_date'].max()
log_cost_min = filtered_reg_df[cost_var].min()
log_cost_max = filtered_reg_df[cost_var].max()

# 100 x 100 prediction grid, padded by one unit beyond the data on every side
float_date_vals = np.linspace(float_date_min - 1, float_date_max + 1, 100)
log_cost_vals = np.linspace(log_cost_min - 1, log_cost_max + 1, 100)
X_grid, Y_grid = np.meshgrid(float_date_vals, log_cost_vals)

# Model predictions over the grid, reshaped back to the grid layout
X_pred = pd.DataFrame({'float_date': X_grid.ravel(), cost_var: Y_grid.ravel()})
Z_pred = performance_model.predict(X_pred).values.reshape(X_grid.shape)

min_performance, max_performance = Z_pred.min(), Z_pred.max()

# Observed scores in percent, used for both marker colour and hover text
observed_accuracy_pct = accuracy_from_negative_log_error(filtered_reg_df[benchmark_to_analyze + '_log_error'])*100

# Predicted accuracy as a heatmap with labelled contour lines
contour_trace = go.Contour(
    x=float_date_vals,
    y=10**log_cost_vals,
    z=accuracy_from_negative_log_error(Z_pred)*100,
    colorscale=colorscale,
    colorbar={'title': 'Predicted accuracy (%)'},
    contours={
        'coloring': 'heatmap',
        'showlabels': True,
        'labelfont': {'size': 12, 'color': 'white'},
    },
)

# Actual models, coloured on the same scale and range as the predictions
points_trace = go.Scatter(
    x=filtered_reg_df['float_date'],
    y=10**filtered_reg_df[cost_var],
    mode='markers',
    marker={
        'color': observed_accuracy_pct,
        'colorscale': colorscale,
        'cmin': accuracy_from_negative_log_error(min_performance)*100,
        'cmax': accuracy_from_negative_log_error(max_performance)*100,
        'line': {'width': 1, 'color': 'black'},
    },
    text=filtered_reg_df['System'],
    # show year, cost, and performance on hover
    hovertemplate='%{text}<br>Year: %{x:.2f}<br>cost: %{y:.2f}<br>Performance: %{customdata:.2f}',
    customdata=observed_accuracy_pct,
    name='Data',
)

fig = go.Figure(data=[contour_trace, points_trace])

fig.update_yaxes(type='log')

fig.update_layout(
    title=f'{var_labels[cost_var]} efficiency for {benchmark_to_analyze}',
    xaxis_title='Release date',
    yaxis_title=var_labels[cost_var],
    coloraxis_colorbar={'title': f'Predicted {benchmark_to_analyze} accuracy (%)'},
    width=600,
    height=400,
)

if save:
    save_plot(fig, results_dir, f'{benchmark_to_analyze}_predictions_isoperformance_contour_cost={cost_var}')

fig.show()

In [24]:
# Point estimates of the coefficients, followed by their confidence intervals (lower, upper).
print(performance_model.params)
print(performance_model.conf_int())

Intercept    -699.393413
float_date      0.343494
log_params      0.504489
dtype: float64
                     0           1
Intercept  -861.454843 -537.331983
float_date    0.263495    0.423492
log_params    0.394885    0.614094


Regression equation is

$P = \beta_0 + \beta_1 t + \beta_2 C$

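where $P$ is performance (the negative log of the error rate), $t$ is the model release date in fractional years, and $C$ is the cost variable (the base-10 log of active parameters or of training compute).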

For a constant performance level $P = d$, the cost $C$ is

$C = (d - \beta_0) / \beta_2 - (\beta_1/\beta_2) t$

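So, at fixed performance, $C$ changes by $-(\beta_1/\beta_2)$ per year. Because $C$ is a base-10 logarithm, this slope is in orders of magnitude per year, i.e. the cost is multiplied by $10^{-\beta_1/\beta_2}$ each year.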

In [25]:
# Slope of the iso-performance line: change in log10 cost per year at fixed performance.
date_coef = performance_model.params['float_date']
cost_coef = performance_model.params[cost_var]
isoperformance_slope = -date_coef / cost_coef

# Convert the log10 slope into a yearly reduction factor.
yearly_reduction = 1/(10**isoperformance_slope)
print(f'It costs {yearly_reduction:.1f}x less {var_labels[cost_var]} each year to keep {benchmark_to_analyze} performance fixed.')

It costs 4.8x less Active Parameters each year to keep MMLU performance fixed.


In [26]:
# Propagate coefficient uncertainty to the iso-performance slope by sampling.
# NOTE(review): the two coefficients are sampled separately from their own intervals,
# which presumably ignores any correlation between the estimates — confirm this is intended.
param_ci = performance_model.conf_int()
date_param_ci = param_ci.loc['float_date']
cost_param_ci = param_ci.loc[cost_var]

# Each interval is passed to norm_from_ci as a 95% interval, drawing `num_samples` values.
date_param_dist = norm_from_ci(*date_param_ci, 95, num_samples)
cost_param_dist = norm_from_ci(*cost_param_ci, 95, num_samples)

# Distribution of the slope, summarised by its mean and its 5th/95th percentiles.
isoperformance_slope_dist = -date_param_dist / cost_param_dist
isoperformance_slope_mean = isoperformance_slope_dist.mean()
isoperformance_slope_ci = np.percentile(isoperformance_slope_dist, [5, 95])

# The slope is negative, so the upper percentile gives the smaller reduction factor.
mean_factor = 1/(10**isoperformance_slope_mean)
low_factor = 1/(10**isoperformance_slope_ci[1])
high_factor = 1/(10**isoperformance_slope_ci[0])
print(f'It costs {mean_factor:.1f}x less {var_labels[cost_var]} each year to keep {benchmark_to_analyze} performance fixed.')
print(f'90% CI: {low_factor:.1f}x to {high_factor:.1f}x')


It costs 4.9x less Active Parameters each year to keep MMLU performance fixed.
90% CI: 3.3x to 7.7x


# Fit a model of date = f(performance, cost)

In [27]:
# Reverse regression: release date as a function of performance and the chosen cost variable.
time_model = fit_ols_regression(filtered_reg_df, [benchmark_to_analyze + '_log_error', cost_var], 'float_date')
time_model.summary()

0,1,2,3
Dep. Variable:,float_date,R-squared:,0.645
Model:,OLS,Adj. R-squared:,0.628
Method:,Least Squares,F-statistic:,39.0
Date:,"Wed, 27 Nov 2024",Prob (F-statistic):,2.19e-10
Time:,16:29:30,Log-Likelihood:,-35.567
No. Observations:,46,AIC:,77.13
Df Residuals:,43,BIC:,82.62
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2032.3398,1.533,1325.391,0.000,2029.247,2035.432
MMLU_log_error,1.8502,0.214,8.659,0.000,1.419,2.281
log_params,-1.0122,0.155,-6.539,0.000,-1.324,-0.700

0,1,2,3
Omnibus:,5.279,Durbin-Watson:,1.367
Prob(Omnibus):,0.071,Jarque-Bera (JB):,4.053
Skew:,-0.619,Prob(JB):,0.132
Kurtosis:,3.762,Cond. No.,208.0


In [28]:
# Observed ranges of performance and of the chosen (log10) cost variable
observed_performance = filtered_reg_df[benchmark_to_analyze + '_log_error']
performance_min, performance_max = observed_performance.min(), observed_performance.max()
log_cost_min = filtered_reg_df[cost_var].min()
log_cost_max = filtered_reg_df[cost_var].max()

# 100 x 100 prediction grid, padded beyond the data (0.5 in performance, 1 in log cost)
performance_vals = np.linspace(performance_min - 0.5, performance_max + 0.5, 100)
log_cost_vals = np.linspace(log_cost_min - 1, log_cost_max + 1, 100)
X_grid, Y_grid = np.meshgrid(performance_vals, log_cost_vals)

# Predicted release year over the grid, reshaped back to the grid layout
X_pred = pd.DataFrame({
    benchmark_to_analyze + '_log_error': X_grid.ravel(),
    cost_var: Y_grid.ravel(),
})
Z_pred = time_model.predict(X_pred).values.reshape(X_grid.shape)

min_year, max_year = Z_pred.min(), Z_pred.max()

# Predicted year as a heatmap with labelled contour lines
contour_trace = go.Contour(
    x=performance_vals,
    y=10**log_cost_vals,
    z=Z_pred,
    colorscale=colorscale,
    colorbar={'title': 'Year'},
    contours={
        'coloring': 'heatmap',
        'showlabels': True,
        'labelfont': {'size': 12, 'color': 'white'},
    },
)

# Actual models, coloured by release year on the same scale and range as the predictions
points_trace = go.Scatter(
    x=observed_performance,
    y=10**filtered_reg_df[cost_var],
    mode='markers',
    marker={
        'color': filtered_reg_df['float_date'],
        'colorscale': colorscale,
        'cmin': min_year,
        'cmax': max_year,
        'line': {'width': 1, 'color': 'black'},
    },
    text=filtered_reg_df['System'],
    name='Data',
)

fig = go.Figure(data=[contour_trace, points_trace])

fig.update_yaxes(type='log')

fig.update_layout(
    title=f'Regression Model Predictions for {benchmark_to_analyze}',
    xaxis_title='Performance (negative log of error rate)',
    yaxis_title=var_labels[cost_var],
    coloraxis_colorbar={'title': 'Predicted Year'},
    width=800,
    height=400,
)

if save:
    save_plot(fig, results_dir, f'{benchmark_to_analyze}_predictions_isotime_contour_cost={cost_var}')

fig.show()

# Put models into (performance, date) buckets and average the cost in each bucket

In [29]:
# Peek at the regression frame, including the derived log/date columns.
filtered_reg_df.head()

Unnamed: 0,System,Date,GSM8k,GSM1k,Training compute (FLOP),Speculative Compute,Active Parameters,Open/Closed,Model size (parameters),Dataset size,Training compute notes,BBH,GPQA,MMLU,HELM MMLU,SEAL Coding,SEAL Instruction Following,SEAL Math,LMSys Elo,LMSys Elo Notes,LMSys Elo 95% CI,BBH Notes,GPQA Notes,MMLU Notes,HELM MMLU Notes,Trust in benchmark results,Trust notes,log_compute,log_params,MMLU_log_error,float_date
71,BLOOM-176B,2022-11-09,,,4.12e+23,,176000000000.0,Open,176000000000.0,390000000000.0,,0.4491,,0.3913,,,,,,,,,,,,0.0,,23.614897,11.245513,0.49643,2022.855237
72,BloombergGPT,2023-03-30,,,2.12e+23,,50000000000.0,Closed,50000000000.0,708000000000.0,,0.4197,,0.3918,,,,,,,,,,,,0.0,,23.326336,10.69897,0.497252,2023.246066
78,Chinchilla 70B,2022-03-29,,,5.76e+23,,70000000000.0,Closed,,,,,,0.675,,,,,,,,,,,,0.0,,23.760422,10.845098,1.12393,2022.243328
89,Cohere Command R+,2024-04-04,,,,,104000000000.0,Open,,,,,,0.757,,,,,1190.0,,,,,https://artificialanalysis.ai/models/command-r...,,0.0,,,11.017033,1.414694,2024.258214
93,DeepSeek-7B,2023-11-29,,,8.4e+22,,7000000000.0,Open,7000000000.0,2000000000000.0,,,,0.482,,,,,,,,,,,,0.0,,22.924279,9.845098,0.65778,2023.909995


In [30]:
# Fixed-width buckets over the performance metric: [0.25, 0.50), [0.50, 0.75), ... up to 2.25
performance_min = 0.25
performance_max = 2.25
performance_step = 0.25
performance_col = benchmark_to_analyze + '_log_error'
performance_buckets = np.arange(performance_min, performance_max, performance_step)

# Map each (low, high) bucket to the models whose performance falls in [low, high)
performance_bucket_dfs = {}
for performance_low in performance_buckets:
    performance_high = performance_low + performance_step
    in_bucket = filtered_reg_df[performance_col].between(performance_low, performance_high, inclusive='left')
    bucket_df = filtered_reg_df.loc[in_bucket]
    performance_bucket_dfs[(performance_low, performance_high)] = bucket_df
    print(f'Bucket: {performance_low:.2f} to {performance_high:.2f}')
    print(bucket_df[['System', 'Date', cost_var, performance_col]])


Bucket: 0.25 to 0.50
           System       Date  log_params  MMLU_log_error
71     BLOOM-176B 2022-11-09   11.245513        0.496430
72   BloombergGPT 2023-03-30   10.698970        0.497252
133  GPT-NeoX 20B 2022-02-09   10.301030        0.409473
147    LLaMa-1 7B 2023-02-24    9.845098        0.432323
159        MPT 7B 2023-05-05    9.845098        0.368169
167       OPT-66B 2022-05-02   10.819544        0.446131
Bucket: 0.50 to 0.75
                   System       Date  log_params  MMLU_log_error
93            DeepSeek-7B 2023-11-29    9.845098        0.657780
116            Gemma 1 2B 2024-04-09    9.301030        0.549913
119              GLM 130B 2022-10-05   11.113943        0.594207
144           LLaMa-1 13B 2023-02-24   10.113943        0.632993
151            LLaMa-2 7B 2023-07-18    9.845098        0.603306
158               MPT 30B 2023-06-22   10.477121        0.653926
180      StableLM-3B-4E1T 2023-09-29    9.477121        0.602028
182  StableLM-alpha-7b-v2 2023-08-05   

In [31]:
# Plot the cost as a function of year for each bucket, one panel per bucket.
# The grid has four columns and as many rows as the number of buckets requires
# (two rows for the eight default buckets), so changing the bucket settings does not break the layout.
n_cols = 4
n_rows = max(1, -(-len(performance_bucket_dfs) // n_cols))  # ceiling division

fig = make_subplots(
    rows=n_rows,
    cols=n_cols,
    # Bucket bounds are converted from the log-error metric back to accuracy for the titles
    subplot_titles=[f'Performance: {accuracy_from_negative_log_error(performance_low)*100:.0f}% ' +
                    f'to {accuracy_from_negative_log_error(performance_high)*100:.0f}%'
                    for performance_low, performance_high in performance_bucket_dfs.keys()]
)

for i, (performance_bucket, bucket_df) in enumerate(performance_bucket_dfs.items()):
    row, col = i // n_cols + 1, i % n_cols + 1
    # add_trace(row=, col=) replaces the deprecated append_trace.
    fig.add_trace(
        go.Scatter(
            x=datetime_to_float_year(bucket_df['Date']),
            y=10**bucket_df[cost_var],
            mode='markers+text',
            text=bucket_df['System'],
            textposition='top right',
            name=f'{performance_bucket[0]:.2f} to {performance_bucket[1]:.2f}',
            showlegend=False,
        ),
        row=row, col=col
    )
    fig.update_xaxes(title_text='Release date', row=row, col=col)

# Only show whole years
fig.update_xaxes(tickmode='linear', dtick=1)
fig.update_yaxes(type='log')

fig.update_layout(
    title=f'{var_labels[cost_var]} of models over time for {benchmark_to_analyze} performance buckets',
    yaxis_title=var_labels[cost_var],
    width=1200,
    height=800,
    template='plotly_white'
)

if save:
    save_plot(fig, results_dir, f'{benchmark_to_analyze}_{cost_var}_for_performance_buckets')

fig.show()

In [32]:
# For every performance bucket, average the (log10) cost of its models per release year
for performance_bucket, bucket_df in performance_bucket_dfs.items():
    bucket_df = bucket_df.assign(Year=bucket_df['Date'].dt.year)
    mean_cost_by_year = bucket_df.groupby('Year')[cost_var].mean()
    print(f'Bucket: {performance_bucket}')
    print(mean_cost_by_year)


Bucket: (0.25, 0.5)
Year
2022    10.788696
2023    10.129722
Name: log_params, dtype: float64
Bucket: (0.5, 0.75)
Year
2020    11.243038
2022    11.113943
2023     9.933913
2024     9.301030
Name: log_params, dtype: float64
Bucket: (0.75, 1.0)
Year
2021    11.447158
2022    11.019459
2023    10.262712
Name: log_params, dtype: float64
Bucket: (1.0, 1.25)
Year
2022    11.273510
2023    10.478721
2024    10.065167
Name: log_params, dtype: float64
Bucket: (1.25, 1.5)
Year
2023    10.894870
2024    11.017033
Name: log_params, dtype: float64
Bucket: (1.5, 1.75)
Year
2023    11.531479
2024    11.271168
Name: log_params, dtype: float64
Bucket: (1.75, 2.0)
Year
2023    11.447158
2024    10.967502
Name: log_params, dtype: float64
Bucket: (2.0, 2.25)
Year
2024    11.607455
Name: log_params, dtype: float64


# Simple insight: key models, decreasing cost, increasing performance

In [38]:
# Peek at the regression frame before adding the PCA columns.
filtered_reg_df.head()

Unnamed: 0,System,Date,GSM8k,GSM1k,Training compute (FLOP),Speculative Compute,Active Parameters,Open/Closed,Model size (parameters),Dataset size,Training compute notes,BBH,GPQA,MMLU,HELM MMLU,SEAL Coding,SEAL Instruction Following,SEAL Math,LMSys Elo,LMSys Elo Notes,LMSys Elo 95% CI,BBH Notes,GPQA Notes,MMLU Notes,HELM MMLU Notes,Trust in benchmark results,Trust notes,log_compute,log_params,MMLU_log_error,float_date
71,BLOOM-176B,2022-11-09,,,4.12e+23,,176000000000.0,Open,176000000000.0,390000000000.0,,0.4491,,0.3913,,,,,,,,,,,,0.0,,23.614897,11.245513,0.49643,2022.855237
72,BloombergGPT,2023-03-30,,,2.12e+23,,50000000000.0,Closed,50000000000.0,708000000000.0,,0.4197,,0.3918,,,,,,,,,,,,0.0,,23.326336,10.69897,0.497252,2023.246066
78,Chinchilla 70B,2022-03-29,,,5.76e+23,,70000000000.0,Closed,,,,,,0.675,,,,,,,,,,,,0.0,,23.760422,10.845098,1.12393,2022.243328
89,Cohere Command R+,2024-04-04,,,,,104000000000.0,Open,,,,,,0.757,,,,,1190.0,,,,,https://artificialanalysis.ai/models/command-r...,,0.0,,,11.017033,1.414694,2024.258214
93,DeepSeek-7B,2023-11-29,,,8.4e+22,,7000000000.0,Open,7000000000.0,2000000000000.0,,,,0.482,,,,,,,,,,,,0.0,,22.924279,9.845098,0.65778,2023.909995


In [63]:
# Perform PCA on the two columns `log_params` and `benchmark_to_analyze` (the raw score).
# Rows with a missing value in either column are left out of the fit and get NaN scores:
# the regression frame is only NaN-filtered on `cost_var`, so `log_params` can be missing
# when the cost variable is 'log_compute'.
# NOTE(review): the two features are not standardised before the PCA — confirm that is intended.
pca_features = filtered_reg_df[['log_params', benchmark_to_analyze]].dropna()
pca = PCA(n_components=2)
# Project once and read both components from the same result (aligned back on the row index).
pca_scores = pd.DataFrame(
    pca.fit(pca_features).transform(pca_features),
    index=pca_features.index,
    columns=['pca_1', 'pca_2'],
)
filtered_reg_df['pca_1'] = pca_scores['pca_1']
filtered_reg_df['pca_2'] = pca_scores['pca_2']
filtered_reg_df.head()

Unnamed: 0,System,Date,GSM8k,GSM1k,Training compute (FLOP),Speculative Compute,Active Parameters,Open/Closed,Model size (parameters),Dataset size,Training compute notes,BBH,GPQA,MMLU,HELM MMLU,SEAL Coding,SEAL Instruction Following,SEAL Math,LMSys Elo,LMSys Elo Notes,LMSys Elo 95% CI,BBH Notes,GPQA Notes,MMLU Notes,HELM MMLU Notes,Trust in benchmark results,Trust notes,log_compute,log_params,MMLU_log_error,float_date,pca_1,pca_2
71,BLOOM-176B,2022-11-09,,,4.12e+23,,176000000000.0,Open,176000000000.0,390000000000.0,,0.4491,,0.3913,,,,,,,,,,,,0.0,,23.614897,11.245513,0.49643,2022.855237,0.537916,-0.297745
72,BloombergGPT,2023-03-30,,,2.12e+23,,50000000000.0,Closed,50000000000.0,708000000000.0,,0.4197,,0.3918,,,,,,,,,,,,0.0,,23.326336,10.69897,0.497252,2023.246066,-0.002484,-0.21603
78,Chinchilla 70B,2022-03-29,,,5.76e+23,,70000000000.0,Closed,,,,,,0.675,,,,,,,,,,,,0.0,,23.760422,10.845098,1.12393,2022.243328,0.184107,0.04231
89,Cohere Command R+,2024-04-04,,,,,104000000000.0,Open,,,,,,0.757,,,,,1190.0,,,,,https://artificialanalysis.ai/models/command-r...,,0.0,,,11.017033,1.414694,2024.258214,0.36632,0.097848
93,DeepSeek-7B,2023-11-29,,,8.4e+22,,7000000000.0,Open,7000000000.0,2000000000000.0,,,,0.482,,,,,,,,,,,,0.0,,22.924279,9.845098,0.65778,2023.909995,-0.83347,6.1e-05


In [71]:
# Overlay the principal axes of the fitted `pca` on the raw (log_params, benchmark) scatter.
# `go` is provided by the notebook's top-level import cell, so it is not re-imported here.

# Centre of the data: both principal-component arrows start from this point
mean_x = filtered_reg_df['log_params'].mean()
mean_y = filtered_reg_df[benchmark_to_analyze].mean()

# Scatter plot of the raw data, one labelled point per system
fig = px.scatter(
    filtered_reg_df,
    x='log_params',
    y=benchmark_to_analyze,
    text='System',
    title='Raw Data with Principal Components',
    template='plotly_white',
    width=800,
    height=600
)

# Principal axes in feature space: index 0 is the 'log_params' direction,
# index 1 is the benchmark direction (same column order the PCA was fitted with)
pc1 = pca.components_[0]
pc2 = pca.components_[1]

# Arrow length: 5% of the largest value found on either axis
scale = max(filtered_reg_df['log_params'].max(), filtered_reg_df[benchmark_to_analyze].max()) * 0.05

# Draw one arrow per principal component, starting at the data centre.
# The first marker has size 0 so only the arrow tip is visible.
for pc_name, pc_color, pc_vector in (('PC1', 'red', pc1), ('PC2', 'green', pc2)):
    fig.add_trace(go.Scatter(
        x=[mean_x, mean_x + pc_vector[0] * scale],
        y=[mean_y, mean_y + pc_vector[1] * scale],
        mode='lines+markers',
        name=pc_name,
        line=dict(color=pc_color, width=2),
        marker=dict(size=[0, 10], color=pc_color)
    ))

# Axis labels and legend placement (top-left corner of the plot area)
fig.update_layout(
    xaxis_title='log_params',
    yaxis_title=benchmark_to_analyze,
    legend=dict(x=0.01, y=0.99),
    showlegend=True
)

fig.show()


In [64]:
# Plot every system in principal-component space.
# The colour scale follows `benchmark_to_analyze` (the column the PCA was fitted on)
# rather than a hard-coded 'MMLU', so the plot stays correct when the parameter changes.
fig = px.scatter(
    filtered_reg_df,
    x='pca_1',
    y='pca_2',
    text='System',
    color=benchmark_to_analyze,
    title=f'PCA projection of log_params and {benchmark_to_analyze}',
)
fig.show()


In [55]:
# Build a copy of the regression frame with a 'price_performance' score:
# the selected cost column plus the benchmark's '<benchmark>_log_error' column.
# NOTE(review): in the tables displayed in this notebook, MMLU_log_error rises with MMLU
# accuracy (0.3913 -> 0.496, 0.757 -> 1.415), so this sum grows with both cost and accuracy.
# Confirm this is the intended definition of "price performance".
error_col = benchmark_to_analyze + '_log_error'
price_performance_df = filtered_reg_df.assign(
    price_performance=filtered_reg_df[cost_var] + filtered_reg_df[error_col]
)

# Show the ten systems with the smallest score
price_performance_df.sort_values(by='price_performance', ascending=True).head(10)


Unnamed: 0,System,Date,GSM8k,GSM1k,Training compute (FLOP),Speculative Compute,Active Parameters,Open/Closed,Model size (parameters),Dataset size,Training compute notes,BBH,GPQA,MMLU,HELM MMLU,SEAL Coding,SEAL Instruction Following,SEAL Math,LMSys Elo,LMSys Elo Notes,LMSys Elo 95% CI,BBH Notes,GPQA Notes,MMLU Notes,HELM MMLU Notes,Trust in benchmark results,Trust notes,log_compute,log_params,MMLU_log_error,float_date,price_performance
116,Gemma 1 2B,2024-04-09,,,,,2000000000.0,Open,2000000000.0,,,,,0.423,,,,,,,,,,,,0.0,,,9.30103,0.549913,2024.271903,9.850943
180,StableLM-3B-4E1T,2023-09-29,,,7.2e+22,,3000000000.0,Open,3000000000.0,4000000000000.0,,,,0.4523,,,,,,,,,,,,0.0,,22.857332,9.477121,0.602028,2023.743328,10.079149
159,MPT 7B,2023-05-05,,,4.2e+22,,7000000000.0,Open,7000000000.0,1000000000000.0,,0.31,,0.308,,,,,928.0,chat,,,,,,0.0,,22.623249,9.845098,0.368169,2023.344285,10.213267
147,LLaMa-1 7B,2023-02-24,,,4.2e+22,,7000000000.0,Open,7000000000.0,1000000000000.0,,0.303,,0.351,,,,,,,,,,,,0.0,,22.623249,9.845098,0.432323,2023.146305,10.277421
182,StableLM-alpha-7b-v2,2023-08-05,,,6.3e+22,,7000000000.0,Open,7000000000.0,1500000000000.0,,,,0.451,,,,,,,,,,,,0.0,,22.799341,9.845098,0.599657,2023.594285,10.444755
151,LLaMa-2 7B,2023-07-18,,,8.4e+22,,7000000000.0,Open,7000000000.0,2000000000000.0,,0.326,,0.453,,,,,1036.0,chat,,,,,,0.0,,22.924279,9.845098,0.603306,2023.546544,10.448405
93,DeepSeek-7B,2023-11-29,,,8.4e+22,,7000000000.0,Open,7000000000.0,2000000000000.0,,,,0.482,,,,,,,,,,,,0.0,,22.924279,9.845098,0.65778,2023.909995,10.502878
172,Qwen 7B,2023-09-28,,,1.01e+23,,7000000000.0,Open,,,,,,0.567,,,,,,,,,,,,0.0,,23.004321,9.845098,0.837018,2023.74059,10.682116
133,GPT-NeoX 20B,2022-02-09,,,5.67e+22,,20000000000.0,Open,20000000000.0,473000000000.0,,0.4025,,0.336,,,,,,,,,,,,0.0,,22.753583,10.30103,0.409473,2022.105237,10.710503
144,LLaMa-1 13B,2023-02-24,,,7.8e+22,,13000000000.0,Open,13000000000.0,1000000000000.0,,0.37,,0.469,,,,,799.0,,,,,,,0.0,,22.892095,10.113943,0.632993,2023.146305,10.746937


In [56]:
# Horizontal bar chart of the price-performance score, one bar per system.
# The frame is sorted once (ascending score) and reused for both bar axes.
ranked_df = price_performance_df.sort_values(by='price_performance', ascending=True)

fig = go.Figure(data=[go.Bar(
    x=ranked_df['price_performance'],
    y=ranked_df['System'],
    orientation='h'
)])

# Attach an arrow annotation to each bar showing the benchmark score as a percentage.
# Annotations are added in the frame's original (unsorted) row order.
for score, system, accuracy in zip(
    price_performance_df['price_performance'],
    price_performance_df['System'],
    price_performance_df[benchmark_to_analyze],
):
    fig.add_annotation(
        x=score,
        y=system,
        text=f"{accuracy:.2%}",
        showarrow=True,
        arrowhead=2,
        ax=20,
        ay=-10
    )

# Title, axis labels and theme
fig.update_layout(
    title='Price Performance of Models',
    xaxis_title='Price Performance',
    yaxis_title='System',
    template='plotly_white'
)

fig.show()
