In [1]:
import numpy as np
import os
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
import statsmodels.formula.api as smf

from data import load_pcd_df
from plotting import save_plot

pio.templates.default = "plotly_white"

In [2]:
# Notebook-wide configuration. `non_suspects_only` and `trusted_only` are read as globals by
# get_benchmark_df when it filters rows on 'Trust in benchmark results'; `save` gates every
# save_plot call.
benchmarks_to_plot = ['MMLU', 'GPQA']  # Benchmarks to plot. Default: ['MMLU', 'GPQA']
benchmark_to_analyze = 'MMLU'  # Benchmark to analyze. Default: 'MMLU'
non_suspects_only = True  # Whether to only include not-suspicious benchmark scores in the analysis. Default: True
trusted_only = False  # Whether to only include actively trusted benchmark scores in the analysis (more strict). Default: False
save = True  # Whether to save plots and results to disk. Default: True

In [3]:
# Dated output directory handed to every save_plot call; created up front (no error if it already exists).
results_dir = 'results/2025-01-27/'
os.makedirs(results_dir, exist_ok=True)

In [4]:
# Whether each benchmark's score is an accuracy (True) or some other kind of score (False).
bench_is_accuracy = dict([
    ('MMLU', True),
    ('BBH', True),
    ('GSM1k', True),
    ('GPQA', True),
    ('LMSys Elo', False),
    ('SEAL Coding', False),
    ('SEAL Math', False),
])

# Load price data

In [5]:
# Read the raw API price table from the local CSV and preview its first rows.
api_price_df = pd.read_csv('data/API prices - full view.csv')

api_price_df.head()

Unnamed: 0,Price description,Price,Price Unit,Model,Fine Tuned Model,Price date,Model Version,Organization (model developer),Organization (API vendor),Context Window,Archived price link,Tags,Notes,Last Modified
0,$2.50 / 1M input tokens,$2.50000,$/1M input tokens,GPT-4o,,2024-08-12,gpt-4o-2024-08-06,OpenAI,OpenAI,,https://web.archive.org/web/20240812003133/htt...,OpenAI: 50% off for batch submission,,12/7/2024 9:13pm
1,$10.00 / 1M output tokens,$10.00000,$/1M output tokens,GPT-4o,,2024-08-12,gpt-4o-2024-08-06,OpenAI,OpenAI,,https://web.archive.org/web/20240812003133/htt...,OpenAI: 50% off for batch submission,,12/7/2024 9:13pm
2,$0.638 / 1k 512^2 px input images,$0.63800,$/1k 512^2 px input images,GPT-4o,,2024-08-12,gpt-4o-2024-08-06,OpenAI,OpenAI,,https://web.archive.org/web/20240812003133/htt...,,High-resolution input images are priced as fol...,12/7/2024 9:13pm
3,$1.275 / 1k 512^2 px input images,$1.27500,$/1k 512^2 px input images,GPT-4o,,2024-08-12,gpt-4o-2024-05-13,OpenAI,OpenAI,,https://web.archive.org/web/20240812003133/htt...,,High-resolution input images are priced as fol...,12/7/2024 9:13pm
4,$0.213 / 1k low resolution input images,$0.21300,$/1k low resolution input images,GPT-4o,,2024-08-12,gpt-4o-2024-08-06,OpenAI,OpenAI,,https://web.archive.org/web/20240812003133/htt...,,Low-resolution images use 85 input tokens/imag...,12/7/2024 9:13pm


In [6]:
# List the distinct pricing units present in the table.
api_price_df['Price Unit'].unique()

array(['$/1M input tokens', '$/1M output tokens',
       '$/1k 512^2 px input images', '$/1k low resolution input images',
       '$/1k 1024^2 px input images',
       '$/1M input tokens (for <=128k tokens)',
       '$/1M input tokens (for > 128k tokens)',
       '$/1M output tokens (for <= 128k tokens)',
       '$/1M output tokens (for > 128k tokens)',
       '$/1M cashed tokens (for <= 128k tokens)',
       '$/1M cashed tokens (for >128k tokens)',
       '$/1M cashed tokens per hour', '$/1M embedding tokens',
       '$/1M training tokens', '$/1k 1024^2 px images generated',
       '$/minute', '$/1M characters', '$/month'], dtype=object)

In [7]:
# Count how many price rows exist for each price unit
api_price_df.groupby('Price Unit').size()


Price Unit
$/1M cashed tokens (for <= 128k tokens)     3
$/1M cashed tokens (for >128k tokens)       3
$/1M cashed tokens per hour                 2
$/1M characters                             2
$/1M embedding tokens                       3
$/1M input tokens                          46
$/1M input tokens (for <=128k tokens)       3
$/1M input tokens (for > 128k tokens)       3
$/1M output tokens                         30
$/1M output tokens (for <= 128k tokens)     3
$/1M output tokens (for > 128k tokens)      3
$/1M training tokens                        5
$/1k 1024^2 px images generated             8
$/1k 1024^2 px input images                 3
$/1k 512^2 px input images                  3
$/1k low resolution input images            3
$/minute                                    1
$/month                                     1
dtype: int64

In [8]:
# Focus on the simplest price units: $/1M input tokens and $/1M output tokens.
# Take an explicit copy: later cells add columns to this frame, and assigning into a
# boolean-filtered slice is chained assignment (SettingWithCopyWarning, and unreliable
# writes on pandas versions without copy-on-write).
token_price_units = ['$/1M input tokens', '$/1M output tokens']
api_price_df = api_price_df[api_price_df['Price Unit'].isin(token_price_units)].copy()
api_price_df.head()

Unnamed: 0,Price description,Price,Price Unit,Model,Fine Tuned Model,Price date,Model Version,Organization (model developer),Organization (API vendor),Context Window,Archived price link,Tags,Notes,Last Modified
0,$2.50 / 1M input tokens,$2.50000,$/1M input tokens,GPT-4o,,2024-08-12,gpt-4o-2024-08-06,OpenAI,OpenAI,,https://web.archive.org/web/20240812003133/htt...,OpenAI: 50% off for batch submission,,12/7/2024 9:13pm
1,$10.00 / 1M output tokens,$10.00000,$/1M output tokens,GPT-4o,,2024-08-12,gpt-4o-2024-08-06,OpenAI,OpenAI,,https://web.archive.org/web/20240812003133/htt...,OpenAI: 50% off for batch submission,,12/7/2024 9:13pm
7,$5.00 / 1M input tokens,$5.00000,$/1M input tokens,GPT-4o,,2024-08-12,gpt-4o-2024-05-13,OpenAI,OpenAI,,https://web.archive.org/web/20240812003133/htt...,OpenAI: 50% off for batch submission,,12/7/2024 9:13pm
8,$15.00 / 1M output tokens,$15.00000,$/1M output tokens,GPT-4o,,2024-08-12,gpt-4o-2024-05-13,OpenAI,OpenAI,,https://web.archive.org/web/20240812003133/htt...,OpenAI: 50% off for batch submission,,12/7/2024 9:13pm
10,$0.15 / 1M input tokens,$0.15000,$/1M input tokens,GPT-4o mini,,2024-08-12,gpt-4o-mini-2024-07-18,OpenAI,OpenAI,,https://web.archive.org/web/20240812003133/htt...,OpenAI: 50% off for batch submission,,8/16/2024 2:26pm


In [9]:
# Convert the Price string (e.g. "$2.50000") to a float.
# '$' is a regex metacharacter (end-of-string anchor), and the default of `regex` differs
# between pandas versions, so request a literal replacement explicitly.
# Thousands separators are stripped as well so a value such as "$1,000.00" would still parse.
price_text = (
    api_price_df['Price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
)
api_price_df['Price (USD)'] = price_text.astype(float)

In [10]:
# Scatter every retained price against its price date, coloured by unit, on a log price axis.
scatter_kwargs = dict(x='Price date', y='Price (USD)', color='Price Unit', title='API Prices Over Time')
fig = px.scatter(api_price_df, **scatter_kwargs)
fig.update_layout(yaxis=dict(type='log'))
if save:
    save_plot(fig, results_dir, 'prices_over_time')
fig.show()


In [11]:
# Fit log10(price) against calendar time for input-token prices.
def _as_ordinal(date_str):
    """Return the proleptic Gregorian ordinal (day number) of a date string."""
    return pd.Timestamp(date_str).toordinal()

usd_prices = api_price_df['Price (USD)']
api_price_df['date'] = api_price_df['Price date'].apply(_as_ordinal)
api_price_df['price'] = usd_prices
api_price_df['log_price'] = np.log10(usd_prices)

input_rows = api_price_df[api_price_df['Price Unit'] == '$/1M input tokens']
model = smf.ols('log_price ~ date', data=input_rows).fit()
model.summary()

0,1,2,3
Dep. Variable:,log_price,R-squared:,0.021
Model:,OLS,Adj. R-squared:,-0.002
Method:,Least Squares,F-statistic:,0.9247
Date:,"Mon, 27 Jan 2025",Prob (F-statistic):,0.341
Time:,15:36:58,Log-Likelihood:,-45.549
No. Observations:,46,AIC:,95.1
Df Residuals:,44,BIC:,98.76
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,996.7351,1036.412,0.962,0.341,-1092.016,3085.486
date,-0.0013,0.001,-0.962,0.341,-0.004,0.001

0,1,2,3
Omnibus:,2.688,Durbin-Watson:,1.909
Prob(Omnibus):,0.261,Jarque-Bera (JB):,2.407
Skew:,-0.468,Prob(JB):,0.3
Kurtosis:,2.384,Cond. No.,7800000000.0


In [12]:
# Same time-trend regression, restricted to output-token prices.
is_output_price = api_price_df['Price Unit'] == '$/1M output tokens'
output_rows = api_price_df.loc[is_output_price]
model = smf.ols('log_price ~ date', data=output_rows).fit()
model.summary()

0,1,2,3
Dep. Variable:,log_price,R-squared:,0.019
Model:,OLS,Adj. R-squared:,-0.016
Method:,Least Squares,F-statistic:,0.5359
Date:,"Mon, 27 Jan 2025",Prob (F-statistic):,0.47
Time:,15:36:58,Log-Likelihood:,-33.36
No. Observations:,30,AIC:,70.72
Df Residuals:,28,BIC:,73.52
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,906.1941,1237.141,0.732,0.470,-1627.974,3440.362
date,-0.0012,0.002,-0.732,0.470,-0.005,0.002

0,1,2,3
Omnibus:,1.228,Durbin-Watson:,1.897
Prob(Omnibus):,0.541,Jarque-Bera (JB):,0.809
Skew:,-0.401,Prob(JB):,0.667
Kurtosis:,2.928,Cond. No.,6580000000.0


In [13]:
# Distinct 'Model Version' values among the retained price rows (NaN where no version is recorded)
api_price_df['Model Version'].unique()

array(['gpt-4o-2024-08-06', 'gpt-4o-2024-05-13', 'gpt-4o-mini-2024-07-18',
       nan, 'Llama-3.1-405B-Instruct', 'o1-preview-2024-09-12',
       'o1-mini-2024-09-12', 'Llama 3 8B 8k', 'Llama 3.1 8B Instruct',
       'Llama 3.1 405B Turbo', 'Llama 3.1 70B Instruct',
       'Jamba 1.5 Large', 'Cygnet'], dtype=object)

In [14]:
# Line plot of each model's prices over time, one figure per model.
# The loop variable is deliberately not named `model`: that name holds the fitted
# regression from the cells above and would otherwise be overwritten by a string.
# Rows without a model name are skipped (they would only produce an empty "nan" figure).
for model_name in api_price_df['Model'].dropna().unique():
    # px.line joins points in row order; 'Price date' is an ISO date string, so sorting it
    # puts the points in chronological order.
    model_df = api_price_df[api_price_df['Model'] == model_name].sort_values('Price date')
    fig = px.line(model_df, x='Price date', y='Price (USD)', color='Price Unit',
                  title=f'{model_name} Prices Over Time', markers=True)
    fig.update_layout(yaxis_type='log')
    if save:
        save_plot(fig, results_dir, f'{model_name}_prices_over_time')
    fig.show()

# Price vs training compute

In [15]:
# Load the table returned by the project helper load_pcd_df (imported from data.py);
# it is joined to the price rows on 'Model' further down to obtain 'Training compute (FLOP)'.
pcd_df = load_pcd_df()
pcd_df.head()

Unnamed: 0,Model,Domain,Task,Authors,Notability criteria,Notability criteria notes,Model accessibility,Link,Citations,Reference,...,Hardware type,Training compute estimation method,Biological model safeguards,Hardware utilization (temp),BenchmarkHub-v1,Hugging Face developer id,Post-training compute (FLOP),Post-training compute notes,Hardware maker,benchmarks/models
0,INTELLECT-MATH,,,,,,,,,,...,,,,,,,,,,INTELLECT-MATH
1,Cosmos-1.0-\nDiffusion-14B Video2World,"Robotics,Vision,Video","Robotic manipulation,Self-driving car,Video ge...","NVIDIA: Niket Agarwal, Arslan Ali, Maciej Bala...",,,Open weights (restricted use),https://arxiv.org/abs/2501.03575,,Cosmos World Foundation Model Platform for Phy...,...,,Hardware,,,,nvidia,,,NVIDIA,
2,OLMo 2 Furious 7B,Language,"Language modelling/generation,Question answering","Team OLMo, Pete Walsh, Luca Soldaini, Dirk Gro...",,,Open weights (unrestricted),https://arxiv.org/abs/2501.00656,,2 OLMo 2 Furious,...,,"Reported,Operation counting",,,,allenai,,,NVIDIA,
3,OLMo 2 Furious 13B,Language,"Language modelling/generation,Question answering","Team OLMo, Pete Walsh, Luca Soldaini, Dirk Gro...",,,Open weights (unrestricted),https://arxiv.org/abs/2501.00656,,2 OLMo 2 Furious,...,,"Reported,Operation counting",,,,allenai,,,NVIDIA,
4,DeepSeek-V3,Language,"Language modelling/generation,Code generation,...",,Training cost,training cost was $5.3million USD (Table 1),Open weights (restricted use),https://github.com/deepseek-ai/DeepSeek-V3/blo...,,DeepSeek-V3 Technical Report,...,,Operation counting,,,,deepseek-ai,,,NVIDIA,DeepSeek-V3


In [16]:
# Add 'Training compute (FLOP)' to the price rows via a left join on 'Model'
# (the column shared by both dataframes).
# Only the two needed PCD columns take part in the merge. Merging the whole PCD table and
# dropping the rest afterwards would fail with a KeyError whenever a PCD column shares its
# name with a price column, because pandas renames such collisions with _x/_y suffixes.
price_df_cols = api_price_df.columns.tolist()  # not used in this cell; kept in case other cells reference it
compute_df = pcd_df[['Model', 'Training compute (FLOP)']]
price_df = api_price_df.merge(compute_df, on='Model', how='left')
# Show only the price rows for which a training-compute value was found
price_df[price_df['Training compute (FLOP)'].notna()]

Unnamed: 0,Price description,Price,Price Unit,Model,Fine Tuned Model,Price date,Model Version,Organization (model developer),Organization (API vendor),Context Window,Archived price link,Tags,Notes,Last Modified,Price (USD),date,price,log_price,Training compute (FLOP)
0,$2.50 / 1M input tokens,$2.50000,$/1M input tokens,GPT-4o,,2024-08-12,gpt-4o-2024-08-06,OpenAI,OpenAI,,https://web.archive.org/web/20240812003133/htt...,OpenAI: 50% off for batch submission,,12/7/2024 9:13pm,2.5,739110,2.5,0.39794,3.810001e+25
1,$10.00 / 1M output tokens,$10.00000,$/1M output tokens,GPT-4o,,2024-08-12,gpt-4o-2024-08-06,OpenAI,OpenAI,,https://web.archive.org/web/20240812003133/htt...,OpenAI: 50% off for batch submission,,12/7/2024 9:13pm,10.0,739110,10.0,1.0,3.810001e+25
2,$5.00 / 1M input tokens,$5.00000,$/1M input tokens,GPT-4o,,2024-08-12,gpt-4o-2024-05-13,OpenAI,OpenAI,,https://web.archive.org/web/20240812003133/htt...,OpenAI: 50% off for batch submission,,12/7/2024 9:13pm,5.0,739110,5.0,0.69897,3.810001e+25
3,$15.00 / 1M output tokens,$15.00000,$/1M output tokens,GPT-4o,,2024-08-12,gpt-4o-2024-05-13,OpenAI,OpenAI,,https://web.archive.org/web/20240812003133/htt...,OpenAI: 50% off for batch submission,,12/7/2024 9:13pm,15.0,739110,15.0,1.176091,3.810001e+25
4,$0.15 / 1M input tokens,$0.15000,$/1M input tokens,GPT-4o mini,,2024-08-12,gpt-4o-mini-2024-07-18,OpenAI,OpenAI,,https://web.archive.org/web/20240812003133/htt...,OpenAI: 50% off for batch submission,,8/16/2024 2:26pm,0.15,739110,0.15,-0.823909,7.36001e+24
5,$0.60 / 1M output tokens,$0.60000,$/1M output tokens,GPT-4o mini,,2024-08-12,gpt-4o-mini-2024-07-18,OpenAI,OpenAI,,https://web.archive.org/web/20240812003133/htt...,OpenAI: 50% off for batch submission,,8/16/2024 2:29pm,0.6,739110,0.6,-0.221849,7.36001e+24
6,$15 / 1M input tokens,$15.00000,$/1M input tokens,Claude 3 Opus,,2024-08-12,,Anthropic,Anthropic,,https://archive.is/5C8WA,,,8/13/2024 1:37pm,15.0,739110,15.0,1.176091,1.640001e+25
7,$75 / 1M output tokens,$75.00000,$/1M output tokens,Claude 3 Opus,,2024-08-12,,Anthropic,Anthropic,,https://archive.is/5C8WA,,,8/13/2024 1:37pm,75.0,739110,75.0,1.875061,1.640001e+25
10,$3 / 1M input tokens,$3.00000,$/1M input tokens,Claude 3.5 Sonnet,,2024-08-12,,Anthropic,Anthropic,,https://archive.is/5C8WA,,,12/7/2024 6:47pm,3.0,739110,3.0,0.477121,4.980001e+25
11,$15 / 1M output tokens,$15.00000,$/1M output tokens,Claude 3.5 Sonnet,,2024-08-12,,Anthropic,Anthropic,,https://archive.is/5C8WA,,,12/7/2024 6:47pm,15.0,739110,15.0,1.176091,4.980001e+25


In [17]:
# Scatter price against training compute on log-log axes, once for input-token prices
# and once for output-token prices (built with graph_objects).
input_price_df = price_df[price_df['Price Unit'] == '$/1M input tokens'].copy()
output_price_df = price_df[price_df['Price Unit'] == '$/1M output tokens'].copy()

price_vs_compute_plots = [
    (input_price_df, 'Input Price vs Training Compute', 'Input Price (USD)', 'input_price_vs_training_compute'),
    (output_price_df, 'Output Price vs Training Compute', 'Output Price (USD)', 'output_price_vs_training_compute'),
]

for unit_df, plot_title, y_label, file_stem in price_vs_compute_plots:
    points = go.Scatter(
        x=unit_df['Training compute (FLOP)'],
        y=unit_df['Price (USD)'],
        mode='markers',
        marker=dict(size=10, opacity=0.7)
    )
    fig = go.Figure()
    fig.add_trace(points)
    fig.update_layout(
        title=plot_title,
        xaxis_title='Training Compute (FLOP)',
        yaxis_title=y_label,
        yaxis_type='log',
        xaxis_type='log',
    )
    if save:
        save_plot(fig, results_dir, file_stem)
    fig.show()

In [18]:
# Regress log10(input price) on log10(training compute).
input_price_df = input_price_df.assign(
    log_flop=np.log10(input_price_df['Training compute (FLOP)']),
    price=input_price_df['Price (USD)'],
)
input_fit_formula = 'log_price ~ log_flop'
model = smf.ols(input_fit_formula, data=input_price_df).fit()
model.summary()


0,1,2,3
Dep. Variable:,log_price,R-squared:,0.167
Model:,OLS,Adj. R-squared:,0.13
Method:,Least Squares,F-statistic:,4.598
Date:,"Mon, 27 Jan 2025",Prob (F-statistic):,0.0428
Time:,15:37:01,Log-Likelihood:,-23.592
No. Observations:,25,AIC:,51.18
Df Residuals:,23,BIC:,53.62
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-8.1205,3.849,-2.110,0.046,-16.083,-0.158
log_flop,0.3303,0.154,2.144,0.043,0.012,0.649

0,1,2,3
Omnibus:,0.59,Durbin-Watson:,2.275
Prob(Omnibus):,0.745,Jarque-Bera (JB):,0.049
Skew:,-0.073,Prob(JB):,0.976
Kurtosis:,3.158,Cond. No.,743.0


In [19]:
# Overlay the fitted line on the input-price scatter; both axes are already in log10 units.
flop_lo = input_price_df['log_flop'].min()
flop_hi = input_price_df['log_flop'].max()
compute_range = np.linspace(flop_lo, flop_hi, 100)
predict_df = pd.DataFrame({'log_flop': compute_range})
predict_df = predict_df.assign(log_price=model.predict(predict_df))

regression_line = go.Scatter(
    x=predict_df['log_flop'],
    y=predict_df['log_price'],
    mode='lines',
    name='Regression Line',
)
fig = px.scatter(input_price_df, x='log_flop', y='log_price', title='Input Price vs Training Compute')
fig.add_trace(regression_line)
if save:
    save_plot(fig, results_dir, 'input_price_vs_training_compute_regression')
fig.show()


In [20]:
# Regress log10(output price) on log10(training compute).
output_price_df['log_flop'] = np.log10(output_price_df['Training compute (FLOP)'])
output_price_df['price'] = output_price_df['Price (USD)']
output_fit_formula = 'log_price ~ log_flop'
model = smf.ols(output_fit_formula, data=output_price_df).fit()
model.summary()


kurtosistest only valid for n>=20 ... continuing anyway, n=12



0,1,2,3
Dep. Variable:,log_price,R-squared:,0.113
Model:,OLS,Adj. R-squared:,0.024
Method:,Least Squares,F-statistic:,1.276
Date:,"Mon, 27 Jan 2025",Prob (F-statistic):,0.285
Time:,15:37:02,Log-Likelihood:,-11.068
No. Observations:,12,AIC:,26.14
Df Residuals:,10,BIC:,27.11
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-4.4577,4.570,-0.975,0.352,-14.641,5.726
log_flop,0.2075,0.184,1.130,0.285,-0.202,0.617

0,1,2,3
Omnibus:,1.187,Durbin-Watson:,2.544
Prob(Omnibus):,0.553,Jarque-Bera (JB):,0.781
Skew:,0.248,Prob(JB):,0.677
Kurtosis:,1.853,Cond. No.,592.0


In [21]:
# Overlay the fitted line on the output-price scatter, evaluated on 100 evenly spaced log_flop values.
output_log_flop = output_price_df['log_flop']
compute_range = np.linspace(output_log_flop.min(), output_log_flop.max(), 100)
predict_df = pd.DataFrame({'log_flop': compute_range})
predict_df['log_price'] = model.predict(predict_df)

fig = px.scatter(output_price_df, x='log_flop', y='log_price', title='Output Price vs Training Compute')
line_kwargs = dict(x=predict_df['log_flop'], y=predict_df['log_price'], mode='lines', name='Regression Line')
fig.add_trace(go.Scatter(**line_kwargs))
if save:
    save_plot(fig, results_dir, 'output_price_vs_training_compute_regression')
fig.show()


# Price over time at different levels of performance

In [22]:
# Show every column, at unlimited width, whenever a frame is displayed from here on.
for display_option in ('display.max_columns', 'display.width'):
    pd.set_option(display_option, None)

# Load benchmark data from the local CSV (the remote source below is currently disabled)
# data_path = "https://docs.google.com/spreadsheets/d/1etu9rXcME0uUA-S2ANA8bsfQbIZgNu-8NxqFGQdDIzQ/export?format=csv&gid=1305280917#gid=1305280917"
data_path = "data/benchmarks_with_model_accessibility.csv"
df = pd.read_csv(data_path)

df.head(30)

Unnamed: 0,System,Model size (parameters),Active Parameters,Dataset size,Date,Open/Closed,Training compute (FLOP),Training compute notes,BBH,GPQA,MMLU,HELM MMLU,SEAL Coding,SEAL Instruction Following,SEAL Math,LMSys Elo,LMSys Elo Notes,LMSys Elo 95% CI,BBH Notes,GPQA Notes,MMLU Notes,HELM MMLU Notes,Trust in benchmark results,Trust notes
0,BLOOM-176B,176000000000.0,176000000000.0,390000000000.0,2022-11-09,Open,4.12e+23,,0.4491,,0.3913,,,,,,,,,,,,0,
1,BloombergGPT,50000000000.0,50000000000.0,708000000000.0,2023-03-30,Closed,2.12e+23,,0.4197,,0.3918,,,,,,,,,,,,0,
2,Camelidae-8x34B,,,,2024-01-05,Open,,,,,0.756,,,,,,,,,,,,0,
3,ChatGLM-6B,6000000000.0,6000000000.0,,2023-03-01,Open,,,0.1873,,,,,,,880.0,,,,,,,0,
4,ChatGLM2-12B-base,12000000000.0,12000000000.0,,2023-06-25,Open,,,0.3602,,,,,,,,,,,,,,0,
5,ChatGLM2-6B-base,6000000000.0,6000000000.0,,2023-06-25,Open,,,0.3368,,,,,,,924.0,,,,,,,0,
6,ChatGLM3-6B,6000000000.0,6000000000.0,,2023-10-27,Open,5.04e+22,,0.661,,,,,,,955.0,,,,,,,0,
7,Chinchilla 70B,,70000000000.0,,2022-03-29,Closed,5.76e+23,,,,0.675,,,,,,,,,,,,0,
8,Claude 2,,,,2023-07-11,Closed,,,,0.353,0.785,,,,,1132.0,,,,Epoch evaluation,"Actually CoT, so probably an overestimate. HEL...",,0,
9,Claude 2.1,,,,2023-11-21,Closed,,,,0.361,,,,,,,,,,Epoch evaluation,,,0,Doesn't perform worse on GSM1k relative to GSM8k


In [23]:
# Rename 'System' to 'Model'; rename() leaves the frame's labels unchanged when
# there is no 'System' column, so no explicit existence check is needed.
df = df.rename(columns={'System': 'Model'})

In [24]:
# Parse the 'Date' strings into datetimes, then display the frame with all columns visible.
df = df.assign(Date=pd.to_datetime(df['Date']))
for display_option in ('display.max_columns', 'display.width'):
    pd.set_option(display_option, None)
df

Unnamed: 0,Model,Model size (parameters),Active Parameters,Dataset size,Date,Open/Closed,Training compute (FLOP),Training compute notes,BBH,GPQA,MMLU,HELM MMLU,SEAL Coding,SEAL Instruction Following,SEAL Math,LMSys Elo,LMSys Elo Notes,LMSys Elo 95% CI,BBH Notes,GPQA Notes,MMLU Notes,HELM MMLU Notes,Trust in benchmark results,Trust notes
0,BLOOM-176B,1.760000e+11,176000000000,3.900000e+11,2022-11-09,Open,4.120000e+23,,0.4491,,0.3913,,,,,,,,,,,,0,
1,BloombergGPT,5.000000e+10,50000000000,7.080000e+11,2023-03-30,Closed,2.120000e+23,,0.4197,,0.3918,,,,,,,,,,,,0,
2,Camelidae-8x34B,,,,2024-01-05,Open,,,,,0.7560,,,,,,,,,,,,0,
3,ChatGLM-6B,6.000000e+09,6000000000,,2023-03-01,Open,,,0.1873,,,,,,,880.0,,,,,,,0,
4,ChatGLM2-12B-base,1.200000e+10,12000000000,,2023-06-25,Open,,,0.3602,,,,,,,,,,,,,,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127,XVerse-7B,,7000000000,,2023-09-26,Open,,,,,,,,,,,,,,,,,0,
128,Yi-1.5-34B,,,,2024-05-10,Open,,,,0.060,,,,,,,,,,Epoch evaluation,,,0,
129,Yi-34B,3.400000e+10,34000000000,3.000000e+12,2023-11-02,Open,6.120000e+23,,0.5430,0.165,0.7635,,,,,1111.0,chat,,,Epoch evaluation,,,-1,MMLU-GPQA performance difference is relatively...
130,Yi-6B,6.000000e+09,6000000000,3.000000e+12,2023-11-02,Open,1.080000e+23,,0.4280,,0.6385,,,,,,,,,,,,0,


In [25]:
# Filter out finetuned systems

# Exact 'Model' names to exclude; rows whose 'Model' matches any of these are removed
# from df right after this list.
finetuned_systems = [
 'Layer Normalization: Handwriting sequence generation',
 'ULM-FiT',
 'ADP-FAIRSEQ + NGRAMRES',
 'Cross-lingual alignment',
 'UnifiedQA',
 '$\\infty$-former (SM)',
 'FLAN 137B',
 'AlphaFold-Multimer',
 'Masked Autoencoders',
 'Contriever',
 'BERT-RBP',
 'Minerva',
 'BlenderBot 3',
 'PaLM-SayCan',
 'NMST+GPT-2',
 'Decaying Fast Weights Transformer (WT-103)',
 'GPT-2 + Progressive LRD',
 'U-PaLM',
 'Flan-T5 11B',
 'Flan-PaLM 540B',
 'Taiyi-Stable Diffusion',
 'OPT-IML (175B)',
 'SparseOPT-175B',
 'DiT-XL/2',
 'VideoMAE V2',
 'Segment Anything Model',
 'gLM',
 'MOSS-Moon-003',
 'WizardLM-7B',
 'InstructBLIP',
 'Guanaco-65B',
 'WizardCoder-15.5B',
 'Code Llama-34B',
 'Code Llama-7B',
 'TigerBot-70B',
 'MiniGPT4 (Vicuna finetune)',
 'LLaMA-7B (protein-oriented instructions finetuned)',
 'FinGPT-13B',
 'LLaVA 1.5',
 'CogVLM',
 'Volcano 13B',
 'SPHINX (Llama 2 13B)',
 'Orca 2-13B',
 'Llama Guard',
 'FunSearch',
 'Elyza',
 'Code Llama-70B',
 'Swallow'
]

# Drop rows for the listed fine-tuned systems and for any model whose name contains
# "flan" (case-insensitive).
# na=False makes a missing model name count as "no match"; without it str.contains yields
# NaN for that row and the `~` negation raises a TypeError.
is_listed_finetune = df['Model'].isin(finetuned_systems)
is_flan_variant = df['Model'].str.contains('Flan', case=False, na=False)
# Explicit copy: later cells assign columns on this frame, which would otherwise be
# chained assignment on a filtered slice.
df = df[~(is_listed_finetune | is_flan_variant)].copy()

In [26]:
def convert_int(x):
    """Parse a cell value into an int, returning NaN when that is not possible.

    Accepts ints, floats and strings; commas are removed first, so "1,234" parses as 1234.
    Whole-number floats and float-like strings (6000000000.0, "6000000000.0", "6e9") are
    converted too. Previously these raised ValueError inside int() and were silently
    turned into NaN.

    Args:
        x: The raw cell value.

    Returns:
        The value as a Python int, or np.nan if x is missing, non-numeric, or has a
        fractional part.
    """
    if pd.isna(x):
        return np.nan
    text = str(x).replace(',', '')
    try:
        # Exact path first, so large integers never round-trip through float
        return int(text)
    except ValueError:
        pass
    try:
        as_float = float(text)
    except ValueError:
        return np.nan
    # is_integer() is False for inf/nan and for values with a fractional part
    return int(as_float) if as_float.is_integer() else np.nan

# Normalise 'Active Parameters' element-wise with convert_int (unparseable cells become NaN).
active_params_raw = df['Active Parameters']
df['Active Parameters'] = active_params_raw.map(convert_int)

In [27]:
def get_benchmark_df(df, bench):
    """Return the rows of `df` to use for benchmark `bench`, after trust filtering.

    The 'Random chance' row is always removed. Further filtering depends on the
    notebook-level flags `trusted_only` and `non_suspects_only`:

    - `trusted_only`: keep only rows with 'Trust in benchmark results' > 0, for every
      benchmark. This is the stricter flag, so it takes precedence. Previously it sat in
      an `elif` and was silently ignored whenever `non_suspects_only` was also True,
      which is that flag's default.
    - `non_suspects_only`: drop rows with negative trust. For 'MMLU' this applies to all
      rows; for 'GPQA' only to models dated on or after its release (2023-11-20), with
      earlier models kept regardless of trust. Other benchmarks are not filtered.

    Args:
        df: DataFrame with 'Model', 'Date' and 'Trust in benchmark results' columns.
        bench: Benchmark column name, e.g. 'MMLU' or 'GPQA'.

    Returns:
        The filtered DataFrame. For 'GPQA' under `non_suspects_only`, pre-release rows
        come first, followed by the kept post-release rows.
    """
    benchmark_df = df[df['Model'] != 'Random chance']

    if trusted_only:
        return benchmark_df[benchmark_df['Trust in benchmark results'] > 0]

    if non_suspects_only:
        if bench == 'GPQA':
            # GPQA was released November 20, 2023
            gpqa_release = pd.to_datetime('2023-11-20')
            old_df = benchmark_df[benchmark_df['Date'] < gpqa_release]
            new_df = benchmark_df[benchmark_df['Date'] >= gpqa_release]
            new_df = new_df[new_df['Trust in benchmark results'] >= 0]
            return pd.concat([old_df, new_df])
        if bench == 'MMLU':
            return benchmark_df[benchmark_df['Trust in benchmark results'] >= 0]

    return benchmark_df

In [28]:
# Scatter each benchmark's scores against model date, one subplot per benchmark.
# The grid is two columns wide and the row count is derived from the number of benchmarks.
# It used to be fixed at one row, so configuring more than two benchmarks addressed a
# subplot row that did not exist.
n_cols = 2
n_rows = max(1, (len(benchmarks_to_plot) + n_cols - 1) // n_cols)
fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=benchmarks_to_plot, vertical_spacing=0.15)

for i, bench in enumerate(benchmarks_to_plot):
    row, col = i // n_cols + 1, i % n_cols + 1
    benchmark_df = get_benchmark_df(df, bench)
    benchmark_df = benchmark_df[benchmark_df[bench].notna()]

    # Accuracy benchmarks are stored as fractions and shown as percentages; other scores
    # are plotted unscaled. Benchmarks missing from bench_is_accuracy are treated as accuracy.
    is_accuracy = bench_is_accuracy.get(bench, True)
    scores = 100 * benchmark_df[bench] if is_accuracy else benchmark_df[bench]

    # add_trace(row=, col=) is the supported replacement for the deprecated append_trace
    fig.add_trace(
        go.Scatter(
            x=benchmark_df['Date'],
            y=scores,
            mode='markers',
            text=benchmark_df['Model'],
            name=bench,
            showlegend=False
        ),
        row=row, col=col
    )

    # Accuracy subplots share one y-axis title on the left-most column;
    # a non-accuracy subplot is labelled with its own benchmark name.
    if not is_accuracy:
        fig.update_yaxes(title_text=bench, row=row, col=col)
    elif col == 1:
        fig.update_yaxes(title_text="Accuracy (%)", row=row, col=col)

# Improve the layout (height scales with the number of rows; one row keeps the original 400px)
fig.update_layout(
    template='plotly_white',
    width=800,
    height=400 * n_rows,
    font=dict(size=12),
    hovermode="closest",
)

# Save the plot
if save:
    save_plot(fig, results_dir, 'benchmark_training_compute')

# Show the plot
fig.show()

In [29]:
# Attach benchmark columns to every price row (left join on 'Model': price rows with
# no benchmark match are kept, with NaN benchmark fields). Rebinds `price_df`.
price_df = pd.merge(api_price_df, df, how='left', on='Model')
price_df.head()

Unnamed: 0,Price description,Price,Price Unit,Model,Fine Tuned Model,Price date,Model Version,Organization (model developer),Organization (API vendor),Context Window,Archived price link,Tags,Notes,Last Modified,Price (USD),date,price,log_price,Model size (parameters),Active Parameters,Dataset size,Date,Open/Closed,Training compute (FLOP),Training compute notes,BBH,GPQA,MMLU,HELM MMLU,SEAL Coding,SEAL Instruction Following,SEAL Math,LMSys Elo,LMSys Elo Notes,LMSys Elo 95% CI,BBH Notes,GPQA Notes,MMLU Notes,HELM MMLU Notes,Trust in benchmark results,Trust notes
0,$2.50 / 1M input tokens,$2.50000,$/1M input tokens,GPT-4o,,2024-08-12,gpt-4o-2024-08-06,OpenAI,OpenAI,,https://web.archive.org/web/20240812003133/htt...,OpenAI: 50% off for batch submission,,12/7/2024 9:13pm,2.5,739110,2.5,0.39794,,,,2024-05-13,Closed,,,,0.49,0.872,0.842,1138.0,0.886,0.9485,1286.0,2024-05-13,"""+3/-2""",,Epoch evaluation,MMLU Pro paper,2024-05-13,0.0,
1,$10.00 / 1M output tokens,$10.00000,$/1M output tokens,GPT-4o,,2024-08-12,gpt-4o-2024-08-06,OpenAI,OpenAI,,https://web.archive.org/web/20240812003133/htt...,OpenAI: 50% off for batch submission,,12/7/2024 9:13pm,10.0,739110,10.0,1.0,,,,2024-05-13,Closed,,,,0.49,0.872,0.842,1138.0,0.886,0.9485,1286.0,2024-05-13,"""+3/-2""",,Epoch evaluation,MMLU Pro paper,2024-05-13,0.0,
2,$5.00 / 1M input tokens,$5.00000,$/1M input tokens,GPT-4o,,2024-08-12,gpt-4o-2024-05-13,OpenAI,OpenAI,,https://web.archive.org/web/20240812003133/htt...,OpenAI: 50% off for batch submission,,12/7/2024 9:13pm,5.0,739110,5.0,0.69897,,,,2024-05-13,Closed,,,,0.49,0.872,0.842,1138.0,0.886,0.9485,1286.0,2024-05-13,"""+3/-2""",,Epoch evaluation,MMLU Pro paper,2024-05-13,0.0,
3,$15.00 / 1M output tokens,$15.00000,$/1M output tokens,GPT-4o,,2024-08-12,gpt-4o-2024-05-13,OpenAI,OpenAI,,https://web.archive.org/web/20240812003133/htt...,OpenAI: 50% off for batch submission,,12/7/2024 9:13pm,15.0,739110,15.0,1.176091,,,,2024-05-13,Closed,,,,0.49,0.872,0.842,1138.0,0.886,0.9485,1286.0,2024-05-13,"""+3/-2""",,Epoch evaluation,MMLU Pro paper,2024-05-13,0.0,
4,$0.15 / 1M input tokens,$0.15000,$/1M input tokens,GPT-4o mini,,2024-08-12,gpt-4o-mini-2024-07-18,OpenAI,OpenAI,,https://web.archive.org/web/20240812003133/htt...,OpenAI: 50% off for batch submission,,8/16/2024 2:26pm,0.15,739110,0.15,-0.823909,,,,2024-07-18,Closed,,,,0.403,0.82,,,,,1280.0,2024-07-18,"""+6/-4""",,Epoch evaluation,Suspect this is actually 0-shot CoT.,,0.0,


In [30]:
# One price-vs-score scatter per configured benchmark, coloured by model.
for bench in benchmarks_to_plot:
    scatter_title = f'Price vs {bench}'
    fig = px.scatter(price_df, x='Price (USD)', y=bench, color='Model', title=scatter_title)
    fig.show()

In [31]:
"""
  - Set a performance lower bound
  - Track the running best (cheapest) model
  - At each point in time (at some resolution)
    - Filter to new models published in this time window
    - Filter to models with performance above the lower bound
    - Check if any new model is cheaper than current best
    - If so, update the current best
    - Record the current best model at this time point
"""
bench = 'MMLU'
performance_lower_bound = 0.42
ts = pd.date_range(start='2020-01-01', end='2025-01-01', freq='MS')
cheapest_models = []
current_best = None

for i, t in enumerate(ts):
    # Get models published in this time window
    benchmark_df = get_benchmark_df(price_df, bench)
    if i > 0:
        prev_t = ts[i-1]
        benchmark_df = benchmark_df[(benchmark_df['Date'] >= prev_t) & (benchmark_df['Date'] < t)]
    else:
        benchmark_df = benchmark_df[benchmark_df['Date'] < t]
        
    # Filter for performance
    benchmark_df = benchmark_df[benchmark_df[bench].notna()]
    benchmark_df = benchmark_df[benchmark_df[bench] > performance_lower_bound]
    
    if not benchmark_df.empty:
        # Find cheapest new model
        new_best = benchmark_df.loc[benchmark_df['Price (USD)'].idxmin()]
        
        # Update current best if new model is cheaper (or if no current best)
        if current_best is None or new_best['Price (USD)'] < current_best['Price (USD)']:
            current_best = new_best
    
    # Record current best at this timepoint
    if current_best is not None:
        cheapest_models.append(current_best)
        print(t, current_best['Model'], current_best[bench], f"${current_best['Price (USD)']:.2f}")

2023-08-01 00:00:00 Claude 2 0.785 $8.00
2023-09-01 00:00:00 Claude 2 0.785 $8.00
2023-10-01 00:00:00 Claude 2 0.785 $8.00
2023-11-01 00:00:00 Claude 2 0.785 $8.00
2023-12-01 00:00:00 Claude 2 0.785 $8.00
2024-01-01 00:00:00 Gemini 1.0 Pro 0.718 $0.50
2024-02-01 00:00:00 Gemini 1.0 Pro 0.718 $0.50
2024-03-01 00:00:00 Gemini 1.0 Pro 0.718 $0.50
2024-04-01 00:00:00 Gemini 1.0 Pro 0.718 $0.50
2024-05-01 00:00:00 Gemini 1.0 Pro 0.718 $0.50
2024-06-01 00:00:00 Gemini 1.0 Pro 0.718 $0.50
2024-07-01 00:00:00 Gemini 1.0 Pro 0.718 $0.50
2024-08-01 00:00:00 GPT-4o mini 0.82 $0.15
2024-09-01 00:00:00 GPT-4o mini 0.82 $0.15
2024-10-01 00:00:00 GPT-4o mini 0.82 $0.15
2024-11-01 00:00:00 GPT-4o mini 0.82 $0.15
2024-12-01 00:00:00 GPT-4o mini 0.82 $0.15
2025-01-01 00:00:00 GPT-4o mini 0.82 $0.15


In [32]:
# One row per recorded time point, each holding the full price/benchmark record of the
# running cheapest model (so the same model repeats until a cheaper one appears).
cheapest_models_df = pd.DataFrame(cheapest_models)

In [33]:
# Scatter the recorded cheapest models: model date on x, price on a log-scaled y axis.
cheapest_plot_title = f'Cheapest {bench} Models'
fig = px.scatter(
    cheapest_models_df,
    x='Date',
    y='Price (USD)',
    title=cheapest_plot_title,
    hover_data=['Model'],
)
fig.update_layout(yaxis=dict(type='log'))
if save:
    save_plot(fig, results_dir, f'cheapest_{bench}_models')
fig.show()
