In [1]:
import numpy as np
import os
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
import statsmodels.formula.api as smf

from data import load_pcd_df
from plotting import save_plot

# Use the "plotly_white" template as the default for Plotly figures.
pio.templates.default = "plotly_white"

In [2]:
# Directory that this notebook writes its results into; created if it does not exist yet.
results_dir = 'results/2025-02-05/'
os.makedirs(results_dir, exist_ok=True)

In [3]:
# When True, cells guarded by `if save:` write their figures to disk via save_plot.
save = True
# Benchmark score columns that the per-benchmark analyses iterate over.
benchmarks = ['MMLU', 'GPQA Diamond', 'MATH 5', 'MATH-500', 'HumanEval']

# Using Artificial Analysis data

## Load Artificial Analysis dataset

In [4]:
# Artificial Analysis table: one row per model, with its release month
# ('YYYY-MM'), price in USD per 1M tokens, and benchmark scores.
aa_df = pd.read_csv('data/aa_data_with_math5.csv')
aa_df

Unnamed: 0,Model Name,Tier,Release Date,USD per 1M Tokens,Tokens per Second,Prefill Latency (s),Prompt Length (tk),MMLU,GPQA Diamond,MATH 5,MATH-500,HumanEval,LMSys Chatbot Arena ELO
0,Claude-3-Haiku,3,2024-03,0.5,122.7,0.467,1000,71,33,13.0,39.0,77.0,1179.0
1,Claude-3-Opus,1,2024-03,30.0,26.5,1.984,1000,84,50,34.0,64.0,83.0,1248.0
2,Claude-3-Sonnet,2,2024-03,6.0,61.8,0.789,1000,77,37,16.0,41.0,71.0,1201.0
3,Claude-3.5-Haiku,3,2024-06,1.6,64.2,0.768,1000,81,37,,67.0,87.0,
4,Claude-3.5-Sonnet-2024-06,2,2024-06,6.0,55.9,0.906,1000,88,56,46.0,71.0,90.0,1268.0
5,Claude-3.5-Sonnet-2024-10,2,2024-10,6.0,55.2,0.907,1000,89,58,53.0,76.0,96.0,1282.0
6,Gemini-1.5-Flash-2024-05,3,2024-05,0.13,298.4,0.307,1000,79,39,23.0,55.0,,1227.0
7,Gemini-1.5-Flash-2024-09,3,2024-09,0.13,190.5,0.348,1000,75,45,58.0,83.0,83.0,1271.0
8,Gemini-1.5-Flash-8B,3,2024-10,0.07,285.2,0.335,1000,75,30,,70.0,,1211.0
9,Gemini-1.5-Pro-2024-05,1,2024-05,2.19,64.8,0.738,1000,86,46,,66.0,,1260.0


## Load Epoch AI price dataset

In [5]:
# Epoch AI API price sheet: one row per quoted price (model, API vendor, price unit, price date).
api_price_df = pd.read_csv('data/API prices - full view.csv')
api_price_df

Unnamed: 0,Price description,Price,Price Unit,Model,Fine Tuned Model,Price date,Model Version,Organization (model developer),Organization (API vendor),Context Window Tokens,Archived price link,Tags,Notes,Last Modified,Price sheet document,Created By
0,$2.50 / 1M input tokens,$2.50000,$/1M input tokens,GPT-4o,,2024-08-06,gpt-4o-2024-08-06,OpenAI,OpenAI,,https://web.archive.org/web/20240812003133/htt...,OpenAI: 50% off for batch submission,,2/3/2025 3:16pm,,Robi Rahman
1,$10.00 / 1M output tokens,$10.00000,$/1M output tokens,GPT-4o,,2024-08-06,gpt-4o-2024-08-06,OpenAI,OpenAI,,https://web.archive.org/web/20240812003133/htt...,OpenAI: 50% off for batch submission,,2/3/2025 3:16pm,,Robi Rahman
2,$0.638 / 1k 512^2 px input images,$0.63800,$/1k 512^2 px input images,GPT-4o,,2024-08-06,gpt-4o-2024-08-06,OpenAI,OpenAI,,https://web.archive.org/web/20240812003133/htt...,,High-resolution input images are priced as fol...,2/3/2025 3:16pm,,James Sanders
3,$1.275 / 1k 512^2 px input images,$1.27500,$/1k 512^2 px input images,GPT-4o,,2024-05-13,gpt-4o-2024-05-13,OpenAI,OpenAI,,https://web.archive.org/web/20240812003133/htt...,,High-resolution input images are priced as fol...,2/3/2025 3:16pm,,James Sanders
4,$0.213 / 1k low resolution input images,$0.21300,$/1k low resolution input images,GPT-4o,,2024-08-06,gpt-4o-2024-08-06,OpenAI,OpenAI,,https://web.archive.org/web/20240812003133/htt...,,Low-resolution images use 85 input tokens/imag...,2/3/2025 3:16pm,,James Sanders
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,$0.95 / 1M output tokens,$0.95000,$/1M output tokens,Llama 2-70B,,2023-12-01,,Meta AI,DeepInfra,,https://web.archive.org/web/20231201234723/htt...,,,2/3/2025 4:58pm,,James Sanders
151,$0.35 / 1M token,$0.35000,$/1M tokens (input or output),Llama 2-13B,,2023-12-08,,Meta AI,DeepInfra,,https://web.archive.org/web/20231208122434/htt...,,,2/3/2025 5:00pm,,James Sanders
152,$0.22 / 1M token,$0.22000,$/1M tokens (input or output),Llama 2-13B,,2024-04-14,,Meta AI,DeepInfra,,https://web.archive.org/web/20240414171742/htt...,,,2/3/2025 5:00pm,,James Sanders
153,$0.20 / 1M token,$0.20000,$/1M tokens (input or output),Llama 2-7B,,2023-12-01,,Meta AI,DeepInfra,,https://web.archive.org/web/20231201223505/htt...,,,2/3/2025 5:01pm,,James Sanders


In [6]:
# Keep a price row only when it
#   * is not for a fine-tuned model ('Fine Tuned Model' is empty), and
#   * is quoted in one of the two simplest units: $/1M input tokens or
#     $/1M output tokens.
api_price_df = api_price_df.loc[
    lambda prices: prices['Fine Tuned Model'].isna()
    & prices['Price Unit'].isin(['$/1M input tokens', '$/1M output tokens'])
]
api_price_df

Unnamed: 0,Price description,Price,Price Unit,Model,Fine Tuned Model,Price date,Model Version,Organization (model developer),Organization (API vendor),Context Window Tokens,Archived price link,Tags,Notes,Last Modified,Price sheet document,Created By
0,$2.50 / 1M input tokens,$2.50000,$/1M input tokens,GPT-4o,,2024-08-06,gpt-4o-2024-08-06,OpenAI,OpenAI,,https://web.archive.org/web/20240812003133/htt...,OpenAI: 50% off for batch submission,,2/3/2025 3:16pm,,Robi Rahman
1,$10.00 / 1M output tokens,$10.00000,$/1M output tokens,GPT-4o,,2024-08-06,gpt-4o-2024-08-06,OpenAI,OpenAI,,https://web.archive.org/web/20240812003133/htt...,OpenAI: 50% off for batch submission,,2/3/2025 3:16pm,,Robi Rahman
7,$5.00 / 1M input tokens,$5.00000,$/1M input tokens,GPT-4o,,2024-05-13,gpt-4o-2024-05-13,OpenAI,OpenAI,,https://web.archive.org/web/20240812003133/htt...,OpenAI: 50% off for batch submission,,2/3/2025 3:16pm,,James Sanders
8,$15.00 / 1M output tokens,$15.00000,$/1M output tokens,GPT-4o,,2024-05-13,gpt-4o-2024-05-13,OpenAI,OpenAI,,https://web.archive.org/web/20240812003133/htt...,OpenAI: 50% off for batch submission,,2/3/2025 3:16pm,,James Sanders
10,$0.15 / 1M input tokens,$0.15000,$/1M input tokens,GPT-4o mini,,2024-07-18,gpt-4o-mini-2024-07-18,OpenAI,OpenAI,,https://web.archive.org/web/20240812003133/htt...,OpenAI: 50% off for batch submission,,2/3/2025 3:17pm,,James Sanders
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146,$2.19 / 1M output tokens,$2.19000,$/1M output tokens,DeepSeek-R1,,2025-01-20,,DeepSeek,DeepSeek,,https://web.archive.org/web/20250130084330/htt...,,,2/3/2025 4:05pm,,James Sanders
147,$0.27 / 1M input tokens,$0.27000,$/1M input tokens,DeepSeek-V3,,2024-12-26,,DeepSeek,DeepSeek,,https://web.archive.org/web/20250131235106/htt...,,Non cached price (cache miss). They have a tem...,2/3/2025 4:08pm,,James Sanders
148,$1.1 / 1M output tokens,$1.10000,$/1M output tokens,DeepSeek-V3,,2024-12-26,,DeepSeek,DeepSeek,,https://web.archive.org/web/20250131235106/htt...,,They have a temporary promotional sale price. ...,2/3/2025 4:08pm,,James Sanders
149,$0.70 / 1M input tokens,$0.70000,$/1M input tokens,Llama 2-70B,,2023-12-01,,Meta AI,DeepInfra,,https://web.archive.org/web/20231201234723/htt...,,,2/3/2025 4:32pm,,James Sanders


## Merge in Epoch AI data that is not in AA data

In [7]:
# Build sorted, de-duplicated "model (date)" labels for both sources so they
# can be compared side by side.
epoch_price_models_with_date = sorted({
    f'{model} ({price_date})'
    for model, price_date in zip(api_price_df['Model'], api_price_df['Price date'])
})
aa_price_models_with_date = sorted({
    f'{model} ({release_date})'
    for model, release_date in zip(aa_df['Model Name'], aa_df['Release Date'])
})
print('Epoch AI models:', epoch_price_models_with_date)
print('AA models:', aa_price_models_with_date)

Epoch AI models: ['Claude 2 (2024-08-12)', 'Claude 2.1 (2024-08-12)', 'Claude 3 Haiku (2024-08-12)', 'Claude 3 Opus (2024-08-12)', 'Claude 3 Sonnet (2024-08-12)', 'Claude 3.5 Haiku (2024-11-04)', 'Claude 3.5 Sonnet (2024-08-12)', 'Claude 3.5 Sonnet (2024-09-10)', 'Claude Instant (2024-08-12)', 'Cohere Command (2024-08-13)', 'Cohere Command Light (2024-08-13)', 'Command R (2024-08-13)', 'Command R+ (2024-08-13)', 'Command R+ (2024-09-13)', 'DeepSeek-Coder-V2 236B (2024-09-11)', 'DeepSeek-R1 (2025-01-20)', 'DeepSeek-V3 (2024-12-26)', 'GPT-3.5 Turbo (2023-06-13)', 'GPT-3.5 Turbo (2023-11-06)', 'GPT-3.5 Turbo (2024-01-25)', 'GPT-4 (2023-03-14)', 'GPT-4 Turbo (2023-11-06)', 'GPT-4o (2024-05-13)', 'GPT-4o (2024-08-06)', 'GPT-4o mini (2024-07-18)', 'Gemini 1.0 Pro (2024-08-12)', 'Jamba (2024-09-12)', 'Llama 2-70B (2023-12-01)', 'Llama 3-8B (2024-09-01)', 'Llama 3.1-405B (2024-08-18)', 'Llama 3.1-405B (2024-08-22)', 'Llama 3.1-405B (2024-08-24)', 'Llama 3.1-405B (2024-09-05)', 'Llama 3.1-405B 

In [9]:
# Epoch AI "model (price date)" label -> matching Artificial Analysis
# "model (release month)" label, for entries present in both datasets.
model_mappings = {
    # GPT Models
    'GPT-4 (2023-03-14)': 'GPT-4 (2023-03)',
    'GPT-3.5 Turbo (2023-11-06)': 'GPT-3.5 Turbo (2023-11)',
    'GPT-4 Turbo (2023-11-06)': 'GPT-4 Turbo (2023-11)',
    'GPT-4o (2024-05-13)': 'GPT-4o-2024-05 (2024-05)',
    'GPT-4o (2024-08-06)': 'GPT-4o-2024-08 (2024-08)',
    'GPT-4o mini (2024-07-18)': 'GPT-4o-mini (2024-07)',

    # Llama Models
    # NOTE(review): this key carries a YYYY-MM date while the other keys use
    # YYYY-MM-DD, so it may never match an Epoch AI label — confirm.
    'Llama 3.1-405B (2024-07)': 'Llama-3.1-Instruct-405B (2024-07)',
}

# Find models in Epoch AI that are not in AA: any dated label without a
# mapping contributes its bare model name (the text before ' (').
epoch_models_not_in_aa = set()
for epoch_model in epoch_price_models_with_date:
    if epoch_model in model_mappings:
        continue
    model_name = epoch_model.partition(' (')[0]
    epoch_models_not_in_aa.add(model_name)

print('Models in Epoch AI but not in AA:', epoch_models_not_in_aa)


Models in Epoch AI but not in AA: {'Claude 3 Opus', 'Command R+', 'Cohere Command', 'o3-mini', 'o1-mini', 'Claude Instant', 'DeepSeek-Coder-V2 236B', 'Cohere Command Light', 'Mistral 7B', 'o1', 'DeepSeek-V3', 'Claude 3.5 Sonnet', 'Claude 3 Haiku', 'Llama 3.1-70B', 'GPT-3.5 Turbo', 'DeepSeek-R1', 'Llama 3-8B', 'Ministral 3B', 'Gemini 1.0 Pro', 'Command R', 'Claude 2.1', 'Ministral 8B', 'Claude 2', 'Jamba', 'o1-preview', 'Llama 2-70B', 'Llama 3.1-405B', 'Claude 3 Sonnet', 'Claude 3.5 Haiku', 'Reka Core'}


In [10]:
# Keep only the Epoch AI price rows that have no Artificial Analysis counterpart.
#
# `epoch_models_not_in_aa` holds bare model names, and a name lands in it as
# soon as one of its dated entries is unmapped. Filtering on the name alone
# would therefore also keep dated entries that `model_mappings` already ties
# to an AA row, so those are excluded here by rebuilding the same
# "model (price date)" label that the mapping is keyed on.
epoch_model_keys = (
    api_price_df['Model'].astype(str) + ' (' + api_price_df['Price date'].astype(str) + ')'
)
price_data_not_in_aa = api_price_df[
    api_price_df['Model'].isin(epoch_models_not_in_aa)
    & ~epoch_model_keys.isin(list(model_mappings))
]
price_data_not_in_aa


Unnamed: 0,Price description,Price,Price Unit,Model,Fine Tuned Model,Price date,Model Version,Organization (model developer),Organization (API vendor),Context Window Tokens,Archived price link,Tags,Notes,Last Modified,Price sheet document,Created By
15,$15 / 1M input tokens,$15.00000,$/1M input tokens,Claude 3 Opus,,2024-08-12,,Anthropic,Anthropic,,https://archive.is/5C8WA,,,8/13/2024 1:37pm,,Robi Rahman
16,$75 / 1M output tokens,$75.00000,$/1M output tokens,Claude 3 Opus,,2024-08-12,,Anthropic,Anthropic,,https://archive.is/5C8WA,,,8/13/2024 1:37pm,,James Sanders
17,$0.25 / 1M input tokens,$0.25000,$/1M input tokens,Claude 3 Haiku,,2024-08-12,,Anthropic,Anthropic,,https://archive.is/5C8WA,,,8/13/2024 1:37pm,,James Sanders
18,$1.25 / 1M output tokens,$1.25000,$/1M output tokens,Claude 3 Haiku,,2024-08-12,,Anthropic,Anthropic,,https://archive.is/5C8WA,,,8/13/2024 1:37pm,,James Sanders
19,$3 / 1M input tokens,$3.00000,$/1M input tokens,Claude 3.5 Sonnet,,2024-08-12,,Anthropic,Anthropic,,https://archive.is/5C8WA,,,12/7/2024 6:47pm,,James Sanders
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146,$2.19 / 1M output tokens,$2.19000,$/1M output tokens,DeepSeek-R1,,2025-01-20,,DeepSeek,DeepSeek,,https://web.archive.org/web/20250130084330/htt...,,,2/3/2025 4:05pm,,James Sanders
147,$0.27 / 1M input tokens,$0.27000,$/1M input tokens,DeepSeek-V3,,2024-12-26,,DeepSeek,DeepSeek,,https://web.archive.org/web/20250131235106/htt...,,Non cached price (cache miss). They have a tem...,2/3/2025 4:08pm,,James Sanders
148,$1.1 / 1M output tokens,$1.10000,$/1M output tokens,DeepSeek-V3,,2024-12-26,,DeepSeek,DeepSeek,,https://web.archive.org/web/20250131235106/htt...,,They have a temporary promotional sale price. ...,2/3/2025 4:08pm,,James Sanders
149,$0.70 / 1M input tokens,$0.70000,$/1M input tokens,Llama 2-70B,,2023-12-01,,Meta AI,DeepInfra,,https://web.archive.org/web/20231201234723/htt...,,,2/3/2025 4:32pm,,James Sanders


In [40]:
# Group by 'Model' and 'Price date', print every vendor's quoted prices, and
# record one blended price per model/date.
#
# For each vendor that quotes both an input and an output price, the two are
# blended 75% input / 25% output. The model's overall price is the median of
# those blended prices across vendors. Model/date groups in which no vendor
# has both prices are left out of `overall_prices`.
INPUT_PRICE_UNIT = '$/1M input tokens'
OUTPUT_PRICE_UNIT = '$/1M output tokens'
INPUT_TOKEN_WEIGHT = 0.75
OUTPUT_TOKEN_WEIGHT = 0.25


def _parse_usd(price):
    """Convert a price string such as '$2.50000' into a float."""
    return float(price.replace('$', '').replace(',', ''))


overall_prices = []
for (model, price_date), data in price_data_not_in_aa.groupby(['Model', 'Price date']):
    print(f'{model} ({price_date})')
    vendor_prices = {}
    # NOTE(review): groupby drops rows with a missing vendor by default, so a
    # model whose rows all lack a vendor prints only its heading — confirm
    # that this is intended.
    for vendor, vendor_data in data.groupby('Organization (API vendor)'):
        input_price = None
        output_price = None
        print(f'  {vendor}')
        quotes = zip(vendor_data['Price Unit'], vendor_data['Price'], vendor_data['Price date'])
        for unit, price, quoted_on in quotes:
            print(f'    {unit}: {price}. On: {quoted_on}')
            # If a vendor lists the same unit more than once, the last row wins.
            if unit == INPUT_PRICE_UNIT:
                input_price = _parse_usd(price)
            elif unit == OUTPUT_PRICE_UNIT:
                output_price = _parse_usd(price)
        if input_price is not None and output_price is not None:
            wavg_price = input_price * INPUT_TOKEN_WEIGHT + output_price * OUTPUT_TOKEN_WEIGHT
            vendor_prices[vendor] = wavg_price
            print(f'  Weighted average price: ${wavg_price:.2f}')
        else:
            print('  No price data available')
    if len(vendor_prices) > 0:
        overall_prices.append({
            'Model Name': model,
            'Release Date': price_date,
            'USD per 1M Tokens': np.median(list(vendor_prices.values())),
        })
    print()

Claude 2 (2024-08-12)
  Anthropic
    $/1M input tokens: $8.00000. On: 2024-08-12
    $/1M output tokens: $24.00000. On: 2024-08-12
  Weighted average price: $12.00

Claude 2.1 (2024-08-12)
  Anthropic
    $/1M input tokens: $8.00000. On: 2024-08-12
    $/1M output tokens: $24.00000. On: 2024-08-12
  Weighted average price: $12.00

Claude 3 Haiku (2024-08-12)
  Anthropic
    $/1M input tokens: $0.25000. On: 2024-08-12
    $/1M output tokens: $1.25000. On: 2024-08-12
  Weighted average price: $0.50

Claude 3 Opus (2024-08-12)
  Anthropic
    $/1M input tokens: $15.00000. On: 2024-08-12
    $/1M output tokens: $75.00000. On: 2024-08-12
  Weighted average price: $30.00

Claude 3 Sonnet (2024-08-12)
  Anthropic
    $/1M input tokens: $3.00000. On: 2024-08-12
    $/1M output tokens: $15.00000. On: 2024-08-12
  Weighted average price: $6.00

Claude 3.5 Haiku (2024-11-04)

Claude 3.5 Sonnet (2024-08-12)
  Anthropic
    $/1M input tokens: $3.00000. On: 2024-08-12
    $/1M output tokens: $15.00

In [41]:
# Turn the per-model price records into a table, write it to CSV (without the
# index column), and display it.
overall_prices_df = pd.DataFrame(overall_prices)
overall_prices_df.to_csv('data/epoch_ai_price_data_not_in_aa.csv', index=False)
overall_prices_df

Unnamed: 0,Model Name,Release Date,USD per 1M Tokens
0,Claude 2,2024-08-12,12.0
1,Claude 2.1,2024-08-12,12.0
2,Claude 3 Haiku,2024-08-12,0.5
3,Claude 3 Opus,2024-08-12,30.0
4,Claude 3 Sonnet,2024-08-12,6.0
5,Claude 3.5 Sonnet,2024-08-12,6.0
6,Claude Instant,2024-08-12,1.2
7,Cohere Command,2024-08-13,1.625
8,Cohere Command Light,2024-08-13,0.375
9,Command R,2024-08-13,0.75


## Explore the data

In [5]:
# 'Release Date' is a string with the format 'YYYY-MM'; parse it into a
# timestamp. The dtype guard makes this cell safe to re-run: once the column
# is already datetime, `.str` would raise, so the conversion is skipped.
if not pd.api.types.is_datetime64_any_dtype(aa_df['Release Date']):
    aa_df['Release Date'] = pd.to_datetime(aa_df['Release Date'].str.strip(), format='%Y-%m')

In [6]:
# Scatter of MMLU score (x) against price per million tokens (y, log scale).
fig = px.scatter(
    aa_df,
    x='MMLU',
    y='USD per 1M Tokens',
    title='MMLU vs. USD per 1M Tokens',
)
fig.update_layout(yaxis=dict(type='log'))
fig.show()


In [7]:
# Price-performance = MMLU score divided by the price per million tokens.
aa_df['MMLU price-performance'] = aa_df['MMLU'].div(aa_df['USD per 1M Tokens'])

# Scatter of price-performance against release date (log-scaled y axis).
fig = px.scatter(
    aa_df,
    x='Release Date',
    y='MMLU price-performance',
    title='MMLU price-performance over time',
    hover_data=['Model Name'],
)
fig.update_layout(yaxis=dict(type='log'))
fig.show()


In [8]:
# Collect every 'Model Name' that was, at some release date, among the top-n
# models released so far when ranked by 'MMLU price-performance'.
top_n = 1
aa_df = aa_df.sort_values(by='Release Date')
unique_dates = aa_df['Release Date'].sort_values().unique()

ever_top_n_models = set()
for date in unique_dates:
    released_so_far = aa_df[aa_df['Release Date'] <= date]
    leaders = released_so_far.nlargest(top_n, 'MMLU price-performance')
    ever_top_n_models.update(leaders['Model Name'].tolist())

ever_top_n_list = sorted(ever_top_n_models)
print(ever_top_n_list)

# Plot only the models that ever made the top-n, over time.
fig = px.scatter(
    aa_df[aa_df['Model Name'].isin(ever_top_n_list)],
    x='Release Date',
    y='MMLU price-performance',
    title='MMLU price-performance over time',
    hover_data=['Model Name'],
)
fig.update_layout(yaxis=dict(type='log'))
fig.show()

['Claude-3-Haiku', 'GPT-3.5 Turbo', 'GPT-4', 'Gemini-1.5-Flash-2024-05', 'Gemini-1.5-Flash-8B', 'Llama-2-Chat-13B', 'Llama-3-Instruct-8B', 'Llama-3.1-Instruct-8B', 'Llama-3.2-Instruct-3B']


## Try fitting a regression to lowest-priced models above a performance lower bound

In [9]:
"""
  - Set a performance lower bound
  - Track the running best (top) model
  - At each point in time (at some resolution)
    - Filter to new models published in this time window
    - Filter to models with performance above the lower bound
    - Check if any new model is cheaper than current best
    - If so, update the current best
    - Record the current best model at this time point
"""
bench = 'HumanEval'
performance_lower_bound = 80
ts = pd.date_range(start='2020-01-01', end='2025-01-01', freq='MS')
cheapest_models = []
current_best = None

for i, t in enumerate(ts):
    # Models published in this time window: [previous timestamp, t), or
    # everything released before t for the very first timestamp.
    release_dates = aa_df['Release Date']
    in_window = release_dates < t
    if i > 0:
        in_window = in_window & (release_dates >= ts[i - 1])

    # Strictly above the performance bound; a missing score compares False,
    # so models without a score are dropped as well.
    clears_bound = aa_df[bench] > performance_lower_bound
    benchmark_df = aa_df[in_window & clears_bound]
    if benchmark_df.empty:
        continue

    # Cheapest newly released model in the window
    new_best = benchmark_df.loc[benchmark_df['USD per 1M Tokens'].idxmin()]

    # It only becomes the record holder if it undercuts the current one
    # (or if there is no record holder yet).
    sets_new_record = (
        current_best is None
        or new_best['USD per 1M Tokens'] < current_best['USD per 1M Tokens']
    )
    if not sets_new_record:
        continue
    current_best = new_best
    cheapest_models.append(current_best)
    print(t, current_best['Model Name'], current_best[bench], f"${current_best['USD per 1M Tokens']:.2f}")

2023-12-01 00:00:00 GPT-4 Turbo 92.0 $15.00
2024-06-01 00:00:00 GPT-4o-2024-05 93.0 $7.50
2024-07-01 00:00:00 Claude-3.5-Haiku 87.0 $1.60
2024-08-01 00:00:00 GPT-4o-mini 88.0 $0.26
2024-10-01 00:00:00 Gemini-1.5-Flash-2024-09 83.0 $0.13


In [10]:
# One row per model that set a new lowest price; built from the rows
# collected in `cheapest_models`.
cheapest_models_df = pd.DataFrame(cheapest_models)
cheapest_models_df.head()

Unnamed: 0,Model Name,Tier,Release Date,USD per 1M Tokens,Tokens per Second,Prefill Latency (s),Prompt Length (tk),MMLU,GPQA Diamond,MATH 5,MATH-500,HumanEval,LMSys Chatbot Arena ELO,MMLU price-performance
15,GPT-4 Turbo,1,2023-11-01,15.0,39.2,1.246,1000,87,50,36.0,74.0,92.0,1256.0,5.8
16,GPT-4o-2024-05,1,2024-05-01,7.5,86.4,0.687,1000,87,51,48.0,79.0,93.0,1285.0,11.6
3,Claude-3.5-Haiku,3,2024-06-01,1.6,64.2,0.768,1000,81,37,,67.0,87.0,,50.625
19,GPT-4o-mini,2,2024-07-01,0.26,112.2,0.626,1000,82,43,48.0,79.0,88.0,1273.0,315.384615
7,Gemini-1.5-Flash-2024-09,3,2024-09-01,0.13,190.5,0.348,1000,75,45,58.0,83.0,83.0,1271.0,576.923077


In [11]:
# Plot the cheapest models
fig = px.line(cheapest_models_df, x='Release Date', y='USD per 1M Tokens',
                title=f'Price of the cheapest model with {bench} > {performance_lower_bound}%',
                text='Model Name', markers=True,
                line_shape='hv')  # Make line vertical-horizontal
fig.update_traces(textposition='bottom left')
fig.update_layout(yaxis_type='log')
fig.update_layout(
    width=800,
    height=600,
    font=dict(size=10),
)
# if save:
#     save_plot(fig, results_dir, f'aa_cheapest_models_{bench}_above_{performance_lower_bound}')
fig.show()

In [12]:
# Fit a line to the data
cheapest_models_df['price'] = cheapest_models_df['USD per 1M Tokens']
cheapest_models_df['log_price'] = np.log10(cheapest_models_df['USD per 1M Tokens'])
cheapest_models_df['date'] = cheapest_models_df['Release Date'].map(lambda x: pd.Timestamp(x).toordinal())
exponential_model = smf.ols('log_price ~ date', data=cheapest_models_df).fit()
print(exponential_model.summary())

linear_model = smf.ols('price ~ date', data=cheapest_models_df).fit()
print(linear_model.summary())

                            OLS Regression Results                            
Dep. Variable:              log_price   R-squared:                       0.759
Model:                            OLS   Adj. R-squared:                  0.679
Method:                 Least Squares   F-statistic:                     9.454
Date:                Wed, 29 Jan 2025   Prob (F-statistic):             0.0544
Time:                        17:30:31   Log-Likelihood:                -2.4166
No. Observations:                   5   AIC:                             8.833
Df Residuals:                       3   BIC:                             8.052
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   5012.7090   1630.281      3.075      0.0


omni_normtest is not valid with less than 8 observations; 5 samples were given.


omni_normtest is not valid with less than 8 observations; 5 samples were given.



In [13]:
# Calculate annual rate of decrease
annual_slope = exponential_model.params['date'] * 365  # Convert daily to annual
annual_factor = int(round(10**(-annual_slope)))  # Convert log slope to factor

# Plot the exponential trendline with the data
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=cheapest_models_df['Release Date'],
    y=10**exponential_model.predict(cheapest_models_df['date']),
    mode='lines',
    name=f'Trendline: {annual_factor}x decrease per year',
    line=dict(color='lightgrey', dash='dash')
))
fig.add_trace(go.Scatter(
    x=cheapest_models_df['Release Date'],
    y=cheapest_models_df['USD per 1M Tokens'],
    mode='lines+markers+text',
    name='Data',
    text=cheapest_models_df['Model Name'],
    textposition='bottom left',
    line=dict(shape='hv')
))
fig.update_layout(
    title=f'Price of the cheapest model with {bench} > {performance_lower_bound}%'
)
fig.update_traces(textposition='bottom left')
fig.update_layout(yaxis_type='log')
fig.update_layout(xaxis_title='Month')
fig.update_layout(yaxis_title='Price in USD per million tokens')
# Lower the lower x limit
fig.update_layout(xaxis_range=[cheapest_models_df['Release Date'].min() - pd.Timedelta(days=90), cheapest_models_df['Release Date'].max()+pd.Timedelta(days=30)])
fig.update_layout(
    width=800,
    height=600,
    font=dict(size=10),
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="right",
        x=0.99,
        bordercolor="lightgrey",
        borderwidth=1
    )
)
# if save:
#     save_plot(fig, results_dir, f'aa_cheapest_models_{bench}_above_{performance_lower_bound}_with_trendline')
fig.show()

## Regression on lowest-priced models above a performance lower bound

In [14]:
# For every benchmark and every performance lower bound: find the sequence of
# record-low-priced models, fit an exponential price trend to it, save a plot,
# and finally write a per-run results table and a per-benchmark summary.
performance_lower_bounds = range(10, 100, 10)
run_dir = results_dir + 'aa_cheapest_models_run/'
os.makedirs(run_dir, exist_ok=True)

# Column order of the results table. Passing it to pd.DataFrame keeps the
# columns present even when no regression could be fitted at all.
result_columns = [
    'bench',
    'performance_lower_bound',
    'sample_size',
    'start_date',
    'end_date',
    'price_reduction_factor_per_year',
    'r_squared',
]


def _find_cheapest_models(df, bench, performance_lower_bound, ts, report):
    """Return the rows of `df` that successively set a new lowest price.

    Walks the timestamps in `ts`. For each one it looks at the models whose
    'Release Date' falls in [previous timestamp, current timestamp) (for the
    first timestamp: everything released before it), keeps those scoring
    strictly above `performance_lower_bound` on `bench`, and records the
    cheapest of them if it undercuts the current record holder.

    `report(*parts)` is called once for every new record holder.
    """
    cheapest_models = []
    current_best = None
    for window_idx, t in enumerate(ts):
        # Get models published in this time window
        if window_idx > 0:
            prev_t = ts[window_idx - 1]
            window_df = df[(df['Release Date'] >= prev_t) & (df['Release Date'] < t)]
        else:
            window_df = df[df['Release Date'] < t]

        # Filter for performance
        window_df = window_df[window_df[bench].notna()]
        window_df = window_df[window_df[bench] > performance_lower_bound]
        if window_df.empty:
            continue

        # Find cheapest new model
        new_best = window_df.loc[window_df['USD per 1M Tokens'].idxmin()]

        # Update current best if new model is cheaper (or if no current best)
        if current_best is None or new_best['USD per 1M Tokens'] < current_best['USD per 1M Tokens']:
            current_best = new_best
            cheapest_models.append(current_best)
            report(t, current_best['Model Name'], current_best[bench], f"${current_best['USD per 1M Tokens']:.2f}")
    return cheapest_models


def _plot_cheapest_models(cheapest_models_df, exponential_model, annual_factor, bench, performance_lower_bound):
    """Build the figure showing the record-low prices and the fitted trendline."""
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=cheapest_models_df['Release Date'],
        y=10**exponential_model.predict(cheapest_models_df['date']),
        mode='lines',
        name=f'Trendline: {annual_factor}x decrease per year',
        line=dict(color='lightgrey', dash='dash')
    ))
    fig.add_trace(go.Scatter(
        x=cheapest_models_df['Release Date'],
        y=cheapest_models_df['USD per 1M Tokens'],
        mode='lines+markers+text',
        name='Data',
        text=cheapest_models_df['Model Name'],
        textposition='bottom left',
        line=dict(shape='hv')
    ))
    fig.update_layout(
        title=f'Price of the cheapest model with {bench} > {performance_lower_bound}%'
    )
    fig.update_traces(textposition='bottom left')
    fig.update_layout(yaxis_type='log')
    fig.update_layout(xaxis_title='Month')
    fig.update_layout(yaxis_title='Price in USD per million tokens')
    # Lower the lower x limit
    fig.update_layout(xaxis_range=[
        cheapest_models_df['Release Date'].min() - pd.Timedelta(days=90),
        cheapest_models_df['Release Date'].max() + pd.Timedelta(days=30),
    ])
    fig.update_layout(
        width=800,
        height=600,
        font=dict(size=10),
        legend=dict(
            yanchor="top",
            y=0.99,
            xanchor="right",
            x=0.99,
            bordercolor="lightgrey",
            borderwidth=1
        )
    )
    return fig


# Open log file
log_path = run_dir + 'output.log'
results = []
# The monthly grid is the same for every benchmark and bound, so build it once.
ts = pd.date_range(start='2020-01-01', end='2025-01-01', freq='MS')
with open(log_path, 'w') as log_file:

    def report(*parts):
        """Print a line to stdout and mirror the same line into the log file."""
        print(*parts)
        print(*parts, file=log_file)

    for bench_idx, bench in enumerate(benchmarks):
        if bench_idx > 0:
            report('\n')
        report(f'{bench}')

        for performance_lower_bound in performance_lower_bounds:
            report(f'\nPerformance lower bound: {performance_lower_bound}%')
            cheapest_models = _find_cheapest_models(aa_df, bench, performance_lower_bound, ts, report)

            cheapest_models_df = pd.DataFrame(cheapest_models)
            if len(cheapest_models_df) < 2:
                report('Less than 2 models found')
                continue

            # Fit a line to the data
            cheapest_models_df['price'] = cheapest_models_df['USD per 1M Tokens']
            cheapest_models_df['log_price'] = np.log10(cheapest_models_df['USD per 1M Tokens'])
            cheapest_models_df['date'] = cheapest_models_df['Release Date'].map(lambda x: pd.Timestamp(x).toordinal())
            exponential_model = smf.ols('log_price ~ date', data=cheapest_models_df).fit()

            # Calculate annual rate of decrease
            annual_slope = exponential_model.params['date'] * 365  # Convert daily to annual
            annual_factor = int(round(10**(-annual_slope)))  # Convert log slope to factor
            results.append({
                'bench': bench,
                'performance_lower_bound': performance_lower_bound,
                'sample_size': len(cheapest_models_df),
                'start_date': cheapest_models_df['Release Date'].min(),
                'end_date': cheapest_models_df['Release Date'].max(),
                'price_reduction_factor_per_year': annual_factor,
                'r_squared': round(exponential_model.rsquared, 2),
            })

            # Plot the exponential trendline with the data
            fig = _plot_cheapest_models(
                cheapest_models_df, exponential_model, annual_factor, bench, performance_lower_bound
            )
            if save:
                save_plot(
                    fig,
                    run_dir,
                    f'aa_cheapest_models_{bench}_above_{performance_lower_bound}_with_trendline',
                    extensions=['png'],
                )

cheapest_model_results_df = pd.DataFrame(results, columns=result_columns)
cheapest_model_results_df.to_csv(run_dir + 'cheapest_model_results.csv', index=False)

# Create a summary DataFrame
summary_data = []
for bench in benchmarks:
    bench_results = cheapest_model_results_df[cheapest_model_results_df['bench'] == bench]
    # Get all performance lower bounds used for this benchmark
    perf_bounds = sorted(set(bench_results['performance_lower_bound']))

    price_factors = bench_results['price_reduction_factor_per_year'].dropna()
    if len(price_factors) > 0:
        # Geometric mean and range of the price reduction factors
        geomean = round(np.exp(np.mean(np.log(price_factors))))
        factor_range = [int(price_factors.min()), int(price_factors.max())]
    else:
        # No regression was fitted for this benchmark; round(nan) would raise,
        # so the summary row carries NaN and an empty range instead.
        geomean = np.nan
        factor_range = []

    summary_data.append({
        'bench': bench,
        'performance_lower_bounds': perf_bounds,
        'price_reduction_factor_per_year_geomean': geomean,
        'price_reduction_factor_per_year_range': factor_range
    })

cheapest_model_summary_df = pd.DataFrame(summary_data)
cheapest_model_summary_df.to_csv(run_dir + 'cheapest_model_summary.csv', index=False)

MMLU

Performance lower bound: 10%
2023-04-01 00:00:00 GPT-4 86 $37.50
2023-08-01 00:00:00 Llama-2-Chat-7B 13 $0.33
2024-05-01 00:00:00 Llama-3-Instruct-8B 64 $0.15
2024-06-01 00:00:00 Gemini-1.5-Flash-2024-05 79 $0.13
2024-08-01 00:00:00 Llama-3.1-Instruct-8B 71 $0.10
2024-10-01 00:00:00 Llama-3.2-Instruct-1B 35 $0.05

Performance lower bound: 20%
2023-04-01 00:00:00 GPT-4 86 $37.50
2023-08-01 00:00:00 Llama-2-Chat-13B 45 $0.56
2024-04-01 00:00:00 Claude-3-Haiku 71 $0.50
2024-05-01 00:00:00 Llama-3-Instruct-8B 64 $0.15
2024-06-01 00:00:00 Gemini-1.5-Flash-2024-05 79 $0.13
2024-08-01 00:00:00 Llama-3.1-Instruct-8B 71 $0.10
2024-10-01 00:00:00 Llama-3.2-Instruct-1B 35 $0.05

Performance lower bound: 30%
2023-04-01 00:00:00 GPT-4 86 $37.50
2023-08-01 00:00:00 Llama-2-Chat-13B 45 $0.56
2024-04-01 00:00:00 Claude-3-Haiku 71 $0.50
2024-05-01 00:00:00 Llama-3-Instruct-8B 64 $0.15
2024-06-01 00:00:00 Gemini-1.5-Flash-2024-05 79 $0.13
2024-08-01 00:00:00 Llama-3.1-Instruct-8B 71 $0.10
2024-10-

## Regression on top models by price-performance, one benchmark at a time

In [15]:
top_n = 1
results_subdir = f'aa_top_{top_n}_price_performance_run/'
os.makedirs(results_dir + results_subdir, exist_ok=True)

# The regression below estimates three coefficients (intercept, date, bench), so a
# confidence interval on the date slope needs at least four models; with fewer the
# interval is undefined and the int() conversions further down would fail.
min_models_for_regression = 4

# For every benchmark: keep the models that were ever among the top-n by
# price-performance (benchmark score per USD) among all models released up to some
# date, regress log10(price) on release date and score, then plot and record the fit.
results = []
for bench in benchmarks:
    # Only models with a score on this benchmark, in chronological order
    benchmark_df = aa_df.dropna(subset=[bench]).sort_values(by='Release Date')
    benchmark_df[f'{bench} price-performance'] = benchmark_df[bench] / benchmark_df['USD per 1M Tokens']

    # Find the rolling top-n models: at each release date, rank everything released
    # so far and remember whichever models make the top-n at least once
    unique_dates = benchmark_df['Release Date'].unique()
    ever_top_n_models = set()
    for date in unique_dates:
        df_up_to_date = benchmark_df[benchmark_df['Release Date'] <= date]
        top_n_models = df_up_to_date.nlargest(top_n, f'{bench} price-performance')
        top_n_model_names = top_n_models['Model Name'].tolist()
        ever_top_n_models.update(top_n_model_names)
    ever_top_n_list = sorted(ever_top_n_models)
    # .copy() so the column assignments below write to an independent frame rather
    # than to a slice of the unfiltered one
    benchmark_df = benchmark_df[benchmark_df['Model Name'].isin(ever_top_n_list)].copy()

    if len(benchmark_df) < min_models_for_regression:
        print(f'Fewer than {min_models_for_regression} models found for {bench}; skipping regression')
        continue

    # Fit a line to the data: log10(price) ~ date (as a day ordinal) + benchmark score
    benchmark_df['price'] = benchmark_df['USD per 1M Tokens']
    benchmark_df['log_price'] = np.log10(benchmark_df['USD per 1M Tokens'])
    benchmark_df['bench'] = benchmark_df[bench]
    benchmark_df['date'] = benchmark_df['Release Date'].map(lambda x: pd.Timestamp(x).toordinal())
    exponential_model = smf.ols('log_price ~ date + bench', data=benchmark_df).fit()

    # Calculate annual rate of decrease
    annual_slope = exponential_model.params['date'] * 365  # Convert daily to annual
    annual_factor = int(round(10**(-annual_slope)))  # Convert log slope to factor
    # alpha=0.1 requests a 90% confidence interval on the daily slope
    annual_slope_ci = exponential_model.conf_int(alpha=0.1).loc['date']
    # The factor is 10**(-slope), so the lower slope bound yields the upper factor bound
    annual_slope_ci_high = int(round(10**(-annual_slope_ci[0] * 365)))
    annual_slope_ci_low = int(round(10**(-annual_slope_ci[1] * 365)))
    results.append({
        'bench': bench,
        'sample_size': len(benchmark_df),
        'start_date': benchmark_df['Release Date'].min(),
        'end_date': benchmark_df['Release Date'].max(),
        'price_reduction_factor_per_year_mean': annual_factor,
        'price_reduction_factor_per_year_ci': [annual_slope_ci_low, annual_slope_ci_high],
        'r_squared': round(exponential_model.rsquared, 2),
    })

    # Plot the fitted prices together with the observed prices
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=benchmark_df['Release Date'],
        # Back-transform the fitted log10(price) values to USD
        y=10**exponential_model.predict(benchmark_df),
        mode='lines',
        name=f'Regression: {annual_factor}x decrease per year at fixed performance',
        line=dict(color='lightgrey', dash='dash')
    ))
    fig.add_trace(go.Scatter(
        x=benchmark_df['Release Date'],
        y=benchmark_df['USD per 1M Tokens'],
        mode='markers+text',
        name='Model with the best price-performance at the time',
        text=benchmark_df['Model Name'],
        textposition='bottom left',
    ))
    fig.update_layout(
        title=f'Price of models with the best price-performance on {bench}',
        yaxis_type='log',
        xaxis_title='Month',
        yaxis_title='Price in USD per million tokens',
        # Pad the x range (90 days before, 30 days after) so the point labels,
        # drawn to the bottom left of each marker, have room
        xaxis_range=[
            benchmark_df['Release Date'].min() - pd.Timedelta(days=90),
            benchmark_df['Release Date'].max() + pd.Timedelta(days=30),
        ],
        width=800,
        height=600,
        font=dict(size=10),
        legend=dict(
            yanchor="top",
            y=0.99,
            xanchor="right",
            x=0.99,
            bordercolor="lightgrey",
            borderwidth=1
        )
    )
    if save:
        save_plot(
            fig,
            results_dir + results_subdir,
            f'top_{top_n}_price_performance_{bench}_with_trendline',
            extensions=['png'],
        )

# One row per benchmark that had enough models to fit
top_n_price_performance_results_df = pd.DataFrame(results)
top_n_price_performance_results_df.to_csv(results_dir + results_subdir + 'top_n_price_performance_results.csv', index=False)

## Regression on all data, one benchmark at a time

In [16]:
# Regress log10(price) on release date and benchmark score across every model.
# The regression inputs are written onto aa_df itself, so later cells can reuse them.
bench = 'MMLU'
aa_df['log_price'] = np.log10(aa_df['USD per 1M Tokens'])
# Release dates become day ordinals, so the fitted date coefficient is a per-day slope
aa_df['date'] = [pd.Timestamp(released).toordinal() for released in aa_df['Release Date']]
aa_df['bench'] = aa_df[bench]

# Ordinary least squares fit, followed by the full statsmodels summary table
model = smf.ols(formula='log_price ~ date + bench', data=aa_df).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:              log_price   R-squared:                       0.542
Model:                            OLS   Adj. R-squared:                  0.514
Method:                 Least Squares   F-statistic:                     18.96
Date:                Wed, 29 Jan 2025   Prob (F-statistic):           3.71e-06
Time:                        17:30:34   Log-Likelihood:                -27.260
No. Observations:                  35   AIC:                             60.52
Df Residuals:                      32   BIC:                             65.19
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   2282.2475    534.586      4.269      0.0

In [17]:
# Show the fitted coefficients, then their confidence intervals (statsmodels default level)
for fitted_table in (model.params, model.conf_int()):
    print(fitted_table)


Intercept    2282.247526
date           -0.003092
bench           0.035833
dtype: float64
                     0            1
Intercept  1193.331122  3371.163930
date         -0.004566    -0.001618
bench         0.022982     0.048685


In [18]:
# The date coefficient is a per-day slope of log10(price); scale it to a yearly slope
date_param = 365 * model.params['date']
# Negating the slope and exponentiating gives the factor by which price falls per year
annual_factor = int(round(10 ** -date_param))
print(f'It costs {annual_factor}x less each year to keep {bench} performance fixed.')

It costs 13x less each year to keep MMLU performance fixed.


In [19]:
# Lower confidence bound (column 0) of the per-day date coefficient
model.conf_int().loc['date', 0]


-0.004565847685129619

In [20]:
# Fit log10(price) ~ date + score separately for each benchmark and collect one
# summary record per benchmark. Relies on the 'log_price' and 'date' columns of aa_df.
results = []
for bench in benchmarks:
    print(f'{bench}')
    # Models without a score on this benchmark are excluded from the fit
    df = aa_df.dropna(subset=[bench]).assign(bench=lambda frame: frame[bench])

    model = smf.ols('log_price ~ date + bench', data=df).fit()
    # Print number of observations and R-squared
    print(f'Number of observations: {len(df)}')
    print(f'R-squared: {model.rsquared:.2f}')

    # Per-day slope and its 90% confidence bounds (alpha=0.1), scaled to per-year
    date_param = model.params['date'] * 365
    date_param_ci = model.conf_int(alpha=0.1).loc['date']
    date_param_ci_low, date_param_ci_high = (bound * 365 for bound in date_param_ci)

    # Convert log slopes to yearly reduction factors. The factor is 10**(-slope),
    # so the low slope bound produces the high factor bound and vice versa.
    annual_factor, annual_factor_ci_high, annual_factor_ci_low = (
        int(round(10**(-slope)))
        for slope in (date_param, date_param_ci_low, date_param_ci_high)
    )

    # Recover calendar dates from the day ordinals to report the covered period
    min_date, max_date = (
        pd.Timestamp.fromordinal(ordinal).date()
        for ordinal in (df["date"].min(), df["date"].max())
    )
    date_range = max_date - min_date
    results.append(dict(
        bench=bench,
        sample_size=len(df),
        start_date=min_date,
        end_date=max_date,
        price_reduction_factor_per_year_mean=annual_factor,
        price_reduction_factor_per_year_ci=[annual_factor_ci_low, annual_factor_ci_high],
        r_squared=round(model.rsquared, 2),
    ))

    print(f'The price to achieve a fixed level of {bench} performance fell at a rate of')
    print(f'{annual_factor}x [{annual_factor_ci_low}, {annual_factor_ci_high}] per year')
    print(f'over {date_range.days / 365:.1f} years ({min_date} to {max_date})')
    print()

# Persist the per-benchmark summary next to the other results
single_benchmark_regression_results_df = pd.DataFrame(results)
single_benchmark_regression_results_df.to_csv(results_dir + 'single_benchmark_regression_results.csv', index=False)

MMLU
Number of observations: 35
R-squared: 0.54
The price to achieve a fixed level of MMLU performance fell at a rate of
13x [5, 38] per year
over 1.7 years (2023-03-01 to 2024-11-01)

GPQA Diamond
Number of observations: 35
R-squared: 0.65
The price to achieve a fixed level of GPQA Diamond performance fell at a rate of
17x [7, 42] per year
over 1.7 years (2023-03-01 to 2024-11-01)

MATH 5
Number of observations: 24
R-squared: 0.49
The price to achieve a fixed level of MATH 5 performance fell at a rate of
237x [19, 2914] per year
over 0.9 years (2023-11-01 to 2024-10-01)

MATH-500
Number of observations: 33
R-squared: 0.47
The price to achieve a fixed level of MATH-500 performance fell at a rate of
53x [13, 221] per year
over 1.7 years (2023-03-01 to 2024-11-01)

HumanEval
Number of observations: 31
R-squared: 0.49
The price to achieve a fixed level of HumanEval performance fell at a rate of
16x [5, 53] per year
over 1.7 years (2023-03-01 to 2024-11-01)



## Regression on all data, all benchmarks at once

In [21]:
# Benchmark columns get lowercase, underscore-separated names so they can be used
# as terms in the regression formula
new_bench_cols = [col.lower().translate(str.maketrans(' -', '__')) for col in benchmarks]
# Only models scored on every benchmark can enter the joint regression
df = aa_df.rename(columns=dict(zip(benchmarks, new_bench_cols))).dropna(subset=new_bench_cols)

# log10(price) regressed on date plus all benchmark scores at once
model = smf.ols('log_price ~ ' + ' + '.join(['date', *new_bench_cols]), data=df).fit()
# Print number of observations and R-squared
print(f'Number of observations: {len(df)}')
print(f'R-squared: {model.rsquared:.2f}')

# Per-day slope and its 90% confidence bounds (alpha=0.1), scaled to per-year
date_param = model.params['date'] * 365
date_param_ci = model.conf_int(alpha=0.1).loc['date']
date_param_ci_low, date_param_ci_high = (bound * 365 for bound in date_param_ci)

# Convert log slopes to yearly reduction factors. The factor is 10**(-slope),
# so the low slope bound produces the high factor bound and vice versa.
annual_factor, annual_factor_ci_high, annual_factor_ci_low = (
    int(round(10**(-slope)))
    for slope in (date_param, date_param_ci_low, date_param_ci_high)
)

# Recover calendar dates from the day ordinals to report the covered period
min_date, max_date = (
    pd.Timestamp.fromordinal(ordinal).date()
    for ordinal in (df["date"].min(), df["date"].max())
)
date_range = max_date - min_date
results = [dict(
    benchmarks=benchmarks,
    sample_size=len(df),
    start_date=min_date,
    end_date=max_date,
    price_reduction_factor_per_year_mean=annual_factor,
    price_reduction_factor_per_year_ci=[annual_factor_ci_low, annual_factor_ci_high],
    r_squared=round(model.rsquared, 2),
)]

print(f'The price to achieve a fixed level of performance fell at a rate of')
print(f'{annual_factor}x [{annual_factor_ci_low}, {annual_factor_ci_high}] per year')
print(f'over {date_range.days / 365:.1f} years ({min_date} to {max_date})')
print()

# Single-row summary, saved next to the other results
all_benchmarks_regression_results_df = pd.DataFrame(results)
all_benchmarks_regression_results_df.to_csv(results_dir + 'all_benchmarks_regression_results.csv', index=False)

Number of observations: 23
R-squared: 0.84
The price to achieve a fixed level of performance fell at a rate of
47x [8, 272] per year
over 0.9 years (2023-11-01 to 2024-10-01)



## Regression on top models by price-performance on any benchmark, all benchmarks at once

## Summary and comparison of methods

In [23]:
# Print, benchmark by benchmark, the yearly price-reduction estimate and its
# interval from each of the four methods so they can be compared side by side
def _first_row_for(frame, bench_name):
    """Return the first row of `frame` whose 'bench' column equals `bench_name`."""
    return frame[frame['bench'] == bench_name].iloc[0]


for bench in benchmarks:
    print(f'{bench} trends')
    cheapest_model_result = _first_row_for(cheapest_model_summary_df, bench)
    top_n_price_performance_result = _first_row_for(top_n_price_performance_results_df, bench)
    single_benchmark_regression_result = _first_row_for(single_benchmark_regression_results_df, bench)
    # The joint regression has a single row that applies to every benchmark
    all_benchmarks_regression_result = all_benchmarks_regression_results_df.iloc[0]

    # (label, result row, column with the point estimate, column with the interval)
    method_rows = (
        (
            'Cheapest model at threshold:',
            cheapest_model_result,
            'price_reduction_factor_per_year_geomean',
            'price_reduction_factor_per_year_range',
        ),
        (
            'Top model by price-performance:',
            top_n_price_performance_result,
            'price_reduction_factor_per_year_mean',
            'price_reduction_factor_per_year_ci',
        ),
        (
            'Single benchmark regression:',
            single_benchmark_regression_result,
            'price_reduction_factor_per_year_mean',
            'price_reduction_factor_per_year_ci',
        ),
        (
            'All benchmarks regression:',
            all_benchmarks_regression_result,
            'price_reduction_factor_per_year_mean',
            'price_reduction_factor_per_year_ci',
        ),
    )
    for label, result_row, estimate_col, interval_col in method_rows:
        print(label, result_row[estimate_col], result_row[interval_col])
    print()

MMLU trends
Cheapest model at threshold: 42 [25, 96]
Top model by price-performance: 29 [15, 57]
Single benchmark regression: 13 [5, 38]
All benchmarks regression: 47 [8, 272]

GPQA Diamond trends
Cheapest model at threshold: 71 [27, 331]
Top model by price-performance: 48 [13, 173]
Single benchmark regression: 17 [7, 42]
All benchmarks regression: 47 [8, 272]

MATH 5 trends
Cheapest model at threshold: 1737 [26, 288635]
Top model by price-performance: 31 [3, 307]
Single benchmark regression: 237 [19, 2914]
All benchmarks regression: 47 [8, 272]

MATH-500 trends
Cheapest model at threshold: 84 [13, 390]
Top model by price-performance: 112 [24, 513]
Single benchmark regression: 53 [13, 221]
All benchmarks regression: 47 [8, 272]

HumanEval trends
Cheapest model at threshold: 52 [10, 299]
Top model by price-performance: 40 [14, 112]
Single benchmark regression: 16 [5, 53]
All benchmarks regression: 47 [8, 272]

