In [1]:
from collections import defaultdict
import numpy as np
import os
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
import statsmodels.formula.api as smf

from data import load_pcd_df
from plotting import save_plot

pio.templates.default = "plotly_white"

In [2]:
results_dir = 'results/2025-03-04/'
os.makedirs(results_dir, exist_ok=True)

In [3]:
save = True
benchmarks = ['MMLU', 'GPQA Diamond', 'MATH-500', 'MATH 5', 'HumanEval', 'LMSys Chatbot Arena ELO']
benchmark_is_mqa = {'MMLU': True, 'GPQA Diamond': True, 'MATH 5': False, 'MATH-500': False, 'HumanEval': False, 'LMSys Chatbot Arena ELO': False}
min_num_data_points_for_regression = 4
minimum_date_for_cheapest_models = '2024-01-01'
maximum_datespan_after_threshold_model = 365  # days

## Load Artificial Analysis dataset

In [4]:
aa_df = pd.read_csv('data/aa_data_with_math5.csv')
aa_df

Unnamed: 0,Model Name,Release Date,USD per 1M Tokens,MMLU,GPQA Diamond,HumanEval,MATH-500,LMSys Chatbot Arena ELO,MATH 5,Tokens per Second,Prefill Latency (s),Prompt Length (tk)
0,Claude-3-Haiku,2024-03-04,0.5,71,33,77.0,39.0,1179.0,13.0,122.7,0.467,1000.0
1,Claude-3-Opus,2024-03-04,30.0,84,50,83.0,64.0,1248.0,34.0,26.5,1.984,1000.0
2,Claude-3-Sonnet,2024-03-04,6.0,77,37,71.0,41.0,1201.0,16.0,61.8,0.789,1000.0
3,Claude-3.5-Haiku,2024-10-22,1.6,81,37,87.0,67.0,1236.0,,64.2,0.768,1000.0
4,Claude-3.5-Sonnet-2024-06,2024-06-20,6.0,88,56,90.0,71.0,1268.0,46.0,55.9,0.906,1000.0
5,Claude-3.5-Sonnet-2024-10,2024-10-22,6.0,89,58,96.0,76.0,1282.0,53.0,55.2,0.907,1000.0
6,Gemini 2.0 Flash,2025-02-05,0.175,88,62,90.0,93.0,1358.0,82.0,,,
7,Gemini-1.5-Flash-2024-05,2024-05-10,0.13,79,39,,55.0,1227.0,23.0,298.4,0.307,1000.0
8,Gemini-1.5-Flash-2024-09,2024-09-24,0.13,75,45,83.0,83.0,1271.0,58.0,190.5,0.348,1000.0
9,Gemini-1.5-Flash-8B,2024-10-03,0.07,75,30,12.0,70.0,1211.0,,285.2,0.335,1000.0


In [5]:
# 'Release Date' is a string with the format 'YYYY-MM'
aa_df['Release Date'] = pd.to_datetime(aa_df['Release Date'].str.strip(), format='%Y-%m-%d')

## Load Epoch AI price dataset

In [6]:
api_price_df = pd.read_csv('data/epoch_ai_price_data_not_in_aa_with_benchmarks.csv')
api_price_df['Release Date'] = pd.to_datetime(api_price_df['Release Date'].str.strip(), format='%Y-%m-%d')
api_price_df

Unnamed: 0,Model Name,Release Date,USD per 1M Tokens,MMLU,GPQA Diamond,HumanEval,MATH-500,LMSys Chatbot Arena ELO,MATH 5,Tokens per Second,Prefill Latency (s),Prompt Length (tk)
0,Claude 2,2024-08-12,12.0,78.5,35.0,,,,10.0,,,
1,Claude 2.1,2024-08-12,12.0,,36.0,16.0,,,11.0,,,
2,Claude Instant,2024-08-12,1.2,,,,,,,,,
3,Cohere Command,2024-08-13,1.625,,,,,,,,,
4,Cohere Command Light,2024-08-13,0.375,,,,,,,,,
5,Command R,2024-08-13,0.75,,,42.0,15.0,1180.0,,,,
6,Command R+,2024-08-13,6.0,75.7,34.0,63.0,40.0,1215.0,,,,
7,Command R+,2024-09-13,4.375,75.7,34.0,63.0,40.0,1215.0,,,,
8,DeepSeek-Coder-V2 236B,2024-09-11,0.175,79.2,,87.0,74.0,1178.0,,,,
9,DeepSeek-R1,2025-01-20,0.96,,71.7,98.0,96.0,1362.0,93.1,,,


In [7]:
# Merge the two datasets
aa_df = pd.concat([aa_df, api_price_df])
aa_df.dropna(subset=['USD per 1M Tokens'], inplace=True)
aa_df.sort_values(by='Release Date', inplace=True)
# Reset the index
aa_df.reset_index(drop=True, inplace=True)
aa_df

Unnamed: 0,Model Name,Release Date,USD per 1M Tokens,MMLU,GPQA Diamond,HumanEval,MATH-500,LMSys Chatbot Arena ELO,MATH 5,Tokens per Second,Prefill Latency (s),Prompt Length (tk)
0,GPT-3 175B (davinci),2021-11-20,60.000,43.9,,,,,,,,
1,GPT-3 175B (davinci),2022-08-31,60.000,43.9,,,,,,,,
2,GPT-3 175B (davinci),2022-09-01,20.000,43.9,,,,,,,,
3,GPT-3.5,2022-11-30,20.000,64.8,,,,,,,,
4,GPT-3.5 Turbo,2023-03-06,2.000,68.0,,,,1106.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
68,DeepSeek-R1,2025-01-20,0.960,,71.7,98.0,96.0,1362.0,93.1,,,
69,Mistral Small 3,2025-01-30,0.475,82.0,46.0,85.0,74.0,1210.0,45.0,,,
70,o1-mini,2025-01-31,1.925,,59.5,97.0,94.0,1308.0,84.3,,,
71,o3-mini,2025-01-31,1.925,,74.3,97.0,97.0,1306.0,95.2,,,


In [8]:
# Rename all instances of 'GPT-3 175B (davinci)' to 'GPT-3'
aa_df.loc[aa_df['Model Name'] == 'GPT-3 175B (davinci)', 'Model Name'] = 'GPT-3'
aa_df


Unnamed: 0,Model Name,Release Date,USD per 1M Tokens,MMLU,GPQA Diamond,HumanEval,MATH-500,LMSys Chatbot Arena ELO,MATH 5,Tokens per Second,Prefill Latency (s),Prompt Length (tk)
0,GPT-3,2021-11-20,60.000,43.9,,,,,,,,
1,GPT-3,2022-08-31,60.000,43.9,,,,,,,,
2,GPT-3,2022-09-01,20.000,43.9,,,,,,,,
3,GPT-3.5,2022-11-30,20.000,64.8,,,,,,,,
4,GPT-3.5 Turbo,2023-03-06,2.000,68.0,,,,1106.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
68,DeepSeek-R1,2025-01-20,0.960,,71.7,98.0,96.0,1362.0,93.1,,,
69,Mistral Small 3,2025-01-30,0.475,82.0,46.0,85.0,74.0,1210.0,45.0,,,
70,o1-mini,2025-01-31,1.925,,59.5,97.0,94.0,1308.0,84.3,,,
71,o3-mini,2025-01-31,1.925,,74.3,97.0,97.0,1306.0,95.2,,,


In [9]:
aa_df['Model Name and Date'] = aa_df['Model Name'] + ' (' + aa_df['Release Date'].dt.strftime('%Y-%m') + ')'
aa_df['Model Name and Date']

0                GPT-3 (2021-11)
1                GPT-3 (2022-08)
2                GPT-3 (2022-09)
3              GPT-3.5 (2022-11)
4        GPT-3.5 Turbo (2023-03)
                 ...            
68         DeepSeek-R1 (2025-01)
69     Mistral Small 3 (2025-01)
70             o1-mini (2025-01)
71             o3-mini (2025-01)
72    Gemini 2.0 Flash (2025-02)
Name: Model Name and Date, Length: 73, dtype: object

## Merge in the data from evaluation logs

In [10]:
# Saved in save_evaluation_data.ipynb
eval_log_df = pd.read_csv('data/epoch_ai_eval_data.csv')
eval_log_df.head()

Unnamed: 0,Model,Task,epochs,total_samples,input_tokens,output_tokens,total_tokens
0,Yi-34B-Chat,MATH level 5,8,10592,2222376,7308455,9530831
1,Qwen2.5-72B-Instruct,MATH level 5,8,10592,2243320,9708977,11952297
2,gpt-4-0613,MATH level 5,8,10592,1982888,3620454,5603342
3,o1-mini-2024-09-12_high,MATH level 5,4,5296,988900,9914854,10903754
4,gpt-4-turbo-2024-04-09,OTIS Mock AIME 2024-2025,8,360,65592,272569,338161


In [11]:
print(sorted(eval_log_df['Model'].unique(), key=lambda x: str(x).lower()))

['claude-2.0', 'claude-2.1', 'claude-3-5-haiku-20241022', 'claude-3-5-sonnet-20240620', 'claude-3-5-sonnet-20241022', 'claude-3-7-sonnet-20250219', 'claude-3-7-sonnet-20250219_16K', 'claude-3-haiku-20240307', 'claude-3-opus-20240229', 'claude-3-sonnet-20240229', 'dbrx-instruct', 'deepseek-llm-67b-chat', 'DeepSeek-R1', 'DeepSeek-V3', 'Eurus-2-7B-PRIME', 'gemini-1.0-pro-001', 'gemini-1.5-flash-001', 'gemini-1.5-flash-002', 'gemini-1.5-flash-8b-001', 'gemini-1.5-pro-001', 'gemini-1.5-pro-002', 'gemini-2.0-flash-001', 'gemini-2.0-flash-thinking-exp-01-21', 'gemini-2.0-pro-exp-02-05', 'gemma-2-27b-it', 'gemma-2-9b-it', 'gpt-3.5-turbo-0125', 'gpt-3.5-turbo-1106', 'gpt-4-0125-preview', 'gpt-4-0613', 'gpt-4-1106-preview', 'gpt-4-turbo-2024-04-09', 'gpt-4.5-preview-2025-02-27', 'gpt-4o-2024-05-13', 'gpt-4o-2024-08-06', 'gpt-4o-2024-11-20', 'gpt-4o-mini-2024-07-18', 'grok-2-1212', 'Hermes-2-Theta-Llama-3-70B', 'Llama-2-70b-chat-hf', 'Llama-3.1-405B-Instruct', 'Llama-3.1-70B-Instruct', 'Llama-3.1

In [12]:
model_mapping = {
    # Claude models
    'claude-3-haiku-20240307': 'Claude-3-Haiku',
    'claude-3-opus-20240229': 'Claude-3-Opus',
    'claude-3-sonnet-20240229': 'Claude-3-Sonnet',
    'claude-3-5-sonnet-20240620': 'Claude-3.5-Sonnet-2024-06',
    'claude-3-5-sonnet-20241022': 'Claude-3.5-Sonnet-2024-10',
    'claude-2.0': 'Claude 2',
    'claude-2.1': 'Claude 2.1',
    
    # Gemini models
    'gemini-2.0-flash-001': 'Gemini 2.0 Flash',
    # 'gemini-2.0-flash-thinking-exp-01-21': 'Gemini 2.0 Flash Thinking',
    'gemini-1.5-flash-001': 'Gemini-1.5-Flash-2024-05',
    'gemini-1.5-flash-002': 'Gemini-1.5-Flash-2024-09',
    'gemini-1.5-pro-001': 'Gemini-1.5-Pro-2024-05',
    'gemini-1.5-pro-002': 'Gemini-1.5-Pro-2024-09',
    'gemini-1.0-pro-001': 'Gemini 1.0 Pro',
    
    # Gemma models
    'gemma-2-27b-it': 'Gemma-2-27B',
    'gemma-2-9b-it': 'Gemma-2-9B',
    
    # GPT models
    'gpt-3.5-turbo-0125': 'GPT-3.5-Turbo-2024-01',
    'gpt-3.5-turbo-1106': 'GPT-3.5-Turbo-2023-11',
    'gpt-4-0613': 'GPT-4-0314',
    'gpt-4-1106-preview': 'GPT-4-1106',
    'gpt-4-0125-preview': 'GPT-4-0125',
    'gpt-4-turbo-2024-04-09': 'GPT-4 Turbo',
    'gpt-4o-2024-05-13': 'GPT-4o-2024-05',
    'gpt-4o-2024-08-06': 'GPT-4o-2024-08',
    'gpt-4o-2024-11-20': 'GPT-4o-2024-11',
    'gpt-4o-mini-2024-07-18': 'GPT-4o-mini',
    
    # Llama models
    'Llama-2-70b-chat-hf': 'Llama 2-70B Chat',
    'Meta-Llama-3-8B-Instruct': 'Llama-3-Instruct-8B',
    'Meta-Llama-3-70B-Instruct': 'Llama-3-Instruct-70B',
    'Meta-Llama-3.1-8B-Instruct': 'Llama-3.1-Instruct-8B',
    'Meta-Llama-3.1-70B-Instruct': 'Llama-3.1-Instruct-70B',
    'Meta-Llama-3.1-405B-Instruct': 'Llama-3.1-Instruct-405B',
    # 'llama-3.1-405b-instruct-maas': 'Llama-3.1-Instruct-405B',
    # 'llama-v3p3-70b-instruct': 'Llama-3.1-Instruct-70B',
    # 'Llama-3.1-Tulu-3-70B-DPO': 'Llama-3.1-Instruct-70B',
    
    # Mistral models
    'Mistral-7B-Instruct-v0.3': 'Mistral 7B',
    'mistral-large-2402': 'Mistral-Large-2024-02',
    'mistral-large-2407': 'Mistral-Large-2-2024-06',
    'open-mistral-7b': 'Mistral 7B',
    'open-mistral-nemo-2407': 'Mistral-NeMo',
    'mistral-small-2501': 'Mistral Small 3',
    'open-mixtral-8x22b': 'Mistral-8x22',
    # 'open-mixtral-8x7b': 'Mistral 7B',
    # 'Mixtral-8x7B-Instruct-v0.1': 'Mistral 7B',
    # 'WizardLM-2-8x22B': 'Mistral-8x22',
    
    # Phi models
    'phi-4': 'Phi 4',
    
    # Anthropic o models
    'o1-2024-12-17_high': 'o1',
    'o1-2024-12-17_medium': 'o1',
    'o1-mini-2024-09-12_high': 'o1-mini',
    'o1-mini-2024-09-12_medium': 'o1-mini',
    'o1-preview-2024-09-12_medium': 'o1-preview',
    'o3-mini-2025-01-31_high': 'o3-mini',
    'o3-mini-2025-01-31_medium': 'o3-mini',
    
    # DeepSeek models
    'deepseek-r1': 'DeepSeek-R1',
    'deepseek-v3': 'DeepSeek-V3',
    'DeepSeek-V3': 'DeepSeek-V3',
    
    # Qwen models (no exact matches in Set 2, but including for completeness)
    # 'qwen2p5-72b-instruct': 'Qwen2.5-72B-Instruct',
    # 'Qwen2.5-32B-Instruct': 'Qwen2.5-32B-Instruct',
    # 'Qwen2.5-72B-Instruct': 'Qwen2.5-72B-Instruct',
    # 'Qwen1.5-72B-Chat': 'Qwen1.5-72B-Chat',
    # 'Qwen1.5-32B-Chat': 'Qwen1.5-32B-Chat',
    # 'Qwen2-72B-Instruct': 'Qwen2-72B-Instruct',
    
    # Other models without clear matches
    # 'Yi-34B-Chat': 'Yi-34B-Chat',
    # 'Yi-1.5-34B-Chat': 'Yi-1.5-34B-Chat',
    # 'dbrx-instruct': 'dbrx-instruct',
    # 'grok-2-1212': 'grok-2-1212',
    # 'Hermes-2-Theta-Llama-3-70B': 'Hermes-2-Theta-Llama-3-70B',
    # 'Eurus-2-7B-PRIME': 'Eurus-2-7B-PRIME',
    # 'ministral-8b-2410': 'ministral-8b-2410',
    # 'ministral-3b-2410': 'ministral-3b-2410'
}

In [13]:
# Keep all rows from df, even if no match in aa_df
df_temp = eval_log_df.copy()
df_temp['Model Name'] = df_temp['Model'].map(model_mapping)
merged_df = pd.merge(df_temp, aa_df, on='Model Name', how='left')

# Keep all rows from aa_df, even if no match in df
df_temp = eval_log_df.copy()
df_temp['Model Name'] = df_temp['Model'].map(model_mapping)
merged_df = pd.merge(df_temp, aa_df, on='Model Name', how='right')

merged_df

Unnamed: 0,Model,Task,epochs,total_samples,input_tokens,output_tokens,total_tokens,Model Name,Release Date,USD per 1M Tokens,MMLU,GPQA Diamond,HumanEval,MATH-500,LMSys Chatbot Arena ELO,MATH 5,Tokens per Second,Prefill Latency (s),Prompt Length (tk),Model Name and Date
0,,,,,,,,GPT-3,2021-11-20,60.000,43.9,,,,,,,,,GPT-3 (2021-11)
1,,,,,,,,GPT-3,2022-08-31,60.000,43.9,,,,,,,,,GPT-3 (2022-08)
2,,,,,,,,GPT-3,2022-09-01,20.000,43.9,,,,,,,,,GPT-3 (2022-09)
3,,,,,,,,GPT-3.5,2022-11-30,20.000,64.8,,,,,,,,,GPT-3.5 (2022-11)
4,,,,,,,,GPT-3.5 Turbo,2023-03-06,2.000,68.0,,,,1106.0,,,,,GPT-3.5 Turbo (2023-03)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,o3-mini-2025-01-31_high,OTIS Mock AIME 2024-2025,8.0,360.0,65016.0,4697771.0,4762787.0,o3-mini,2025-01-31,1.925,,74.3,97.0,97.0,1306.0,95.2,,,,o3-mini (2025-01)
141,o3-mini-2025-01-31_medium,MATH level 5,8.0,10592.0,1892201.0,17775998.0,19668199.0,o3-mini,2025-01-31,1.925,,74.3,97.0,97.0,1306.0,95.2,,,,o3-mini (2025-01)
142,gemini-2.0-flash-001,MATH level 5,8.0,10592.0,1883976.0,9566695.0,11450671.0,Gemini 2.0 Flash,2025-02-05,0.175,88.0,62.0,90.0,93.0,1358.0,82.0,,,,Gemini 2.0 Flash (2025-02)
143,gemini-2.0-flash-001,OTIS Mock AIME 2024-2025,16.0,720.0,131472.0,1186729.0,1318201.0,Gemini 2.0 Flash,2025-02-05,0.175,88.0,62.0,90.0,93.0,1358.0,82.0,,,,Gemini 2.0 Flash (2025-02)


In [14]:
top_n = 1
aa_df = aa_df.sort_values(by='Release Date')

top_models_df_lookup = {}
for benchmark in benchmarks:
    # Get the top-1 model by benchmark score at each point in time
    ever_top_n_models = set()
    unique_dates = aa_df['Release Date'].sort_values().unique()
    for date in unique_dates:
        df_up_to_date = aa_df[aa_df['Release Date'] <= date]
        top_n_models = df_up_to_date.nlargest(top_n, benchmark)
        top_n_model_names = top_n_models['Model Name and Date'].tolist()
        ever_top_n_models.update(top_n_model_names)
    ever_top_n_list = list(ever_top_n_models)
    print(ever_top_n_list)
    top_models_df = aa_df[aa_df['Model Name and Date'].isin(ever_top_n_list)]
    top_models_df_lookup[benchmark] = top_models_df

    # Create base scatter plot with all data
    fig = px.scatter(aa_df, x='Release Date', y=benchmark, 
                    hover_data=['Model Name'], 
                    title=f'{benchmark} Performance over time',
                    color_discrete_sequence=['lightgray'])

    # Add top-1 models in a different color
    fig.add_scatter(x=top_models_df['Release Date'], 
                    y=top_models_df[benchmark],
                    mode='lines+markers',
                    name='Top Model',
                    hovertemplate='%{text}<extra></extra>',
                    text=top_models_df['Model Name'],
                    line=dict(color='red'))
    fig.update_layout(
        width=800,
        height=400,
    )
    fig.show()

['GPT-4 Turbo (2023-11)', 'Claude-3.5-Sonnet-2024-06 (2024-06)', 'GPT-3 (2021-11)', 'GPT-3.5 Turbo (2023-03)', 'GPT-4o-2024-08 (2024-08)', 'GPT-3.5 (2022-11)', 'GPT-4-0314 (2023-03)']


['GPT-4o-2024-05 (2024-05)', 'GPT-4 Turbo (2023-11)', 'o1 (2024-12)', 'Claude-3.5-Sonnet-2024-06 (2024-06)', 'GPT-3 (2021-11)', 'o1-preview (2024-09)', 'GPT-4-0314 (2023-03)']


['GPT-4o-2024-05 (2024-05)', 'o1-mini (2024-09)', 'GPT-4 Turbo (2023-11)', 'o1 (2024-12)', 'GPT-3.5-Turbo-2023-06 (2023-06)', 'GPT-3 (2021-11)', 'GPT-4o-2024-08 (2024-08)']


['o1-mini (2024-09)', 'GPT-4o-2024-05 (2024-05)', 'o3-mini (2025-01)', 'GPT-4 Turbo (2023-11)', 'GPT-4-0613 (2023-06)', 'o1 (2024-12)', 'GPT-3 (2021-11)']


['GPT-4o-2024-05 (2024-05)', 'o1-mini (2024-09)', 'GPT-4 Turbo (2023-11)', 'DeepSeek-R1 (2025-01)', 'GPT-3.5-Turbo-2023-06 (2023-06)', 'GPT-3 (2021-11)', 'GPT-4-0314 (2023-03)']


['GPT-4o-2024-05 (2024-05)', 'GPT-4 Turbo (2023-11)', 'DeepSeek-R1 (2025-01)', 'GPT-4o-2024-11 (2024-11)', 'GPT-3 (2021-11)', 'GPT-3.5 Turbo (2023-03)', 'GPT-4o-2024-08 (2024-08)', 'GPT-4-0314 (2023-03)']


## Measure the decline in evaluation cost

In [15]:
eval_cost_df = merged_df.copy().dropna(subset=['Model', 'total_tokens'])
eval_cost_df

Unnamed: 0,Model,Task,epochs,total_samples,input_tokens,output_tokens,total_tokens,Model Name,Release Date,USD per 1M Tokens,MMLU,GPQA Diamond,HumanEval,MATH-500,LMSys Chatbot Arena ELO,MATH 5,Tokens per Second,Prefill Latency (s),Prompt Length (tk),Model Name and Date
5,gpt-4-0613,MATH level 5,8.0,10592.0,1982888.0,3620454.0,5603342.0,GPT-4-0314,2023-03-14,37.500,86.0,33.0,67.0,,1186.0,,23.6,0.724,1000.0,GPT-4-0314 (2023-03)
6,gpt-4-0613,GPQA Diamond,16.0,3168.0,838928.0,694183.0,1533111.0,GPT-4-0314,2023-03-14,37.500,86.0,33.0,67.0,,1186.0,,23.6,0.724,1000.0,GPT-4-0314 (2023-03)
11,gpt-3.5-turbo-1106,GPQA Diamond,16.0,3168.0,838992.0,319979.0,1158971.0,GPT-3.5-Turbo-2023-11,2023-11-06,0.750,68.0,30.0,71.0,44.0,1107.0,15.0,121.5,0.598,1000.0,GPT-3.5-Turbo-2023-11 (2023-11)
12,gpt-3.5-turbo-1106,MATH level 5,8.0,10592.0,1982888.0,3495658.0,5478546.0,GPT-3.5-Turbo-2023-11,2023-11-06,0.750,68.0,30.0,71.0,44.0,1107.0,15.0,121.5,0.598,1000.0,GPT-3.5-Turbo-2023-11 (2023-11)
13,gpt-4-turbo-2024-04-09,OTIS Mock AIME 2024-2025,8.0,360.0,65592.0,272569.0,338161.0,GPT-4 Turbo,2023-11-06,15.000,87.0,50.0,92.0,74.0,1256.0,36.0,39.2,1.246,1000.0,GPT-4 Turbo (2023-11)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,o3-mini-2025-01-31_high,OTIS Mock AIME 2024-2025,8.0,360.0,65016.0,4697771.0,4762787.0,o3-mini,2025-01-31,1.925,,74.3,97.0,97.0,1306.0,95.2,,,,o3-mini (2025-01)
141,o3-mini-2025-01-31_medium,MATH level 5,8.0,10592.0,1892201.0,17775998.0,19668199.0,o3-mini,2025-01-31,1.925,,74.3,97.0,97.0,1306.0,95.2,,,,o3-mini (2025-01)
142,gemini-2.0-flash-001,MATH level 5,8.0,10592.0,1883976.0,9566695.0,11450671.0,Gemini 2.0 Flash,2025-02-05,0.175,88.0,62.0,90.0,93.0,1358.0,82.0,,,,Gemini 2.0 Flash (2025-02)
143,gemini-2.0-flash-001,OTIS Mock AIME 2024-2025,16.0,720.0,131472.0,1186729.0,1318201.0,Gemini 2.0 Flash,2025-02-05,0.175,88.0,62.0,90.0,93.0,1358.0,82.0,,,,Gemini 2.0 Flash (2025-02)


In [16]:
eval_cost_df['Average cost per question (USD)'] = eval_cost_df['total_tokens'] / eval_cost_df['total_samples'] * eval_cost_df['USD per 1M Tokens'] / 1e6
eval_cost_df

Unnamed: 0,Model,Task,epochs,total_samples,input_tokens,output_tokens,total_tokens,Model Name,Release Date,USD per 1M Tokens,...,GPQA Diamond,HumanEval,MATH-500,LMSys Chatbot Arena ELO,MATH 5,Tokens per Second,Prefill Latency (s),Prompt Length (tk),Model Name and Date,Average cost per question (USD)
5,gpt-4-0613,MATH level 5,8.0,10592.0,1982888.0,3620454.0,5603342.0,GPT-4-0314,2023-03-14,37.500,...,33.0,67.0,,1186.0,,23.6,0.724,1000.0,GPT-4-0314 (2023-03),0.019838
6,gpt-4-0613,GPQA Diamond,16.0,3168.0,838928.0,694183.0,1533111.0,GPT-4-0314,2023-03-14,37.500,...,33.0,67.0,,1186.0,,23.6,0.724,1000.0,GPT-4-0314 (2023-03),0.018148
11,gpt-3.5-turbo-1106,GPQA Diamond,16.0,3168.0,838992.0,319979.0,1158971.0,GPT-3.5-Turbo-2023-11,2023-11-06,0.750,...,30.0,71.0,44.0,1107.0,15.0,121.5,0.598,1000.0,GPT-3.5-Turbo-2023-11 (2023-11),0.000274
12,gpt-3.5-turbo-1106,MATH level 5,8.0,10592.0,1982888.0,3495658.0,5478546.0,GPT-3.5-Turbo-2023-11,2023-11-06,0.750,...,30.0,71.0,44.0,1107.0,15.0,121.5,0.598,1000.0,GPT-3.5-Turbo-2023-11 (2023-11),0.000388
13,gpt-4-turbo-2024-04-09,OTIS Mock AIME 2024-2025,8.0,360.0,65592.0,272569.0,338161.0,GPT-4 Turbo,2023-11-06,15.000,...,50.0,92.0,74.0,1256.0,36.0,39.2,1.246,1000.0,GPT-4 Turbo (2023-11),0.014090
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,o3-mini-2025-01-31_high,OTIS Mock AIME 2024-2025,8.0,360.0,65016.0,4697771.0,4762787.0,o3-mini,2025-01-31,1.925,...,74.3,97.0,97.0,1306.0,95.2,,,,o3-mini (2025-01),0.025468
141,o3-mini-2025-01-31_medium,MATH level 5,8.0,10592.0,1892201.0,17775998.0,19668199.0,o3-mini,2025-01-31,1.925,...,74.3,97.0,97.0,1306.0,95.2,,,,o3-mini (2025-01),0.003575
142,gemini-2.0-flash-001,MATH level 5,8.0,10592.0,1883976.0,9566695.0,11450671.0,Gemini 2.0 Flash,2025-02-05,0.175,...,62.0,90.0,93.0,1358.0,82.0,,,,Gemini 2.0 Flash (2025-02),0.000189
143,gemini-2.0-flash-001,OTIS Mock AIME 2024-2025,16.0,720.0,131472.0,1186729.0,1318201.0,Gemini 2.0 Flash,2025-02-05,0.175,...,62.0,90.0,93.0,1358.0,82.0,,,,Gemini 2.0 Flash (2025-02),0.000320


In [35]:
len(eval_cost_df['Model Name'].unique())

38

In [17]:
fig = px.scatter(
    data_frame=eval_cost_df,
    x='Release Date',
    y='Average cost per question (USD)',
    color='Task',
    hover_data=['Model Name and Date', 'Task'],
    log_y=True,
    title='Cost per Question Over Time'
)
fig.update_layout(
    xaxis_title='Release Date',
    yaxis_title='Average Cost per Question (USD, log scale)'
)
fig.show()

In [18]:
results_subdir = results_dir + 'lowest_cost_models_examples/'
os.makedirs(results_subdir, exist_ok=True)

In [19]:
cost_col = 'Average cost per question (USD)'

### Final selected example

In [20]:
eval_cost_df['Model Name'].unique()

array(['GPT-4-0314', 'GPT-3.5-Turbo-2023-11', 'GPT-4 Turbo',
       'Llama 2-70B Chat', 'GPT-4-0125', 'GPT-3.5-Turbo-2024-01',
       'Mistral-Large-2024-02', 'Claude-3-Haiku', 'Claude-3-Sonnet',
       'Claude-3-Opus', 'Mistral-8x22', 'Llama-3-Instruct-70B',
       'Llama-3-Instruct-8B', 'Gemini-1.5-Flash-2024-05',
       'GPT-4o-2024-05', 'Gemini-1.5-Pro-2024-05',
       'Claude-3.5-Sonnet-2024-06', 'Gemma-2-9B', 'Gemma-2-27B',
       'Mistral-NeMo', 'GPT-4o-mini', 'GPT-4o-2024-08', 'Claude 2',
       'Gemini 1.0 Pro', 'Claude 2.1', 'o1-preview', 'o1-mini',
       'Mistral 7B', 'Gemini-1.5-Pro-2024-09', 'Gemini-1.5-Flash-2024-09',
       'Claude-3.5-Sonnet-2024-10', 'GPT-4o-2024-11', 'Phi 4', 'o1',
       'DeepSeek-V3', 'Mistral Small 3', 'o3-mini', 'Gemini 2.0 Flash'],
      dtype=object)

In [21]:
# selected_benchmarks = ['MMLU', 'MATH 5', 'HumanEval']
bench = 'GPQA Diamond'
threshold_model = 'GPT-4-0314'
performance_lower_bound = eval_cost_df[eval_cost_df['Model Name'] == threshold_model].iloc[0][bench]
performance_upper_bound = 100
print(f'\nPerformance range: {performance_lower_bound}-{performance_upper_bound}%')

cheapest_models = []
current_best = None
benchmark_df = eval_cost_df.dropna(subset=[bench])
if bench == 'MATH 5':
    task = 'MATH level 5'
else:
    task = bench
benchmark_df = benchmark_df[benchmark_df['Task'] == task]
for i, row in benchmark_df.iterrows():
    if (row[bench] >= performance_lower_bound) and (row[bench] < performance_upper_bound):
        if (current_best is None) or (row[cost_col] < current_best[cost_col]):
            current_best = row
            cheapest_models.append(current_best)
            print(current_best['Release Date'], current_best['Model Name'], current_best[bench], f"${current_best[cost_col]:.2f}")

cheapest_models_df = pd.DataFrame(cheapest_models)
cheapest_models_df.reset_index(drop=True, inplace=True)

# Second step: check if the next row has the same Release date. If so, remove the current row.
# Note that the df is already in descending order of price, so we only need to check the next row.
idxs_to_remove = []
for i, row in cheapest_models_df.iterrows():
    if i < len(cheapest_models_df) - 1:
        next_row = cheapest_models_df.iloc[i+1]
        if next_row['Release Date'] == row['Release Date']:
            idxs_to_remove.append(i)
            print(f'Removing {row["Model Name"]} because it has the same Release date as {next_row["Model Name"]} and has a higher cost')
cheapest_models_df = cheapest_models_df.drop(idxs_to_remove)
cheapest_models_df = cheapest_models_df.reset_index(drop=True)

# Fit a line to the data
cheapest_models_df['price'] = cheapest_models_df[cost_col]
cheapest_models_df['log_price'] = np.log10(cheapest_models_df[cost_col])
cheapest_models_df['date'] = cheapest_models_df['Release Date'].map(lambda x: pd.Timestamp(x).toordinal())
exponential_model = smf.ols('log_price ~ date', data=cheapest_models_df).fit()

fig = go.Figure()

all_df = eval_cost_df[
    (eval_cost_df['Task'] == bench) &
    (eval_cost_df[bench].notna()) &
    (eval_cost_df[bench] >= performance_lower_bound) &
    (eval_cost_df[bench] < performance_upper_bound)
]
fig.add_trace(go.Scatter(
    x=all_df['Release Date'],
    y=all_df[cost_col],
    mode='markers',
    name=f'Other, GPT-4 level or better on {bench}',
    text=all_df['Model Name'],
    marker=dict(color='rgb(222, 222, 255)')
))

factor = cheapest_models_df[cost_col].iloc[0] / cheapest_models_df[cost_col].iloc[-1]
period_months = (cheapest_models_df['Release Date'].iloc[-1] - cheapest_models_df['Release Date'].iloc[0]).days / (365/12)

annual_slope = exponential_model.params['date'] * 365  # Convert daily to annual
annual_factor = int(round(10**(-annual_slope)))  # Convert log slope to factor
# Get the 90% CI
ci_90 = exponential_model.conf_int(alpha=0.1)
print(int(round(10**(-ci_90.loc['date'][1] * 365))), int(round(10**(-ci_90.loc['date'][0] * 365))))

# Plot the exponential trendline with the data
date_range = pd.date_range(start=cheapest_models_df['Release Date'].min(), end=cheapest_models_df['Release Date'].max(), freq='D')
pred_df = pd.DataFrame({'date': date_range.map(lambda x: pd.Timestamp(x).toordinal())})
fig.add_trace(go.Scatter(
    x=date_range,
    y=10**exponential_model.predict(pred_df['date']),
    mode='lines+text',
    name=f'{annual_factor}x decrease per year',
    # Only show text at middle index
    text=['' if i != len(date_range)//2 else f'{annual_factor}x per year' for i in range(len(date_range))],
    textposition='middle left',
    textfont=dict(size=14),
    line=dict(color='magenta', dash='dash'),
    hoverinfo='skip',
    showlegend=False,
))

# annotations = [""] * len(cheapest_models_df['Model Name'])
# annotations[0] = cheapest_models_df['Model Name'].iloc[0]
# annotations[-1] = cheapest_models_df['Model Name'].iloc[-1]
annotations = cheapest_models_df['Model Name']

fig.add_trace(go.Scatter(
    x=cheapest_models_df['Release Date'],
    y=cheapest_models_df[cost_col],
    mode='markers+text',
    name=f'Cheapest, GPT-4 level or better on {bench}',
    marker=dict(color='magenta'),
    text=annotations,
    textposition='top right',
    # marker=dict(color='blue'),
    # visible="legendonly",  # Hide this trace by default
    # legendgroup=f'{bench}_{performance_range_str}',
))
fig.update_layout(xaxis_range=[
    cheapest_models_df['Release Date'].min() - pd.Timedelta(days=30),
    cheapest_models_df['Release Date'].max() + pd.Timedelta(days=150)
])

fig.update_layout(
    # title=f'The cost to answer PhD-level science questions has fallen by {round(factor, -1):.0f}x in {period_months:.0f} months'
    title=f'The cost to answer PhD-level science questions as well as GPT-4 has fallen by {annual_factor}x per year'
)
fig.update_layout(yaxis_type='log')
fig.update_layout(xaxis_title='Month')
fig.update_layout(yaxis_title='Avg. cost per benchmark question (USD, log scale)')
# fig.update_layout(xaxis_range=[eval_cost_df['Release Date'].min() - pd.Timedelta(days=150), eval_cost_df['Release Date'].max() + pd.Timedelta(days=30)])
fig.update_layout(
    width=1000,
    height=600,
    # font=dict(size=14),
    legend=dict(
        yanchor="top",
        y=0.14,
        xanchor="right",
        x=0.45,
        bordercolor="lightgrey",
        borderwidth=1
    )
)

if save:
    save_plot(fig, results_subdir, f'lowest_cost_models_{bench}_{performance_lower_bound}_to_{performance_upper_bound}', extensions=['png', 'svg'])

fig.show()


Performance range: 33.0-100%
2023-03-14 00:00:00 GPT-4-0314 33.0 $0.02
2023-11-06 00:00:00 GPT-4 Turbo 50.0 $0.01
2024-02-26 00:00:00 Mistral-Large-2024-02 36.0 $0.00
2024-03-04 00:00:00 Claude-3-Haiku 33.0 $0.00
2024-05-10 00:00:00 Gemini-1.5-Flash-2024-05 39.0 $0.00
1 4422


### Full analysis

In [22]:
results_subdir = results_dir + 'lowest_cost_models_above_previous_frontier/'
os.makedirs(results_subdir, exist_ok=True)

performance_delta = 100
results = []
for i, bench in enumerate(benchmarks):    
    if i > 0:
        print('\n')
    print(f'{bench}')

    if bench == 'MATH 5':
        task = 'MATH level 5'
    else:
        task = bench
    if len(eval_cost_df[eval_cost_df['Task'] == task]) == 0:
        print(f'No eval data for {bench} - skipping')
        continue

    for i, frontier_model_data in top_models_df_lookup[bench].iterrows():
        performance_lower_bound = frontier_model_data[bench]
        if pd.isna(performance_lower_bound):
            print(f'Frontier model {frontier_model_data["Model Name"]} is missing a {bench} value - skipping')
            continue
        elif benchmark_is_mqa[bench] and performance_lower_bound < 30:
            print(f'Frontier model {frontier_model_data["Model Name"]} has a {bench} value of less than 30% on an MQA benchmark - skipping')
            continue
        if bench == 'LMSys Chatbot Arena ELO':
            performance_upper_bound = np.inf
        else:
            performance_upper_bound = min(performance_lower_bound + performance_delta, 100)
        print(f'\nPerformance range: {performance_lower_bound} ({frontier_model_data["Model Name"]} level) to {performance_upper_bound}')

        cheapest_models = []
        current_best = None
        benchmark_df = eval_cost_df.dropna(subset=[bench])
        benchmark_df = benchmark_df[benchmark_df['Task'] == task]
        for i, row in benchmark_df.iterrows():
            if (row[bench] >= performance_lower_bound) and (row[bench] < performance_upper_bound):
                if (current_best is None) or (row[cost_col] < current_best[cost_col]):
                    current_best = row
                    cheapest_models.append(current_best)
                    print(current_best['Release Date'], current_best['Model Name'], current_best[bench], f"${current_best[cost_col]:.6f}")

        cheapest_models_df = pd.DataFrame(cheapest_models)
        cheapest_models_df.reset_index(drop=True, inplace=True)

        # Second step: check if the next row has the same Release date. If so, remove the current row.
        # Note that the df is already in descending order of price, so we only need to check the next row.
        idxs_to_remove = []
        for i, row in cheapest_models_df.iterrows():
            if i < len(cheapest_models_df) - 1:
                next_row = cheapest_models_df.iloc[i+1]
                if next_row['Release Date'] == row['Release Date']:
                    idxs_to_remove.append(i)
                    print(f'Removing {row["Model Name"]} because it has the same Release date as {next_row["Model Name"]} and has a higher cost')
        cheapest_models_df = cheapest_models_df.drop(idxs_to_remove)
        cheapest_models_df = cheapest_models_df.reset_index(drop=True)

        if len(cheapest_models_df) < min_num_data_points_for_regression:
            print(f'Less than {min_num_data_points_for_regression} cheapest models found - skipping')
            continue

        # Fit a line to the data
        cheapest_models_df['cost'] = cheapest_models_df['Average cost per question (USD)']
        cheapest_models_df['log_cost'] = np.log10(cheapest_models_df['Average cost per question (USD)'])
        cheapest_models_df['date'] = cheapest_models_df['Release Date'].map(lambda x: pd.Timestamp(x).toordinal())
        exponential_model = smf.ols('log_cost ~ date', data=cheapest_models_df).fit()

        # Calculate annual rate of decrease
        annual_slope = exponential_model.params['date'] * 365  # Convert daily to annual
        annual_factor = int(round(10**(-annual_slope)))  # Convert log slope to factor
        # Get the 90% CI
        ci_90 = exponential_model.conf_int(alpha=0.1)
        annual_factor_low = int(round(10**(-ci_90.loc['date'][1] * 365)))
        annual_factor_high = int(round(10**(-ci_90.loc['date'][0] * 365)))
        results.append({
            'bench': bench,
            'threshold_model': frontier_model_data["Model Name"],
            'performance_range': [performance_lower_bound, performance_upper_bound],
            'sample_size': len(cheapest_models_df),
            'start_date': cheapest_models_df['Release Date'].min(),
            'end_date': cheapest_models_df['Release Date'].max(),
            'cost_reduction_factor_per_year': annual_factor,
            'cost_reduction_factor_per_year_90_ci': [annual_factor_low, annual_factor_high],
            'r_squared': round(exponential_model.rsquared, 2),
        })

        # Plot the exponential trendline with the data
        fig = go.Figure()
        fig.add_trace(go.Scatter(
            x=cheapest_models_df['Release Date'],
            y=10**exponential_model.predict(cheapest_models_df['date']),
            mode='lines',
            name=f'Trendline: {annual_factor}x decrease per year',
            line=dict(color='blue', dash='dash')
        ))
        fig.add_trace(go.Scatter(
            x=cheapest_models_df['Release Date'],
            y=cheapest_models_df['Average cost per question (USD)'],
            mode='markers+text',
            name=f'Cheapest models with {performance_lower_bound}-{performance_upper_bound} on {bench}',
            text=cheapest_models_df['Model Name'],
            textposition='bottom left',
            marker=dict(color='blue')
            # line=dict(shape='hv'),
        ))
        other_models_df = eval_cost_df[
            (eval_cost_df['Task'] == task) &
            (eval_cost_df[bench].notna()) &
            (eval_cost_df[bench] >= performance_lower_bound) &
            (eval_cost_df[bench] < performance_upper_bound) &
            ~(eval_cost_df['Model Name'].isin(cheapest_models_df['Model Name']))
        ]
        fig.add_trace(go.Scatter(
            x=other_models_df['Release Date'],
            y=other_models_df['Average cost per question (USD)'],
            mode='markers',
            name=f'Other models with {performance_lower_bound}-{performance_upper_bound} on {bench}',
            text=other_models_df['Model Name'],
            marker=dict(color='lightblue')
        ))
        fig.update_layout(
            title=f'The cost to answer {bench} questions as well as {frontier_model_data["Model Name"]} has fallen by {annual_factor}x per year'
        )
        fig.update_traces(textposition='bottom left')
        fig.update_layout(yaxis_type='log')
        fig.update_layout(xaxis_title='Month')
        fig.update_layout(yaxis_title='Avg. cost per question (USD, log scale)')
        # Lower the lower x limit
        min_date = min(cheapest_models_df['Release Date'].min(), other_models_df['Release Date'].min())
        max_date = max(cheapest_models_df['Release Date'].max(), other_models_df['Release Date'].max())
        fig.update_layout(xaxis_range=[min_date - pd.Timedelta(days=90), max_date + pd.Timedelta(days=30)])
        fig.update_layout(
            width=1000,
            height=600,
            font=dict(size=10),
            legend=dict(
                yanchor="top",
                y=0.99,
                xanchor="right",
                x=0.99,
                bordercolor="lightgrey",
                borderwidth=1
            )
        )
        if save:
            save_plot(
                fig,
                results_subdir,
                f'lowest_cost_models_{bench}_{performance_lower_bound}-{performance_upper_bound}_with_trendline',
                extensions=['png'],
            )

cheapest_model_results_df = pd.DataFrame(results)
cheapest_model_results_df.to_csv(results_subdir + 'cheapest_model_results.csv', index=False)

# Create a summary DataFrame
summary_data = []
for bench in benchmarks:
    if bench == 'LMSys Chatbot Arena ELO':
        continue
    bench_results = cheapest_model_results_df[cheapest_model_results_df['bench'] == bench]
    if len(bench_results) == 0:
        continue
    # Get all performance lower bounds used for this benchmark
    perf_bounds = list(bench_results['performance_range'])
    
    # Calculate geometric mean of price reduction factors
    cost_factors = bench_results['cost_reduction_factor_per_year'].dropna()
    geomean = np.exp(np.mean(np.log(cost_factors))) if len(cost_factors) > 0 else np.nan
    
    # Get range of price reduction factors
    factor_range = [cost_factors.min(), cost_factors.max()] if len(cost_factors) > 0 else []
    
    summary_data.append({
        'bench': bench,
        'performance_range': perf_bounds,
        'cost_reduction_factor_per_year_geomean': round(geomean),
        'cost_reduction_factor_per_year_range': factor_range
    })

cheapest_model_summary_df = pd.DataFrame(summary_data)
cheapest_model_summary_df.to_csv(results_subdir + 'cheapest_model_summary.csv', index=False)

MMLU
No eval data for MMLU - skipping


GPQA Diamond
Frontier model GPT-3 is missing a GPQA Diamond value - skipping

Performance range: 33.0 (GPT-4-0314 level) to 100
2023-03-14 00:00:00 GPT-4-0314 33.0 $0.018148
2023-11-06 00:00:00 GPT-4 Turbo 50.0 $0.012485
2024-02-26 00:00:00 Mistral-Large-2024-02 36.0 $0.003942
2024-03-04 00:00:00 Claude-3-Haiku 33.0 $0.000320
2024-05-10 00:00:00 Gemini-1.5-Flash-2024-05 39.0 $0.000080

Performance range: 50.0 (GPT-4 Turbo level) to 100
2023-11-06 00:00:00 GPT-4 Turbo 50.0 $0.012485
2024-05-13 00:00:00 GPT-4o-2024-05 53.0 $0.006448
2024-06-20 00:00:00 Claude-3.5-Sonnet-2024-06 56.0 $0.004296
2024-08-06 00:00:00 GPT-4o-2024-08 52.0 $0.003714
2024-09-24 00:00:00 Gemini-1.5-Pro-2024-09 61.0 $0.001513
2024-12-13 00:00:00 Phi 4 53.0 $0.000114

Performance range: 53.0 (GPT-4o-2024-05 level) to 100
2024-05-13 00:00:00 GPT-4o-2024-05 53.0 $0.006448
2024-06-20 00:00:00 Claude-3.5-Sonnet-2024-06 56.0 $0.004296
2024-09-24 00:00:00 Gemini-1.5-Pro-2024-09 61.0

In [24]:
eval_cost_results_df = pd.read_csv('results/2025-03-04/lowest_cost_models_above_previous_frontier/cheapest_model_results.csv')
price_results_df = pd.read_csv('results/2025-03-04/lowest_price_models_above_previous_frontier/lowest_price_models_results.csv')

In [34]:
# Print how price_reduction_factor_per_year compares tocost_reduction_factor_per_year for overlapping bench and threshold_model
for bench in eval_cost_results_df['bench'].unique():
    eval_cost_bench_df = eval_cost_results_df.loc[eval_cost_results_df['bench'] == bench]
    price_bench_df = price_results_df.loc[price_results_df['bench'] == bench]
    for threshold_model in eval_cost_bench_df['threshold_model'].unique():
        if threshold_model in price_bench_df['threshold_model'].unique():
            print(f'{bench}, {threshold_model}')
            eval_cost_factor = eval_cost_bench_df.loc[eval_cost_results_df['threshold_model'] == threshold_model]['cost_reduction_factor_per_year'].iloc[0]
            price_cost_factor = price_bench_df.loc[price_results_df['threshold_model'] == threshold_model]['price_reduction_factor_per_year'].iloc[0]
            factor_ratio = eval_cost_factor / price_cost_factor
            print(eval_cost_factor, price_cost_factor, factor_ratio)

GPQA Diamond, GPT-4-0314
61 42 1.4523809523809523
GPQA Diamond, GPT-4 Turbo
42 46 0.9130434782608695
GPQA Diamond, GPT-4o-2024-05
731 574 1.2735191637630663
GPQA Diamond, Claude-3.5-Sonnet-2024-06
172 366 0.46994535519125685
MATH 5, GPT-4 Turbo
113 160 0.70625
MATH 5, GPT-4o-2024-05
477 689 0.6923076923076923
