In [1]:
from collections import defaultdict
import numpy as np
import os
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
import statsmodels.formula.api as smf

from data import load_pcd_df
from plotting import save_plot

pio.templates.default = "plotly_white"

In [2]:
results_dir = 'results/2025-03-04_iterate_model_release_dates/'
os.makedirs(results_dir, exist_ok=True)

In [3]:
save = True
benchmarks = ['MMLU', 'GPQA Diamond', 'MATH-500', 'MATH 5', 'HumanEval', 'LMSys Chatbot Arena ELO']
benchmark_is_mqa = {'MMLU': True, 'GPQA Diamond': True, 'MATH 5': False, 'MATH-500': False, 'HumanEval': False, 'LMSys Chatbot Arena ELO': False}
min_num_data_points_for_regression = 4

## Load Artificial Analysis dataset

In [4]:
aa_df = pd.read_csv('data/aa_data_with_math5.csv')
aa_df

Unnamed: 0,Model Name,Release Date,USD per 1M Tokens,MMLU,GPQA Diamond,HumanEval,MATH-500,LMSys Chatbot Arena ELO,MATH 5,Tokens per Second,Prefill Latency (s),Prompt Length (tk)
0,Claude-3-Haiku,2024-03-04,0.5,71,33,77.0,39.0,1179.0,13.0,122.7,0.467,1000.0
1,Claude-3-Opus,2024-03-04,30.0,84,50,83.0,64.0,1248.0,34.0,26.5,1.984,1000.0
2,Claude-3-Sonnet,2024-03-04,6.0,77,37,71.0,41.0,1201.0,16.0,61.8,0.789,1000.0
3,Claude-3.5-Haiku,2024-10-22,1.6,81,37,87.0,67.0,1236.0,,64.2,0.768,1000.0
4,Claude-3.5-Sonnet-2024-06,2024-06-20,6.0,88,56,90.0,71.0,1268.0,46.0,55.9,0.906,1000.0
5,Claude-3.5-Sonnet-2024-10,2024-10-22,6.0,89,58,96.0,76.0,1282.0,53.0,55.2,0.907,1000.0
6,Gemini 2.0 Flash,2025-02-05,0.175,88,62,90.0,93.0,1358.0,82.0,,,
7,Gemini-1.5-Flash-2024-05,2024-05-10,0.13,79,39,,55.0,1227.0,23.0,298.4,0.307,1000.0
8,Gemini-1.5-Flash-2024-09,2024-09-24,0.13,75,45,83.0,83.0,1271.0,58.0,190.5,0.348,1000.0
9,Gemini-1.5-Flash-8B,2024-10-03,0.07,75,30,12.0,70.0,1211.0,,285.2,0.335,1000.0


In [5]:
# 'Release Date' is a string with the format 'YYYY-MM'
aa_df['Release Date'] = pd.to_datetime(aa_df['Release Date'].str.strip(), format='%Y-%m-%d')

## Load Epoch AI price dataset

In [6]:
api_price_df = pd.read_csv('data/epoch_ai_price_data_not_in_aa_with_benchmarks.csv')
api_price_df['Release Date'] = pd.to_datetime(api_price_df['Release Date'].str.strip(), format='%Y-%m-%d')
api_price_df

Unnamed: 0,Model Name,Release Date,USD per 1M Tokens,MMLU,GPQA Diamond,HumanEval,MATH-500,LMSys Chatbot Arena ELO,MATH 5,Tokens per Second,Prefill Latency (s),Prompt Length (tk)
0,Claude 2,2024-08-12,12.0,78.5,35.0,,,,10.0,,,
1,Claude 2.1,2024-08-12,12.0,,36.0,16.0,,,11.0,,,
2,Claude Instant,2024-08-12,1.2,,,,,,,,,
3,Cohere Command,2024-08-13,1.625,,,,,,,,,
4,Cohere Command Light,2024-08-13,0.375,,,,,,,,,
5,Command R,2024-08-13,0.75,,,42.0,15.0,1180.0,,,,
6,Command R+,2024-08-13,6.0,75.7,34.0,63.0,40.0,1215.0,,,,
7,Command R+,2024-09-13,4.375,75.7,34.0,63.0,40.0,1215.0,,,,
8,DeepSeek-Coder-V2 236B,2024-09-11,0.175,79.2,,87.0,74.0,1178.0,,,,
9,DeepSeek-R1,2025-01-20,0.96,,71.7,98.0,96.0,1362.0,93.1,,,


In [7]:
# Merge the two datasets
aa_df = pd.concat([aa_df, api_price_df])
aa_df.dropna(subset=['USD per 1M Tokens'], inplace=True)
aa_df.sort_values(by='Release Date', inplace=True)
# Reset the index
aa_df.reset_index(drop=True, inplace=True)
aa_df

Unnamed: 0,Model Name,Release Date,USD per 1M Tokens,MMLU,GPQA Diamond,HumanEval,MATH-500,LMSys Chatbot Arena ELO,MATH 5,Tokens per Second,Prefill Latency (s),Prompt Length (tk)
0,GPT-3 175B (davinci),2021-11-20,60.000,43.9,,,,,,,,
1,GPT-3 175B (davinci),2022-08-31,60.000,43.9,,,,,,,,
2,GPT-3 175B (davinci),2022-09-01,20.000,43.9,,,,,,,,
3,GPT-3.5,2022-11-30,20.000,64.8,,,,,,,,
4,GPT-3.5 Turbo,2023-03-06,2.000,68.0,,,,1106.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
66,DeepSeek-R1,2025-01-20,0.960,,71.7,98.0,96.0,1362.0,93.1,,,
67,Mistral Small 3,2025-01-30,0.475,82.0,46.0,85.0,74.0,1210.0,45.0,,,
68,o3-mini,2025-01-31,1.925,,74.3,97.0,97.0,1306.0,95.2,,,
69,o1-mini,2025-01-31,1.925,,59.5,97.0,94.0,1308.0,84.3,,,


In [8]:
# Rename all instances of 'GPT-3 175B (davinci)' to 'GPT-3'
aa_df.loc[aa_df['Model Name'] == 'GPT-3 175B (davinci)', 'Model Name'] = 'GPT-3'
aa_df


Unnamed: 0,Model Name,Release Date,USD per 1M Tokens,MMLU,GPQA Diamond,HumanEval,MATH-500,LMSys Chatbot Arena ELO,MATH 5,Tokens per Second,Prefill Latency (s),Prompt Length (tk)
0,GPT-3,2021-11-20,60.000,43.9,,,,,,,,
1,GPT-3,2022-08-31,60.000,43.9,,,,,,,,
2,GPT-3,2022-09-01,20.000,43.9,,,,,,,,
3,GPT-3.5,2022-11-30,20.000,64.8,,,,,,,,
4,GPT-3.5 Turbo,2023-03-06,2.000,68.0,,,,1106.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
66,DeepSeek-R1,2025-01-20,0.960,,71.7,98.0,96.0,1362.0,93.1,,,
67,Mistral Small 3,2025-01-30,0.475,82.0,46.0,85.0,74.0,1210.0,45.0,,,
68,o3-mini,2025-01-31,1.925,,74.3,97.0,97.0,1306.0,95.2,,,
69,o1-mini,2025-01-31,1.925,,59.5,97.0,94.0,1308.0,84.3,,,


In [9]:
aa_df['Model Name and Date'] = aa_df['Model Name'] + ' (' + aa_df['Release Date'].dt.strftime('%Y-%m') + ')'
aa_df['Model Name and Date']

0                GPT-3 (2021-11)
1                GPT-3 (2022-08)
2                GPT-3 (2022-09)
3              GPT-3.5 (2022-11)
4        GPT-3.5 Turbo (2023-03)
                 ...            
66         DeepSeek-R1 (2025-01)
67     Mistral Small 3 (2025-01)
68             o3-mini (2025-01)
69             o1-mini (2025-01)
70    Gemini 2.0 Flash (2025-02)
Name: Model Name and Date, Length: 71, dtype: object

## Explore the data

### Performance

In [10]:
top_n = 1
aa_df = aa_df.sort_values(by='Release Date')

top_models_df_lookup = {}
for benchmark in benchmarks:
    # Get the top-1 model by benchmark score at each point in time
    ever_top_n_models = set()
    unique_dates = aa_df['Release Date'].sort_values().unique()
    for date in unique_dates:
        df_up_to_date = aa_df[aa_df['Release Date'] <= date]
        top_n_models = df_up_to_date.nlargest(top_n, benchmark)
        top_n_model_names = top_n_models['Model Name and Date'].tolist()
        ever_top_n_models.update(top_n_model_names)
    ever_top_n_list = list(ever_top_n_models)
    print(ever_top_n_list)
    top_models_df = aa_df[aa_df['Model Name and Date'].isin(ever_top_n_list)]
    top_models_df_lookup[benchmark] = top_models_df

    # Create base scatter plot with all data
    fig = px.scatter(aa_df, x='Release Date', y=benchmark, 
                    hover_data=['Model Name'], 
                    title=f'{benchmark} Performance over time',
                    color_discrete_sequence=['lightgray'])

    # Add top-1 models in a different color
    fig.add_scatter(x=top_models_df['Release Date'], 
                    y=top_models_df[benchmark],
                    mode='lines+markers',
                    name='Top Model',
                    hovertemplate='%{text}<extra></extra>',
                    text=top_models_df['Model Name'],
                    line=dict(color='red'))
    fig.update_layout(
        width=800,
        height=400,
    )
    fig.show()

['GPT-4 Turbo (2023-11)', 'GPT-3.5 Turbo (2023-03)', 'Claude-3.5-Sonnet-2024-06 (2024-06)', 'GPT-4-0314 (2023-03)', 'GPT-4o-2024-08 (2024-08)', 'GPT-3 (2021-11)', 'GPT-3.5 (2022-11)']


['GPT-4 Turbo (2023-11)', 'o1-preview (2024-09)', 'GPT-4o-2024-05 (2024-05)', 'Claude-3.5-Sonnet-2024-06 (2024-06)', 'GPT-4-0314 (2023-03)', 'o1 (2024-12)', 'GPT-3 (2021-11)']


['GPT-4 Turbo (2023-11)', 'GPT-4o-2024-05 (2024-05)', 'GPT-3.5-Turbo-2023-06 (2023-06)', 'o1-mini (2024-09)', 'o1 (2024-12)', 'GPT-4o-2024-08 (2024-08)', 'GPT-3 (2021-11)']


['GPT-4 Turbo (2023-11)', 'GPT-4o-2024-05 (2024-05)', 'GPT-3.5-Turbo-2023-06 (2023-06)', 'o1-mini (2024-09)', 'o1 (2024-12)', 'GPT-3 (2021-11)', 'o3-mini (2025-01)']


['GPT-4 Turbo (2023-11)', 'GPT-4o-2024-05 (2024-05)', 'GPT-3.5-Turbo-2023-06 (2023-06)', 'GPT-4-0314 (2023-03)', 'o1-mini (2024-09)', 'GPT-3 (2021-11)', 'DeepSeek-R1 (2025-01)']


['GPT-4 Turbo (2023-11)', 'GPT-4o-2024-05 (2024-05)', 'GPT-3.5 Turbo (2023-03)', 'GPT-4-0314 (2023-03)', 'GPT-4o-2024-11 (2024-11)', 'GPT-4o-2024-08 (2024-08)', 'GPT-3 (2021-11)', 'DeepSeek-R1 (2025-01)']


In [11]:
top_models_df_lookup

{'MMLU':                    Model Name Release Date  USD per 1M Tokens  MMLU  \
 0                       GPT-3   2021-11-20              60.00  43.9   
 3                     GPT-3.5   2022-11-30              20.00  64.8   
 4               GPT-3.5 Turbo   2023-03-06               2.00  68.0   
 5                  GPT-4-0314   2023-03-14              37.50  86.0   
 10                GPT-4 Turbo   2023-11-06              15.00  87.0   
 29  Claude-3.5-Sonnet-2024-06   2024-06-20               6.00  88.0   
 38             GPT-4o-2024-08   2024-08-06               4.38  89.0   
 
     GPQA Diamond  HumanEval  MATH-500  LMSys Chatbot Arena ELO  MATH 5  \
 0            NaN        NaN       NaN                      NaN     NaN   
 3            NaN        NaN       NaN                      NaN     NaN   
 4            NaN        NaN       NaN                   1106.0     NaN   
 5           33.0       67.0       NaN                   1186.0     NaN   
 10          50.0       92.0      74.0 

### Price vs. performance

In [12]:
# Plot 'MMLU' vs. 'USD per 1M Tokens'
fig = px.scatter(aa_df, x='MMLU', y='USD per 1M Tokens', hover_data=['Model Name'], title='MMLU vs. USD per 1M Tokens')
fig.update_layout(yaxis_type='log')
fig.show()


In [13]:
aa_df['MMLU price-performance'] = aa_df['MMLU'] / aa_df['USD per 1M Tokens']
# Plot 'MMLU price-performance' vs. 'Date'
fig = px.scatter(aa_df, x='Release Date', y='MMLU price-performance', title='MMLU price-performance over time', hover_data=['Model Name'])
fig.update_layout(yaxis_type='log')
fig.show()


In [14]:
# Construct a list of 'Model Name' values that were ever in the top-n based on 'MMLU price-performance'
top_n = 1
aa_df = aa_df.sort_values(by='Release Date')
ever_top_n_models = set()

unique_dates = aa_df['Release Date'].sort_values().unique()

for date in unique_dates:
    df_up_to_date = aa_df[aa_df['Release Date'] <= date]
    top_n_models = df_up_to_date.nlargest(top_n, 'MMLU price-performance')
    top_n_model_names = top_n_models['Model Name and Date'].tolist()
    ever_top_n_models.update(top_n_model_names)

ever_top_n_list = sorted(ever_top_n_models)
print(ever_top_n_list)

# Plot top-n models over time
fig = px.scatter(aa_df[aa_df['Model Name and Date'].isin(ever_top_n_list)], x='Release Date', y='MMLU price-performance', title='MMLU price-performance over time', hover_data=['Model Name and Date'])
fig.update_layout(yaxis_type='log')
fig.show()

['GPT-3 (2021-11)', 'GPT-3 (2022-09)', 'GPT-3.5 (2022-11)', 'GPT-3.5 Turbo (2023-03)', 'GPT-3.5-Turbo-2023-11 (2023-11)', 'Gemini-1.5-Flash-2024-05 (2024-05)', 'Gemini-1.5-Flash-8B (2024-10)', 'Llama 2-7B (2023-12)', 'Llama 2-7B (2024-04)', 'Llama-2-Chat-13B (2023-07)', 'Llama-3-Instruct-8B (2024-04)', 'Llama-3.1-Instruct-8B (2024-07)', 'Llama-3.2-Instruct-3B (2024-09)']


## Regression to lowest-priced models above a performance lower bound

### Examples

In [15]:
results_subdir = results_dir + 'lowest_price_models_examples/'
os.makedirs(results_subdir, exist_ok=True)

In [16]:
"""
  - Set a performance lower bound
  - Track the running best (top) model
  - At each point in time (at some resolution)
    - Filter to new models published in this time window
    - Filter to models with performance above the lower bound
    - Check if any new model is cheaper than current best
    - If so, update the current best
    - Record the current best model at this time point
"""
bench = 'MMLU'
performance_lower_bound = 80
performance_upper_bound = 90
ts = pd.date_range(start='2020-01-01', end=pd.Timestamp.today(), freq='MS')
cheapest_models = []
current_best = None

for i, t in enumerate(ts):
    # Get models published in this time window
    benchmark_df = aa_df
    if i > 0:
        prev_t = ts[i-1]
        benchmark_df = benchmark_df[(benchmark_df['Release Date'] >= prev_t) & (benchmark_df['Release Date'] < t)]
    else:
        benchmark_df = benchmark_df[benchmark_df['Release Date'] < t]
        
    # Filter for performance
    benchmark_df = benchmark_df[
        benchmark_df[bench].notna() &
        (benchmark_df[bench] >= performance_lower_bound) &
        (benchmark_df[bench] < performance_upper_bound)
    ]
    
    if not benchmark_df.empty:
        # Find cheapest new model
        new_best = benchmark_df.loc[benchmark_df['USD per 1M Tokens'].idxmin()]
        
        # Update current best if new model is cheaper (or if no current best)
        if current_best is None or new_best['USD per 1M Tokens'] < current_best['USD per 1M Tokens']:
            current_best = new_best
            cheapest_models.append(current_best)
            print(t, current_best['Model Name'], current_best[bench], f"${current_best['USD per 1M Tokens']:.2f}")
        # elif new_best['USD per 1M Tokens'] == current_best['USD per 1M Tokens']:
        #     # Choose the model with the highest performance
        #     if new_best[bench] > current_best[bench]:
        #         current_best = new_best
        #         cheapest_models.append(current_best)
        #         print(t, current_best['Model Name'], current_best[bench], f"${current_best['USD per 1M Tokens']:.2f}")


2023-04-01 00:00:00 GPT-4-0314 86.0 $37.50
2023-12-01 00:00:00 GPT-4 Turbo 87.0 $15.00
2024-06-01 00:00:00 Gemini-1.5-Pro-2024-05 86.0 $2.19
2024-08-01 00:00:00 GPT-4o-mini 82.0 $0.26
2025-01-01 00:00:00 Phi 4 85.0 $0.12


In [17]:
cheapest_models_df = pd.DataFrame(cheapest_models)
cheapest_models_df.head()

Unnamed: 0,Model Name,Release Date,USD per 1M Tokens,MMLU,GPQA Diamond,HumanEval,MATH-500,LMSys Chatbot Arena ELO,MATH 5,Tokens per Second,Prefill Latency (s),Prompt Length (tk),Model Name and Date,MMLU price-performance
5,GPT-4-0314,2023-03-14,37.5,86.0,33.0,67.0,,1186.0,,23.6,0.724,1000.0,GPT-4-0314 (2023-03),2.293333
10,GPT-4 Turbo,2023-11-06,15.0,87.0,50.0,92.0,74.0,1256.0,36.0,39.2,1.246,1000.0,GPT-4 Turbo (2023-11),5.8
28,Gemini-1.5-Pro-2024-05,2024-05-23,2.19,86.0,46.0,83.0,66.0,1260.0,41.0,64.8,0.738,1000.0,Gemini-1.5-Pro-2024-05 (2024-05),39.269406
33,GPT-4o-mini,2024-07-18,0.26,82.0,43.0,88.0,79.0,1273.0,48.0,112.2,0.626,1000.0,GPT-4o-mini (2024-07),315.384615
63,Phi 4,2024-12-13,0.1225,85.0,53.0,87.0,81.0,1204.0,65.0,,,,Phi 4 (2024-12),693.877551


In [18]:
# Fit a line to the data
cheapest_models_df['price'] = cheapest_models_df['USD per 1M Tokens']
cheapest_models_df['log_price'] = np.log10(cheapest_models_df['USD per 1M Tokens'])
cheapest_models_df['date'] = cheapest_models_df['Release Date'].map(lambda x: pd.Timestamp(x).toordinal())
exponential_model = smf.ols('log_price ~ date', data=cheapest_models_df).fit()
print(exponential_model.summary())

linear_model = smf.ols('price ~ date', data=cheapest_models_df).fit()
print(linear_model.summary())

                            OLS Regression Results                            
Dep. Variable:              log_price   R-squared:                       0.909
Model:                            OLS   Adj. R-squared:                  0.879
Method:                 Least Squares   F-statistic:                     29.98
Date:                Tue, 04 Mar 2025   Prob (F-statistic):             0.0120
Time:                        11:43:18   Log-Likelihood:               -0.91580
No. Observations:                   5   AIC:                             5.832
Df Residuals:                       3   BIC:                             5.050
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   3058.9717    558.594      5.476      0.0


omni_normtest is not valid with less than 8 observations; 5 samples were given.


omni_normtest is not valid with less than 8 observations; 5 samples were given.



In [19]:
# Calculate annual rate of decrease
annual_slope = exponential_model.params['date'] * 365  # Convert daily to annual
annual_factor = int(round(10**(-annual_slope)))  # Convert log slope to factor

# Plot the exponential trendline with the data
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=cheapest_models_df['Release Date'],
    y=10**exponential_model.predict(cheapest_models_df['date']),
    mode='lines',
    name=f'Trendline: {annual_factor}x decrease per year',
    line=dict(color='blue', dash='dash')
))
fig.add_trace(go.Scatter(
    x=cheapest_models_df['Release Date'],
    y=cheapest_models_df['USD per 1M Tokens'],
    mode='markers+text',
    name=f'Cheapest models with {performance_lower_bound}-{performance_upper_bound}% on {bench}',
    text=cheapest_models_df['Model Name'],
    textposition='bottom left',
    marker=dict(color='blue')
    # line=dict(shape='hv'),
))
other_models_df = aa_df[(aa_df[bench].notna()) & (aa_df[bench] >= performance_lower_bound) & ~(aa_df['Model Name'].isin(cheapest_models_df['Model Name']))]
fig.add_trace(go.Scatter(
    x=other_models_df['Release Date'],
    y=other_models_df['USD per 1M Tokens'],
    mode='markers+text',
    name=f'Other models with {performance_lower_bound}-{performance_upper_bound}% on {bench}',
    text=other_models_df['Model Name'],
    marker=dict(color='lightblue')
))
fig.update_layout(
    title=f'Price of the cheapest model with {performance_lower_bound}-{performance_upper_bound}% on {bench}'
)
fig.update_traces(textposition='bottom left')
fig.update_layout(yaxis_type='log')
fig.update_layout(xaxis_title='Month')
fig.update_layout(yaxis_title='Price in USD per million tokens')
# Lower the lower x limit
min_date = min(cheapest_models_df['Release Date'].min(), other_models_df['Release Date'].min())
max_date = max(cheapest_models_df['Release Date'].max(), other_models_df['Release Date'].max())
fig.update_layout(xaxis_range=[min_date - pd.Timedelta(days=90), max_date + pd.Timedelta(days=30)])
fig.update_layout(
    width=1000,
    height=600,
    font=dict(size=10),
    legend=dict(
        yanchor="top",
        y=0.25,
        xanchor="right",
        x=0.4,
        bordercolor="lightgrey",
        borderwidth=1
    )
)
if save:
    save_plot(fig, results_subdir, f'lowest_price_models_{bench}_above_{performance_lower_bound}_with_trendline')
fig.show()

### Final selected example

In [20]:
aa_df['Release Date'].unique()

<DatetimeArray>
['2021-11-20 00:00:00', '2022-08-31 00:00:00', '2022-09-01 00:00:00',
 '2022-11-30 00:00:00', '2023-03-06 00:00:00', '2023-03-14 00:00:00',
 '2023-06-13 00:00:00', '2023-07-18 00:00:00', '2023-11-06 00:00:00',
 '2023-12-01 00:00:00', '2023-12-08 00:00:00', '2024-01-04 00:00:00',
 '2024-01-25 00:00:00', '2024-02-26 00:00:00', '2024-03-04 00:00:00',
 '2024-04-14 00:00:00', '2024-04-17 00:00:00', '2024-04-18 00:00:00',
 '2024-05-10 00:00:00', '2024-05-13 00:00:00', '2024-05-23 00:00:00',
 '2024-06-20 00:00:00', '2024-06-24 00:00:00', '2024-07-18 00:00:00',
 '2024-07-23 00:00:00', '2024-07-24 00:00:00', '2024-08-06 00:00:00',
 '2024-08-12 00:00:00', '2024-08-13 00:00:00', '2024-09-11 00:00:00',
 '2024-09-12 00:00:00', '2024-09-13 00:00:00', '2024-09-14 00:00:00',
 '2024-09-24 00:00:00', '2024-10-03 00:00:00', '2024-10-22 00:00:00',
 '2024-11-18 00:00:00', '2024-11-20 00:00:00', '2024-12-13 00:00:00',
 '2024-12-17 00:00:00', '2024-12-26 00:00:00', '2025-01-20 00:00:00',
 '20

In [None]:
# selected_benchmarks = ['MMLU', 'MATH 5', 'HumanEval']
bench = 'GPQA Diamond'
threshold_model = 'GPT-4-0314'
performance_lower_bound = aa_df[aa_df['Model Name'] == threshold_model].iloc[0][bench]
performance_upper_bound = 100
print(f'\nPerformance range: {performance_lower_bound}-{performance_upper_bound}%')

cheapest_models = []
current_best = None
for i, row in aa_df.iterrows():
    if pd.isna(row[bench]):
        continue
    if (row[bench] >= performance_lower_bound) and (row[bench] < performance_upper_bound):
        if (current_best is None) or (row['USD per 1M Tokens'] < current_best['USD per 1M Tokens']):
            current_best = row
            cheapest_models.append(current_best)
            print(current_best['Release Date'], current_best['Model Name'], current_best[bench], f"${current_best['USD per 1M Tokens']:.2f}")

cheapest_models_df = pd.DataFrame(cheapest_models)

# Fit a line to the data
cheapest_models_df['price'] = cheapest_models_df['USD per 1M Tokens']
cheapest_models_df['log_price'] = np.log10(cheapest_models_df['USD per 1M Tokens'])
cheapest_models_df['date'] = cheapest_models_df['Release Date'].map(lambda x: pd.Timestamp(x).toordinal())
exponential_model = smf.ols('log_price ~ date', data=cheapest_models_df).fit()

fig = go.Figure()

all_df = aa_df[
    (aa_df[bench].notna()) &
    (aa_df[bench] >= performance_lower_bound) &
    (aa_df[bench] < performance_upper_bound)
]
fig.add_trace(go.Scatter(
    x=all_df['Release Date'],
    y=all_df['USD per 1M Tokens'],
    mode='markers',
    name=f'Other, GPT-4 level or better on {bench}',
    text=all_df['Model Name'],
    marker=dict(color='rgb(222, 222, 255)')
))

factor = cheapest_models_df['USD per 1M Tokens'].iloc[0] / cheapest_models_df['USD per 1M Tokens'].iloc[-1]
period_months = (cheapest_models_df['Release Date'].iloc[-1] - cheapest_models_df['Release Date'].iloc[0]).days / (365/12)

annual_slope = exponential_model.params['date'] * 365  # Convert daily to annual
annual_factor = int(round(10**(-annual_slope)))  # Convert log slope to factor
# Get the 90% CI
ci_90 = exponential_model.conf_int(alpha=0.1)
print('90% CI:', int(round(10**(-ci_90.loc['date'][1] * 365))), int(round(10**(-ci_90.loc['date'][0] * 365))))

# Plot the exponential trendline with the data
date_range = pd.date_range(start=cheapest_models_df['Release Date'].min(), end=cheapest_models_df['Release Date'].max(), freq='MS')
pred_df = pd.DataFrame({'date': date_range.map(lambda x: pd.Timestamp(x).toordinal())})
fig.add_trace(go.Scatter(
    x=date_range,
    y=10**exponential_model.predict(pred_df['date']),
    mode='lines+text',
    name=f'{annual_factor}x decrease per year',
    # Only show text at middle index
    text=['' if i != len(date_range)//2 else f'{annual_factor}x per year' for i in range(len(date_range))],
    textposition='middle left',
    textfont=dict(size=14),
    line=dict(color='magenta', dash='dash'),
    hoverinfo='skip',
    showlegend=False,
))

# annotations = [""] * len(cheapest_models_df['Model Name'])
# annotations[0] = cheapest_models_df['Model Name'].iloc[0]
# annotations[-1] = cheapest_models_df['Model Name'].iloc[-1]
annotations = cheapest_models_df['Model Name']

fig.add_trace(go.Scatter(
    x=cheapest_models_df['Release Date'],
    y=cheapest_models_df['USD per 1M Tokens'],
    mode='markers+text',
    name=f'Cheapest, GPT-4 level or better on {bench}',
    marker=dict(color='magenta'),
    text=annotations,
    textposition='top right',
    # marker=dict(color='blue'),
    # visible="legendonly",  # Hide this trace by default
    # legendgroup=f'{bench}_{performance_range_str}',
))
fig.update_layout(xaxis_range=[
    cheapest_models_df['Release Date'].min() - pd.Timedelta(days=30),
    cheapest_models_df['Release Date'].max() + pd.Timedelta(days=150)
])

fig.update_layout(
    # title=f'The cost to answer PhD-level science questions has fallen by {round(factor, -1):.0f}x in {period_months:.0f} months'
    title=f'Price per token to answer PhD-level science questions as well as GPT-4 has fallen by {annual_factor}x per year'
)
fig.update_layout(yaxis_type='log')
fig.update_layout(xaxis_title='Month')
fig.update_layout(yaxis_title='Price in USD per million tokens')
# fig.update_layout(xaxis_range=[aa_df['Release Date'].min() - pd.Timedelta(days=150), aa_df['Release Date'].max() + pd.Timedelta(days=30)])
fig.update_layout(
    width=1000,
    height=600,
    # font=dict(size=14),
    legend=dict(
        yanchor="top",
        y=0.14,
        xanchor="right",
        x=0.45,
        bordercolor="lightgrey",
        borderwidth=1
    )
)

if save:
    save_plot(fig, results_subdir, f'lowest_price_models_{bench}_{performance_lower_bound}_to_{performance_upper_bound}', extensions=['png', 'svg'])

fig.show()


Performance range: 33.0-100%
2023-03-14 00:00:00 GPT-4-0314 33.0 $37.50
2023-11-06 00:00:00 GPT-4 Turbo 50.0 $15.00
2024-02-26 00:00:00 Mistral-Large-2024-02 36.0 $6.00
2024-03-04 00:00:00 Claude-3-Haiku 33.0 $0.50
2024-05-10 00:00:00 Gemini-1.5-Flash-2024-05 39.0 $0.13
2024-12-13 00:00:00 Phi 4 53.0 $0.12
90% CI: 4 401


### Full analysis

In [25]:
results_subdir = results_dir + 'lowest_price_models_above_previous_frontier/'
os.makedirs(results_subdir, exist_ok=True)

performance_delta = 100

cheapest_models_dfs = []
other_models_dfs = []
results = []
for i, bench in enumerate(benchmarks):
    if i > 0:
        print('\n')
    print(f'{bench}')

    for i, frontier_model_data in top_models_df_lookup[bench].iterrows():
        # TODO: check if top_models_df_lookup identified all versions of a model
        performance_lower_bound = frontier_model_data[bench]
        if pd.isna(performance_lower_bound):
            print(f'Frontier model {frontier_model_data["Model Name"]} is missing a {bench} value - skipping')
            continue
        elif benchmark_is_mqa[bench] and performance_lower_bound < 30:
            print(f'Frontier model {frontier_model_data["Model Name"]} has a {bench} value of less than 30% on an MQA benchmark - skipping')
            continue
        if bench == 'LMSys Chatbot Arena ELO':
            performance_upper_bound = np.inf
        else:
            performance_upper_bound = min(performance_lower_bound + performance_delta, 100)
        print(f'\nPerformance range: {performance_lower_bound} ({frontier_model_data["Model Name"]} level) to {performance_upper_bound}')

        cheapest_models = []
        current_best = None
        for i, row in aa_df.iterrows():
            if pd.isna(row[bench]):
                continue
            if (row[bench] >= performance_lower_bound) and (row[bench] < performance_upper_bound):
                if (current_best is None) or (row['USD per 1M Tokens'] < current_best['USD per 1M Tokens']):
                    current_best = row
                    cheapest_models.append(current_best)
                    print(current_best['Release Date'], current_best['Model Name'], current_best[bench], f"${current_best['USD per 1M Tokens']:.2f}")

        cheapest_models_df = pd.DataFrame(cheapest_models)
        if len(cheapest_models_df) < min_num_data_points_for_regression:
            print(f'Less than {min_num_data_points_for_regression} cheapest models found - skipping')
            continue

        # Fit a line to the data
        cheapest_models_df['price'] = cheapest_models_df['USD per 1M Tokens']
        cheapest_models_df['log_price'] = np.log10(cheapest_models_df['USD per 1M Tokens'])
        cheapest_models_df['date'] = cheapest_models_df['Release Date'].map(lambda x: pd.Timestamp(x).toordinal())
        exponential_model = smf.ols('log_price ~ date', data=cheapest_models_df).fit()
        predicted_log_prices = 10**exponential_model.predict(cheapest_models_df['date'])
        cheapest_models_df['predicted_log_price'] = predicted_log_prices
        cheapest_models_df['bench'] = [bench] * len(cheapest_models_df)
        cheapest_models_df['threshold_model'] = [frontier_model_data["Model Name"]] * len(cheapest_models_df)
        cheapest_models_df['performance_range'] = [str([performance_lower_bound, performance_upper_bound])] * len(cheapest_models_df)
        cheapest_models_dfs.append(cheapest_models_df)

        # Calculate annual rate of decrease
        annual_slope = exponential_model.params['date'] * 365  # Convert daily to annual
        annual_factor = int(round(10**(-annual_slope)))  # Convert log slope to factor
        # Get the 90% CI
        ci_90 = exponential_model.conf_int(alpha=0.1)
        annual_factor_low = int(round(10**(-ci_90.loc['date'][1] * 365)))
        annual_factor_high = int(round(10**(-ci_90.loc['date'][0] * 365)))
        results.append({
            'bench': bench,
            'threshold_model': frontier_model_data["Model Name"],
            'performance_range': [performance_lower_bound, performance_upper_bound],
            'sample_size': len(cheapest_models_df),
            'start_date': cheapest_models_df['Release Date'].min(),
            'end_date': cheapest_models_df['Release Date'].max(),
            'price_reduction_factor_per_year': annual_factor,
            'price_reduction_factor_per_year_90_ci': [annual_factor_low, annual_factor_high],
            'r_squared': round(exponential_model.rsquared, 2),
        })

        # Plot the exponential trendline with the data
        fig = go.Figure()
        fig.add_trace(go.Scatter(
            x=cheapest_models_df['Release Date'],
            y=predicted_log_prices,
            mode='lines',
            name=f'Trendline: {annual_factor}x decrease per year',
            line=dict(color='blue', dash='dash')
        ))
        fig.add_trace(go.Scatter(
            x=cheapest_models_df['Release Date'],
            y=cheapest_models_df['USD per 1M Tokens'],
            mode='markers+text',
            name=f'Cheapest models with {performance_lower_bound}-{performance_upper_bound} on {bench}',
            text=cheapest_models_df['Model Name'],
            textposition='bottom left',
            marker=dict(color='blue')
            # line=dict(shape='hv'),
        ))
        other_models_df = aa_df[
            aa_df[bench].notna() &
            (aa_df[bench] >= performance_lower_bound) &
            (aa_df[bench] < performance_upper_bound) &
            ~(aa_df['Model Name'].isin(cheapest_models_df['Model Name']))
        ]
        other_models_df['bench'] = [bench] * len(other_models_df)
        other_models_df['threshold_model'] = [frontier_model_data["Model Name"]] * len(other_models_df)
        other_models_df['performance_range'] = [str([performance_lower_bound, performance_upper_bound])] * len(other_models_df)
        other_models_dfs.append(other_models_df)
        fig.add_trace(go.Scatter(
            x=other_models_df['Release Date'],
            y=other_models_df['USD per 1M Tokens'],
            mode='markers',
            name=f'Other models with {performance_lower_bound}-{performance_upper_bound} on {bench}',
            text=other_models_df['Model Name'],
            marker=dict(color='lightblue')
        ))
        fig.update_layout(
            title=f'The cheapest model with {performance_lower_bound}-{performance_upper_bound} on {bench} has become {annual_factor}x cheaper per year'
        )
        fig.update_traces(textposition='bottom left')
        fig.update_layout(yaxis_type='log')
        fig.update_layout(xaxis_title='Month')
        fig.update_layout(yaxis_title='Price in USD per million tokens')
        # Lower the lower x limit
        min_date = min(cheapest_models_df['Release Date'].min(), other_models_df['Release Date'].min())
        max_date = max(cheapest_models_df['Release Date'].max(), other_models_df['Release Date'].max())
        fig.update_layout(xaxis_range=[min_date - pd.Timedelta(days=90), max_date + pd.Timedelta(days=30)])
        fig.update_layout(
            width=1000,
            height=600,
            font=dict(size=10),
            legend=dict(
                yanchor="top",
                y=0.99,
                xanchor="right",
                x=0.99,
                bordercolor="lightgrey",
                borderwidth=1
            )
        )
        if save:
            save_plot(
                fig,
                results_subdir,
                f'lowest_price_models_{bench}_{performance_lower_bound}-{performance_upper_bound}_with_trendline',
                extensions=['png'],
            )

all_cheapest_models_df = pd.concat(cheapest_models_dfs)
all_cheapest_models_df = all_cheapest_models_df[['bench', 'threshold_model', 'performance_range', 'Model Name', 'Release Date', 'USD per 1M Tokens', 'predicted_log_price']]
all_other_models_df = pd.concat(other_models_dfs)
all_other_models_df = all_other_models_df[['bench', 'threshold_model', 'performance_range', 'Model Name', 'Release Date', 'USD per 1M Tokens']]
cheapest_model_results_df = pd.DataFrame(results)
if save:
    all_cheapest_models_df.to_csv(results_subdir + 'lowest_price_models_data.csv', index=False)
    all_other_models_df.to_csv(results_subdir + 'other_models_data.csv', index=False)
    cheapest_model_results_df.to_csv(results_subdir + 'lowest_price_models_results.csv', index=False)

MMLU

Performance range: 43.9 (GPT-3 level) to 100
2021-11-20 00:00:00 GPT-3 43.9 $60.00
2022-09-01 00:00:00 GPT-3 43.9 $20.00
2023-03-06 00:00:00 GPT-3.5 Turbo 68.0 $2.00
2023-07-18 00:00:00 Llama-2-Chat-13B 45.0 $0.56
2023-12-01 00:00:00 Llama 2-7B 45.3 $0.20
2024-04-14 00:00:00 Llama 2-7B 45.3 $0.13
2024-07-23 00:00:00 Llama-3.1-Instruct-8B 71.0 $0.10
2024-09-24 00:00:00 Llama-3.2-Instruct-3B 64.0 $0.08
2024-10-03 00:00:00 Gemini-1.5-Flash-8B 75.0 $0.07

Performance range: 64.8 (GPT-3.5 level) to 100
2022-11-30 00:00:00 GPT-3.5 64.8 $20.00
2023-03-06 00:00:00 GPT-3.5 Turbo 68.0 $2.00
2023-11-06 00:00:00 GPT-3.5-Turbo-2023-11 68.0 $0.75
2024-03-04 00:00:00 Claude-3-Haiku 71.0 $0.50
2024-05-10 00:00:00 Gemini-1.5-Flash-2024-05 79.0 $0.13
2024-07-23 00:00:00 Llama-3.1-Instruct-8B 71.0 $0.10
2024-10-03 00:00:00 Gemini-1.5-Flash-8B 75.0 $0.07

Performance range: 68.0 (GPT-3.5 Turbo level) to 100
2023-03-06 00:00:00 GPT-3.5 Turbo 68.0 $2.00
2023-11-06 00:00:00 GPT-3.5-Turbo-2023-11 68.0 $



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/


Performance range: 87.0 (GPT-4 Turbo level) to 100
2023-11-06 00:00:00 GPT-4 Turbo 87.0 $15.00
2024-05-13 00:00:00 GPT-4o-2024-05 87.0 $7.50
2024-06-20 00:00:00 Claude-3.5-Sonnet-2024-06 88.0 $6.00
2024-07-23 00:00:00 Llama-3.1-Instruct-405B 87.0 $3.50
2025-02-05 00:00:00 Gemini 2.0 Flash 88.0 $0.17

Performance range: 88.0 (Claude-3.5-Sonnet-2024-06 level) to 100
2024-06-20 00:00:00 Claude-3.5-Sonnet-2024-06 88.0 $6.00
2024-08-06 00:00:00 GPT-4o-2024-08 89.0 $4.38
2025-02-05 00:00:00 Gemini 2.0 Flash 88.0 $0.17
Less than 4 cheapest models found - skipping

Performance range: 89.0 (GPT-4o-2024-08 level) to 100
2024-08-06 00:00:00 GPT-4o-2024-08 89.0 $4.38
Less than 4 cheapest models found - skipping


GPQA Diamond
Frontier model GPT-3 is missing a GPQA Diamond value - skipping

Performance range: 33.0 (GPT-4-0314 level) to 100
2023-03-14 00:00:00 GPT-4-0314 33.0 $37.50
2023-11-06 00:00:00 GPT-4 Turbo 50.0 $15.00
2024-02-26 00:00:00 Mistral-Large-2024-02 36.0 $6.00
2024-03-04 00:00:00 



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/


Performance range: 56.0 (Claude-3.5-Sonnet-2024-06 level) to 100
2024-06-20 00:00:00 Claude-3.5-Sonnet-2024-06 56.0 $6.00
2024-09-12 00:00:00 o1-mini 59.5 $5.25
2024-09-24 00:00:00 Gemini-1.5-Pro-2024-09 61.0 $2.19
2024-12-26 00:00:00 DeepSeek-V3 57.0 $0.48
2025-02-05 00:00:00 Gemini 2.0 Flash 62.0 $0.17

Performance range: 69.7 (o1-preview level) to 100
2024-09-12 00:00:00 o1-preview 69.7 $26.25
2025-01-20 00:00:00 DeepSeek-R1 71.7 $0.96
Less than 4 cheapest models found - skipping

Performance range: 75.8 (o1 level) to 100
2024-12-17 00:00:00 o1 75.8 $26.25
Less than 4 cheapest models found - skipping


MATH-500
Frontier model GPT-3 is missing a MATH-500 value - skipping

Performance range: 44.0 (GPT-3.5-Turbo-2023-06 level) to 100
2023-06-13 00:00:00 GPT-3.5-Turbo-2023-06 44.0 $3.25
2023-11-06 00:00:00 GPT-3.5-Turbo-2023-11 44.0 $0.75
2024-05-10 00:00:00 Gemini-1.5-Flash-2024-05 55.0 $0.13
2024-07-23 00:00:00 Llama-3.1-Instruct-8B 50.0 $0.10
2024-09-24 00:00:00 Llama-3.2-Instruct-3



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/


Performance range: 80.0 (GPT-4o-2024-08 level) to 100
2024-08-06 00:00:00 GPT-4o-2024-08 80.0 $4.38
2024-09-24 00:00:00 Gemini-1.5-Pro-2024-09 88.0 $2.19
2024-09-24 00:00:00 Gemini-1.5-Flash-2024-09 83.0 $0.13
2024-12-13 00:00:00 Phi 4 81.0 $0.12

Performance range: 94.0 (o1-mini level) to 100
2024-09-12 00:00:00 o1-mini 94.0 $5.25
2025-01-20 00:00:00 DeepSeek-R1 96.0 $0.96
Less than 4 cheapest models found - skipping

Performance range: 97.0 (o1 level) to 100
2024-12-17 00:00:00 o1 97.0 $26.25
2025-01-31 00:00:00 o3-mini 97.0 $1.93
Less than 4 cheapest models found - skipping


MATH 5
Frontier model GPT-3 is missing a MATH 5 value - skipping

Performance range: 15.0 (GPT-3.5-Turbo-2023-06 level) to 100
2023-06-13 00:00:00 GPT-3.5-Turbo-2023-06 15.0 $3.25
2023-11-06 00:00:00 GPT-3.5-Turbo-2023-11 15.0 $0.75
2024-05-10 00:00:00 Gemini-1.5-Flash-2024-05 23.0 $0.13
2024-07-23 00:00:00 Llama-3.1-Instruct-8B 22.0 $0.10

Performance range: 36.0 (GPT-4 Turbo level) to 100
2023-11-06 00:00:00



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/


Performance range: 84.3 (o1-mini level) to 100
2024-09-12 00:00:00 o1-mini 84.3 $5.25
2025-01-20 00:00:00 DeepSeek-R1 93.1 $0.96
Less than 4 cheapest models found - skipping

Performance range: 94.4 (o1 level) to 100
2024-12-17 00:00:00 o1 94.4 $26.25
2025-01-31 00:00:00 o3-mini 95.2 $1.93
Less than 4 cheapest models found - skipping

Performance range: 95.2 (o3-mini level) to 100
2025-01-31 00:00:00 o3-mini 95.2 $1.93
Less than 4 cheapest models found - skipping


HumanEval
Frontier model GPT-3 is missing a HumanEval value - skipping

Performance range: 67.0 (GPT-4-0314 level) to 100
2023-03-14 00:00:00 GPT-4-0314 67.0 $37.50
2023-06-13 00:00:00 GPT-3.5-Turbo-2023-06 71.0 $3.25
2023-11-06 00:00:00 GPT-3.5-Turbo-2023-11 71.0 $0.75
2024-03-04 00:00:00 Claude-3-Haiku 77.0 $0.50
2024-06-24 00:00:00 Gemma-2-27B 76.0 $0.26
2024-07-18 00:00:00 Mistral-NeMo 71.0 $0.13
2024-07-23 00:00:00 Llama-3.1-Instruct-8B 67.0 $0.10

Performance range: 71.0 (GPT-3.5-Turbo-2023-06 level) to 100
2023-06-13



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/


Performance range: 1186.0 (GPT-4-0314 level) to inf
2023-03-14 00:00:00 GPT-4-0314 1186.0 $37.50
2023-11-06 00:00:00 GPT-4 Turbo 1256.0 $15.00
2024-03-04 00:00:00 Claude-3-Sonnet 1201.0 $6.00
2024-04-18 00:00:00 Llama-3-Instruct-70B 1206.0 $0.89
2024-05-10 00:00:00 Gemini-1.5-Flash-2024-05 1227.0 $0.13
2024-10-03 00:00:00 Gemini-1.5-Flash-8B 1211.0 $0.07

Performance range: 1256.0 (GPT-4 Turbo level) to inf
2023-11-06 00:00:00 GPT-4 Turbo 1256.0 $15.00
2024-05-13 00:00:00 GPT-4o-2024-05 1285.0 $7.50
2024-05-23 00:00:00 Gemini-1.5-Pro-2024-05 1260.0 $2.19
2024-07-18 00:00:00 GPT-4o-mini 1273.0 $0.26
2024-09-24 00:00:00 Gemini-1.5-Flash-2024-09 1271.0 $0.13

Performance range: 1285.0 (GPT-4o-2024-05 level) to inf
2024-05-13 00:00:00 GPT-4o-2024-05 1285.0 $7.50
2024-08-06 00:00:00 GPT-4o-2024-08 1337.0 $4.38
2024-09-24 00:00:00 Gemini-1.5-Pro-2024-09 1301.0 $2.19
2024-12-26 00:00:00 DeepSeek-V3 1318.0 $0.48
2025-02-05 00:00:00 Gemini 2.0 Flash 1358.0 $0.17

Performance range: 1337.0 (GPT



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [22]:
all_price_factors = cheapest_model_results_df['price_reduction_factor_per_year']
np.percentile(all_price_factors, [0, 50, 100])

array([  9.,  40., 851.])

In [23]:
# Monthly rates
np.percentile(all_price_factors**(1/12), [0, 50, 100])

array([1.20093696, 1.35976381, 1.75452999])

In [24]:
# Create figure
fig = go.Figure()

bench_aliases = {
    'MMLU': 'General knowledge',
    'GPQA Diamond': 'Scientific reasoning', 
    'HumanEval': 'Coding',
    'MATH 5': 'Math (harder)',
    'MATH-500': 'Math (easier)',
    'LMSys Chatbot Arena ELO': 'Chatbot Arena',
}

# Add violin plot
fig.add_trace(go.Violin(
    y=np.log10(all_price_factors),
    x0='All benchmarks',
    box_visible=True,
    name='Distribution',
    showlegend=False,
))

# Add points for each benchmark as scatter traces
for bench in benchmarks:
    # if bench == 'MATH-500':
    #     continue

    bench_results = cheapest_model_results_df[cheapest_model_results_df['bench'] == bench]
    price_factors = bench_results['price_reduction_factor_per_year'].dropna()
    bench_factors = np.log10(price_factors)
    bench_ranges = bench_results['threshold_model']
    
    fig.add_trace(go.Scatter(
        y=bench_factors,
        x=[bench_aliases[bench]] * len(bench_factors),
        mode='markers',
        name=bench,
        text=bench_ranges,
        textposition='middle right',
        marker=dict(size=10),
        hovertemplate='Cheapest models that are %{text} level or better',
        showlegend=False,
    ))

# Update layout
fig.update_layout(yaxis_zeroline=False)
fig.update_layout(yaxis_title='Annualized rate of decrease in price per token')
# Change y-axis labels to be powers of 10
tvs = list(range(0, 6))
fig.update_layout(yaxis=dict(tickmode='array', tickvals=tvs, ticktext=[f'{10**tickval:,.0f}x' for tickval in tvs]))
fig.update_layout(
    title='The lowest-priced LLMs above a capability threshold have become much cheaper to use',
    width=1000,
    height=600,
    font=dict(size=14),
    # add some left margin
    margin=dict(l=130),
    # Increase spacing between x values by expanding the range
    xaxis=dict(range=[-0.5, len(benchmarks) + 1.0])
)
if save:
    save_plot(fig, results_subdir, 'lowest_price_models_all_violin', extensions=['png'])
fig.show()

In [25]:
min_date_span_df = cheapest_model_results_df.copy()
start_dates = min_date_span_df['start_date']
end_dates = min_date_span_df['end_date']
date_spans = (end_dates - start_dates).dt.days
sample_sizes = min_date_span_df['sample_size']
# Correlation of date spans with np.log10(price_factors)
len(date_spans), len(np.log10(all_price_factors)), len(sample_sizes)

(22, 22, 22)

In [26]:
print(np.corrcoef(date_spans, np.log10(all_price_factors))[0,1])
print(np.corrcoef(sample_sizes, np.log10(all_price_factors))[0,1])

-0.7559166907167356
-0.582561142393052


In [27]:

# Create a scatter plot of log10(price factors) vs date spans
fig_corr = go.Figure()

fig_corr.add_trace(go.Scatter(
    x=date_spans,
    y=np.log10(all_price_factors),
    mode='markers',
    text=[f'n={s}' for s in sample_sizes],
    textposition='top center',
    marker=dict(size=10),
    hovertemplate='Date span: %{x} days<br>Log10 price reduction: %{y:.2f}',
))

# Update layout
fig_corr.update_layout(
    title='Correlation between date span and price reduction factor',
    xaxis_title='Date span of regression (days)',
    yaxis_title='Price reduction factor per year (log10)',
    width=800,
    height=500,
    font=dict(size=14),
)

# Add a trend line
z = np.polyfit(date_spans, np.log10(all_price_factors), 1)
p = np.poly1d(z)
x_trend = np.linspace(min(date_spans), max(date_spans), 100)
y_trend = p(x_trend)

fig_corr.add_trace(go.Scatter(
    x=x_trend,
    y=y_trend,
    mode='lines',
    line=dict(color='red', dash='dash'),
    name=f'Trend line (r={np.corrcoef(date_spans, np.log10(all_price_factors))[0,1]:.2f})',
))
if save:
    save_plot(fig_corr, results_subdir, 'price_factor_date_span_correlation', ['png'])
fig_corr.show()


In [28]:

# Create a scatter plot of log10(price factors) vs date spans
fig_corr = go.Figure()

fig_corr.add_trace(go.Scatter(
    x=sample_sizes,
    y=np.log10(all_price_factors),
    mode='markers',
    textposition='top center',
    marker=dict(size=10),
))

# Update layout
fig_corr.update_layout(
    title='Correlation between sample size and price reduction factor',
    xaxis_title='Sample size of regression',
    yaxis_title='Price reduction factor per year (log10)',
    width=800,
    height=500,
    font=dict(size=14),
)

# Add a trend line
z = np.polyfit(sample_sizes, np.log10(all_price_factors), 1)
p = np.poly1d(z)
x_trend = np.linspace(min(sample_sizes), max(sample_sizes), 100)
y_trend = p(x_trend)

fig_corr.add_trace(go.Scatter(
    x=x_trend,
    y=y_trend,
    mode='lines',
    line=dict(color='red', dash='dash'),
    name=f'Trend line (r={np.corrcoef(sample_sizes, np.log10(all_price_factors))[0,1]:.2f})',
))
if save:
    save_plot(fig_corr, results_subdir, 'price_factor_sample_size_correlation', ['png'])
fig_corr.show()


In [29]:
for minimum_date_span in range(0, 365*2, 10):
    print(f'Minimum date span of {minimum_date_span} days')
    min_date_span_df = cheapest_model_results_df.copy()
    start_dates = min_date_span_df['start_date']
    end_dates = min_date_span_df['end_date']
    date_spans = (end_dates - start_dates).dt.days

    min_date_span_df = min_date_span_df[date_spans >= minimum_date_span]
    start_dates = min_date_span_df['start_date']
    end_dates = min_date_span_df['end_date']
    filtered_date_spans = (end_dates - start_dates).dt.days

    filtered_price_factors = min_date_span_df['price_reduction_factor_per_year']
    print(len(filtered_date_spans), 'models remaining')
    print(np.corrcoef(filtered_date_spans, np.log10(filtered_price_factors))[0,1])
    print(np.percentile(filtered_price_factors, [0, 50, 100]))

Minimum date span of 0 days
22 models remaining
-0.7559166907167356
[  9.  40. 851.]
Minimum date span of 10 days
22 models remaining
-0.7559166907167356
[  9.  40. 851.]
Minimum date span of 20 days
22 models remaining
-0.7559166907167356
[  9.  40. 851.]
Minimum date span of 30 days
22 models remaining
-0.7559166907167356
[  9.  40. 851.]
Minimum date span of 40 days
22 models remaining
-0.7559166907167356
[  9.  40. 851.]
Minimum date span of 50 days
22 models remaining
-0.7559166907167356
[  9.  40. 851.]
Minimum date span of 60 days
22 models remaining
-0.7559166907167356
[  9.  40. 851.]
Minimum date span of 70 days
22 models remaining
-0.7559166907167356
[  9.  40. 851.]
Minimum date span of 80 days
22 models remaining
-0.7559166907167356
[  9.  40. 851.]
Minimum date span of 90 days
22 models remaining
-0.7559166907167356
[  9.  40. 851.]
Minimum date span of 100 days
22 models remaining
-0.7559166907167356
[  9.  40. 851.]
Minimum date span of 110 days
22 models remaining
-0.7


Degrees of freedom <= 0 for slice


divide by zero encountered in divide


invalid value encountered in multiply



In [30]:
minimum_date_span = 365  # days
min_date_span_df = cheapest_model_results_df.copy()
start_dates = min_date_span_df['start_date']
end_dates = min_date_span_df['end_date']
date_spans = (end_dates - start_dates).dt.days
min_date_span_df = min_date_span_df[date_spans >= minimum_date_span]
all_price_factors = min_date_span_df['price_reduction_factor_per_year']
np.percentile(all_price_factors, [0, 50, 100])

array([  9. ,  21.5, 152. ])

In [31]:
np.percentile(all_price_factors**(1/12), [0, 50, 100])

array([1.20093696, 1.29000443, 1.51991849])

In [32]:
# Create figure
fig = go.Figure()

bench_aliases = {
    'MMLU': 'General knowledge',
    'GPQA Diamond': 'Scientific reasoning', 
    'HumanEval': 'Coding',
    'MATH 5': 'Math (harder)',
    'MATH-500': 'Math (easier)',
    'LMSys Chatbot Arena ELO': 'Chatbot Arena',
}

# Add violin plot
fig.add_trace(go.Violin(
    y=np.log10(all_price_factors),
    x0='All benchmarks',
    box_visible=True,
    name='Distribution',
    showlegend=False,
))

# Add points for each benchmark as scatter traces
for bench in benchmarks:
    # if bench == 'MATH-500':
    #     continue

    bench_results = cheapest_model_results_df[cheapest_model_results_df['bench'] == bench]
    print(f'{len(bench_results)} before date span filter')

    start_dates = bench_results['start_date']
    end_dates = bench_results['end_date']
    date_spans = (end_dates - start_dates).dt.days
    bench_results = bench_results[date_spans >= minimum_date_span]
    print(f'{len(bench_results)} after date span filter')

    price_factors = bench_results['price_reduction_factor_per_year'].dropna()
    bench_factors = np.log10(price_factors)
    bench_ranges = bench_results['threshold_model']
    
    fig.add_trace(go.Scatter(
        y=bench_factors,
        x=[bench_aliases[bench]] * len(bench_factors),
        mode='markers',
        name=bench,
        text=bench_ranges,
        textposition='middle right',
        marker=dict(size=10),
        hovertemplate='Cheapest models that are %{text} level or better',
        showlegend=False,
    ))

# Update layout
fig.update_layout(yaxis_zeroline=False)
fig.update_layout(yaxis_title='Annualized rate of decrease in price per token')
# Change y-axis labels to be powers of 10
tvs = list(range(0, 6))
fig.update_layout(yaxis=dict(tickmode='array', tickvals=tvs, ticktext=[f'{10**tickval:,.0f}x' for tickval in tvs]))
fig.update_layout(
    title='The lowest-priced LLMs above a capability threshold have become much cheaper to use',
    width=1000,
    height=600,
    font=dict(size=14),
    # add some left margin
    margin=dict(l=130),
    # Increase spacing between x values by expanding the range
    xaxis=dict(range=[-0.5, len(benchmarks) + 1.0])
)
if save:
    save_plot(fig, results_subdir, f'lowest_price_models_all_violin_date_span={minimum_date_span}d', extensions=['png'])
fig.show()

5 before date span filter
5 after date span filter
4 before date span filter
2 after date span filter
3 before date span filter
2 after date span filter
3 before date span filter
2 after date span filter
3 before date span filter
3 after date span filter
4 before date span filter
2 after date span filter


In [33]:
# Bar chart of price reduction factor per year
fig = go.Figure()

# Define colors for each benchmark
benchmark_colors = {
    bench: px.colors.qualitative.Plotly[i % len(px.colors.qualitative.Plotly)]
    for i, bench in enumerate(benchmarks)
}

# Define patterns for each model to make bars distinguishable
patterns = ['', '-', '/', '\\']

# Add a bar trace for each model in each benchmark
for bench in benchmarks:
    bench_results = cheapest_model_results_df[cheapest_model_results_df['bench'] == bench]
    bench_color = benchmark_colors[bench]
    
    # Get all models and their price factors for this benchmark
    for i, (_, row) in enumerate(bench_results.iterrows()):
        if pd.notna(row['price_reduction_factor_per_year']):
            model_name = row['threshold_model']
            price_factor = row['price_reduction_factor_per_year']
            
            # Assign a pattern based on the model index
            pattern = patterns[i % len(patterns)]
            
            fig.add_trace(go.Bar(
                x=[bench_aliases[bench]],
                y=[np.log10(price_factor)],
                name=f"{model_name}",
                # text=[f"{price_factor:.1f}x - {model_name}"],
                # textposition='outside',
                # Offset bars within the same benchmark group
                offset=i * 0.15 - (len(bench_results) * 0.15 / 2),
                width=0.15,
                marker_color=bench_color,
                marker_pattern_shape=pattern,
            ))


tvs = list(range(0, 4))
fig.update_layout(
    yaxis=dict(
        tickmode='array',
        tickvals=tvs,
        ticktext=[f'{10**tickval:,.0f}x' for tickval in tvs],
        range=[0, 3.1],  # Set y-axis limits from 0 to 3 (log10 scale)
        # dtick='D10',  # Show only powers of 10
        title=None,  # Remove the side title
    ),
)
fig.update_layout(
    annotations=[
        dict(
            x=-0.08,  # Center of the plot
            y=1.08,  # Above the plot
            xref='paper',
            yref='paper',
            text='Reduction in usage price per year',
            showarrow=False,
            font=dict(size=16),
        )
    ],
)
fig.update_layout(
    title='The usage price of LLMs at a given capability level has fallen by 7x-300x per year',
    width=1200,
    height=800,
    font=dict(size=14),
    margin=dict(l=130, t=150),
    xaxis=dict(title='Benchmark',range=[-0.5, len(benchmarks) + 1.0]),
    legend_title_text="Capability threshold"
)
if save:
    save_plot(fig, results_subdir, 'lowest_price_models_all_bar', extensions=['png', 'svg'])
fig.show()

## Merge in the data from evaluation logs

In [34]:
# Saved in save_evaluation_data.ipynb
eval_log_df = pd.read_csv('data/epoch_ai_eval_data.csv')
eval_log_df.head()

Unnamed: 0,Model,Task,epochs,total_samples,input_tokens,output_tokens,total_tokens
0,Yi-34B-Chat,MATH level 5,8,10592,2222376,7308455,9530831
1,qwen2p5-72b-instruct,MATH level 5,8,10592,2243320,9708977,11952297
2,gpt-4-0613,MATH level 5,8,10592,1982888,3620454,5603342
3,o1-mini-2024-09-12_high,MATH level 5,4,5296,988900,9914854,10903754
4,gemma-2-9b-it,MATH level 5,8,10592,1989896,4551564,6541460


In [35]:
model_mapping = {
    # Claude models
    'claude-3-haiku-20240307': 'Claude-3-Haiku',
    'claude-3-opus-20240229': 'Claude-3-Opus',
    'claude-3-sonnet-20240229': 'Claude-3-Sonnet',
    'claude-3-5-sonnet-20240620': 'Claude-3.5-Sonnet-2024-06',
    'claude-3-5-sonnet-20241022': 'Claude-3.5-Sonnet-2024-10',
    'claude-2.0': 'Claude 2',
    'claude-2.1': 'Claude 2.1',
    
    # Gemini models
    'gemini-2.0-flash-001': 'Gemini 2.0 Flash',
    # 'gemini-2.0-flash-thinking-exp-01-21': 'Gemini 2.0 Flash Thinking',
    'gemini-1.5-flash-001': 'Gemini-1.5-Flash-2024-05',
    'gemini-1.5-flash-002': 'Gemini-1.5-Flash-2024-09',
    'gemini-1.5-pro-001': 'Gemini-1.5-Pro-2024-05',
    'gemini-1.5-pro-002': 'Gemini-1.5-Pro-2024-09',
    'gemini-1.0-pro-001': 'Gemini 1.0 Pro',
    
    # Gemma models
    'gemma-2-27b-it': 'Gemma-2-27B',
    'gemma-2-9b-it': 'Gemma-2-9B',
    
    # GPT models
    'gpt-3.5-turbo-0125': 'GPT-3.5-Turbo-2024-01',
    'gpt-3.5-turbo-1106': 'GPT-3.5-Turbo-2023-11',
    # 'gpt-4-0125-preview': 'GPT-4',
    'gpt-4-0613': 'GPT-4-0613',
    # 'gpt-4-1106-preview': 'GPT-4 Turbo',
    'gpt-4o-2024-05-13': 'GPT-4o-2024-05',
    'gpt-4o-2024-08-06': 'GPT-4o-2024-08',
    'gpt-4o-2024-11-20': 'GPT-4o-2024-11',
    'gpt-4o-mini-2024-07-18': 'GPT-4o-mini',
    
    # Llama models
    'Llama-2-70b-chat-hf': 'Llama 2-70B Chat',
    'Meta-Llama-3-8B-Instruct': 'Llama-3-Instruct-8B',
    'Meta-Llama-3-70B-Instruct': 'Llama-3-Instruct-70B',
    'Meta-Llama-3.1-8B-Instruct': 'Llama-3.1-Instruct-8B',
    'Meta-Llama-3.1-70B-Instruct': 'Llama-3.1-Instruct-70B',
    'Meta-Llama-3.1-405B-Instruct': 'Llama-3.1-Instruct-405B',
    # 'llama-3.1-405b-instruct-maas': 'Llama-3.1-Instruct-405B',
    # 'llama-v3p3-70b-instruct': 'Llama-3.1-Instruct-70B',
    # 'Llama-3.1-Tulu-3-70B-DPO': 'Llama-3.1-Instruct-70B',
    
    # Mistral models
    'Mistral-7B-Instruct-v0.3': 'Mistral 7B',
    'mistral-large-2402': 'Mistral-Large-2024-02',
    'mistral-large-2407': 'Mistral-Large-2-2024-06',
    'open-mistral-7b': 'Mistral 7B',
    'open-mistral-nemo-2407': 'Mistral-NeMo',
    'mistral-small-2501': 'Mistral Small 3',
    'open-mixtral-8x22b': 'Mistral-8x22',
    # 'open-mixtral-8x7b': 'Mistral 7B',
    # 'Mixtral-8x7B-Instruct-v0.1': 'Mistral 7B',
    # 'WizardLM-2-8x22B': 'Mistral-8x22',
    
    # Phi models
    'phi-4': 'Phi 4',
    
    # Anthropic o models
    'o1-2024-12-17_high': 'o1',
    'o1-2024-12-17': 'o1',
    'o1-mini-2024-09-12_high': 'o1-mini',
    'o1-mini-2024-09-12': 'o1-mini',
    'o1-preview-2024-09-12': 'o1-preview',
    'o3-mini-2025-01-31_high': 'o3-mini',
    'o3-mini-2025-01-31': 'o3-mini',
    
    # DeepSeek models
    'deepseek-r1': 'DeepSeek-R1',
    'deepseek-v3': 'DeepSeek-V3',
    'DeepSeek-V3': 'DeepSeek-V3',
    
    # Qwen models (no exact matches in Set 2, but including for completeness)
    # 'qwen2p5-72b-instruct': 'Qwen2.5-72B-Instruct',
    # 'Qwen2.5-32B-Instruct': 'Qwen2.5-32B-Instruct',
    # 'Qwen2.5-72B-Instruct': 'Qwen2.5-72B-Instruct',
    # 'Qwen1.5-72B-Chat': 'Qwen1.5-72B-Chat',
    # 'Qwen1.5-32B-Chat': 'Qwen1.5-32B-Chat',
    # 'Qwen2-72B-Instruct': 'Qwen2-72B-Instruct',
    
    # Other models without clear matches
    # 'Yi-34B-Chat': 'Yi-34B-Chat',
    # 'Yi-1.5-34B-Chat': 'Yi-1.5-34B-Chat',
    # 'dbrx-instruct': 'dbrx-instruct',
    # 'grok-2-1212': 'grok-2-1212',
    # 'Hermes-2-Theta-Llama-3-70B': 'Hermes-2-Theta-Llama-3-70B',
    # 'Eurus-2-7B-PRIME': 'Eurus-2-7B-PRIME',
    # 'ministral-8b-2410': 'ministral-8b-2410',
    # 'ministral-3b-2410': 'ministral-3b-2410'
}

In [36]:
# Keep all rows from df, even if no match in aa_df
df_temp = eval_log_df.copy()
df_temp['Model Name'] = df_temp['Model'].map(model_mapping)
merged_df = pd.merge(df_temp, aa_df, on='Model Name', how='left')

# Keep all rows from aa_df, even if no match in df
df_temp = eval_log_df.copy()
df_temp['Model Name'] = df_temp['Model'].map(model_mapping)
merged_df = pd.merge(df_temp, aa_df, on='Model Name', how='right')

merged_df

Unnamed: 0,Model,Task,epochs,total_samples,input_tokens,output_tokens,total_tokens,Model Name,Release Date,USD per 1M Tokens,...,GPQA Diamond,HumanEval,MATH-500,LMSys Chatbot Arena ELO,MATH 5,Tokens per Second,Prefill Latency (s),Prompt Length (tk),Model Name and Date,MMLU price-performance
0,,,,,,,,GPT-3,2021-11-20,60.000,...,,,,,,,,,GPT-3 (2021-11),0.731667
1,,,,,,,,GPT-3,2022-08-31,60.000,...,,,,,,,,,GPT-3 (2022-08),0.731667
2,,,,,,,,GPT-3,2022-09-01,20.000,...,,,,,,,,,GPT-3 (2022-09),2.195000
3,,,,,,,,GPT-3.5,2022-11-30,20.000,...,,,,,,,,,GPT-3.5 (2022-11),3.240000
4,,,,,,,,GPT-3.5 Turbo,2023-03-06,2.000,...,,,,1106.0,,,,,GPT-3.5 Turbo (2023-03),34.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,o1-mini-2024-09-12_high,GPQA Diamond,8.0,1584.0,429856.0,2973549.0,3403405.0,o1-mini,2025-01-31,1.925,...,59.5,97.0,94.0,1308.0,84.3,,,,o1-mini (2025-01),
115,o1-mini-2024-09-12,GPQA Diamond,16.0,3168.0,859744.0,5597033.0,6456777.0,o1-mini,2025-01-31,1.925,...,59.5,97.0,94.0,1308.0,84.3,,,,o1-mini (2025-01),
116,o1-mini-2024-09-12,MATH level 5,8.0,10592.0,1977800.0,16812577.0,18790377.0,o1-mini,2025-01-31,1.925,...,59.5,97.0,94.0,1308.0,84.3,,,,o1-mini (2025-01),
117,gemini-2.0-flash-001,MATH level 5,8.0,10592.0,1883976.0,9566695.0,11450671.0,Gemini 2.0 Flash,2025-02-05,0.175,...,62.0,90.0,93.0,1358.0,82.0,,,,Gemini 2.0 Flash (2025-02),502.857143


## Measure the decline in evaluation cost

In [37]:
eval_cost_df = merged_df.copy().dropna(subset=['Model', 'total_tokens'])
eval_cost_df

Unnamed: 0,Model,Task,epochs,total_samples,input_tokens,output_tokens,total_tokens,Model Name,Release Date,USD per 1M Tokens,...,GPQA Diamond,HumanEval,MATH-500,LMSys Chatbot Arena ELO,MATH 5,Tokens per Second,Prefill Latency (s),Prompt Length (tk),Model Name and Date,MMLU price-performance
9,gpt-3.5-turbo-1106,GPQA Diamond,16.0,3168.0,838992.0,319979.0,1158971.0,GPT-3.5-Turbo-2023-11,2023-11-06,0.7500,...,30.0,71.0,44.0,1107.0,15.0,121.5,0.598,1000.0,GPT-3.5-Turbo-2023-11 (2023-11),90.666667
10,gpt-3.5-turbo-1106,MATH level 5,8.0,10592.0,1982888.0,3495658.0,5478546.0,GPT-3.5-Turbo-2023-11,2023-11-06,0.7500,...,30.0,71.0,44.0,1107.0,15.0,121.5,0.598,1000.0,GPT-3.5-Turbo-2023-11 (2023-11),90.666667
12,Llama-2-70b-chat-hf,GPQA Diamond,16.0,3168.0,956816.0,1361586.0,2318402.0,Llama 2-70B Chat,2023-12-01,0.7625,...,26.0,,,1093.0,3.0,,,,Llama 2-70B Chat (2023-12),90.360656
13,Llama-2-70b-chat-hf,MATH level 5,8.0,10592.0,2211400.0,4559177.0,6770577.0,Llama 2-70B Chat,2023-12-01,0.7625,...,26.0,,,1093.0,3.0,,,,Llama 2-70B Chat (2023-12),90.360656
17,gpt-3.5-turbo-0125,GPQA Diamond,16.0,3168.0,838976.0,328388.0,1167364.0,GPT-3.5-Turbo-2024-01,2024-01-25,0.7500,...,30.0,71.0,44.0,1106.0,11.0,,,,GPT-3.5-Turbo-2024-01 (2024-01),90.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,o1-mini-2024-09-12_high,GPQA Diamond,8.0,1584.0,429856.0,2973549.0,3403405.0,o1-mini,2025-01-31,1.9250,...,59.5,97.0,94.0,1308.0,84.3,,,,o1-mini (2025-01),
115,o1-mini-2024-09-12,GPQA Diamond,16.0,3168.0,859744.0,5597033.0,6456777.0,o1-mini,2025-01-31,1.9250,...,59.5,97.0,94.0,1308.0,84.3,,,,o1-mini (2025-01),
116,o1-mini-2024-09-12,MATH level 5,8.0,10592.0,1977800.0,16812577.0,18790377.0,o1-mini,2025-01-31,1.9250,...,59.5,97.0,94.0,1308.0,84.3,,,,o1-mini (2025-01),
117,gemini-2.0-flash-001,MATH level 5,8.0,10592.0,1883976.0,9566695.0,11450671.0,Gemini 2.0 Flash,2025-02-05,0.1750,...,62.0,90.0,93.0,1358.0,82.0,,,,Gemini 2.0 Flash (2025-02),502.857143


In [38]:
eval_cost_df['Average cost per question (USD)'] = eval_cost_df['total_tokens'] / eval_cost_df['total_samples'] * eval_cost_df['USD per 1M Tokens'] / 1e6
eval_cost_df

Unnamed: 0,Model,Task,epochs,total_samples,input_tokens,output_tokens,total_tokens,Model Name,Release Date,USD per 1M Tokens,...,HumanEval,MATH-500,LMSys Chatbot Arena ELO,MATH 5,Tokens per Second,Prefill Latency (s),Prompt Length (tk),Model Name and Date,MMLU price-performance,Average cost per question (USD)
9,gpt-3.5-turbo-1106,GPQA Diamond,16.0,3168.0,838992.0,319979.0,1158971.0,GPT-3.5-Turbo-2023-11,2023-11-06,0.7500,...,71.0,44.0,1107.0,15.0,121.5,0.598,1000.0,GPT-3.5-Turbo-2023-11 (2023-11),90.666667,0.000274
10,gpt-3.5-turbo-1106,MATH level 5,8.0,10592.0,1982888.0,3495658.0,5478546.0,GPT-3.5-Turbo-2023-11,2023-11-06,0.7500,...,71.0,44.0,1107.0,15.0,121.5,0.598,1000.0,GPT-3.5-Turbo-2023-11 (2023-11),90.666667,0.000388
12,Llama-2-70b-chat-hf,GPQA Diamond,16.0,3168.0,956816.0,1361586.0,2318402.0,Llama 2-70B Chat,2023-12-01,0.7625,...,,,1093.0,3.0,,,,Llama 2-70B Chat (2023-12),90.360656,0.000558
13,Llama-2-70b-chat-hf,MATH level 5,8.0,10592.0,2211400.0,4559177.0,6770577.0,Llama 2-70B Chat,2023-12-01,0.7625,...,,,1093.0,3.0,,,,Llama 2-70B Chat (2023-12),90.360656,0.000487
17,gpt-3.5-turbo-0125,GPQA Diamond,16.0,3168.0,838976.0,328388.0,1167364.0,GPT-3.5-Turbo-2024-01,2024-01-25,0.7500,...,71.0,44.0,1106.0,11.0,,,,GPT-3.5-Turbo-2024-01 (2024-01),90.666667,0.000276
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,o1-mini-2024-09-12_high,GPQA Diamond,8.0,1584.0,429856.0,2973549.0,3403405.0,o1-mini,2025-01-31,1.9250,...,97.0,94.0,1308.0,84.3,,,,o1-mini (2025-01),,0.004136
115,o1-mini-2024-09-12,GPQA Diamond,16.0,3168.0,859744.0,5597033.0,6456777.0,o1-mini,2025-01-31,1.9250,...,97.0,94.0,1308.0,84.3,,,,o1-mini (2025-01),,0.003923
116,o1-mini-2024-09-12,MATH level 5,8.0,10592.0,1977800.0,16812577.0,18790377.0,o1-mini,2025-01-31,1.9250,...,97.0,94.0,1308.0,84.3,,,,o1-mini (2025-01),,0.003415
117,gemini-2.0-flash-001,MATH level 5,8.0,10592.0,1883976.0,9566695.0,11450671.0,Gemini 2.0 Flash,2025-02-05,0.1750,...,90.0,93.0,1358.0,82.0,,,,Gemini 2.0 Flash (2025-02),502.857143,0.000189


In [39]:
fig = px.scatter(
    data_frame=eval_cost_df,
    x='Release Date',
    y='Average cost per question (USD)',
    color='Task',
    hover_data=['Model Name and Date', 'Task'],
    log_y=True,
    title='Cost per Question Over Time'
)
fig.update_layout(
    xaxis_title='Release Date',
    yaxis_title='Average Cost per Question (USD, log scale)'
)
fig.show()

In [40]:
results_subdir = results_dir + 'lowest_cost_models_examples/'
os.makedirs(results_subdir, exist_ok=True)

In [41]:
cost_col = 'Average cost per question (USD)'

### Final selected example

In [42]:
eval_cost_df['Model Name'].unique()

array(['GPT-3.5-Turbo-2023-11', 'Llama 2-70B Chat',
       'GPT-3.5-Turbo-2024-01', 'Mistral-Large-2024-02', 'Claude-3-Haiku',
       'Claude-3-Sonnet', 'Claude-3-Opus', 'Mistral-8x22',
       'Llama-3-Instruct-70B', 'Llama-3-Instruct-8B',
       'Gemini-1.5-Flash-2024-05', 'GPT-4o-2024-05',
       'Gemini-1.5-Pro-2024-05', 'Claude-3.5-Sonnet-2024-06',
       'Gemma-2-9B', 'Gemma-2-27B', 'Mistral-NeMo', 'GPT-4o-mini',
       'Llama-3.1-Instruct-8B', 'Llama-3.1-Instruct-405B',
       'Llama-3.1-Instruct-70B', 'GPT-4o-2024-08', 'Claude 2',
       'Claude 2.1', 'Gemini 1.0 Pro', 'o1-mini', 'o1-preview',
       'Mistral 7B', 'Gemini-1.5-Pro-2024-09', 'Gemini-1.5-Flash-2024-09',
       'Claude-3.5-Sonnet-2024-10', 'GPT-4o-2024-11', 'Phi 4', 'o1',
       'DeepSeek-V3', 'DeepSeek-R1', 'Mistral Small 3', 'o3-mini',
       'Gemini 2.0 Flash'], dtype=object)

In [43]:
# selected_benchmarks = ['MMLU', 'MATH 5', 'HumanEval']
bench = 'GPQA Diamond'
threshold_model = 'GPT-4o-2024-05'
performance_lower_bound = eval_cost_df[eval_cost_df['Model Name'] == threshold_model].iloc[0][bench]
performance_upper_bound = 100
print(f'\nPerformance range: {performance_lower_bound}-{performance_upper_bound}%')
# TODO: Iterate through model release dates rather than fixed months
ts = pd.date_range(start='2020-01-01', end=pd.Timestamp.today(), freq='MS')
cheapest_models = []
current_best = None

for i, t in enumerate(ts):
    # Get models published in this time window
    benchmark_df = eval_cost_df
    benchmark_df = benchmark_df[benchmark_df['Task'] == bench]
    if i > 0:
        prev_t = ts[i-1]
        benchmark_df = benchmark_df[(benchmark_df['Release Date'] >= prev_t) & (benchmark_df['Release Date'] < t)]
    else:
        benchmark_df = benchmark_df[benchmark_df['Release Date'] < t]
        
    # Filter for performance
    benchmark_df = benchmark_df[
        benchmark_df[bench].notna() &
        (benchmark_df[bench] >= performance_lower_bound) &
        (benchmark_df[bench] < performance_upper_bound)
    ]
    
    if not benchmark_df.empty:
        # Find cheapest new model
        new_best = benchmark_df.loc[benchmark_df[cost_col].idxmin()]
        
        # Update current best if new model is cheaper (or if no current best)
        if current_best is None or new_best[cost_col] < current_best[cost_col]:
            current_best = new_best
            cheapest_models.append(current_best)
            print(t, current_best['Model Name'], current_best[bench], f"${current_best[cost_col]:.2f}")

cheapest_models_df = pd.DataFrame(cheapest_models)

# Fit a line to the data
cheapest_models_df['price'] = cheapest_models_df[cost_col]
cheapest_models_df['log_price'] = np.log10(cheapest_models_df[cost_col])
cheapest_models_df['date'] = cheapest_models_df['Release Date'].map(lambda x: pd.Timestamp(x).toordinal())
exponential_model = smf.ols('log_price ~ date', data=cheapest_models_df).fit()

fig = go.Figure()

all_df = eval_cost_df[
    (eval_cost_df['Task'] == bench) &
    (eval_cost_df[bench].notna()) &
    (eval_cost_df[bench] >= performance_lower_bound) &
    (eval_cost_df[bench] < performance_upper_bound)
]
fig.add_trace(go.Scatter(
    x=all_df['Release Date'],
    y=all_df[cost_col],
    mode='markers',
    name=f'Other, GPT-4 level or better on {bench}',
    text=all_df['Model Name'],
    marker=dict(color='rgb(222, 222, 255)')
))

factor = cheapest_models_df[cost_col].iloc[0] / cheapest_models_df[cost_col].iloc[-1]
period_months = (cheapest_models_df['Release Date'].iloc[-1] - cheapest_models_df['Release Date'].iloc[0]).days / (365/12)

annual_slope = exponential_model.params['date'] * 365  # Convert daily to annual
annual_factor = int(round(10**(-annual_slope)))  # Convert log slope to factor
# Get the 90% CI
ci_90 = exponential_model.conf_int(alpha=0.1)
print(int(round(10**(-ci_90.loc['date'][1] * 365))), int(round(10**(-ci_90.loc['date'][0] * 365))))

# Plot the exponential trendline with the data
date_range = pd.date_range(start=cheapest_models_df['Release Date'].min(), end=cheapest_models_df['Release Date'].max(), freq='MS')
pred_df = pd.DataFrame({'date': date_range.map(lambda x: pd.Timestamp(x).toordinal())})
fig.add_trace(go.Scatter(
    x=date_range,
    y=10**exponential_model.predict(pred_df['date']),
    mode='lines+text',
    name=f'{annual_factor}x decrease per year',
    # Only show text at middle index
    text=['' if i != len(date_range)//2 else f'{annual_factor}x per year' for i in range(len(date_range))],
    textposition='middle left',
    textfont=dict(size=14),
    line=dict(color='magenta', dash='dash'),
    hoverinfo='skip',
    showlegend=False,
))

# annotations = [""] * len(cheapest_models_df['Model Name'])
# annotations[0] = cheapest_models_df['Model Name'].iloc[0]
# annotations[-1] = cheapest_models_df['Model Name'].iloc[-1]
annotations = cheapest_models_df['Model Name']

fig.add_trace(go.Scatter(
    x=cheapest_models_df['Release Date'],
    y=cheapest_models_df[cost_col],
    mode='markers+text',
    name=f'Cheapest, GPT-4 level or better on {bench}',
    marker=dict(color='magenta'),
    text=annotations,
    textposition='top right',
    # marker=dict(color='blue'),
    # visible="legendonly",  # Hide this trace by default
    # legendgroup=f'{bench}_{performance_range_str}',
))
fig.update_layout(xaxis_range=[
    cheapest_models_df['Release Date'].min() - pd.Timedelta(days=30),
    cheapest_models_df['Release Date'].max() + pd.Timedelta(days=150)
])

fig.update_layout(
    # title=f'The cost to answer PhD-level science questions has fallen by {round(factor, -1):.0f}x in {period_months:.0f} months'
    title=f'The cost to answer PhD-level science questions as well as GPT-4 has fallen by {annual_factor}x per year'
)
fig.update_layout(yaxis_type='log')
fig.update_layout(xaxis_title='Month')
fig.update_layout(yaxis_title='Avg. cost per benchmark question (USD, log scale)')
# fig.update_layout(xaxis_range=[eval_cost_df['Release Date'].min() - pd.Timedelta(days=150), eval_cost_df['Release Date'].max() + pd.Timedelta(days=30)])
fig.update_layout(
    width=1000,
    height=600,
    # font=dict(size=14),
    legend=dict(
        yanchor="top",
        y=0.14,
        xanchor="right",
        x=0.45,
        bordercolor="lightgrey",
        borderwidth=1
    )
)

if save:
    save_plot(fig, results_subdir, f'lowest_cost_models_{bench}_{performance_lower_bound}_to_{performance_upper_bound}', extensions=['png', 'svg'])

fig.show()


Performance range: 53.0-100%
2024-06-01 00:00:00 GPT-4o-2024-05 53.0 $0.01
2024-07-01 00:00:00 Claude-3.5-Sonnet-2024-06 56.0 $0.00
2024-10-01 00:00:00 Gemini-1.5-Pro-2024-09 61.0 $0.00
2025-01-01 00:00:00 Phi 4 53.0 $0.00
14 37062


### Full analysis

In [44]:
results_subdir = results_dir + 'lowest_cost_models_above_previous_frontier/'
os.makedirs(results_subdir, exist_ok=True)

performance_delta = 100
results = []
for i, bench in enumerate(benchmarks):    
    if i > 0:
        print('\n')
    print(f'{bench}')

    if bench == 'MATH 5':
        task = 'MATH level 5'
    else:
        task = bench
    if len(eval_cost_df[eval_cost_df['Task'] == task]) == 0:
        print(f'No eval data for {bench} - skipping')
        continue

    for i, frontier_model_data in top_models_df_lookup[bench].iterrows():
        performance_lower_bound = frontier_model_data[bench]
        if pd.isna(performance_lower_bound):
            print(f'Frontier model {frontier_model_data["Model Name"]} is missing a {bench} value - skipping')
            continue
        elif benchmark_is_mqa[bench] and performance_lower_bound < 30:
            print(f'Frontier model {frontier_model_data["Model Name"]} has a {bench} value of less than 30% on an MQA benchmark - skipping')
            continue
        if bench == 'LMSys Chatbot Arena ELO':
            performance_upper_bound = np.inf
        else:
            performance_upper_bound = min(performance_lower_bound + performance_delta, 100)
        print(f'\nPerformance range: {performance_lower_bound} ({frontier_model_data["Model Name"]} level) to {performance_upper_bound}')

        ts = pd.date_range(start='2020-01-01', end=pd.Timestamp.today(), freq='MS')
        cheapest_models = []
        current_best = None
        for i, t in enumerate(ts):
            # Get models published in this time window
            benchmark_df = eval_cost_df
            benchmark_df = benchmark_df[benchmark_df['Task'] == task]
            if i > 0:
                prev_t = ts[i-1]
                benchmark_df = benchmark_df[(benchmark_df['Release Date'] >= prev_t) & (benchmark_df['Release Date'] < t)]
            else:
                benchmark_df = benchmark_df[benchmark_df['Release Date'] < t]
                
            # Filter for performance
            benchmark_df = benchmark_df[
                benchmark_df[bench].notna() &
                (benchmark_df[bench] >= performance_lower_bound) &
                (benchmark_df[bench] < performance_upper_bound)
            ]
            
            if not benchmark_df.empty:
                # Find cheapest new model
                new_best = benchmark_df.loc[benchmark_df['Average cost per question (USD)'].idxmin()]
                
                # Update current best if new model is cheaper (or if no current best)
                if current_best is None or new_best['Average cost per question (USD)'] < current_best['Average cost per question (USD)']:
                    current_best = new_best
                    cheapest_models.append(current_best)
                    print(t, current_best['Model Name'], current_best[bench], f"${current_best['Average cost per question (USD)']:.6f}")

        cheapest_models_df = pd.DataFrame(cheapest_models)
        if len(cheapest_models_df) < min_num_data_points_for_regression:
            print(f'Less than {min_num_data_points_for_regression} cheapest models found - skipping')
            continue

        # Fit a line to the data
        cheapest_models_df['cost'] = cheapest_models_df['Average cost per question (USD)']
        cheapest_models_df['log_cost'] = np.log10(cheapest_models_df['Average cost per question (USD)'])
        cheapest_models_df['date'] = cheapest_models_df['Release Date'].map(lambda x: pd.Timestamp(x).toordinal())
        exponential_model = smf.ols('log_cost ~ date', data=cheapest_models_df).fit()

        # Calculate annual rate of decrease
        annual_slope = exponential_model.params['date'] * 365  # Convert daily to annual
        annual_factor = int(round(10**(-annual_slope)))  # Convert log slope to factor
        # Get the 90% CI
        ci_90 = exponential_model.conf_int(alpha=0.1)
        annual_factor_low = int(round(10**(-ci_90.loc['date'][1] * 365)))
        annual_factor_high = int(round(10**(-ci_90.loc['date'][0] * 365)))
        results.append({
            'bench': bench,
            'threshold_model': frontier_model_data["Model Name"],
            'performance_range': [performance_lower_bound, performance_upper_bound],
            'sample_size': len(cheapest_models_df),
            'start_date': cheapest_models_df['Release Date'].min(),
            'end_date': cheapest_models_df['Release Date'].max(),
            'cost_reduction_factor_per_year': annual_factor,
            'cost_reduction_factor_per_year_90_ci': [annual_factor_low, annual_factor_high],
            'r_squared': round(exponential_model.rsquared, 2),
        })

        # Plot the exponential trendline with the data
        fig = go.Figure()
        fig.add_trace(go.Scatter(
            x=cheapest_models_df['Release Date'],
            y=10**exponential_model.predict(cheapest_models_df['date']),
            mode='lines',
            name=f'Trendline: {annual_factor}x decrease per year',
            line=dict(color='blue', dash='dash')
        ))
        fig.add_trace(go.Scatter(
            x=cheapest_models_df['Release Date'],
            y=cheapest_models_df['Average cost per question (USD)'],
            mode='markers+text',
            name=f'Cheapest models with {performance_lower_bound}-{performance_upper_bound} on {bench}',
            text=cheapest_models_df['Model Name'],
            textposition='bottom left',
            marker=dict(color='blue')
            # line=dict(shape='hv'),
        ))
        other_models_df = eval_cost_df[
            (eval_cost_df['Task'] == task) &
            (eval_cost_df[bench].notna()) &
            (eval_cost_df[bench] >= performance_lower_bound) &
            (eval_cost_df[bench] < performance_upper_bound) &
            ~(eval_cost_df['Model Name'].isin(cheapest_models_df['Model Name']))
        ]
        fig.add_trace(go.Scatter(
            x=other_models_df['Release Date'],
            y=other_models_df['Average cost per question (USD)'],
            mode='markers',
            name=f'Other models with {performance_lower_bound}-{performance_upper_bound} on {bench}',
            text=other_models_df['Model Name'],
            marker=dict(color='lightblue')
        ))
        fig.update_layout(
            title=f'The cost to answer {bench} questions as well as {frontier_model_data["Model Name"]} has fallen by {annual_factor}x per year'
        )
        fig.update_traces(textposition='bottom left')
        fig.update_layout(yaxis_type='log')
        fig.update_layout(xaxis_title='Month')
        fig.update_layout(yaxis_title='Avg. cost per question (USD, log scale)')
        # Lower the lower x limit
        min_date = min(cheapest_models_df['Release Date'].min(), other_models_df['Release Date'].min())
        max_date = max(cheapest_models_df['Release Date'].max(), other_models_df['Release Date'].max())
        fig.update_layout(xaxis_range=[min_date - pd.Timedelta(days=90), max_date + pd.Timedelta(days=30)])
        fig.update_layout(
            width=1000,
            height=600,
            font=dict(size=10),
            legend=dict(
                yanchor="top",
                y=0.99,
                xanchor="right",
                x=0.99,
                bordercolor="lightgrey",
                borderwidth=1
            )
        )
        if save:
            save_plot(
                fig,
                results_subdir,
                f'lowest_cost_models_{bench}_{performance_lower_bound}-{performance_upper_bound}_with_trendline',
                extensions=['png'],
            )

cheapest_model_results_df = pd.DataFrame(results)
cheapest_model_results_df.to_csv(results_subdir + 'cheapest_model_results.csv', index=False)

# Create a summary DataFrame
summary_data = []
for bench in benchmarks:
    if bench == 'LMSys Chatbot Arena ELO':
        continue
    bench_results = cheapest_model_results_df[cheapest_model_results_df['bench'] == bench]
    if len(bench_results) == 0:
        continue
    # Get all performance lower bounds used for this benchmark
    perf_bounds = list(bench_results['performance_range'])
    
    # Calculate geometric mean of price reduction factors
    cost_factors = bench_results['cost_reduction_factor_per_year'].dropna()
    geomean = np.exp(np.mean(np.log(cost_factors))) if len(cost_factors) > 0 else np.nan
    
    # Get range of price reduction factors
    factor_range = [cost_factors.min(), cost_factors.max()] if len(cost_factors) > 0 else []
    
    summary_data.append({
        'bench': bench,
        'performance_range': perf_bounds,
        'cost_reduction_factor_per_year_geomean': round(geomean),
        'cost_reduction_factor_per_year_range': factor_range
    })

cheapest_model_summary_df = pd.DataFrame(summary_data)
cheapest_model_summary_df.to_csv(results_subdir + 'cheapest_model_summary.csv', index=False)

MMLU
No eval data for MMLU - skipping


GPQA Diamond
Frontier model GPT-3 is missing a GPQA Diamond value - skipping

Performance range: 33.0 (GPT-4-0314 level) to 100
2024-03-01 00:00:00 Mistral-Large-2024-02 36.0 $0.003942
2024-04-01 00:00:00 Claude-3-Haiku 33.0 $0.000320
2024-06-01 00:00:00 Gemini-1.5-Flash-2024-05 39.0 $0.000080
Less than 4 cheapest models found - skipping

Performance range: 50.0 (GPT-4 Turbo level) to 100
2024-04-01 00:00:00 Claude-3-Opus 50.0 $0.021734
2024-06-01 00:00:00 GPT-4o-2024-05 53.0 $0.006448
2024-07-01 00:00:00 Claude-3.5-Sonnet-2024-06 56.0 $0.004296
2024-08-01 00:00:00 Llama-3.1-Instruct-405B 50.0 $0.003208
2024-10-01 00:00:00 Gemini-1.5-Pro-2024-09 61.0 $0.001513
2025-01-01 00:00:00 Phi 4 53.0 $0.000114

Performance range: 53.0 (GPT-4o-2024-05 level) to 100
2024-06-01 00:00:00 GPT-4o-2024-05 53.0 $0.006448
2024-07-01 00:00:00 Claude-3.5-Sonnet-2024-06 56.0 $0.004296
2024-10-01 00:00:00 Gemini-1.5-Pro-2024-09 61.0 $0.001513
2025-01-01 00:00:00 Phi 4 