In [1]:
import numpy as np
import os
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
import statsmodels.formula.api as smf

from data import load_pcd_df
from plotting import save_plot

# Make "plotly_white" the default Plotly template for figures created after this point.
pio.templates.default = "plotly_white"

In [2]:
# Root folder for everything this notebook writes; created up front so later
# cells can append sub-folders to it.
results_dir = 'results/2025-02-12_performance_range/'
os.makedirs(name=results_dir, exist_ok=True)

In [3]:
# Notebook-wide settings.
# save: when True, the plotting cells also write their figures under results_dir.
save = True

# Benchmark score columns analysed below, in reporting order.
benchmarks = ['MMLU', 'GPQA Diamond', 'MATH 5', 'MATH-500', 'HumanEval']

# Per-benchmark flag; flagged benchmarks start their score ranges at 30 instead of 10.
# NOTE(review): "mqa" presumably means multiple-choice QA (random-guess floor) — confirm.
benchmark_is_mqa = {
    'MMLU': True,
    'GPQA Diamond': True,
    'MATH 5': False,
    'MATH-500': False,
    'HumanEval': False,
}

# A trend is only fitted when at least this many record-low-priced models exist.
min_num_data_points_for_regression = 4

# Using Artificial Analysis data

## Load Artificial Analysis dataset

In [4]:
# Read the Artificial Analysis table (with the added 'MATH 5' column) and display it.
aa_df = pd.read_csv(filepath_or_buffer='data/aa_data_with_math5.csv')
aa_df

Unnamed: 0,Model Name,Release Date,USD per 1M Tokens,Tokens per Second,Prefill Latency (s),Prompt Length (tk),MMLU,GPQA Diamond,MATH 5,MATH-500,HumanEval,LMSys Chatbot Arena ELO
0,Claude-3-Haiku,2024-03,0.5,122.7,0.467,1000,71,33,13.0,39.0,77.0,1179.0
1,Claude-3-Opus,2024-03,30.0,26.5,1.984,1000,84,50,34.0,64.0,83.0,1248.0
2,Claude-3-Sonnet,2024-03,6.0,61.8,0.789,1000,77,37,16.0,41.0,71.0,1201.0
3,Claude-3.5-Haiku,2024-06,1.6,64.2,0.768,1000,81,37,,67.0,87.0,
4,Claude-3.5-Sonnet-2024-06,2024-06,6.0,55.9,0.906,1000,88,56,46.0,71.0,90.0,1268.0
5,Claude-3.5-Sonnet-2024-10,2024-10,6.0,55.2,0.907,1000,89,58,53.0,76.0,96.0,1282.0
6,Gemini-1.5-Flash-2024-05,2024-05,0.13,298.4,0.307,1000,79,39,23.0,55.0,,1227.0
7,Gemini-1.5-Flash-2024-09,2024-09,0.13,190.5,0.348,1000,75,45,58.0,83.0,83.0,1271.0
8,Gemini-1.5-Flash-8B,2024-10,0.07,285.2,0.335,1000,75,30,,70.0,,1211.0
9,Gemini-1.5-Pro-2024-05,2024-05,2.19,64.8,0.738,1000,86,46,41.0,66.0,,1260.0


In [5]:
# 'Release Date' arrives as 'YYYY-MM' text; strip stray whitespace, then parse it
# into datetimes.
aa_release_text = aa_df['Release Date'].str.strip()
aa_df['Release Date'] = pd.to_datetime(aa_release_text, format='%Y-%m')

## Load Epoch AI price dataset

In [6]:
# Read the Epoch AI price rows that are not already in the Artificial Analysis
# table; their 'Release Date' is 'YYYY-MM-DD' text and is parsed to datetimes.
api_price_df = pd.read_csv('data/epoch_ai_price_data_not_in_aa_with_benchmarks.csv')
api_release_text = api_price_df['Release Date'].str.strip()
api_price_df['Release Date'] = pd.to_datetime(api_release_text, format='%Y-%m-%d')
api_price_df

Unnamed: 0,Model Name,Release Date,USD per 1M Tokens,Tokens per Second,Prefill Latency (s),Prompt Length (tk),MMLU,GPQA Diamond,MATH 5,MATH-500,HumanEval,LMSys Chatbot Arena ELO
0,Claude 2,2024-08-12,12.0,,,,78.5,35.0,10.0,,,
1,Claude 2.1,2024-08-12,12.0,,,,,36.0,11.0,,,
2,Claude Instant,2024-08-12,1.2,,,,,,,,,
3,Cohere Command,2024-08-13,1.625,,,,,,,,,
4,Cohere Command Light,2024-08-13,0.375,,,,,,,,,
5,Command R,2024-08-13,0.75,,,,,,,,,
6,Command R+,2024-08-13,6.0,,,,75.7,,,,,
7,Command R+,2024-09-13,4.375,,,,75.7,,,,,
8,DeepSeek-Coder-V2 236B,2024-09-11,0.175,,,,79.2,,,,,
9,DeepSeek-R1,2025-01-20,0.96,,,,,71.7,93.1,,,


In [7]:
# Stack the two tables, order rows by release date and renumber the index 0..n-1.
aa_df = (
    pd.concat([aa_df, api_price_df])
    .sort_values(by='Release Date')
    .reset_index(drop=True)
)
aa_df

Unnamed: 0,Model Name,Release Date,USD per 1M Tokens,Tokens per Second,Prefill Latency (s),Prompt Length (tk),MMLU,GPQA Diamond,MATH 5,MATH-500,HumanEval,LMSys Chatbot Arena ELO
0,GPT-3 175B (davinci),2021-11-20,60.0000,,,,43.9,,,,,
1,GPT-3 175B (davinci),2022-08-31,60.0000,,,,43.9,,,,,
2,GPT-3 175B (davinci),2022-09-01,20.0000,,,,43.9,,,,,
3,GPT-3.5,2022-11-30,20.0000,,,,64.8,,,,,
4,GPT-4,2023-03-01,37.5000,23.6,0.724,1000.0,86.0,33.0,23.0,21.0,67.0,1186.0
...,...,...,...,...,...,...,...,...,...,...,...,...
63,o1,2024-12-17,26.2500,,,,,75.8,94.4,,,
64,DeepSeek-V3,2024-12-26,0.4775,,,,,57.0,65.0,,,
65,DeepSeek-R1,2025-01-20,0.9600,,,,,71.7,93.1,,,
66,o1-mini,2025-01-31,1.9250,,,,,59.5,84.3,,,


In [8]:
# Shorten every 'GPT-3 175B (davinci)' entry to 'GPT-3'.
is_davinci = aa_df['Model Name'].eq('GPT-3 175B (davinci)')
aa_df.loc[is_davinci, 'Model Name'] = 'GPT-3'
aa_df


Unnamed: 0,Model Name,Release Date,USD per 1M Tokens,Tokens per Second,Prefill Latency (s),Prompt Length (tk),MMLU,GPQA Diamond,MATH 5,MATH-500,HumanEval,LMSys Chatbot Arena ELO
0,GPT-3,2021-11-20,60.0000,,,,43.9,,,,,
1,GPT-3,2022-08-31,60.0000,,,,43.9,,,,,
2,GPT-3,2022-09-01,20.0000,,,,43.9,,,,,
3,GPT-3.5,2022-11-30,20.0000,,,,64.8,,,,,
4,GPT-4,2023-03-01,37.5000,23.6,0.724,1000.0,86.0,33.0,23.0,21.0,67.0,1186.0
...,...,...,...,...,...,...,...,...,...,...,...,...
63,o1,2024-12-17,26.2500,,,,,75.8,94.4,,,
64,DeepSeek-V3,2024-12-26,0.4775,,,,,57.0,65.0,,,
65,DeepSeek-R1,2025-01-20,0.9600,,,,,71.7,93.1,,,
66,o1-mini,2025-01-31,1.9250,,,,,59.5,84.3,,,


## Explore the data

In [9]:
# Scatter of MMLU score (x) against price (y, log-scaled); hovering shows the model name.
fig = px.scatter(
    aa_df,
    x='MMLU',
    y='USD per 1M Tokens',
    hover_data=['Model Name'],
    title='MMLU vs. USD per 1M Tokens',
)
fig.update_layout(yaxis=dict(type='log'))
fig.show()


In [10]:
# MMLU score divided by price: score points per USD per million tokens.
aa_df['MMLU price-performance'] = aa_df['MMLU'].div(aa_df['USD per 1M Tokens'])

# Scatter of that ratio against release date, on a log-scaled y axis.
fig = px.scatter(
    aa_df,
    x='Release Date',
    y='MMLU price-performance',
    title='MMLU price-performance over time',
    hover_data=['Model Name'],
)
fig.update_layout(yaxis=dict(type='log'))
fig.show()


In [11]:
# Label of the form 'Name (YYYY-MM)'; the same model name can appear with several
# release dates, so the month is appended to tell those rows apart.
release_month = aa_df['Release Date'].dt.strftime('%Y-%m')
aa_df['Model Name and Date'] = aa_df['Model Name'] + ' (' + release_month + ')'
aa_df['Model Name and Date']

0           GPT-3 (2021-11)
1           GPT-3 (2022-08)
2           GPT-3 (2022-09)
3         GPT-3.5 (2022-11)
4           GPT-4 (2023-03)
              ...          
63             o1 (2024-12)
64    DeepSeek-V3 (2024-12)
65    DeepSeek-R1 (2025-01)
66        o1-mini (2025-01)
67        o3-mini (2025-01)
Name: Model Name and Date, Length: 68, dtype: object

In [12]:
# Collect the 'Model Name and Date' labels of every model that was, at some release
# date, among the top-n by 'MMLU price-performance' of all models released so far.
top_n = 1
aa_df = aa_df.sort_values(by='Release Date')

unique_dates = aa_df['Release Date'].sort_values().unique()
ever_top_n_models = set()
for cutoff_date in unique_dates:
    released_so_far = aa_df[aa_df['Release Date'] <= cutoff_date]
    leaders = released_so_far.nlargest(top_n, 'MMLU price-performance')
    ever_top_n_models.update(leaders['Model Name and Date'].tolist())

ever_top_n_list = sorted(ever_top_n_models)
print(ever_top_n_list)

# Plot only those models over time, on a log-scaled y axis.
was_ever_top_n = aa_df['Model Name and Date'].isin(ever_top_n_list)
fig = px.scatter(
    aa_df[was_ever_top_n],
    x='Release Date',
    y='MMLU price-performance',
    title='MMLU price-performance over time',
    hover_data=['Model Name and Date'],
)
fig.update_layout(yaxis=dict(type='log'))
fig.show()

['GPT-3 (2021-11)', 'GPT-3 (2022-09)', 'GPT-3.5 (2022-11)', 'GPT-3.5 Turbo (2023-03)', 'GPT-3.5 Turbo (2023-11)', 'Gemini-1.5-Flash-2024-05 (2024-05)', 'Gemini-1.5-Flash-8B (2024-10)', 'Llama 2-7B (2023-12)', 'Llama-2-Chat-13B (2023-07)', 'Llama-3-Instruct-8B (2024-04)', 'Llama-3.1-Instruct-8B (2024-07)', 'Llama-3.2-Instruct-3B (2024-09)']


## Try fitting a regression to the lowest-priced models within a performance range

In [13]:
# Procedure:
#   - Fix a score range (performance_lower_bound, performance_upper_bound] on one benchmark.
#   - Walk forward through month-start window edges, tracking the running cheapest model.
#   - In each window, keep only models released in that window whose score is in range.
#   - If the cheapest of those is strictly cheaper than the running best, it becomes
#     the new running best and is recorded (and printed).
bench = 'GPQA Diamond'
performance_lower_bound = 50
performance_upper_bound = 60
# NOTE(review): the last window ends at 2025-01-01 (exclusive), so models released on
# or after that date are never candidates here — confirm this cutoff is intended.
ts = pd.date_range(start='2020-01-01', end='2025-01-01', freq='MS')

# The score filter does not depend on the time window, so build it once.
in_score_range = (
    aa_df[bench].notna()
    & (aa_df[bench] > performance_lower_bound)
    & (aa_df[bench] <= performance_upper_bound)
)

cheapest_models = []
current_best = None
for window_idx, t in enumerate(ts):
    # Window is [previous edge, t); the first window is everything before ts[0].
    in_window = aa_df['Release Date'] < t
    if window_idx > 0:
        in_window = (aa_df['Release Date'] >= ts[window_idx - 1]) & in_window

    candidates = aa_df[in_window & in_score_range]
    if candidates.empty:
        continue

    # Cheapest in-range model released in this window.
    new_best = candidates.loc[candidates['USD per 1M Tokens'].idxmin()]

    # Record it only when it strictly undercuts the running best (or is the first).
    if current_best is None or new_best['USD per 1M Tokens'] < current_best['USD per 1M Tokens']:
        current_best = new_best
        cheapest_models.append(current_best)
        print(t, current_best['Model Name'], current_best[bench], f"${current_best['USD per 1M Tokens']:.2f}")


2024-06-01 00:00:00 GPT-4o-2024-05 51.0 $7.50
2024-07-01 00:00:00 Claude-3.5-Sonnet-2024-06 56.0 $6.00
2024-09-01 00:00:00 GPT-4o-2024-08 51.0 $4.38
2025-01-01 00:00:00 DeepSeek-V3 57.0 $0.48


In [14]:
# One row per recorded record-low-priced model, in the order they were recorded.
cheapest_models_df = pd.DataFrame(data=cheapest_models)
cheapest_models_df.head()

Unnamed: 0,Model Name,Release Date,USD per 1M Tokens,Tokens per Second,Prefill Latency (s),Prompt Length (tk),MMLU,GPQA Diamond,MATH 5,MATH-500,HumanEval,LMSys Chatbot Arena ELO,MMLU price-performance,Model Name and Date
28,GPT-4o-2024-05,2024-05-01,7.5,86.4,0.687,1000.0,87.0,51.0,48.0,79.0,93.0,1285.0,11.6,GPT-4o-2024-05 (2024-05)
33,Claude-3.5-Sonnet-2024-06,2024-06-01,6.0,55.9,0.906,1000.0,88.0,56.0,46.0,71.0,90.0,1268.0,14.666667,Claude-3.5-Sonnet-2024-06 (2024-06)
39,GPT-4o-2024-08,2024-08-01,4.38,81.0,0.673,1000.0,89.0,51.0,47.0,80.0,93.0,1337.0,20.319635,GPT-4o-2024-08 (2024-08)
64,DeepSeek-V3,2024-12-26,0.4775,,,,,57.0,65.0,,,,,DeepSeek-V3 (2024-12)


In [15]:
# Fit a line to the data
cheapest_models_df['price'] = cheapest_models_df['USD per 1M Tokens']
cheapest_models_df['log_price'] = np.log10(cheapest_models_df['USD per 1M Tokens'])
cheapest_models_df['date'] = cheapest_models_df['Release Date'].map(lambda x: pd.Timestamp(x).toordinal())
exponential_model = smf.ols('log_price ~ date', data=cheapest_models_df).fit()
print(exponential_model.summary())

linear_model = smf.ols('price ~ date', data=cheapest_models_df).fit()
print(linear_model.summary())

                            OLS Regression Results                            
Dep. Variable:              log_price   R-squared:                       0.962
Model:                            OLS   Adj. R-squared:                  0.944
Method:                 Least Squares   F-statistic:                     51.27
Date:                Wed, 12 Feb 2025   Prob (F-statistic):             0.0190
Time:                        16:50:41   Log-Likelihood:                 3.8456
No. Observations:                   4   AIC:                            -3.691
Df Residuals:                       2   BIC:                            -4.919
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   3767.6983    526.123      7.161      0.0


omni_normtest is not valid with less than 8 observations; 4 samples were given.


omni_normtest is not valid with less than 8 observations; 4 samples were given.



In [16]:
# The fitted slope is log10(price) per day; scale to a year and turn it into a
# whole-number "N times cheaper per year" factor.
annual_slope = exponential_model.params['date'] * 365
annual_factor = int(round(10**(-annual_slope)))

# Every other model scoring above the lower bound (no upper bound applied to this
# trace), excluding the names of the record-low-priced models.
scores_above_lower_bound = aa_df[bench].notna() & (aa_df[bench] > performance_lower_bound)
is_cheapest_model_name = aa_df['Model Name'].isin(cheapest_models_df['Model Name'])
other_models_df = aa_df[scores_above_lower_bound & ~is_cheapest_model_name]

trendline_trace = go.Scatter(
    x=cheapest_models_df['Release Date'],
    y=10**exponential_model.predict(cheapest_models_df['date']),
    mode='lines',
    name=f'Trendline: {annual_factor}x decrease per year',
    line=dict(color='lightgrey', dash='dash'),
)
cheapest_trace = go.Scatter(
    x=cheapest_models_df['Release Date'],
    y=cheapest_models_df['USD per 1M Tokens'],
    mode='markers+text',
    name=f'Cheapest models with {performance_lower_bound}-{performance_upper_bound} on {bench}',
    text=cheapest_models_df['Model Name'],
    textposition='bottom left',
    marker=dict(color='blue'),
)
others_trace = go.Scatter(
    x=other_models_df['Release Date'],
    y=other_models_df['USD per 1M Tokens'],
    mode='markers+text',
    name=f'Other models with {bench} > {performance_lower_bound}%',
    text=other_models_df['Model Name'],
    marker=dict(color='lightblue'),
)

fig = go.Figure()
for trace in (trendline_trace, cheapest_trace, others_trace):
    fig.add_trace(trace)
fig.update_traces(textposition='bottom left')

# Pad the x range: 90 days before the earliest point, 30 days after the latest.
min_date = min(cheapest_models_df['Release Date'].min(), other_models_df['Release Date'].min())
max_date = max(cheapest_models_df['Release Date'].max(), other_models_df['Release Date'].max())
fig.update_layout(
    title=f'Price of the cheapest model with {performance_lower_bound}-{performance_upper_bound}% on {bench}',
    yaxis_type='log',
    xaxis_title='Month',
    yaxis_title='Price in USD per million tokens',
    xaxis_range=[min_date - pd.Timedelta(days=90), max_date + pd.Timedelta(days=30)],
    width=800,
    height=600,
    font=dict(size=10),
    legend=dict(
        yanchor="top",
        y=0.25,
        xanchor="right",
        x=0.4,
        bordercolor="lightgrey",
        borderwidth=1,
    ),
)

if save:
    os.makedirs(results_dir + 'aa_cheapest_models_examples/', exist_ok=True)
    save_plot(fig, results_dir + 'aa_cheapest_models_examples/', f'aa_cheapest_models_{bench}_above_{performance_lower_bound}_with_trendline')
fig.show()

## Regression on cheapest models within a performance range, one benchmark at a time

In [36]:
performance_step = 10
performance_delta = 10

run_dir = results_dir + 'aa_cheapest_models_run/'
os.makedirs(run_dir, exist_ok=True)

# Column order of the per-range results table. Passing it to the DataFrame
# constructor keeps the schema even when no range yields a regression, so the
# later `['bench']` lookup cannot raise KeyError.
result_columns = [
    'bench', 'performance_range', 'sample_size', 'start_date', 'end_date',
    'price_reduction_factor_per_year', 'r_squared',
]


def _cheapest_models_over_time(models_df, bench, lower_bound, upper_bound, window_ends):
    """Collect the successive record-low-priced models within a score range.

    Walks the windows delimited by consecutive `window_ends`; the first window
    is everything released before `window_ends[0]`, later windows are
    [previous edge, edge). In each window the cheapest model whose `bench`
    score lies in (lower_bound, upper_bound] is compared with the cheapest
    model kept so far and is kept (and printed) only if strictly cheaper.

    Args:
        models_df: frame with 'Release Date', 'USD per 1M Tokens',
            'Model Name' and the `bench` column.
        bench: name of the score column to filter on.
        lower_bound: exclusive lower score limit.
        upper_bound: inclusive upper score limit.
        window_ends: ordered sequence of window end timestamps.

    Returns:
        List of rows (pandas Series) of `models_df`, in the order kept.
    """
    price_col = 'USD per 1M Tokens'
    release_dates = models_df['Release Date']
    eligible = (
        models_df[bench].notna()
        & (models_df[bench] > lower_bound)
        & (models_df[bench] <= upper_bound)
        # A row without a price can never be the cheapest; excluding it keeps
        # idxmin() from failing when every price in a window is missing.
        & models_df[price_col].notna()
    )

    cheapest = []
    current_best = None
    prev_end = None
    for window_end in window_ends:
        in_window = release_dates < window_end
        if prev_end is not None:
            in_window = in_window & (release_dates >= prev_end)
        prev_end = window_end

        candidates = models_df[in_window & eligible]
        if candidates.empty:
            continue

        new_best = candidates.loc[candidates[price_col].idxmin()]
        if current_best is None or new_best[price_col] < current_best[price_col]:
            current_best = new_best
            cheapest.append(new_best)
            print(window_end, new_best['Model Name'], new_best[bench], f"${new_best[price_col]:.2f}")
    return cheapest


def _fit_log_price_trend(cheapest_models_df):
    """Fit log10(price) ~ release-date ordinal by OLS.

    Returns:
        (frame, model, annual_factor): the input frame with 'price',
        'log_price' and 'date' columns added, the fitted statsmodels result,
        and the whole-number factor by which the fitted price falls per
        365 days.
    """
    fitted_df = cheapest_models_df.assign(
        price=cheapest_models_df['USD per 1M Tokens'],
        log_price=np.log10(cheapest_models_df['USD per 1M Tokens']),
        date=cheapest_models_df['Release Date'].map(lambda day: pd.Timestamp(day).toordinal()),
    )
    model = smf.ols('log_price ~ date', data=fitted_df).fit()
    annual_slope = model.params['date'] * 365  # slope is per day; scale to a year
    annual_factor = int(round(10**(-annual_slope)))  # log10 slope -> multiplicative factor
    return fitted_df, model, annual_factor


def _plot_cheapest_models(cheapest_models_df, other_models_df, exponential_model,
                          annual_factor, bench, lower_bound, upper_bound):
    """Build the log-price figure: fitted trendline, record-low models, other in-range models."""
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=cheapest_models_df['Release Date'],
        y=10**exponential_model.predict(cheapest_models_df['date']),
        mode='lines',
        name=f'Trendline: {annual_factor}x decrease per year',
        line=dict(color='lightgrey', dash='dash'),
    ))
    fig.add_trace(go.Scatter(
        x=cheapest_models_df['Release Date'],
        y=cheapest_models_df['USD per 1M Tokens'],
        mode='markers+text',
        name=f'Cheapest models with {lower_bound}-{upper_bound}% on {bench}',
        text=cheapest_models_df['Model Name'],
        textposition='bottom left',
        marker=dict(color='blue'),
    ))
    fig.add_trace(go.Scatter(
        x=other_models_df['Release Date'],
        y=other_models_df['USD per 1M Tokens'],
        mode='markers',
        name=f'Other models with {lower_bound}-{upper_bound}% on {bench}',
        text=other_models_df['Model Name'],
        marker=dict(color='lightblue'),
    ))
    fig.update_traces(textposition='bottom left')

    # Pad the x range: 90 days before the earliest point, 30 days after the latest.
    min_date = min(cheapest_models_df['Release Date'].min(), other_models_df['Release Date'].min())
    max_date = max(cheapest_models_df['Release Date'].max(), other_models_df['Release Date'].max())
    fig.update_layout(
        title=f'The cheapest model with {lower_bound}-{upper_bound}% on {bench} has become {annual_factor}x cheaper per year',
        yaxis_type='log',
        xaxis_title='Month',
        yaxis_title='Price in USD per million tokens',
        xaxis_range=[min_date - pd.Timedelta(days=90), max_date + pd.Timedelta(days=30)],
        width=800,
        height=600,
        font=dict(size=10),
        legend=dict(
            yanchor="top",
            y=0.99,
            xanchor="right",
            x=0.99,
            bordercolor="lightgrey",
            borderwidth=1,
        ),
    )
    return fig


# Month-start window edges; identical for every benchmark and range, so built once.
# NOTE(review): the last window ends at 2025-01-01 (exclusive), so models released on
# or after that date are never candidates, although they can still appear in the
# "other models" trace — confirm this cutoff is intended.
ts = pd.date_range(start='2020-01-01', end='2025-01-01', freq='MS')

results = []
for bench_idx, bench in enumerate(benchmarks):
    if bench_idx > 0:
        print('\n')
    print(f'{bench}')

    # Benchmarks flagged in benchmark_is_mqa start at 30%, the others at 10%
    # (presumably to stay above a random-guess floor — confirm).
    first_lower_bound = 30 if benchmark_is_mqa[bench] else 10
    performance_lower_bounds = range(first_lower_bound, 100, performance_step)

    for performance_lower_bound in performance_lower_bounds:
        performance_upper_bound = min(performance_lower_bound + performance_delta, 100)
        print(f'\nPerformance range: {performance_lower_bound}-{performance_upper_bound}%')

        cheapest_models = _cheapest_models_over_time(
            aa_df, bench, performance_lower_bound, performance_upper_bound, ts
        )
        cheapest_models_df = pd.DataFrame(cheapest_models)
        if len(cheapest_models_df) < min_num_data_points_for_regression:
            print(f'Less than {min_num_data_points_for_regression} models found')
            continue

        cheapest_models_df, exponential_model, annual_factor = _fit_log_price_trend(cheapest_models_df)
        results.append({
            'bench': bench,
            'performance_range': [performance_lower_bound, performance_upper_bound],
            'sample_size': len(cheapest_models_df),
            'start_date': cheapest_models_df['Release Date'].min(),
            'end_date': cheapest_models_df['Release Date'].max(),
            'price_reduction_factor_per_year': annual_factor,
            'r_squared': round(exponential_model.rsquared, 2),
        })

        # All in-range models whose name is not among the record-low-priced ones.
        other_models_df = aa_df[
            aa_df[bench].notna()
            & (aa_df[bench] > performance_lower_bound)
            & (aa_df[bench] <= performance_upper_bound)
            & ~aa_df['Model Name'].isin(cheapest_models_df['Model Name'])
        ]
        fig = _plot_cheapest_models(
            cheapest_models_df, other_models_df, exponential_model, annual_factor,
            bench, performance_lower_bound, performance_upper_bound,
        )
        if save:
            save_plot(
                fig,
                run_dir,
                f'aa_cheapest_models_{bench}_{performance_lower_bound}-{performance_upper_bound}_with_trendline',
                extensions=['png'],
            )

cheapest_model_results_df = pd.DataFrame(results, columns=result_columns)
cheapest_model_results_df.to_csv(run_dir + 'cheapest_model_results.csv', index=False)

# Per-benchmark summary: which ranges produced a fit, plus the geometric mean and
# the min/max of their annual price-reduction factors.
summary_data = []
for bench in benchmarks:
    bench_results = cheapest_model_results_df[cheapest_model_results_df['bench'] == bench]
    perf_bounds = list(bench_results['performance_range'])
    price_factors = bench_results['price_reduction_factor_per_year'].dropna()

    if len(price_factors) > 0:
        # Geometric mean = exp(mean(log(x))); rounded to a whole factor.
        geomean = round(np.exp(np.mean(np.log(price_factors.astype(float)))))
        # Plain ints so the CSV shows "[3, 10]" rather than NumPy scalar reprs.
        factor_range = [int(price_factors.min()), int(price_factors.max())]
    else:
        # No range had enough data points: round(nan) would raise ValueError,
        # so record NaN / an empty range without rounding.
        geomean = np.nan
        factor_range = []

    summary_data.append({
        'bench': bench,
        'performance_range': perf_bounds,
        'price_reduction_factor_per_year_geomean': geomean,
        'price_reduction_factor_per_year_range': factor_range,
    })

cheapest_model_summary_df = pd.DataFrame(summary_data)
cheapest_model_summary_df.to_csv(run_dir + 'cheapest_model_summary.csv', index=False)

MMLU

Performance range: 30-40%
2024-10-01 00:00:00 Llama-3.2-Instruct-1B 35.0 $0.05
Less than 4 models found

Performance range: 40-50%
2021-12-01 00:00:00 GPT-3 43.9 $60.00
2022-10-01 00:00:00 GPT-3 43.9 $20.00
2023-08-01 00:00:00 Llama-2-Chat-13B 45.0 $0.56
2024-01-01 00:00:00 Llama 2-7B 45.3 $0.20
2024-05-01 00:00:00 Llama 2-7B 45.3 $0.13

Performance range: 50-60%
2024-01-01 00:00:00 Llama 2-13B 54.8 $0.35
2024-05-01 00:00:00 Llama 2-13B 54.8 $0.22
Less than 4 models found

Performance range: 60-70%
2022-12-01 00:00:00 GPT-3.5 64.8 $20.00
2023-04-01 00:00:00 GPT-3.5 Turbo 68.0 $2.00
2023-12-01 00:00:00 GPT-3.5 Turbo 68.0 $0.75
2024-05-01 00:00:00 Llama-3-Instruct-8B 64.0 $0.15
2024-07-01 00:00:00 Mistral-NeMo 66.0 $0.13
2024-10-01 00:00:00 Llama-3.2-Instruct-3B 64.0 $0.08

Performance range: 70-80%
2024-04-01 00:00:00 Claude-3-Haiku 71.0 $0.50
2024-06-01 00:00:00 Gemini-1.5-Flash-2024-05 79.0 $0.13
2024-08-01 00:00:00 Llama-3.1-Instruct-8B 71.0 $0.10
2024-11-01 00:00:00 Gemini-1.5

## Regression on cheapest models within a performance range, average benchmark

In [31]:
# Snake-case versions of the benchmark column names (lowercase; spaces and hyphens
# become underscores).
new_bench_cols = [name.lower().replace(' ', '_').replace('-', '_') for name in benchmarks]

# Work on a renamed copy, keep only models that have a score on every benchmark,
# and add the unweighted mean score across benchmarks.
df = (
    aa_df
    .rename(columns=dict(zip(benchmarks, new_bench_cols)))
    .dropna(subset=new_bench_cols)
    .assign(avg_bench=lambda scored: scored[new_bench_cols].mean(axis=1))
)

# Ten highest average scores.
df[['Model Name', 'Release Date', 'avg_bench']].sort_values(by='avg_bench', ascending=False).head(10)

Unnamed: 0,Model Name,Release Date,avg_bench
48,Gemini-1.5-Pro-2024-09,2024-09-01,78.0
60,Claude-3.5-Sonnet-2024-10,2024-10-01,74.4
39,GPT-4o-2024-08,2024-08-01,72.0
28,GPT-4o-2024-05,2024-05-01,71.6
33,Claude-3.5-Sonnet-2024-06,2024-06-01,70.2
61,GPT-4o-2024-11,2024-11-01,69.8
49,Gemini-1.5-Flash-2024-09,2024-09-01,68.8
35,GPT-4o-mini,2024-07-01,68.0
29,Mistral-Large-2-2024-06,2024-06-01,68.0
37,Llama-3.1-Instruct-405B,2024-07-01,67.8


In [35]:
performance_step = 10
performance_delta = 10
performance_lower_bounds = range(10, 100, performance_step)

results_subdir = results_dir + 'aa_cheapest_models_avg_benchmark_run/'
os.makedirs(results_subdir, exist_ok=True)

bench = 'avg_bench'
price_col = 'USD per 1M Tokens'

# Column order of the per-range results table. Passing it to the DataFrame
# constructor keeps the schema even when no range yields a regression, so the
# later `['bench']` lookup cannot raise KeyError.
result_columns = [
    'bench', 'performance_range', 'sample_size', 'start_date', 'end_date',
    'price_reduction_factor_per_year', 'r_squared',
]

# Month-start window edges; identical for every range, so built once.
# NOTE(review): the last window ends at 2025-01-01 (exclusive), so any rows of `df`
# released on or after that date are never candidates — confirm this is intended.
ts = pd.date_range(start='2020-01-01', end='2025-01-01', freq='MS')

results = []
for performance_lower_bound in performance_lower_bounds:
    performance_upper_bound = min(performance_lower_bound + performance_delta, 100)
    print(f'\nPerformance range: {performance_lower_bound}-{performance_upper_bound}%')

    # Models whose score lies in (lower, upper]; independent of the time window.
    in_range = (
        df[bench].notna()
        & (df[bench] > performance_lower_bound)
        & (df[bench] <= performance_upper_bound)
    )
    # A row without a price can never be the cheapest; excluding it keeps
    # idxmin() from failing when every price in a window is missing.
    priced_in_range = in_range & df[price_col].notna()

    cheapest_models = []
    current_best = None
    for window_idx, t in enumerate(ts):
        # Window is [previous edge, t); the first window is everything before ts[0].
        in_window = df['Release Date'] < t
        if window_idx > 0:
            in_window = in_window & (df['Release Date'] >= ts[window_idx - 1])

        candidates = df[in_window & priced_in_range]
        if candidates.empty:
            continue

        # Cheapest in-range model released in this window; record it only when it
        # strictly undercuts the running best (or is the first).
        new_best = candidates.loc[candidates[price_col].idxmin()]
        if current_best is None or new_best[price_col] < current_best[price_col]:
            current_best = new_best
            cheapest_models.append(current_best)
            print(t, current_best['Model Name'], current_best[bench], f"${current_best[price_col]:.2f}")

    cheapest_models_df = pd.DataFrame(cheapest_models)
    if len(cheapest_models_df) < min_num_data_points_for_regression:
        print(f'Less than {min_num_data_points_for_regression} models found')
        continue

    # Regress log10(price) on the release date expressed as a day ordinal.
    cheapest_models_df['price'] = cheapest_models_df[price_col]
    cheapest_models_df['log_price'] = np.log10(cheapest_models_df[price_col])
    cheapest_models_df['date'] = cheapest_models_df['Release Date'].map(lambda day: pd.Timestamp(day).toordinal())
    exponential_model = smf.ols('log_price ~ date', data=cheapest_models_df).fit()

    annual_slope = exponential_model.params['date'] * 365  # slope is per day; scale to a year
    annual_factor = int(round(10**(-annual_slope)))  # log10 slope -> multiplicative factor
    results.append({
        'bench': bench,
        'performance_range': [performance_lower_bound, performance_upper_bound],
        'sample_size': len(cheapest_models_df),
        'start_date': cheapest_models_df['Release Date'].min(),
        'end_date': cheapest_models_df['Release Date'].max(),
        'price_reduction_factor_per_year': annual_factor,
        'r_squared': round(exponential_model.rsquared, 2),
    })

    # All in-range models whose name is not among the record-low-priced ones.
    other_models_df = df[in_range & ~df['Model Name'].isin(cheapest_models_df['Model Name'])]

    # Figure: fitted trendline, record-low models (labelled), other in-range models.
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=cheapest_models_df['Release Date'],
        y=10**exponential_model.predict(cheapest_models_df['date']),
        mode='lines',
        name=f'Trendline: {annual_factor}x decrease per year',
        line=dict(color='lightgrey', dash='dash'),
    ))
    fig.add_trace(go.Scatter(
        x=cheapest_models_df['Release Date'],
        y=cheapest_models_df[price_col],
        mode='markers+text',
        name=f'Cheapest models with {performance_lower_bound}-{performance_upper_bound}% on {bench}',
        text=cheapest_models_df['Model Name'],
        textposition='bottom left',
        marker=dict(color='blue'),
    ))
    fig.add_trace(go.Scatter(
        x=other_models_df['Release Date'],
        y=other_models_df[price_col],
        mode='markers',
        name=f'Other models with {performance_lower_bound}-{performance_upper_bound}% on {bench}',
        text=other_models_df['Model Name'],
        marker=dict(color='lightblue'),
    ))
    fig.update_traces(textposition='bottom left')

    # Pad the x range: 90 days before the earliest point, 30 days after the latest.
    min_date = min(cheapest_models_df['Release Date'].min(), other_models_df['Release Date'].min())
    max_date = max(cheapest_models_df['Release Date'].max(), other_models_df['Release Date'].max())
    fig.update_layout(
        title=f'The cheapest model with {performance_lower_bound}-{performance_upper_bound}% on {bench} has become {annual_factor}x cheaper per year',
        yaxis_type='log',
        xaxis_title='Month',
        yaxis_title='Price in USD per million tokens',
        xaxis_range=[min_date - pd.Timedelta(days=90), max_date + pd.Timedelta(days=30)],
        width=800,
        height=600,
        font=dict(size=10),
        legend=dict(
            yanchor="top",
            y=0.99,
            xanchor="right",
            x=0.99,
            bordercolor="lightgrey",
            borderwidth=1,
        ),
    )
    if save:
        save_plot(
            fig,
            results_subdir,
            f'aa_cheapest_models_{bench}_{performance_lower_bound}-{performance_upper_bound}_with_trendline',
            extensions=['png'],
        )

cheapest_model_results_df = pd.DataFrame(results, columns=result_columns)
cheapest_model_results_df.to_csv(results_subdir + 'cheapest_model_results.csv', index=False)

# Summary row: which ranges produced a fit, plus the geometric mean and the
# min/max of their annual price-reduction factors.
bench_results = cheapest_model_results_df[cheapest_model_results_df['bench'] == bench]
perf_bounds = list(bench_results['performance_range'])
price_factors = bench_results['price_reduction_factor_per_year'].dropna()

if len(price_factors) > 0:
    # Geometric mean = exp(mean(log(x))); rounded to a whole factor.
    geomean = round(np.exp(np.mean(np.log(price_factors.astype(float)))))
    # Plain ints so the CSV shows "[3, 10]" rather than NumPy scalar reprs.
    factor_range = [int(price_factors.min()), int(price_factors.max())]
else:
    # No range had enough data points: round(nan) would raise ValueError,
    # so record NaN / an empty range without rounding.
    geomean = np.nan
    factor_range = []

summary_data = [{
    'bench': bench,
    'performance_range': perf_bounds,
    'price_reduction_factor_per_year_geomean': geomean,
    'price_reduction_factor_per_year_range': factor_range,
}]

cheapest_model_summary_df = pd.DataFrame(summary_data)
cheapest_model_summary_df.to_csv(results_subdir + 'cheapest_model_summary.csv', index=False)


Performance range: 10-20%
Less than 4 models found

Performance range: 20-30%
Less than 4 models found

Performance range: 30-40%
2024-05-01 00:00:00 Llama-3-Instruct-8B 37.4 $0.15
Less than 4 models found

Performance range: 40-50%
2023-04-01 00:00:00 GPT-4 46.0 $37.50
2023-07-01 00:00:00 GPT-3.5 Turbo 45.36 $3.25
2023-12-01 00:00:00 GPT-3.5 Turbo 45.6 $0.75
2024-04-01 00:00:00 Claude-3-Haiku 46.6 $0.50
2024-07-01 00:00:00 Gemma-2-9B 47.2 $0.13
2024-08-01 00:00:00 Llama-3.1-Instruct-8B 47.4 $0.10

Performance range: 50-60%
2024-05-01 00:00:00 Mistral-8x22 53.2 $1.20
2024-07-01 00:00:00 Gemma-2-27B 53.8 $0.26
Less than 4 models found

Performance range: 60-70%
2023-12-01 00:00:00 GPT-4 Turbo 67.8 $15.00
2024-07-01 00:00:00 Mistral-Large-2-2024-06 68.0 $3.00
2024-08-01 00:00:00 GPT-4o-mini 68.0 $0.26
2024-10-01 00:00:00 Gemini-1.5-Flash-2024-09 68.8 $0.13

Performance range: 70-80%
2024-06-01 00:00:00 GPT-4o-2024-05 71.6 $7.50
2024-07-01 00:00:00 Claude-3.5-Sonnet-2024-06 70.2 $6.00
20

## Regression on top models by price-performance, one benchmark at a time

In [18]:
# One-off analysis for GPQA benchmark
results_subdir = f'aa_top_{top_n}_price_performance_run/'
os.makedirs(results_dir + results_subdir, exist_ok=True)
bench = 'GPQA Diamond'
benchmark_df = aa_df.copy()
benchmark_df = benchmark_df.dropna(subset=[bench])
if benchmark_is_mqa[bench]:
    # Filter for above-random-chance baseline
    # Use 30% rather than 25%, for significance
    benchmark_df = benchmark_df[benchmark_df[bench] > 30]
benchmark_df = benchmark_df.sort_values(by='Release Date')
benchmark_df[f'{bench} price-performance'] = benchmark_df[bench] / benchmark_df['USD per 1M Tokens']

# Find the rolling top-n models
unique_dates = benchmark_df['Release Date'].unique()
ever_top_n_models = set()
for date in unique_dates:
    df_up_to_date = benchmark_df[benchmark_df['Release Date'] <= date]
    top_n_models = df_up_to_date.nlargest(1, f'{bench} price-performance')  # top_n = 1
    top_n_model_names = top_n_models['Model Name and Date'].tolist()
    ever_top_n_models.update(top_n_model_names)
ever_top_n_list = sorted(ever_top_n_models)
benchmark_df = benchmark_df[benchmark_df['Model Name and Date'].isin(ever_top_n_list)]

if len(benchmark_df) >= min_num_data_points_for_regression:
    # Fit a line to the data
    benchmark_df['price'] = benchmark_df['USD per 1M Tokens']
    benchmark_df['log_price'] = np.log10(benchmark_df['USD per 1M Tokens'])
    benchmark_df['bench'] = benchmark_df[bench]
    benchmark_df['date'] = benchmark_df['Release Date'].map(lambda x: pd.Timestamp(x).toordinal())
    exponential_model = smf.ols('log_price ~ date + bench', data=benchmark_df).fit()

    # Calculate annual rate of decrease
    annual_slope = exponential_model.params['date'] * 365  # Convert daily to annual
    annual_factor = int(round(10**(-annual_slope)))  # Convert log slope to factor
    annual_slope_ci = exponential_model.conf_int(alpha=0.1).loc['date']
    annual_slope_ci_high = int(round(10**(-annual_slope_ci[0] * 365)))
    annual_slope_ci_low = int(round(10**(-annual_slope_ci[1] * 365)))
    
    print(f"Price reduction factor per year (mean): {annual_factor}")
    print(f"90% CI: [{annual_slope_ci_low}, {annual_slope_ci_high}]")
    print(f"Sample size: {len(benchmark_df)}")
    print(f"Date range: {benchmark_df['Release Date'].min()} to {benchmark_df['Release Date'].max()}")

    prediction_dates = pd.date_range(start=benchmark_df['Release Date'].min(), end=benchmark_df['Release Date'].max(), freq='MS')
    reference_model = 'GPT-4'
    reference_performance = aa_df[aa_df['Model Name'] == reference_model][bench].iloc[0]
    prediction_df = pd.DataFrame({
        'date': [pd.Timestamp(d).toordinal() for d in prediction_dates],  # Convert to ordinal dates
        'bench': [reference_performance] * len(prediction_dates)
    })
    prediction_df['log_price'] = exponential_model.predict(prediction_df)
    prediction_df['price'] = 10**prediction_df['log_price']

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=prediction_dates,
        y=prediction_df['price'],
        mode='lines',
        name=f'Regression: {annual_factor}x decrease per year at {reference_model} performance',
        line=dict(color='lightgrey', dash='dash')
    ))
    fig.add_trace(go.Scatter(
        x=benchmark_df['Release Date'],
        y=benchmark_df['USD per 1M Tokens'],
        mode='markers+text',
        name='Model with the best price-performance at the time',
        text=benchmark_df['Model Name'],
        textposition='bottom left',
        marker=dict(color='blue'),
    ))
    other_models_df = aa_df[(
        aa_df[bench].notna()) &
        ~(aa_df['Model Name'].isin(benchmark_df['Model Name'])
    )]
    fig.add_trace(go.Scatter(
        x=other_models_df['Release Date'],
        y=other_models_df['USD per 1M Tokens'],
        mode='markers',
        name=f'Other models',
        text=other_models_df['Model Name'],
        marker=dict(color='lightblue')
    ))
    fig.update_layout(
        title=f'Price of models with the best price-performance on {bench}'
    )
    fig.update_traces(textposition='bottom left')
    fig.update_layout(yaxis_type='log')
    fig.update_layout(xaxis_title='Month')
    fig.update_layout(yaxis_title='Price in USD per million tokens')
    # Lower the lower x limit
    fig.update_layout(xaxis_range=[benchmark_df['Release Date'].min() - pd.Timedelta(days=90), benchmark_df['Release Date'].max()+pd.Timedelta(days=30)])
    fig.update_layout(
        width=800,
        height=600,
        font=dict(size=10),
        legend=dict(
            yanchor="top",
            y=0.99,
            xanchor="right",
            x=0.99,
            bordercolor="lightgrey",
            borderwidth=1
        )
    )
    # if save:
    #     save_plot(
    #         fig,
    #         results_dir + results_subdir,
    #         f'top_{top_n}_price_performance_{bench}_with_trendline',
    #         extensions=['png'],
    #     )
    fig.show()
else:
    print(f'Less than {min_num_data_points_for_regression} models found for {bench}')


Price reduction factor per year (mean): 110
90% CI: [17, 706]
Sample size: 6
Date range: 2023-03-01 00:00:00 to 2024-09-01 00:00:00


In [19]:
# For every benchmark: keep the models that ever ranked in the top `top_n` by
# price-performance, regress log10(price) on release date and score, record the
# annual price-reduction factor, and (optionally) save a plot of the trend.
top_n = 1
results_subdir = f'aa_top_{top_n}_price_performance_run/'
os.makedirs(results_dir + results_subdir, exist_ok=True)

# One summary dict per benchmark that has enough data points
results = []
for bench in benchmarks:
    benchmark_df = aa_df.copy()
    benchmark_df = benchmark_df.dropna(subset=[bench])
    if benchmark_is_mqa[bench]:
        # Filter for above-random-chance baseline
        # Use 30% rather than 25%, for significance
        benchmark_df = benchmark_df[benchmark_df[bench] > 30]
    benchmark_df = benchmark_df.sort_values(by='Release Date')
    benchmark_df[f'{bench} price-performance'] = benchmark_df[bench] / benchmark_df['USD per 1M Tokens']

    # Find the rolling top-n models: for each release date, take the best
    # models among everything released up to and including that date.
    unique_dates = benchmark_df['Release Date'].unique()
    ever_top_n_models = set()
    for date in unique_dates:
        df_up_to_date = benchmark_df[benchmark_df['Release Date'] <= date]
        top_n_models = df_up_to_date.nlargest(top_n, f'{bench} price-performance')
        top_n_model_names = top_n_models['Model Name and Date'].tolist()
        ever_top_n_models.update(top_n_model_names)
    ever_top_n_list = sorted(ever_top_n_models)
    # .copy() so the column assignments below act on an independent frame
    # rather than on a filtered slice (avoids pandas' SettingWithCopyWarning).
    benchmark_df = benchmark_df[benchmark_df['Model Name and Date'].isin(ever_top_n_list)].copy()

    if len(benchmark_df) < min_num_data_points_for_regression:
        # Skipped benchmarks get no row in `results`
        print(f'Less than {min_num_data_points_for_regression} models found for {bench}')
        continue

    # Fit a line to the data
    benchmark_df['price'] = benchmark_df['USD per 1M Tokens']
    benchmark_df['log_price'] = np.log10(benchmark_df['USD per 1M Tokens'])
    benchmark_df['bench'] = benchmark_df[bench]
    benchmark_df['date'] = benchmark_df['Release Date'].map(lambda x: pd.Timestamp(x).toordinal())
    exponential_model = smf.ols('log_price ~ date + bench', data=benchmark_df).fit()

    # Calculate annual rate of decrease
    annual_slope = exponential_model.params['date'] * 365  # Convert daily to annual
    annual_factor = int(round(10**(-annual_slope)))  # Convert log slope to factor
    annual_slope_ci = exponential_model.conf_int(alpha=0.1).loc['date']
    # The lower slope bound (more negative) gives the higher reduction factor
    annual_slope_ci_high = int(round(10**(-annual_slope_ci[0] * 365)))
    annual_slope_ci_low = int(round(10**(-annual_slope_ci[1] * 365)))
    results.append({
        'bench': bench,
        'sample_size': len(benchmark_df),
        'start_date': benchmark_df['Release Date'].min(),
        'end_date': benchmark_df['Release Date'].max(),
        'price_reduction_factor_per_year_mean': annual_factor,
        'price_reduction_factor_per_year_ci': [annual_slope_ci_low, annual_slope_ci_high],
        'r_squared': round(exponential_model.rsquared, 2),
    })

    # Plot the exponential trendline with the data

    # Use GPT-4 as the reference point
    prediction_dates = pd.date_range(start=benchmark_df['Release Date'].min(), end=benchmark_df['Release Date'].max(), freq='MS')
    reference_model = 'GPT-4'
    reference_performance = aa_df[aa_df['Model Name'] == reference_model][bench].iloc[0]
    prediction_df = pd.DataFrame({
        'date': [pd.Timestamp(d).toordinal() for d in prediction_dates],  # Convert to ordinal dates
        'bench': [reference_performance] * len(prediction_dates)
    })
    prediction_df['log_price'] = exponential_model.predict(prediction_df)
    prediction_df['price'] = 10**prediction_df['log_price']

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=prediction_dates,
        y=prediction_df['price'],
        mode='lines',
        name=f'Regression: {annual_factor}x decrease per year at {reference_model} performance',
        line=dict(color='lightgrey', dash='dash')
    ))
    fig.add_trace(go.Scatter(
        x=benchmark_df['Release Date'],
        y=benchmark_df['USD per 1M Tokens'],
        mode='markers+text',
        name='Model with the best price-performance at the time',
        text=benchmark_df['Model Name'],
        textposition='bottom left',
        marker=dict(color='blue'),
    ))
    # Exclude the selected models using the same key they were selected by
    # ('Model Name and Date'), so entries that share a model name but have a
    # different date still appear among the other models.
    other_models_df = aa_df[
        aa_df[bench].notna()
        & ~aa_df['Model Name and Date'].isin(benchmark_df['Model Name and Date'])
    ]
    fig.add_trace(go.Scatter(
        x=other_models_df['Release Date'],
        y=other_models_df['USD per 1M Tokens'],
        mode='markers',
        name='Other models',
        text=other_models_df['Model Name'],
        marker=dict(color='lightblue')
    ))
    fig.update_traces(textposition='bottom left')
    fig.update_layout(
        title=f'Models with the best price-performance on {bench} get {annual_factor}x cheaper per year',
        yaxis_type='log',
        xaxis_title='Month',
        yaxis_title='Price in USD per million tokens',
        # Lower the lower x limit
        xaxis_range=[
            benchmark_df['Release Date'].min() - pd.Timedelta(days=90),
            benchmark_df['Release Date'].max() + pd.Timedelta(days=30),
        ],
        width=800,
        height=600,
        font=dict(size=10),
        legend=dict(
            yanchor="top",
            y=0.99,
            xanchor="right",
            x=0.99,
            bordercolor="lightgrey",
            borderwidth=1
        )
    )
    if save:
        save_plot(
            fig,
            results_dir + results_subdir,
            f'top_{top_n}_price_performance_{bench}_with_trendline',
            extensions=['png'],
        )

top_n_price_performance_results_df = pd.DataFrame(results)
top_n_price_performance_results_df.to_csv(results_dir + results_subdir + 'top_n_price_performance_results.csv', index=False)

## Regression on all data, one benchmark at a time

In [20]:
# Single-benchmark regression over the whole dataset:
# log10(price) explained by release date (as a day ordinal) and the MMLU score.
bench = 'MMLU'

# The derived regression columns are written onto aa_df itself.
usd_per_million_tokens = aa_df['USD per 1M Tokens']
aa_df['price'] = usd_per_million_tokens
aa_df['log_price'] = np.log10(usd_per_million_tokens)
aa_df['date'] = aa_df['Release Date'].map(lambda release: pd.Timestamp(release).toordinal())
aa_df['bench'] = aa_df[bench]

# Ordinary least squares fit, then the full statsmodels summary table
regression_formula = 'log_price ~ date + bench'
model = smf.ols(regression_formula, data=aa_df).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:              log_price   R-squared:                       0.514
Model:                            OLS   Adj. R-squared:                  0.495
Method:                 Least Squares   F-statistic:                     28.01
Date:                Wed, 12 Feb 2025   Prob (F-statistic):           5.02e-09
Time:                        16:50:43   Log-Likelihood:                -48.089
No. Observations:                  56   AIC:                             102.2
Df Residuals:                      53   BIC:                             108.3
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   2143.3714    303.000      7.074      0.0

In [21]:
# Show the fitted coefficients, then their confidence intervals (conf_int() defaults)
for fitted_table in (model.params, model.conf_int()):
    print(fitted_table)


Intercept    2143.371352
date           -0.002904
bench           0.033162
dtype: float64
                     0            1
Intercept  1535.629985  2751.112718
date         -0.003727    -0.002081
bench         0.021552     0.044772


In [22]:
# Turn the per-day date coefficient into a yearly price-reduction factor
days_per_year = 365
date_param = days_per_year * model.params['date']  # daily log10 slope -> annual log10 slope
annual_factor = int(round(10 ** -date_param))  # annual log10 slope -> multiplicative factor
print(f'It costs {annual_factor}x less each year to keep {bench} performance fixed.')

It costs 11x less each year to keep MMLU performance fixed.


In [23]:
# Lower confidence bound (column 0 of conf_int()) on the per-day date coefficient
model.conf_int().loc['date', 0]


-0.003726549630275529

In [24]:
# Repeat the log_price ~ date + bench regression once per benchmark, print the
# headline numbers, and collect one summary row per benchmark.
results = []
for bench in benchmarks:
    print(f'{bench}')
    # Only rows that have a score for this benchmark take part in the fit
    df = aa_df.dropna(subset=[bench]).copy()
    df['bench'] = df[bench]

    model = smf.ols('log_price ~ date + bench', data=df).fit()
    print(f'Number of observations: {len(df)}')
    print(f'R-squared: {model.rsquared:.2f}')

    # Annualise the daily date slope and its 90% interval (alpha=0.1)
    date_param = model.params['date'] * 365
    date_param_ci = model.conf_int(alpha=0.1).loc['date']
    date_param_ci_low, date_param_ci_high = (
        bound * 365 for bound in (date_param_ci[0], date_param_ci[1])
    )
    # Log slope -> factor; the lower slope bound yields the upper factor bound
    annual_factor, annual_factor_ci_high, annual_factor_ci_low = (
        int(round(10**(-slope)))
        for slope in (date_param, date_param_ci_low, date_param_ci_high)
    )

    # Calendar span covered by the rows used in this fit
    min_date, max_date = (
        pd.Timestamp.fromordinal(day).date()
        for day in (df["date"].min(), df["date"].max())
    )
    date_range = max_date - min_date
    results.append(dict(
        bench=bench,
        sample_size=len(df),
        start_date=min_date,
        end_date=max_date,
        price_reduction_factor_per_year_mean=annual_factor,
        price_reduction_factor_per_year_ci=[annual_factor_ci_low, annual_factor_ci_high],
        r_squared=round(model.rsquared, 2),
    ))

    print(f'The price to achieve a fixed level of {bench} performance fell at a rate of')
    print(f'{annual_factor}x [{annual_factor_ci_low}, {annual_factor_ci_high}] per year')
    print(f'over {date_range.days / 365:.1f} years ({min_date} to {max_date})')
    print()


# Persist the per-benchmark summary table
results_subdir = results_dir + 'aa_all_data_single_benchmark/'
os.makedirs(results_subdir, exist_ok=True)
single_benchmark_regression_results_df = pd.DataFrame(results)
single_benchmark_regression_results_df.to_csv(
    results_subdir + 'single_benchmark_regression_results.csv',
    index=False,
)

MMLU
Number of observations: 56
R-squared: 0.51
The price to achieve a fixed level of MMLU performance fell at a rate of
11x [6, 20] per year
over 3.0 years (2021-11-20 to 2024-11-01)

GPQA Diamond
Number of observations: 49
R-squared: 0.52
The price to achieve a fixed level of GPQA Diamond performance fell at a rate of
11x [5, 25] per year
over 1.9 years (2023-03-01 to 2025-01-31)

MATH 5
Number of observations: 41
R-squared: 0.24
The price to achieve a fixed level of MATH 5 performance fell at a rate of
9x [2, 31] per year
over 1.9 years (2023-03-01 to 2025-01-31)

MATH-500
Number of observations: 35
R-squared: 0.46
The price to achieve a fixed level of MATH-500 performance fell at a rate of
37x [10, 132] per year
over 1.7 years (2023-03-01 to 2024-11-01)

HumanEval
Number of observations: 33
R-squared: 0.48
The price to achieve a fixed level of HumanEval performance fell at a rate of
12x [4, 35] per year
over 1.7 years (2023-03-01 to 2024-11-01)



In [25]:
# Contour view of the GPQA Diamond regression: lines of constant predicted score
# in (date, price) space, with the observed models overlaid as coloured markers.
bench = 'GPQA Diamond'
df = aa_df.copy()
df = df.dropna(subset=[bench])
df['bench'] = df[bench]

model = smf.ols('log_price ~ date + bench', data=df).fit()
date_param = model.params['date'] * 365  # Convert daily to annual
annual_factor = int(round(10**(-date_param)))  # Convert log slope to factor

# Create a grid of date and (log10) price values spanning the observed data
date_range = np.linspace(
    df['date'].min(),
    df['date'].max(),
    100
)
price_range = np.linspace(
    df['log_price'].min(),
    df['log_price'].max(),
    100
)
date_mesh, price_mesh = np.meshgrid(date_range, price_range)

# Create prediction data
pred_data = pd.DataFrame({
    'date': date_mesh.ravel(),
    'log_price': price_mesh.ravel()
})

# Get predicted bench values
# Need to rearrange the regression equation to solve for bench
# log_price = b0 + b1*date + b2*bench
# bench = (log_price - b0 - b1*date) / b2
b0 = model.params['Intercept']
b1 = model.params['date']
b2 = model.params['bench']
pred_bench = (pred_data['log_price'] - b0 - b1*pred_data['date']) / b2

# Reshape predictions for contour plot
Z = pred_bench.to_numpy().reshape(date_mesh.shape)

# Create contour plot
fig = go.Figure(data=[
    go.Contour(
        x=date_range,
        y=10**price_range,  # Convert back to regular price scale
        z=Z,
        contours=dict(
            start=30,
            end=90,
            size=10,
            showlabels=False,
            labelfont=dict(size=12),
            coloring='lines'
        ),
        line=dict(width=2),
        # `title=dict(text=..., side=...)` replaces the deprecated `titleside`
        # attribute, which newer plotly releases no longer accept.
        colorbar=dict(
            title=dict(text=bench, side='right'),
        ),
        colorscale='Viridis',
        zmin=30,  # Set consistent range
        zmax=90,
        showscale=False,
    )
])

# Add actual data points
fig.add_trace(go.Scatter(
    x=df['date'],
    y=df['price'],
    text=df['Model Name'],
    mode='markers',
    name='Actual Data',
    marker=dict(
        color=df[bench],  # Color based on benchmark score
        colorscale='Viridis',
        colorbar=dict(
            title=dict(text=f'{bench} score', side='right'),
        ),
        size=8,
        showscale=True,
        cmin=30,  # Set consistent range
        cmax=90
    )
))

# Update layout
fig.update_layout(
    title=f'For a fixed {bench} score, the usage price of LLMs has fallen by {annual_factor}x per year',
    xaxis_title='Date',
    yaxis_title='Price (USD per 1M tokens)',
    width=800,
    height=600,
    margin=dict(l=50, r=50, t=50, b=50)  # Reduce margins
)

# Larger axis tick labels (gridlines are left at their defaults)
fig.update_layout(
    xaxis=dict(
        tickfont=dict(size=12)
    ),
    yaxis=dict(
        tickfont=dict(size=12)
    )
)

# The x values are day ordinals; label every 20th grid point as YYYY-MM
fig.update_xaxes(
    ticktext=[pd.Timestamp.fromordinal(int(d)).strftime('%Y-%m') for d in date_range[::20]],
    tickvals=date_range[::20]
)

# Use log scale for y-axis
fig.update_layout(yaxis_type='log')

# `results_subdir` is whatever output directory is currently set in the notebook
if save:
    save_plot(fig, results_subdir, f'{bench}_contours', extensions=['png'])

fig.show()

## Regression on all data, all benchmarks at once

In [26]:
# Joint regression: log10(price) on release date plus every benchmark score at once.
# Benchmark columns are renamed to lowercase, underscore-separated identifiers
# so they can be used as terms in the formula.
new_bench_cols = [name.lower().replace(' ', '_').replace('-', '_') for name in benchmarks]
df = (
    aa_df
    .rename(columns=dict(zip(benchmarks, new_bench_cols)))
    .dropna(subset=new_bench_cols)  # keep only models scored on all benchmarks
)

model = smf.ols(f"log_price ~ date + {' + '.join(new_bench_cols)}", data=df).fit()
print(f'Number of observations: {len(df)}')
print(f'R-squared: {model.rsquared:.2f}')

# Annualise the daily date slope and its 90% interval (alpha=0.1)
date_param = model.params['date'] * 365
date_param_ci = model.conf_int(alpha=0.1).loc['date']
date_param_ci_low, date_param_ci_high = (
    bound * 365 for bound in (date_param_ci[0], date_param_ci[1])
)
# Log slope -> factor; the lower slope bound yields the upper factor bound
annual_factor, annual_factor_ci_high, annual_factor_ci_low = (
    int(round(10**(-slope)))
    for slope in (date_param, date_param_ci_low, date_param_ci_high)
)

# Calendar span covered by the rows used in this fit
min_date, max_date = (
    pd.Timestamp.fromordinal(day).date()
    for day in (df["date"].min(), df["date"].max())
)
date_range = max_date - min_date
results = [dict(
    benchmarks=benchmarks,
    sample_size=len(df),
    start_date=min_date,
    end_date=max_date,
    price_reduction_factor_per_year_mean=annual_factor,
    price_reduction_factor_per_year_ci=[annual_factor_ci_low, annual_factor_ci_high],
    r_squared=round(model.rsquared, 2),
)]

print(f'The price to achieve a fixed level of performance fell at a rate of')
print(f'{annual_factor}x [{annual_factor_ci_low}, {annual_factor_ci_high}] per year')
print(f'over {date_range.days / 365:.1f} years ({min_date} to {max_date})')
print()

# Persist the one-row summary table
results_subdir = results_dir + 'all_data_all_benchmarks/'
os.makedirs(results_subdir, exist_ok=True)
all_benchmarks_regression_results_df = pd.DataFrame(results)
all_benchmarks_regression_results_df.to_csv(
    results_subdir + 'all_benchmarks_regression_results.csv',
    index=False,
)

Number of observations: 27
R-squared: 0.79
The price to achieve a fixed level of performance fell at a rate of
15x [5, 49] per year
over 1.7 years (2023-03-01 to 2024-11-01)



## Regression on all data, on the average of benchmark scores

In [27]:
# Build the per-model average benchmark score and show the ten highest averages.
# Benchmark columns are renamed to lowercase, underscore-separated identifiers.
new_bench_cols = [name.lower().replace(' ', '_').replace('-', '_') for name in benchmarks]
df = (
    aa_df
    .rename(columns=dict(zip(benchmarks, new_bench_cols)))
    .dropna(subset=new_bench_cols)  # keep only models scored on all benchmarks
)
df['avg_bench'] = df[new_bench_cols].mean(axis=1)

avg_bench_ranking = df[['Model Name', 'Release Date', 'avg_bench']].sort_values(by='avg_bench', ascending=False)
avg_bench_ranking.head(10)

Unnamed: 0,Model Name,Release Date,avg_bench
48,Gemini-1.5-Pro-2024-09,2024-09-01,78.0
60,Claude-3.5-Sonnet-2024-10,2024-10-01,74.4
39,GPT-4o-2024-08,2024-08-01,72.0
28,GPT-4o-2024-05,2024-05-01,71.6
33,Claude-3.5-Sonnet-2024-06,2024-06-01,70.2
61,GPT-4o-2024-11,2024-11-01,69.8
49,Gemini-1.5-Flash-2024-09,2024-09-01,68.8
35,GPT-4o-mini,2024-07-01,68.0
29,Mistral-Large-2-2024-06,2024-06-01,68.0
37,Llama-3.1-Instruct-405B,2024-07-01,67.8


In [28]:
# Regression of log10(price) on release date and the average benchmark score,
# using the `df` (with its 'avg_bench' column) currently in the notebook namespace.
model = smf.ols('log_price ~ date + avg_bench', data=df).fit()
print(f'Number of observations: {len(df)}')
print(f'R-squared: {model.rsquared:.2f}')

# Annualise the daily date slope and its 90% interval (alpha=0.1)
date_param = model.params['date'] * 365
date_param_ci = model.conf_int(alpha=0.1).loc['date']
date_param_ci_low, date_param_ci_high = (
    bound * 365 for bound in (date_param_ci[0], date_param_ci[1])
)
# Log slope -> factor; the lower slope bound yields the upper factor bound
annual_factor, annual_factor_ci_high, annual_factor_ci_low = (
    int(round(10**(-slope)))
    for slope in (date_param, date_param_ci_low, date_param_ci_high)
)

# Calendar span covered by the rows used in this fit
min_date, max_date = (
    pd.Timestamp.fromordinal(day).date()
    for day in (df["date"].min(), df["date"].max())
)
date_range = max_date - min_date
results = [dict(
    benchmarks=benchmarks,
    sample_size=len(df),
    start_date=min_date,
    end_date=max_date,
    price_reduction_factor_per_year_mean=annual_factor,
    price_reduction_factor_per_year_ci=[annual_factor_ci_low, annual_factor_ci_high],
    r_squared=round(model.rsquared, 2),
)]

print(f'The price to achieve a fixed level of performance fell at a rate of')
print(f'{annual_factor}x [{annual_factor_ci_low}, {annual_factor_ci_high}] per year')
print(f'over {date_range.days / 365:.1f} years ({min_date} to {max_date})')
print()

# Persist the one-row summary table
results_subdir = results_dir + 'aa_all_data_avg_benchmark/'
os.makedirs(results_subdir, exist_ok=True)
avg_benchmarks_regression_results_df = pd.DataFrame(results)
avg_benchmarks_regression_results_df.to_csv(
    results_subdir + 'avg_benchmarks_regression_results.csv',
    index=False,
)

Number of observations: 27
R-squared: 0.60
The price to achieve a fixed level of performance fell at a rate of
42x [12, 143] per year
over 1.7 years (2023-03-01 to 2024-11-01)



In [29]:
# Create a contour plot to visualize the regression model surface.
# Uses `df`, `model`, `annual_factor` and `results_subdir` as currently set in
# the notebook namespace; `model` must have an 'avg_bench' coefficient.

# Create a grid of date and (log10) price values spanning the observed data
date_range = np.linspace(
    df['date'].min(),
    df['date'].max(),
    100
)
price_range = np.linspace(
    df['log_price'].min(),
    df['log_price'].max(),
    100
)
date_mesh, price_mesh = np.meshgrid(date_range, price_range)

# Create prediction data
pred_data = pd.DataFrame({
    'date': date_mesh.ravel(),
    'log_price': price_mesh.ravel()
})

# Get predicted bench values
# Need to rearrange the regression equation to solve for bench
# log_price = b0 + b1*date + b2*bench
# bench = (log_price - b0 - b1*date) / b2
b0 = model.params['Intercept']
b1 = model.params['date']
b2 = model.params['avg_bench']
pred_bench = (pred_data['log_price'] - b0 - b1*pred_data['date']) / b2

# Reshape predictions for contour plot
Z = pred_bench.to_numpy().reshape(date_mesh.shape)

# Create contour plot
fig = go.Figure(data=[
    go.Contour(
        x=date_range,
        y=10**price_range,  # Convert back to regular price scale
        z=Z,
        contours=dict(
            start=30,
            end=90,
            size=10,
            showlabels=False,
            labelfont=dict(size=12),
            coloring='lines'
        ),
        line=dict(width=2),
        # `title=dict(text=..., side=...)` replaces the deprecated `titleside`
        # attribute, which newer plotly releases no longer accept.
        colorbar=dict(
            title=dict(text='Average benchmark score', side='right'),
        ),
        colorscale='Viridis',
        zmin=30,  # Set consistent range
        zmax=90,
        showscale=False,
    )
])

# Add actual data points
fig.add_trace(go.Scatter(
    x=df['date'],
    y=df['price'],
    text=df['Model Name'],
    mode='markers',
    name='Actual Data',
    marker=dict(
        color=df['avg_bench'],  # Color based on average benchmark score
        colorscale='Viridis',
        colorbar=dict(
            title=dict(text='Average benchmark score', side='right'),
        ),
        size=8,
        showscale=True,
        cmin=30,  # Set consistent range
        cmax=90
    )
))

# Update layout
fig.update_layout(
    title=f'For a fixed performance level, the usage price of LLMs has fallen by {annual_factor}x per year',
    xaxis_title='Date',
    yaxis_title='Price (USD per 1M tokens)',
    width=800,
    height=600,
    margin=dict(l=50, r=50, t=50, b=50)  # Reduce margins
)

# Larger axis tick labels (gridlines are left at their defaults)
fig.update_layout(
    xaxis=dict(
        tickfont=dict(size=12)
    ),
    yaxis=dict(
        tickfont=dict(size=12)
    )
)

# The x values are day ordinals; label every 20th grid point as YYYY-MM
fig.update_xaxes(
    ticktext=[pd.Timestamp.fromordinal(int(d)).strftime('%Y-%m') for d in date_range[::20]],
    tickvals=date_range[::20]
)

# Use log scale for y-axis
fig.update_layout(yaxis_type='log')

if save:
    save_plot(fig, results_subdir, 'avg_bench_contours', extensions=['png'])

fig.show()


## Regression on top models by price-performance on any benchmark, all benchmarks at once

## Summary and comparison of methods

In [30]:
# Compare the yearly price-reduction estimates produced by each method, per benchmark.
def _row_for_bench(results_df, bench_name):
    """Return the first row of `results_df` whose 'bench' equals `bench_name`, or None.

    None is returned when the frame has no 'bench' column (for example when it
    was built from an empty results list) or has no row for that benchmark,
    instead of letting `.iloc[0]` raise on an empty selection.
    """
    if 'bench' not in results_df.columns:
        return None
    matches = results_df[results_df['bench'] == bench_name]
    return matches.iloc[0] if len(matches) else None


def _print_estimate(label, row, value_col, interval_col):
    """Print `label` with the row's point estimate and interval, or 'n/a' if `row` is None."""
    if row is None:
        print(label, 'n/a')
    else:
        print(label, row[value_col], row[interval_col])


# These two regressions pool every benchmark, so the same row is reported for each one.
all_benchmarks_regression_result = all_benchmarks_regression_results_df.iloc[0]
avg_benchmarks_regression_result = avg_benchmarks_regression_results_df.iloc[0]

for bench in benchmarks:
    print(f'{bench} trends')
    cheapest_model_result = _row_for_bench(cheapest_model_summary_df, bench)
    top_n_price_performance_result = _row_for_bench(top_n_price_performance_results_df, bench)
    single_benchmark_regression_result = _row_for_bench(single_benchmark_regression_results_df, bench)
    _print_estimate(
        'Cheapest model at threshold:',
        cheapest_model_result,
        'price_reduction_factor_per_year_geomean',
        'price_reduction_factor_per_year_range',
    )
    _print_estimate(
        'Top model by price-performance:',
        top_n_price_performance_result,
        'price_reduction_factor_per_year_mean',
        'price_reduction_factor_per_year_ci',
    )
    _print_estimate(
        'Regression on this benchmark score:',
        single_benchmark_regression_result,
        'price_reduction_factor_per_year_mean',
        'price_reduction_factor_per_year_ci',
    )
    _print_estimate(
        'Regression on all benchmark scores:',
        all_benchmarks_regression_result,
        'price_reduction_factor_per_year_mean',
        'price_reduction_factor_per_year_ci',
    )
    _print_estimate(
        'Regression on average benchmark score:',
        avg_benchmarks_regression_result,
        'price_reduction_factor_per_year_mean',
        'price_reduction_factor_per_year_ci',
    )
    print()

MMLU trends
Cheapest model at threshold: 20 [17, 25]
Top model by price-performance: 18 [11, 28]
Regression on this benchmark score: 11 [6, 20]
Regression on all benchmark scores: 15 [5, 49]
Regression on average benchmark score: 42 [12, 143]

GPQA Diamond trends
Cheapest model at threshold: 79 [19, 331]
Top model by price-performance: 110 [17, 706]
Regression on this benchmark score: 11 [5, 25]
Regression on all benchmark scores: 15 [5, 49]
Regression on average benchmark score: 42 [12, 143]

MATH 5 trends
Cheapest model at threshold: 41 [22, 76]
Top model by price-performance: 64 [29, 141]
Regression on this benchmark score: 9 [2, 31]
Regression on all benchmark scores: 15 [5, 49]
Regression on average benchmark score: 42 [12, 143]

MATH-500 trends
Cheapest model at threshold: 342 [22, 20513]
Top model by price-performance: 55 [15, 198]
Regression on this benchmark score: 37 [10, 132]
Regression on all benchmark scores: 15 [5, 49]
Regression on average benchmark score: 42 [12, 143]

