In [1]:
import numpy as np
import os
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
import statsmodels.formula.api as smf

from data import load_pcd_df
from plotting import save_plot

# Make "plotly_white" the default Plotly template for this notebook's figures
pio.templates.default = "plotly_white"

In [2]:
# Directory that receives this run's figures and CSV outputs
results_dir = 'results/2025-02-10/'
# exist_ok=True keeps the cell re-runnable when the directory already exists
os.makedirs(results_dir, exist_ok=True)

In [3]:
# Benchmark score columns that the per-benchmark analyses below iterate over
benchmarks = [
    'MMLU',
    'GPQA Diamond',
    'MATH 5',
    'MATH-500',
    'HumanEval',
]
# When True, figures are also written to disk via save_plot
save = True

# Using Artificial Analysis data

## Load Artificial Analysis dataset

In [4]:
# Load the Artificial Analysis CSV and display it
aa_df = pd.read_csv('data/aa_data_with_math5.csv')
aa_df

Unnamed: 0,Model Name,Release Date,USD per 1M Tokens,Tokens per Second,Prefill Latency (s),Prompt Length (tk),MMLU,GPQA Diamond,MATH 5,MATH-500,HumanEval,LMSys Chatbot Arena ELO
0,Claude-3-Haiku,2024-03,0.5,122.7,0.467,1000,71,33,13.0,39.0,77.0,1179.0
1,Claude-3-Opus,2024-03,30.0,26.5,1.984,1000,84,50,34.0,64.0,83.0,1248.0
2,Claude-3-Sonnet,2024-03,6.0,61.8,0.789,1000,77,37,16.0,41.0,71.0,1201.0
3,Claude-3.5-Haiku,2024-06,1.6,64.2,0.768,1000,81,37,,67.0,87.0,
4,Claude-3.5-Sonnet-2024-06,2024-06,6.0,55.9,0.906,1000,88,56,46.0,71.0,90.0,1268.0
5,Claude-3.5-Sonnet-2024-10,2024-10,6.0,55.2,0.907,1000,89,58,53.0,76.0,96.0,1282.0
6,Gemini-1.5-Flash-2024-05,2024-05,0.13,298.4,0.307,1000,79,39,23.0,55.0,,1227.0
7,Gemini-1.5-Flash-2024-09,2024-09,0.13,190.5,0.348,1000,75,45,58.0,83.0,83.0,1271.0
8,Gemini-1.5-Flash-8B,2024-10,0.07,285.2,0.335,1000,75,30,,70.0,,1211.0
9,Gemini-1.5-Pro-2024-05,2024-05,2.19,64.8,0.738,1000,86,46,,66.0,,1260.0


In [5]:
# Release dates arrive as 'YYYY-MM' text; trim surrounding whitespace, then parse to timestamps
release_month_text = aa_df['Release Date'].str.strip()
aa_df['Release Date'] = pd.to_datetime(release_month_text, format='%Y-%m')

## Load Epoch AI price dataset

In [6]:
# Load the Epoch AI price data; in this file 'Release Date' is a full 'YYYY-MM-DD' string
api_price_df = pd.read_csv('data/epoch_ai_price_data_not_in_aa_with_benchmarks.csv').assign(
    **{
        'Release Date': lambda frame: pd.to_datetime(
            frame['Release Date'].str.strip(), format='%Y-%m-%d'
        )
    }
)
api_price_df

Unnamed: 0,Model Name,Release Date,USD per 1M Tokens,Tokens per Second,Prefill Latency (s),Prompt Length (tk),MMLU,GPQA Diamond,MATH 5,MATH-500,HumanEval,LMSys Chatbot Arena ELO
0,Claude 2,2024-08-12,12.0,,,,78.5,35.0,10.0,,,
1,Claude 2.1,2024-08-12,12.0,,,,,36.0,11.0,,,
2,Claude Instant,2024-08-12,1.2,,,,,,,,,
3,Cohere Command,2024-08-13,1.625,,,,,,,,,
4,Cohere Command Light,2024-08-13,0.375,,,,,,,,,
5,Command R,2024-08-13,0.75,,,,,,,,,
6,Command R+,2024-08-13,6.0,,,,75.7,,,,,
7,Command R+,2024-09-13,4.375,,,,75.7,,,,,
8,DeepSeek-Coder-V2 236B,2024-09-11,0.175,,,,79.2,,,,,
9,DeepSeek-R1,2025-01-20,0.96,,,,,71.7,93.1,,,


In [7]:
# Stack both sources into one table ordered by release date, with a fresh 0..n-1 index
aa_df = (
    pd.concat([aa_df, api_price_df])
    .sort_values(by='Release Date')
    .reset_index(drop=True)
)
aa_df

Unnamed: 0,Model Name,Release Date,USD per 1M Tokens,Tokens per Second,Prefill Latency (s),Prompt Length (tk),MMLU,GPQA Diamond,MATH 5,MATH-500,HumanEval,LMSys Chatbot Arena ELO
0,GPT-3 175B (davinci),2021-11-20,60.0000,,,,43.9,,,,,
1,GPT-3 175B (davinci),2022-08-31,60.0000,,,,43.9,,,,,
2,GPT-3 175B (davinci),2022-09-01,20.0000,,,,43.9,,,,,
3,GPT-3.5,2022-11-30,20.0000,,,,64.8,,,,,
4,GPT-4,2023-03-01,37.5000,23.6,0.724,1000.0,86.0,33.0,,21.0,67.0,1186.0
...,...,...,...,...,...,...,...,...,...,...,...,...
63,o1,2024-12-17,26.2500,,,,,75.8,94.4,,,
64,DeepSeek-V3,2024-12-26,0.4775,,,,,,,,,
65,DeepSeek-R1,2025-01-20,0.9600,,,,,71.7,93.1,,,
66,o1-mini,2025-01-31,1.9250,,,,,59.5,84.3,,,


In [8]:
# Shorten the verbose davinci label so every such row reads simply 'GPT-3'
is_davinci = aa_df['Model Name'].eq('GPT-3 175B (davinci)')
aa_df.loc[is_davinci, 'Model Name'] = 'GPT-3'
aa_df


Unnamed: 0,Model Name,Release Date,USD per 1M Tokens,Tokens per Second,Prefill Latency (s),Prompt Length (tk),MMLU,GPQA Diamond,MATH 5,MATH-500,HumanEval,LMSys Chatbot Arena ELO
0,GPT-3,2021-11-20,60.0000,,,,43.9,,,,,
1,GPT-3,2022-08-31,60.0000,,,,43.9,,,,,
2,GPT-3,2022-09-01,20.0000,,,,43.9,,,,,
3,GPT-3.5,2022-11-30,20.0000,,,,64.8,,,,,
4,GPT-4,2023-03-01,37.5000,23.6,0.724,1000.0,86.0,33.0,,21.0,67.0,1186.0
...,...,...,...,...,...,...,...,...,...,...,...,...
63,o1,2024-12-17,26.2500,,,,,75.8,94.4,,,
64,DeepSeek-V3,2024-12-26,0.4775,,,,,,,,,
65,DeepSeek-R1,2025-01-20,0.9600,,,,,71.7,93.1,,,
66,o1-mini,2025-01-31,1.9250,,,,,59.5,84.3,,,


## Explore the data

In [9]:
# Price against MMLU score, one point per row, with price on a log axis
fig = px.scatter(
    aa_df,
    x='MMLU',
    y='USD per 1M Tokens',
    title='MMLU vs. USD per 1M Tokens',
    hover_data=['Model Name'],
)
fig.update_layout(yaxis=dict(type='log'))
fig.show()


In [10]:
# MMLU score per dollar (per 1M tokens), stored on aa_df for the cells that follow
mmlu_per_dollar = aa_df['MMLU'] / aa_df['USD per 1M Tokens']
aa_df['MMLU price-performance'] = mmlu_per_dollar

# Chart that ratio against release date on a log axis
fig = px.scatter(
    aa_df,
    x='Release Date',
    y='MMLU price-performance',
    hover_data=['Model Name'],
    title='MMLU price-performance over time',
)
fig.update_layout(yaxis=dict(type='log'))
fig.show()


In [11]:
# Label of the form 'Name (YYYY-MM)', combining the model name with its release month
release_month = aa_df['Release Date'].dt.strftime('%Y-%m')
aa_df['Model Name and Date'] = aa_df['Model Name'] + ' (' + release_month + ')'
aa_df['Model Name and Date']

0           GPT-3 (2021-11)
1           GPT-3 (2022-08)
2           GPT-3 (2022-09)
3         GPT-3.5 (2022-11)
4           GPT-4 (2023-03)
              ...          
63             o1 (2024-12)
64    DeepSeek-V3 (2024-12)
65    DeepSeek-R1 (2025-01)
66        o1-mini (2025-01)
67        o3-mini (2025-01)
Name: Model Name and Date, Length: 68, dtype: object

In [12]:
# Collect every entry that, at some release date, ranked in the top-n by
# 'MMLU price-performance' among all entries released up to that date
top_n = 1
aa_df = aa_df.sort_values(by='Release Date')
unique_dates = aa_df['Release Date'].sort_values().unique()

ever_top_n_models = set()
for cutoff in unique_dates:
    released_so_far = aa_df.loc[aa_df['Release Date'] <= cutoff]
    leaders = released_so_far.nlargest(top_n, 'MMLU price-performance')
    ever_top_n_models |= set(leaders['Model Name and Date'].tolist())

ever_top_n_list = sorted(ever_top_n_models)
print(ever_top_n_list)

# Show only the entries that were ever a leader
was_leader = aa_df['Model Name and Date'].isin(ever_top_n_list)
fig = px.scatter(
    aa_df[was_leader],
    x='Release Date',
    y='MMLU price-performance',
    hover_data=['Model Name and Date'],
    title='MMLU price-performance over time',
)
fig.update_layout(yaxis=dict(type='log'))
fig.show()

['GPT-3 (2021-11)', 'GPT-3 (2022-09)', 'GPT-3.5 (2022-11)', 'GPT-3.5 Turbo (2023-03)', 'GPT-3.5 Turbo (2023-11)', 'Gemini-1.5-Flash-2024-05 (2024-05)', 'Gemini-1.5-Flash-8B (2024-10)', 'Llama 2-7B (2023-12)', 'Llama-2-Chat-13B (2023-07)', 'Llama-3-Instruct-8B (2024-04)', 'Llama-3.1-Instruct-8B (2024-07)', 'Llama-3.2-Instruct-3B (2024-09)']


## Try fitting a regression to lowest-priced models above a performance lower bound

In [13]:
"""
  - Set a performance lower bound
  - Track the running best (top) model
  - At each point in time (at some resolution)
    - Filter to new models published in this time window
    - Filter to models with performance above the lower bound
    - Check if any new model is cheaper than current best
    - If so, update the current best
    - Record the current best model at this time point
"""
bench = 'MMLU'
performance_lower_bound = 83
ts = pd.date_range(start='2020-01-01', end='2025-01-01', freq='MS')
cheapest_models = []
current_best = None

for i, t in enumerate(ts):
    # Get models published in this time window
    benchmark_df = aa_df
    if i > 0:
        prev_t = ts[i-1]
        benchmark_df = benchmark_df[(benchmark_df['Release Date'] >= prev_t) & (benchmark_df['Release Date'] < t)]
    else:
        benchmark_df = benchmark_df[benchmark_df['Release Date'] < t]
        
    # Filter for performance
    benchmark_df = benchmark_df[benchmark_df[bench].notna()]
    benchmark_df = benchmark_df[benchmark_df[bench] > performance_lower_bound]
    
    if not benchmark_df.empty:
        # Find cheapest new model
        new_best = benchmark_df.loc[benchmark_df['USD per 1M Tokens'].idxmin()]
        
        # Update current best if new model is cheaper (or if no current best)
        if current_best is None or new_best['USD per 1M Tokens'] < current_best['USD per 1M Tokens']:
            current_best = new_best
            cheapest_models.append(current_best)
            print(t, current_best['Model Name'], current_best[bench], f"${current_best['USD per 1M Tokens']:.2f}")
        # elif new_best['USD per 1M Tokens'] == current_best['USD per 1M Tokens']:
        #     # Choose the model with the highest performance
        #     if new_best[bench] > current_best[bench]:
        #         current_best = new_best
        #         cheapest_models.append(current_best)
        #         print(t, current_best['Model Name'], current_best[bench], f"${current_best['USD per 1M Tokens']:.2f}")


2023-04-01 00:00:00 GPT-4 86.0 $37.50
2023-12-01 00:00:00 GPT-4 Turbo 87.0 $15.00
2024-06-01 00:00:00 Gemini-1.5-Pro-2024-05 86.0 $2.19
2024-08-01 00:00:00 Llama-3.1-Instruct-70B 84.0 $0.72


In [14]:
# Turn the list of record-setting rows (one Series per entry) into a DataFrame and preview it
cheapest_models_df = pd.DataFrame(cheapest_models)
cheapest_models_df.head()

Unnamed: 0,Model Name,Release Date,USD per 1M Tokens,Tokens per Second,Prefill Latency (s),Prompt Length (tk),MMLU,GPQA Diamond,MATH 5,MATH-500,HumanEval,LMSys Chatbot Arena ELO,MMLU price-performance,Model Name and Date
4,GPT-4,2023-03-01,37.5,23.6,0.724,1000.0,86.0,33.0,,21.0,67.0,1186.0,2.293333,GPT-4 (2023-03)
10,GPT-4 Turbo,2023-11-01,15.0,39.2,1.246,1000.0,87.0,50.0,36.0,74.0,92.0,1256.0,5.8,GPT-4 Turbo (2023-11)
26,Gemini-1.5-Pro-2024-05,2024-05-01,2.19,64.8,0.738,1000.0,86.0,46.0,,66.0,,1260.0,39.269406,Gemini-1.5-Pro-2024-05 (2024-05)
36,Llama-3.1-Instruct-70B,2024-07-01,0.72,71.6,0.425,1000.0,84.0,43.0,39.0,64.0,80.0,1249.0,116.666667,Llama-3.1-Instruct-70B (2024-07)


In [15]:
# Step chart of the record-low price over time, labelled with the model that set each record
first_release = cheapest_models_df['Release Date'].min()
last_release = cheapest_models_df['Release Date'].max()

fig = px.line(
    cheapest_models_df,
    x='Release Date',
    y='USD per 1M Tokens',
    text='Model Name',
    markers=True,
    line_shape='hv',  # horizontal-then-vertical steps between points
    title=f'Price of the cheapest model with {bench} > {performance_lower_bound}%',
)
fig.update_traces(textposition='bottom left')
fig.update_layout(
    yaxis_type='log',
    # Widen the x-range: 90 days before the first point, 30 days after the last
    # (presumably to leave room for the text labels)
    xaxis_range=[first_release - pd.Timedelta(days=90), last_release + pd.Timedelta(days=30)],
    width=800,
    height=600,
    font=dict(size=10),
)
if save:
    save_plot(fig, results_dir, f'aa_cheapest_models_{bench}_above_{performance_lower_bound}')
fig.show()

In [16]:
# Regress price on time for the frontier models, in log space (exponential trend)
# and in raw space (linear trend). Time is the release date as a day ordinal.
frontier_price = cheapest_models_df['USD per 1M Tokens']
cheapest_models_df['price'] = frontier_price
cheapest_models_df['log_price'] = np.log10(frontier_price)
cheapest_models_df['date'] = [
    pd.Timestamp(released).toordinal() for released in cheapest_models_df['Release Date']
]

exponential_model = smf.ols('log_price ~ date', data=cheapest_models_df).fit()
linear_model = smf.ols('price ~ date', data=cheapest_models_df).fit()

print(exponential_model.summary())

print(linear_model.summary())

                            OLS Regression Results                            
Dep. Variable:              log_price   R-squared:                       0.917
Model:                            OLS   Adj. R-squared:                  0.875
Method:                 Least Squares   F-statistic:                     22.09
Date:                Mon, 10 Feb 2025   Prob (F-statistic):             0.0424
Time:                        13:10:16   Log-Likelihood:                0.87176
No. Observations:                   4   AIC:                             2.256
Df Residuals:                       2   BIC:                             1.029
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   2518.3676    535.632      4.702      0.0


omni_normtest is not valid with less than 8 observations; 4 samples were given.


omni_normtest is not valid with less than 8 observations; 4 samples were given.



In [17]:
# The fitted slope is log10(price) per day: scale it to a year (365 days), then
# invert it into an 'N x cheaper per year' factor rounded to a whole number
annual_slope = exponential_model.params['date'] * 365
annual_factor = int(round(10**(-annual_slope)))

first_release = cheapest_models_df['Release Date'].min()
last_release = cheapest_models_df['Release Date'].max()

# Fitted exponential trend, evaluated at the observed dates
trend_trace = go.Scatter(
    x=cheapest_models_df['Release Date'],
    y=10**exponential_model.predict(cheapest_models_df['date']),
    mode='lines',
    name=f'Trendline: {annual_factor}x decrease per year',
    line=dict(color='lightgrey', dash='dash'),
)
# Observed record-low prices drawn as a step line with model-name labels
data_trace = go.Scatter(
    x=cheapest_models_df['Release Date'],
    y=cheapest_models_df['USD per 1M Tokens'],
    mode='lines+markers+text',
    name='Data',
    text=cheapest_models_df['Model Name'],
    textposition='bottom left',
    line=dict(shape='hv'),
)

fig = go.Figure(data=[trend_trace, data_trace])
fig.update_traces(textposition='bottom left')
fig.update_layout(
    title=f'Price of the cheapest model with {bench} > {performance_lower_bound}%',
    yaxis_type='log',
    xaxis_title='Month',
    yaxis_title='Price in USD per million tokens',
    # Extend the x-range 90 days before the first point and 30 days after the last
    xaxis_range=[first_release - pd.Timedelta(days=90), last_release + pd.Timedelta(days=30)],
    width=800,
    height=600,
    font=dict(size=10),
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="right",
        x=0.99,
        bordercolor="lightgrey",
        borderwidth=1,
    ),
)
if save:
    save_plot(fig, results_dir, f'aa_cheapest_models_{bench}_above_{performance_lower_bound}_with_trendline')
fig.show()

## Regression on lowest-priced models above a performance lower bound

In [18]:
# Sweep every benchmark and performance threshold: build the cheapest-model
# frontier, fit log10(price) ~ date on it, save a plot, and record the implied
# yearly price-reduction factor. A per-benchmark summary is written afterwards.
performance_lower_bounds = range(10, 100, 10)
run_dir = results_dir + 'aa_cheapest_models_run/'
os.makedirs(run_dir, exist_ok=True)

price_col = 'USD per 1M Tokens'

# Month-start boundaries of the release windows. The grid does not depend on the
# benchmark or the threshold, so it is built once rather than inside the loops.
# NOTE(review): the last window is [2024-12-01, 2025-01-01), so releases dated
# 2025-01-01 or later are never considered here -- confirm this cut-off is intended.
ts = pd.date_range(start='2020-01-01', end='2025-01-01', freq='MS')

# Explicit column layout for the per-threshold results, so the frame (and its CSV
# header) is well-formed even when no regression could be fitted at all
result_columns = [
    'bench',
    'performance_lower_bound',
    'sample_size',
    'start_date',
    'end_date',
    'price_reduction_factor_per_year',
    'r_squared',
]

log_path = run_dir + 'output.log'
results = []
with open(log_path, 'w') as log_file:

    def log_line(*parts):
        """Print a message to the notebook output and mirror it into the log file."""
        print(*parts)
        print(*parts, file=log_file)

    for bench_idx, bench in enumerate(benchmarks):
        if bench_idx > 0:
            log_line('\n')
        log_line(f'{bench}')

        for performance_lower_bound in performance_lower_bounds:
            log_line(f'\nPerformance lower bound: {performance_lower_bound}%')
            cheapest_models = []
            current_best = None

            # A distinct index name: the original reused `i`, shadowing the benchmark index
            for window_idx, t in enumerate(ts):
                # Models released in [previous boundary, t); the first window has no lower limit
                benchmark_df = aa_df
                if window_idx > 0:
                    prev_t = ts[window_idx - 1]
                    benchmark_df = benchmark_df[(benchmark_df['Release Date'] >= prev_t) & (benchmark_df['Release Date'] < t)]
                else:
                    benchmark_df = benchmark_df[benchmark_df['Release Date'] < t]

                # Keep models scoring strictly above the bound (a missing score fails the
                # comparison) and drop rows without a price, so idxmin always has a value
                benchmark_df = benchmark_df[benchmark_df[bench] > performance_lower_bound]
                benchmark_df = benchmark_df[benchmark_df[price_col].notna()]
                if benchmark_df.empty:
                    continue

                # Cheapest new model; it only counts if strictly cheaper than the record
                new_best = benchmark_df.loc[benchmark_df[price_col].idxmin()]
                if current_best is None or new_best[price_col] < current_best[price_col]:
                    current_best = new_best
                    cheapest_models.append(current_best)
                    log_line(t, current_best['Model Name'], current_best[bench], f"${current_best[price_col]:.2f}")

            cheapest_models_df = pd.DataFrame(cheapest_models)
            if len(cheapest_models_df) < 2:
                # A line cannot be fitted through fewer than two points
                log_line('Less than 2 models found')
                continue

            # Fit a line to the data (log10 price against the date as a day ordinal)
            cheapest_models_df['price'] = cheapest_models_df[price_col]
            cheapest_models_df['log_price'] = np.log10(cheapest_models_df[price_col])
            cheapest_models_df['date'] = cheapest_models_df['Release Date'].map(lambda x: pd.Timestamp(x).toordinal())
            exponential_model = smf.ols('log_price ~ date', data=cheapest_models_df).fit()

            # Calculate annual rate of decrease
            annual_slope = exponential_model.params['date'] * 365  # Convert daily to annual
            annual_factor = int(round(10**(-annual_slope)))  # Convert log slope to factor
            results.append({
                'bench': bench,
                'performance_lower_bound': performance_lower_bound,
                'sample_size': len(cheapest_models_df),
                'start_date': cheapest_models_df['Release Date'].min(),
                'end_date': cheapest_models_df['Release Date'].max(),
                'price_reduction_factor_per_year': annual_factor,
                'r_squared': round(exponential_model.rsquared, 2),
            })

            # Plot the exponential trendline with the data
            fig = go.Figure()
            fig.add_trace(go.Scatter(
                x=cheapest_models_df['Release Date'],
                y=10**exponential_model.predict(cheapest_models_df['date']),
                mode='lines',
                name=f'Trendline: {annual_factor}x decrease per year',
                line=dict(color='lightgrey', dash='dash')
            ))
            fig.add_trace(go.Scatter(
                x=cheapest_models_df['Release Date'],
                y=cheapest_models_df[price_col],
                mode='lines+markers+text',
                name='Data',
                text=cheapest_models_df['Model Name'],
                textposition='bottom left',
                line=dict(shape='hv')
            ))
            fig.update_traces(textposition='bottom left')
            fig.update_layout(
                title=f'Price of the cheapest model with {bench} > {performance_lower_bound}%',
                yaxis_type='log',
                xaxis_title='Month',
                yaxis_title='Price in USD per million tokens',
                # Extend the x-range 90 days before the first point and 30 days after the last
                xaxis_range=[
                    cheapest_models_df['Release Date'].min() - pd.Timedelta(days=90),
                    cheapest_models_df['Release Date'].max() + pd.Timedelta(days=30),
                ],
                width=800,
                height=600,
                font=dict(size=10),
                legend=dict(
                    yanchor="top",
                    y=0.99,
                    xanchor="right",
                    x=0.99,
                    bordercolor="lightgrey",
                    borderwidth=1
                )
            )
            if save:
                save_plot(
                    fig,
                    run_dir,
                    f'aa_cheapest_models_{bench}_above_{performance_lower_bound}_with_trendline',
                    extensions=['png'],
                )

cheapest_model_results_df = pd.DataFrame(results, columns=result_columns)
cheapest_model_results_df.to_csv(run_dir + 'cheapest_model_results.csv', index=False)

# Create a summary DataFrame: one row per benchmark, aggregating over thresholds
summary_data = []
for bench in benchmarks:
    bench_results = cheapest_model_results_df[cheapest_model_results_df['bench'] == bench]
    # Thresholds for which a regression was actually fitted on this benchmark
    perf_bounds = sorted(set(bench_results['performance_lower_bound']))

    price_factors = bench_results['price_reduction_factor_per_year'].dropna()
    if len(price_factors) > 0:
        # Geometric mean of the yearly factors, rounded to a whole number
        geomean = round(np.exp(np.mean(np.log(price_factors))))
        factor_range = [price_factors.min(), price_factors.max()]
    else:
        # No fit for this benchmark: keep NaN as-is, because round(NaN) raises ValueError
        geomean = np.nan
        factor_range = []

    summary_data.append({
        'bench': bench,
        'performance_lower_bounds': perf_bounds,
        'price_reduction_factor_per_year_geomean': geomean,
        'price_reduction_factor_per_year_range': factor_range
    })

cheapest_model_summary_df = pd.DataFrame(summary_data)
cheapest_model_summary_df.to_csv(run_dir + 'cheapest_model_summary.csv', index=False)

MMLU

Performance lower bound: 10%
2021-12-01 00:00:00 GPT-3 43.9 $60.00
2022-10-01 00:00:00 GPT-3 43.9 $20.00
2023-04-01 00:00:00 GPT-3.5 Turbo 68.0 $2.00
2023-08-01 00:00:00 Llama-2-Chat-7B 13.0 $0.33
2024-01-01 00:00:00 Llama 2-7B 45.3 $0.20
2024-05-01 00:00:00 Llama 2-7B 45.3 $0.13
2024-08-01 00:00:00 Llama-3.1-Instruct-8B 71.0 $0.10
2024-10-01 00:00:00 Llama-3.2-Instruct-1B 35.0 $0.05

Performance lower bound: 20%
2021-12-01 00:00:00 GPT-3 43.9 $60.00
2022-10-01 00:00:00 GPT-3 43.9 $20.00
2023-04-01 00:00:00 GPT-3.5 Turbo 68.0 $2.00
2023-08-01 00:00:00 Llama-2-Chat-13B 45.0 $0.56
2024-01-01 00:00:00 Llama 2-7B 45.3 $0.20
2024-05-01 00:00:00 Llama 2-7B 45.3 $0.13
2024-08-01 00:00:00 Llama-3.1-Instruct-8B 71.0 $0.10
2024-10-01 00:00:00 Llama-3.2-Instruct-1B 35.0 $0.05

Performance lower bound: 30%
2021-12-01 00:00:00 GPT-3 43.9 $60.00
2022-10-01 00:00:00 GPT-3 43.9 $20.00
2023-04-01 00:00:00 GPT-3.5 Turbo 68.0 $2.00
2023-08-01 00:00:00 Llama-2-Chat-13B 45.0 $0.56
2024-01-01 00:00:00

## Regression on top models by price-performance, one benchmark at a time

In [19]:
# For each benchmark: keep the models that were ever top-n by price-performance
# (score per dollar) among everything released up to their date, regress
# log10(price) on date and score, and record the implied yearly price drop.
top_n = 1
results_subdir = f'aa_top_{top_n}_price_performance_run/'
os.makedirs(results_dir + results_subdir, exist_ok=True)

# 'log_price ~ date + bench' estimates three coefficients (intercept, date, bench).
# With fewer than four rows there are no residual degrees of freedom, the
# confidence bounds are undefined (NaN), and int(round(...)) below then raises.
min_observations = 4

results = []
for bench in benchmarks:
    benchmark_df = aa_df.copy()
    benchmark_df = benchmark_df.dropna(subset=[bench])
    benchmark_df = benchmark_df.sort_values(by='Release Date')
    benchmark_df[f'{bench} price-performance'] = benchmark_df[bench] / benchmark_df['USD per 1M Tokens']

    # Find the rolling top-n models: at each release date, rank everything released so far
    unique_dates = benchmark_df['Release Date'].unique()
    ever_top_n_models = set()
    for date in unique_dates:
        df_up_to_date = benchmark_df[benchmark_df['Release Date'] <= date]
        top_n_models = df_up_to_date.nlargest(top_n, f'{bench} price-performance')
        top_n_model_names = top_n_models['Model Name and Date'].tolist()
        ever_top_n_models.update(top_n_model_names)
    ever_top_n_list = sorted(ever_top_n_models)
    benchmark_df = benchmark_df[benchmark_df['Model Name and Date'].isin(ever_top_n_list)]

    if len(benchmark_df) < min_observations:
        print(f'Less than {min_observations} models found for {bench}')
        continue

    # Fit a line to the data; the score is copied to a fixed column name so the
    # formula is the same for every benchmark
    benchmark_df['price'] = benchmark_df['USD per 1M Tokens']
    benchmark_df['log_price'] = np.log10(benchmark_df['USD per 1M Tokens'])
    benchmark_df['bench'] = benchmark_df[bench]
    benchmark_df['date'] = benchmark_df['Release Date'].map(lambda x: pd.Timestamp(x).toordinal())
    exponential_model = smf.ols('log_price ~ date + bench', data=benchmark_df).fit()

    # Calculate annual rate of decrease
    annual_slope = exponential_model.params['date'] * 365  # Convert daily to annual
    annual_factor = int(round(10**(-annual_slope)))  # Convert log slope to factor
    # 90% interval on the date slope. The lower slope bound is the steeper decline,
    # so it maps to the HIGH end of the price-reduction factor (and vice versa).
    annual_slope_ci = exponential_model.conf_int(alpha=0.1).loc['date']
    annual_slope_ci_high = int(round(10**(-annual_slope_ci[0] * 365)))
    annual_slope_ci_low = int(round(10**(-annual_slope_ci[1] * 365)))
    results.append({
        'bench': bench,
        'sample_size': len(benchmark_df),
        'start_date': benchmark_df['Release Date'].min(),
        'end_date': benchmark_df['Release Date'].max(),
        'price_reduction_factor_per_year_mean': annual_factor,
        'price_reduction_factor_per_year_ci': [annual_slope_ci_low, annual_slope_ci_high],
        'r_squared': round(exponential_model.rsquared, 2),
    })

    # Plot the fitted values with the data. The predictions use each model's own
    # score as well as its date, so the dashed line need not be straight.
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=benchmark_df['Release Date'],
        y=10**exponential_model.predict(benchmark_df),
        mode='lines',
        name=f'Regression: {annual_factor}x decrease per year at fixed performance',
        line=dict(color='lightgrey', dash='dash')
    ))
    fig.add_trace(go.Scatter(
        x=benchmark_df['Release Date'],
        y=benchmark_df['USD per 1M Tokens'],
        mode='markers+text',
        name='Model with the best price-performance at the time',
        text=benchmark_df['Model Name'],
        textposition='bottom left',
    ))
    fig.update_layout(
        title=f'Price of models with the best price-performance on {bench}'
    )
    fig.update_traces(textposition='bottom left')
    fig.update_layout(yaxis_type='log')
    fig.update_layout(xaxis_title='Month')
    fig.update_layout(yaxis_title='Price in USD per million tokens')
    # Lower the lower x limit
    fig.update_layout(xaxis_range=[benchmark_df['Release Date'].min() - pd.Timedelta(days=90), benchmark_df['Release Date'].max()+pd.Timedelta(days=30)])
    fig.update_layout(
        width=800,
        height=600,
        font=dict(size=10),
        legend=dict(
            yanchor="top",
            y=0.99,
            xanchor="right",
            x=0.99,
            bordercolor="lightgrey",
            borderwidth=1
        )
    )
    if save:
        save_plot(
            fig,
            results_dir + results_subdir,
            f'top_{top_n}_price_performance_{bench}_with_trendline',
            extensions=['png'],
        )

top_n_price_performance_results_df = pd.DataFrame(results)
top_n_price_performance_results_df.to_csv(results_dir + results_subdir + 'top_n_price_performance_results.csv', index=False)

## Regression on all data, one benchmark at a time

In [20]:
# Regress log10(price) on release date (as a day ordinal) and benchmark score over
# all rows. The helper columns are written onto aa_df itself; the per-benchmark
# cells further down read 'log_price' and 'date' from it.
bench = 'MMLU'
aa_df['log_price'] = np.log10(aa_df['USD per 1M Tokens'])
aa_df['date'] = aa_df['Release Date'].map(lambda released: pd.Timestamp(released).toordinal())
aa_df['bench'] = aa_df[bench]

# Fit and report
regression_formula = 'log_price ~ date + bench'
model = smf.ols(regression_formula, data=aa_df).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:              log_price   R-squared:                       0.514
Model:                            OLS   Adj. R-squared:                  0.495
Method:                 Least Squares   F-statistic:                     28.01
Date:                Mon, 10 Feb 2025   Prob (F-statistic):           5.02e-09
Time:                        13:10:19   Log-Likelihood:                -48.089
No. Observations:                  56   AIC:                             102.2
Df Residuals:                      53   BIC:                             108.3
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   2143.3714    303.000      7.074      0.0

In [21]:
# Coefficient point estimates, then their confidence intervals at conf_int()'s default level
print(model.params)
print(model.conf_int())


Intercept    2143.371352
date           -0.002904
bench           0.033162
dtype: float64
                     0            1
Intercept  1535.629985  2751.112718
date         -0.003727    -0.002081
bench         0.021552     0.044772


In [22]:
# The 'date' coefficient is log10(price) per day: multiply by 365 for a yearly
# slope, then invert it into a whole-number 'N x cheaper per year' factor
daily_log_slope = model.params['date']
date_param = daily_log_slope * 365
annual_factor = int(round(10**(-date_param)))
print(f'It costs {annual_factor}x less each year to keep {bench} performance fixed.')

It costs 11x less each year to keep MMLU performance fixed.


In [23]:
# Lower confidence bound (column 0 of conf_int()) on the per-day 'date' coefficient
model.conf_int().loc['date'][0]


-0.003726549630275529

In [24]:
# One regression per benchmark: log10(price) on date and that benchmark's score,
# using every row that reports the benchmark. Relies on the 'log_price' and 'date'
# columns already present on aa_df.
results = []
for bench in benchmarks:
    print(f'{bench}')
    # Expose the score under a fixed column name so one formula serves all benchmarks
    df = aa_df.dropna(subset=[bench]).copy()
    df['bench'] = df[bench]

    model = smf.ols('log_price ~ date + bench', data=df).fit()
    n_rows = len(df)
    print(f'Number of observations: {n_rows}')
    print(f'R-squared: {model.rsquared:.2f}')

    # Yearly slope and its 90% interval, all in log10(price) per year
    date_param = model.params['date'] * 365
    slope_lo_per_day, slope_hi_per_day = model.conf_int(alpha=0.1).loc['date']
    date_param_ci_low = slope_lo_per_day * 365
    date_param_ci_high = slope_hi_per_day * 365

    # Invert slopes into price-reduction factors; the most negative slope bound
    # gives the largest factor, so low and high swap roles
    annual_factor = int(round(10**(-date_param)))
    annual_factor_ci_high = int(round(10**(-date_param_ci_low)))
    annual_factor_ci_low = int(round(10**(-date_param_ci_high)))

    min_date, max_date = (
        pd.Timestamp.fromordinal(ordinal).date()
        for ordinal in (df["date"].min(), df["date"].max())
    )
    date_range = max_date - min_date

    results.append(dict([
        ('bench', bench),
        ('sample_size', n_rows),
        ('start_date', min_date),
        ('end_date', max_date),
        ('price_reduction_factor_per_year_mean', annual_factor),
        ('price_reduction_factor_per_year_ci', [annual_factor_ci_low, annual_factor_ci_high]),
        ('r_squared', round(model.rsquared, 2)),
    ]))

    print(f'The price to achieve a fixed level of {bench} performance fell at a rate of')
    print(f'{annual_factor}x [{annual_factor_ci_low}, {annual_factor_ci_high}] per year')
    print(f'over {date_range.days / 365:.1f} years ({min_date} to {max_date})')
    print()

single_benchmark_regression_results_df = pd.DataFrame(results)
single_benchmark_regression_results_df.to_csv(results_dir + 'single_benchmark_regression_results.csv', index=False)

MMLU
Number of observations: 56
R-squared: 0.51
The price to achieve a fixed level of MMLU performance fell at a rate of
11x [6, 20] per year
over 3.0 years (2021-11-20 to 2024-11-01)

GPQA Diamond
Number of observations: 48
R-squared: 0.51
The price to achieve a fixed level of GPQA Diamond performance fell at a rate of
9x [4, 22] per year
over 1.9 years (2023-03-01 to 2025-01-31)

MATH 5
Number of observations: 35
R-squared: 0.19
The price to achieve a fixed level of MATH 5 performance fell at a rate of
6x [1, 39] per year
over 1.6 years (2023-06-13 to 2025-01-31)

MATH-500
Number of observations: 35
R-squared: 0.46
The price to achieve a fixed level of MATH-500 performance fell at a rate of
36x [10, 130] per year
over 1.7 years (2023-03-01 to 2024-11-01)

HumanEval
Number of observations: 33
R-squared: 0.48
The price to achieve a fixed level of HumanEval performance fell at a rate of
12x [4, 35] per year
over 1.7 years (2023-03-01 to 2024-11-01)



## Regression on all data, all benchmarks at once

In [25]:
# Joint regression: log10(price) on date and ALL benchmark scores at once, using
# only rows that report every benchmark.
# Benchmark columns are renamed to lowercase with underscores so they can be used
# as plain terms in the model formula.
new_bench_cols = [col.lower().translate(str.maketrans(' -', '__')) for col in benchmarks]
column_renames = dict(zip(benchmarks, new_bench_cols))
df = aa_df.rename(columns=column_renames).dropna(subset=new_bench_cols)

formula_terms = ['date', *new_bench_cols]
model = smf.ols('log_price ~ ' + ' + '.join(formula_terms), data=df).fit()
print(f'Number of observations: {len(df)}')
print(f'R-squared: {model.rsquared:.2f}')

# Yearly slope and its 90% interval, all in log10(price) per year
date_param = model.params['date'] * 365
slope_lo_per_day, slope_hi_per_day = model.conf_int(alpha=0.1).loc['date']
date_param_ci_low = slope_lo_per_day * 365
date_param_ci_high = slope_hi_per_day * 365

# Invert slopes into price-reduction factors; low and high swap under the sign flip
annual_factor = int(round(10**(-date_param)))
annual_factor_ci_high = int(round(10**(-date_param_ci_low)))
annual_factor_ci_low = int(round(10**(-date_param_ci_high)))

min_date, max_date = (
    pd.Timestamp.fromordinal(ordinal).date()
    for ordinal in (df["date"].min(), df["date"].max())
)
date_range = max_date - min_date

results = [dict([
    ('benchmarks', benchmarks),
    ('sample_size', len(df)),
    ('start_date', min_date),
    ('end_date', max_date),
    ('price_reduction_factor_per_year_mean', annual_factor),
    ('price_reduction_factor_per_year_ci', [annual_factor_ci_low, annual_factor_ci_high]),
    ('r_squared', round(model.rsquared, 2)),
])]

print('The price to achieve a fixed level of performance fell at a rate of')
print(f'{annual_factor}x [{annual_factor_ci_low}, {annual_factor_ci_high}] per year')
print(f'over {date_range.days / 365:.1f} years ({min_date} to {max_date})')
print()

all_benchmarks_regression_results_df = pd.DataFrame(results)
all_benchmarks_regression_results_df.to_csv(results_dir + 'all_benchmarks_regression_results.csv', index=False)

Number of observations: 25
R-squared: 0.84
The price to achieve a fixed level of performance fell at a rate of
38x [11, 130] per year
over 1.3 years (2023-06-13 to 2024-10-01)



## Regression on top models by price-performance on any benchmark, all benchmarks at once

*TODO: this analysis is not implemented yet; the section is a placeholder.*

## Summary and comparison of methods

In [26]:
# Compare the yearly price-reduction estimates of the four methods, one benchmark at a time
def _first_row_for_bench(results_df, bench):
    """Return the first row of results_df for bench, or None if the benchmark is absent.

    A benchmark can be absent when an upstream cell skipped it (for example because
    too few models were available to fit a regression).
    """
    if 'bench' not in results_df.columns:
        return None
    matches = results_df[results_df['bench'] == bench]
    return matches.iloc[0] if len(matches) > 0 else None


def _print_trend(label, row, estimate_key, spread_key):
    """Print one method's estimate and its range/interval, or 'n/a' when row is None."""
    if row is None:
        print(label, 'n/a')
    else:
        print(label, row[estimate_key], row[spread_key])


# The joint regression has a single result that applies to every benchmark
all_benchmarks_regression_result = all_benchmarks_regression_results_df.iloc[0]

for bench in benchmarks:
    print(f'{bench} trends')
    _print_trend(
        'Cheapest model at threshold:',
        _first_row_for_bench(cheapest_model_summary_df, bench),
        'price_reduction_factor_per_year_geomean',
        'price_reduction_factor_per_year_range',
    )
    _print_trend(
        'Top model by price-performance:',
        _first_row_for_bench(top_n_price_performance_results_df, bench),
        'price_reduction_factor_per_year_mean',
        'price_reduction_factor_per_year_ci',
    )
    _print_trend(
        'Single benchmark regression:',
        _first_row_for_bench(single_benchmark_regression_results_df, bench),
        'price_reduction_factor_per_year_mean',
        'price_reduction_factor_per_year_ci',
    )
    _print_trend(
        'All benchmarks regression:',
        all_benchmarks_regression_result,
        'price_reduction_factor_per_year_mean',
        'price_reduction_factor_per_year_ci',
    )
    print()

MMLU trends
Cheapest model at threshold: 20 [12, 96]
Top model by price-performance: 18 [11, 28]
Single benchmark regression: 11 [6, 20]
All benchmarks regression: 38 [11, 130]

GPQA Diamond trends
Cheapest model at threshold: 68 [27, 331]
Top model by price-performance: 46 [16, 135]
Single benchmark regression: 9 [4, 22]
All benchmarks regression: 38 [11, 130]

MATH 5 trends
Cheapest model at threshold: 1754 [27, 288635]
Top model by price-performance: 31 [11, 83]
Single benchmark regression: 6 [1, 39]
All benchmarks regression: 38 [11, 130]

MATH-500 trends
Cheapest model at threshold: 87 [20, 390]
Top model by price-performance: 55 [15, 198]
Single benchmark regression: 36 [10, 130]
All benchmarks regression: 38 [11, 130]

HumanEval trends
Cheapest model at threshold: 46 [10, 299]
Top model by price-performance: 34 [14, 81]
Single benchmark regression: 12 [4, 35]
All benchmarks regression: 38 [11, 130]

