In [1]:
import numpy as np
import os
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
import statsmodels.formula.api as smf

from data import load_pcd_df
from plotting import save_plot

# Make "plotly_white" the default template for the figures created in this notebook.
pio.templates.default = "plotly_white"

In [2]:
# Directory that receives the plots, logs and CSV files written by this notebook.
results_dir = 'results/2025-01-28_fresh/'
# When True, figures are also written to disk through save_plot.
save = True
os.makedirs(results_dir, exist_ok=True)

# Testing Artificial Analysis data

In [3]:
# Load the Artificial Analysis table: one row per model, with price, speed
# and benchmark-score columns.
aa_data_path = 'data/aa_data.csv'
aa_df = pd.read_csv(aa_data_path)
aa_df

Unnamed: 0,Model Name,Tier,Release Date,USD per 1M Tokens,Tokens per Second,Prefill Latency (s),Prompt Length (tk),MMLU,GPQA Diamond,MATH-500,HumanEval,LMSys Chatbot Arena ELO
0,GPT-4o-2024-05,1,2024-05,7.5,86.4,0.687,1000,87,51,79.0,93.0,1285.0
1,GPT-4o-2024-08,1,2024-08,4.38,81.0,0.673,1000,89,51,80.0,93.0,1337.0
2,GPT-4o-2024-11,1,2024-11,4.38,148.9,0.404,1000,86,39,74.0,93.0,1361.0
3,GPT-4o-mini,2,2024-07,0.26,112.2,0.626,1000,82,43,79.0,88.0,1273.0
4,GPT-4,1,2023-03,37.5,23.6,0.724,1000,86,33,21.0,67.0,1186.0
5,GPT-4 Turbo,1,2023-11,15.0,39.2,1.246,1000,87,50,74.0,92.0,1256.0
6,GPT-3.5 Turbo,2,2023-11,0.75,121.5,0.598,1000,68,30,44.0,71.0,1107.0
7,Llama-3.1-Instruct-405B,1,2024-07,3.5,27.9,0.719,1000,87,50,70.0,87.0,1266.0
8,Llama-3.1-Instruct-70B,2,2024-07,0.72,71.6,0.425,1000,84,43,64.0,80.0,1249.0
9,Llama-3.1-Instruct-8B,3,2024-07,0.1,164.3,0.35,1000,71,27,50.0,67.0,1172.0


In [4]:
# 'Release Date' arrives as a 'YYYY-MM' string; parse it into a Timestamp
# (the first day of that month).
# The dtype guard keeps this cell idempotent: on a re-run the column is already
# datetime64, which has no `.str` accessor, so the unguarded conversion would
# raise AttributeError.
if not pd.api.types.is_datetime64_any_dtype(aa_df['Release Date']):
    aa_df['Release Date'] = pd.to_datetime(aa_df['Release Date'].str.strip(), format='%Y-%m')

In [5]:
# Scatter of token price against MMLU score. Prices in the table span several
# orders of magnitude, so the y-axis is logarithmic.
fig = px.scatter(
    aa_df,
    x='MMLU',
    y='USD per 1M Tokens',
    title='MMLU vs. USD per 1M Tokens',
)
fig.update_layout(yaxis=dict(type='log'))
fig.show()


In [6]:
# MMLU score divided by price (USD per 1M tokens): higher means more score per dollar.
mmlu_score = aa_df['MMLU']
usd_per_mtok = aa_df['USD per 1M Tokens']
aa_df['MMLU price-performance'] = mmlu_score / usd_per_mtok

# Show how that ratio is distributed over release dates (log y-axis).
fig = px.scatter(
    aa_df,
    x='Release Date',
    y='MMLU price-performance',
    hover_data=['Model Name'],
    title='MMLU price-performance over time',
)
fig.update_layout(yaxis=dict(type='log'))
fig.show()


In [7]:
# Order the models chronologically so the running maximum below reads as
# "best MMLU price-performance seen so far".
aa_df = aa_df.sort_values('Release Date')
aa_df['cumulative_max'] = aa_df['MMLU price-performance'].cummax()

fig = px.scatter(
    aa_df,
    x='Release Date',
    y='cumulative_max',
    hover_data=['Model Name'],
    title='Cumulative max MMLU price-performance over time',
)
fig.update_layout(yaxis=dict(type='log'))
fig.show()


## Fit minimum cost curve

In [8]:
# Minimum-cost frontier for a single benchmark / threshold pair.
#
# Walk a monthly grid. For each window:
#   1. keep the models released inside the window,
#   2. keep those scoring strictly above the performance lower bound,
#   3. take the cheapest of them,
#   4. if it undercuts the running best (or there is no running best yet),
#      it becomes the new running best and is recorded and printed.
bench = 'HumanEval'
performance_lower_bound = 80
ts = pd.date_range(start='2020-01-01', end='2025-01-01', freq='MS')
cheapest_models = []
current_best = None

release_dates = aa_df['Release Date']
scores = aa_df[bench]
# Models without a score for this benchmark never qualify.
qualifies = scores.notna() & (scores > performance_lower_bound)

for step, window_end in enumerate(ts):
    # The first window is open-ended on the left; every later one covers
    # [previous grid point, current grid point).
    if step == 0:
        in_window = release_dates < window_end
    else:
        in_window = (release_dates >= ts[step - 1]) & (release_dates < window_end)

    candidates = aa_df[in_window & qualifies]
    if candidates.empty:
        continue

    window_cheapest = candidates.loc[candidates['USD per 1M Tokens'].idxmin()]
    undercuts_best = (
        current_best is None
        or window_cheapest['USD per 1M Tokens'] < current_best['USD per 1M Tokens']
    )
    if not undercuts_best:
        continue

    current_best = window_cheapest
    cheapest_models.append(current_best)
    # The printed timestamp is the end of the window in which the model appeared.
    print(window_end, current_best['Model Name'], current_best[bench], f"${current_best['USD per 1M Tokens']:.2f}")

2023-12-01 00:00:00 GPT-4 Turbo 92.0 $15.00
2024-06-01 00:00:00 GPT-4o-2024-05 93.0 $7.50
2024-07-01 00:00:00 Claude-3.5-Haiku 87.0 $1.60
2024-08-01 00:00:00 GPT-4o-mini 88.0 $0.26
2024-10-01 00:00:00 Gemini-1.5-Flash-2024-09 83.0 $0.13


In [9]:
# One row per frontier update, in the order the updates were recorded.
cheapest_models_df = pd.DataFrame(data=cheapest_models)
cheapest_models_df.head(n=5)

Unnamed: 0,Model Name,Tier,Release Date,USD per 1M Tokens,Tokens per Second,Prefill Latency (s),Prompt Length (tk),MMLU,GPQA Diamond,MATH-500,HumanEval,LMSys Chatbot Arena ELO,MMLU price-performance,cumulative_max
5,GPT-4 Turbo,1,2023-11-01,15.0,39.2,1.246,1000,87,50,74.0,92.0,1256.0,5.8,80.357143
0,GPT-4o-2024-05,1,2024-05-01,7.5,86.4,0.687,1000,87,51,79.0,93.0,1285.0,11.6,426.666667
26,Claude-3.5-Haiku,3,2024-06-01,1.6,64.2,0.768,1000,81,37,67.0,87.0,,50.625,607.692308
3,GPT-4o-mini,2,2024-07-01,0.26,112.2,0.626,1000,82,43,79.0,88.0,1273.0,315.384615,710.0
18,Gemini-1.5-Flash-2024-09,3,2024-09-01,0.13,190.5,0.348,1000,75,45,83.0,83.0,1271.0,576.923077,710.0


In [10]:
# Step chart of the running cheapest qualifying model, labelled with model names.
frontier_title = f'Cost of the cheapest model with {bench} > {performance_lower_bound}%'
fig = px.line(
    cheapest_models_df,
    x='Release Date',
    y='USD per 1M Tokens',
    text='Model Name',
    markers=True,
    line_shape='hv',  # step-shaped line instead of straight segments
    title=frontier_title,
)
fig.update_traces(textposition='bottom left')
fig.update_layout(
    yaxis_type='log',
    width=800,
    height=600,
    font=dict(size=10),
)
# Saving is intentionally disabled for this figure; the call is kept for reference.
# if save:
#     save_plot(fig, results_dir, f'aa_cheapest_models_{bench}_above_{performance_lower_bound}')
fig.show()

In [11]:
# Regression inputs: raw price, log10 price, and the release date as a
# proleptic-Gregorian ordinal (an integer day count).
frontier_price = cheapest_models_df['USD per 1M Tokens']
cheapest_models_df['price'] = frontier_price
cheapest_models_df['log_price'] = np.log10(frontier_price)
cheapest_models_df['date'] = cheapest_models_df['Release Date'].map(
    lambda released: pd.Timestamp(released).toordinal()
)

# Exponential trend = straight line in log10(price); linear trend = straight line in price.
exponential_model = smf.ols('log_price ~ date', data=cheapest_models_df).fit()
linear_model = smf.ols('price ~ date', data=cheapest_models_df).fit()

for fitted_model in (exponential_model, linear_model):
    print(fitted_model.summary())

                            OLS Regression Results                            
Dep. Variable:              log_price   R-squared:                       0.759
Model:                            OLS   Adj. R-squared:                  0.679
Method:                 Least Squares   F-statistic:                     9.454
Date:                Tue, 28 Jan 2025   Prob (F-statistic):             0.0544
Time:                        15:50:03   Log-Likelihood:                -2.4166
No. Observations:                   5   AIC:                             8.833
Df Residuals:                       3   BIC:                             8.052
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   5012.7090   1630.281      3.075      0.0


omni_normtest is not valid with less than 8 observations; 5 samples were given.


omni_normtest is not valid with less than 8 observations; 5 samples were given.



In [12]:
# The fitted slope is in log10(USD) per day; scale it to a year (365 days) and
# express it as a rounded "N times cheaper per year" factor.
annual_slope = exponential_model.params['date'] * 365
annual_factor = int(round(10 ** (-annual_slope)))

frontier_dates = cheapest_models_df['Release Date']
trend_prices = 10**exponential_model.predict(cheapest_models_df['date'])

# Dashed grey line: the fitted exponential trend evaluated at the frontier dates.
trend_trace = go.Scatter(
    x=frontier_dates,
    y=trend_prices,
    mode='lines',
    name=f'Trendline: {annual_factor}x decrease per year',
    line=dict(color='lightgrey', dash='dash'),
)
# Step line with markers and labels: the observed frontier itself.
data_trace = go.Scatter(
    x=frontier_dates,
    y=cheapest_models_df['USD per 1M Tokens'],
    mode='lines+markers+text',
    name='Data',
    text=cheapest_models_df['Model Name'],
    textposition='bottom left',
    line=dict(shape='hv'),
)

fig = go.Figure(data=[trend_trace, data_trace])
fig.update_traces(textposition='bottom left')
fig.update_layout(
    title=f'Cost of the cheapest model with {bench} > {performance_lower_bound}%',
    yaxis_type='log',
    xaxis_title='Month',
    yaxis_title='Cost in USD per million tokens',
    # Pad the x-range: 90 days before the first point, 30 days after the last.
    xaxis_range=[
        frontier_dates.min() - pd.Timedelta(days=90),
        frontier_dates.max()+pd.Timedelta(days=30),
    ],
    width=800,
    height=600,
    font=dict(size=10),
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="right",
        x=0.99,
        bordercolor="lightgrey",
        borderwidth=1,
    ),
)
if save:
    save_plot(fig, results_dir, f'aa_cheapest_models_{bench}_above_{performance_lower_bound}_with_trendline')
fig.show()

In [13]:
benchmarks = ['MMLU', 'GPQA Diamond', 'MATH-500','HumanEval']
performance_lower_bounds = [40, 50, 60, 70, 80, 90]
run_dir = results_dir + 'aa_cheapest_models_run/'
os.makedirs(run_dir, exist_ok=True)

# The monthly grid is the same for every benchmark/threshold pair, so build it once.
ts = pd.date_range(start='2020-01-01', end='2025-01-01', freq='MS')


def _find_cheapest_models(models_df, bench, performance_lower_bound, ts, log=print):
    """Trace the running cheapest model that scores above a benchmark threshold.

    Walks the time grid `ts`. The first window holds every model released
    before `ts[0]`; each later window holds the models released in
    `[ts[k-1], ts[k])`. Within a window, models with a missing `bench` score or
    a score not strictly above `performance_lower_bound` are dropped and the
    cheapest remaining one (by 'USD per 1M Tokens') is compared with the
    running best. It is recorded only if it is strictly cheaper, or if there
    is no running best yet.

    Parameters:
        models_df: frame with 'Release Date', 'USD per 1M Tokens',
            'Model Name' and `bench` columns.
        bench: name of the benchmark score column.
        performance_lower_bound: exclusive lower bound on the score.
        ts: ordered sequence of window end points.
        log: print-like callable invoked once per recorded model with
            (window end, model name, score, formatted price).

    Returns:
        DataFrame with one row per recorded model, in the order found. It is
        empty when no model ever qualifies.
    """
    cheapest_models = []
    current_best = None
    for window_idx, t in enumerate(ts):
        if window_idx > 0:
            prev_t = ts[window_idx - 1]
            in_window = (models_df['Release Date'] >= prev_t) & (models_df['Release Date'] < t)
        else:
            in_window = models_df['Release Date'] < t
        window_df = models_df[in_window]

        window_df = window_df[window_df[bench].notna()]
        window_df = window_df[window_df[bench] > performance_lower_bound]
        if window_df.empty:
            continue

        new_best = window_df.loc[window_df['USD per 1M Tokens'].idxmin()]
        if current_best is None or new_best['USD per 1M Tokens'] < current_best['USD per 1M Tokens']:
            current_best = new_best
            cheapest_models.append(current_best)
            log(t, current_best['Model Name'], current_best[bench], f"${current_best['USD per 1M Tokens']:.2f}")
    return pd.DataFrame(cheapest_models)


def _plot_cheapest_models_with_trendline(cheapest_models_df, trend_prices, annual_factor, bench, performance_lower_bound):
    """Build the frontier figure: observed step line plus the fitted trendline.

    Parameters:
        cheapest_models_df: frontier rows with 'Release Date',
            'USD per 1M Tokens' and 'Model Name' columns.
        trend_prices: trendline prices, one per row of `cheapest_models_df`.
        annual_factor: yearly cost-reduction factor shown in the legend.
        bench, performance_lower_bound: used in the figure title.

    Returns:
        The plotly Figure (not shown and not saved here).
    """
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=cheapest_models_df['Release Date'],
        y=trend_prices,
        mode='lines',
        name=f'Trendline: {annual_factor}x decrease per year',
        line=dict(color='lightgrey', dash='dash')
    ))
    fig.add_trace(go.Scatter(
        x=cheapest_models_df['Release Date'],
        y=cheapest_models_df['USD per 1M Tokens'],
        mode='lines+markers+text',
        name='Data',
        text=cheapest_models_df['Model Name'],
        textposition='bottom left',
        line=dict(shape='hv')
    ))
    fig.update_layout(
        title=f'Cost of the cheapest model with {bench} > {performance_lower_bound}%'
    )
    fig.update_traces(textposition='bottom left')
    fig.update_layout(yaxis_type='log')
    fig.update_layout(xaxis_title='Month')
    fig.update_layout(yaxis_title='Cost in USD per million tokens')
    # Pad the x-range: 90 days before the first point, 30 days after the last.
    fig.update_layout(xaxis_range=[cheapest_models_df['Release Date'].min() - pd.Timedelta(days=90), cheapest_models_df['Release Date'].max()+pd.Timedelta(days=30)])
    fig.update_layout(
        width=800,
        height=600,
        font=dict(size=10),
        legend=dict(
            yanchor="top",
            y=0.99,
            xanchor="right",
            x=0.99,
            bordercolor="lightgrey",
            borderwidth=1
        )
    )
    return fig


# Every message goes both to the notebook output and to this log file.
log_path = run_dir + 'output.log'
results = []
with open(log_path, 'w') as log_file:

    def log(*args):
        """Print `args` to stdout and write the same line to the log file."""
        print(*args)
        print(*args, file=log_file)

    # Distinct index name: the original inner loop reused `i` and shadowed this one.
    for bench_idx, bench in enumerate(benchmarks):
        if bench_idx > 0:
            log('\n')
        log(f'{bench}')
        for performance_lower_bound in performance_lower_bounds:
            log(f'\nPerformance lower bound: {performance_lower_bound}%')

            cheapest_models_df = _find_cheapest_models(aa_df, bench, performance_lower_bound, ts, log=log)
            # A line cannot be fitted through fewer than two points.
            if len(cheapest_models_df) < 2:
                log('Less than 2 models found')
                continue

            # Fit a straight line to log10(price) against the date ordinal (days).
            cheapest_models_df['log_price'] = np.log10(cheapest_models_df['USD per 1M Tokens'])
            cheapest_models_df['date'] = cheapest_models_df['Release Date'].map(lambda x: pd.Timestamp(x).toordinal())
            exponential_model = smf.ols('log_price ~ date', data=cheapest_models_df).fit()

            # Daily log10 slope -> yearly slope (365 days) -> rounded reduction factor.
            annual_slope = exponential_model.params['date'] * 365
            annual_factor = int(round(10**(-annual_slope)))
            results.append({
                'bench': bench,
                'performance_lower_bound': performance_lower_bound,
                'sample_size': len(cheapest_models_df),
                'start_date': cheapest_models_df['Release Date'].min(),
                'end_date': cheapest_models_df['Release Date'].max(),
                'cost_reduction_factor_per_year': annual_factor,
                'r_squared': round(exponential_model.rsquared, 2),
            })

            fig = _plot_cheapest_models_with_trendline(
                cheapest_models_df,
                10**exponential_model.predict(cheapest_models_df['date']),
                annual_factor,
                bench,
                performance_lower_bound,
            )
            if save:
                save_plot(
                    fig,
                    run_dir,
                    f'aa_cheapest_models_{bench}_above_{performance_lower_bound}_with_trendline',
                    extensions=['png'],
                )

cheapest_model_results_df = pd.DataFrame(results)
cheapest_model_results_df.to_csv(run_dir + 'cheapest_model_results.csv', index=False)

MMLU

Performance lower bound: 40%
2023-04-01 00:00:00 GPT-4 86 $37.50
2023-08-01 00:00:00 Llama-2-Chat-13B 45 $0.56
2024-04-01 00:00:00 Claude-3-Haiku 71 $0.50
2024-05-01 00:00:00 Llama-3-Instruct-8B 64 $0.15
2024-06-01 00:00:00 Gemini-1.5-Flash-2024-05 79 $0.13
2024-08-01 00:00:00 Llama-3.1-Instruct-8B 71 $0.10
2024-10-01 00:00:00 Llama-3.2-Instruct-3B 64 $0.08
2024-11-01 00:00:00 Gemini-1.5-Flash-8B 75 $0.07

Performance lower bound: 50%
2023-04-01 00:00:00 GPT-4 86 $37.50
2023-12-01 00:00:00 GPT-3.5 Turbo 68 $0.75
2024-04-01 00:00:00 Claude-3-Haiku 71 $0.50
2024-05-01 00:00:00 Llama-3-Instruct-8B 64 $0.15
2024-06-01 00:00:00 Gemini-1.5-Flash-2024-05 79 $0.13
2024-08-01 00:00:00 Llama-3.1-Instruct-8B 71 $0.10
2024-10-01 00:00:00 Llama-3.2-Instruct-3B 64 $0.08
2024-11-01 00:00:00 Gemini-1.5-Flash-8B 75 $0.07

Performance lower bound: 60%
2023-04-01 00:00:00 GPT-4 86 $37.50
2023-12-01 00:00:00 GPT-3.5 Turbo 68 $0.75
2024-04-01 00:00:00 Claude-3-Haiku 71 $0.50
2024-05-01 00:00:00 Llama

In [14]:
# Summarise, per benchmark, the yearly cost-reduction factors fitted across
# the performance lower bounds.
#
# reindex guarantees the three columns exist even when no fit was recorded at
# all: a frame built from an empty results list has no columns, and the
# column lookups below would otherwise raise KeyError.
fit_results = cheapest_model_results_df.reindex(
    columns=['bench', 'performance_lower_bound', 'cost_reduction_factor_per_year']
)

summary_data = []
for bench in benchmarks:
    bench_results = fit_results[fit_results['bench'] == bench]
    # Performance lower bounds that produced a fit for this benchmark.
    perf_bounds = sorted(int(bound) for bound in set(bench_results['performance_lower_bound']))

    cost_factors = bench_results['cost_reduction_factor_per_year'].dropna()
    if len(cost_factors) > 0:
        # Geometric mean = exp(mean(log(x))), rounded to a whole factor.
        geomean = int(round(np.exp(np.mean(np.log(cost_factors)))))
        # Plain Python ints so the list is written as e.g. [25, 96] whatever
        # the installed NumPy's scalar repr looks like.
        factor_range = [int(cost_factors.min()), int(cost_factors.max())]
    else:
        # No fit for this benchmark: keep the row but mark the mean as missing.
        # (Calling round() on NaN, as before, raises ValueError.)
        geomean = np.nan
        factor_range = []

    summary_data.append({
        'bench': bench,
        'performance_lower_bounds': perf_bounds,
        'cost_reduction_factor_per_year_geomean': geomean,
        'cost_reduction_factor_per_year_range': factor_range
    })

cheapest_model_summary_df = pd.DataFrame(summary_data)
cheapest_model_summary_df.to_csv(results_dir + 'aa_cheapest_models_run/cheapest_model_summary.csv', index=False)

In [15]:
# Pooled regression over all models: log10(price) explained by release date
# and by the score on one benchmark.
bench = 'MMLU'

# The regressors are stored on aa_df itself; the formula refers to them by name.
usd_per_mtok = aa_df['USD per 1M Tokens']
aa_df['log_price'] = np.log10(usd_per_mtok)
aa_df['date'] = [pd.Timestamp(released).toordinal() for released in aa_df['Release Date']]
aa_df['bench'] = aa_df[bench]

model = smf.ols('log_price ~ date + bench', data=aa_df).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:              log_price   R-squared:                       0.542
Model:                            OLS   Adj. R-squared:                  0.514
Method:                 Least Squares   F-statistic:                     18.96
Date:                Tue, 28 Jan 2025   Prob (F-statistic):           3.71e-06
Time:                        15:50:08   Log-Likelihood:                -27.260
No. Observations:                  35   AIC:                             60.52
Df Residuals:                      32   BIC:                             65.19
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   2282.2475    534.586      4.269      0.0

In [16]:
# Coefficient estimates first, then their confidence bounds (columns 0 and 1).
for fitted_quantity in (model.params, model.conf_int()):
    print(fitted_quantity)


Intercept    2282.247526
date           -0.003092
bench           0.035833
dtype: float64
                     0            1
Intercept  1193.331122  3371.163930
date         -0.004566    -0.001618
bench         0.022982     0.048685


In [17]:
# The 'date' coefficient is a log10 price change per day; scale it to a year
# (365 days) and express it as a rounded "N times cheaper per year" factor.
date_param = model.params['date'] * 365
annual_factor = int(round(10 ** (-date_param)))
headline = f'It costs {annual_factor}x less each year to keep {bench} performance fixed.'
print(headline)

It costs 13x less each year to keep MMLU performance fixed.


In [18]:
# Lower confidence bound (column 0) of the daily 'date' coefficient.
model.conf_int().loc['date', 0]


-0.004565847685129619

In [19]:
def _annual_factor(log10_slope_per_year):
    """Turn a yearly log10 price slope into a rounded 'N times cheaper' factor."""
    return int(round(10**(-log10_slope_per_year)))


results = []
for bench in benchmarks:
    print(bench)

    # Regress log10 price on date and score, using only the models that report
    # this benchmark. The 'log_price' and 'date' columns are already on aa_df.
    df = aa_df.dropna(subset=[bench]).copy()
    df['bench'] = df[bench]
    model = smf.ols('log_price ~ date + bench', data=df).fit()

    n_obs = len(df)
    print(f'Number of observations: {n_obs}')
    print(f'R-squared: {model.rsquared:.2f}')

    # Daily slope and its confidence bounds, scaled to a year (365 days).
    slope_ci = model.conf_int().loc['date']
    slope_per_year = model.params['date'] * 365
    slope_ci_low = slope_ci[0] * 365
    slope_ci_high = slope_ci[1] * 365

    # A more negative slope means a larger reduction factor, so the lower slope
    # bound gives the upper factor bound and vice versa.
    annual_factor = _annual_factor(slope_per_year)
    annual_factor_ci_high = _annual_factor(slope_ci_low)
    annual_factor_ci_low = _annual_factor(slope_ci_high)

    min_date = pd.Timestamp.fromordinal(df["date"].min()).date()
    max_date = pd.Timestamp.fromordinal(df["date"].max()).date()
    date_range = max_date - min_date

    results.append({
        'bench': bench,
        'sample_size': n_obs,
        'start_date': min_date,
        'end_date': max_date,
        'cost_reduction_factor_per_year_mean': annual_factor,
        'cost_reduction_factor_per_year_ci': [annual_factor_ci_low, annual_factor_ci_high],
        'r_squared': round(model.rsquared, 2),
    })

    print(f'The cost to achieve a fixed level of {bench} performance fell at a rate of')
    print(f'{annual_factor}x [{annual_factor_ci_low}, {annual_factor_ci_high}] per year')
    print(f'over {date_range.days / 365:.1f} years ({min_date} to {max_date})')
    print()

full_regression_results_df = pd.DataFrame(results)
full_regression_results_df.to_csv(results_dir + 'aa_cheapest_models_run/full_regression_results.csv', index=False)

MMLU
Number of observations: 35
R-squared: 0.54
The cost to achieve a fixed level of MMLU performance fell at a rate of
13x [4, 46] per year
over 1.7 years (2023-03-01 to 2024-11-01)

GPQA Diamond
Number of observations: 35
R-squared: 0.65
The cost to achieve a fixed level of GPQA Diamond performance fell at a rate of
17x [6, 51] per year
over 1.7 years (2023-03-01 to 2024-11-01)

MATH-500
Number of observations: 33
R-squared: 0.47
The cost to achieve a fixed level of MATH-500 performance fell at a rate of
53x [9, 295] per year
over 1.7 years (2023-03-01 to 2024-11-01)

HumanEval
Number of observations: 31
R-squared: 0.49
The cost to achieve a fixed level of HumanEval performance fell at a rate of
16x [4, 68] per year
over 1.7 years (2023-03-01 to 2024-11-01)



In [20]:
def _first_row_for(frame, bench_name):
    """Return the first row of `frame` whose 'bench' column equals `bench_name`."""
    return frame.loc[frame['bench'] == bench_name].iloc[0]


# Side-by-side printout per benchmark: frontier-based trend (geometric mean and
# range over thresholds) versus pooled-regression trend (estimate and interval).
for bench in benchmarks:
    print(bench)
    frontier_row = _first_row_for(cheapest_model_summary_df, bench)
    regression_row = _first_row_for(full_regression_results_df, bench)
    print(
        'Cheapest model trend:',
        frontier_row['cost_reduction_factor_per_year_geomean'],
        frontier_row['cost_reduction_factor_per_year_range'],
    )
    print(
        'Full regression trend:',
        regression_row['cost_reduction_factor_per_year_mean'],
        regression_row['cost_reduction_factor_per_year_ci'],
    )
    print()

MMLU
Cheapest model trend: 45 [25, 96]
Full regression trend: 13 [4, 46]

GPQA Diamond
Cheapest model trend: 95 [27, 331]
Full regression trend: 17 [6, 51]

MATH-500
Cheapest model trend: 148 [13, 390]
Full regression trend: 53 [9, 295]

HumanEval
Cheapest model trend: 50 [10, 299]
Full regression trend: 16 [4, 68]

