In [2]:
import pandas as pd
import plotly.graph_objects as go

In [3]:
def save_plot(fig, folder, filename, extensions=('png', 'svg', 'pdf', 'html'), scale=2):
    """Write ``fig`` to ``folder + filename + '.<ext>'`` for every requested format.

    Parameters
    ----------
    fig
        Object exposing ``write_image(path, scale=...)`` and ``write_html(path)``.
    folder : str
        Path prefix. It is concatenated with ``filename`` as-is, so it has to
        end with a path separator.
    filename : str
        File name without extension.
    extensions : container of str
        Formats to write. Only 'png', 'svg', 'pdf' and 'html' are recognised;
        any other entry is silently ignored. The default is an immutable tuple
        so it cannot be mutated between calls.
    scale : number
        Forwarded to ``fig.write_image`` for the image formats; not used for HTML.
    """
    base_path = folder + filename
    # The static image formats share one writer and all honour ``scale``.
    for image_ext in ('png', 'svg', 'pdf'):
        if image_ext in extensions:
            fig.write_image(base_path + '.' + image_ext, scale=scale)
    if 'html' in extensions:
        fig.write_html(base_path + '.html')

In [4]:
# Directory prefix (with trailing slash) that the CSV inputs are read from and
# the figures are written to.
results_dir = (
    'results/2025-03-06_exclude_reasoning_models/'
    'lowest_price_models_above_previous_frontier/'
)

In [5]:
# Read the three CSV files of the run, in order: the lowest-price models table,
# the other-models table, and the per-trend results table.
all_cheapest_models_df, all_other_models_df, results_df = (
    pd.read_csv(f'{results_dir}{csv_name}')
    for csv_name in (
        'lowest_price_models_data.csv',
        'other_models_data.csv',
        'lowest_price_models_results.csv',
    )
)

In [7]:
# A model observation is identified by its (Model Name, Release Date) pair.
# Distinct pairs among the lowest-price models:
unique_model_releases = all_cheapest_models_df.loc[:, ['Model Name', 'Release Date']].drop_duplicates()
print(f"Number of unique observations in cheapest models: {unique_model_releases.shape[0]}")

# Same count for the remaining models, for comparison:
unique_other_models = all_other_models_df.loc[:, ['Model Name', 'Release Date']].drop_duplicates()
print(f"Number of unique observations in other models: {unique_other_models.shape[0]}")


Number of unique observations in cheapest models: 36
Number of unique observations in other models: 51


In [9]:
# Number of distinct model names among the lowest-price models (release date ignored).
all_cheapest_models_df.drop_duplicates(subset=['Model Name']).shape[0]

34

## Plot of one trend with scatter data

In [5]:
# Benchmark / threshold-model pair whose price trend is plotted in this cell.
bench = 'GPQA Diamond'
threshold_model = 'GPT-4-0314'

# Restrict each table to the rows belonging to that pair.
cheapest_models_df = all_cheapest_models_df[
    (all_cheapest_models_df['bench'] == bench) &
    (all_cheapest_models_df['threshold_model'] == threshold_model)
]
other_models_df = all_other_models_df[
    (all_other_models_df['bench'] == bench) &
    (all_other_models_df['threshold_model'] == threshold_model)
]
result = results_df[
    (results_df['bench'] == bench) &
    (results_df['threshold_model'] == threshold_model)
]

# Fail with a readable message instead of an IndexError from the .iloc[0] calls below.
if result.empty or cheapest_models_df.empty:
    raise ValueError(f'No trend data for bench={bench!r}, threshold_model={threshold_model!r}')

annual_factor = result['price_reduction_factor_per_year'].iloc[0]
# NOTE(review): this column is named 'predicted_log_price' but is drawn on the
# same log-scaled axis as the USD prices — confirm it holds prices, not log-prices.
predicted_log_prices = cheapest_models_df['predicted_log_price']

# 'performance_range' holds a list written as text (the results table shows
# values such as "[50.0, 100]" and "[1285.0, inf]"). Parse it with float()
# rather than eval(): nothing read from the CSV gets executed, and 'inf'
# (which eval() rejects with a NameError) is accepted.
performance_range = [
    float(bound)
    for bound in str(cheapest_models_df['performance_range'].iloc[0]).strip('[] ').split(',')
]
performance_lower_bound = performance_range[0]
performance_upper_bound = performance_range[1]

fig = go.Figure()

# Lowest-price models: labelled markers drawn on top (highest zorder).
fig.add_trace(go.Scatter(
    x=cheapest_models_df['Release Date'],
    y=cheapest_models_df['USD per 1M Tokens'],
    mode='markers+text',
    name=f'Lowest-price models that are {threshold_model} or better on {bench}',
    text=cheapest_models_df['Model Name'],
    textposition='bottom left',
    marker=dict(color='blue'),
    zorder=3,
))
# Fitted trend through the lowest-price models.
fig.add_trace(go.Scatter(
    x=cheapest_models_df['Release Date'],
    y=predicted_log_prices,
    mode='lines',
    name=f'Trendline: {annual_factor}x decrease per year',
    line=dict(color='blue', dash='dash'),
    zorder=2,
))
# All other qualifying models as grey background markers (names available on hover).
fig.add_trace(go.Scatter(
    x=other_models_df['Release Date'],
    y=other_models_df['USD per 1M Tokens'],
    mode='markers',
    name=f'Other models that are {threshold_model} or better on {bench}',
    text=other_models_df['Model Name'],
    textposition='bottom left',
    marker=dict(color='lightgrey'),
    zorder=1,
))
fig.update_traces(textposition='bottom left')

# One layout call instead of several: title, axes, size, legend box, theme.
fig.update_layout(
    title=f'Price per token to answer {bench} questions as well as {threshold_model} has fallen by {annual_factor}x per year',
    yaxis_type='log',
    xaxis_title='Month',
    yaxis_title='Usage price in USD per million tokens',
    width=1000,
    height=600,
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="right",
        x=0.99,
        bordercolor="lightgrey",
        borderwidth=1
    ),
    template='plotly_white',
)

fig.show()

## Identify min, median, max trends

In [6]:
# Trend with the smallest yearly price-reduction factor.
# idxmin() returns an index *label*, so the row is fetched with label-based .loc;
# positional .iloc would only agree with it while results_df keeps its default
# 0..n-1 index.
min_trend_idx = results_df['price_reduction_factor_per_year'].idxmin()
min_trend_result = results_df.loc[min_trend_idx]
min_trend_bench = min_trend_result['bench']
min_trend_threshold_model = min_trend_result['threshold_model']
min_trend_annual_factor = min_trend_result['price_reduction_factor_per_year']
min_trend_result

bench                                             MMLU
threshold_model                          GPT-3.5 Turbo
performance_range                          [68.0, 100]
sample_size                                          6
start_date                                  2023-03-06
end_date                                    2024-10-03
price_reduction_factor_per_year                      9
price_reduction_factor_per_year_90_ci          [5, 18]
r_squared                                         0.93
Name: 2, dtype: object

In [7]:
# Trend with the largest yearly price-reduction factor.
# idxmax() returns an index *label*, so the row is fetched with label-based .loc;
# positional .iloc would only agree with it while results_df keeps its default
# 0..n-1 index.
max_trend_idx = results_df['price_reduction_factor_per_year'].idxmax()
max_trend_result = results_df.loc[max_trend_idx]
max_trend_bench = max_trend_result['bench']
max_trend_threshold_model = max_trend_result['threshold_model']
max_trend_annual_factor = max_trend_result['price_reduction_factor_per_year']
max_trend_result


bench                                      GPQA Diamond
threshold_model                          GPT-4o-2024-05
performance_range                           [53.0, 100]
sample_size                                           4
start_date                                   2024-05-13
end_date                                     2024-12-13
price_reduction_factor_per_year                     851
price_reduction_factor_per_year_90_ci       [6, 117290]
r_squared                                          0.89
Name: 7, dtype: object

In [8]:
# Trend whose yearly price-reduction factor lies closest to the median factor.
annual_factors = results_df['price_reduction_factor_per_year']
# idxmin() returns an index *label*, so the row is fetched with label-based .loc
# (positional .iloc would only agree with it for a default 0..n-1 index).
median_trend_idx = (annual_factors - annual_factors.median()).abs().idxmin()
median_trend_result = results_df.loc[median_trend_idx]
median_trend_bench = median_trend_result['bench']
median_trend_threshold_model = median_trend_result['threshold_model']
median_trend_annual_factor = median_trend_result['price_reduction_factor_per_year']
median_trend_result

bench                                    GPQA Diamond
threshold_model                           GPT-4 Turbo
performance_range                         [50.0, 100]
sample_size                                         6
start_date                                 2023-11-06
end_date                                   2024-12-13
price_reduction_factor_per_year                    46
price_reduction_factor_per_year_90_ci        [4, 565]
r_squared                                        0.73
Name: 6, dtype: object

## Plot of one trend with other trends greyed out

In [9]:
# Not read anywhere in this cell; kept so the notebook namespace is unchanged.
selected_benches = results_df['bench'].unique()
selected_threshold_models = results_df['threshold_model'].unique()

# The single trend drawn in red; every other trend is drawn in light grey.
selected_bench = 'GPQA Diamond'
selected_threshold_model = 'GPT-4-0314'
selected_annual_factor = results_df[
    (results_df['bench'] == selected_bench) &
    (results_df['threshold_model'] == selected_threshold_model)
]['price_reduction_factor_per_year'].iloc[0]

fig = go.Figure()

# The grey trends share one legend entry, attached to the first grey trace that
# is actually drawn. (Keying this on index label 0 would lose the entry whenever
# that row is the highlighted trend or the index does not start at 0.)
other_trends_in_legend = False

for i, row in results_df.iterrows():
    bench = row['bench']
    threshold_model = row['threshold_model']
    print(f'Processing {bench} and {threshold_model}')
    cheapest_models_df = all_cheapest_models_df[
        (all_cheapest_models_df['bench'] == bench) &
        (all_cheapest_models_df['threshold_model'] == threshold_model)
    ]
    predicted_log_prices = cheapest_models_df['predicted_log_price']

    is_selected = bench == selected_bench and threshold_model == selected_threshold_model

    if is_selected:
        color = 'red'
        showlegend = True
        zorder = 1
        legendgroup = f'{selected_bench} {selected_threshold_model}'
        legendrank = 0
        trace_name = f'{threshold_model} level performance on {bench}'
    else:
        color = 'rgb(230, 230, 230)'
        showlegend = not other_trends_in_legend
        other_trends_in_legend = True
        zorder = 0
        legendgroup = "Other benchmarks and performance levels"
        legendrank = 1
        trace_name = "Other benchmarks and performance levels"

    # Trendline of this (benchmark, threshold model) pair.
    fig.add_trace(go.Scatter(
        x=cheapest_models_df['Release Date'],
        y=predicted_log_prices,
        mode='lines',
        name=trace_name,
        line=dict(color=color),
        showlegend=showlegend,
        zorder=zorder,
        legendgroup=legendgroup,
        legendrank=legendrank,
    ))
    if is_selected:
        # Underlying data points of the highlighted trend, drawn above its line.
        fig.add_trace(go.Scatter(
            x=cheapest_models_df['Release Date'],
            y=cheapest_models_df['USD per 1M Tokens'],
            mode='markers',
            name=f'Lowest-price models that are {threshold_model} or better on {bench}',
            text=cheapest_models_df['Model Name'],
            textposition='bottom left',
            marker=dict(color='red'),
            showlegend=False,
            zorder=zorder+1,
            legendgroup=legendgroup,
            legendrank=legendrank,
        ))

# Remove minor y labels
fig.update_layout(
    yaxis=dict(
        dtick='D10',
        # range=[-1, 2],
    )
)
# min_trend_annual_factor / max_trend_annual_factor come from the
# "Identify min, median, max trends" cells, which must have been run first.
fig.update_layout(
    title=f'The usage price of LLMs at a given performance level has fallen by {min_trend_annual_factor}–{round(max_trend_annual_factor, -2)}x per year'
)
fig.update_traces(textposition='bottom left')
fig.update_layout(yaxis_type='log')
fig.update_layout(xaxis_title='Release date')
# The y-axis title is rendered as a horizontal annotation (below) instead of
# the rotated default:
# fig.update_layout(yaxis_title='Usage price in USD per million tokens')
fig.update_layout(
    annotations=[
        dict(
            x=-0.037,  # Slightly left of the plot area (paper coordinates)
            y=1.07,  # Above the plot
            xref='paper',
            yref='paper',
            text='Usage price in USD per million tokens',
            showarrow=False,
            font=dict(size=12),
        ),
        # Label for the highlighted trend. The arrow settings are kept but have
        # no visible effect while showarrow is False.
        dict(
            x=0.7,  # Position on x-axis (adjust as needed)
            y=0.65,  # Position on y-axis (adjust as needed)
            xref='paper',
            yref='paper',
            text=f'{selected_annual_factor}x decrease per year',
            showarrow=False,
            arrowhead=2,
            arrowsize=1,
            arrowwidth=1,
            arrowcolor='black',
            ax=-30,
            ay=30,
            font=dict(size=12, color='red'),
            bgcolor='rgba(255, 255, 255, 0.7)',
            # bordercolor='lightgrey',
            # borderwidth=1,
            # borderpad=0,
        )
    ],
)
fig.update_layout(
    width=1000,
    height=600,
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="right",
        x=0.99,
        bordercolor="lightgrey",
        borderwidth=1
    ),
    margin=dict(t=120),
    template='plotly_white',
)

# NOTE(review): the file name says "max_trend", but the highlighted trend is
# selected_bench / selected_threshold_model above — confirm the intended name.
save_plot(fig, results_dir, 'lowest_price_trends_all_highlighting_max_trend')

fig.show()

Processing MMLU and GPT-3
Processing MMLU and GPT-3.5
Processing MMLU and GPT-3.5 Turbo
Processing MMLU and GPT-4-0314
Processing MMLU and GPT-4 Turbo
Processing GPQA Diamond and GPT-4-0314
Processing GPQA Diamond and GPT-4 Turbo
Processing GPQA Diamond and GPT-4o-2024-05
Processing GPQA Diamond and Claude-3.5-Sonnet-2024-06
Processing MATH-500 and GPT-3.5-Turbo-2023-06
Processing MATH-500 and GPT-4 Turbo
Processing MATH-500 and GPT-4o-2024-05
Processing MATH 5 and GPT-4-0613
Processing MATH 5 and GPT-4 Turbo
Processing MATH 5 and GPT-4o-2024-05
Processing HumanEval and GPT-4-0314
Processing HumanEval and GPT-3.5-Turbo-2023-06
Processing LMSys Chatbot Arena ELO and GPT-3.5 Turbo
Processing LMSys Chatbot Arena ELO and GPT-4-0314
Processing LMSys Chatbot Arena ELO and GPT-4 Turbo
Processing LMSys Chatbot Arena ELO and GPT-4o-2024-05


## Plot of trends in the past year

In [10]:
# Keep the trends whose start_date is less than 365 days before today,
# comparing whole days via proleptic-Gregorian ordinals.
# NOTE(review): the cutoff is taken from the wall clock, so the selected rows
# change with the day the notebook is run — consider pinning the date.
cutoff_ordinal = pd.Timestamp.now().toordinal() - 365
started_within_past_year = [
    pd.Timestamp(start_date).toordinal() > cutoff_ordinal
    for start_date in results_df['start_date']
]
min_date_df = results_df.copy().loc[started_within_past_year]
min_date_df

Unnamed: 0,bench,threshold_model,performance_range,sample_size,start_date,end_date,price_reduction_factor_per_year,price_reduction_factor_per_year_90_ci,r_squared
7,GPQA Diamond,GPT-4o-2024-05,"[53.0, 100]",4,2024-05-13,2024-12-13,851,"[6, 117290]",0.89
8,GPQA Diamond,Claude-3.5-Sonnet-2024-06,"[56.0, 100]",4,2024-06-20,2025-02-05,251,"[42, 1518]",0.98
11,MATH-500,GPT-4o-2024-05,"[79.0, 100]",4,2024-05-13,2024-12-13,689,"[0, 3966835]",0.71
14,MATH 5,GPT-4o-2024-05,"[48.0, 100]",4,2024-05-13,2024-12-13,689,"[0, 3966835]",0.71
20,LMSys Chatbot Arena ELO,GPT-4o-2024-05,"[1285.0, inf]",5,2024-05-13,2025-02-05,177,"[39, 800]",0.96


In [11]:
# (benchmark, threshold model) pairs of the trends that started in the past year.
list(zip(min_date_df['bench'], min_date_df['threshold_model']))

[('GPQA Diamond', 'GPT-4o-2024-05'),
 ('GPQA Diamond', 'Claude-3.5-Sonnet-2024-06'),
 ('MATH-500', 'GPT-4o-2024-05'),
 ('MATH 5', 'GPT-4o-2024-05'),
 ('LMSys Chatbot Arena ELO', 'GPT-4o-2024-05')]

In [12]:
# Not read anywhere in this cell; kept so the notebook namespace is unchanged.
selected_benches = results_df['bench'].unique()
selected_threshold_models = results_df['threshold_model'].unique()

# (benchmark, threshold model) pairs to highlight: the rows of min_date_df.
selected_trends = [(row['bench'], row['threshold_model']) for i, row in min_date_df.iterrows()]

# Reader-friendly benchmark descriptions used in the legend.
bench_aliases = {
    'MMLU': 'general knowledge',
    'GPQA Diamond': 'Ph.D. level science questions',
    'MATH-500': 'math',
    'MATH 5': 'math (advanced)',
    'HumanEval': 'coding',
    'LMSys Chatbot Arena ELO': 'chatbot competition',
}

fig = go.Figure()

selected_trend_idx = 0
# Highlight colours, assigned in results_df row order. The list is cycled, so
# having more highlighted trends than colours reuses colours instead of
# raising an IndexError.
colors = ['orange', 'magenta', 'turquoise', 'turquoise', 'blue']
# The grey trends share one legend entry, attached to the first grey trace that
# is actually drawn (rather than to whichever row carries index label 0).
other_trends_in_legend = False

for i, row in results_df.iterrows():
    bench = row['bench']
    threshold_model = row['threshold_model']
    cheapest_models_df = all_cheapest_models_df[
        (all_cheapest_models_df['bench'] == bench) &
        (all_cheapest_models_df['threshold_model'] == threshold_model)
    ]
    annual_factor = row['price_reduction_factor_per_year']
    predicted_log_prices = cheapest_models_df['predicted_log_price']

    is_in_selected_trends = (bench, threshold_model) in selected_trends

    if is_in_selected_trends:
        color = colors[selected_trend_idx % len(colors)]
        selected_trend_idx += 1
        showlegend = True
        zorder = 1
        legendgroup = 'Trends in the past year'
        # Fall back to the raw benchmark name when no alias is defined.
        trace_name = f'{threshold_model} level or better in {bench_aliases.get(bench, bench)}'
        legendrank = 0
        print(f'{color}: {annual_factor}x per year')
    else:
        color = 'rgb(230, 230, 230)'
        showlegend = not other_trends_in_legend
        other_trends_in_legend = True
        zorder = 0
        legendgroup = "Other benchmarks and performance levels"
        trace_name = "Other benchmarks and performance levels"
        legendrank = 1

    # Trendline of this (benchmark, threshold model) pair.
    fig.add_trace(go.Scatter(
        x=cheapest_models_df['Release Date'],
        y=predicted_log_prices,
        mode='lines',
        name=trace_name,
        line=dict(color=color),
        showlegend=showlegend,
        zorder=zorder,
        legendgroup=legendgroup,
        legendrank=legendrank,
    ))

# Remove minor y labels
fig.update_layout(
    yaxis=dict(
        dtick='D10',
    )
)
fig.update_traces(textposition='bottom left')
fig.update_layout(yaxis_type='log')
fig.update_layout(xaxis_title='Release date')
# The y-axis title is rendered as a horizontal annotation instead of the
# rotated default axis title.
fig.update_layout(
    annotations=[
        dict(
            x=-0.037,  # Slightly left of the plot area (paper coordinates)
            y=1.07,  # Above the plot
            xref='paper',
            yref='paper',
            text='Usage price in USD per million tokens',
            showarrow=False,
            font=dict(size=12),
        ),
    ],
)
fig.update_layout(
    width=1000,
    height=600,
    legend=dict(
        yanchor="top",
        y=0.4,
        xanchor="right",
        x=0.65,
        bordercolor="lightgrey",
        borderwidth=1
    ),
    margin=dict(t=60),
    template='plotly_white',
)

save_plot(fig, results_dir, 'lowest_price_trends_all_highlighting_recent_trends')

fig.show()

orange: 851x per year
magenta: 251x per year
turquoise: 689x per year
turquoise: 689x per year
blue: 177x per year
