In [1]:
import pandas as pd
import plotly.graph_objects as go

In [2]:
def save_plot(fig, folder, filename, extensions=('png', 'svg', 'pdf', 'html'), scale=2):
    """Write `fig` into `folder` as `filename`, once per requested format.

    Args:
        fig: Figure object exposing `write_image(path, scale=...)` and
            `write_html(path)` (this notebook passes plotly figures).
        folder: Output directory. A trailing path separator is optional.
        filename: Base file name, without extension.
        extensions: Collection of formats to write. Only 'png', 'svg', 'pdf'
            and 'html' are recognised; any other entry is ignored.
        scale: Forwarded as `scale` to `write_image` for the image formats.
    """
    # Function-scope import so the helper does not rely on other cells.
    import os

    # os.path.join adds a separator only when `folder` does not already end
    # with one, so both 'out' and 'out/' resolve to 'out/<filename>'.
    base_path = os.path.join(folder, filename)

    # Fixed iteration order: files are always written png, svg, pdf, html,
    # regardless of the order of `extensions`.
    for image_ext in ('png', 'svg', 'pdf'):
        if image_ext in extensions:
            fig.write_image(f'{base_path}.{image_ext}', scale=scale)
    if 'html' in extensions:
        fig.write_html(f'{base_path}.html')

In [3]:
# Read the three result tables in one pass: the lowest-price models, the other
# qualifying models, and the per-(bench, threshold_model) trend results.
all_cheapest_models_df, all_other_models_df, results_df = map(pd.read_csv, (
    'results/2025-02-28/lowest_price_models_above_previous_frontier/lowest_price_models_data.csv',
    'results/2025-02-28/lowest_price_models_above_previous_frontier/other_models_data.csv',
    'results/2025-02-28/lowest_price_models_above_previous_frontier/lowest_price_models_results.csv',
))

In [4]:
# Benchmark / threshold-model pair shown in the single-trend figure below.
bench = 'GPQA Diamond'
threshold_model = 'GPT-4'

# Restrict each table to the rows belonging to that pair.
cheapest_models_df = all_cheapest_models_df[
    (all_cheapest_models_df['bench'] == bench) &
    (all_cheapest_models_df['threshold_model'] == threshold_model)
]
other_models_df = all_other_models_df[
    (all_other_models_df['bench'] == bench) &
    (all_other_models_df['threshold_model'] == threshold_model)
]
result = results_df[
    (results_df['bench'] == bench) &
    (results_df['threshold_model'] == threshold_model)
]
# .iloc[0] raises IndexError if the pair above matches no rows.
annual_factor = result['price_reduction_factor_per_year'].iloc[0]
predicted_log_prices = cheapest_models_df['predicted_log_price']

# performance_range is parsed from its text form (presumably "[low, high]",
# as displayed for results_df -- confirm for this table). Parsing explicitly
# instead of calling eval() avoids executing file content as code and also
# accepts an `inf` bound, which eval() rejects with NameError.
# Both bounds are floats after parsing (e.g. 100 becomes 100.0).
performance_range = [
    float(bound)
    for bound in str(cheapest_models_df['performance_range'].iloc[0]).strip().strip('[]').split(',')
]
performance_lower_bound = performance_range[0]
performance_upper_bound = performance_range[1]

fig = go.Figure()

# Traces are added in the same order as before (markers, trendline, other
# models); zorder puts the labelled markers on top and the grey points at the back.
# NOTE(review): the trendline column is named 'predicted_log_price' but shares
# the log-scaled y axis with raw USD prices -- confirm its values are in USD.
fig.add_traces([
    go.Scatter(
        x=cheapest_models_df['Release Date'],
        y=cheapest_models_df['USD per 1M Tokens'],
        mode='markers+text',
        name=f'Lowest-price models that are {threshold_model} or better on {bench}',
        text=cheapest_models_df['Model Name'],
        textposition='bottom left',
        marker=dict(color='blue'),
        zorder=3,
    ),
    go.Scatter(
        x=cheapest_models_df['Release Date'],
        y=predicted_log_prices,
        mode='lines',
        name=f'Trendline: {annual_factor}x decrease per year',
        line=dict(color='blue', dash='dash'),
        zorder=2,
    ),
    go.Scatter(
        x=other_models_df['Release Date'],
        y=other_models_df['USD per 1M Tokens'],
        mode='markers',
        name=f'Other models that are {threshold_model} or better on {bench}',
        text=other_models_df['Model Name'],
        textposition='bottom left',
        marker=dict(color='lightgrey'),
        zorder=1,
    ),
])
fig.update_traces(textposition='bottom left')

# Title, axes, size, legend placement and theme, applied in a single call.
fig.update_layout(
    title=f'Price per token to answer {bench} questions as well as {threshold_model} has fallen by {annual_factor}x per year',
    xaxis_title='Month',
    yaxis_type='log',
    yaxis_title='Usage price in USD per million tokens',
    width=1000,
    height=600,
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="right",
        x=0.99,
        bordercolor="lightgrey",
        borderwidth=1
    ),
    template='plotly_white',
)

fig.show()

In [5]:
# Row of results_df with the smallest yearly price-reduction factor.
# idxmin() returns an index *label*, so the row is fetched with .loc;
# positional .iloc only agrees while results_df keeps a default 0..n-1 index.
min_trend_idx = results_df['price_reduction_factor_per_year'].idxmin()
min_trend_result = results_df.loc[min_trend_idx]
min_trend_bench = min_trend_result['bench']
min_trend_threshold_model = min_trend_result['threshold_model']
min_trend_annual_factor = min_trend_result['price_reduction_factor_per_year']
min_trend_result

bench                                           MMLU
threshold_model                          GPT-4 Turbo
performance_range                        [87.0, 100]
sample_size                                        4
start_date                                2023-11-01
end_date                                  2024-07-01
price_reduction_factor_per_year                    7
price_reduction_factor_per_year_90_ci        [2, 27]
r_squared                                       0.89
Name: 2, dtype: object

In [6]:
# Row of results_df with the largest yearly price-reduction factor.
# idxmax() returns an index *label*, so the row is fetched with .loc;
# positional .iloc only agrees while results_df keeps a default 0..n-1 index.
max_trend_idx = results_df['price_reduction_factor_per_year'].idxmax()
max_trend_result = results_df.loc[max_trend_idx]
max_trend_bench = max_trend_result['bench']
max_trend_threshold_model = max_trend_result['threshold_model']
max_trend_annual_factor = max_trend_result['price_reduction_factor_per_year']
max_trend_result


bench                                    LMSys Chatbot Arena ELO
threshold_model                                      GPT-4 Turbo
performance_range                                  [1256.0, inf]
sample_size                                                    4
start_date                                            2023-11-01
end_date                                              2024-09-01
price_reduction_factor_per_year                              331
price_reduction_factor_per_year_90_ci                 [17, 6288]
r_squared                                                   0.94
Name: 16, dtype: object

In [7]:
# Row of results_df whose yearly price-reduction factor is closest to the
# median of that column (the first such row if several are equally close).
# idxmin() returns an index *label*, so the row is fetched with .loc;
# positional .iloc only agrees while results_df keeps a default 0..n-1 index.
median_trend_idx = (results_df['price_reduction_factor_per_year'] - results_df['price_reduction_factor_per_year'].median()).abs().idxmin()
median_trend_result = results_df.loc[median_trend_idx]
median_trend_bench = median_trend_result['bench']
median_trend_threshold_model = median_trend_result['threshold_model']
median_trend_annual_factor = median_trend_result['price_reduction_factor_per_year']
median_trend_result

bench                                    GPQA Diamond
threshold_model                           GPT-4 Turbo
performance_range                         [50.0, 100]
sample_size                                         7
start_date                                 2023-11-01
end_date                                   2025-01-01
price_reduction_factor_per_year                    45
price_reduction_factor_per_year_90_ci       [10, 200]
r_squared                                        0.84
Name: 4, dtype: object

In [8]:
# Distinct benchmarks / threshold models present in the results table
# (not referenced again in this cell).
selected_benches = results_df['bench'].unique()
selected_threshold_models = results_df['threshold_model'].unique()

# Trend to highlight in red; every other trend is drawn in light grey.
selected_bench = max_trend_bench
selected_threshold_model = max_trend_threshold_model
selected_annual_factor = max_trend_annual_factor

fig = go.Figure()

# All grey trends share one legend entry, shown on the first grey trace only.
# An explicit flag is used rather than testing the row label against 0, which
# would drop the entry whenever the first row is the highlighted one or the
# index does not start at 0.
other_legend_shown = False

for _row_label, row in results_df.iterrows():
    bench = row['bench']
    threshold_model = row['threshold_model']
    print(f'Processing {bench} and {threshold_model}')
    cheapest_models_df = all_cheapest_models_df[
        (all_cheapest_models_df['bench'] == bench) &
        (all_cheapest_models_df['threshold_model'] == threshold_model)
    ]
    annual_factor = row['price_reduction_factor_per_year']
    predicted_log_prices = cheapest_models_df['predicted_log_price']

    is_selected = bench == selected_bench and threshold_model == selected_threshold_model

    if is_selected:
        color = 'red'
        showlegend = True
        zorder = 1
        legendgroup = f'{selected_bench} {selected_threshold_model}'
        legendrank = 0
        trace_name = f'{threshold_model} level performance on {bench}'
    else:
        color = 'lightgrey'
        showlegend = not other_legend_shown
        other_legend_shown = True
        zorder = 0
        legendgroup = "Other benchmarks and performance levels"
        legendrank = 1
        trace_name = "Other benchmarks and performance levels"

    # Trendline for this (bench, threshold_model) pair.
    fig.add_trace(go.Scatter(
        x=cheapest_models_df['Release Date'],
        y=predicted_log_prices,
        mode='lines',
        name=trace_name,
        line=dict(color=color),
        showlegend=showlegend,
        zorder=zorder,
        legendgroup=legendgroup,
        legendrank=legendrank,
    ))
    if is_selected:
        # Individual model prices are drawn only for the highlighted trend,
        # one zorder level above its line.
        fig.add_trace(go.Scatter(
            x=cheapest_models_df['Release Date'],
            y=cheapest_models_df['USD per 1M Tokens'],
            mode='markers',
            name=f'Lowest-price models that are {threshold_model} or better on {bench}',
            text=cheapest_models_df['Model Name'],
            textposition='bottom left',
            marker=dict(color='red'),
            showlegend=False,
            zorder=zorder+1,
            legendgroup=legendgroup,
            legendrank=legendrank,
        ))

# The y-axis title is drawn as an annotation rather than a regular axis title.
# In paper coordinates x=-0.037 sits just left of the plotting area and
# y=1.07 sits above its top edge.
y_axis_title_note = dict(
    x=-0.037,
    y=1.07,
    xref='paper',
    yref='paper',
    text='Usage price in USD per million tokens',
    showarrow=False,
    font=dict(size=12),
)

# Red label for the highlighted trend. Its position is hand-tuned in paper
# coordinates; the arrow settings are present but showarrow is False.
highlighted_trend_note = dict(
    x=0.7,
    y=0.65,
    xref='paper',
    yref='paper',
    text=f'{selected_annual_factor}x decrease per year',
    showarrow=False,
    arrowhead=2,
    arrowsize=1,
    arrowwidth=1,
    arrowcolor='black',
    ax=-30,
    ay=30,
    font=dict(size=12, color='red'),
    bgcolor='rgba(255, 255, 255, 0.7)',
)

fig.update_traces(textposition='bottom left')

# Title, axes, annotations, size, legend placement and theme in a single call.
fig.update_layout(
    title=f'The usage price of LLMs at a given performance level has fallen by {min_trend_annual_factor}–{round(max_trend_annual_factor, -2)}x per year',
    xaxis_title='Release date',
    yaxis=dict(
        type='log',
        dtick='D10',  # intended to remove the minor y tick labels
    ),
    annotations=[y_axis_title_note, highlighted_trend_note],
    width=1000,
    height=600,
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="right",
        x=0.99,
        bordercolor="lightgrey",
        borderwidth=1
    ),
    margin=dict(t=120),
    template='plotly_white',
)

save_plot(fig, 'results/2025-02-28/lowest_price_models_above_previous_frontier/', 'lowest_price_trends_all_highlighting_max_trend')

fig.show()

Processing MMLU and GPT-3
Processing MMLU and GPT-3.5
Processing MMLU and GPT-4 Turbo
Processing GPQA Diamond and GPT-4
Processing GPQA Diamond and GPT-4 Turbo
Processing GPQA Diamond and GPT-4o-2024-05
Processing MATH-500 and GPT-4
Processing MATH-500 and GPT-3.5 Turbo
Processing MATH-500 and GPT-4 Turbo
Processing MATH-500 and GPT-4o-2024-05
Processing MATH 5 and GPT-4
Processing MATH 5 and GPT-4 Turbo
Processing MATH 5 and GPT-4o-2024-05
Processing HumanEval and GPT-4
Processing HumanEval and GPT-3.5 Turbo
Processing LMSys Chatbot Arena ELO and GPT-4
Processing LMSys Chatbot Arena ELO and GPT-4 Turbo
Processing LMSys Chatbot Arena ELO and GPT-4o-2024-05
