In [1]:
import numpy as np
import os
import pandas as pd

In [None]:
def save_plot(fig, folder, filename, extensions=['png', 'svg', 'pdf', 'html'], scale=2):
    if 'png' in extensions:
        fig.write_image(folder + filename + '.png', scale=scale)
    if 'svg' in extensions:
        fig.write_image(folder + filename + '.svg', scale=scale)
    if 'pdf' in extensions:
        fig.write_image(folder + filename + '.pdf', scale=scale)
    if 'html' in extensions:
        fig.write_html(folder + filename + '.html')

In [2]:
results_dir = 'results/evaluation_log_analysis/2025-02-24/'
os.makedirs(results_dir, exist_ok=True)

In [3]:
# Saved in save_evaluation_data.ipynb
df = pd.read_csv('data/epoch_ai_eval_data.csv')

In [4]:
df

Unnamed: 0,Model,Task,epochs,total_samples,input_tokens,output_tokens,total_tokens
0,Yi-34B-Chat,MATH level 5,8,10592,2222376,7308455,9530831
1,qwen2p5-72b-instruct,MATH level 5,8,10592,2243320,9708977,11952297
2,gpt-4-0613,MATH level 5,8,10592,1982888,3620454,5603342
3,o1-mini-2024-09-12_high,MATH level 5,4,5296,988900,9914854,10903754
4,gemma-2-9b-it,MATH level 5,8,10592,1989896,4551564,6541460
...,...,...,...,...,...,...,...
130,open-mistral-7b,MATH level 5,8,10592,2150624,6429532,8580156
131,Qwen2-72B-Instruct,GPQA Diamond,16,3168,889664,1028460,1918124
132,claude-3-5-sonnet-20240620,MATH level 5,8,10592,2137040,4674647,6811687
133,claude-3-opus-20240229,MATH level 5,8,10592,2137040,5204915,7341955


In [5]:
len(df['Model'].unique())

73

In [6]:
df['total_tokens_per_trial'] = df['total_tokens'] / df['epochs']
df['total_tokens_per_question'] = df['total_tokens'] / df['total_samples']
df['output_tokens_per_question'] = df['output_tokens'] / df['total_samples']

In [7]:
# Create separate bar charts for each task
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Get unique tasks
tasks = df['Task'].unique()

# Create a figure for each task
for task in tasks:
    # Filter data for this task
    task_df = df[df['Task'] == task]
    
    # Sort by total tokens in descending order
    sorted_task_df = task_df.sort_values('output_tokens_per_question', ascending=False)
    
    # Create the bar chart for this task
    fig = px.bar(
        sorted_task_df,
        x='Model',
        y='output_tokens_per_question',
        title=f'Average of output tokens per question by model for {task}',
        labels={
            'Model': 'Model name',
            'output_tokens_per_question': 'Output tokens per question',
        },
    )
    
    # Update layout for better readability
    fig.update_layout(
        width=1200,
        height=600,
        xaxis_tickangle=-45,
        xaxis_title='Model',
        yaxis_title='Output Tokens',
        margin=dict(b=100)  # Add bottom margin for rotated labels
    )

    save_plot(fig, results_dir, f'output_tokens_per_question_by_model_for_{task}')
    
    fig.show()


In [8]:
sorted(df['Model'].unique())

['DeepSeek-V3',
 'Eurus-2-7B-PRIME',
 'Hermes-2-Theta-Llama-3-70B',
 'Llama-2-70b-chat-hf',
 'Llama-3.1-Tulu-3-70B-DPO',
 'Meta-Llama-3-70B-Instruct',
 'Meta-Llama-3-8B-Instruct',
 'Meta-Llama-3.1-405B-Instruct',
 'Meta-Llama-3.1-70B-Instruct',
 'Meta-Llama-3.1-8B-Instruct',
 'Mistral-7B-Instruct-v0.3',
 'Mixtral-8x7B-Instruct-v0.1',
 'Phi-3-medium-128k-instruct',
 'Qwen1.5-32B-Chat',
 'Qwen1.5-72B-Chat',
 'Qwen2-72B-Instruct',
 'Qwen2.5-32B-Instruct',
 'Qwen2.5-72B-Instruct',
 'WizardLM-2-8x22B',
 'Yi-1.5-34B-Chat',
 'Yi-34B-Chat',
 'claude-2.0',
 'claude-2.1',
 'claude-3-5-sonnet-20240620',
 'claude-3-5-sonnet-20241022',
 'claude-3-haiku-20240307',
 'claude-3-opus-20240229',
 'claude-3-sonnet-20240229',
 'dbrx-instruct',
 'deepseek-llm-67b-chat',
 'deepseek-r1',
 'deepseek-v3',
 'gemini-1.0-pro-001',
 'gemini-1.5-flash-001',
 'gemini-1.5-flash-002',
 'gemini-1.5-pro-001',
 'gemini-1.5-pro-002',
 'gemini-2.0-flash-001',
 'gemini-2.0-flash-thinking-exp-01-21',
 'gemini-2.0-pro-exp-02-0

In [9]:
reasoning_models = [
    'deepseek-r1',
    'o1-mini-2024-09-12',
    'o1-mini-2024-09-12_high',
    'o1-preview-2024-09-12',
    'o1-2024-12-17',
    'o1-2024-12-17_high',
    'o3-mini-2025-01-31',
    'o3-mini-2025-01-31_high',
    'gemini-2.0-flash-thinking-exp-01-21',
]
df[df['Model'].isin(reasoning_models)]

Unnamed: 0,Model,Task,epochs,total_samples,input_tokens,output_tokens,total_tokens,total_tokens_per_trial,total_tokens_per_question,output_tokens_per_question
3,o1-mini-2024-09-12_high,MATH level 5,4,5296,988900,9914854,10903754,2725938.0,2058.865937,1872.140106
10,o3-mini-2025-01-31_high,GPQA Diamond,8,1584,410883,12231480,12642363,1580295.0,7981.289773,7721.893939
12,o1-preview-2024-09-12,GPQA Diamond,16,3168,859696,9599382,10459078,653692.4,3301.476641,3030.107955
17,o1-2024-12-17_high,GPQA Diamond,1,198,51410,1258852,1310262,1310262.0,6617.484848,6357.838384
23,o1-mini-2024-09-12_high,GPQA Diamond,8,1584,429856,2973549,3403405,425425.6,2148.614268,1877.24053
26,deepseek-r1,GPQA Diamond,1,198,49010,1503756,1552766,1552766.0,7842.252525,7594.727273
27,o1-mini-2024-09-12,GPQA Diamond,16,3168,859744,5597033,6456777,403548.6,2038.124053,1766.740215
29,gemini-2.0-flash-thinking-exp-01-21,GPQA Diamond,1,198,50596,119526,170122,170122.0,859.20202,603.666667
37,o1-2024-12-17,GPQA Diamond,1,198,51407,816755,868162,868162.0,4384.656566,4125.025253
49,o3-mini-2025-01-31_high,MATH level 5,4,5296,946823,15759757,16706580,4176645.0,3154.56571,2975.784932


In [10]:
reasoning_model_df = df[df['Model'].isin(reasoning_models)]
reasoning_model_df['output_tokens_per_question'].mean()

3527.5329727881267

In [11]:
other_model_df = df[~df['Model'].isin(reasoning_models)]
other_model_df['output_tokens_per_question'].mean()

554.8069170780594

In [12]:
reasoning_model_df['output_tokens_per_question'].mean() / other_model_df['output_tokens_per_question'].mean()

6.35812724067356

In [13]:
# Find the models that had the cheapest price per token and GPT-4 level or greater
# At least, the nearest models we've evaluated here
cheapest_models_gpqa_diamond = [
    'gpt-4-0613',
    # Can't find gpt-4-turbo
    'mistral-large-2402',
    'claude-3-haiku-20240307',
    'gemini-1.5-flash-001',
    'phi-4',
]
df[(df['Model'].isin(cheapest_models_gpqa_diamond)) & (df['Task'] == 'GPQA Diamond')]

Unnamed: 0,Model,Task,epochs,total_samples,input_tokens,output_tokens,total_tokens,total_tokens_per_trial,total_tokens_per_question,output_tokens_per_question
5,phi-4,GPQA Diamond,16,3168,838784,2106559,2945343,184083.9375,929.716856,664.949179
6,mistral-large-2402,GPQA Diamond,16,3168,928128,1152988,2081116,130069.75,656.917929,363.948232
41,gemini-1.5-flash-001,GPQA Diamond,16,3168,809504,1140579,1950083,121880.1875,615.556503,360.03125
47,gpt-4-0613,GPQA Diamond,16,3168,838928,694183,1533111,95819.4375,483.936553,219.123422
113,claude-3-haiku-20240307,GPQA Diamond,16,3168,936096,1092395,2028491,126780.6875,640.306503,344.821654


In [14]:
cheapest_models_gpqa_diamond_df = df[(df['Model'].isin(cheapest_models_gpqa_diamond)) & (df['Task'] == 'GPQA Diamond')]
np.percentile(cheapest_models_gpqa_diamond_df['output_tokens_per_question'], [0, 50, 100])

array([219.12342172, 360.03125   , 664.94917929])

In [15]:
# Plot the distribution of total_tokens
import plotly.express as px

for task in df['Task'].unique():
    fig = px.histogram(
        df[df['Task'] == task], 
        x='output_tokens_per_question',
        title=f'Distribution of output tokens per question for {task}',
        labels={'output_tokens_per_question': 'Output tokens per question'},
        # nbins=30,
        histnorm='probability'  # Normalize by total count
    )

    # Update layout for better readability
    fig.update_layout(
        showlegend=False,
        xaxis_title='Output tokens',
        yaxis_title='Relative frequency',  # Updated to reflect normalization
        bargap=0.1
    )

    save_plot(fig, results_dir, f'output_tokens_per_question_distribution_for_{task}')

    fig.show()

In [16]:
model_mapping = {
    # Claude models
    'claude-3-haiku-20240307': 'Claude-3-Haiku',
    'claude-3-opus-20240229': 'Claude-3-Opus',
    'claude-3-sonnet-20240229': 'Claude-3-Sonnet',
    'claude-3-5-sonnet-20240620': 'Claude-3.5-Sonnet-2024-06',
    'claude-3-5-sonnet-20241022': 'Claude-3.5-Sonnet-2024-10',
    'claude-2.0': 'Claude 2',
    'claude-2.1': 'Claude 2.1',
    
    # Gemini models
    'gemini-2.0-flash-001': 'Gemini 2.0 Flash',
    # 'gemini-2.0-flash-thinking-exp-01-21': 'Gemini 2.0 Flash Thinking',
    'gemini-1.5-flash-001': 'Gemini-1.5-Flash-2024-05',
    'gemini-1.5-flash-002': 'Gemini-1.5-Flash-2024-09',
    'gemini-1.5-pro-001': 'Gemini-1.5-Pro-2024-05',
    'gemini-1.5-pro-002': 'Gemini-1.5-Pro-2024-09',
    'gemini-1.0-pro-001': 'Gemini 1.0 Pro',
    
    # Gemma models
    'gemma-2-27b-it': 'Gemma-2-27B',
    'gemma-2-9b-it': 'Gemma-2-9B',
    
    # GPT models
    'gpt-3.5-turbo-0125': 'GPT-3.5-Turbo-2024-01',
    'gpt-3.5-turbo-1106': 'GPT-3.5-Turbo-2023-11',
    # 'gpt-4-0125-preview': 'GPT-4',
    'gpt-4-0613': 'GPT-4',
    # 'gpt-4-1106-preview': 'GPT-4 Turbo',
    'gpt-4o-2024-05-13': 'GPT-4o-2024-05',
    'gpt-4o-2024-08-06': 'GPT-4o-2024-08',
    'gpt-4o-2024-11-20': 'GPT-4o-2024-11',
    'gpt-4o-mini-2024-07-18': 'GPT-4o-mini',
    
    # Llama models
    'Llama-2-70b-chat-hf': 'Llama 2-70B Chat',
    'Meta-Llama-3-8B-Instruct': 'Llama-3-Instruct-8B',
    'Meta-Llama-3-70B-Instruct': 'Llama-3-Instruct-70B',
    'Meta-Llama-3.1-8B-Instruct': 'Llama-3.1-Instruct-8B',
    'Meta-Llama-3.1-70B-Instruct': 'Llama-3.1-Instruct-70B',
    'Meta-Llama-3.1-405B-Instruct': 'Llama-3.1-Instruct-405B',
    # 'llama-3.1-405b-instruct-maas': 'Llama-3.1-Instruct-405B',
    # 'llama-v3p3-70b-instruct': 'Llama-3.1-Instruct-70B',
    # 'Llama-3.1-Tulu-3-70B-DPO': 'Llama-3.1-Instruct-70B',
    
    # Mistral models
    'Mistral-7B-Instruct-v0.3': 'Mistral 7B',
    'mistral-large-2402': 'Mistral-Large-2024-02',
    'mistral-large-2407': 'Mistral-Large-2-2024-06',
    'open-mistral-7b': 'Mistral 7B',
    'open-mistral-nemo-2407': 'Mistral-NeMo',
    'mistral-small-2501': 'Mistral Small 3',
    'open-mixtral-8x22b': 'Mistral-8x22',
    # 'open-mixtral-8x7b': 'Mistral 7B',
    # 'Mixtral-8x7B-Instruct-v0.1': 'Mistral 7B',
    # 'WizardLM-2-8x22B': 'Mistral-8x22',
    
    # Phi models
    'phi-4': 'Phi 4',
    
    # Anthropic o models
    'o1-2024-12-17_high': 'o1',
    'o1-2024-12-17': 'o1',
    'o1-mini-2024-09-12_high': 'o1-mini',
    'o1-mini-2024-09-12': 'o1-mini',
    'o1-preview-2024-09-12': 'o1-preview',
    'o3-mini-2025-01-31_high': 'o3-mini',
    'o3-mini-2025-01-31': 'o3-mini',
    
    # DeepSeek models
    'deepseek-r1': 'DeepSeek-R1',
    'deepseek-v3': 'DeepSeek-V3',
    'DeepSeek-V3': 'DeepSeek-V3',
    
    # Qwen models (no exact matches in Set 2, but including for completeness)
    # 'qwen2p5-72b-instruct': 'Qwen2.5-72B-Instruct',
    # 'Qwen2.5-32B-Instruct': 'Qwen2.5-32B-Instruct',
    # 'Qwen2.5-72B-Instruct': 'Qwen2.5-72B-Instruct',
    # 'Qwen1.5-72B-Chat': 'Qwen1.5-72B-Chat',
    # 'Qwen1.5-32B-Chat': 'Qwen1.5-32B-Chat',
    # 'Qwen2-72B-Instruct': 'Qwen2-72B-Instruct',
    
    # Other models without clear matches
    # 'Yi-34B-Chat': 'Yi-34B-Chat',
    # 'Yi-1.5-34B-Chat': 'Yi-1.5-34B-Chat',
    # 'dbrx-instruct': 'dbrx-instruct',
    # 'grok-2-1212': 'grok-2-1212',
    # 'Hermes-2-Theta-Llama-3-70B': 'Hermes-2-Theta-Llama-3-70B',
    # 'Eurus-2-7B-PRIME': 'Eurus-2-7B-PRIME',
    # 'ministral-8b-2410': 'ministral-8b-2410',
    # 'ministral-3b-2410': 'ministral-3b-2410'
}

## Load Artificial Analysis dataset

In [17]:
aa_df = pd.read_csv('data/aa_data_with_math5.csv')
aa_df

Unnamed: 0,Model Name,Release Date,USD per 1M Tokens,Tokens per Second,Prefill Latency (s),Prompt Length (tk),MMLU,GPQA Diamond,MATH-500,HumanEval,LMSys Chatbot Arena ELO,MATH 5
0,Claude-3-Haiku,2024-03,0.5,122.7,0.467,1000.0,71,33,39.0,77.0,1179.0,13.0
1,Claude-3-Opus,2024-03,30.0,26.5,1.984,1000.0,84,50,64.0,83.0,1248.0,34.0
2,Claude-3-Sonnet,2024-03,6.0,61.8,0.789,1000.0,77,37,41.0,71.0,1201.0,16.0
3,Claude-3.5-Haiku,2024-06,1.6,64.2,0.768,1000.0,81,37,67.0,87.0,1236.0,
4,Claude-3.5-Sonnet-2024-06,2024-06,6.0,55.9,0.906,1000.0,88,56,71.0,90.0,1268.0,46.0
5,Claude-3.5-Sonnet-2024-10,2024-10,6.0,55.2,0.907,1000.0,89,58,76.0,96.0,1282.0,53.0
6,Gemini 2.0 Flash,2025-02,0.175,,,,88,62,93.0,90.0,1358.0,82.0
7,Gemini-1.5-Flash-2024-05,2024-05,0.13,298.4,0.307,1000.0,79,39,55.0,,1227.0,23.0
8,Gemini-1.5-Flash-2024-09,2024-09,0.13,190.5,0.348,1000.0,75,45,83.0,83.0,1271.0,58.0
9,Gemini-1.5-Flash-8B,2024-10,0.07,285.2,0.335,1000.0,75,30,70.0,12.0,1211.0,


In [18]:
# 'Release Date' is a string with the format 'YYYY-MM'
aa_df['Release Date'] = pd.to_datetime(aa_df['Release Date'].str.strip(), format='%Y-%m')

## Load Epoch AI price dataset

In [19]:
api_price_df = pd.read_csv('data/epoch_ai_price_data_not_in_aa_with_benchmarks.csv')
api_price_df['Release Date'] = pd.to_datetime(api_price_df['Release Date'].str.strip(), format='%Y-%m-%d')
api_price_df

Unnamed: 0,Model Name,Release Date,USD per 1M Tokens,Tokens per Second,Prefill Latency (s),Prompt Length (tk),MMLU,GPQA Diamond,MATH 5,MATH-500,HumanEval,LMSys Chatbot Arena ELO
0,Claude 2,2024-08-12,12.0,,,,78.5,35.0,10.0,,,
1,Claude 2.1,2024-08-12,12.0,,,,,36.0,11.0,,,
2,Claude Instant,2024-08-12,1.2,,,,,,,,,
3,Cohere Command,2024-08-13,1.625,,,,,,,,,
4,Cohere Command Light,2024-08-13,0.375,,,,,,,,,
5,Command R,2024-08-13,0.75,,,,,,,,,1180.0
6,Command R+,2024-08-13,6.0,,,,75.7,,,,,1215.0
7,Command R+,2024-09-13,4.375,,,,75.7,,,,,1215.0
8,DeepSeek-Coder-V2 236B,2024-09-11,0.175,,,,79.2,,,,,1178.0
9,DeepSeek-R1,2025-01-20,0.96,,,,,71.7,93.1,,,1362.0


In [20]:
# Merge the two datasets
aa_df = pd.concat([aa_df, api_price_df])
aa_df.sort_values(by='Release Date', inplace=True)
# Reset the index
aa_df.reset_index(drop=True, inplace=True)
aa_df

Unnamed: 0,Model Name,Release Date,USD per 1M Tokens,Tokens per Second,Prefill Latency (s),Prompt Length (tk),MMLU,GPQA Diamond,MATH-500,HumanEval,LMSys Chatbot Arena ELO,MATH 5
0,GPT-3 175B (davinci),2021-11-20,60.000,,,,43.9,,,,,
1,GPT-3 175B (davinci),2022-08-31,60.000,,,,43.9,,,,,
2,GPT-3 175B (davinci),2022-09-01,20.000,,,,43.9,,,,,
3,GPT-3.5,2022-11-30,20.000,,,,64.8,,,,,
4,GPT-4,2023-03-01,37.500,23.6,0.724,1000.0,86.0,33.0,21.0,67.0,1186.0,23.0
...,...,...,...,...,...,...,...,...,...,...,...,...
66,Mistral Small 3,2025-01-01,0.475,,,,82.0,46.0,74.0,85.0,1210.0,45.0
67,DeepSeek-R1,2025-01-20,0.960,,,,,71.7,,,1362.0,93.1
68,o3-mini,2025-01-31,1.925,,,,,74.3,,,1306.0,95.2
69,o1-mini,2025-01-31,1.925,,,,,59.5,,,1304.0,84.3


In [21]:
# Rename all instances of 'GPT-3 175B (davinci)' to 'GPT-3'
aa_df.loc[aa_df['Model Name'] == 'GPT-3 175B (davinci)', 'Model Name'] = 'GPT-3'
aa_df


Unnamed: 0,Model Name,Release Date,USD per 1M Tokens,Tokens per Second,Prefill Latency (s),Prompt Length (tk),MMLU,GPQA Diamond,MATH-500,HumanEval,LMSys Chatbot Arena ELO,MATH 5
0,GPT-3,2021-11-20,60.000,,,,43.9,,,,,
1,GPT-3,2022-08-31,60.000,,,,43.9,,,,,
2,GPT-3,2022-09-01,20.000,,,,43.9,,,,,
3,GPT-3.5,2022-11-30,20.000,,,,64.8,,,,,
4,GPT-4,2023-03-01,37.500,23.6,0.724,1000.0,86.0,33.0,21.0,67.0,1186.0,23.0
...,...,...,...,...,...,...,...,...,...,...,...,...
66,Mistral Small 3,2025-01-01,0.475,,,,82.0,46.0,74.0,85.0,1210.0,45.0
67,DeepSeek-R1,2025-01-20,0.960,,,,,71.7,,,1362.0,93.1
68,o3-mini,2025-01-31,1.925,,,,,74.3,,,1306.0,95.2
69,o1-mini,2025-01-31,1.925,,,,,59.5,,,1304.0,84.3


In [22]:
aa_df['Model Name and Date'] = aa_df['Model Name'] + ' (' + aa_df['Release Date'].dt.strftime('%Y-%m') + ')'
aa_df['Model Name and Date']

0                GPT-3 (2021-11)
1                GPT-3 (2022-08)
2                GPT-3 (2022-09)
3              GPT-3.5 (2022-11)
4                GPT-4 (2023-03)
                 ...            
66     Mistral Small 3 (2025-01)
67         DeepSeek-R1 (2025-01)
68             o3-mini (2025-01)
69             o1-mini (2025-01)
70    Gemini 2.0 Flash (2025-02)
Name: Model Name and Date, Length: 71, dtype: object

In [23]:
# Keep all rows from df, even if no match in aa_df
df_temp = df.copy()
df_temp['Model Name'] = df_temp['Model'].map(model_mapping)
merged_df = pd.merge(df_temp, aa_df, on='Model Name', how='left')

# Keep all rows from aa_df, even if no match in df
df_temp = df.copy()
df_temp['Model Name'] = df_temp['Model'].map(model_mapping)
merged_df = pd.merge(df_temp, aa_df, on='Model Name', how='right')

In [24]:
merged_df.dropna(subset=['Model'])

Unnamed: 0,Model,Task,epochs,total_samples,input_tokens,output_tokens,total_tokens,total_tokens_per_trial,total_tokens_per_question,output_tokens_per_question,...,Tokens per Second,Prefill Latency (s),Prompt Length (tk),MMLU,GPQA Diamond,MATH-500,HumanEval,LMSys Chatbot Arena ELO,MATH 5,Model Name and Date
4,gpt-4-0613,MATH level 5,8.0,10592.0,1982888.0,3620454.0,5603342.0,7.004178e+05,529.016427,341.810234,...,23.6,0.724,1000.0,86.0,33.0,21.0,67.0,1186.0,23.0,GPT-4 (2023-03)
5,gpt-4-0613,GPQA Diamond,16.0,3168.0,838928.0,694183.0,1533111.0,9.581944e+04,483.936553,219.123422,...,23.6,0.724,1000.0,86.0,33.0,21.0,67.0,1186.0,23.0,GPT-4 (2023-03)
13,Llama-2-70b-chat-hf,GPQA Diamond,16.0,3168.0,956816.0,1361586.0,2318402.0,1.449001e+05,731.818813,429.793561,...,,,,68.9,26.0,,,1093.0,3.0,Llama 2-70B Chat (2023-12)
14,Llama-2-70b-chat-hf,MATH level 5,8.0,10592.0,2211400.0,4559177.0,6770577.0,8.463221e+05,639.216106,430.435895,...,,,,68.9,26.0,,,1093.0,3.0,Llama 2-70B Chat (2023-12)
19,mistral-large-2402,GPQA Diamond,16.0,3168.0,928128.0,1152988.0,2081116.0,1.300698e+05,656.917929,363.948232,...,37.8,0.500,1000.0,69.0,36.0,49.0,69.0,1157.0,21.0,Mistral-Large-2024-02 (2024-02)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,o1-mini-2024-09-12_high,GPQA Diamond,8.0,1584.0,429856.0,2973549.0,3403405.0,4.254256e+05,2148.614268,1877.240530,...,,,,,59.5,,,1304.0,84.3,o1-mini (2025-01)
115,o1-mini-2024-09-12,GPQA Diamond,16.0,3168.0,859744.0,5597033.0,6456777.0,4.035486e+05,2038.124053,1766.740215,...,,,,,59.5,,,1304.0,84.3,o1-mini (2025-01)
116,o1-mini-2024-09-12,MATH level 5,8.0,10592.0,1977800.0,16812577.0,18790377.0,2.348797e+06,1774.015955,1587.290125,...,,,,,59.5,,,1304.0,84.3,o1-mini (2025-01)
117,gemini-2.0-flash-001,MATH level 5,8.0,10592.0,1883976.0,9566695.0,11450671.0,1.431334e+06,1081.067881,903.200057,...,,,,88.0,62.0,93.0,90.0,1358.0,82.0,Gemini 2.0 Flash (2025-02)


In [25]:
merged_df[merged_df['Model Name'] == 'GPT-4 Turbo']

Unnamed: 0,Model,Task,epochs,total_samples,input_tokens,output_tokens,total_tokens,total_tokens_per_trial,total_tokens_per_question,output_tokens_per_question,...,Tokens per Second,Prefill Latency (s),Prompt Length (tk),MMLU,GPQA Diamond,MATH-500,HumanEval,LMSys Chatbot Arena ELO,MATH 5,Model Name and Date
11,,,,,,,,,,,...,39.2,1.246,1000.0,87.0,50.0,74.0,92.0,1256.0,36.0,GPT-4 Turbo (2023-11)


In [26]:
# Create an interactive scatter plot using plotly
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

# Filter out rows with missing data
plot_df = merged_df.dropna(subset=['output_tokens_per_question', 'Release Date'])
plot_df = merged_df[merged_df['Task'] == 'GPQA Diamond']

# Create figure
fig = go.Figure()

# Add scatter plot for all models
fig.add_trace(
    go.Scatter(
        x=plot_df['Release Date'],
        y=plot_df['output_tokens_per_question'],
        mode='markers',
        hoverinfo='text',
        hovertext=[
            f"Model: {row['Model Name']}<br>" +
            f"Release Date: {row['Release Date'].strftime('%Y-%m-%d')}<br>" +
            f"Output Tokens: {row['output_tokens_per_question']:.2f}"
            for _, row in plot_df.iterrows()
        ],
        # showlegend=False,
    )
)

# Add annotations for specific models
models_to_label = ['GPT-4', 'o1-preview', 'o3-mini']
model_rows = plot_df[plot_df['Model Name'].isin(models_to_label)]

# Add invisible scatter point
fig.add_trace(
    go.Scatter(
        x=model_rows['Release Date'],
        y=model_rows['output_tokens_per_question'],
        mode='text',
        text=model_rows['Model Name'],
        textposition='top center',
        showlegend=False,
    )
)

# Update layout
fig.update_layout(
    title='Model Output Length on GPQA Diamond vs. Release Date',
    xaxis_title='Release Date',
    yaxis_title='Average Output Tokens per Question',
    width=800,
    height=600,
    template='plotly_white',
    # yaxis=dict(
    #     type='log',  # Set y-axis to logarithmic scale
    # ),
)

save_plot(fig, results_dir, 'model_output_length_on_GPQA_Diamond_vs_release_date')

# Show the figure
fig.show()


In [27]:
# Compare linear vs. log-linear models for output length vs. release date
import statsmodels.api as sm
import numpy as np
import pandas as pd
from datetime import datetime
import math

# Convert release date to numeric (days since epoch)
plot_df['days_since_epoch'] = (plot_df['Release Date'] - datetime(1970, 1, 1)).dt.days

# Add constant for intercept
X = sm.add_constant(plot_df['days_since_epoch'])
y = plot_df['output_tokens_per_question']

# Fit linear model
linear_model = sm.OLS(y, X).fit()
print("Linear Model Summary:")
print(linear_model.summary())
print(f"Linear Model BIC: {linear_model.bic:.2f}")

# Fit log-linear model (log of y)
log_y = np.log10(y)
log_linear_model = sm.OLS(log_y, X).fit()
print("\nLog-Linear Model Summary:")
print(log_linear_model.summary())
print(f"Log-Linear Model BIC: {log_linear_model.bic:.2f}")

# Determine which model has lower BIC
if linear_model.bic < log_linear_model.bic:
    print("\nLinear model has lower BIC and is preferred")
    better_model = linear_model
    is_log_model = False
else:
    print("\nLog-linear model has lower BIC and is preferred")
    better_model = log_linear_model
    is_log_model = True

# Create prediction line for plotting
x_range = np.linspace(plot_df['days_since_epoch'].min(), plot_df['days_since_epoch'].max(), 100)
X_pred = sm.add_constant(x_range)

# Get predictions from the better model
if is_log_model:
    # For log-linear model, need to exponentiate predictions
    y_pred = np.exp(better_model.predict(X_pred))
    model_name = "Log-Linear Trend"
else:
    y_pred = better_model.predict(X_pred)
    model_name = "Linear Trend"

# Convert x_range back to datetime for plotting
x_dates = [datetime(1970, 1, 1) + pd.Timedelta(days=int(x)) for x in x_range]

# Add regression line to the plot
fig.add_trace(
    go.Scatter(
        x=x_dates,
        y=y_pred,
        mode='lines',
        line=dict(color='red', width=2),
        name=model_name
    )
)

# Update layout to include the regression line in the legend
fig.update_layout(
    showlegend=True,
    legend=dict(
        x=0.01,
        y=0.99,
        bgcolor='rgba(255, 255, 255, 0.8)',
        bordercolor='rgba(0, 0, 0, 0.2)',
        borderwidth=1
    ),
    yaxis=dict(
        type='log',  # Set y-axis to logarithmic scale
    ),
)

# Show the updated figure
fig.show()

# Calculate and print the average increase based on the better model
if is_log_model:
    # For log-linear model, calculate percentage increase per year
    coef = better_model.params[1]
    percent_per_day = (math.exp(coef) - 1) * 100
    percent_per_year = percent_per_day * 365.25
    print(f"Average increase in output tokens per question: {percent_per_year:.2f}% per year")
else:
    # For linear model, calculate absolute increase per year
    tokens_per_day = better_model.params[1]
    tokens_per_year = tokens_per_day * 365.25
    print(f"Average increase in output tokens per question: {tokens_per_year:.2f} tokens per year")


Linear Model Summary:
                                OLS Regression Results                                
Dep. Variable:     output_tokens_per_question   R-squared:                       0.243
Model:                                    OLS   Adj. R-squared:                  0.226
Method:                         Least Squares   F-statistic:                     13.82
Date:                        Wed, 26 Feb 2025   Prob (F-statistic):           0.000577
Time:                                09:37:20   Log-Likelihood:                -395.09
No. Observations:                          45   AIC:                             794.2
Df Residuals:                              43   BIC:                             797.8
Df Model:                                   1                                         
Covariance Type:                    nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
-----------------------



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Average increase in output tokens per question: 152.00% per year



Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

