In [1]:
import os
from pathlib import Path
os.chdir(Path.cwd().parent)

In [None]:
import pandas as pd
import numpy as np
from data_loader import scores_df
from fit import fit_statistical_model

# FrontierMath Tier 4 is missing its release date
scores_df.loc[(scores_df['benchmark'] == 'FrontierMath-Tier-4-2025-07-01-Private'), 'benchmark_release_date'] = '2025-07-11'

anchor_mode = "benchmark"
anchor_benchmark = "Winogrande"
anchor_difficulty = 0
anchor_slope = 1

df1, df_cm1, df_db1 = fit_statistical_model(
    scores_df, 
    anchor_mode=anchor_mode, 
    anchor_benchmark=anchor_benchmark,
    anchor_difficulty=anchor_difficulty,
    anchor_slope=anchor_slope
)

# Convert date strings to datetime objects
df_cm1['date_obj'] = pd.to_datetime(df_cm1['date'])

null performances after coercion: 1
after saturation filter 2237
after filter num benchmarks 1562
after merge with model versions 1548
after merge with benchmark dates 1548
after filtering on benchmark date 1397
Original number of rows: 1397
Number of rows after aggregation: 1128


  with pd.option_context('mode.use_inf_as_na', True):


`ftol` termination condition is satisfied.
Function evaluations 35, initial cost 4.1169e+01, final cost 3.7885e+00, first-order optimality 7.36e-04.


In [3]:
# Calculate ECI score scaling values
claude_fix_at = 130
gpt_5_fix_at = 150
claude_35_score_raw = df_cm1[df_cm1['model'] == 'claude-3-5-sonnet-20240620']['estimated_capability'].values[0]
gpt_5_score_raw = df_cm1[df_cm1['model'] == 'gpt-5-2025-08-07_medium']['estimated_capability'].values[0]

b = (gpt_5_fix_at - claude_fix_at) / (gpt_5_score_raw - claude_35_score_raw)
a = claude_fix_at - (b * claude_35_score_raw)

df_cm1['eci'] = a + b * df_cm1['estimated_capability']
df_db1['edi'] = a + b * df_db1['estimated_difficulty']

# gpt_3_001_score = df_cm1[df_cm1['model'] =='text-davinci-001']['eci'].values[0]
gpt_4_score = df_cm1[df_cm1['model'] == 'gpt-4-0613']['eci'].values[0]
claude_35_score = df_cm1[df_cm1['model'] == 'claude-3-5-sonnet-20240620']['eci'].values[0]
gpt_5_score = df_cm1[df_cm1['model'] == 'gpt-5-2025-08-07_medium']['eci'].values[0]
# print(f"GPT-3 (001): {gpt_3_001_score:.4}")
print(f"GPT-4: {gpt_4_score:.4}")
print(f"Claude 3.5 Sonnet (June): {claude_35_score:.4}")
print(f"GPT-5: {gpt_5_score:.4}")

GPT-4: 122.1
Claude 3.5 Sonnet (June): 130.0
GPT-5: 150.0


In [4]:
# Here we check whether our scaling is going to produce a bunch of ties, especially for the top few models.
def get_rank(
    df: pd.DataFrame,
    n: int | None = None,
    sort_col: str = "Publication date",
    val_col: str = "Training compute (FLOP)",
) -> pd.Series:
    """
    Cumulative rank of *val_col* up to each row, ordered by *sort_col*,
    robust to missing values.

    • If *val_col* is NaN for a row → rank is NaN.  
    • Rows whose *val_col* is NaN do **not** affect later ranks.  
    • Rows whose *sort_col* is NaN are treated as having unknown release time
      → their own rank is NaN and they do not affect others.  
    • If *n* is given, ranks > n are set to NaN (frontier filter).

    Returns
    -------
    pd.Series aligned with *df.index* (dtype float, so NaNs are allowed).
    """
    # Sort chronologically; keep a stable sort to preserve original order ties
    ordered = df.sort_values(
        sort_col, kind="mergesort", na_position="last"
    ).reset_index()

    vals  = ordered[val_col]
    ranks = pd.Series(np.nan, index=ordered.index, dtype=float)

    # Working array of non-NaN values we have seen so far
    seen = []

    for idx, v in enumerate(vals):
        if pd.isna(v):           # current value is NaN → leave rank as NaN
            continue
        # Count how many previous non-NaN values are strictly larger
        rank = 1 + sum(prev > v for prev in seen)
        ranks.iloc[idx] = rank
        seen.append(v)           # add current value for future rows

    if n is not None:
        ranks = ranks.where(ranks <= n)

    # Re-align to the original DataFrame’s index order
    ranks.index = ordered["index"]
    return ranks.reindex(df.index)

# Check the top 5 at each date, and count the number of time we get ties
# We'll look at ECI rounded to the nearest integer
df_cm1['eci_rounded'] = df_cm1['eci'].astype(int)
df_cm1['rank'] = get_rank(df_cm1, sort_col='date', val_col='eci')

N = 3
topn_df = df_cm1[df_cm1['rank'] <= N].copy()
print(f"Number of top-{N} models ever: {len(topn_df)}")
print(f"Number of unique ECIs among top-{N} models: {len(topn_df['eci_rounded'].unique())}")

# Create has_tie column - True if this row's eci_rounded value appears in other rows
topn_df['has_tie'] = topn_df['eci_rounded'].duplicated(keep=False)
topn_df['has_tie'].value_counts()

Number of top-3 models ever: 32
Number of unique ECIs among top-3 models: 26


has_tie
False    22
True     10
Name: count, dtype: int64

In [5]:
# Download AI models db
import os
import subprocess

data_dir = "data/ai_models"
zip_path = "data/ai_models.zip"
url = "https://epoch.ai/data/generated/ai_models.zip"

if not os.path.exists(data_dir):
    os.makedirs("data", exist_ok=True)
    print("Downloading AI models data...")
    subprocess.run(["curl", "-L", url, "-o", zip_path], check=True)
    subprocess.run(["unzip", "-q", zip_path, "-d", data_dir], check=True)
    os.remove(zip_path)
    print("Download and extraction complete.")
else:
    print(f"{data_dir} already exists; skipping download.")

data/ai_models already exists; skipping download.


In [6]:
# For plotting, we'll join some metadata
models_df = pd.read_csv('data/ai_models/all_ai_models.csv')
versions_df = pd.read_csv('data/model_versions.csv')
merged_df = df_cm1.merge(models_df, left_on='Model', right_on='Model')
merged_df = merged_df.merge(versions_df[['id', 'Display name']], left_on='model', right_on='id')

# Features
accessibility_conditions = [
    merged_df['Model accessibility'].isin([
        'API access',
        'Hosted access (no API)',
        'Unreleased'
    ]),
    merged_df['Model accessibility'].isin([
        'Open weights (unrestricted)',
        'Open weights (restricted use)',
        'Open weights (non-commercial)'
    ])
]
accessibilities = ['Closed weights', 'Open weigh†s']
merged_df['Accessibility group'] = np.select(accessibility_conditions, accessibilities, default='Other')


In [18]:
import plotly.express as px

px.scatter(
    df_cm1,
    x='date',
    y='eci',
    hover_data=['model', 'eci']
)

# px.scatter(
#     df_db1,
#     x='benchmark_release_date',
#     y='edi',
#     hover_data=['benchmark_name']
# )

In [8]:
import plotly.graph_objects as go
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=df_cm1['date'], 
    y=df_cm1['eci'], 
    mode='markers',
    name="Models"
    # color='has_tie',
    # hover_data=['model', 'eci'],
    # error_y='ci90_high',
    # error_y_minus='ci90_low'
))

fig.add_trace(go.Scatter(
    x=df_db1['benchmark_release_date'], 
    y=df_db1['edi'],
    mode='markers',
    name='Benchmarks'
    # error_y='se_difficulty',
    # hover_data=['benchmark_name']
))

fig.show()

In [10]:
# Export
export_df = merged_df[['model', 'Model', 'Display name', 'eci', 'date', 'Organization', 'Country (of organization)', 'Model accessibility']]
export_df = export_df.rename(columns={
    'model': 'model version'
})

export_df.to_csv('outputs/eci_scores.csv')