In [1]:
import os
from pathlib import Path
# Work from the parent of the directory the kernel was launched in
# (presumably the project root that the relative 'data/...' and 'outputs/...'
# paths used later are resolved against — TODO confirm).
# The launch directory is captured only on the first execution, so re-running
# this cell does not climb one directory higher every time.
if "_NOTEBOOK_LAUNCH_DIR" not in globals():
    _NOTEBOOK_LAUNCH_DIR = Path.cwd()
os.chdir(_NOTEBOOK_LAUNCH_DIR.parent)

In [14]:
import pandas as pd
import numpy as np
from data_loader_rescale import scores_df
from fit import fit_statistical_model

# --- Anchor configuration passed to fit_statistical_model ---
# anchor_mode selects which set of anchor arguments is forwarded below:
# "model" uses the two anchor models, anything else uses the anchor benchmark.
anchor_mode = "benchmark"

# Benchmark anchoring: one benchmark's difficulty and slope are supplied.
anchor_benchmark = "Winogrande"
anchor_difficulty = 1
anchor_slope = 1

# Model anchoring: two models and their capabilities are supplied.
# These must be plain scalars; a trailing comma here would silently turn each
# value into a 1-tuple.
anchor_model1 = 'text-davinci-001'
anchor_model1_capability = 1
anchor_model2 = 'gpt-4-0613'
anchor_model2_capability = 1.2

# Collect the anchor arguments that match the selected mode, then fit once.
if anchor_mode == "model":
    anchor_kwargs = dict(
        anchor_model1=anchor_model1,
        anchor_model1_capability=anchor_model1_capability,
        anchor_model2=anchor_model2,
        anchor_model2_capability=anchor_model2_capability,
    )
else:
    anchor_kwargs = dict(
        anchor_benchmark=anchor_benchmark,
        anchor_difficulty=anchor_difficulty,
        anchor_slope=anchor_slope,
    )

# The fit returns three frames; df_cm1 (per-model capabilities) and df_db1
# (per-benchmark parameters) are the ones used by the cells below.
df1, df_cm1, df_db1 = fit_statistical_model(
    scores_df,
    anchor_mode=anchor_mode,
    **anchor_kwargs,
)

# Parse the date strings into datetime objects (stored alongside the originals)
df_cm1['date_obj'] = pd.to_datetime(df_cm1['date'])


def _capability_of(name_pattern):
    """Return the estimated capability of the first model whose name contains *name_pattern*."""
    hits = df_cm1.loc[df_cm1['model'].str.contains(name_pattern), 'estimated_capability']
    return hits.values[0]


# Reference models used for reporting and for the ECI rescaling further down
gpt_3_001_score = _capability_of('text-davinci-001')
gpt_3_002_score = _capability_of('text-davinci-002')
gpt_4_score = _capability_of('gpt-4-0613')
gpt_5_score = _capability_of('gpt-5-2025-08-07_medium')
o3_score = _capability_of('o3-2025-04-16_high')

print(f"GPT-3 (001): {gpt_3_001_score:.3}")
print(f"GPT-3 (002): {gpt_3_002_score:.3}")
print(f"GPT-4: {gpt_4_score:.3}")
print(f"o3: {o3_score:.3}")
print(f"GPT-5: {gpt_5_score:.3}")

`ftol` termination condition is satisfied.
Function evaluations 35, initial cost 3.5665e+01, final cost 3.4294e+00, first-order optimality 7.44e-04.
GPT-3 (001): 0.999
GPT-3 (002): 1.48
GPT-4: 1.63
o3: 2.64
GPT-5: 2.86


In [20]:
# Full ranking of models by estimated capability, weakest first
capability_ranking = df_cm1[['model', 'estimated_capability']].sort_values(by='estimated_capability')
print(capability_ranking)

# Range of the capability scale
capability_values = df_cm1['estimated_capability']
print(f"Minimum capability: {capability_values.min():.3}")
print(f"Maximum capability: {capability_values.max():.3}\n")

# Capability gaps between the reference models
gpt_4_jump = gpt_4_score - gpt_3_001_score
gpt_5_jump = gpt_5_score - gpt_4_score
print(f"GPT-4 jump: {gpt_4_jump:.3}")
print(f"GPT-5 jump: {gpt_5_jump:.3}")

                            model  estimated_capability
32                      Qwen-1_8B              0.341581
25             Baichuan-2-7B-Base              0.369050
42                        phi-1_5              0.392430
41                       gemma-2b              0.437605
20                         mpt-7b              0.501528
..                            ...                   ...
106            o3-2025-04-16_high              2.636470
0                     grok-4-0709              2.674136
122  gemini-2.5-pro-preview-06-05              2.689337
130         gpt-5-2025-08-07_high              2.817619
128       gpt-5-2025-08-07_medium              2.858115

[143 rows x 2 columns]
Minimum capability: 0.342
Maximum capability: 2.86

GPT-4 jump: 0.635
GPT-5 jump: 1.22


In [9]:
# Calculate ECI score scaling values.
# ECI is an affine map of capability, eci = a + b * capability, with a and b
# chosen so that text-davinci-001 lands on gpt_3_fix_at and gpt-4-0613 lands on
# gpt_4_fix_at.
gpt_3_fix_at, gpt_4_fix_at = 100, 120
gpt_3_4_ratio = gpt_3_001_score / gpt_4_score

# Intercept, then the slope that pins the GPT-3 reference point
_numerator = gpt_4_fix_at * gpt_3_4_ratio - gpt_3_fix_at
a = _numerator / (gpt_3_4_ratio - 1)
b = (gpt_3_fix_at - a) / gpt_3_001_score

df_cm1['eci'] = df_cm1['estimated_capability'] * b + a

In [10]:
# Here we check whether our scaling is going to produce a bunch of ties, especially for the top few models.
def get_rank(
    df: pd.DataFrame,
    n: int | None = None,
    sort_col: str = "Publication date",
    val_col: str = "Training compute (FLOP)",
) -> pd.Series:
    """
    Cumulative rank of *val_col* up to each row, ordered by *sort_col*,
    robust to missing values.

    • If *val_col* is NaN for a row → rank is NaN.  
    • Rows whose *val_col* is NaN do **not** affect later ranks.  
    • Rows whose *sort_col* is NaN are treated as having unknown release time
      → their own rank is NaN and they do not affect others.  
    • If *n* is given, ranks > n are set to NaN (frontier filter).

    Returns
    -------
    pd.Series aligned with *df.index* (dtype float, so NaNs are allowed).
    """
    # Sort chronologically; keep a stable sort to preserve original order ties
    ordered = df.sort_values(
        sort_col, kind="mergesort", na_position="last"
    ).reset_index()

    vals  = ordered[val_col]
    ranks = pd.Series(np.nan, index=ordered.index, dtype=float)

    # Working array of non-NaN values we have seen so far
    seen = []

    for idx, v in enumerate(vals):
        if pd.isna(v):           # current value is NaN → leave rank as NaN
            continue
        # Count how many previous non-NaN values are strictly larger
        rank = 1 + sum(prev > v for prev in seen)
        ranks.iloc[idx] = rank
        seen.append(v)           # add current value for future rows

    if n is not None:
        ranks = ranks.where(ranks <= n)

    # Re-align to the original DataFrame’s index order
    ranks.index = ordered["index"]
    return ranks.reindex(df.index)

# Check the top-N models at each date, and count how often their ECIs tie
# once rounded to the nearest integer.
# round() first: astype(int) on its own truncates toward zero rather than rounding.
df_cm1['eci_rounded'] = df_cm1['eci'].round().astype(int)
# NOTE(review): ranking is ordered by the raw 'date' column; this assumes those
# values sort chronologically (e.g. ISO-8601 strings) — confirm.
df_cm1['rank'] = get_rank(df_cm1, sort_col='date', val_col='eci')

N = 3
topn_df = df_cm1[df_cm1['rank'] <= N].copy()
print(f"Number of top-{N} models ever: {len(topn_df)}")
print(f"Number of unique ECIs among top-{N} models: {len(topn_df['eci_rounded'].unique())}")

# Create has_tie column - True if this row's eci_rounded value appears in other rows
topn_df['has_tie'] = topn_df['eci_rounded'].duplicated(keep=False)
topn_df['has_tie'].value_counts()

Number of top-3 models ever: 33
Number of unique ECIs among top-3 models: 30


has_tie
False    28
True      5
Name: count, dtype: int64

In [11]:
# Here we check whether our scaling is going to produce a bunch of ties, especially for the top few models.
def get_rank(
    df: pd.DataFrame,
    n: int | None = None,
    sort_col: str = "Publication date",
    val_col: str = "Training compute (FLOP)",
) -> pd.Series:
    """
    Cumulative rank of *val_col* up to each row, ordered by *sort_col*,
    robust to missing values.

    • If *val_col* is NaN for a row → rank is NaN.  
    • Rows whose *val_col* is NaN do **not** affect later ranks.  
    • Rows whose *sort_col* is NaN are treated as having unknown release time
      → their own rank is NaN and they do not affect others.  
    • If *n* is given, ranks > n are set to NaN (frontier filter).

    Returns
    -------
    pd.Series aligned with *df.index* (dtype float, so NaNs are allowed).
    """
    # Sort chronologically; keep a stable sort to preserve original order ties
    ordered = df.sort_values(
        sort_col, kind="mergesort", na_position="last"
    ).reset_index()

    vals  = ordered[val_col]
    ranks = pd.Series(np.nan, index=ordered.index, dtype=float)

    # Working array of non-NaN values we have seen so far
    seen = []

    for idx, v in enumerate(vals):
        if pd.isna(v):           # current value is NaN → leave rank as NaN
            continue
        # Count how many previous non-NaN values are strictly larger
        rank = 1 + sum(prev > v for prev in seen)
        ranks.iloc[idx] = rank
        seen.append(v)           # add current value for future rows

    if n is not None:
        ranks = ranks.where(ranks <= n)

    # Re-align to the original DataFrame’s index order
    ranks.index = ordered["index"]
    return ranks.reindex(df.index)

# NOTE(review): this cell (including its get_rank definition) is a repeat of
# the preceding tie-check cell; consider deleting one of the two copies.
#
# Check the top-N models at each date, and count how often their ECIs tie
# once rounded to the nearest integer.
# round() first: astype(int) on its own truncates toward zero rather than rounding.
df_cm1['eci_rounded'] = df_cm1['eci'].round().astype(int)
# NOTE(review): ranking is ordered by the raw 'date' column; this assumes those
# values sort chronologically (e.g. ISO-8601 strings) — confirm.
df_cm1['rank'] = get_rank(df_cm1, sort_col='date', val_col='eci')

N = 3
topn_df = df_cm1[df_cm1['rank'] <= N].copy()
print(f"Number of top-{N} models ever: {len(topn_df)}")
print(f"Number of unique ECIs among top-{N} models: {len(topn_df['eci_rounded'].unique())}")

# Create has_tie column - True if this row's eci_rounded value appears in other rows
topn_df['has_tie'] = topn_df['eci_rounded'].duplicated(keep=False)
topn_df['has_tie'].value_counts()

Number of top-3 models ever: 33
Number of unique ECIs among top-3 models: 30


has_tie
False    28
True      5
Name: count, dtype: int64

In [12]:
# For plotting, we'll join some metadata
models_df = pd.read_csv('data/ai_models/all_ai_models.csv')
versions_df = pd.read_csv('data/model_versions.csv')

# Both joins use the default inner merge, so rows without a match on 'Model'
# (or whose 'model' has no matching versions_df 'id') are dropped.
merged_df = df_cm1.merge(models_df, on='Model')
merged_df = merged_df.merge(versions_df[['id', 'Display name']], left_on='model', right_on='id')

# Features: collapse 'Model accessibility' into two groups; any value not
# listed below falls into 'Other'.
accessibility_conditions = [
    merged_df['Model accessibility'].isin([
        'API access',
        'Hosted access (no API)',
        'Unreleased'
    ]),
    merged_df['Model accessibility'].isin([
        'Open weights (unrestricted)',
        'Open weights (restricted use)',
        'Open weights (non-commercial)'
    ])
]
# Labels are positional: one per condition above
accessibilities = ['Closed weights', 'Open weights']
merged_df['Accessibility group'] = np.select(accessibility_conditions, accessibilities, default='Other')


In [13]:
import plotly.express as px
# Scatter of estimated capability over time for the models kept in topn_df.
# (Colouring by 'has_tie' is currently switched off.)
scatter_options = dict(
    x='date',
    y='estimated_capability',
    hover_data=['model', 'estimated_capability'],
)
px.scatter(topn_df, **scatter_options)

In [None]:
# Export the ECI scores together with the joined model metadata
export_columns = [
    'model',
    'Model',
    'Display name',
    'eci',
    'date',
    'Organization',
    'Country (of organization)',
    'Model accessibility',
]
# 'model' holds the version identifier, so it is exported as 'model version'
export_df = merged_df[export_columns].rename(columns={'model': 'model version'})

export_df.to_csv('outputs/eci_scores.csv')

In [None]:
from scipy.stats import gmean

def sigmoid(x):
    """
    Logistic function 1 / (1 + exp(-x)).

    Accepts a scalar or a NumPy array (applied element-wise).
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def implied_eci(ab, db, score):
    """
    Invert the logistic score model for capability: db + logit(score) / ab.

    *score* is clipped to [1e-12, 1 - 1e-12] first so that scores of exactly
    0 or 1 yield finite values.
    """
    clipped = np.clip(score, 1e-12, 1-1e-12)
    logit = np.log(clipped) - np.log1p(-clipped)
    return (db + logit / ab)

def implied_score(ab, cb, db):
    """Score predicted by the logistic model: sigmoid(ab * (cb - db))."""
    capability_gap = cb - db
    return sigmoid(ab * capability_gap)

# Observed scores for the two models being compared
_score_columns = ['benchmark', 'performance', 'model']
palm_scores = scores_df.loc[scores_df['model'] == 'PaLM 540B', _score_columns]
gpt4_scores = scores_df.loc[scores_df['model'] == 'gpt-4-0613', _score_columns]

check_scores = pd.concat([palm_scores, gpt4_scores])

# Benchmark parameters restricted to benchmarks either model was scored on,
# then joined (inner merges) to the observed scores and to each model's
# estimated capability.
_scored_benchmarks = check_scores['benchmark'].unique()
compare_df = (
    df_db1[df_db1['benchmark_name'].isin(_scored_benchmarks)]
    .rename(columns={'benchmark_name': 'benchmark'})
    .merge(check_scores, on='benchmark')
    .merge(df_cm1[['model', 'estimated_capability']], on='model')
)

# Estimate implied ECI based on individual benchmarks, and the score the
# fitted model predicts for each (model, benchmark) row.
_slopes = compare_df['estimated_slope']
_difficulties = compare_df['estimated_difficulty']

compare_df['implied_eci'] = implied_eci(
    ab=_slopes,
    db=_difficulties,
    score=compare_df['performance'],
)
compare_df['implied_performance'] = implied_score(
    ab=_slopes,
    cb=compare_df['estimated_capability'],
    db=_difficulties,
)

# NOTE(review): this summary is a bare expression that is not the last
# statement of the cell, so its result is computed and then discarded.
compare_df[['model', 'implied_eci']].groupby('model').agg(
    geometric_mean=('implied_eci', gmean),
    arithmetic_mean=('implied_eci', 'mean'),
)

# For every (benchmark, model) pair build a {'score', 'eci'} record:
#   - pair has no observed score → impute the score from the model's estimated
#     capability (score is starred, eci is the estimated capability);
#   - pair has an observed score → derive the ECI implied by that score
#     (eci is starred, score is the observed one).
# Loop variables use descriptive names so that they do not overwrite the
# notebook-level `b` coefficient defined by the ECI scaling cell.
imputations = {}

model_names = compare_df['model'].unique()

for bench_name in compare_df['benchmark'].unique():
    # Benchmark parameters do not depend on the model: look them up once
    bench_rows = compare_df[compare_df['benchmark'] == bench_name]
    ab = bench_rows['estimated_slope'].iat[0]
    db = bench_rows['estimated_difficulty'].iat[0]

    imputations[bench_name] = {}
    for model_name in model_names:
        observed = bench_rows.loc[bench_rows['model'] == model_name, 'performance']
        if observed.empty:
            cb = compare_df.loc[compare_df['model'] == model_name, 'estimated_capability'].iat[0]
            imputed_score = implied_score(ab=ab, cb=cb, db=db)
            imputations[bench_name][model_name] = {
              'score': f"{imputed_score:.3}*",
              'eci': f"{cb:.3}"
            }
        else:
            score = observed.iat[0]
            imputed_eci = implied_eci(ab=ab, db=db, score=score)
            imputations[bench_name][model_name] = {
              'score': f"{score:.3}",
              'eci': f"{imputed_eci:.3}*"
            }

# Flatten the nested dict into a dataframe
imputations_df = (
    pd.concat(
        {
            bench_key: pd.DataFrame.from_dict(per_model, orient="index")
            for bench_key, per_model in imputations.items()
        },
        names=["benchmark", "model"]
    )
    .reset_index()
)
imputations_df

In [None]:
import numpy as np
import pandas as pd

def pair_tables(
    df: pd.DataFrame,
    model_a: str = "GPT-4",
    model_b: str = "PaLM 540B",
    score_col: str = "performance",
    eci_col: str = "estimated_capability",
    slope_col: str = "estimated_slope",
    diff_col: str = "estimated_difficulty",
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Build two tidy comparison tables for a pair of models.

    Only benchmarks where exactly one of the two models has an observed score
    appear in the output; benchmarks scored by both or by neither are left out.

      1) scores_tbl: the observed score of one model next to the imputed score
         of the other, plus a ``delta`` column (actual minus imputed).
      2) eci_tbl:    the ECI implied by one model's observed score next to the
         other model's baseline ECI (the value in *eci_col*).

    Imputation rules:
      - Imputed SCORE for a missing (model, benchmark) = implied_score(ab, cb, db)
          where ab, db are the benchmark's slope/difficulty and cb is the
          model's baseline ECI.
      - Implied ECI for a present (model, benchmark) = implied_eci(ab, db, score).

    ``implied_score`` and ``implied_eci`` are looked up in the enclosing scope.

    Parameters
    ----------
    df : long-format frame with "benchmark" and "model" columns plus the
        columns named by the ``*_col`` arguments.
    model_a, model_b : names (values of the "model" column) to compare.
    score_col, eci_col, slope_col, diff_col : column names for the observed
        score, the model's baseline ECI, and the benchmark slope / difficulty.

    Returns
    -------
    (scores_tbl, eci_tbl) : two DataFrames, each with "benchmark" and
        "direction" as the leading columns.
    """
    # --- Canonical inputs ---
    models = [model_a, model_b]

    # One row per benchmark for params (first occurrence wins)
    bench_params = (
        df[["benchmark", slope_col, diff_col]]
        .drop_duplicates(subset=["benchmark"])
    )

    # One ECI per model (assumed global; take first non-null if duplicated)
    model_eci = (
        df[["model", eci_col]]
        .dropna(subset=[eci_col])
        .drop_duplicates(subset=["model"])
        .set_index("model")[eci_col]
    )

    # All (benchmark x models-of-interest) pairs, built as a cross join on a
    # constant helper key `_k`
    grid = (
        df[["benchmark"]].drop_duplicates()
        .assign(_k=1)
        .merge(pd.DataFrame({"model": models, "_k": 1}), on="_k")
        .drop(columns="_k")
    )

    # Attach actual scores (take first if duplicated)
    actual_scores = (
        df[["benchmark", "model", score_col]]
        .drop_duplicates(subset=["benchmark", "model"])
    )

    # Left merges keep every grid pair; pairs without an observed score get NaN
    out = (grid
           .merge(actual_scores, how="left", on=["benchmark", "model"])
           .merge(bench_params, how="left", on="benchmark"))

    # Attach baseline ECI per model (NaN when the model has none in df)
    out[eci_col] = out["model"].map(model_eci)

    # Compute imputed score (only when score is missing and ECI, slope and
    # difficulty are all available)
    out["imputed_score"] = out.apply(
        lambda r: implied_score(ab=r[slope_col], cb=r[eci_col], db=r[diff_col])
        if pd.isna(r[score_col]) and pd.notna(r[eci_col]) and pd.notna(r[slope_col]) and pd.notna(r[diff_col])
        else np.nan,
        axis=1,
    )

    # Compute implied ECI from score (only when score, slope and difficulty
    # are all present)
    out["implied_eci"] = out.apply(
        lambda r: implied_eci(ab=r[slope_col], db=r[diff_col], score=r[score_col])
        if pd.notna(r[score_col]) and pd.notna(r[slope_col]) and pd.notna(r[diff_col])
        else np.nan,
        axis=1,
    )

    # Wide for easy pair picking: one row per benchmark, columns addressed as
    # (value column, model) tuples
    wide = out.pivot(index="benchmark", columns="model")

    # Helper selectors: observed score, imputed score, baseline ECI and
    # score-implied ECI for each of the two models
    sA = wide[(score_col, model_a)]
    sB = wide[(score_col, model_b)]
    impA = wide[("imputed_score", model_a)]
    impB = wide[("imputed_score", model_b)]
    eciA = wide[(eci_col, model_a)]
    eciB = wide[(eci_col, model_b)]
    impECIA = wide[("implied_eci", model_a)]
    impECIB = wide[("implied_eci", model_b)]

    # ---- SCORES table ----
    # Cases: A has actual & B missing -> compare A_actual vs B_imputed
    scores_A_vs_B = (
        pd.DataFrame({
            "benchmark": wide.index,
            f"{model_a}_actual": sA,
            f"{model_b}_imputed": impB,
        })
        .loc[sA.notna() & sB.isna()]
        .assign(direction=f"{model_a} actual vs {model_b} imputed")
    )

    # Cases: B has actual & A missing -> compare B_actual vs A_imputed
    scores_B_vs_A = (
        pd.DataFrame({
            "benchmark": wide.index,
            f"{model_b}_actual": sB,
            f"{model_a}_imputed": impA,
        })
        .loc[sB.notna() & sA.isna()]
        .assign(direction=f"{model_b} actual vs {model_a} imputed")
    )

    # Combine to one tidy scores table; add delta (actual - imputed) for convenience.
    # NOTE(review): after the concat both column pairs appear to be present
    # whenever the frames are built as above, so the second assignment (which
    # fills each side from whichever direction has a value) seems to always
    # overwrite the first — confirm whether the first branch is still needed.
    scores_tbl = pd.concat([scores_A_vs_B, scores_B_vs_A], ignore_index=True)
    if f"{model_a}_actual" in scores_tbl and f"{model_b}_imputed" in scores_tbl:
        scores_tbl["delta"] = scores_tbl.get(f"{model_a}_actual") - scores_tbl.get(f"{model_b}_imputed")
    if f"{model_b}_actual" in scores_tbl and f"{model_a}_imputed" in scores_tbl:
        scores_tbl["delta"] = scores_tbl.get(f"{model_b}_actual").fillna(scores_tbl.get(f"{model_a}_actual")) - \
                              scores_tbl.get(f"{model_a}_imputed").fillna(scores_tbl.get(f"{model_b}_imputed"))

    # ---- ECI table ----
    # Cases: A has score (so we have A implied_eci) & B missing score (so use B baseline ECI)
    eci_A_vs_B = (
        pd.DataFrame({
            "benchmark": wide.index,
            f"{model_a}_eci_from_score": impECIA,
            f"{model_b}_baseline_eci": eciB,
        })
        .loc[sA.notna() & sB.isna()]
        .assign(direction=f"{model_a} implied ECI vs {model_b} baseline ECI")
    )

    # Cases: B has score & A missing score
    eci_B_vs_A = (
        pd.DataFrame({
            "benchmark": wide.index,
            f"{model_b}_eci_from_score": impECIB,
            f"{model_a}_baseline_eci": eciA,
        })
        .loc[sB.notna() & sA.isna()]
        .assign(direction=f"{model_b} implied ECI vs {model_a} baseline ECI")
    )

    eci_tbl = pd.concat([eci_A_vs_B, eci_B_vs_A], ignore_index=True)

    # Optional: standardize column order a bit ("benchmark", "direction" first,
    # remaining columns in their existing order)
    def order_cols(df):
        first = ["benchmark", "direction"]
        rest = [c for c in df.columns if c not in first]
        return df[first + rest]

    return order_cols(scores_tbl), order_cols(eci_tbl)

# Compare GPT-4 (0613) against PaLM 540B and show the ECI table without the
# free-text 'direction' column.
scores_tbl, eci_tbl = pair_tables(
    compare_df,
    model_a="gpt-4-0613",
    model_b="PaLM 540B",
)
eci_tbl.drop(columns=['direction'])

In [None]:
import numpy as np
import pandas as pd

# Regress each benchmark's estimated slope on its release date (as a decimal year).
_df = df_db1.copy()
# Unparseable release dates become NaT and are dropped before the fit
_df['benchmark_release_date'] = pd.to_datetime(_df['benchmark_release_date'], errors='coerce')

# Decimal year = year + (day_of_year - 1) / days_in_year
_release = _df['benchmark_release_date'].dt
_days_in_year = np.where(_release.is_leap_year, 366, 365)
_df['decimal_year'] = _release.year + (_release.dayofyear - 1) / _days_in_year

# Keep only rows that have both regression variables
_reg = _df.dropna(subset=['decimal_year', 'estimated_slope'])

x = _reg['decimal_year'].to_numpy()
y = _reg['estimated_slope'].to_numpy()

# Degree-1 least-squares fit: y = coef * x + intercept
coef, intercept = np.polyfit(x, y, deg=1)
print(f"Coefficient (estimated_slope ~ decimal_year): {coef:.6f}")
print(f"Intercept: {intercept:.6f}")

# Quick sanity summary: sample size and covered date range
_release_dates = _reg['benchmark_release_date']
_first_date = _release_dates.min().date()
_last_date = _release_dates.max().date()
print(f"N used: {len(_reg)} | date range: {_first_date} → {_last_date}")


In [None]:
# Write the fit inputs and both result tables to the outputs directory
for _frame, _csv_path in (
    (scores_df, 'outputs/input_scores.csv'),
    (df_cm1, 'outputs/model_capabilities.csv'),
    (df_db1, 'outputs/benchmark_difficulties.csv'),
):
    _frame.to_csv(_csv_path)

In [None]:
from data_loader import benchmarks, df_fiction, df_factorio
# Stack every benchmark frame into a single table (fresh 0..n-1 index) and display it
all_benchmarks = pd.concat(benchmarks, ignore_index=True)
all_benchmarks

In [None]:
# Sanity check: list any rows whose performance is negative
negative_performance = scores_df['performance'] < 0
scores_df[negative_performance]