In [1]:
import os
from pathlib import Path
# Work from the parent of the directory this cell was first run in
# (presumably so that project modules such as `data_loader` and `fit`
# resolve -- confirm against the repo layout).
# The starting directory is remembered on first execution so that re-running
# this cell in the same kernel does not climb one more level each time.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = Path.cwd()
os.chdir(_NOTEBOOK_START_DIR.parent)

In [2]:
!pip install plotly
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from datetime import datetime
from data_loader import scores_df
from fit import fit_statistical_model


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
null performances after coercion: 281
after saturation filter 2201
after filter num benchmarks 1401
after merge with model versions 1397
after merge with benchmark dates 1397
Original number of rows: 1397
Number of rows after aggregation: 967


In [3]:
# Anchoring configuration, forwarded unchanged to fit_statistical_model.
# NOTE(review): presumably these values pin the scale of the fit -- confirm in fit.py.
anchor_mode = "benchmark"  # "model", "benchmark"
anchor_benchmark, anchor_difficulty, anchor_slope = "Winogrande", 0, 1
anchor_model1, anchor_model1_capability = "claude-2.0", 1.177630
anchor_model2, anchor_model2_capability = "claude-3-opus-20240229", 1.311554

# Baseline fit on the unsplit scores; the three returned frames are reused by later cells.
original_fit = fit_statistical_model(
    scores_df,
    anchor_mode=anchor_mode,
    anchor_benchmark=anchor_benchmark,
    anchor_difficulty=anchor_difficulty,
    anchor_slope=anchor_slope,
    anchor_model1=anchor_model1,
    anchor_model1_capability=anchor_model1_capability,
    anchor_model2=anchor_model2,
    anchor_model2_capability=anchor_model2_capability,
)
df1, df_cm1, df_db1 = original_fit

# Parsed copy of the 'date' column for date-based work.
df_cm1['date_obj'] = pd.to_datetime(df_cm1['date'])

`ftol` termination condition is satisfied.
Function evaluations 25, initial cost 3.4077e+01, final cost 2.4357e+00, first-order optimality 1.37e-04.


In [5]:
# Benchmark Splitting Experiment
# Split each benchmark into two random halves and fit single unified model

from copy import deepcopy

def create_split_dataset(scores_df, random_seed=42):
    """
    Split each benchmark's rows into two random halves and combine them into
    a single expanded dataset in which every benchmark becomes two benchmarks
    named "<benchmark>_1" and "<benchmark>_2".

    For a benchmark with an odd number of rows the "_2" half gets the extra
    row; a benchmark with a single row therefore only produces a "_2" half.

    The shuffle uses a private ``np.random.RandomState(random_seed)``. This
    yields the same permutations as seeding the global NumPy generator with
    ``random_seed`` would, but leaves the global generator untouched.

    Args:
        scores_df: DataFrame with at least a 'benchmark' column (typically
            ['benchmark', 'model', 'performance', ...]). It is not modified.
            If a 'benchmark_id' column is present, it is rewritten to
            "<first benchmark_id of the benchmark>_1" / "..._2" to match.
        random_seed: Seed for the private random generator (reproducibility).

    Returns:
        scores_df_expanded: New DataFrame with a fresh 0..n-1 index that
        contains every input row exactly once, with 'benchmark' (and
        'benchmark_id', if present) renamed to the split it was assigned to.
        An input with no rows yields an empty DataFrame with the same columns.
    """
    # Private generator: reproducible without reseeding np.random globally.
    rng = np.random.RandomState(random_seed)

    has_benchmark_id = 'benchmark_id' in scores_df.columns
    halves = []

    for benchmark in scores_df['benchmark'].unique():
        bench_data = scores_df[scores_df['benchmark'] == benchmark]

        # Shuffle row positions, then cut roughly in half.
        shuffled_positions = rng.permutation(len(bench_data))
        split_point = len(shuffled_positions) // 2
        positions_by_suffix = (
            ("1", shuffled_positions[:split_point]),
            ("2", shuffled_positions[split_point:]),
        )

        for suffix, positions in positions_by_suffix:
            # Skip empty halves (e.g. the "_1" half of a single-row
            # benchmark): they add no rows, and passing empty frames to
            # pd.concat is deprecated in recent pandas releases.
            if len(positions) == 0:
                continue

            half = bench_data.iloc[positions].copy()
            half['benchmark'] = f"{benchmark}_{suffix}"
            if has_benchmark_id:
                original_benchmark_id = bench_data['benchmark_id'].iloc[0]
                half['benchmark_id'] = f"{original_benchmark_id}_{suffix}"
            halves.append(half)

    if not halves:
        # pd.concat([]) raises ValueError; keep the schema for empty input.
        return scores_df.iloc[0:0].copy().reset_index(drop=True)

    return pd.concat(halves, ignore_index=True)

# Build the split-benchmark dataset and report how it differs from the input
print("Original scores_df shape:", scores_df.shape)
print("Number of unique benchmarks:", scores_df['benchmark'].nunique())

scores_df_expanded = create_split_dataset(scores_df, random_seed=42)
expanded_benchmark_names = scores_df_expanded['benchmark']

print(f"\nExpanded dataset shape: {scores_df_expanded.shape}")
print(f"Number of unique benchmarks in expanded dataset: {expanded_benchmark_names.nunique()}")

# Spot-check the split using the benchmark of the first row
example_benchmark = scores_df['benchmark'].iloc[0]
original_count = int((scores_df['benchmark'] == example_benchmark).sum())
split1_count = int((expanded_benchmark_names == f"{example_benchmark}_1").sum())
split2_count = int((expanded_benchmark_names == f"{example_benchmark}_2").sum())

print(f"\nExample for '{example_benchmark}':")
print(f"  Original: {original_count} models")
print(f"  Split 1 ('{example_benchmark}_1'): {split1_count} models")
print(f"  Split 2 ('{example_benchmark}_2'): {split2_count} models")

# List the alphabetically first few split benchmark names
print(f"\nFirst 10 benchmark names in expanded dataset:")
first_benchmark_names = sorted(expanded_benchmark_names.unique())[:10]
for bench in first_benchmark_names:
    print(f"  {bench}")


Original scores_df shape: (967, 11)
Number of unique benchmarks: 31

Expanded dataset shape: (967, 11)
Number of unique benchmarks in expanded dataset: 61

Example for 'GPQA diamond':
  Original: 77 models
  Split 1 ('GPQA diamond_1'): 38 models
  Split 2 ('GPQA diamond_2'): 39 models

First 10 benchmark names in expanded dataset:
  ANLI_1
  ANLI_2
  ARC AI2_1
  ARC AI2_2
  ARC-AGI_1
  ARC-AGI_2
  Aider polyglot_1
  Aider polyglot_2
  BBH_1
  BBH_2


In [6]:
# Refit the same statistical model, this time on the split-benchmark dataset
print("Fitting model to expanded dataset with split benchmarks...")

# Anchoring parameters are the ones used for the original fit. The anchor
# benchmark itself was split, so its "_1" half stands in for it here.
split_anchor_benchmark = f"{anchor_benchmark}_1"

expanded_fit = fit_statistical_model(
    scores_df_expanded,
    anchor_mode=anchor_mode,
    anchor_benchmark=split_anchor_benchmark,
    anchor_difficulty=anchor_difficulty,
    anchor_slope=anchor_slope,
    anchor_model1=anchor_model1,
    anchor_model1_capability=anchor_model1_capability,
    anchor_model2=anchor_model2,
    anchor_model2_capability=anchor_model2_capability,
)
df1_expanded, df_cm1_expanded, df_db1_expanded = expanded_fit

print("Model fitting completed for expanded dataset!")
print(f"Number of models fitted: {len(df_cm1_expanded)}")
print(f"Number of benchmarks fitted: {len(df_db1_expanded)}")

# Row counts should match between datasets; only the benchmark count changes
n_original_benchmarks = scores_df['benchmark'].nunique()
n_expanded_benchmarks = scores_df_expanded['benchmark'].nunique()
print(f"\nDataset size comparison:")
print(f"Original dataset: {len(scores_df)} rows, {n_original_benchmarks} benchmarks")
print(f"Expanded dataset: {len(scores_df_expanded)} rows, {n_expanded_benchmarks} benchmarks")


Fitting model to expanded dataset with split benchmarks...
`ftol` termination condition is satisfied.
Function evaluations 42, initial cost 3.4604e+01, final cost 2.2406e+00, first-order optimality 4.91e-04.
Model fitting completed for expanded dataset!
Number of models fitted: 149
Number of benchmarks fitted: 61

Dataset size comparison:
Original dataset: 967 rows, 31 benchmarks
Expanded dataset: 967 rows, 61 benchmarks


In [7]:
# Compare the fit on the original data with the fit on the split-benchmark data

# --- 1. Model capabilities --------------------------------------------------
print("=== MODEL CAPABILITY COMPARISON ===")
print(f"Number of models in original fit: {len(df_cm1)}")
print(f"Number of models in expanded fit: {len(df_cm1_expanded)}")

# One row per model with both capability estimates side by side. The outer
# join keeps models that appear in only one of the two fits (NaN in the other).
capability_comparison = pd.merge(
    df_cm1[['model', 'estimated_capability']].rename(
        columns={'estimated_capability': 'capability_original'}
    ),
    df_cm1_expanded[['model', 'estimated_capability']].rename(
        columns={'estimated_capability': 'capability_expanded'}
    ),
    on='model',
    how='outer',
)

capability_corr = (
    capability_comparison[['capability_original', 'capability_expanded']]
    .corr()
    .loc['capability_original', 'capability_expanded']
)
print(f"\nCapability correlation between original and expanded fits: {capability_corr:.4f}")

# --- 2. Benchmark difficulties of the two halves ----------------------------
print("\n=== BENCHMARK DIFFICULTY COMPARISON ===")
print(f"Number of benchmarks in original fit: {len(df_db1)}")
print(f"Number of benchmarks in expanded fit: {len(df_db1_expanded)}")

# Recover the source benchmark and the half ("1" or "2") from the "_1"/"_2" suffix
expanded_names = df_db1_expanded['benchmark_name']
df_db1_expanded['original_benchmark'] = expanded_names.str.replace('_[12]$', '', regex=True)
df_db1_expanded['split_number'] = expanded_names.str.extract('_([12])$')[0]

# Wide table: one row per source benchmark, one difficulty column per half,
# plus the difficulty from the original (unsplit) fit.
original_difficulties = df_db1[['benchmark_name', 'estimated_difficulty']].rename(
    columns={'benchmark_name': 'original_benchmark', 'estimated_difficulty': 'difficulty_original'}
)
split_comparison = (
    df_db1_expanded
    .pivot_table(
        index='original_benchmark',
        columns='split_number',
        values='estimated_difficulty',
        aggfunc='first',
    )
    .reset_index()
    .rename_axis(None, axis='columns')  # drop the leftover 'split_number' axis label
    .rename(columns={'1': 'difficulty_split1', '2': 'difficulty_split2'})
    .merge(original_difficulties, on='original_benchmark', how='left')
)

# Correlations, each over the rows where both columns involved are present
available_columns = set(split_comparison.columns)

if {'difficulty_split1', 'difficulty_split2'} <= available_columns:
    valid_splits = split_comparison.dropna(subset=['difficulty_split1', 'difficulty_split2'])
    if len(valid_splits) > 1:
        split_corr = (
            valid_splits[['difficulty_split1', 'difficulty_split2']]
            .corr()
            .loc['difficulty_split1', 'difficulty_split2']
        )
        print(f"Difficulty correlation between split 1 and split 2: {split_corr:.4f}")

if {'difficulty_original', 'difficulty_split1'} <= available_columns:
    valid_orig_split1 = split_comparison.dropna(subset=['difficulty_original', 'difficulty_split1'])
    if len(valid_orig_split1) > 1:
        orig_split1_corr = (
            valid_orig_split1[['difficulty_original', 'difficulty_split1']]
            .corr()
            .loc['difficulty_original', 'difficulty_split1']
        )
        print(f"Difficulty correlation between original and split 1: {orig_split1_corr:.4f}")

if {'difficulty_original', 'difficulty_split2'} <= available_columns:
    valid_orig_split2 = split_comparison.dropna(subset=['difficulty_original', 'difficulty_split2'])
    if len(valid_orig_split2) > 1:
        orig_split2_corr = (
            valid_orig_split2[['difficulty_original', 'difficulty_split2']]
            .corr()
            .loc['difficulty_original', 'difficulty_split2']
        )
        print(f"Difficulty correlation between original and split 2: {orig_split2_corr:.4f}")

# --- Preview of both comparison tables --------------------------------------
print("\n=== CAPABILITY COMPARISON TABLE (first 10 models) ===")
print(capability_comparison.head(10))

print("\n=== SPLIT BENCHMARK DIFFICULTY COMPARISON TABLE (first 10 benchmarks) ===")
print(split_comparison.head(10))


=== MODEL CAPABILITY COMPARISON ===
Number of models in original fit: 149
Number of models in expanded fit: 149

Capability correlation between original and expanded fits: 0.9984

=== BENCHMARK DIFFICULTY COMPARISON ===
Number of benchmarks in original fit: 31
Number of benchmarks in expanded fit: 61
Difficulty correlation between split 1 and split 2: 0.8470
Difficulty correlation between original and split 1: 0.9578
Difficulty correlation between original and split 2: 0.9359

=== CAPABILITY COMPARISON TABLE (first 10 models) ===
                 model  capability_original  capability_expanded
0  Baichuan-2-13B-Base             0.833178             0.769377
1   Baichuan-2-7B-Base             0.706684             0.637441
2     Cerebras-GPT-13B             0.419122             0.263450
3     Chinchilla (70B)             1.214574             1.155616
4          DeepSeek-R1             2.091282             2.253566
5     DeepSeek-R1-0528             2.181648             2.329036
6        

In [10]:
# Interactive Plotly views of the original-vs-expanded comparison
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import numpy as np


def _identity_line(lo, hi):
    """Dashed red y = x segment from (lo, lo) to (hi, hi), excluded from hover."""
    return go.Scatter(
        x=[lo, hi],
        y=[lo, hi],
        mode='lines',
        line={'dash': 'dash', 'color': 'red', 'width': 2},
        name='Perfect Correlation',
        hoverinfo='skip',
    )


# --- 1. Model capabilities: original fit (x) vs expanded fit (y) ------------
print("Creating interactive Model Capabilities plot...")

valid_mask = capability_comparison[['capability_original', 'capability_expanded']].notna().all(axis=1)
if valid_mask.any():
    df_plot = capability_comparison[valid_mask].copy()

    # Correlation over models that have an estimate in both fits
    corr = np.corrcoef(df_plot['capability_original'], df_plot['capability_expanded'])[0, 1]

    # Range covered by either axis, used for the y = x reference line
    min_val = min(df_plot['capability_original'].min(), df_plot['capability_expanded'].min())
    max_val = max(df_plot['capability_original'].max(), df_plot['capability_expanded'].max())

    capability_hover = (
        '<b>%{text}</b><br>'
        'Original Capability: %{x:.4f}<br>'
        'Expanded Capability: %{y:.4f}<br>'
        'Difference: %{customdata:.4f}<extra></extra>'
    )

    fig1 = go.Figure()
    fig1.add_trace(go.Scatter(
        x=df_plot['capability_original'],
        y=df_plot['capability_expanded'],
        mode='markers',
        marker={'size': 8, 'opacity': 0.7, 'color': 'blue'},
        text=df_plot['model'],  # model name shown on hover
        hovertemplate=capability_hover,
        customdata=df_plot['capability_expanded'] - df_plot['capability_original'],
        name='Models',
    ))
    fig1.add_trace(_identity_line(min_val, max_val))

    fig1.update_layout(
        title=f'Model Capabilities: Original vs Expanded Dataset<br><sub>Correlation: r = {corr:.4f}</sub>',
        xaxis_title='Original Model Capability',
        yaxis_title='Expanded Dataset Model Capability',
        width=700,
        height=500,
        hovermode='closest',
    )

    fig1.show()

# --- 2. Benchmark difficulties: split 1 (x) vs split 2 (y) ------------------
print("Creating interactive Benchmark Difficulties plot...")

valid_mask = split_comparison[['difficulty_split1', 'difficulty_split2']].notna().all(axis=1)
if valid_mask.any():
    df_plot2 = split_comparison[valid_mask].copy()

    # Correlation over benchmarks for which both halves were fitted
    corr2 = np.corrcoef(df_plot2['difficulty_split1'], df_plot2['difficulty_split2'])[0, 1]

    fig2 = go.Figure()

    # Signed and absolute gap between the two halves, surfaced in the hover box
    df_plot2['diff'] = df_plot2['difficulty_split1'] - df_plot2['difficulty_split2']
    df_plot2['abs_diff'] = df_plot2['diff'].abs()

    difficulty_hover = (
        '<b>%{text}</b><br>'
        'Split 1 Difficulty: %{x:.4f}<br>'
        'Split 2 Difficulty: %{y:.4f}<br>'
        'Difference: %{customdata[0]:.4f}<br>'
        'Abs Difference: %{customdata[1]:.4f}<extra></extra>'
    )

    fig2.add_trace(go.Scatter(
        x=df_plot2['difficulty_split1'],
        y=df_plot2['difficulty_split2'],
        mode='markers',
        marker={'size': 10, 'opacity': 0.7, 'color': 'green'},
        text=df_plot2['original_benchmark'],
        customdata=np.column_stack([df_plot2['diff'], df_plot2['abs_diff']]),
        hovertemplate=difficulty_hover,
        name='Benchmarks',
    ))

    # Range covered by either axis, used for the y = x reference line
    min_val2 = min(df_plot2['difficulty_split1'].min(), df_plot2['difficulty_split2'].min())
    max_val2 = max(df_plot2['difficulty_split1'].max(), df_plot2['difficulty_split2'].max())
    fig2.add_trace(_identity_line(min_val2, max_val2))

    fig2.update_layout(
        title=f'Benchmark Difficulties: Split 1 vs Split 2<br><sub>Correlation: r = {corr2:.4f}</sub>',
        xaxis_title='Split 1 Benchmark Difficulty',
        yaxis_title='Split 2 Benchmark Difficulty',
        width=700,
        height=500,
        hovermode='closest',
    )

    fig2.show()

# --- Summary statistics of the split-to-split difficulty gap ----------------
print("\n=== BENCHMARK SPLIT ANALYSIS ===")
# Note: these two columns are added to split_comparison itself
split_comparison['difficulty_difference'] = (
    split_comparison['difficulty_split1'] - split_comparison['difficulty_split2']
)
split_comparison['abs_difficulty_difference'] = split_comparison['difficulty_difference'].abs()

signed_gap = split_comparison['difficulty_difference']
absolute_gap = split_comparison['abs_difficulty_difference']
print(f"Mean absolute difference between splits: {absolute_gap.mean():.4f}")
print(f"Standard deviation of differences: {signed_gap.std():.4f}")
print(f"Max absolute difference: {absolute_gap.max():.4f}")

# The five benchmarks whose halves disagree the most
largest_diffs = split_comparison.nlargest(5, 'abs_difficulty_difference')[
    ['original_benchmark', 'difficulty_split1', 'difficulty_split2', 'difficulty_difference']
]
print(f"\nBenchmarks with largest difficulty differences between splits:")
print(largest_diffs)

# --- 3. Bonus: absolute gap per benchmark, ranked ---------------------------
print("\nCreating bonus plot: Difficulty differences by benchmark...")

if not split_comparison.empty:
    # Ascending order so the y position doubles as the rank by absolute gap
    df_sorted = split_comparison.sort_values('abs_difficulty_difference', ascending=True)

    gap_hover = (
        '<b>%{text}</b><br>'
        'Abs Difference: %{x:.4f}<br>'
        'Split 1: %{customdata[0]:.4f}<br>'
        'Split 2: %{customdata[1]:.4f}<extra></extra>'
    )

    fig3 = go.Figure()
    fig3.add_trace(go.Scatter(
        x=df_sorted['abs_difficulty_difference'],
        y=list(range(len(df_sorted))),
        mode='markers',
        marker={
            'size': 8,
            'color': df_sorted['abs_difficulty_difference'],
            'colorscale': 'Viridis',
            'showscale': True,
            'colorbar': {'title': "Abs Difference"},
        },
        text=df_sorted['original_benchmark'],
        hovertemplate=gap_hover,
        customdata=list(zip(df_sorted['difficulty_split1'], df_sorted['difficulty_split2'])),
        name='Benchmarks',
    ))

    fig3.update_layout(
        title='Benchmark Difficulty Differences Between Splits<br><sub>Sorted by Absolute Difference</sub>',
        xaxis_title='Absolute Difference in Difficulty',
        yaxis_title='Benchmark Rank (by difference)',
        width=700,
        height=600,
        hovermode='closest',
        yaxis={'showticklabels': False},  # the y value is only a rank, so tick labels add nothing
    )

    fig3.show()


Creating interactive Model Capabilities plot...


Creating interactive Benchmark Difficulties plot...



=== BENCHMARK SPLIT ANALYSIS ===
Mean absolute difference between splits: 0.5562
Standard deviation of differences: 0.9959
Max absolute difference: 3.4432

Benchmarks with largest difficulty differences between splits:
               original_benchmark  difficulty_split1  difficulty_split2  \
15                       GeoBench           1.464626          -1.978595   
20                        OSWorld           2.876442           5.638868   
7                           CSQA2           0.514904          -1.129768   
19                     OSUniverse           3.913141           2.420202   
11  Factorio learning environment           4.082951           2.826111   

    difficulty_difference  
15               3.443221  
20              -2.762426  
7                1.644672  
19               1.492939  
11               1.256840  

Creating bonus plot: Difficulty differences by benchmark...


In [8]:
# Grouped bar chart: Original vs Split 1 vs Split 2 difficulty per benchmark
import plotly.express as px

# Column in split_comparison -> legend label, in display order
label_map = {
    'difficulty_original': 'Original',
    'difficulty_split1': 'Split 1',
    'difficulty_split2': 'Split 2',
}

# Fail early with a clear message if an upstream cell did not produce these columns
required_cols = ['original_benchmark', 'difficulty_original', 'difficulty_split1', 'difficulty_split2']
missing_cols = [col for col in required_cols if col not in split_comparison.columns]
if missing_cols:
    raise ValueError(f"Missing columns in split_comparison: {missing_cols}")

# x-axis order: ascending original difficulty, benchmarks without one at the end
sorted_by_original = split_comparison.sort_values('difficulty_original', na_position='last')
benchmark_order = sorted_by_original['original_benchmark'].tolist()

# Long format: one row per (benchmark, dataset) pair, with readable dataset labels
long_df = split_comparison.melt(
    id_vars='original_benchmark',
    value_vars=list(label_map),
    var_name='dataset',
    value_name='estimated_difficulty',
)
long_df['dataset'] = long_df['dataset'].map(label_map)

fig = px.bar(
    long_df,
    x='original_benchmark',
    y='estimated_difficulty',
    color='dataset',
    barmode='group',
    category_orders={
        'original_benchmark': benchmark_order,
        'dataset': list(label_map.values()),
    },
    hover_data={'original_benchmark': True, 'estimated_difficulty': ':.4f', 'dataset': True},
)

fig.update_layout(
    title='Benchmark Difficulties: Original vs Split 1 vs Split 2',
    xaxis_title='Benchmark',
    yaxis_title='Estimated Difficulty',
    width=1200,
    height=650,
    legend_title_text='Dataset',
)
# Slant the benchmark names on the x axis
fig.update_xaxes(tickangle=-45)

fig.show()
