In [2]:
from datetime import datetime
import kaleido  # needed for saving plots
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

In [3]:
def save_plot(fig, folder, filename, extensions=['png', 'svg', 'pdf'], scale=2):
    for ext in extensions:
        fig.write_image(folder + filename + '.' + ext, scale=scale)
    fig.write_html(folder + filename + '.html')

In [4]:
url = "https://docs.google.com/spreadsheets/d/1etu9rXcME0uUA-S2ANA8bsfQbIZgNu-8NxqFGQdDIzQ/export?format=csv&gid=1305280917#gid=1305280917"

df = pd.read_csv(url)

In [5]:
df['Date'] = pd.to_datetime(df['Date'], format='mixed')
df

Unnamed: 0,System,Model size (parameters),Dataset size,Date,Open/Closed,Training compute (FLOP),Training compute notes,BBH,GPQA,MMLU,...,SEAL Math,LMSys Elo,LMSys Elo Notes,LMSys Elo 95% CI,BBH Notes,GPQA Notes,MMLU Notes,HELM MMLU Notes,Trust in benchmark results,Trust notes
0,Random chance,,,NaT,,1.000000e+20,,0.2500,0.250,0.2500,...,0.0,0.0,,,,,,,0,
1,BLOOM-176B,1.760000e+11,3.900000e+11,2022-11-09,Open,4.120000e+23,,0.4491,,0.3913,...,,,,,,,,,0,
2,BloombergGPT,5.000000e+10,7.080000e+11,2023-03-30,Closed,2.120000e+23,,0.4197,,0.3918,...,,,,,,,,,0,
3,ChatGLM-6B,6.000000e+09,,2023-03-01,Open,,,0.1873,,,...,,880.0,,,,,,,0,
4,ChatGLM2-12B-base,1.200000e+10,,2023-06-25,Open,,,0.3602,,,...,,,,,,,,,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,XVerse-65B,,,2023-11-05,Open,,,,,,...,,,,,,,,,0,
99,XVerse-7B,,,2023-09-26,Open,,,,,,...,,,,,,,,,0,
100,Yi-34B,3.400000e+10,3.000000e+12,2023-11-02,Open,6.120000e+23,,0.5430,0.370,0.7635,...,,1111.0,chat,,,"Not sure this is zero-shot CoT, OpenLLM2 Leade...",,,0,
101,Yi-6B,6.000000e+09,3.000000e+12,2023-11-02,Open,1.080000e+23,,0.4280,,0.6385,...,,,,,,,,,0,


In [6]:
# Filter out finetuned systems

finetuned_systems = [
  'Layer Normalization: Handwriting sequence generation',
 'ULM-FiT',
 'ADP-FAIRSEQ + NGRAMRES',
 'Cross-lingual alignment',
 'UnifiedQA',
 '$\\infty$-former (SM)',
 'FLAN 137B',
 'AlphaFold-Multimer',
 'Masked Autoencoders',
 'Contriever',
 'BERT-RBP',
 'Minerva',
 'BlenderBot 3',
 'PaLM-SayCan',
 'NMST+GPT-2',
 'Decaying Fast Weights Transformer (WT-103)',
 'GPT-2 + Progressive LRD',
 'U-PaLM',
 'Flan-T5 11B',
 'Flan-PaLM 540B',
 'Taiyi-Stable Diffusion',
 'OPT-IML (175B)',
 'SparseOPT-175B',
 'DiT-XL/2',
 'VideoMAE V2',
 'Segment Anything Model',
 'gLM',
 'MOSS-Moon-003',
 'WizardLM-7B',
 'InstructBLIP',
 'Guanaco-65B',
 'WizardCoder-15.5B',
 'Code Llama-34B',
 'Code Llama-7B',
 'TigerBot-70B',
 'MiniGPT4 (Vicuna finetune)',
 'LLaMA-7B (protein-oriented instructions finetuned)',
 'FinGPT-13B',
 'LLaVA 1.5',
 'CogVLM',
 'Volcano 13B',
 'SPHINX (Llama 2 13B)',
 'Orca 2-13B',
 'Llama Guard',
 'FunSearch',
 'Elyza',
 'Code Llama-70B',
 'Swallow'
]

df = df[~df['System'].isin(finetuned_systems)]
df = df[~df['System'].str.contains('Flan')]
df = df[~df['System'].str.contains('FLAN')]

# MMLU vs. GPQA

In [7]:
df['MMLU (log-ratio)'] = -np.log(1/df['MMLU'] - 1)
df['GPQA (log-ratio)'] = -np.log(1/df['GPQA'] - 1)

In [8]:
color_map = {'Open': 'blue', 'Closed': 'darkorange'}

# Create the plot
fig = px.scatter(df, x='MMLU (log-ratio)', y='GPQA (log-ratio)', color='Open/Closed',
                 title='MMLU vs. GPQA for Open and Closed Models',
                 labels={'MMLU': 'MMLU Score', 'GPQA': 'GPQA Score', 'Date': 'Date', 'System': 'Model'},
                 hover_data=['System', 'Date', 'MMLU', 'GPQA'],
                 color_discrete_map=color_map)

# x limits
# fig.update_xaxes(range=[0.6, 1])
fig.update_xaxes(range=[0.5, 2.5])

# Improve the layout
fig.update_layout(
    width=800,
    height=400,
    xaxis_title="MMLU (log-ratio)",
    yaxis_title="GPQA (log-ratio)",
    legend_title="Model accessibility",
    font=dict(size=12),
    hovermode="closest"
)

# Show the plot
fig.show()

# LMSys leaderboard

In [9]:
lmsys_leaderboard_bootstrap_elo_lu = pd.read_csv('https://docs.google.com/spreadsheets/d/12zpanuQ1Vf_ZsZ6yjIUwsN7uGPBv3ChLnEOH-g9yZDA/export?format=csv')
lmsys_leaderboard_bootstrap_elo_lu = lmsys_leaderboard_bootstrap_elo_lu.iloc[:, 1:]  # remove first column which is meaningless
lmsys_leaderboard_bootstrap_elo_lu

Unnamed: 0,gpt-4o-2024-05-13,gpt-4o-mini-2024-07-18,claude-3-5-sonnet-20240620,gemini-advanced-0514,llama-3.1-405b-instruct,gemini-1.5-pro-api-0514,gemini-1.5-pro-api-0409-preview,gpt-4-turbo-2024-04-09,gpt-4-1106-preview,claude-3-opus-20240229,...,mpt-7b-chat,chatglm2-6b,RWKV-4-Raven-14B,alpaca-13b,oasst-pythia-12b,chatglm-6b,fastchat-t5-3b,stablelm-tuned-alpha-7b,dolly-v2-12b,llama-13b
0,1286.563455,1279.608815,1274.564379,1266.819548,1263.930629,1262.800113,1256.751200,1259.245346,1251.302638,1246.760897,...,924.081608,930.543839,918.995447,901.694532,895.192400,883.743569,867.977489,837.073711,818.981457,808.397845
1,1288.428940,1287.494945,1274.828503,1268.867005,1263.724865,1263.094929,1258.320223,1258.824223,1252.959457,1250.215882,...,935.186934,920.658893,920.372902,902.614513,894.466683,882.769816,864.055190,840.495344,834.205668,808.046396
2,1287.656293,1281.667695,1275.934504,1268.043471,1261.739568,1261.318913,1258.466691,1258.400212,1254.262636,1250.576760,...,929.251400,923.474876,922.648498,902.220662,894.705996,882.850605,866.929706,847.122558,827.818429,800.490271
3,1284.869154,1279.653476,1269.833920,1270.960974,1269.859096,1260.875571,1257.971376,1257.228266,1251.218192,1249.384635,...,924.035959,922.679121,927.995020,907.170559,897.610422,886.338026,869.905058,842.907147,822.159108,802.589771
4,1285.265726,1282.100890,1268.621384,1266.144561,1262.945738,1260.335847,1257.264334,1257.701172,1248.990652,1247.745614,...,932.584454,915.986844,925.938795,907.229511,892.749966,885.482519,870.742351,844.870609,836.226871,792.792144
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1285.964492,1276.413219,1272.279060,1265.682928,1259.271852,1259.780934,1257.705533,1255.287783,1253.316087,1249.462305,...,923.560224,923.189484,917.675145,905.453634,895.762404,881.942653,870.194162,848.375467,827.956177,791.777917
96,1285.769193,1280.955978,1272.596400,1265.313367,1265.510474,1259.210643,1257.968203,1255.820464,1249.073128,1246.840205,...,926.643587,917.478913,922.521962,902.473931,895.742491,876.732514,866.833976,841.230148,817.499211,799.746847
97,1283.361066,1279.543617,1269.244543,1263.704350,1262.416331,1258.528302,1255.166021,1256.156088,1249.962949,1247.240214,...,923.772273,912.598287,917.762886,891.914471,893.081979,872.800000,861.065138,833.636823,818.914647,789.664182
98,1286.871546,1280.301802,1271.749040,1270.644211,1260.150249,1261.494653,1257.594055,1259.429122,1253.718693,1249.689318,...,920.938684,935.274025,917.555680,900.601728,889.143356,884.380531,866.582044,840.679533,821.720833,802.801129


In [10]:
# Alphabetical order
lmsys_leaderboard_bootstrap_elo_lu.mean().round().sort_index()

Unnamed: 0,0
RWKV-4-Raven-14B,922.0
alpaca-13b,902.0
athene-70b-0725,1246.0
bard-jan-24-gemini-pro,1208.0
chatglm-6b,880.0
...,...
yi-large,1212.0
yi-large-preview,1240.0
zephyr-7b-alpha,1042.0
zephyr-7b-beta,1053.0


## Trust in evaluations

In [11]:
# Models whose evaluations we have a concrete reason to distrust
list(df[df['Trust in benchmark results'] < 0]['System'])

['DBRX-Instruct',
 'DeepSeek-67B',
 'Falcon 180B',
 'gpt-4-0125-preview',
 'gpt-4-0613',
 'Llama 3 8B',
 'Mistral-7B',
 'Mixtral8x22B',
 'Reka Core']

In [12]:
# Models whose evaluations we have a concrete reason to trust
list(df[df['Trust in benchmark results'] > 0]['System'])

['Claude 2',
 'Claude 3 Opus',
 'Claude 3 Sonnet',
 'Gemini 1.0 Pro',
 'Gemini 1.5 Pro (April 2024)',
 'GPT-3.5-turbo-16k',
 'GPT-4 (original)',
 'gpt-4-turbo-2024-04-09',
 'Llama 3 70B',
 'Mistral Large']

# Plot benchmark data

In [28]:
benchmarks_to_plot = ['MMLU', 'GPQA', 'LMSys Elo', 'SEAL Coding']
non_suspects_only = True
trusted_only = False
old_models_only = False
new_models_only = False
separate_new_models = True
save = False

color_map = {'Open': 'blue', 'Closed': 'darkorange'}
marker_map = {'Before': 'circle', 'After': 'diamond'}

In [29]:
df['Before November 2023'] = ['Before' if pd.to_datetime(row['Date']) < pd.to_datetime('2023-11-01') else 'After' for i, row in df.iterrows()]

## Date

In [32]:
fig = make_subplots(rows=2, cols=2, subplot_titles=benchmarks_to_plot, vertical_spacing=0.15)

# Define x limits for each subplot
x_limits = {
  'MMLU': [datetime(2020, 1, 1), datetime(2025, 1, 1)],
  'GPQA': [datetime(2023, 1, 1), datetime(2025, 1, 1)],
  'SEAL Coding': [datetime(2023, 1, 1), datetime(2025, 1, 1)],
  'SEAL Math': [datetime(2023, 1, 1), datetime(2025, 1, 1)],
  'LMSys Elo': [datetime(2023, 1, 1), datetime(2025, 1, 1)],
}

for i, bench in enumerate(benchmarks_to_plot):
  plot_df = df[~(df['System'] == 'Random chance')]
  if old_models_only:
    plot_df = plot_df[plot_df['Before November 2023'] == 'Before']
  elif new_models_only:
    plot_df = plot_df[plot_df['Before November 2023'] == 'After']
  if non_suspects_only:
    plot_df = plot_df[plot_df['Trust in benchmark results'] >= 0]
  elif trusted_only:
    plot_df = plot_df[plot_df['Trust in benchmark results'] > 0]

  for category in ['Open', 'Closed']:
    category_df = plot_df[plot_df['Open/Closed'] == category]

    if separate_new_models:
      for age in ['Before', 'After']:
        age_df = category_df[category_df['Before November 2023'] == age]
        fig.append_trace(
          go.Scatter(
            x=age_df['Date'],
            y=age_df[bench],
            mode='markers',
            marker=dict(color=color_map[category], symbol=marker_map[age]),
            text=age_df['System'],
            name=category + f' ({age})',
            legendgroup=category,
            showlegend=True if i == 0 else False
          ),
          row=i//2 + 1, col=i%2 + 1
        )
    else:
      fig.append_trace(
        go.Scatter(
          x=category_df['Date'],
          y=category_df[bench],
          mode='markers',
          marker=dict(color=color_map[category]),
          text=category_df['System'],
          name=category,
          legendgroup=category,
          showlegend=True if i == 0 else False
        ),
        row=i//2 + 1, col=i%2 + 1
      )

  # Update x and y axes for this subplot
  fig.update_xaxes(
    title_text="Model publication date" if i//2 + 1 == 2 else None,
    range=[x_limits[bench][0], x_limits[bench][1]],  # Set x limits
    row=i//2 + 1,
    col=i%2 + 1,
    dtick="M12",  # Set tick marks to every 12 months
    tickformat="%Y",  # Display only the year
    ticklabelmode="period",  # Ensure labels are centered on the year
    tickangle=0  # Make tick labels horizontal
  )

  if i%2 + 1 == 1:
    fig.update_yaxes(title_text="Score", row=i//2 + 1, col=i%2 + 1)

# Improve the layout
fig.update_layout(
  width=600,
  height=400,
  legend_title="Model accessibility",
  font=dict(size=12),
  hovermode="closest",
)

# Margins
fig.update_layout(
  margin=dict(l=0, r=0, t=20, b=0)
)

# Save the plot
if save:
  save_plot(fig, '', 'benchmark_dates')

# Show the plot
fig.show()

## Compute

In [39]:
df['Date'].astype(int)

Unnamed: 0,Date
0,-9223372036854775808
1,1667952000000000000
2,1680134400000000000
3,1677628800000000000
4,1687651200000000000
...,...
98,1699142400000000000
99,1695686400000000000
100,1698883200000000000
101,1698883200000000000


In [54]:
fig = make_subplots(rows=2, cols=2, subplot_titles=benchmarks_to_plot, vertical_spacing=0.15)

# Define x limits for each subplot
x_limits = {
  'MMLU': [1e20, 1e26],
  'GPQA': [1e23, 1e26],
  'SEAL Math': [1e23, 1e26],
  'SEAL Coding': [1e23, 1e26],
  'LMSys Elo': [1e22, 1e26],
}

for i, bench in enumerate(benchmarks_to_plot):
  plot_df = df[~(df['System'] == 'Random chance')]
  if old_models_only:
    plot_df = plot_df[plot_df['Before November 2023'] == 'Before']
  elif new_models_only:
    plot_df = plot_df[plot_df['Before November 2023'] == 'After']
  if non_suspects_only:
    plot_df = plot_df[plot_df['Trust in benchmark results'] >= 0]
  elif trusted_only:
    plot_df = plot_df[plot_df['Trust in benchmark results'] > 0]

  for category in ['Open', 'Closed']:
    category_df = plot_df[plot_df['Open/Closed'] == category]

    if separate_new_models:
      fig.append_trace(
        go.Scatter(
          x=category_df['Training compute (FLOP)'],
          y=-np.log(1 - category_df[bench]),
          mode='markers',
          marker=dict(color=category_df['Date'].astype(int),),
          text=category_df['System'],
          # name=category,
          # legendgroup=category,
          # showlegend=True if i == 0 else False
          showlegend=False,
        ),
        row=i//2 + 1, col=i%2 + 1
      )
    else:
      fig.append_trace(
        go.Scatter(
          x=category_df['Training compute (FLOP)'],
          y=category_df[bench],
          mode='markers',
          marker=dict(color=color_map[category]),
          text=category_df['System'],
          name=category,
          legendgroup=category,
          showlegend=True if i == 0 else False
        ),
        row=i//2 + 1, col=i%2 + 1
      )

    # Plot scores as horizontal lines for models that don't have compute
    # for _, row in category_df.iterrows():
    #   if pd.isna(row['Training compute (FLOP)']) and not pd.isna(row[bench]):
    #     fig.append_trace(
    #       go.Scatter(
    #         x=[0, 1e26],
    #         y=[row[bench], row[bench]],
    #         mode='lines',
    #         line=dict(color=color_map[category]),
    #         showlegend=False,
    #       ),
    #       row=i//2 + 1, col=i%2 + 1
    #     )

  # Update x and y axes for this subplot
  fig.update_xaxes(
    title_text="Training compute (FLOP)" if i//2 + 1 == 2 else None,
    type='log',
    range=[np.log10(x_limits[bench][0]), np.log10(x_limits[bench][1])],  # Set x limits
    tickmode='linear',
    dtick=2,  # This sets ticks at every two powers of 10
    row=i//2 + 1,
    col=i%2 + 1
  )

  if i%2 + 1 == 1:
    fig.update_yaxes(title_text="Score", row=i//2 + 1, col=i%2 + 1)

# Improve the layout
fig.update_layout(
  width=600,
  height=400,
  # legend_title="Model accessibility",
  font=dict(size=12),
  hovermode="closest",
)

# Margins
fig.update_layout(
  margin=dict(l=0, r=0, t=20, b=0)
)

# Save the plot
if save:
  save_plot(fig, '', 'benchmark_compute')

# Show the plot
fig.show()


invalid value encountered in log


invalid value encountered in log


invalid value encountered in log


invalid value encountered in log



# Regression

In [17]:
import numpy as np
from tqdm import tqdm
import pandas as pd
from scipy import stats, optimize, interpolate
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
import scipy
from matplotlib.ticker import MultipleLocator
from matplotlib.ticker import MultipleLocator, FuncFormatter
from scipy.stats import dirichlet
from scipy.optimize import curve_fit
from collections import defaultdict
import itertools
%matplotlib inline

In [18]:
# Create a Series with the correct index (column names)
new_row = pd.Series({'Approx Compute (FLOP)': 1e20, 'MMLU': 0.25, 'BBH': 0.25, 'System': 'Random chance', 'Before November 2023': 'Before Nov 2023'},
                     index=df.columns)  # Align with DataFrame columns

df.loc[len(df)] = new_row

In [19]:
df['log_compute'] = np.log10(df['Approx Compute (FLOP)'])

KeyError: 'Approx Compute (FLOP)'

In [None]:
# Define the sigmoid function for benchmark fitting at an offset
def custom_sigmoid(x, x0, k, b):
  L = 1 - b  # Ensuring L + b <= 1
  return L / (1 + np.exp(-k * (x - x0))) + b

In [None]:
# Define a custom formatter function that formats numbers as integers
def custom_formatter(x, pos):
  return f"1e{int(x)}"

In [None]:
# df = df[df['Before November 2023'] == 'Before Nov 2023']

In [None]:
results = defaultdict(dict)

# Benchmarks to plot
"""
TODO Look for recent benchmarks that are more reliable.
It's a red flag if there's some model that's really strong in MMLU for its compute,
and then that's not the case in the more reliable benchmark.
MMLU-Pro?
Consider error bars on benchmark results - error bars for one model would be similar for others.
"""
benchmarks_to_plot = ['MMLU']#, 'BBH']#, 'GPQA', 'SEAL Math', 'SEAL Coding', 'SEAL Instruction Following']

# Number of rows and columns for the subplot grid
n_rows = 1
n_cols = 2

# Creating a figure and axis objects
fig, axs = plt.subplots(n_rows, n_cols, figsize=(15/1.75, 10/1.75))

# Flattening the axis array for easy iteration
axs = axs.flatten()

# Plotting each benchmark in its own subplot
for i, benchmark in enumerate(benchmarks_to_plot):
  for j, category in enumerate(['Open', 'Closed']):
    print(category, 'models')
    try:
      # Creating a subset with non-NaN values for the benchmark
      subset = df[df['Open/Closed'] == category]
      subset = subset[['System', 'log_compute', benchmark]]
      subset = subset.dropna()

      # Scatter plot
      axs[i].scatter(subset['log_compute'], subset[benchmark], label=f"{benchmark}: {category}")

      # Fit the custom sigmoid function to the data, if there are enough data points
      if len(subset) > 3:
        # Provide initial parameter estimates: x0 = median of x, k = 1
        initial_guesses = [subset['log_compute'].median(), 1]

        # Define bounds: x0 broadly around sensible compute OOMs, k > 0, b between 0 and 0.5
        bounds = ((15, 0.1), (32, np.inf))
        try:
          # Sigmoid with offset at the lowest performance, which we've calibrated as chance performance with a pseudo-entry.
          bench_sigmoid = lambda x, *p: custom_sigmoid(x, *p, b=df[df["System"]=="Random chance"][benchmark].values)
          popt, pcov = curve_fit(bench_sigmoid, subset['log_compute'], subset[benchmark], p0=initial_guesses, bounds=bounds, maxfev=10000)

          # Access the fitted parameters
          x0_optimal = popt[0]
          k_optimal = popt[1]

          # Calculate standard errors
          perr = np.sqrt(np.diag(pcov))
          x0_std_error = perr[0]
          k_std_error = perr[1]

          results[benchmark][category] = {'parameters': popt, 'covariance': pcov}

          # Print the parameters
          print(f"Optimal x0: {x0_optimal:.2f} ± {x0_std_error:.2f}")
          print(f"Optimal k: {k_optimal:.2f} ± {k_std_error:.2f}")

          # Generate x values for the curve
          x_values = np.linspace(20, 26 + 1, 100)
          # Calculate y values for the custom sigmoid curve
          y_values = bench_sigmoid(x_values, *popt)

          # Adding the custom sigmoid curve to the plot
          axs[i].plot(x_values, y_values, color=f'C{j}', label='')
        except RuntimeError as e:
          print(f"Could not fit a custom sigmoid for {benchmark}: {e}")

      axs[i].set_xlabel('Training compute (FLOP)')
      axs[i].set_ylabel('Score')
      axs[i].grid(True)
      axs[i].legend()
      # axs[i].set_xlim([22, 27])
    except ValueError as e:
      print(f"Could not fit a custom sigmoid for {benchmark}: {e}")

# Add Gemini to the legend
axs[i].legend(loc='lower right')

# Iterate over each axis in the axs array
for ax in axs:
  # Set the major locator for the x-axis to have a tick every 1 unit
  # ax.xaxis.set_major_locator(MultipleLocator(1))
  # Set the formatter for the x-axis
  ax.xaxis.set_major_formatter(FuncFormatter(custom_formatter))

#fig.suptitle('Estimated FLOP from benchmark scores: {:.2e}'.format(median_value), fontsize=16)

# Remove empty subplots if there are any
for j in range(i+1, len(axs)):
  fig.delaxes(axs[j])

# Adjusting layout to prevent overlap
plt.tight_layout()

plt.show()

In [None]:
custom_sigmoid(26, 24.19, 1.17, 0.25)

In [None]:
custom_sigmoid(25.1, 23.67, 1.48, 0.25)

In [None]:
benchmark = 'MMLU'

root_scalar_options = {
    'method': 'bisect',  # Use the 'bisect' method
    'bracket': [15, 32]
}

closed_params = results[benchmark]['Closed']['parameters']
open_params = results[benchmark]['Open']['parameters']
target_flop = np.log10(5e25)
target_score = custom_sigmoid(target_flop, closed_params[0], closed_params[1], 0.25)  # closed model parameters

bench_sigmoid = lambda x, *p: custom_sigmoid(x, *p, b=df[df["System"]=="Random chance"][benchmark].values)
matching_flop = scipy.optimize.root_scalar(lambda x: bench_sigmoid(x, *open_params) - target_score, **root_scalar_options).root
print(target_flop - matching_flop)