# Setup

In [1]:
import json
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt

In [2]:
# Location of the CSV export that this notebook analyses.
data_url = 'https://epochai.org/data/epochdb/all_systems.csv'
# Read the compute column as float64 explicitly.
dtypes = {'Training compute (FLOP)': np.float64}
pcd_df = pd.read_csv(data_url, dtype=dtypes)

# Approximate fractional year of publication: every month counts as 1/12 of a
# year and every day as 1/365 of a year, regardless of actual month lengths.
published = pd.to_datetime(pcd_df['Publication date'])
year_part = published.dt.year
month_part = (published.dt.month - 1) / 12
day_part = (published.dt.day - 1) / 365
pcd_df['Decimal year'] = year_part + month_part + day_part

In [3]:
# Preview the table as loaded, before any sorting or filtering.
pcd_df

Unnamed: 0,System,Domain,Organization,Publication date,Reference,Link,Parameters,Training dataset,Training dataset notes,Abstract,...,Training time notes,Training time (hours),Batch size,Batch size notes,Base model,Finetune compute (FLOP),Training compute upper bound,Archived links,Benchmark data,Decimal year
0,Qwen 1.5 110B,Language,Alibaba,2024-04-25,Qwen1.5-110B: The First 100B+ Model of the Qwe...,https://qwenlm.github.io/blog/qwen1.5-110b/?re...,1.100000e+11,Unspecified unreleased,We pretrained the models with a large amount o...,The Qwen1.5-110B is the largest model in the Q...,...,,,,,,,,,,2024.315753
1,phi-3-medium 14B,Language,Microsoft,2024-04-23,Phi-3 Technical Report: A Highly Capable Langu...,https://arxiv.org/abs/2404.14219,1.400000e+10,Unspecified unreleased,"we also trained phi-3-medium, a model with 14B...","We introduce phi-3-mini, a 3.8 billion paramet...",...,,,,,,,,,,2024.310274
2,Llama 3-70B,Language,Meta AI,2024-04-18,Introducing Meta Llama 3: The most capable ope...,https://ai.meta.com/blog/meta-llama-3/\n\nhttp...,7.000000e+10,,,,...,,,,,,,,,,2024.296575
3,Mixtral 8x22B,Language,Mistral AI,2024-04-17,Mixtral 8x22B,https://mistral.ai/news/mixtral-8x22b/,1.760000e+11,,,Mixtral 8x22B is our latest open model. It set...,...,,,,,,,,,,2024.293836
4,SIMA,"Games,Video",Google DeepMind,2024-04-17,Scaling Instructable Agents Across Many Simula...,https://arxiv.org/abs/2404.10179,,,,,...,,,,,,,,,,2024.293836
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1257,Sequence-based pattern recognition,Vision,Massachusetts Institute of Technology (MIT),1955-03-01,Pattern recognition and modern computers,https://dl.acm.org/doi/10.1145/1455292.1455310,,,,,...,,,,,,,,,,1955.166667
1258,Self Organizing System,Other,Massachusetts Institute of Technology (MIT),1955-03-01,Generalization of pattern recognition in a sel...,https://dl.acm.org/doi/10.1145/1455292.1455309,2.250000e+02,,,,...,,,,,,,,,,1955.166667
1259,Genetic algorithm,Other,Institute for Advanced Study,1954-07-02,Numerical testing of evolution theories,https://link.springer.com/article/10.1007/BF01...,,,,,...,,,,,,,,,,1954.502740
1260,SNARC,Robotics,Harvard University,1952-01-08,A Neural-Analogue Calculator Based upon a Prob...,https://en.wikipedia.org/wiki/Stochastic_neura...,4.000000e+01,,,,...,,,,,,,,,,1952.019178


In [4]:
# Replace the raw publication-date column with parsed datetimes so that the
# date comparisons and window arithmetic further down operate on timestamps.
parsed_dates = pd.to_datetime(pcd_df['Publication date'])
pcd_df['Publication date'] = parsed_dates

In [5]:
# Order the table chronologically (in place). The running compute record
# computed further down walks the rows top to bottom and relies on this order.
pcd_df.sort_values('Publication date', inplace=True)

In [6]:
# Drop (in place) every row that is missing a publication date, a notability
# criterion or a training-compute value, so the compute-based analyses below
# never encounter NaN in those columns.
pcd_df.dropna(subset=['Publication date', 'Notability criteria', 'Training compute (FLOP)'], inplace=True)

In [7]:
# Preview the table after date parsing, chronological sorting and the removal
# of incomplete rows.
pcd_df

Unnamed: 0,System,Domain,Organization,Publication date,Reference,Link,Parameters,Training dataset,Training dataset notes,Abstract,...,Training time notes,Training time (hours),Batch size,Batch size notes,Base model,Finetune compute (FLOP),Training compute upper bound,Archived links,Benchmark data,Decimal year
1261,Theseus,Robotics,Bell Laboratories,1950-07-02,Mighty Mouse,https://www.technologyreview.com/2018/12/19/13...,4.000000e+01,,,,...,,,,,,,,,,1950.502740
1255,Perceptron Mark I,Other,"Cornell Aeronautical Laboratory,Cornell Univer...",1957-01-01,The Perceptron—a perceiving and recognizing au...,https://blogs.umass.edu/brain-wars/files/2016/...,1.000000e+03,,,,...,,,,,,,,,,1957.000000
1254,Pandemonium (morse),Language,Massachusetts Institute of Technology (MIT),1959-02-01,Pandemonium: A Paradigm for Learning,https://aitopics.org/doc/classics:504E1BAC/,,,,,...,,,,,,,,,,1959.083333
1253,Samuel Neural Checkers,Games,IBM,1959-07-01,Some studies in machine learning using the gam...,https://ieeexplore.ieee.org/abstract/document/...,1.600000e+01,,,,...,,,,,,,,,,1959.500000
1251,Perceptron (1960),Vision,Cornell Aeronautical Laboratory,1960-03-30,Perceptron Simulation Experiments,https://www.semanticscholar.org/paper/Perceptr...,1.000000e+03,,,"An experimental simulation program, which has ...",...,,,,,,,,,,1960.246119
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,FunSearch,"Language,Search",Google DeepMind,2023-12-14,Mathematical discoveries from program search w...,https://www.nature.com/articles/s41586-023-069...,1.500000e+10,,"""The experiments carried out in this paper do ...",Large language models (LLMs) have demonstrated...,...,"Appendix A.5: ""To reproduce admissible set exp...",48.0,,,PaLM 2,0.0,,,,2023.952283
22,MegaScale (Production),Language,"ByteDance,Peking University",2024-02-23,MegaScale: Scaling Large Language Model Traini...,https://arxiv.org/abs/2402.15627,5.300000e+11,,,"We present the design, implementation and engi...",...,"Speculative. Authors state ""several weeks"". Fo...",504.0,,,,,,,,2024.143607
15,Inflection-2.5,Language,Inflection AI,2024-03-07,Inflection-2.5: meet the world's best personal AI,https://inflection.ai/inflection-2-5,,,,"At Inflection, our mission is to create a pers...",...,,,,,,,,,,2024.183105
14,MM1-30B,Multimodal,Apple,2024-03-14,"MM1: Methods, Analysis & Insights from Multimo...",https://arxiv.org/abs/2403.09611,3.000000e+10,,"Text, captioned images. See Table 2","In this work, we discuss building performant M...",...,,,,,,,,,,2024.202283


In [8]:
# Total width of the moving window centred on each system's publication date;
# the z-score and percentile sections compare a system against the other
# systems published inside this window.
outlier_window_size = 2  # years

In [9]:
# Cut-off date for the "large scale" era: the filters below that use this value
# keep only systems published strictly after it.
start_large_scale_era = '2015-10-01'

# Top n all-time most compute-intensive

In [None]:
# For every n in 1..20, flag the systems that were among the n most
# compute-intensive systems released so far at some point in time.
# The result is one boolean column per n: 'top_{n}_at_release'.
top_n_values = range(1, 21)
for n in top_n_values:
    # Reset the flags so that re-running the cell starts from a clean state.
    pcd_df[f'top_{n}_at_release'] = False

# The set of systems "released so far" depends only on the cut-off date, not on
# n or on the individual row, so it is ranked once per distinct date instead of
# once per (n, row) pair.
for cutoff_year in pcd_df['Decimal year'].unique():
    released_so_far = pcd_df[pcd_df['Decimal year'] <= cutoff_year]
    # Rows come back in descending compute order, ties resolved by first
    # occurrence, so the first n rows of this ranking are exactly the top n.
    ranked = released_so_far.nlargest(max(top_n_values), 'Training compute (FLOP)')
    for n in top_n_values:
        pcd_df.loc[ranked.index[:n], f'top_{n}_at_release'] = True

In [None]:
# List the column names; after the previous cell has run these include the
# top_{n}_at_release flags.
pcd_df.columns

In [None]:
# Number of systems that were ever among the 4 most compute-intensive at release.
pcd_df['top_4_at_release'].sum()

In [None]:
# Restrict to systems published strictly after the start of the large-scale era.
# This filters on the shared `start_large_scale_era` constant instead of the
# hard-coded decimal year 2015.75 (= 2015 + 9/12, i.e. 2015-10-01); for
# date-only publication dates both conditions select the same rows.
# Taking a copy keeps later edits to pcd_df_n from touching pcd_df.
pcd_df_n = pcd_df[pcd_df['Publication date'] > start_large_scale_era].copy()

In [None]:
# How many systems were ever in the top n, for each n from 1 to 20.
n_values = range(1, 21)
top_n = [pcd_df_n[f'top_{n}_at_release'].sum() for n in n_values]

fig, ax = plt.subplots()
ax.scatter(n_values, top_n)
ax.set_xticks(n_values)
ax.set_xlabel('n-value for top n')
ax.set_ylabel('Models which have ever been in the top n')
plt.show()

In [None]:
# Group systems by the smallest n for which they were ever in the top n.
# top_n_models[1] holds the systems that were ever the single largest;
# top_n_models[n] (n >= 2) holds those that made the top n but not the top n-1.
#
# Names keep the row order of pcd_df_n and are de-duplicated, so the lists (and
# the JSON written from them) are identical on every run; building them from
# sets would leave the order arbitrary.
ever_in_top_n = {
    n: list(dict.fromkeys(pcd_df_n.loc[pcd_df_n[f'top_{n}_at_release'], 'System']))
    for n in range(1, 21)
}

top_n_models = {1: ever_in_top_n[1]}
for n in range(2, 21):
    already_in_smaller_tier = set(ever_in_top_n[n - 1])
    top_n_models[n] = [
        system for system in ever_in_top_n[n]
        if system not in already_in_smaller_tier
    ]

In [None]:
# Persist the tier -> system-names mapping as indented JSON.
# NOTE(review): the file name spells "sysstems"; it is kept as-is because other
# code may read this exact path — confirm before renaming.
serialized_top_n = json.dumps(top_n_models, indent=4)
with open('data/frontier_sysstems_by_top_n.json', 'w') as top_n_file:
    top_n_file.write(serialized_top_n)

# Default large scale systems

https://colab.research.google.com/drive/1PLGY5ErysqQMfy7Z08uIR2cTnnDgSaVR?usp=sharing

In [None]:
# A system counts as large-scale when the z-score of its log10 training compute,
# taken within the window around its publication date, exceeds this value.
high_outliers_z_value_threshold = 0.76

In [None]:
# Width of the comparison window. A year is approximated as 52 weeks of 7 days,
# so the window is 364 days per year of `outlier_window_size`.
window_size = pd.Timedelta(f'{outlier_window_size*52*7} days')
half_window_size = window_size / 2

release_dates = pcd_df['Publication date']
log_compute = np.log10(pcd_df['Training compute (FLOP)'])

large_scale_idx = set()
for label, released in release_dates.items():
    # Systems published within half a window on either side (bounds inclusive).
    in_window = (released - half_window_size <= release_dates) & \
                (release_dates <= released + half_window_size)

    # A z-score needs at least two systems to compare.
    if in_window.sum() < 2:
        continue

    # z-scores of log10 compute inside the window, keyed by row label.
    window_z = pd.Series(
        stats.zscore(log_compute[in_window].values),
        index=pcd_df.index[in_window],
    )
    if window_z.loc[label] > high_outliers_z_value_threshold:
        large_scale_idx.add(label)

# Outliers only count when published strictly after the start of the era.
large_scale_mask = pcd_df.index.isin(large_scale_idx) & (release_dates > start_large_scale_era)

In [None]:
# Rows of pcd_df selected by the large-scale mask.
large_scale_df = pcd_df.loc[large_scale_mask]

In [None]:
# Show the systems selected as large-scale.
large_scale_df

In [None]:
# Print the selected system names, last row first.
for system_name in reversed(large_scale_df['System'].tolist()):
    print(system_name)

# Percentiles (CURRENT CHOICE)

In [None]:
# Bucket systems by where their training compute falls among the systems
# published within a window centred on their own publication date.
# frontier_systems_by_percentile[p] lists the systems whose compute lies in the
# half-open band (p-th percentile, (p + percentile_interval)-th percentile] of
# their window, for p = 95, 90, ..., 0.
percentile_interval = 5
bucket_lower_bounds = list(range(95, -5, -percentile_interval))
# Every percentile needed as a lower or an upper bucket bound, ascending.
percentile_edges = sorted(
    set(bucket_lower_bounds) | {p + percentile_interval for p in bucket_lower_bounds}
)
edge_column = {edge: j for j, edge in enumerate(percentile_edges)}

release_dates = pcd_df['Publication date']
compute = pcd_df['Training compute (FLOP)']
half_window = pd.DateOffset(years=outlier_window_size/2)

# The window around a system does not depend on the bucket, so all percentile
# edges are computed in a single pass over the rows: window_percentiles[i, j] is
# the percentile_edges[j]-th percentile of compute in the window around row i.
window_percentiles = np.zeros((len(pcd_df), len(percentile_edges)))
for i, released in enumerate(release_dates):
    # Window bounds are inclusive on both sides.
    in_window = (release_dates >= released - half_window) & (release_dates <= released + half_window)
    window_percentiles[i] = np.percentile(compute[in_window].to_numpy(), percentile_edges)

# Edit this cut-off if you want to change which publication dates are considered.
# NOTE(review): this uses '> 2015-09-30', so systems published on 2015-10-01 are
# included here, whereas the filters based on `start_large_scale_era` exclude
# that day — confirm which is intended.
published_after_cutoff = release_dates > pd.to_datetime('2015-09-30')

frontier_systems_by_percentile = {}
for percentile in bucket_lower_bounds:
    print(percentile)
    percentile_compute_low = window_percentiles[:, edge_column[percentile]]
    percentile_compute_high = window_percentiles[:, edge_column[percentile + percentile_interval]]

    # Strictly above the lower bound, at or below the upper bound.
    # NOTE(review): because the lower bound is exclusive, a system whose compute
    # equals the minimum of its window falls into no bucket (not even 0) — confirm
    # this is intended.
    in_bucket = (compute > percentile_compute_low) & (compute <= percentile_compute_high)

    frontier_systems_by_percentile[percentile] = list(
        pcd_df['System'][in_bucket & published_after_cutoff].values
    )

In [None]:
# Inspect the mapping from bucket lower bound (a percentile) to system names.
frontier_systems_by_percentile

In [None]:
# Save to JSON
with open('data/frontier_systems_by_window_percentile.json', 'w') as f:
    json.dump(frontier_systems_by_percentile, f, indent=4)

In [None]:
# Report each bucket in the dict's order together with a running total of the
# systems seen so far; names within a bucket are printed last-to-first.
total_num_systems = 0
for bucket_low, bucket_systems in frontier_systems_by_percentile.items():
    bucket_high = bucket_low + percentile_interval
    total_num_systems += len(bucket_systems)

    print(f'{bucket_low} to {bucket_high}')
    print(f'{len(bucket_systems)} systems')
    print(f'Total systems above {bucket_low}th percentile: {total_num_systems}')
    for system_name in reversed(bucket_systems):
        print(system_name)
    print()

# Distance from compute record at the time

In [None]:
# Largest allowed gap, in orders of magnitude (log10 units), between a system's
# training compute and the running compute record for it to count as frontier.
ooms_from_frontier = 2

In [None]:
# Running compute record: entry i is the largest training compute seen in rows
# 0..i of pcd_df (in its current row order), starting from a floor of 0.
compute_values = pcd_df['Training compute (FLOP)'].to_numpy(dtype=float)
# fmax treats a missing value as "no new record" and applies the floor of 0.
floored_values = np.fmax(compute_values, 0.0)
running_max = np.maximum.accumulate(floored_values)
current_max = running_max[-1] if len(running_max) else 0
running_max

In [None]:
# Attach the running compute record to each row (running_max is positional, so
# this relies on pcd_df still being in the row order it was computed from).
pcd_df['Frontier training compute (FLOP)'] = running_max

In [None]:
# A system is "frontier" when it was published strictly after the start of the
# large-scale era and its compute is within `ooms_from_frontier` orders of
# magnitude of the running compute record at its row.
published_in_era = pcd_df['Publication date'] > start_large_scale_era
ooms_below_record = (
    np.log10(pcd_df['Frontier training compute (FLOP)'])
    - np.log10(pcd_df['Training compute (FLOP)'])
)
pcd_df['Frontier system'] = published_in_era & (ooms_below_record <= ooms_from_frontier)
pcd_df[['System', 'Frontier system']]

In [None]:
# Rows flagged as frontier systems.
is_frontier = pcd_df['Frontier system']
frontier_df = pcd_df.loc[is_frontier]
frontier_df

In [None]:
# Print the frontier system names, last row first.
for system_name in reversed(frontier_df['System'].tolist()):
    print(system_name)

# Constant threshold

In [None]:
# Fixed training-compute cut-off (FLOP); systems strictly above it are selected.
compute_threshold = 1e23

In [None]:
# Systems whose training compute is strictly greater than the fixed threshold.
exceeds_threshold = pcd_df['Training compute (FLOP)'].gt(compute_threshold)
above_threshold = pcd_df.loc[exceeds_threshold]

In [None]:
# Report how many systems exceed the threshold, then list them last row first.
print(f'{len(above_threshold)} systems')
for system_name in reversed(above_threshold['System'].tolist()):
    print(system_name)