# Setup

In [23]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [24]:
from collections import defaultdict
from collections.abc import Callable
from dataclasses import dataclass
import json
from itertools import combinations_with_replacement
import numpy as np
import os
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from scipy.stats import chi2
from sklearn.model_selection import KFold
import statsmodels.api as sm
from tqdm import tqdm

from data import *
from plotting import *
from regression import *
from utils import *

# Parameters

In [25]:
# 'external': Filter to the top n models overall
# 'internal': Filter to the top n models within 'USA' and 'China' categories
# 'disabled': No filtering
frontier_selection = 'external'  # ['disabled', 'internal', 'external']
top_n = 10  # Filter to the top n models by training compute at time of release
model_selection = 'All models'  # ['All models', 'Language models', 'Google DeepMind models', 'OpenAI models', 'Meta AI models']
filter_alphago_outliers = True  # Whether to filter out AlphaGo Master and AlphaGo Zero
filter_finetuned_models = True  # Whether to filter out separate finetuned models (base + finetuned models are still included if there is no separate base model in our dataset)
include_speculative_compute = True  # Whether to include speculative compute estimates that rely on benchmark imputation and rough guesses
cutoff_date = '2010-01-01'  # When to start the regressions from
top_n_cutoff_date = '2010-01-01'  # When to split the top-n filtering into USA and China categories - set to e.g. 2010 to turn off the "kickstarting"
save = True  # Whether to save the plots

In [26]:
results_dir = 'results/compute/13Nov-OR-country/'
os.makedirs(results_dir, exist_ok=True)
os.makedirs(results_dir + 'plot_data', exist_ok=True)

In [27]:
colors = {'USA': 'blue', 'China': 'red'}


# Data preparation

In [28]:
# Load data
pcd_df = load_pcd_df()

In [29]:
pcd_df

Unnamed: 0,System,Domain,Task,Authors,Notability criteria,Notability criteria notes,Model accessibility,Link,Citations,Reference,...,Hardware TF16,Hardware FP16,Assumed precision,Assumed hardware FLOP/s,Hardware type,Compute estimate method,Training compute estimation method,Biological model safeguards,Hardware utilization (temp),BenchmarkHub-v1
0,babbage-002,Language,Language modelling,,,,,,,,...,,,FP32,,,,,,,
1,tts-1,Speech,Text-to-speech,,,,,,,,...,,,FP32,,,,,,,
2,tts-1-hd,Speech,Text-to-speech,,,,,,,,...,,,FP32,,,,,,,
3,LM-Design,Biology,Protein design,"Zaixiang Zheng, Yifan Deng, Dongyu Xue, Yi Zho...",,,,https://proceedings.mlr.press/v202/zheng23a.html,46.0,Structure-informed Language Models Are Protein...,...,,,FP32,,,,,LM-Design,,
4,Genie (bio),Biology,,,,,,https://arxiv.org/abs/2301.12485,,"Generating Novel, Designable, and Diverse Prot...",...,,,FP32,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2054,Aya Expanse 8B,Language,"Language modelling/generation,Translation",,,,Open weights (restricted use),https://cohere.com/blog/aya-expanse-connecting...,,"Cohere For AI launches Aya Expanse, a state-of...",...,,,FP32,,,,Operation counting,,,
2055,π0 (pi-zero),"Robotics,Vision",Robotic manipulation,"Kevin Black, Noah Brown, Danny Driess, Adnan E...",,,Unreleased,https://www.physicalintelligence.company/downl...,,π0: Our First Generalist Policy,...,,,FP32,,,,,,,
2056,Hunyuan-Large,Language,"Language modelling/generation,Question answeri...","Xingwu Sun, Yanfeng Chen, Yiqing Huang, Ruobin...",,,Open weights (restricted use),https://arxiv.org/abs/2411.02265,,Hunyuan-Large: An Open-Source MoE Model with 5...,...,,,FP32,,,,"Operation counting,Other",,,
2057,Qwen2.5-Coder (32B),Language,"Language modelling/generation,Code generation","Binyuan Hui, Jian Yang, Zeyu Cui, Jiaxi Yang, ...",,,Open weights (unrestricted),https://arxiv.org/abs/2409.12186,,Qwen2.5-Coder Technical Report,...,,,FP32,,,,Operation counting,,,


In [30]:
print(pcd_df.loc[pcd_df['System'] == 'Megatron-BERT']['Country (from Organization)'])
print(pcd_df.loc[pcd_df['System'] == 'Yi-34B']['Country (from Organization)'])


842    United States of America
Name: Country (from Organization), dtype: object
1664    China
Name: Country (from Organization), dtype: object


In [31]:
country_df = pcd_df.dropna(subset=['Publication date', 'Country (from Organization)'])
len(country_df)

1869

In [32]:
country_df['Country (from Organization)'].unique()


array(['United States of America',
       'United States of America,United States of America', 'Italy',
       'New Zealand',
       'United Kingdom of Great Britain and Northern Ireland',
       'Switzerland', 'Japan', 'Multinational', 'Netherlands', 'Finland',
       'Canada', 'Japan,United States of America', 'Spain',
       'Denmark,United Kingdom of Great Britain and Northern Ireland',
       'India', 'Germany', 'France',
       'United Kingdom of Great Britain and Northern Ireland,United States of America',
       'Taiwan',
       'United States of America,United States of America,United States of America',
       'United Kingdom of Great Britain and Northern Ireland,Canada',
       'United States of America,Germany', 'Korea (Republic of)',
       'United States of America,United Kingdom of Great Britain and Northern Ireland',
       'Mexico', 'Switzerland,Germany', 'France,Canada',
       'France,United States of America,France', 'Canada,Singapore',
       'Finland,Multinational

In [33]:
# How many models are both US and China?
country_df[
    country_df['Country (from Organization)'].str.contains('United States of America') & 
    country_df['Country (from Organization)'].str.contains('China')
]

Unnamed: 0,System,Domain,Task,Authors,Notability criteria,Notability criteria notes,Model accessibility,Link,Citations,Reference,...,Hardware TF16,Hardware FP16,Assumed precision,Assumed hardware FLOP/s,Hardware type,Compute estimate method,Training compute estimation method,Biological model safeguards,Hardware utilization (temp),BenchmarkHub-v1
443,SPPNet,Vision,Image classification,"Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun",Highly cited,,,https://arxiv.org/abs/1406.4729,10365.0,Spatial Pyramid Pooling in Deep Convolutional ...,...,,,FP32,4709000000000,GPU,,,,,
472,CRF-RNN,Vision,Image segmentation,"Shuai Zheng, Sadeep Jayasumana, Bernardino Rom...",Highly cited,,,https://arxiv.org/abs/1502.03240,2661.0,Conditional Random Fields as Recurrent Neural ...,...,,,FP32,,,,,,,
519,R-FCN,Vision,Object detection,"Jifeng Dai, Y. Li, Kaiming He, and Jian Sun",Highly cited,,,https://arxiv.org/abs/1605.06409,5411.0,R-fcn: Object detection via region-based fully...,...,,,FP32,,,,,,,
533,DenseNet-264,Vision,Image classification,"G Huang, Z Liu, L Van Der Maaten",Highly cited,,,https://arxiv.org/abs/1608.06993,33650.0,Densely Connected Convolutional Networks,...,,,FP32,,,,,,,
552,SPIDER2,Biology,"Protein folding prediction,Proteins","Yuedong Yang, Rhys Heffernan, Kuldip Paliwal, ...",SOTA improvement,"The method provides state-of-the-art, all-in-o...",Open weights (non-commercial),https://link.springer.com/protocol/10.1007/978...,,SPIDER2: A Package to Predict Secondary Struct...,...,,,FP32,,,,,SPIDER2,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1797,DecompDiff,Biology,Drug discovery,"Jiaqi Guan, Xiangxin Zhou, Yuwei Yang, Yu Bao,...",,,,https://arxiv.org/abs/2403.07902,41.0,DecompDiff: Diffusion Models with Decomposed P...,...,,,FP32,,,,,DecompDiff,,
1799,BitNet b1.58,Language,"Language modelling/generation,Question answering","Shuming Ma, Hongyu Wang, Lingxiao Ma, Lei Wang...",,,,https://arxiv.org/abs/2402.17764,,The Era of 1-bit LLMs: All Large Language Mode...,...,,,FP32,,,,,,,
1818,ERNIE-RNA,Biology,Protein or nucleotide language model (pLM/nLM),"Weijie Yin, Zhaoyu Zhang, Liang He, Rui Jiang,...",,,,https://www.biorxiv.org/content/10.1101/2024.0...,3.0,ERNIE-RNA: An RNA Language Model with Structur...,...,,,FP32,,,,,ERNIE-RNA,,
1989,RNAdiffusion,Biology,RNA sequence generation,"Kaixuan Huang, Yukang Yang, Kaidi Fu, Yanyi Ch...",,,,https://arxiv.org/abs/2409.09828,0.0,Latent Diffusion Models for Controllable RNA S...,...,,,FP32,,,,,RNAdiffusion,,


Check if the country is listed.

TODO: try other methods of reducing multiple countries to one country.
- Use the first country listed
- Mutually exclusive (e.g. China but NOT USA)

In [34]:
# Check if the country is listed
def assign_country(row):
    if 'United States of America' in row['Country (from Organization)'] and 'China' in row['Country (from Organization)']:
        return 'USA,China'
    if 'United States of America' in row['Country (from Organization)']:
        return 'USA'
    if 'China' in row['Country (from Organization)']:
        return 'China'
    for country in country_df['Country (from Organization)'].unique():
        if country in row['Country (from Organization)']:
            return country
    return np.nan
country_df.loc[:, 'Country'] = country_df.apply(assign_country, axis=1)

# Use the first country listed
# country_df['Country'] = country_df['Country (from Organization)'].apply(lambda x: x.split(',')[0].strip())

country_df[['System', 'Country']]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,System,Country
137,Theseus,USA
138,SNARC,USA
139,Genetic algorithm,USA
140,Sequence-based pattern recognition,USA
141,Self Organizing System,USA
...,...,...
2054,Aya Expanse 8B,Multinational
2055,π0 (pi-zero),USA
2056,Hunyuan-Large,China
2057,Qwen2.5-Coder (32B),China


In [35]:
for cat in country_df['Country'].unique():
    if pd.isna(cat):
        print(cat, len(country_df.loc[country_df['Country'].isna()]))
    else:
        print(cat, len(country_df.loc[country_df['Country'] == cat]))

USA 1075
Italy 3
New Zealand 1
United Kingdom of Great Britain and Northern Ireland 120
Switzerland 35
Japan 39
Multinational 23
Netherlands 8
Finland 3
Canada 58
Spain 2
India 3
Germany 30
France 23
Taiwan 2
Korea (Republic of) 47
Mexico 1
Poland 1
Singapore 12
China 256
USA,China 65
Hong Kong 8
Australia,Australia 1
Belgium 4
Israel 13
Australia 3
Russia,Russia 5
Russia 9
Austria,Austria,Norway 1
Lebanon 1
Lithuania,Sweden 1
Greece 1
Sweden,Sweden 2
Austria 3
Sweden 2
United Arab Emirates 5
Denmark,Denmark 1
Hungary 1
Saudi Arabia 1


In [36]:
print('USA', len(country_df[country_df['Country'].str.contains('USA')]))
print('China', len(country_df[country_df['Country'].str.contains('China')]))


USA 1140
China 321


In [37]:
df = country_df

In [38]:
def find_top_models_up_to_release(df, top_n):
    """Find the models which were in the top n by compute when they were released."""
    # This set will keep track of models that were ever in the top 10 at their release
    ever_in_top_n = set()

    # Iterate over each date in the DataFrame
    for current_date in df['date'].unique():
        # Get all entries up to the current date
        historical_data = df[df['date'] <= current_date]
        # Find top 10 models by flop count in this subset
        top_n_models = historical_data.nlargest(top_n, 'flop')['System']
        # Update the set of models that were ever in top n
        ever_in_top_n.update(top_n_models)

    # Return DataFrame filtered to only include models that were ever in the top 10
    return df[df['System'].isin(ever_in_top_n)]


def filter_top_models_within_category(df, top_n, cutoff_date, category):
    """Find the models which were in the top-n by compute when they were released,
    among models in the specified category. The top-n models in the specified category
    are seeded with the overall top-n models before the cutoff date.
    """
    # Filter top-n models within the category, but seeded with overall top-n models
    top_models_df = find_top_models_up_to_release(df, top_n)
    top_n_models_at_cutoff_date_df = top_models_df[top_models_df['date'] <= cutoff_date].nlargest(top_n, 'flop')
    category_df = df[df['category'].str.contains(category)]

    # This set will keep track of models that were ever in the top 10 at their release
    ever_in_top_n = set()

    # Iterate over each date in the DataFrame
    for current_date in category_df['date'].unique():
        # Get all entries up to the current date
        category_since_cutoff = category_df[(category_df['date'] <= current_date) & (category_df['date'] > cutoff_date)]
        historical_data = pd.concat([category_since_cutoff, top_n_models_at_cutoff_date_df])
        # Find top 10 models by flop count in this subset
        top_n_models_df = historical_data.nlargest(top_n, 'flop')
        # Update the set of models that were ever in top n
        # Filter out the models that aren't in the category
        ever_in_top_n.update(top_n_models_df[top_n_models_df['category'].str.contains(category)]['System'])

    # Return DataFrame filtered to only include models that were ever in the top 10
    new_df = df[df['System'].isin(ever_in_top_n)]
    # Assign the category to the new DataFrame (overwrites cases with both US and China)
    # E.g. if a "USA,China" model is top-10 among models affiliated with China, then it's just "China"
    new_df['category'] = category
    
    return new_df


def filter_top_models_in_both_categories(df, top_n, cutoff_date):
    # Get top models for Open and Closed categories
    top_us_models = filter_top_models_within_category(df, top_n, cutoff_date, category='USA')
    top_china_models = filter_top_models_within_category(df, top_n, cutoff_date, category='China')
    # Combine the results
    df_filtered = pd.concat([top_us_models, top_china_models])
    # Sort the combined DataFrame by date
    df_filtered = df_filtered.sort_values('date')
    return df_filtered

In [39]:
df_filtered = (df[['System', 'Training compute (FLOP)', 'Publication date', 'Organization', 'Notability criteria', 'Domain', 'Base model', 'Country']]
    .rename(columns={'Training compute (FLOP)': 'flop', 'Publication date': 'date', 'Country': 'category'})
    .assign(date=lambda x: pd.to_datetime(x['date']), log_flop=lambda x: np.log10(x['flop']))
    .sort_values('date'))

In [40]:
list(df_filtered[df_filtered['Base model'].notna()]['System'])

['BatchNorm',
 'Order embeddings with layer norm',
 'Layer Normalization: The Attentive Reader',
 'Layer Normalization: Skip Thoughts',
 'Layer Normalization: Draw',
 'Layer Normalization: Handwriting sequence generation',
 'ULM-FiT',
 'ADP-FAIRSEQ + NGRAMRES',
 'Fine-tuned-AWD-LSTM-DOC (fin)',
 'Cross-lingual alignment',
 'Theseus 6/768',
 'UnifiedQA',
 'LUKE',
 'GPT-Neo-2.7B (finetuned)',
 'GPT-Neo-2.7B (finetuned on PTB)',
 'Unicorn',
 'Multitask Unified Model (MUM)',
 '$\\infty$-former (SM)',
 'FLAN 137B',
 'AlphaFold-Multimer',
 'T0-XXL',
 'GPT-2 (AMPS)',
 'Masked Autoencoders ViT-H',
 'ViT-G/14 (LiT)',
 'Engine-XL(NE)',
 'HSO',
 'Contriever',
 'Vespa',
 'OntoProtein',
 'InstructGPT',
 'BERT-RBP',
 'Flamingo',
 'Jurassic-X',
 'DeBERTaV3large + KEAR',
 'SimCSE',
 'CogVideo',
 'Minerva (540B)',
 'Delphi',
 'Transformer-XL + RMT',
 'GPT-NeoX-Japanese',
 'BlenderBot 3',
 'PaLM-SayCan',
 'Sparrow',
 'NMST+GPT-2',
 'Decaying Fast Weights Transformer (WT-103)',
 "Instruct-GPT + Mind's Ey

In [41]:
# Add speculative compute estimates based on benchmark imputation and rough guesses
if include_speculative_compute:
    speculative_compute_estimates = {
        "Claude 3.5 Sonnet": 4.72e25,
        "Claude 3 Opus": 1.59e25,
        "Claude 3 Sonnet": 5.51e24,
        "GPT-4o": 3.98e25,
        "Gemini 1.0 Pro": 1.85e24,
        "Gemini 1.5 Pro": 1.60e25,
        "Mistral Large 2": 2.01e25,
        "GPT-4 Turbo": 2.1e25,  # rough guess matching GPT-4
        "GPT-4V": 2.1e25,  # rough guess matching GPT-4
        "Claude 2": 4.33e24,
        "Claude 2.1": 4.33e24,  # rough guess matching Claude 2
    }
    for model, compute in speculative_compute_estimates.items():
        df_filtered.loc[df_filtered["System"] == model, "flop"] = compute
        df_filtered.loc[df_filtered["System"] == model, "log_flop"] = np.log10(compute)

df_filtered.dropna(subset=['flop'], inplace=True)

# Drop Alpha Go Master / Zero
if filter_alphago_outliers:
    mask = (df_filtered["System"] == 'AlphaGo Master') | (df_filtered["System"] == 'AlphaGo Zero')
    df_filtered = df_filtered[~mask]

# Drop finetuned models
if filter_finetuned_models:
    mask = df_filtered['Base model'].isna()
    df_filtered = df_filtered[mask]

top_models_df = find_top_models_up_to_release(df_filtered, top_n)  # For reference

if frontier_selection == 'external':
    # Filter top models before other filters
    df_filtered = filter_top_models_in_both_categories(df_filtered, top_n, top_n_cutoff_date)

if model_selection == 'Language models':
    re = 'Language|Multimodal'
    mask = df_filtered['Domain'].str.contains(re, na=False)
    df_filtered = df_filtered[mask]

if frontier_selection == 'internal':
    # Filter top models after other filters
    df_filtered = filter_top_models_in_both_categories(df_filtered, top_n, top_n_cutoff_date)

# Filter for models after the cutoff date
df_filtered = df_filtered[df_filtered['date'] > cutoff_date]

print(f"{len(df_filtered)}{' top' if frontier_selection != 'disabled' else ''} {top_n} {model_selection} models found")
print(f"They span {df_filtered['date'].min().strftime('%B %Y')} to {df_filtered['date'].max().strftime('%B %Y')}")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



165 top 10 All models models found
They span June 2010 to November 2024




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [42]:
if top_n == 1:
    # Remove BIDAF outlier
    df_filtered = df_filtered[df_filtered['System'] != 'BIDAF']

In [43]:
exclude_china = []
# exclude_china = [
#     'genCNN + dyn eval',
#     'R-FCN',
#     'ResNet-200',
#     '2-layer-LSTM+Deep-Gradient-Compression',
# ]
df_filtered = df_filtered[~df_filtered['System'].isin(exclude_china)]

In [44]:
usa_df = df_filtered[df_filtered['category'] == 'USA']
china_df = df_filtered[df_filtered['category'] == 'China']
recent_top_models_df = top_models_df[top_models_df['date'] > pd.to_datetime('2010-01-01')]

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=usa_df['date'],
    y=usa_df['log_flop'],
    mode='markers',
    marker=dict(color=colors['USA'], opacity=0.5),
    text=usa_df['System'],
    hoverinfo='text',
    name=f'Top-{top_n} USA'
))

fig.add_trace(go.Scatter(
    x=china_df['date'],
    y=china_df['log_flop'],
    mode='markers',
    marker=dict(color=colors['China'], opacity=0.5),
    text=china_df['System'],
    hoverinfo='text',
    name=f'Top-{top_n} China'
))

fig.add_trace(go.Scatter(
    x=recent_top_models_df['date'],
    y=recent_top_models_df['log_flop'],
    mode='markers',
    marker=dict(color='grey', opacity=0.5),
    text=recent_top_models_df['System'],
    hoverinfo='text',
    name=f'Top-{top_n} Overall'
))

fig.update_layout(
    width=800,
    height=400,
    xaxis_title='Date',
    yaxis_title='Log FLOP',
    title=f'Top-{top_n} models',
    margin=dict(t=50, l=60, r=60, b=50),
)

save_plot(fig, results_dir, f'top_{top_n}_models_without_kickstarting')

fig.show()

In [45]:
top_models_since_cutoff = top_models_df[top_models_df['date'] >= pd.to_datetime(cutoff_date)]
top_models_set = set(top_models_since_cutoff['System'])
usa_top_models_set = set(usa_df['System'])
china_top_models_set = set(china_df['System'])

frac_usa_top_models = len(usa_top_models_set.intersection(top_models_set)) / len(top_models_set)
frac_china_top_models = len(china_top_models_set.intersection(top_models_set)) / len(top_models_set)
print(f"Fraction of top-{top_n} models that are USA: {frac_usa_top_models*100:.1f}%")
print(f"Fraction of top-{top_n} models that are China: {frac_china_top_models*100:.1f}%")


Fraction of top-10 models that are USA: 66.7%
Fraction of top-10 models that are China: 8.9%


# Regression analysis

## Model selection

In [53]:
@dataclass
class FitResult:
    p: int = None
    bic: float = None
    rss: float = None
    mse: float = None
    predict: Callable = None


@dataclass
class KinkedFitResult(FitResult):
    break_points: tuple[float] = None
    break_points_dt: float = None
    oom_year_slopes: tuple[float] = None
    intercepts: tuple[float] = None

    # Model properties for each breakpoint combination
    # (for debugging)
    bics: tuple[float] = None
    rsss: tuple[float] = None
    mses: tuple[float] = None
    break_points_list: tuple[tuple[float]] = None
    break_points_dt_list: tuple[tuple[float]] = None


def get_predictors(
    x,
    intercept_change_points,
    slope_change_points,
    pred_category=None,category=None,
    same_intercepts=None,
    same_slopes=None
):
    if pred_category == 'USA':
        is_usa = np.ones(len(x))
    elif pred_category == 'China':
        is_usa = np.zeros(len(x))
    else:
        assert category is not None
        is_usa = (category == 'USA').astype(int).values

    # Ensure the lengths match
    assert len(same_intercepts) == len(intercept_change_points), f"Length of same_intercepts ({len(same_intercepts)}) must match the number of intercept change points ({len(intercept_change_points)})"
    assert len(same_slopes) == len(slope_change_points), f"Length of same_slopes ({len(same_slopes)}) must match the number of slope change points ({len(slope_change_points)})"

    # Calculate the number of columns needed
    n_intercept_cols = sum(1 if same else 2 for same in same_intercepts)
    n_slope_cols = sum(1 if same else 2 for same in same_slopes)
    n_cols = n_intercept_cols + n_slope_cols

    predictors = np.zeros((len(x), n_cols))

    # Intercept predictors
    col_idx = 0
    for i, (intercept_point, same) in enumerate(zip(intercept_change_points, same_intercepts)):
        if same:
            predictors[:, col_idx] = (x >= intercept_point).astype(int)
            col_idx += 1
        else:
            predictors[:, col_idx] = (x >= intercept_point).astype(int) * is_usa
            predictors[:, col_idx + 1] = (x >= intercept_point).astype(int) * (1 - is_usa)
            col_idx += 2

    # Slope predictors
    for i, (break_point, same) in enumerate(zip(slope_change_points, same_slopes)):
        if same:
            predictors[:, col_idx] = np.maximum(x - break_point, 0)
            col_idx += 1
        else:
            predictors[:, col_idx] = np.maximum(x - break_point, 0) * is_usa
            predictors[:, col_idx + 1] = np.maximum(x - break_point, 0) * (1 - is_usa)
            col_idx += 2

    return predictors


def fit_n_phase_exponential(
    df,
    kink_count,
    allow_discontinuities=False,
    same_intercepts=None,
    same_slopes=None,
    min_n_segment=10
):
    # Generate monthly breakpoints between 2010 and 2024
    one_month = pd.DateOffset(months=1)
    break_point_grid = pd.date_range(start=df['date'].min() - one_month, end=df['date'].max() - 4*one_month, freq='MS')
    break_point_grid = [x.toordinal() for x in break_point_grid]

    x = pd.to_datetime(df['date']).apply(lambda date: date.toordinal()).values
    y = df['log_flop'].values

    break_points_list = []
    bics = []
    rsss = []
    mses = []
    models = []

    for break_points in combinations_with_replacement(break_point_grid, kink_count):
        intercept_change_points = (0,)
        if allow_discontinuities:
            intercept_change_points += break_points
        slope_change_points = (0,) + break_points

        # If same_intercepts or same_slopes are not provided, default to all False
        if same_intercepts is None:
            same_intercepts = [False] * len(intercept_change_points)
        if same_slopes is None:
            same_slopes = [False] * len(slope_change_points)

        predictors = get_predictors(
            x,
            intercept_change_points,
            slope_change_points,
            category=df['category'],
            same_slopes=same_slopes,
            same_intercepts=same_intercepts
        )

        # Fit the model
        model = sm.OLS(y, predictors).fit()

        # Calculate BIC manually based on log-likelihood
        n = len(x) # Number of observations
        p = len(model.params) + 2*kink_count + 1 # Number of parameters

        # Calculate log-likelihood under the assumption of normally distributed errors
        # We have to iterate over all points to get their individual log-likelihoods
        log_likelihood = 0
        rss = 0
        invalid_model = False # Discard models with segments with less than 2 points
        for i, break_point in enumerate(slope_change_points):
            left_x = break_point
            right_x = slope_change_points[i + 1] if i + 1 < len(slope_change_points) else np.inf

            segment_predictors = predictors[(left_x <= x) & (x < right_x), :]
            segment_y = y[(left_x <= x) & (x < right_x)]
            segment_n = len(segment_y)

            assert min_n_segment > 2

            if segment_n < min_n_segment:
                invalid_model = True
                break

            y_pred = model.predict(segment_predictors)

            segment_rss = np.sum((y_pred - segment_y)**2)
            assert segment_rss > 0
            segment_mse = segment_rss / segment_n

            segment_log_likelihood = -segment_n/2 * (np.log(2*np.pi) + np.log(segment_rss/segment_n) + 1)
            log_likelihood += segment_log_likelihood
            rss += segment_rss

        if invalid_model:
            continue

        # Compute BIC using the manual method based on the log-likelihood
        bic = p * np.log(n) - 2 * log_likelihood
        # bic = n*np.log(rss/n) + p*np.log(n)

        bics.append(bic)
        rsss.append(rss)
        mses.append(rss/len(df))
        models.append(model)
        break_points_list.append(break_points)

    # Prepare the result object
    best_bic = min(bics)
    best_idx = bics.index(best_bic)
    best_rss = rsss[best_idx]
    best_mse = mses[best_idx]
    best_model = models[best_idx]
    best_break_points = break_points_list[best_idx]

    p = len(best_model.params) + 2*kink_count + 1 # Number of parameters

    # Store the model parameters
    intercept_change_points = (0,)
    if allow_discontinuities:
        intercept_change_points += best_break_points
    slope_change_points = (0,) + best_break_points

    n_intercepts = sum(1 if same else 2 for same in same_intercepts)
    intercepts = best_model.params[:n_intercepts]
    oom_intercepts = np.zeros((2, len(intercept_change_points)))
    for i in range(len(intercept_change_points)):
        if same_intercepts[i]:
            oom_intercepts[0, i] = oom_intercepts[1, i] = intercepts[i]
        else:
            oom_intercepts[0, i] = intercepts[2*i - sum(same_intercepts[:i])]
            oom_intercepts[1, i] = intercepts[2*i + 1 - sum(same_intercepts[:i])]

    # Apply cumulative sum to get the actual slopes
    oom_intercepts = {'USA': np.cumsum(oom_intercepts[0]), 'China': np.cumsum(oom_intercepts[1])}

    n_slopes = len(slope_change_points)
    slopes = best_model.params[n_intercepts:]
    oom_year_slopes = np.zeros((2, n_slopes))  # 2 rows for USA and China
    for i in range(n_slopes):
        if same_slopes[i]:
            oom_year_slopes[0, i] = oom_year_slopes[1, i] = 365 * slopes[i]
        else:
            oom_year_slopes[0, i] = 365 * slopes[2*i - sum(same_slopes[:i])]
            oom_year_slopes[1, i] = 365 * slopes[2*i + 1 - sum(same_slopes[:i])]

    # Apply cumulative sum to get the actual slopes
    oom_year_slopes = {'USA': np.cumsum(oom_year_slopes[0]), 'China': np.cumsum(oom_year_slopes[1])}

    def predict(date, category):
        if not isinstance(date, pd.Series):
            date = pd.Series(date)
        x = pd.to_datetime(date).apply(lambda date: date.toordinal()).values

        predictors = get_predictors(
            x,
            intercept_change_points,
            slope_change_points,
            category=category,
            same_slopes=same_slopes,
            same_intercepts=same_intercepts
        )

        return best_model.predict(predictors)

    fit_result = KinkedFitResult(
        p=p,
        bic=best_bic,
        rss=best_rss,
        mse=best_mse,
        break_points=best_break_points,
        predict=predict,
        break_points_dt=[pd.Timestamp.fromordinal(bp) for bp in best_break_points],
        bics=bics,
        rsss=rsss,
        mses=mses,
        oom_year_slopes=oom_year_slopes,
        intercepts=oom_intercepts,
        break_points_list=break_points_list,
        break_points_dt_list=[[pd.Timestamp.fromordinal(bp) for bp in break_points] for break_points in break_points_list],
    )

    return fit_result


fit_em_all = lambda df_fit : {
    # "Simple" : fit_n_phase_exponential(df_fit, kink_count=0),
    # "Simple with same slope": fit_n_phase_exponential(df_fit, kink_count=0, same_slopes=(True,)),
    # "Simple with same slope and intercept": fit_n_phase_exponential(df_fit, kink_count=0, same_slopes=(True,), same_intercepts=(True,)),
    # "One kink" : fit_n_phase_exponential(df_fit, kink_count=1),
    "Two kinks" : fit_n_phase_exponential(df_fit, kink_count=2),
    # "Discontinuity" : fit_n_phase_exponential(df_fit, kink_count=1, allow_discontinuities=True),
    # "Same pre-break different post-break" : fit_n_phase_exponential(
    #     df_fit, kink_count=1, allow_discontinuities=True, same_intercepts=(True, False), same_slopes=(True, False)
    # ),
    # "Same pre-break different intercept post-break" : fit_n_phase_exponential(
    #     df_fit, kink_count=1, allow_discontinuities=True, same_intercepts=(True, False), same_slopes=(True, True)
    # ),
    # "Same pre-break and post-break" : fit_n_phase_exponential(
    #     df_fit, kink_count=1, allow_discontinuities=True, same_intercepts=(True, True), same_slopes=(True, True)
    # ),
}


# K-Fold Cross Validation
def perform_cross_validation(df, k=10, random_state=42):
    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)
    folds_mses = defaultdict(lambda : [])
    for train_index, test_index in kf.split(df):
        train_df, test_df = df.iloc[train_index], df.iloc[test_index]

        # Fit the models on the training set
        fold_models = fit_em_all(train_df)

        # Predict on the test set
        for name,model in fold_models.items():
            try:
                predicted_log_flop = model.predict(test_df["date"], test_df["category"])
            except AttributeError:
                continue
            test_rss = np.sum((predicted_log_flop - test_df["log_flop"])**2)
            test_mse = test_rss / len(test_df)
            folds_mses[name].append(test_mse)

    # Compute mean MSE
    folds_mses = {name: np.mean(folds_mses[name]) for name in folds_mses}

    return folds_mses


def calculate_lag(df, fit_result):
    # Get the final predictions for the two categories
    final_date = df['date'].max()
    y_usa = fit_result.predict(pd.Series([final_date]), pd.Series(['USA']))[0]
    y_china = fit_result.predict(pd.Series([final_date]), pd.Series(['China']))[0]
    
    # Get the final slope for the 'China' category
    slope_usa = fit_result.oom_year_slopes['USA'][-1]
    
    # Calculate lag
    lag = (y_usa - y_china) / slope_usa
    
    return lag

In [54]:
# Best model fits
models = fit_em_all(df_filtered)

# K-fold cross validation
folds_mses = perform_cross_validation(df_filtered)

# Bootstrap
bootstrap_sample_size = 1000

rng = np.random.default_rng(43)
bootstrap_bics = defaultdict(lambda : [])
bootstrap_mses = defaultdict(lambda : [])
bootstrap_bic_score_diff = defaultdict(lambda : [])
bootstrap_slopes = defaultdict(lambda : defaultdict(lambda : []))
bootstrap_intercepts = defaultdict(lambda : defaultdict(lambda : []))
bootstrap_breaks = defaultdict(lambda : [])
bootstrap_lag_months = defaultdict(lambda : [])
for bootstrap_index in tqdm(range(bootstrap_sample_size)):
    sample = df_filtered.sample(len(df_filtered), replace=True, random_state=rng)
    sample = sample.sort_values('date')

    # Compute BICs
    boot_models = fit_em_all(sample)

    # Compute K fold validation
    boot_folds_mses = perform_cross_validation(sample)

    # Store results
    for name, model in boot_models.items():
        # It might be None if the hyperbolic fails to fit
        if model is None: continue

        bootstrap_bics[name].append(model.bic)
        bootstrap_mses[name].append(boot_folds_mses[name])
        bootstrap_bic_score_diff[name].append(model.bic - boot_models["Simple"].bic)

        if isinstance(model, KinkedFitResult):
            if (len(model.oom_year_slopes['USA']) > 0):
                bootstrap_slopes[name]['USA'].append(10**model.oom_year_slopes['USA'][-1])
            if (len(model.oom_year_slopes['China']) > 0):
                bootstrap_slopes[name]['China'].append(10**model.oom_year_slopes['China'][-1])
            if (len(model.break_points_dt) > 0):
                bootstrap_breaks[name].append(model.break_points_dt[-1])

            # Calculate the lag between predictions for the USA and China categories
            lag_months = calculate_lag(sample, model) * 12  # Convert years to months
            bootstrap_lag_months[name].append(lag_months)


ci_width = 0.90
qs = [(1 - ci_width)/2, (1 + ci_width)/2]
bootstrap_preferred_percent = {}
bootstrap_slopes_ci = defaultdict(lambda : defaultdict(lambda : []))
for name in models:
    bootstrap_preferred_percent[name] = np.mean(np.array(bootstrap_bic_score_diff[name])<0)
    bootstrap_bics[name] = np.quantile(np.array(bootstrap_bics[name]), qs)
    bootstrap_mses[name] = np.quantile(np.array(bootstrap_mses[name]), qs)
    bootstrap_bic_score_diff[name] = np.quantile(np.array(bootstrap_bic_score_diff[name]), qs)
    bootstrap_slopes_ci[name]['USA'] = np.quantile(np.array(bootstrap_slopes[name]['USA']), qs)
    bootstrap_slopes_ci[name]['China'] = np.quantile(np.array(bootstrap_slopes[name]['China']), qs)
    bootstrap_lag_months[name] = np.quantile(np.array(bootstrap_lag_months[name]), qs)
    if len(bootstrap_breaks[name]) > 0:
        bootstrap_breaks[name] = np.quantile(np.array(bootstrap_breaks[name]), qs)

# Models with lower BIC score / MSE are preferred.

results = []
for name, model in models.items():
    param_count = model.p
    log_likelihood = (np.log(len(df_filtered))*param_count - model.bic)/2

    param_count_simple = models['Simple'].p
    log_likelihood_simple = (np.log(len(df_filtered))*param_count_simple - models['Simple'].bic)/2

    c2 = chi2.sf(2*(log_likelihood - log_likelihood_simple), df=(param_count - param_count_simple))

    result = {
        "Model": name,
        "BIC" : np.round(model.bic, 2),
        "BIC 90% CI" : np.round(bootstrap_bics[name], 2),
        "BIC score diff": np.round(model.bic - models["Simple"].bic, 2),
        "BIC score diff 90% CI": np.round(bootstrap_bic_score_diff[name], 2),
        "Xi²": c2,
        "% times preferred over simple": f"{bootstrap_preferred_percent[name]:.0%}",
        "K-fold mean MSE" : np.round(folds_mses[name], 2),
        "K-fold mean MSE 90% CI" : np.round(bootstrap_mses[name], 2),
    }

    result["Recent slope for China (Nx/year)"] = np.round(10**model.oom_year_slopes['China'][-1], 2)
    result["Recent slope for China 90% CI"] = np.round(bootstrap_slopes_ci[name]['China'], 2)
    result["Recent slope for USA (Nx/year)"] = np.round(10**model.oom_year_slopes['USA'][-1], 2)
    result["Recent slope for USA 90% CI"] = np.round(bootstrap_slopes_ci[name]['USA'], 2)
    result["Lag (months)"] = np.round(calculate_lag(df_filtered, model) * 12, 2)  # Convert years to months
    result["Lag 90% CI (months)"] = np.round(bootstrap_lag_months[name], 2)
    if len(model.break_points_dt) > 0:
        result["Break point"] = model.break_points_dt[-1].strftime('%Y-%m')
        result["Break point 90% CI"] = [date.strftime('%Y-%m') for date in bootstrap_breaks[name]]
    results.append(result)

results_df = pd.DataFrame(results)

print("Results")
results_df

  0%|          | 0/1000 [00:30<?, ?it/s]


KeyError: 'Simple'

In [48]:
# Save results_df
regression_fname = f'compute_regression_analysis_{model_selection}_frontier={frontier_selection}_top{top_n}_cutoff={cutoff_date}.csv'
results_df.to_csv(os.path.join(results_dir, regression_fname), index=False)

# Save bootstrap_slopes as JSON
slopes_fname = f'bootstrap_slopes_{model_selection}_frontier={frontier_selection}_top{top_n}_cutoff={cutoff_date}.json'
with open(os.path.join(results_dir, slopes_fname), 'w') as f:
    json.dump(bootstrap_slopes, f, indent=4)

## Significant difference between regression slopes

### All data

In [49]:
df_filtered['date_float'] = datetime_to_float_year(df_filtered['date'])
usa_df = df_filtered[df_filtered['category'] == 'Open']
closed_df = df_filtered[df_filtered['category'] == 'Closed']
regression_slope_t_test(usa_df, closed_df, ['date_float'], 'log_flop', logy=False, adj_corr=True)

ValueError: zero-size array to reduction operation maximum which has no identity

### Bootstrap distributions

In [43]:
open_slopes = bootstrap_slopes['Simple']['open']
closed_slopes = bootstrap_slopes['Simple']['closed']

In [44]:
# Plot a histogram of the slopes

# Create a DataFrame for the slopes
slopes_df = pd.DataFrame({
    'slope': np.log10(open_slopes + closed_slopes),
    'category': ['Open'] * len(open_slopes) + ['Closed'] * len(closed_slopes)
})

# Plot the histogram using plotly
fig = px.histogram(slopes_df, x='slope', color='category', barmode='overlay', 
                   title='Distribution of Bootstrap Slopes', 
                   labels={'Slope': 'Slope (OOMs/year)', 'count': 'Frequency'},
                   opacity=0.5, color_discrete_map={'Open': colors['open'], 'Closed': colors['closed']})

fig.update_layout(
    width=800,
    height=600,
)

fig.show()


In [45]:
# Shapiro-Wilk test for normality
_, p_value_open = stats.shapiro(np.log10(open_slopes))
print(f"Shapiro-Wilk test p-value for Open Models: {p_value_open}")

_, p_value_closed = stats.shapiro(np.log10(closed_slopes))
print(f"Shapiro-Wilk test p-value for Closed Models: {p_value_closed}")

# Anderson-Darling test for normality
result_open = stats.anderson(np.log10(open_slopes))
print(f"Anderson-Darling test statistic for Open Models: {result_open.statistic}")

result_closed = stats.anderson(np.log10(closed_slopes))
print(f"Anderson-Darling test statistic for Closed Models: {result_closed.statistic}")

Shapiro-Wilk test p-value for Open Models: 1.6350610639784377e-26
Shapiro-Wilk test p-value for Closed Models: 6.800706046263335e-24
Anderson-Darling test statistic for Open Models: 24.891432094172615
Anderson-Darling test statistic for Closed Models: 14.877443835765916


In [46]:
# Use Mann-Whitney U test (if any test above rejects normality, p < 0.05)
statistic, p_value = stats.mannwhitneyu(np.log10(open_slopes), np.log10(closed_slopes))
print(f"Mann-Whitney U test: statistic={statistic}, p-value={p_value}")

# Use t-test otherwise
statistic, p_value = stats.ttest_ind(np.log10(open_slopes), np.log10(closed_slopes), equal_var=False)
print(f"t-test: statistic={statistic}, p-value={p_value}")

Mann-Whitney U test: statistic=574037.0, p-value=9.84316221822043e-09
t-test: statistic=7.921474002437873, p-value=4.535990084309112e-15


In [47]:
np.percentile(np.log10(closed_slopes) - np.log10(open_slopes), [2.5, 97.5])

array([-0.37281489,  0.21280456])

## Plot predictions

In [52]:
# Graph of the different model fits using plotly

model = 'kinked'  # ['simple', 'kinked']

# Parameters for the simple model
kink_count = 2
allow_discontinuities = False
same_intercepts = (False,)
same_slopes = (False,False,False)

def plot_model(df, model_type, kink_count=1, allow_discontinuities=False):
    if model_type == 'simple':
        fit_result = fit_n_phase_exponential(df, 0, same_intercepts, same_slopes)
    else:
        fit_result = fit_n_phase_exponential(df, kink_count, allow_discontinuities, same_intercepts, same_slopes)

    df_usa = df[df['category'] == 'USA']
    df_china = df[df['category'] == 'China']

    fig = go.Figure()

    # Plot the original data points
    fig.add_trace(go.Scatter(
        x=df_usa['date'], y=df_usa['log_flop'],
        mode='markers', name='USA', text=df_usa['System'],
        marker=dict(color=colors['USA'], opacity=0.3, size=10)
    ))
    fig.add_trace(go.Scatter(
        x=df_china['date'], y=df_china['log_flop'],
        mode='markers', name='China', text=df_china['System'],
        marker=dict(color=colors['China'], opacity=0.3, size=10)
    ))

    # Plot the fit lines
    date_grid = pd.date_range(start=df['date'].min(), end=df['date'].max(), freq='D')
    log_flop_usa = fit_result.predict(pd.Series(date_grid), pd.Series(['USA'] * len(date_grid)))
    log_flop_china = fit_result.predict(pd.Series(date_grid), pd.Series(['China'] * len(date_grid)))
    usa_trend_df = pd.DataFrame({
        'date': date_grid,
        'log_flop': log_flop_usa,
    })
    china_trend_df = pd.DataFrame({
        'date': date_grid,
        'log_flop': log_flop_china,
    })

    fig.add_trace(go.Scatter(
        x=date_grid, y=log_flop_usa,
        mode='lines', name='Best Fit Line (USA)',
        line=dict(color=colors['USA'])
    ))
    fig.add_trace(go.Scatter(
        x=date_grid, y=log_flop_china,
        mode='lines', name='Best Fit Line (China)',
        line=dict(color=colors['China'])
    ))

    # Add slope labels
    points = [df['date'].min()] + fit_result.break_points_dt + [df['date'].max()]
    for i in range(len(points) - 1):
        for category in ['USA', 'China']:
            mid = points[i] + (points[i+1] - points[i]) / 2
            y = fit_result.predict(pd.Series([mid]), pd.Series([category]))[0]
            fig.add_annotation(
                x=mid, y=y + 1.2 * (0.4 if category == 'China' else -1),
                text=f'{10**fit_result.oom_year_slopes[category][i]:0.1f}x/year',
                showarrow=False,
                font=dict(size=12, color=colors[category])
            )

    # Plot horizontal line segment showing the lag
    lag_months = calculate_lag(df, fit_result) * 12  # Convert years to months
    end_date = china_df['date'].max()
    start_date = end_date - pd.DateOffset(days=int(lag_months * 30.4375))  # Approximate months to days conversion
    y_value = fit_result.predict(pd.Series([end_date]), pd.Series(['China']))[0]
    fig.add_shape(
        type="line",
        x0=start_date, y0=y_value, x1=end_date, y1=y_value,
        line=dict(color="black", width=1, dash="dash")
    )
    fig.add_annotation(
        x=(start_date + (end_date - start_date) * 0.5), y=y_value + 0.15,
        text=f'Lag: {lag_months:.0f} months',
        showarrow=False,
    )

    # Annotate some key models with text
    
    # key_models = ['GPT-4'] if (exclude_big_llama or exclude_all_llamas) else ['GPT-4', 'Llama 3.1-405B']
    # for model_name in key_models:
    #     model_row = df_filtered[df_filtered['System'] == model_name]
    #     fig.add_annotation(
    #         x=model_row['date'].iloc[0], y=model_row['log_flop'].iloc[0],
    #         text=model_name,
    #         showarrow=True,
    #         font=dict(size=12, color='black'),
    #         xanchor='right', yanchor='bottom'
    #     )

    # Update layout
    title = f'Compute trends for top-{top_n} USA and China models'
    fig.update_layout(
        template='plotly_white',
        width=800,
        height=400,
        title=title,
        xaxis_title='Model publication date',
        yaxis_title='Training compute (FLOP)',
        legend_title='Affiliation',
        legend=dict(
            x=0.7,
            y=0.05
        ),
        margin=dict(l=10, r=10, t=40, b=10),
        xaxis=dict(
            tickformat='%Y',
            dtick='M12',
        ),
        yaxis=dict(
            tickmode='array',
            tickvals=list(range(int(df['log_flop'].min()), int(df['log_flop'].max())+2)),
            ticktext=[f'10<sup>{i}</sup>' for i in range(int(df['log_flop'].min()), int(df['log_flop'].max())+2)]
        )
    )

    fname = f'compute_regression_{model_selection}_frontier={frontier_selection}_top{top_n}_cutoff={cutoff_date}_{model_type}_kinks={kink_count}'
    save_plot(fig, results_dir, fname)
    df_usa[['System', 'date', 'log_flop']].to_csv(results_dir + f'plot_data/{fname}_usa.csv', index=False)
    df_china[['System', 'date', 'log_flop']].to_csv(results_dir + f'plot_data/{fname}_china.csv', index=False)
    usa_trend_df[['date', 'log_flop']].to_csv(results_dir + f'plot_data/{fname}_usa_trend.csv', index=False)
    china_trend_df[['date', 'log_flop']].to_csv(results_dir + f'plot_data/{fname}_china_trend.csv', index=False)

    fig.show()

    return fit_result

fit_result = plot_model(df_filtered, model, kink_count, allow_discontinuities)

## Model selection based on backtesting

In [49]:
len(df_filtered)

20

In [50]:
def backtest_model(train_df, test_df, same_intercepts, same_slopes):
    kink_count = 0
    allow_discontinuities = False
    fit_result = fit_n_phase_exponential(
        train_df, kink_count, allow_discontinuities, same_intercepts, same_slopes
    )
    test_df = test_df.copy()  # Create a copy to avoid SettingWithCopyWarning
    test_df['predicted_log_flop'] = fit_result.predict(test_df['date'], test_df['category'])
    return test_df


def rmse(predictions, targets):
    return np.sqrt(np.mean((np.array(predictions) - np.array(targets))**2))


def backtest_model_selection(df, error_metric=rmse):
    # Construct a list of dates to split the data into training and testing sets
    # From halfway through the data to the second last data point
    split_dates = df['date'][len(df['date'])//2:]
    same_intercepts = (False,)
    errors = {}
    for same_slopes in [(False,), (True,)]:
        print(f"Same slopes: {same_slopes}")
        targets = []
        predictions = []
        errors_this_model = []
        for split_date in split_dates:
            train_df = df.loc[df['date'] < split_date].copy()  # Create a copy to avoid SettingWithCopyWarning
            test_df = df.loc[df['date'] >= split_date].copy()  # Create a copy to avoid SettingWithCopyWarning
            backtest_result = backtest_model(train_df, test_df, same_intercepts, same_slopes)
            error = error_metric(backtest_result['predicted_log_flop'], backtest_result['log_flop'])
            print(f"{error_metric.__name__} in log-FLOP from {split_date}: {error}")
            predictions.extend(backtest_result['predicted_log_flop'])
            targets.extend(backtest_result['log_flop'])
            errors_this_model.append(error)
        errors[f"same_slopes={same_slopes}"] = errors_this_model
        mean = np.mean(errors_this_model)
        p5 = np.percentile(errors_this_model, 5)
        p95 = np.percentile(errors_this_model, 95) 
        print(f'{error_metric.__name__} in log-FLOP for same_slopes={same_slopes}: {mean} (90% CI: {p5} to {p95})')
    return split_dates, errors

In [51]:
same_slopes = (True,)
split_date = pd.Timestamp('2023-01-01')
train_df = df_filtered.loc[df_filtered['date'] < split_date]
test_df = df_filtered.loc[df_filtered['date'] >= split_date]
backtest_result = backtest_model(train_df, test_df, same_intercepts, same_slopes)
error = rmse(backtest_result['predicted_log_flop'], backtest_result['log_flop'])
closed_error = rmse(backtest_result[backtest_result['category'] == 'Closed']['predicted_log_flop'], backtest_result[backtest_result['category'] == 'Closed']['log_flop'])
open_error = rmse(backtest_result[backtest_result['category'] == 'Open']['predicted_log_flop'], backtest_result[backtest_result['category'] == 'Open']['log_flop'])
print(f"RMSE in log-FLOP from {split_date}: {error}")
print(f"RMSE in log-FLOP from {split_date} for closed models: {closed_error}")
print(f"RMSE in log-FLOP from {split_date} for open models: {open_error}")

fig = go.Figure()

# Plot raw data, separating by category
df_open = df_filtered[df_filtered['category'] == 'Open']
df_closed = df_filtered[df_filtered['category'] == 'Closed']
# Plot the original data points
fig.add_trace(go.Scatter(
    x=df_open['date'], y=10**df_open['log_flop'],
    mode='markers', name='Open models',
    marker=dict(color=colors['open'], opacity=0.3, size=10)
))
fig.add_trace(go.Scatter(
    x=df_closed['date'], y=10**df_closed['log_flop'],
    mode='markers', name='Closed models',
    marker=dict(color=colors['closed'], opacity=0.3, size=10)
))

# Plot predictions as lines for each category
test_df_open = backtest_result[backtest_result['category'] == 'Open']
test_df_closed = backtest_result[backtest_result['category'] == 'Closed']

fig.add_trace(go.Scatter(
    x=test_df_open['date'], y=10**test_df_open['predicted_log_flop'],
    mode='lines', name='Predicted open models',
    line=dict(color=colors['open'])
))
fig.add_trace(go.Scatter(
    x=test_df_closed['date'], y=10**test_df_closed['predicted_log_flop'],
    mode='lines', name='Predicted closed models',
    line=dict(color=colors['closed'])
))

# Vertical line at the split date
fig.add_shape(
    type="line",
    x0=split_date, y0=10**20, x1=split_date, y1=10**26,
    line=dict(color="black", width=1, dash="dash"),
    name='Train-test split date',
    legendgroup='Train-test split date',  # Grouping for legend
    showlegend=True  # Ensure it appears in the legend
)

fig.update_yaxes(type='log')

fig.update_layout(
    template='plotly_white',
    width=800,
    height=400,
    title=f'Example backtest of regression with equal slopes (top-{top_n} models)',
    xaxis_title='Model publication date',
    yaxis_title='Training compute (FLOP)',
    legend_title='Model Category',
)

save_plot(fig, results_dir, f'backtest_example_{model_selection}_frontier={frontier_selection}_top{top_n}_cutoff={cutoff_date}_same_slopes={same_slopes}_split={split_date}')
df_open[['System', 'date', 'log_flop']].to_csv(results_dir + f'plot_data/backtest_example_{model_selection}_frontier={frontier_selection}_top{top_n}_cutoff={cutoff_date}_same_slopes={same_slopes}_split={split_date}_open.csv', index=False)
df_closed[['System', 'date', 'log_flop']].to_csv(results_dir + f'plot_data/backtest_example_{model_selection}_frontier={frontier_selection}_top{top_n}_cutoff={cutoff_date}_same_slopes={same_slopes}_split={split_date}_closed.csv', index=False)
test_df_open[['date', 'predicted_log_flop']].to_csv(results_dir + f'plot_data/backtest_example_{model_selection}_frontier={frontier_selection}_top{top_n}_cutoff={cutoff_date}_same_slopes={same_slopes}_split={split_date}_open_predictions.csv', index=False)
test_df_closed[['date', 'predicted_log_flop']].to_csv(results_dir + f'plot_data/backtest_example_{model_selection}_frontier={frontier_selection}_top{top_n}_cutoff={cutoff_date}_same_slopes={same_slopes}_split={split_date}_closed_predictions.csv', index=False)

fig.show()

RMSE in log-FLOP from 2023-01-01 00:00:00: 0.5759765082108136
RMSE in log-FLOP from 2023-01-01 00:00:00 for closed models: 0.6876644896833987
RMSE in log-FLOP from 2023-01-01 00:00:00 for open models: 0.5335770177682432


In [52]:
split_dates,errors = backtest_model_selection(df_filtered)

Same slopes: (False,)
rmse in log-FLOP from 2022-05-02 00:00:00: 1.3636662111073252
rmse in log-FLOP from 2022-11-28 00:00:00: 0.7725445367998696
rmse in log-FLOP from 2023-02-24 00:00:00: 0.8381066906405717
rmse in log-FLOP from 2023-03-15 00:00:00: 0.8182624373848103
rmse in log-FLOP from 2023-07-18 00:00:00: 0.8207854660093162
rmse in log-FLOP from 2023-09-06 00:00:00: 0.8627907918565291
rmse in log-FLOP from 2023-12-06 00:00:00: 0.6570131279723145
rmse in log-FLOP from 2024-04-18 00:00:00: 0.7364113852588473
rmse in log-FLOP from 2024-06-14 00:00:00: 0.7312568614157056
rmse in log-FLOP from 2024-07-23 00:00:00: 0.6708808081411668
rmse in log-FLOP for same_slopes=(False,): 0.8271718316586456 (90% CI: 0.663253584048298 to 1.1382722724444665)
Same slopes: (True,)
rmse in log-FLOP from 2022-05-02 00:00:00: 0.5286828306613991
rmse in log-FLOP from 2022-11-28 00:00:00: 0.5017896740354243
rmse in log-FLOP from 2023-02-24 00:00:00: 0.5759765082108136
rmse in log-FLOP from 2023-03-15 00:00:

In [53]:
# Plot the errors
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=split_dates, y=errors['same_slopes=(True,)'],
    mode='lines', name='Equal slopes',
    # line=dict(color='blue')
))
fig.add_trace(go.Scatter(
    x=split_dates, y=errors['same_slopes=(False,)'],
    mode='lines', name='Different slopes',
    # line=dict(color='blue')
))
fig.update_layout(
    template='plotly_white',
    width=800,
    height=400,
    title=f'Test error for predicting the training compute of top-{top_n} models' + ('<br>with hypothetical Llama 4' if include_llama_4 else ''),
    xaxis_title='Train-test split date',
    yaxis_title='RMSE on the test set',
)

save_plot(fig, results_dir, f'compute_regression_backtest_error_{model_selection}_frontier={frontier_selection}_top{top_n}_cutoff={cutoff_date}')
equal_slopes_errors_df = pd.DataFrame({'split_date': split_dates, 'test_rmse': errors['same_slopes=(True,)']})
different_slopes_errors_df = pd.DataFrame({'split_date': split_dates, 'test_rmse': errors['same_slopes=(False,)']})
equal_slopes_errors_df.to_csv(results_dir + f'plot_data/compute_regression_backtest_error_{model_selection}_frontier={frontier_selection}_top{top_n}_cutoff={cutoff_date}_equal_slopes.csv', index=False)
different_slopes_errors_df.to_csv(results_dir + f'plot_data/compute_regression_backtest_error_{model_selection}_frontier={frontier_selection}_top{top_n}_cutoff={cutoff_date}_different_slopes.csv', index=False)

fig.show()


# Open and closed model compute by Organization

In [54]:
# Scatter plot of open and closed models using plotly
usa_df = df_filtered[df_filtered['category'] == 'Open']
closed_df = df_filtered[df_filtered['category'] == 'Closed']

marker_to_org = {
    'bowtie': 'Meta',
    'cross': 'Google',
    'hexagon-open': 'OpenAI',
    'star': 'Anthropic',
    'square': 'Microsoft',
    'circle': 'Other',
}
closed_added_to_legend = defaultdict(bool)
open_added_to_legend = defaultdict(bool)

org_model_counts = defaultdict(lambda: defaultdict(int))

fig = go.Figure()
for org in df_filtered['Organization'].unique():
    open_df_org = usa_df[usa_df['Organization'] == org]
    closed_df_org = closed_df[closed_df['Organization'] == org]
    if any([kw in org.lower() for kw in ['meta', 'facebook']]):
        marker = 'bowtie'
        org_model_counts['Open']['Meta'] += len(open_df_org)
        org_model_counts['Closed']['Meta'] += len(closed_df_org)
    elif any([kw in org.lower() for kw in ['google', 'deepmind']]):
        marker = 'cross'
        org_model_counts['Open']['Google/DeepMind'] += len(open_df_org)
        org_model_counts['Closed']['Google/DeepMind'] += len(closed_df_org)
    elif any([kw in org.lower() for kw in ['openai']]):
        marker = 'hexagon-open'
        org_model_counts['Open']['OpenAI'] += len(open_df_org)
        org_model_counts['Closed']['OpenAI'] += len(closed_df_org)
    elif any([kw in org.lower() for kw in ['anthropic']]):
        marker = 'star'
        org_model_counts['Open']['Anthropic'] += len(open_df_org)
        org_model_counts['Closed']['Anthropic'] += len(closed_df_org)
    elif any([kw in org.lower() for kw in ['microsoft']]):
        marker = 'square'
        org_model_counts['Open']['Microsoft'] += len(open_df_org)
        org_model_counts['Closed']['Microsoft'] += len(closed_df_org)
    else:
        marker = 'circle'
        org_model_counts['Open']['Other'] += len(open_df_org)
        org_model_counts['Closed']['Other'] += len(closed_df_org)
    fig.add_trace(go.Scatter(
        x=open_df_org['date'],
        y=open_df_org['log_flop'],
        text=open_df_org['System'],
        mode='markers',
        name=marker_to_org[marker] + ', open',
        showlegend=not open_added_to_legend[marker],
        marker=dict(
            color=colors['open'],
            opacity=0.5,
            symbol=marker
        )
    ))
    fig.add_trace(go.Scatter(
        x=closed_df_org['date'],
        y=closed_df_org['log_flop'],
        text=closed_df_org['System'],
        mode='markers',
        name=marker_to_org[marker] + ', closed',
        showlegend=not closed_added_to_legend[marker],
        marker=dict(
            color=colors['closed'],
            opacity=0.5,
            symbol=marker
        )
    ))
    if len(closed_df_org) > 0:
        closed_added_to_legend[marker] = True
    if len(open_df_org) > 0:
        open_added_to_legend[marker] = True

# Axis titles
fig.update_layout(xaxis_title='Model publication date')
fig.update_layout(yaxis_title='Training compute (FLOP)')

# Format the y-axis labels as 10^N
yvals = list(range(20, 27))
fig.update_yaxes(
    tickmode = 'array',
    tickvals = yvals,
    ticktext = [f'10<sup>{x}</sup>' for x in yvals],
    # ticks="",
    # tickfont=dict(size=20)
)

# Legend title
fig.update_layout(legend_title='Organization, access')

# Margins
fig.update_layout(margin=dict(l=10, r=10, t=40, b=10))

# plotly-white
fig.update_layout(template='plotly_white')

# Sizing
fig.update_layout(
    width=600,
    height=400,
    title='Open and closed models by organization'
)

# Save
save_plot(fig, results_dir, f'compute_open_closed_by_org_{model_selection}_frontier={frontier_selection}_top{top_n}_cutoff={cutoff_date}')

fig.show()

In [55]:
for category in ['Open', 'Closed']:
    print(f"{category}:")
    for org in org_model_counts[category]:
        print(f"    {org}: {org_model_counts[category][org]}")

print(f"Open: Other / Total = {org_model_counts['Open']['Other'] / sum(org_model_counts['Open'].values()):.2%}")
print(f"Closed: Other / Total = {org_model_counts['Closed']['Other'] / sum(org_model_counts['Closed'].values()):.2%}")


Open:
    Google/DeepMind: 2
    OpenAI: 0
    Other: 3
    Microsoft: 0
    Meta: 5
Closed:
    Google/DeepMind: 4
    OpenAI: 4
    Other: 1
    Microsoft: 1
    Meta: 0
Open: Other / Total = 30.00%
Closed: Other / Total = 10.00%


# Llama trend and extrapolation

In [56]:
df_llamas = df_filtered[df_filtered['System'].str.contains('llama', case=False) & 
                        (df_filtered['System'].str.contains('65B') | 
                         df_filtered['System'].str.contains('70B') | 
                         df_filtered['System'].str.contains('405B'))]
                        #  df_filtered['System'].str.contains('hypothetical'))]
df_llamas

Unnamed: 0,System,flop,date,Organization,Notability criteria,Domain,Base model,category,log_flop,date_float
58,LLaMA-65B,5.5e+23,2023-02-24,Meta AI,"Historical significance,Highly cited",Language,,Open,23.740363,2023.146305
47,Llama 2-70B,8.1e+23,2023-07-18,Meta AI,"Historical significance,Significant use,Highly...",Language,,Open,23.908485,2023.546544
14,Llama 3-70B,6.3e+24,2024-04-18,Meta AI,Significant use,Language,,Open,24.799341,2024.296544
1,Llama 3.1-405B,3.8e+25,2024-07-23,Meta AI,"SOTA improvement,Training cost",Language,,Open,25.579784,2024.560234


In [57]:
llama_fit_result = fit_ols_regression(df_llamas, ['date_float'], 'log_flop')
llama_fit_result.summary()


omni_normtest is not valid with less than 8 observations; 4 samples were given.



0,1,2,3
Dep. Variable:,log_flop,R-squared:,0.928
Model:,OLS,Adj. R-squared:,0.892
Method:,Least Squares,F-statistic:,25.81
Date:,"Tue, 05 Nov 2024",Prob (F-statistic):,0.0366
Time:,09:15:01,Log-Likelihood:,0.80086
No. Observations:,4,AIC:,2.398
Df Residuals:,2,BIC:,1.171
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-2515.8801,500.005,-5.032,0.037,-4667.229,-364.531
date_float,1.2552,0.247,5.081,0.037,0.192,2.318

0,1,2,3
Omnibus:,,Durbin-Watson:,2.016
Prob(Omnibus):,,Jarque-Bera (JB):,0.612
Skew:,0.015,Prob(JB):,0.736
Kurtosis:,1.084,Cond. No.,7230000.0


In [58]:
# Create a date range for predictions
start_date = df_llamas['date'].min()
end_date = pd.Timestamp('2025-08-23')
date_range = pd.date_range(start=start_date, end=end_date, freq='ME')
date_range_float = datetime_to_float_year(date_range)

# Create a new DataFrame for predictions
pred_df = pd.DataFrame({'date': date_range, 'date_float': date_range_float})

# Get predictions using the new DataFrame
llama_log_flop = get_predictions(llama_fit_result, pred_df, ['date_float'])
pred_df['log_flop'] = llama_log_flop
llama_log_flop

array([23.59050717, 23.7054172 , 23.80658072, 23.9146175 , 24.01578102,
       24.1238178 , 24.22841794, 24.32958147, 24.43761824, 24.53878176,
       24.64681854, 24.75141869, 24.84914558, 24.96061898, 25.0617825 ,
       25.16981928, 25.2709828 , 25.37901958, 25.48361972, 25.58478325,
       25.69282002, 25.79398354, 25.90202032, 26.00662047, 26.10091073,
       26.21582076, 26.31698428, 26.42502106, 26.52618458, 26.63422136])

In [59]:
# Plot the trend of Llama models
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=df_llamas['date'],
    y=df_llamas['flop'],
    mode='markers',
    text=df_llamas['System'],
    name='Largest Llama models',
    marker=dict(color='blue', size=10, opacity=0.5)
))
fig.add_trace(go.Scatter(
    x=pred_df['date'],
    y=10**llama_log_flop,
    mode='lines',
    name='Llama trend (excluding Llama 4)',
    line=dict(color='blue', dash='dash')
))
fig.add_trace(go.Scatter(
    x=[pd.Timestamp('2025-07-23')],
    y=[10*10**df_llamas['log_flop'].iloc[-1]],
    mode='markers',
    name='Llama 4 (Meta\'s 10x projection)',
    showlegend=False,
    marker=dict(color='blue', size=10, opacity=0.5, symbol='circle-open')
))
# Annotate Llama 4
fig.add_annotation(
    x=pd.Timestamp('2025-07-23'),
    y=np.log10(llama_3_405_compute * 10),
    text='Llama 4<br><i>Meta\'s 10x projection</i>',
    showarrow=True,
    arrowhead=0,
    ax=0,
    ay=-40,
    font=dict(color='black', size=12)
)

# Add closed models with trendline
fit_result = fit_n_phase_exponential(df_filtered, 0, same_intercepts=(False,), same_slopes=(False,))

df_open = df_filtered[df_filtered['category'] == 'Open']
df_closed = df_filtered[df_filtered['category'] == 'Closed']

# Plot the original data points
# fig.add_trace(go.Scatter(
#     x=df_open['date'], y=df_open['log_flop'],
#     mode='markers', name='Open models',
#     marker=dict(color=colors['open'], opacity=0.3, size=10)
# ))
fig.add_trace(go.Scatter(
    x=df_closed['date'], y=df_closed['flop'],
    mode='markers', name='Largest closed models',
    marker=dict(color=colors['closed'], opacity=0.3, size=10)
))

# Plot the fit lines
date_grid = pd.date_range(start=df_filtered['date'].min(), end=pd.Timestamp('2025-08-23'), freq='ME')
log_flop_open = fit_result.predict(pd.Series(date_grid), pd.Series(['Open'] * len(date_grid)))
log_flop_closed = fit_result.predict(pd.Series(date_grid), pd.Series(['Closed'] * len(date_grid)))

# fig.add_trace(go.Scatter(
#     x=date_grid, y=log_flop_open,
#     mode='lines', name='Best Fit Line (Open)',
#     line=dict(color=colors['open'])
# ))
fig.add_trace(go.Scatter(
    x=date_grid, y=10**log_flop_closed,
    mode='lines', name='Closed trend',
    line=dict(color=colors['closed'], dash='dash')
))
fig.update_yaxes(type='log')

fig.update_layout(
    width=800,
    height=400,
    template='plotly_white',
    title='The training compute of Llama models may catch up to top closed models in 2025',
    xaxis_title='Model publication date',
    yaxis_title='Training compute (FLOP)',
)

save_plot(fig, results_dir, f'llama_compute_trend_with_hypothetical_4_{model_selection}_frontier={frontier_selection}_top{top_n}_cutoff={cutoff_date}')
df_llamas[['System', 'date', 'log_flop']].to_csv(results_dir + f'plot_data/llama_compute_trend_with_hypothetical_4_{model_selection}_frontier={frontier_selection}_top{top_n}_cutoff={cutoff_date}_llamas.csv', index=False)
pred_df[['date', 'log_flop']].to_csv(results_dir + f'plot_data/llama_compute_trend_with_hypothetical_4_{model_selection}_frontier={frontier_selection}_top{top_n}_cutoff={cutoff_date}_llamas_trend.csv', index=False)
pd.DataFrame({'date': [pd.Timestamp('2025-07-23')], 'log_flop': [1 + df_llamas['log_flop'].iloc[-1]]}).to_csv(results_dir + f'plot_data/llama_compute_trend_with_hypothetical_4_{model_selection}_frontier={frontier_selection}_top{top_n}_cutoff={cutoff_date}_llama_4_projection.csv', index=False)

fig.show()