# Setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from collections import defaultdict
from collections.abc import Callable
from dataclasses import dataclass
import json
from itertools import combinations_with_replacement
import numpy as np
import os
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from scipy.stats import chi2
from sklearn.model_selection import KFold
import statsmodels.api as sm
from tqdm import tqdm

from data import *
from plotting import *
from regression import *
from utils import *

# Parameters

In [3]:
# 'external': Filter to the top n models overall
# 'internal': Filter to the top n models within 'Non-China' and 'China' categories
# 'disabled': No filtering
frontier_selection = 'external'  # ['disabled', 'internal', 'external']
top_n = 10  # Filter to the top n models by training compute at time of release
model_selection = 'Language models'  # ['All models', 'Language models', 'Google DeepMind models', 'OpenAI models', 'Meta AI models']
filter_alphago_outliers = True  # Whether to filter out AlphaGo Master and AlphaGo Zero
filter_finetuned_models = True  # Whether to filter out separate finetuned models (base + finetuned models are still included if there is no separate base model in our dataset)
include_speculative_compute = False  # Whether to include speculative compute estimates that rely on benchmark imputation and rough guesses
cutoff_date = '2018-01-01'  # When to start the regressions from
top_n_cutoff_date = '1950-01-01'  # When to split the top-n filtering into Non-China and China categories - set to e.g. 2010 to turn off the "kickstarting"
save = True  # Whether to save the plots

In [4]:
results_dir = 'results/compute/22Nov-nospec/'
os.makedirs(results_dir, exist_ok=True)
os.makedirs(results_dir + 'plot_data', exist_ok=True)

In [5]:
colors = {'Non-China': 'blue', 'China': 'red'}


# Data preparation

In [6]:
# Load data
pcd_df = load_pcd_df()

In [7]:
pcd_df

Unnamed: 0,System,Domain,Task,Authors,Notability criteria,Notability criteria notes,Model accessibility,Link,Citations,Reference,...,Hardware TF16,Hardware FP16,Assumed precision,Assumed hardware FLOP/s,Hardware type,Compute estimate method,Training compute estimation method,Biological model safeguards,Hardware utilization (temp),BenchmarkHub-v1
0,babbage-002,Language,Language modelling,,,,,,,,...,,,FP32,,,,,,,
1,tts-1,Speech,Text-to-speech,,,,,,,,...,,,FP32,,,,,,,
2,tts-1-hd,Speech,Text-to-speech,,,,,,,,...,,,FP32,,,,,,,
3,LM-Design,Biology,Protein design,"Zaixiang Zheng, Yifan Deng, Dongyu Xue, Yi Zho...",,,,https://proceedings.mlr.press/v202/zheng23a.html,46.0,Structure-informed Language Models Are Protein...,...,,,FP32,,,,,LM-Design,,
4,Genie (bio),Biology,,,,,,https://arxiv.org/abs/2301.12485,,"Generating Novel, Designable, and Diverse Prot...",...,,,FP32,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2047,Aya Expanse 8B,Language,"Language modelling/generation,Translation",,,,Open weights (restricted use),https://cohere.com/blog/aya-expanse-connecting...,,"Cohere For AI launches Aya Expanse, a state-of...",...,,,FP32,,,,Operation counting,,,
2048,π0 (pi-zero),"Robotics,Vision",Robotic manipulation,"Kevin Black, Noah Brown, Danny Driess, Adnan E...",,,Unreleased,https://www.physicalintelligence.company/downl...,,π0: Our First Generalist Policy,...,,,FP32,,,,,,,
2049,Hunyuan-Large,Language,"Language modelling/generation,Question answeri...","Xingwu Sun, Yanfeng Chen, Yiqing Huang, Ruobin...",,,Open weights (restricted use),https://arxiv.org/abs/2411.02265,,Hunyuan-Large: An Open-Source MoE Model with 5...,...,,,FP32,,,,"Operation counting,Other",,,
2050,Qwen2.5-Coder (32B),Language,"Language modelling/generation,Code generation","Binyuan Hui, Jian Yang, Zeyu Cui, Jiaxi Yang, ...",,,Open weights (unrestricted),https://arxiv.org/abs/2409.12186,,Qwen2.5-Coder Technical Report,...,,,FP32,,,,Operation counting,,,


In [8]:
print(pcd_df.loc[pcd_df['System'] == 'Megatron-BERT']['Country (from Organization)'])
print(pcd_df.loc[pcd_df['System'] == 'Yi-34B']['Country (from Organization)'])


835    United States of America
Name: Country (from Organization), dtype: object
1657    China
Name: Country (from Organization), dtype: object


In [9]:
country_df = pcd_df.dropna(subset=['Publication date', 'Country (from Organization)'])
len(country_df)

1869

In [10]:
country_df['Country (from Organization)'].unique()


array(['United States of America',
       'United States of America,United States of America', 'Italy',
       'New Zealand',
       'United Kingdom of Great Britain and Northern Ireland',
       'Switzerland', 'Japan', 'Multinational', 'Netherlands', 'Finland',
       'Canada', 'Japan,United States of America', 'Spain',
       'Denmark,United Kingdom of Great Britain and Northern Ireland',
       'India', 'Germany', 'France',
       'United Kingdom of Great Britain and Northern Ireland,United States of America',
       'Taiwan',
       'United States of America,United States of America,United States of America',
       'United Kingdom of Great Britain and Northern Ireland,Canada',
       'United States of America,Germany', 'Korea (Republic of)',
       'United States of America,United Kingdom of Great Britain and Northern Ireland',
       'Mexico', 'Switzerland,Germany', 'France,Canada',
       'France,United States of America,France', 'Canada,Singapore',
       'Finland,Multinational

In [11]:
country_df[country_df['Country (from Organization)'].str.contains('China')]

Unnamed: 0,System,Domain,Task,Authors,Notability criteria,Notability criteria notes,Model accessibility,Link,Citations,Reference,...,Hardware TF16,Hardware FP16,Assumed precision,Assumed hardware FLOP/s,Hardware type,Compute estimate method,Training compute estimation method,Biological model safeguards,Hardware utilization (temp),BenchmarkHub-v1
432,AdaRNN,Language,Sentiment classification,"Li Dong, Furu Wei, Chuanqi Tan, Duyu Tang, M. ...",Highly cited,,,https://www.semanticscholar.org/paper/Adaptive...,,Adaptive Recursive Neural Network for Target-d...,...,,,FP32,,,,,,,
436,SPPNet,Vision,Image classification,"Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun",Highly cited,,,https://arxiv.org/abs/1406.4729,10365.0,Spatial Pyramid Pooling in Deep Convolutional ...,...,,,FP32,4709000000000,GPU,,,,,
454,Cascaded LNet-ANet,Vision,Face detection,"Ziwei Liu, Ping Luo, Xiaogang Wang, Xiaoou Tang",Highly cited,,,https://arxiv.org/abs/1411.7766,7710.0,Deep Learning Face Attributes in the Wild,...,,,FP32,,,,,,,
465,CRF-RNN,Vision,Image segmentation,"Shuai Zheng, Sadeep Jayasumana, Bernardino Rom...",Highly cited,,,https://arxiv.org/abs/1502.03240,2661.0,Conditional Random Fields as Recurrent Neural ...,...,,,FP32,,,,,,,
470,genCNN + dyn eval,Language,Language modelling,"Mingxuan Wang, Zhengdong Lu, Hang Li, Wenbin J...",SOTA improvement,"""genCNN outperforms the state-ofthe-arts with ...",Unreleased,https://aclanthology.org/P15-1151/,33.0,genCNN: A Convolutional Architecture for Word ...,...,,,FP32,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2040,Belle-whisper-larger-v3-turbo-zh,Speech,Audio speech recognition,,,,Open weights (unrestricted),https://huggingface.co/BELLE-2/Belle-whisper-l...,,,...,,,FP32,,,,,,,
2041,Janus 1.3B,"Language,Vision,Multimodal","Language modelling/generation,Question answeri...","Chengyue Wu, Xiaokang Chen, Zhiyu Wu, Yiyang M...",,,Open weights (restricted use),https://arxiv.org/abs/2410.13848,,Janus: Decoupling Visual Encoding for Unified ...,...,311840000000000,77970000000000.0000,TF16,311840000000000,,,Hardware,,,
2042,Yi-Lightning,Language,Language modelling/generation,,,"On the blind test list LMSYS, Yi-Lightning sur...",API access,https://www.lingyiwanwu.com/en https://platfor...,,Yi-Lightning,...,989500000000000,133800000000000.0000,TF16,989500000000000,,,Hardware,,,
2049,Hunyuan-Large,Language,"Language modelling/generation,Question answeri...","Xingwu Sun, Yanfeng Chen, Yiqing Huang, Ruobin...",,,Open weights (restricted use),https://arxiv.org/abs/2411.02265,,Hunyuan-Large: An Open-Source MoE Model with 5...,...,,,FP32,,,,"Operation counting,Other",,,


In [12]:
country_df[~country_df['Country (from Organization)'].str.contains('China')]

Unnamed: 0,System,Domain,Task,Authors,Notability criteria,Notability criteria notes,Model accessibility,Link,Citations,Reference,...,Hardware TF16,Hardware FP16,Assumed precision,Assumed hardware FLOP/s,Hardware type,Compute estimate method,Training compute estimation method,Biological model safeguards,Hardware utilization (temp),BenchmarkHub-v1
130,Theseus,Robotics,Maze solving,Claude Shannon,Historical significance,,,https://www.technologyreview.com/2018/12/19/13...,0.0,Mighty Mouse,...,,,FP32,,,,,,,
131,SNARC,Robotics,Maze solving,Marvin Minsky,Historical significance,,,https://en.wikipedia.org/wiki/Stochastic_neura...,33.0,A Neural-Analogue Calculator Based upon a Prob...,...,,,FP32,,,,,,,
132,Genetic algorithm,Mathematics,Numerical simulation,NA Barricelli,Historical significance,Possibly first computer simulation of a geneti...,,https://link.springer.com/article/10.1007/BF01...,266.0,Numerical testing of evolution theories,...,,,FP32,,,,,,,
133,Sequence-based pattern recognition,Vision,Character recognition,O. G. Selfridge,Historical significance,,,https://dl.acm.org/doi/10.1145/1455292.1455310,290.0,Pattern recognition and modern computers,...,,,FP32,,,,,,,
134,Self Organizing System,Other,Pattern recognition,W. A. Clark and B. G. Farley,Historical significance,,,https://dl.acm.org/doi/10.1145/1455292.1455309,93.0,Generalization of pattern recognition in a sel...,...,,,FP32,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2045,NVLM-H 72B,"Vision,Language","Language modelling/generation,Vision-language ...","Wenliang Dai, Nayeon Lee, Boxin Wang, Zhuolin ...",,,,https://arxiv.org/abs/2409.11402,,NVLM: Open Frontier-Class Multimodal LLMs,...,989500000000000,133800000000000.0000,TF16,989500000000000,,,Operation counting,,,
2046,Aya Expanse 32B,Language,"Language modelling/generation,Translation",,,,Open weights (restricted use),https://cohere.com/blog/aya-expanse-connecting...,,"Cohere For AI launches Aya Expanse, a state-of...",...,,,FP32,,,,Operation counting,,,
2047,Aya Expanse 8B,Language,"Language modelling/generation,Translation",,,,Open weights (restricted use),https://cohere.com/blog/aya-expanse-connecting...,,"Cohere For AI launches Aya Expanse, a state-of...",...,,,FP32,,,,Operation counting,,,
2048,π0 (pi-zero),"Robotics,Vision",Robotic manipulation,"Kevin Black, Noah Brown, Danny Driess, Adnan E...",,,Unreleased,https://www.physicalintelligence.company/downl...,,π0: Our First Generalist Policy,...,,,FP32,,,,,,,


Check if the country is listed.

TODO: try other methods of reducing multiple countries to one country.
- Use the first country listed
- Mutually exclusive (e.g. China but NOT Non-China)

In [13]:
# Check if the country is listed
def assign_country(row):
    if 'China' in row['Country (from Organization)']:
        return 'China'
    else:
        return 'Non-China'
country_df.loc[:, 'Country'] = country_df.apply(assign_country, axis=1)

# Use the first country listed
# country_df['Country'] = country_df['Country (from Organization)'].apply(lambda x: x.split(',')[0].strip())

country_df[['System', 'Country']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  country_df.loc[:, 'Country'] = country_df.apply(assign_country, axis=1)


Unnamed: 0,System,Country
130,Theseus,Non-China
131,SNARC,Non-China
132,Genetic algorithm,Non-China
133,Sequence-based pattern recognition,Non-China
134,Self Organizing System,Non-China
...,...,...
2047,Aya Expanse 8B,Non-China
2048,π0 (pi-zero),Non-China
2049,Hunyuan-Large,China
2050,Qwen2.5-Coder (32B),China


In [14]:
for cat in country_df['Country'].unique():
    if pd.isna(cat):
        print(cat, len(country_df.loc[country_df['Country'].isna()]))
    else:
        print(cat, len(country_df.loc[country_df['Country'] == cat]))

Non-China 1534
China 335


In [15]:
df = country_df

In [16]:
def find_top_models_up_to_release(df, top_n):
    """Find the models which were in the top n by compute when they were released."""
    # This set will keep track of models that were ever in the top 10 at their release
    ever_in_top_n = set()

    # Iterate over each date in the DataFrame
    for current_date in df['date'].unique():
        # Get all entries up to the current date
        historical_data = df[df['date'] <= current_date]
        # Find top 10 models by flop count in this subset
        top_n_models = historical_data.nlargest(top_n, 'flop')['System']
        # Update the set of models that were ever in top n
        ever_in_top_n.update(top_n_models)

    # Return DataFrame filtered to only include models that were ever in the top 10
    return df[df['System'].isin(ever_in_top_n)]


def filter_top_models_within_category(df, top_n, cutoff_date, category):
    """Find the models which were in the top-n by compute when they were released,
    among models in the specified category. The top-n models in the specified category
    are seeded with the overall top-n models before the cutoff date.
    """
    # Filter top-n models within the category, but seeded with overall top-n models
    top_models_df = find_top_models_up_to_release(df, top_n)
    top_n_models_at_cutoff_date_df = top_models_df[top_models_df['date'] <= cutoff_date].nlargest(top_n, 'flop')
    category_df = df[df['category'] == category]

    # This set will keep track of models that were ever in the top 10 at their release
    ever_in_top_n = set()

    # Iterate over each date in the DataFrame
    for current_date in category_df['date'].unique():
        # Get all entries up to the current date
        category_since_cutoff = category_df[(category_df['date'] <= current_date) & (category_df['date'] > cutoff_date)]
        historical_data = pd.concat([category_since_cutoff, top_n_models_at_cutoff_date_df])
        # Find top 10 models by flop count in this subset
        top_n_models_df = historical_data.nlargest(top_n, 'flop')
        # Update the set of models that were ever in top n
        # Filter out the models that aren't in the category
        ever_in_top_n.update(top_n_models_df[top_n_models_df['category'].str.contains(category)]['System'])

    # Return DataFrame filtered to only include models that were ever in the top 10
    new_df = df[df['System'].isin(ever_in_top_n)]
    # Assign the category to the new DataFrame (overwrites cases with both US and China)
    # E.g. if a "USA,China" model is top-10 among models affiliated with China, then it's just "China"
    new_df['category'] = category
    
    return new_df


def filter_top_models_in_both_categories(df, top_n, cutoff_date):
    # Get top models for Open and Closed categories
    top_us_models = filter_top_models_within_category(df, top_n, cutoff_date, category='Non-China')
    top_china_models = filter_top_models_within_category(df, top_n, cutoff_date, category='China')
    # Combine the results
    df_filtered = pd.concat([top_us_models, top_china_models])
    # Sort the combined DataFrame by date
    df_filtered = df_filtered.sort_values('date')
    return df_filtered

In [17]:
df_filtered = (df[['System', 'Training compute (FLOP)', 'Publication date', 'Organization', 'Notability criteria', 'Domain', 'Base model', 'Country']]
    .rename(columns={'Training compute (FLOP)': 'flop', 'Publication date': 'date', 'Country': 'category'})
    .assign(date=lambda x: pd.to_datetime(x['date']), log_flop=lambda x: np.log10(x['flop']))
    .sort_values('date'))

In [18]:
list(df_filtered[df_filtered['Base model'].notna()]['System'])

['BatchNorm',
 'Order embeddings with layer norm',
 'Layer Normalization: The Attentive Reader',
 'Layer Normalization: Skip Thoughts',
 'Layer Normalization: Draw',
 'Layer Normalization: Handwriting sequence generation',
 'ULM-FiT',
 'ADP-FAIRSEQ + NGRAMRES',
 'Fine-tuned-AWD-LSTM-DOC (fin)',
 'Cross-lingual alignment',
 'Theseus 6/768',
 'UnifiedQA',
 'LUKE',
 'GPT-Neo-2.7B (finetuned)',
 'GPT-Neo-2.7B (finetuned on PTB)',
 'Unicorn',
 'Multitask Unified Model (MUM)',
 '$\\infty$-former (SM)',
 'FLAN 137B',
 'AlphaFold-Multimer',
 'T0-XXL',
 'GPT-2 (AMPS)',
 'Masked Autoencoders ViT-H',
 'ViT-G/14 (LiT)',
 'Engine-XL(NE)',
 'HSO',
 'Contriever',
 'Vespa',
 'OntoProtein',
 'InstructGPT',
 'BERT-RBP',
 'Flamingo',
 'Jurassic-X',
 'DeBERTaV3large + KEAR',
 'SimCSE',
 'CogVideo',
 'Minerva (540B)',
 'Delphi',
 'Transformer-XL + RMT',
 'GPT-NeoX-Japanese',
 'BlenderBot 3',
 'PaLM-SayCan',
 'Sparrow',
 'NMST+GPT-2',
 'Decaying Fast Weights Transformer (WT-103)',
 "Instruct-GPT + Mind's Ey

In [19]:
# Add speculative compute estimates based on benchmark imputation and rough guesses
if include_speculative_compute:
    speculative_compute_estimates = {
        "Claude 3.5 Sonnet": 4.72e25,
        "Claude 3 Opus": 1.59e25,
        "Claude 3 Sonnet": 5.51e24,
        "GPT-4o": 3.98e25,
        "Gemini 1.0 Pro": 1.85e24,
        "Gemini 1.5 Pro": 1.60e25,
        "Mistral Large 2": 2.01e25,
        "GPT-4 Turbo": 2.1e25,  # rough guess matching GPT-4
        "GPT-4V": 2.1e25,  # rough guess matching GPT-4
        "Claude 2": 4.33e24,
        "Claude 2.1": 4.33e24,  # rough guess matching Claude 2
    }
    for model, compute in speculative_compute_estimates.items():
        df_filtered.loc[df_filtered["System"] == model, "flop"] = compute
        df_filtered.loc[df_filtered["System"] == model, "log_flop"] = np.log10(compute)

df_filtered.dropna(subset=['flop'], inplace=True)

# Drop Alpha Go Master / Zero
if filter_alphago_outliers:
    mask = (df_filtered["System"] == 'AlphaGo Master') | (df_filtered["System"] == 'AlphaGo Zero')
    df_filtered = df_filtered[~mask]

# Drop finetuned models
if filter_finetuned_models:
    mask = df_filtered['Base model'].isna()
    df_filtered = df_filtered[mask]

top_models_df = find_top_models_up_to_release(df_filtered, top_n)  # For reference

if frontier_selection == 'external':
    # Filter top models before other filters
    df_filtered = filter_top_models_in_both_categories(df_filtered, top_n, top_n_cutoff_date)

if model_selection == 'Language models':
    re = 'Language|Multimodal'
    mask = df_filtered['Domain'].str.contains(re, na=False)
    df_filtered = df_filtered[mask]

if frontier_selection == 'internal':
    # Filter top models after other filters
    df_filtered = filter_top_models_in_both_categories(df_filtered, top_n, top_n_cutoff_date)

# Filter for models after the cutoff date
df_filtered = df_filtered[df_filtered['date'] > cutoff_date]

print(f"{len(df_filtered)}{' top' if frontier_selection != 'disabled' else ''} {top_n} {model_selection} models found")
print(f"They span {df_filtered['date'].min().strftime('%B %Y')} to {df_filtered['date'].max().strftime('%B %Y')}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['category'] = category


109 top 10 Language models models found
They span August 2018 to November 2024


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['category'] = category


In [20]:
if top_n == 1:
    # Remove BIDAF outlier
    df_filtered = df_filtered[df_filtered['System'] != 'BIDAF']

In [21]:
exclude_china = []
# exclude_china = [
#     'genCNN + dyn eval',
#     'R-FCN',
#     'ResNet-200',
#     '2-layer-LSTM+Deep-Gradient-Compression',
# ]
df_filtered = df_filtered[~df_filtered['System'].isin(exclude_china)]

In [22]:
usa_df = df_filtered[df_filtered['category'] == 'Non-China']
china_df = df_filtered[df_filtered['category'] == 'China']
recent_top_models_df = top_models_df[top_models_df['date'] > pd.to_datetime('2010-01-01')]

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=usa_df['date'],
    y=usa_df['log_flop'],
    mode='markers',
    marker=dict(color=colors['Non-China'], opacity=0.5),
    text=usa_df['System'],
    hoverinfo='text',
    name=f'Top-{top_n} Non-China'
))

fig.add_trace(go.Scatter(
    x=china_df['date'],
    y=china_df['log_flop'],
    mode='markers',
    marker=dict(color=colors['China'], opacity=0.5),
    text=china_df['System'],
    hoverinfo='text',
    name=f'Top-{top_n} China'
))

fig.add_trace(go.Scatter(
    x=recent_top_models_df['date'],
    y=recent_top_models_df['log_flop'],
    mode='markers',
    marker=dict(color='grey', opacity=0.5),
    text=recent_top_models_df['System'],
    hoverinfo='text',
    name=f'Top-{top_n} Overall'
))

fig.update_layout(
    width=800,
    height=400,
    xaxis_title='Date',
    yaxis_title='Log FLOP',
    title=f'Top-{top_n} models',
    margin=dict(t=50, l=60, r=60, b=50),
)

save_plot(fig, results_dir, f'top_{top_n}_models_without_kickstarting')

fig.show()

In [23]:
top_models_since_cutoff = top_models_df[top_models_df['date'] >= pd.to_datetime(cutoff_date)]
top_models_set = set(top_models_since_cutoff['System'])
usa_top_models_set = set(usa_df['System'])
china_top_models_set = set(china_df['System'])

frac_usa_top_models = len(usa_top_models_set.intersection(top_models_set)) / len(top_models_set)
frac_china_top_models = len(china_top_models_set.intersection(top_models_set)) / len(top_models_set)
print(f"Fraction of overall top-{top_n} models that are Non-China: {frac_usa_top_models*100:.1f}%")
print(f"Fraction of overall top-{top_n} models that are China: {frac_china_top_models*100:.1f}%")


Fraction of overall top-10 models that are Non-China: 68.0%
Fraction of overall top-10 models that are China: 10.7%


# Regression analysis

In [24]:
dep_var = 'log_flop'

In [25]:
#@markdown Analysis of best fit to the data

@dataclass
class FitResult:
    df: pd.DataFrame
    p: int = None
    bic: float = None
    rss: float = None
    mse: float = None
    predict: Callable = None

@dataclass
class HyperbolicFitResult(FitResult):
    params: tuple[float] = None

@dataclass
class KinkedFitResult(FitResult):
    break_points: tuple[float] = None
    break_points_dt: float = None
    oom_year_slopes: tuple[float] = None

    # Model properties for each breakpoint combination
    # (for debugging)
    bics: tuple[float] = None
    rsss: tuple[float] = None
    mses: tuple[float] = None
    break_points_list: tuple[tuple[float]] = None
    break_points_dt_list: tuple[tuple[float]] = None

def fit_hyperbolic(df):
    def hyperbolic_model(t, A, B, k):
        return A / (1 + B * np.exp(-k * t))

    # Prepare data for curve fitting
    timestamp = pd.to_datetime(df['date']).apply(lambda date: date.toordinal()).values

    # Initial guess for the parameters
    # initial_guess = [0, 0, 0]
    initial_guess = [1.72373207e-02, -9.45447534e-01, -7.50101861e-08]  # Updated initial guess

    # Fit the model to the data
    try:
      params, covariance = curve_fit(hyperbolic_model, timestamp, df[dep_var], p0=initial_guess, maxfev=100000, ftol=1e-10)
    except RuntimeError as e:
      print("FATAL ERROR WHEN FITTING HYPERBOLIC")
      return None

    # Extracting parameters
    A, B, k = params

    # Compute predictions to calculate residuals
    predicted_log_y = hyperbolic_model(timestamp, *params)

    # Compute the Residual Sum of Squares (RSS)
    rss = np.sum((df[dep_var] - predicted_log_y) ** 2)

    # Number of observations (n)
    n = len(df[dep_var])

    # Number of parameters (p)
    p = len(params) + 1

    # Calculate log-likelihood under the assumption of normally distributed errors
    # log_likelihood = -0.5 * rss
    log_likelihood = -0.5 * n * (np.log(2 * np.pi * rss/n) + 1)

    # Compute bic_hyperbolic using the provided formula
    bic = p * np.log(n) - 2 * log_likelihood

    # Compute MSE
    mse = rss / n

    fit_result = HyperbolicFitResult(
        df=df,
        p=p,
        bic=bic,
        rss=rss,
        mse=mse,
        params=params,
        predict=lambda date: hyperbolic_model(date.apply(lambda d: d.toordinal()), *params)
    )

    return fit_result

def fit_n_phase_exponential(df, kink_count=0, allow_discontinuities=False, min_n_segment=10):
    # Generate monthly breakpoints between 2010 and 2024
    one_month = pd.DateOffset(months=1)
    break_point_grid = pd.date_range(start=df['date'].min() - one_month, end=df['date'].max() - 4*one_month, freq='MS')
    break_point_grid = [x.toordinal() for x in break_point_grid]

    x = pd.to_datetime(df['date']).apply(lambda date: date.toordinal()).values
    y = df[dep_var].values

    break_points_list = []
    bics = []
    rsss = []
    mses = []
    models = []

    for break_points in combinations_with_replacement(break_point_grid, kink_count):
        # Model predictors

        intercept_change_points = (0,)
        if allow_discontinuities:
            intercept_change_points += break_points
        slope_change_points = (0,) + break_points

        predictors = np.zeros((len(x), len(intercept_change_points) + len(slope_change_points)))

        for i, intercept_point in enumerate(intercept_change_points):
            predictors[:, i] = (x >= intercept_point).astype(int)

        for i, break_point in enumerate(slope_change_points):
            predictors[:, len(intercept_change_points) + i] = np.maximum(x - break_point, 0)

        # Fit the model
        model = sm.OLS(y, predictors).fit()

        # Calculate BIC manually based on log-likelihood
        n = len(x) # Number of observations
        p = len(model.params) + 2*kink_count + 1 # Number of parameters

        # Calculate log-likelihood under the assumption of normally distributed errors
        # We have to iterate over all points to get their individual log-likelihoods
        log_likelihood = 0
        rss = 0
        invalid_model = False # Discard models with segments with less than 2 points
        for i, break_point in enumerate(slope_change_points):
            left_x = break_point
            right_x = slope_change_points[i + 1] if i + 1 < len(slope_change_points) else np.inf

            segment_predictors = predictors[(left_x <= x) & (x < right_x), :]
            segment_y = y[(left_x <= x) & (x < right_x)]
            segment_n = len(segment_y)

            assert min_n_segment > 2

            if segment_n < min_n_segment:
                invalid_model = True
                break

            y_pred = model.predict(segment_predictors)

            segment_rss = np.sum((y_pred - segment_y)**2)
            if segment_rss == 0:
                print(f"segment_rss={segment_rss}")
                print(f"y_pred={y_pred}")
                print(f"segment_y={segment_y}")
                invalid_model = True
                break
            segment_mse = segment_rss / segment_n

            segment_log_likelihood = -segment_n/2 * (np.log(2*np.pi) + np.log(segment_rss/segment_n) + 1)
            log_likelihood += segment_log_likelihood
            rss += segment_rss

        if invalid_model:
            continue

        # Compute BIC using the manual method based on the log-likelihood
        bic = p * np.log(n) - 2 * log_likelihood
        # bic = n*np.log(rss/n) + p*np.log(n)

        bics.append(bic)
        rsss.append(rss)
        mses.append(rss/len(df))
        models.append(model)
        break_points_list.append(break_points)

    # Prepare the result object
    best_bic = min(bics)
    best_idx = bics.index(best_bic)
    best_rss = rsss[best_idx]
    best_mse = mses[best_idx]
    best_model = models[best_idx]
    best_break_points = break_points_list[best_idx]

    p = len(best_model.params) + 2*kink_count + 1 # Number of parameters

    intercept_change_points = (0,)
    if allow_discontinuities:
        intercept_change_points += best_break_points
    slope_change_points = (0,) + best_break_points

    intercepts = best_model.params[:len(intercept_change_points)]
    oom_year_slopes = 365 * np.cumsum(best_model.params[len(intercepts):])

    def predict(date):
        if not isinstance(date, pd.Series):
            date = pd.Series(date)
        x = pd.to_datetime(date).apply(lambda date: date.toordinal()).values

        predictors = np.zeros((len(x), len(intercept_change_points) + len(slope_change_points)))

        for i, intercept_point in enumerate(intercept_change_points):
            predictors[:, i] = (x >= intercept_point).astype(int)

        for i, break_point in enumerate(slope_change_points):
            predictors[:, len(intercept_change_points) + i] = np.maximum(x - break_point, 0)

        return best_model.predict(predictors)

    fit_result = KinkedFitResult(
        df=df,
        p=p,
        bic=best_bic,
        rss=best_rss,
        mse=best_mse,
        break_points=best_break_points,
        predict=predict,
        break_points_dt=[pd.Timestamp.fromordinal(bp) for bp in best_break_points],
        bics=bics,
        rsss=rsss,
        mses=mses,
        oom_year_slopes=oom_year_slopes,
        break_points_list=break_points_list,
        break_points_dt_list=[[pd.Timestamp.fromordinal(bp) for bp in break_points] for break_points in break_points_list],
    )

    return fit_result

def calculate_lag(df, fit_results, date=None):
    if date is None:
        date = df['date'].max()

    # Get the predictions for the two categories
    y_usa = fit_results['Non-China'].predict(pd.Series([date]))[0]
    y_china = fit_results['China'].predict(pd.Series([date]))[0]
    
    # Get the final slope for the 'China' category
    slope_usa = fit_results['Non-China'].oom_year_slopes[-1]
    
    # Calculate lag
    lag = (y_usa - y_china) / slope_usa
    
    return lag


## Plot predictions

In [26]:
# Graph of the different model fits using plotly

# Regression parameters for each category
params = {
    'Non-China': {
        'kink_count': 0,
        'allow_discontinuities': False,
    },
    'China': {
        'kink_count': 1,
        'allow_discontinuities': False,
    }
}

def plot_model(df, params):
    fig = go.Figure()

    # Plot the original data points
    df_usa = df[df['category'] == 'Non-China']
    df_china = df[df['category'] == 'China']

    fig.add_trace(go.Scatter(
        x=df_usa['date'], y=df_usa['log_flop'],
        mode='markers', name='Not developed in China', text=df_usa['System'],
        marker=dict(color=colors['Non-China'], opacity=0.3, size=10)
    ))
    fig.add_trace(go.Scatter(
        x=df_china['date'], y=df_china['log_flop'],
        mode='markers', name='Developed in China', text=df_china['System'],
        marker=dict(color=colors['China'], opacity=0.3, size=10)
    ))

    fit_results = {}
    for category, params in params.items():
        fit_result = fit_n_phase_exponential(df[df['category'] == category], **params)
        fit_results[category] = fit_result

    # Plot the fit lines
    usa_date_grid = pd.date_range(start=usa_df['date'].min(), end=usa_df['date'].max(), freq='D')
    china_date_grid = pd.date_range(start=china_df['date'].min(), end=china_df['date'].max(), freq='D')
    log_flop_usa = fit_results['Non-China'].predict(pd.Series(usa_date_grid))
    log_flop_china = fit_results['China'].predict(pd.Series(china_date_grid))
    usa_trend_df = pd.DataFrame({
        'date': usa_date_grid,
        'log_flop': log_flop_usa,
    })
    china_trend_df = pd.DataFrame({
        'date': china_date_grid,
        'log_flop': log_flop_china,
    })

    fig.add_trace(go.Scatter(
        x=usa_date_grid, y=log_flop_usa,
        mode='lines', name='Best fit line (Non-China)',
        line=dict(color=colors['Non-China'], dash='dash')
    ))
    fig.add_trace(go.Scatter(
        x=china_date_grid, y=log_flop_china,
        mode='lines', name='Best fit line (China)',
        line=dict(color=colors['China'], dash='dash')
    ))

    # Add slope labels
    for category in ['Non-China', 'China']:
        category_df = df[df['category'] == category]
        points = [category_df['date'].min()] + fit_results[category].break_points_dt + [category_df['date'].max()]
        # print(fit_results[category].oom_year_slopes)
        for i in range(len(points) - 2, len(points) - 1):
            mid = points[i] + (points[i+1] - points[i]) / 2
            y = fit_results[category].predict(pd.Series([mid]))[0]
            fig.add_annotation(
                x=mid, y=y + 1.2 * (1.5 if category == 'Non-China' else -1.5),
                text=f'{10**fit_results[category].oom_year_slopes[i]:0.1f}x/year',
                showarrow=False,
                font=dict(size=12, color=colors[category])
            )

    # Plot horizontal line segment showing the lag
    # lag_months = calculate_lag(df, fit_results) * 12  # Convert years to months
    # end_date = china_df['date'].max()
    # start_date = end_date - pd.DateOffset(days=int(lag_months * 30.4375))  # Approximate months to days conversion
    # y_value = fit_results['China'].predict(pd.Series([end_date]))[0]
    # fig.add_annotation(
    #     x=end_date, y=y_value,
    #     ax=start_date, ay=y_value,
    #     xref='x', yref='y',
    #     axref='x', ayref='y',
    #     showarrow=True,
    #     arrowhead=2,
    #     arrowsize=1,
    #     arrowwidth=2,
    #     arrowcolor='black',
    # )
    # fig.add_annotation(
    #     x=start_date, y=y_value,
    #     ax=end_date, ay=y_value,
    #     xref='x', yref='y',
    #     axref='x', ayref='y',
    #     showarrow=True,
    #     arrowhead=2,
    #     arrowsize=1,
    #     arrowwidth=2,
    #     arrowcolor='black',
    # )
    # fig.add_annotation(
    #     # x=(start_date + (end_date - start_date) * 0.5), 
    #     x=pd.to_datetime('2022-09-01'),
    #     y=y_value+0.65,
    #     text=f'Lag: {lag_months:.0f} months',
    #     showarrow=False,
    # )

    # Annotate some key models with text
    
    # key_models = ['GPT-4'] if (exclude_big_llama or exclude_all_llamas) else ['GPT-4', 'Llama 3.1-405B']
    # for model_name in key_models:
    #     model_row = df_filtered[df_filtered['System'] == model_name]
    #     fig.add_annotation(
    #         x=model_row['date'].iloc[0], y=model_row['log_flop'].iloc[0],
    #         text=model_name,
    #         showarrow=True,
    #         font=dict(size=12, color='black'),
    #         xanchor='right', yanchor='bottom'
    #     )

    # Update layout
    title = f'Compute trends for top-{top_n} language models inside and outside China'
    fig.update_layout(
        template='plotly_white',
        width=800,
        height=400,
        title=title,
        xaxis_title='Model publication date',
        yaxis_title='Training compute (FLOP)',
        legend_title='',
        # legend=dict(
        #     x=0.75,
        #     y=0.05
        # ),
        margin=dict(l=10, r=10, t=40, b=10),
        # xaxis=dict(
        #     tickformat='%Y',
        #     dtick='M12',
        # ),
        yaxis=dict(
            tickmode='array',
            tickvals=list(range(int(df['log_flop'].min()), int(df['log_flop'].max())+2, 2)),
            ticktext=[f'10<sup>{i}</sup>' for i in range(int(df['log_flop'].min()), int(df['log_flop'].max())+2, 2)]
        )
    )

    if save:
        fname = f'compute_regression_{model_selection}_frontier={frontier_selection}_top{top_n}_cutoff={cutoff_date}'
        save_plot(fig, results_dir, fname)
        df_usa[['System', 'date', 'log_flop']].to_csv(results_dir + f'plot_data/{fname}_usa.csv', index=False)
        df_china[['System', 'date', 'log_flop']].to_csv(results_dir + f'plot_data/{fname}_china.csv', index=False)
        usa_trend_df[['date', 'log_flop']].to_csv(results_dir + f'plot_data/{fname}_usa_trend.csv', index=False)
        china_trend_df[['date', 'log_flop']].to_csv(results_dir + f'plot_data/{fname}_china_trend.csv', index=False)

    for category in ['Non-China', 'China']:
        print(category)
        fit_result = fit_results[category]
        simple_fit = fit_n_phase_exponential(df[df['category'] == category], 0)
        bayes_factor = np.exp(-0.5 * (fit_result.bic - simple_fit.bic))
        unadjusted_bayes_factor = np.exp(-0.5 * (fit_result.bic - (simple_fit.bic + 2*np.log(len(df[df['category'] == category])))))

        print(f"BIC score: {fit_result.bic}")
        bic_score_difference = fit_result.bic - simple_fit.bic
        if bic_score_difference > 0:
            print(f"The simple exponential is preferred over this fit by a BIC score difference of {fit_result.bic - simple_fit.bic}")
        if bic_score_difference < 0:
            print(f"This fit is preferred over a simple exponential by a BIC score difference of {-(fit_result.bic - simple_fit.bic)}")

    fig.show()

    return fit_result

fit_result = plot_model(df_filtered, params)

# To test a bootstrap sample:
# sample = df_filtered.sample(len(df_filtered), replace=True, random_state=DEFAULT_RNG)
# sample = sample.sort_values('date')
# fit_result = plot_model(sample, params)

Non-China
BIC score: 66.50322739417308
China
BIC score: 133.1695745302169
This fit is preferred over a simple exponential by a BIC score difference of 46.49529102111882


In [27]:
# Regression parameters for each category
params = {
    'Non-China': {
        'kink_count': 0,
        'allow_discontinuities': False,
    },
    'China': {
        'kink_count': 1,
        'allow_discontinuities': False,
    }
}

def calculate_all_lags(df, params):
    fit_results = {}
    for category, params in params.items():
        fit_result = fit_n_phase_exponential(df[df['category'] == category], **params)
        fit_results[category] = fit_result

    date_range = pd.date_range(start='2021-01-01', end='2025-01-01', freq='MS')
    for date in date_range:
        lag_months = calculate_lag(df, fit_results, date) * 12
        print(f"Lag at {date}: {lag_months:.0f} months")

calculate_all_lags(df_filtered, params)

Lag at 2021-01-01 00:00:00: 39 months
Lag at 2021-02-01 00:00:00: 35 months
Lag at 2021-03-01 00:00:00: 32 months
Lag at 2021-04-01 00:00:00: 28 months
Lag at 2021-05-01 00:00:00: 25 months
Lag at 2021-06-01 00:00:00: 21 months
Lag at 2021-07-01 00:00:00: 17 months
Lag at 2021-08-01 00:00:00: 14 months
Lag at 2021-09-01 00:00:00: 10 months
Lag at 2021-10-01 00:00:00: 7 months
Lag at 2021-11-01 00:00:00: 7 months
Lag at 2021-12-01 00:00:00: 7 months
Lag at 2022-01-01 00:00:00: 8 months
Lag at 2022-02-01 00:00:00: 8 months
Lag at 2022-03-01 00:00:00: 9 months
Lag at 2022-04-01 00:00:00: 9 months
Lag at 2022-05-01 00:00:00: 9 months
Lag at 2022-06-01 00:00:00: 10 months
Lag at 2022-07-01 00:00:00: 10 months
Lag at 2022-08-01 00:00:00: 11 months
Lag at 2022-09-01 00:00:00: 11 months
Lag at 2022-10-01 00:00:00: 12 months
Lag at 2022-11-01 00:00:00: 12 months
Lag at 2022-12-01 00:00:00: 12 months
Lag at 2023-01-01 00:00:00: 13 months
Lag at 2023-02-01 00:00:00: 13 months
Lag at 2023-03-01 00

## Model selection

In [28]:
fit_em_all = lambda df_fit : {
    "Simple" : fit_n_phase_exponential(df_fit, kink_count=0),
    "One kink" : fit_n_phase_exponential(df_fit, kink_count=1),
    "Discontinuity" : fit_n_phase_exponential(df_fit, kink_count=1, allow_discontinuities=True),
    # "Hyperbolic": fit_hyperbolic(df_fit),
}

# Best model fits
category = 'Non-China'
print(f"Fitting {category} models")
models = fit_em_all(df_filtered[df_filtered['category'] == category])

Fitting Non-China models


In [29]:
# K-Fold Cross Validation
def perform_cross_validation(df, k=10, random_state=42):
    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)
    folds_mses = defaultdict(lambda : [])
    for train_index, test_index in kf.split(df):
        train_df, test_df = df.iloc[train_index], df.iloc[test_index]

        # Fit the models on the training set
        fold_models = fit_em_all(train_df)

        # Predict on the test set
        for name,model in fold_models.items():
            try:
                predicted_log_y = model.predict(test_df["date"])
            except AttributeError:
                continue
            test_rss = np.sum((predicted_log_y - test_df[dep_var])**2)
            test_mse = test_rss / len(test_df)
            folds_mses[name].append(test_mse)

    # Compute mean MSE
    folds_mses = {name: np.mean(folds_mses[name]) for name in folds_mses}

    return folds_mses

folds_mses = perform_cross_validation(df_filtered[df_filtered['category'] == category])

In [30]:
# Bootstrap
bootstrap_sample_size = 1000
bootstrap_bics = defaultdict(lambda : [])
bootstrap_mses = defaultdict(lambda : [])
bootstrap_bic_score_diff = defaultdict(lambda : [])
bootstrap_slopes = defaultdict(lambda : [])
bootstrap_breaks = defaultdict(lambda : [])
for bootstrap_index in tqdm(range(bootstrap_sample_size)):
    sample = df_filtered.sample(len(df_filtered), replace=True, random_state=DEFAULT_RNG)
    sample = sample[sample['category'] == category]
    sample = sample.sort_values('date')

    # Compute BICs
    boot_models = fit_em_all(sample)

    # Compute K fold validation
    boot_folds_mses = perform_cross_validation(sample)

    # Store results
    for name, model in boot_models.items():
        # It might be None if the hyperbolic fails to fit
        if model is None: continue

        bootstrap_bics[name].append(model.bic)
        bootstrap_mses[name].append(boot_folds_mses[name])
        bootstrap_bic_score_diff[name].append(model.bic - boot_models["Simple"].bic)

        if isinstance(model, KinkedFitResult):
            if (len(model.oom_year_slopes) > 0): bootstrap_slopes[name].append(10**model.oom_year_slopes[-1])
            if (len(model.break_points_dt) > 0): bootstrap_breaks[name].append(model.break_points_dt[-1])

100%|██████████| 1000/1000 [02:18<00:00,  7.24it/s]


In [31]:
ci_width = 0.90
qs = [(1 - ci_width)/2, (1 + ci_width)/2]
bootstrap_preferred_percent = {}
for name in models:
    bootstrap_preferred_percent[name] = np.mean(np.array(bootstrap_bic_score_diff[name])<0)
    bootstrap_bics[name] = np.quantile(np.array(bootstrap_bics[name]), qs)
    bootstrap_mses[name] = np.quantile(np.array(bootstrap_mses[name]), qs)
    bootstrap_bic_score_diff[name] = np.quantile(np.array(bootstrap_bic_score_diff[name]), qs)
    try:
        bootstrap_slopes[name] = np.quantile(np.array(bootstrap_slopes[name]), qs)
        bootstrap_breaks[name] = np.quantile(np.array(bootstrap_breaks[name]), qs)
    except IndexError:
        pass
#@markdown Models with lower BIC score / MSE are preferred.

results = []
for name, model in models.items():
    param_count = model.p
    log_likelihood = (np.log(len(df_filtered))*param_count - model.bic)/2

    param_count_simple = models['Simple'].p
    log_likelihood_simple = (np.log(len(df_filtered))*param_count_simple - models['Simple'].bic)/2

    c2 = chi2.sf(2*(log_likelihood - log_likelihood_simple), df=(param_count - param_count_simple))

    result = {
        "Model": name,
        "BIC" : np.round(model.bic, 2),
        "BIC 90% CI" : np.round(bootstrap_bics[name], 2),
        #"Parameter count": param_count,
        #"Log likelihood": np.round((np.log(len(df_filtered))*param_count - model.bic)/2),
        # "MSE" : model.mse,
        "BIC score diff": np.round(model.bic - models["Simple"].bic, 2),
        "BIC score diff 90% CI": np.round(bootstrap_bic_score_diff[name], 2),
        "Xi²": c2,
        "% times preferred over simple": f"{bootstrap_preferred_percent[name]:.0%}",
        # "bayes factor over simple" : np.exp(-0.5 * (model.bic - models["simple"].bic)),
        "K-fold mean MSE" : np.round(folds_mses[name], 2),
        "K-fold mean MSE 90% CI" : np.round(bootstrap_mses[name], 2),
    }

    try:
        result["Recent slope (Nx/year)"] = np.round(10**model.oom_year_slopes[-1], 2)
        result["Recent slope 90% CI"] = np.round(bootstrap_slopes[name], 2)
        result["Break point"] = model.break_points_dt[-1].strftime('%Y-%m')
        result["Break point 90% CI"] = [date.strftime('%Y-%m') for date in bootstrap_breaks[name]]
    except (AttributeError, IndexError):
        pass
    results.append(result)

results_df = pd.DataFrame(results)

# bayes_factor = np.exp(-0.5 * (kinked_fit.bic - simple_fit.bic))

print("Results")
results_df

Results


Unnamed: 0,Model,BIC,BIC 90% CI,BIC score diff,BIC score diff 90% CI,Xi²,% times preferred over simple,K-fold mean MSE,K-fold mean MSE 90% CI,Recent slope (Nx/year),Recent slope 90% CI,Break point,Break point 90% CI
0,Simple,66.5,"[42.32, 82.59]",0.0,"[0.0, 0.0]",,0%,0.17,"[0.12, 0.23]",4.41,"[3.88, 4.98]",,
1,One kink,67.16,"[30.76, 80.91]",0.66,"[-32.0, 8.45]",0.003814,59%,0.16,"[0.11, 0.23]",3.77,"[3.04, 10.16]",2020-02,"[2019-10, 2024-01]"
2,Discontinuity,61.13,"[22.46, 77.6]",-5.37,"[-39.16, 6.95]",7.5e-05,74%,0.17,"[0.1, 0.23]",4.12,"[1.0, 15.13]",2020-02,"[2019-11, 2023-12]"


In [32]:
# Save results_df
regression_fname = f'compute_regression_analysis_{category}_{model_selection}_frontier={frontier_selection}_top{top_n}_cutoff={cutoff_date}.csv'
results_df.to_csv(os.path.join(results_dir, regression_fname), index=False)