# Setup

In [22]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [23]:
from collections import defaultdict
from collections.abc import Callable
from dataclasses import dataclass
import json
from itertools import combinations_with_replacement
import numpy as np
import os
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from scipy.stats import chi2
from sklearn.model_selection import KFold
import statsmodels.api as sm
from tqdm import tqdm

from data import *
from plotting import *
from regression import *
from utils import *

# Parameters

In [24]:
# Llama ablation flags
exclude_big_llama = False  # Whether to exclude Llama 3.1-405B
exclude_all_llamas = False  # Whether to exclude all Llamas
include_llama_4 = False  # Whether to include hypothetical Llama 4
# 'external': Filter to the top n models overall
# 'internal': Filter to the top n models within 'Open' and 'Closed' categories
# 'disabled': No filtering
frontier_selection = 'external'  # ['disabled', 'internal', 'external']
top_n = 1  # Filter to the top n models by training compute at time of release
model_selection = 'All models'  # ['All models', 'Language models', 'Google DeepMind models', 'OpenAI models', 'Meta AI models']
filter_alphago_outliers = True  # Whether to filter out AlphaGo Master and AlphaGo Zero
filter_finetuned_models = True  # Whether to filter out separate finetuned models (base + finetuned models are still included if there is no separate base model in our dataset)
include_speculative_compute = True  # Whether to include speculative compute estimates that rely on benchmark imputation and rough guesses
cutoff_date = '2018-01-01'  # When to start the regressions from
top_n_cutoff_date = '2018-01-01'  # When to split the top-n filtering into open and closed categories - set to e.g. 2010 to turn off the "kickstarting"

In [25]:
results_dir = 'results/compute/14Oct/'
os.makedirs(results_dir, exist_ok=True)

In [26]:
def save_plot(fig, folder, filename, extensions=['png', 'svg', 'pdf'], scale=2):
    prefix = ''
    if exclude_all_llamas:
        prefix = 'all_llamas_excluded_'
    elif exclude_big_llama:
        prefix = 'big_llama_excluded_'
        
    for ext in extensions:
        fig.write_image(folder + prefix + filename + '.' + ext, scale=scale)
    fig.write_html(folder + prefix + filename + '.html')

In [27]:
colors = {'open': '#1f77b4', 'closed': '#ff7f0e'}

# Data preparation

In [28]:
# Load data
pcd_df = load_pcd_df()

In [29]:
pcd_df

Unnamed: 0,System,Domain,Task,Authors,Notability criteria,Notability criteria notes,Model accessibility,Link,Citations,Reference,...,Hardware TF32,Hardware count,Hardware TF16,Hardware FP16,Assumed precision,Assumed hardware FLOP/s,TEMP,Hardware type,Compute estimate method,Training compute estimation method
0,babbage-002,Language,Language modelling,,,,,,,,...,,0,,,FP32,,0,,,
1,tts-1,Speech,Text to Speech,,,,,,,,...,,0,,,FP32,,0,,,
2,tts-1-hd,Speech,Text to Speech,,,,,,,,...,,0,,,FP32,,0,,,
3,Suno Music Generation,Audio,Audio generation,,,,Hosted access (no API),https://suno.com/about,,,...,,0,,,FP32,,0,,,
4,CoPRA,,,,,,,,,,...,,0,,,FP32,,0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1744,Genesis,Biology,Protein design,"Zander Harteveld, Alexandra Van Hall-Beauvais,...",,,,https://www.cell.com/cell-systems/fulltext/S24...,,Exploring “dark-matter” protein folds using de...,...,,0,,,FP32,,1,,,
1745,SO3LR,Biology,"Protein folding prediction,Molecular simulation","Adil Kabylda, J. Thorben Frank, Sergio Suarez ...",,,,https://chemrxiv.org/engage/chemrxiv/article-d...,,Molecular Simulations with a Pretrained Neural...,...,,0,,,FP32,,1,,,
1746,ProCALM,Biology,"Proteins,Protein generation","Jason Yang, Aadyot Bhatnagar, Jeffrey A. Ruffo...",,,,https://www.arxiv.org/abs/2410.03634,,Conditional Enzyme Generation Using Protein La...,...,,0,,,FP32,,1,,,
1747,SCUBA-D,Biology,Protein design,"Yufeng Liu, Sheng Wang, Jixin Dong, Linghui Ch...",,,,https://www.nature.com/articles/s41592-024-024...,,De novo protein design with a denoising diffus...,...,,0,,,FP32,,1,,,


In [30]:
pcd_df.loc[pcd_df['System'] == 'Megatron-BERT']['Model accessibility']

698    Unreleased
Name: Model accessibility, dtype: object

In [31]:
access_df = pcd_df.dropna(subset=['Publication date'])
len(access_df)

1741

In [32]:
# Ablate Llamas
llama_3_405_row = access_df[access_df['System'] == 'Llama 3.1-405B']
llama_3_405_compute = llama_3_405_row['Training compute (FLOP)'].iloc[0]
if include_llama_4:
    # Append hypothetical Llama 4 to DataFrame
    access_df = pd.concat([access_df, pd.DataFrame({
        'System': ['Llama 4 (hypothetical)'],
        'Training compute (FLOP)': [llama_3_405_compute * 10],
        'Publication date': [pd.Timestamp('2025-07-23')],
        'Model accessibility': ['Open access (restricted use)'],
        'Organization': ['Meta AI'],
        'Domain': 'Language',
    })])
if exclude_big_llama:
    access_df = access_df[~((access_df['System'].str.contains('Llama', case=False)) & (access_df['System'].str.contains('405')))]
if exclude_all_llamas:
    access_df = access_df[~access_df['System'].str.contains('Llama', case=False)]
    
len(access_df)

1741

In [33]:
access_df['Model accessibility'].unique()

array([nan, 'Unreleased', 'Open access (unrestricted)',
       'Hosted access (no API)', 'Open access (non-commercial)',
       'API access', 'Open access (restricted use)'], dtype=object)

In [34]:
for cat in access_df['Model accessibility'].unique():
    if pd.isna(cat):
        print(cat, len(access_df.loc[access_df['Model accessibility'].isna()]))
    else:
        print(cat, len(access_df.loc[access_df['Model accessibility'] == cat]))

nan 654
Unreleased 487
Open access (unrestricted) 333
Hosted access (no API) 28
Open access (non-commercial) 95
API access 66
Open access (restricted use) 78


In [35]:
open_access_categories = ['Open access (unrestricted)', 'Open access (restricted use)', 'Open access (non-commercial)']
closed_access_categories = ['API access', 'Hosted access (no API)', 'Unreleased']

In [36]:
def get_access_label(access_category):
    if pd.isna(access_category):
        return 'Unknown'
    elif access_category in open_access_categories:
        return 'Open'
    elif access_category in closed_access_categories:
        return 'Closed'
    else:
        return 'Unknown'

# Add column with binary access label
access_df.loc[:, 'Model open/closed'] = access_df['Model accessibility'].apply(
    lambda x: get_access_label(x)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  access_df.loc[:, 'Model open/closed'] = access_df['Model accessibility'].apply(


In [37]:
access_df

Unnamed: 0,System,Domain,Task,Authors,Notability criteria,Notability criteria notes,Model accessibility,Link,Citations,Reference,...,Hardware count,Hardware TF16,Hardware FP16,Assumed precision,Assumed hardware FLOP/s,TEMP,Hardware type,Compute estimate method,Training compute estimation method,Model open/closed
8,Theseus,Robotics,Maze solving,Claude Shannon,Historical significance,,,https://www.technologyreview.com/2018/12/19/13...,0.0,Mighty Mouse,...,0,,,FP32,,0,,,,Unknown
9,SNARC,Robotics,Maze solving,Marvin Minsky,Historical significance,,,https://en.wikipedia.org/wiki/Stochastic_neura...,33.0,A Neural-Analogue Calculator Based upon a Prob...,...,0,,,FP32,,0,,,,Unknown
10,Genetic algorithm,Mathematics,Numerical simulation,NA Barricelli,Historical significance,Possibly first computer simulation of a geneti...,,https://link.springer.com/article/10.1007/BF01...,266.0,Numerical testing of evolution theories,...,0,,,FP32,,0,,,,Unknown
11,Sequence-based pattern recognition,Vision,Character recognition,O. G. Selfridge,Historical significance,,,https://dl.acm.org/doi/10.1145/1455292.1455310,290.0,Pattern recognition and modern computers,...,0,,,FP32,,0,,,,Unknown
12,Self Organizing System,Other,Pattern recognition,W. A. Clark and B. G. Farley,Historical significance,,,https://dl.acm.org/doi/10.1145/1455292.1455309,93.0,Generalization of pattern recognition in a sel...,...,0,,,FP32,,0,,,,Unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1744,Genesis,Biology,Protein design,"Zander Harteveld, Alexandra Van Hall-Beauvais,...",,,,https://www.cell.com/cell-systems/fulltext/S24...,,Exploring “dark-matter” protein folds using de...,...,0,,,FP32,,1,,,,Unknown
1745,SO3LR,Biology,"Protein folding prediction,Molecular simulation","Adil Kabylda, J. Thorben Frank, Sergio Suarez ...",,,,https://chemrxiv.org/engage/chemrxiv/article-d...,,Molecular Simulations with a Pretrained Neural...,...,0,,,FP32,,1,,,,Unknown
1746,ProCALM,Biology,"Proteins,Protein generation","Jason Yang, Aadyot Bhatnagar, Jeffrey A. Ruffo...",,,,https://www.arxiv.org/abs/2410.03634,,Conditional Enzyme Generation Using Protein La...,...,0,,,FP32,,1,,,,Unknown
1747,SCUBA-D,Biology,Protein design,"Yufeng Liu, Sheng Wang, Jixin Dong, Linghui Ch...",,,,https://www.nature.com/articles/s41592-024-024...,,De novo protein design with a denoising diffus...,...,0,,,FP32,,1,,,,Unknown


In [38]:
print('Closed', len(access_df[access_df['Model open/closed'] == 'Closed']))
print('Open', len(access_df[access_df['Model open/closed'] == 'Open']))
print('Unknown', len(access_df[access_df['Model open/closed'] == 'Unknown']))

Closed 581
Open 506
Unknown 654


In [39]:
df = access_df

In [40]:
def find_top_models_up_to_release(df, top_n):
    """Find the models which were in the top n by compute when they were released."""
    # This set will keep track of models that were ever in the top 10 at their release
    ever_in_top_n = set()

    # Iterate over each date in the DataFrame
    for current_date in df['date'].unique():
        # Get all entries up to the current date
        historical_data = df[df['date'] <= current_date]
        # Find top 10 models by flop count in this subset
        top_n_models = historical_data.nlargest(top_n, 'flop')['System']
        # Update the set of models that were ever in top n
        ever_in_top_n.update(top_n_models)

    # Return DataFrame filtered to only include models that were ever in the top 10
    return df[df['System'].isin(ever_in_top_n)]


def filter_top_models_within_category(df, top_n, cutoff_date, category):
    """Find the models which were in the top-n by compute when they were released,
    among models in the specified category. The top-n models in the specified category
    are seeded with the overall top-n models before the cutoff date.
    """
    # Filter top-n models within the category, but seeded with overall top-n models
    top_models_df = find_top_models_up_to_release(df, top_n)
    top_n_models_at_cutoff_date_df = top_models_df[top_models_df['date'] <= cutoff_date].nlargest(top_n, 'flop')
    category_df = df[df['category'] == category]

    # This set will keep track of models that were ever in the top 10 at their release
    ever_in_top_n = set()

    # Iterate over each date in the DataFrame
    for current_date in category_df['date'].unique():
        # Get all entries up to the current date
        category_since_cutoff = category_df[(category_df['date'] <= current_date) & (category_df['date'] > cutoff_date)]
        historical_data = pd.concat([category_since_cutoff, top_n_models_at_cutoff_date_df])
        # Find top 10 models by flop count in this subset
        top_n_models_df = historical_data.nlargest(top_n, 'flop')
        # Update the set of models that were ever in top n
        # Filter out the models that aren't in the category
        ever_in_top_n.update(top_n_models_df[top_n_models_df['category'] == category]['System'])

    # Return DataFrame filtered to only include models that were ever in the top 10
    return df[df['System'].isin(ever_in_top_n)]


def filter_top_models_in_both_categories(df, top_n, cutoff_date):
    # Get top models for Open and Closed categories
    top_open_models = filter_top_models_within_category(df, top_n, cutoff_date, category='Open')
    top_closed_models = filter_top_models_within_category(df, top_n, cutoff_date, category='Closed')
    # Combine the results
    df_filtered = pd.concat([top_open_models, top_closed_models])
    # Sort the combined DataFrame by date
    df_filtered = df_filtered.sort_values('date')
    return df_filtered

In [41]:
df_filtered = (df[['System', 'Training compute (FLOP)', 'Publication date', 'Organization', 'Notability criteria', 'Domain', 'Base model', 'Model open/closed']]
    .rename(columns={'Training compute (FLOP)': 'flop', 'Publication date': 'date', 'Model open/closed': 'category'})
    .assign(date=lambda x: pd.to_datetime(x['date']), log_flop=lambda x: np.log10(x['flop']))
    .sort_values('date'))
list(df_filtered[df_filtered['Base model'].notna()]['System'])

['BatchNorm',
 'SSD',
 'Layer Normalization: Handwriting sequence generation',
 'Layer Normalization: Draw',
 'Layer Normalization: The Attentive Reader',
 'Layer Normalization: Skip Thoughts',
 'Order embeddings with layer norm',
 'ULM-FiT',
 'ADP-FAIRSEQ + NGRAMRES',
 'Fine-tuned-AWD-LSTM-DOC (fin)',
 'Cross-lingual alignment',
 'Theseus 6/768',
 'UnifiedQA',
 'GPT-Neo-2.7B (finetuned)',
 'GPT-Neo-2.7B (finetuned on PTB)',
 'Unicorn',
 'Multitask Unified Model (MUM)',
 '$\\infty$-former (SM)',
 'FLAN 137B',
 'AlphaFold-Multimer',
 'T0-XXL',
 'GPT-2 (AMPS)',
 'Masked Autoencoders ViT-H',
 'ViT-G/14 (LiT)',
 'Engine-XL(NE)',
 'HSO',
 'Contriever',
 'Vespa',
 'OntoProtein',
 'InstructGPT',
 'BERT-RBP',
 'Flamingo',
 'Jurassic-X',
 'SimCSE',
 'CogVideo',
 'Minerva (540B)',
 'Delphi',
 'Transformer-XL + RMT',
 'GPT-NeoX-Japanese',
 'BlenderBot 3',
 'PaLM-SayCan',
 'Sparrow',
 'NMST+GPT-2',
 'Decaying Fast Weights Transformer (WT-103)',
 "Instruct-GPT + Mind's Eye",
 'GPT-2 + Progressive L

In [42]:
df_filtered = (df[['System', 'Training compute (FLOP)', 'Publication date', 'Organization', 'Notability criteria', 'Domain', 'Base model', 'Model open/closed']]
    .rename(columns={'Training compute (FLOP)': 'flop', 'Publication date': 'date', 'Model open/closed': 'category'})
    .assign(date=lambda x: pd.to_datetime(x['date']), log_flop=lambda x: np.log10(x['flop']))
    .sort_values('date'))

# Add speculative compute estimates based on benchmark imputation and rough guesses
if include_speculative_compute:
    speculative_compute_estimates = {
        "Claude 3.5 Sonnet": 4.72e25,
        "Claude 3 Opus": 1.59e25,
        "Claude 3 Sonnet": 5.51e24,
        "GPT-4o": 3.98e25,
        "Gemini 1.0 Pro": 1.85e24,
        "Gemini 1.5 Pro": 1.60e25,
        "Mistral Large 2": 2.01e25,
        "GPT-4 Turbo": 2.1e25,  # rough guess matching GPT-4
        "GPT-4V": 2.1e25,  # rough guess matching GPT-4
        "Claude 2": 4.33e24,
        "Claude 2.1": 4.33e24,  # rough guess matching Claude 2
    }
    for model, compute in speculative_compute_estimates.items():
        df_filtered.loc[df_filtered["System"] == model, "flop"] = compute
        df_filtered.loc[df_filtered["System"] == model, "log_flop"] = np.log10(compute)

df_filtered.dropna(subset=['flop'], inplace=True)

# Drop Alpha Go Master / Zero
if filter_alphago_outliers:
    mask = (df_filtered["System"] == 'AlphaGo Master') | (df_filtered["System"] == 'AlphaGo Zero')
    df_filtered = df_filtered[~mask]

# Drop finetuned models
if filter_finetuned_models:
    mask = df_filtered['Base model'].isna()
    df_filtered = df_filtered[mask]

top_models_df = find_top_models_up_to_release(df_filtered, top_n)  # For reference

if frontier_selection == 'external':
    # Filter top models before other filters
    df_filtered = filter_top_models_in_both_categories(df_filtered, top_n, top_n_cutoff_date)

if model_selection == 'Language models':
    re = 'Language|Multimodal'
    mask = df_filtered['Domain'].str.contains(re, na=False)
    df_filtered = df_filtered[mask]

if frontier_selection == 'internal':
    # Filter top models after other filters
    df_filtered = filter_top_models_in_both_categories(df_filtered, top_n, top_n_cutoff_date)

# Filter for models after the cutoff date
df_filtered = df_filtered[df_filtered['date'] > cutoff_date]

print(f"{len(df_filtered)}{' top' if frontier_selection != 'disabled' else ''} {model_selection} models found")
print(f"They span {df_filtered['date'].min().strftime('%B %Y')} to {df_filtered['date'].max().strftime('%B %Y')}")

20 top All models models found
They span October 2019 to July 2024


In [43]:
if top_n == 1:
    # Remove BIDAF outlier
    df_filtered = df_filtered[df_filtered['System'] != 'BIDAF']

In [44]:
open_df = df_filtered[df_filtered['category'] == 'Open']
closed_df = df_filtered[df_filtered['category'] == 'Closed']
recent_top_models_df = top_models_df[top_models_df['date'] > pd.to_datetime('2010-01-01')]

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=open_df['date'],
    y=open_df['log_flop'],
    mode='markers',
    marker=dict(color=colors['open'], opacity=0.5),
    text=open_df['System'],
    hoverinfo='text',
    name=f'Top-{top_n} Open'
))

fig.add_trace(go.Scatter(
    x=closed_df['date'],
    y=closed_df['log_flop'],
    mode='markers',
    marker=dict(color=colors['closed'], opacity=0.5),
    text=closed_df['System'],
    hoverinfo='text',
    name=f'Top-{top_n} Closed'
))

fig.add_trace(go.Scatter(
    x=recent_top_models_df['date'],
    y=recent_top_models_df['log_flop'],
    mode='markers',
    marker=dict(color='grey', opacity=0.5),
    text=recent_top_models_df['System'],
    hoverinfo='text',
    name=f'Top-{top_n} Overall'
))

fig.update_layout(
    width=800,
    height=400,
    xaxis_title='Date',
    yaxis_title='Log FLOP',
    title=f'Top-{top_n} models with kickstarting',
    margin=dict(t=50, l=60, r=60, b=50),
)

# save_plot(fig, results_dir, f'top_{top_n}_models_without_kickstarting')

fig.show()

In [45]:
top_models_since_cutoff = top_models_df[top_models_df['date'] >= pd.to_datetime(cutoff_date)]
top_models_set = set(top_models_since_cutoff['System'])
open_top_models_set = set(open_df['System'])
closed_top_models_set = set(closed_df['System'])

frac_open_top_models = len(open_top_models_set.intersection(top_models_set)) / len(top_models_set)
frac_closed_top_models = len(closed_top_models_set.intersection(top_models_set)) / len(top_models_set)
print(f"Fraction of top-{top_n} models that are open: {frac_open_top_models*100:.1f}%")
print(f"Fraction of top-{top_n} models that are closed: {frac_closed_top_models*100:.1f}%")

Fraction of top-1 models that are open: 0.0%
Fraction of top-1 models that are closed: 100.0%


# Frontier lag analysis

In [46]:
# closed_models_highlight = ['AlphaStar', 'Meena', 'GPT-3 175B (davinci)', 'Megatron-Turing NLG 530B', 'PaLM (540B)', 'GPT-4', 'Gemini 1.0 Ultra']
closed_text_idxs = [idx for idx in closed_df.index if closed_df.loc[idx, 'System'] not in ['OpenAI Five', 'Meena', 'GPT-3.5 (text-davinci-003)', 'Megatron-Turing NLG 530B', 'Jurassic-1-Jumbo']]
open_text_idxs = [idx for idx in open_df.index if open_df.loc[idx, 'System'] not in ['mT5-XXL', 'Switch', 'LLaMA-65B', 'Llama 2-70B', 'Nemotron-4 340B', 'Llama 3-70B']]

fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=closed_df['date'],
        y=closed_df['flop'],
        mode='lines+markers',
        line=dict(color=colors['closed'], shape='hv'),
        text=closed_df['System'],
        name='Closed-weight frontier',
        showlegend=True
    ),
)
fig.add_trace(
    go.Scatter(
        x=closed_df.loc[closed_text_idxs, 'date'],
        y=closed_df.loc[closed_text_idxs, 'flop'],
        mode='text',
        text=closed_df.loc[closed_text_idxs, 'System'],
        textposition='top left',
        name='Closed',
        showlegend=False
    ),
)
fig.add_trace(
    go.Scatter(
        x=open_df['date'],
        y=open_df['flop'],
        mode='lines+markers',
        line=dict(color=colors['open'], shape='hv'),
        text=open_df['System'],
        name='Open-weight frontier',
        showlegend=True
    ),
)
fig.add_trace(
    go.Scatter(
        x=open_df.loc[open_text_idxs, 'date'],
        y=open_df.loc[open_text_idxs, 'flop'],
        mode='text',
        text=open_df.loc[open_text_idxs, 'System'],
        textposition='top left',
        name='Open',
        showlegend=False
    ),
)
fig.update_xaxes(range=[pd.Timestamp('2019-01-01'), pd.Timestamp('2025-01-01')])
fig.update_yaxes(type='log', range=[22.5, 26], dtick=1)
fig.update_layout(
    width=800,
    height=500,
    title='Models at the frontier of training compute',
    xaxis_title='Date',
    yaxis_title='Training compute (FLOP)',
    # margin=dict(t=50, l=60, r=60, b=100),
    template='plotly_white',
    legend=dict(
        orientation='v',
        yanchor='bottom',
        y=0.05,
        xanchor='right',
        x=1.0,
        bordercolor="rgb(230, 230, 230)",
        borderwidth=1
    ),
)

save_plot(fig, results_dir, f'frontiers_annotated')

fig.show()


In [47]:
if top_n == 1:
    # Measure the lag in compute between top-1 models in open and closed categories
    lags = []
    already_matched = set()
    for i, closed_row in closed_df.iterrows():
        for j, open_row in open_df.iterrows():
            if open_row['log_flop'] >= closed_row['log_flop'] and open_row['date'] not in already_matched:
                lag_months = (open_row['date'] - closed_row['date']).days/365*12
                print(f"{open_row['System']} exceeded {closed_row['System']} after {lag_months:.1f} months")
                # already_matched.add(open_row['date'])
                lags.append(lag_months)
                break
    lags = np.array(lags)
    print(f"Top-1 models: {lags.mean():.1f} months, [{np.percentile(lags, 5):.1f}, {np.percentile(lags, 95):.1f}]")

mT5-XXL exceeded AlphaStar after 11.7 months
mT5-XXL exceeded OpenAI Five after 10.3 months
OPT-175B exceeded Meena after 27.1 months
OPT-175B exceeded GPT-3 175B (davinci) after 23.1 months
OPT-175B exceeded Jurassic-1-Jumbo after 8.7 months
Falcon-180B exceeded Megatron-Turing NLG 530B after 22.8 months
Falcon-180B exceeded PaLM (540B) after 17.1 months
Falcon-180B exceeded GPT-3.5 (text-davinci-003) after 9.3 months
Llama 3.1-405B exceeded GPT-4 after 16.3 months
Top-1 models: 16.3 months, [8.9, 25.5]


In [48]:
def create_staircase_array(df, step_size, min_log_flop, max_log_flop):
    # Sort the dataframe by log_flop
    df = df.sort_values('log_flop')
    
    # Create a date range with the specified step size
    compute_range = np.arange(min_log_flop, max_log_flop, step_size)
    
    # Initialize the staircase array
    staircase = np.zeros(len(compute_range), dtype=pd.Timestamp)
    
    # Fill the staircase array
    for i, compute in enumerate(compute_range):
        # Find the first date that's greater than the current value
        mask = df['log_flop'] >= compute
        if mask.any():
            staircase[i] = df.loc[mask, 'date'].iloc[0]
        else:
            staircase[i] = np.nan  # or some other placeholder value
    
    return compute_range, staircase

# Usage example
step_size = 0.001  # log10-FLOP
compute_min = np.maximum(closed_df['log_flop'].min(), open_df['log_flop'].min())
compute_max = np.minimum(closed_df['log_flop'].max(), open_df['log_flop'].max())
compute_staircase_closed, date_staircase_closed = create_staircase_array(closed_df, step_size, compute_min, compute_max)
compute_staircase_open, date_staircase_open = create_staircase_array(open_df, step_size, compute_min, compute_max)

In [49]:
# Now you can use these truncated arrays to create your plot
fig = go.Figure()
fig.add_trace(go.Scatter(x=compute_staircase_closed, y=date_staircase_closed, mode='lines', name='Closed', line=dict(color=colors['closed'])))
fig.add_trace(go.Scatter(x=compute_staircase_open, y=date_staircase_open, mode='lines', name='Open', line=dict(color=colors['open'])))

# Update layout as before
fig.update_layout(
    width=450,
    height=750,
    title='Frontiers',
    xaxis_title='Log FLOP',
    yaxis_title='Timestamp',
    margin=dict(t=50, l=60, r=60, b=50),
    template='plotly_white'
)

fig.show()

In [50]:
date_differences = date_staircase_open - date_staircase_closed

In [51]:
date_differences

array([Timedelta('266 days 00:00:00'), Timedelta('349 days 00:00:00'),
       Timedelta('743 days 00:00:00'), ...,
       Timedelta('230 days 00:00:00'), Timedelta('230 days 00:00:00'),
       Timedelta('230 days 00:00:00')], dtype=object)

In [52]:
# Convert date_differences from Timedelta to months
date_differences = np.array([td.days for td in date_differences]) / 365 * 12

In [53]:
compute_staircase_closed

array([22.91381385, 22.91481385, 22.91581385, ..., 25.57681385,
       25.57781385, 25.57881385])

In [54]:
# Plot date differences
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=compute_staircase_closed,
    y=date_differences,
    mode='lines',
    line=dict(shape='hv'),
    name='Date differences'
))

fig.update_xaxes(range=[compute_min-0.5, compute_max+0.5])
# x axis step size
fig.update_xaxes(dtick=1)
fig.update_yaxes(range=[0, 36])

fig.update_layout(
    width=800,
    height=400,
    title='Lag of Open model compute frontier',
    xaxis_title='Log FLOP',
    yaxis_title='Lag (months)',
    margin=dict(t=50, l=60, r=60, b=50),
    template='plotly_white'
)

In [55]:
date_differences.mean()

16.30871142443145

In [56]:
np.trapz(date_differences, compute_staircase_closed) / (compute_max - compute_min)

16.30583817650746

In [57]:
unique_lags = []
unique_lag_compute = []
previous_lag = None
for i, lag in enumerate(date_differences):
    if previous_lag is None or lag != previous_lag:
        unique_lags.append(lag)
        unique_lag_compute.append(compute_staircase_closed[i])
        previous_lag = lag
len(unique_lags)

17

In [58]:
unique_lags, unique_lag_compute

([8.745205479452055,
  11.473972602739725,
  24.42739726027397,
  27.12328767123288,
  23.145205479452056,
  8.67945205479452,
  6.673972602739727,
  16.471232876712328,
  21.205479452054796,
  22.84931506849315,
  17.0958904109589,
  9.271232876712329,
  5.7534246575342465,
  13.150684931506849,
  15.024657534246575,
  16.306849315068494,
  7.561643835616438],
 [22.913813852383715,
  22.914813852383716,
  22.915813852383717,
  22.969813852383783,
  23.04981385238388,
  23.49781385238443,
  23.568813852384515,
  23.633813852384595,
  23.740813852384726,
  23.90881385238493,
  24.068813852385126,
  24.402813852385535,
  24.411813852385546,
  24.575813852385746,
  24.79981385238602,
  25.255813852386577,
  25.32281385238666])

In [59]:
import numpy as np
import scipy.stats as stats
from sklearn.linear_model import LinearRegression

# Assuming you have arrays log_flop and lag
log_flop = np.array(unique_lag_compute)
lag = np.array(unique_lags)

# 1. Linear Regression
X = log_flop.reshape(-1, 1)
y = lag
reg = LinearRegression().fit(X, y)
slope = reg.coef_[0]
p_value = stats.t.sf(abs(slope / np.sqrt(np.sum((y - reg.predict(X))**2) / (len(X)-2) / np.sum((X-X.mean())**2))), len(X)-2)*2

print(f"Linear Regression: Slope = {slope:.4f}, p-value = {p_value:.4f}")

# 2. Kendall's Tau
tau, p_value = stats.kendalltau(log_flop, lag)
print(f"Kendall's Tau: Tau = {tau:.4f}, p-value = {p_value:.4f}")

# 3. Spearman's Rank Correlation
rho, p_value = stats.spearmanr(log_flop, lag)
print(f"Spearman's Rank Correlation: Rho = {rho:.4f}, p-value = {p_value:.4f}")

Linear Regression: Slope = -2.9573, p-value = 0.1674
Kendall's Tau: Tau = -0.2059, p-value = 0.2706
Spearman's Rank Correlation: Rho = -0.2672, p-value = 0.2999


# Regression analysis

## Model selection

In [60]:
@dataclass
class FitResult:
    p: int = None
    bic: float = None
    rss: float = None
    mse: float = None
    predict: Callable = None


@dataclass
class KinkedFitResult(FitResult):
    break_points: tuple[float] = None
    break_points_dt: float = None
    oom_year_slopes: tuple[float] = None
    intercepts: tuple[float] = None

    # Model properties for each breakpoint combination
    # (for debugging)
    bics: tuple[float] = None
    rsss: tuple[float] = None
    mses: tuple[float] = None
    break_points_list: tuple[tuple[float]] = None
    break_points_dt_list: tuple[tuple[float]] = None


def get_predictors(
    x,
    intercept_change_points,
    slope_change_points,
    pred_category=None,category=None,
    same_intercepts=None,
    same_slopes=None
):
    if pred_category == 'Open':
        is_open = np.ones(len(x))
    elif pred_category == 'Closed':
        is_open = np.zeros(len(x))
    else:
        assert category is not None
        is_open = (category == 'Open').astype(int).values

    # Ensure the lengths match
    assert len(same_intercepts) == len(intercept_change_points), f"Length of same_intercepts ({len(same_intercepts)}) must match the number of intercept change points ({len(intercept_change_points)})"
    assert len(same_slopes) == len(slope_change_points), f"Length of same_slopes ({len(same_slopes)}) must match the number of slope change points ({len(slope_change_points)})"

    # Calculate the number of columns needed
    n_intercept_cols = sum(1 if same else 2 for same in same_intercepts)
    n_slope_cols = sum(1 if same else 2 for same in same_slopes)
    n_cols = n_intercept_cols + n_slope_cols

    predictors = np.zeros((len(x), n_cols))

    # Intercept predictors
    col_idx = 0
    for i, (intercept_point, same) in enumerate(zip(intercept_change_points, same_intercepts)):
        if same:
            predictors[:, col_idx] = (x >= intercept_point).astype(int)
            col_idx += 1
        else:
            predictors[:, col_idx] = (x >= intercept_point).astype(int) * is_open
            predictors[:, col_idx + 1] = (x >= intercept_point).astype(int) * (1 - is_open)
            col_idx += 2

    # Slope predictors
    for i, (break_point, same) in enumerate(zip(slope_change_points, same_slopes)):
        if same:
            predictors[:, col_idx] = np.maximum(x - break_point, 0)
            col_idx += 1
        else:
            predictors[:, col_idx] = np.maximum(x - break_point, 0) * is_open
            predictors[:, col_idx + 1] = np.maximum(x - break_point, 0) * (1 - is_open)
            col_idx += 2

    return predictors


def fit_n_phase_exponential(
    df,
    kink_count,
    allow_discontinuities=False,
    same_intercepts=None,
    same_slopes=None,
    min_n_segment=10
):
    # Generate monthly breakpoints between 2010 and 2024
    one_month = pd.DateOffset(months=1)
    break_point_grid = pd.date_range(start=df['date'].min() - one_month, end=df['date'].max() - 4*one_month, freq='MS')
    break_point_grid = [x.toordinal() for x in break_point_grid]

    x = pd.to_datetime(df['date']).apply(lambda date: date.toordinal()).values
    y = df['log_flop'].values

    break_points_list = []
    bics = []
    rsss = []
    mses = []
    models = []

    for break_points in combinations_with_replacement(break_point_grid, kink_count):
        intercept_change_points = (0,)
        if allow_discontinuities:
            intercept_change_points += break_points
        slope_change_points = (0,) + break_points

        # If same_intercepts or same_slopes are not provided, default to all False
        if same_intercepts is None:
            same_intercepts = [False] * len(intercept_change_points)
        if same_slopes is None:
            same_slopes = [False] * len(slope_change_points)

        predictors = get_predictors(
            x,
            intercept_change_points,
            slope_change_points,
            category=df['category'],
            same_slopes=same_slopes,
            same_intercepts=same_intercepts
        )

        # Fit the model
        model = sm.OLS(y, predictors).fit()

        # Calculate BIC manually based on log-likelihood
        n = len(x) # Number of observations
        p = len(model.params) + 2*kink_count + 1 # Number of parameters

        # Calculate log-likelihood under the assumption of normally distributed errors
        # We have to iterate over all points to get their individual log-likelihoods
        log_likelihood = 0
        rss = 0
        invalid_model = False # Discard models with segments with less than 2 points
        for i, break_point in enumerate(slope_change_points):
            left_x = break_point
            right_x = slope_change_points[i + 1] if i + 1 < len(slope_change_points) else np.inf

            segment_predictors = predictors[(left_x <= x) & (x < right_x), :]
            segment_y = y[(left_x <= x) & (x < right_x)]
            segment_n = len(segment_y)

            assert min_n_segment > 2

            if segment_n < min_n_segment:
                invalid_model = True
                break

            y_pred = model.predict(segment_predictors)

            segment_rss = np.sum((y_pred - segment_y)**2)
            assert segment_rss > 0
            segment_mse = segment_rss / segment_n

            segment_log_likelihood = -segment_n/2 * (np.log(2*np.pi) + np.log(segment_rss/segment_n) + 1)
            log_likelihood += segment_log_likelihood
            rss += segment_rss

        if invalid_model:
            continue

        # Compute BIC using the manual method based on the log-likelihood
        bic = p * np.log(n) - 2 * log_likelihood
        # bic = n*np.log(rss/n) + p*np.log(n)

        bics.append(bic)
        rsss.append(rss)
        mses.append(rss/len(df))
        models.append(model)
        break_points_list.append(break_points)

    # Prepare the result object
    best_bic = min(bics)
    best_idx = bics.index(best_bic)
    best_rss = rsss[best_idx]
    best_mse = mses[best_idx]
    best_model = models[best_idx]
    best_break_points = break_points_list[best_idx]

    p = len(best_model.params) + 2*kink_count + 1 # Number of parameters

    # Store the model parameters
    intercept_change_points = (0,)
    if allow_discontinuities:
        intercept_change_points += best_break_points
    slope_change_points = (0,) + best_break_points

    n_intercepts = sum(1 if same else 2 for same in same_intercepts)
    intercepts = best_model.params[:n_intercepts]
    oom_intercepts = np.zeros((2, len(intercept_change_points)))
    for i in range(len(intercept_change_points)):
        if same_intercepts[i]:
            oom_intercepts[0, i] = oom_intercepts[1, i] = intercepts[i]
        else:
            oom_intercepts[0, i] = intercepts[2*i - sum(same_intercepts[:i])]
            oom_intercepts[1, i] = intercepts[2*i + 1 - sum(same_intercepts[:i])]

    # Apply cumulative sum to get the actual slopes
    oom_intercepts = {'open': np.cumsum(oom_intercepts[0]), 'closed': np.cumsum(oom_intercepts[1])}

    n_slopes = len(slope_change_points)
    slopes = best_model.params[n_intercepts:]
    oom_year_slopes = np.zeros((2, n_slopes))  # 2 rows for Open and Closed
    for i in range(n_slopes):
        if same_slopes[i]:
            oom_year_slopes[0, i] = oom_year_slopes[1, i] = 365 * slopes[i]
        else:
            oom_year_slopes[0, i] = 365 * slopes[2*i - sum(same_slopes[:i])]
            oom_year_slopes[1, i] = 365 * slopes[2*i + 1 - sum(same_slopes[:i])]

    # Apply cumulative sum to get the actual slopes
    oom_year_slopes = {'open': np.cumsum(oom_year_slopes[0]), 'closed': np.cumsum(oom_year_slopes[1])}

    def predict(date, category):
        if not isinstance(date, pd.Series):
            date = pd.Series(date)
        x = pd.to_datetime(date).apply(lambda date: date.toordinal()).values

        predictors = get_predictors(
            x,
            intercept_change_points,
            slope_change_points,
            category=category,
            same_slopes=same_slopes,
            same_intercepts=same_intercepts
        )

        return best_model.predict(predictors)

    fit_result = KinkedFitResult(
        p=p,
        bic=best_bic,
        rss=best_rss,
        mse=best_mse,
        break_points=best_break_points,
        predict=predict,
        break_points_dt=[pd.Timestamp.fromordinal(bp) for bp in best_break_points],
        bics=bics,
        rsss=rsss,
        mses=mses,
        oom_year_slopes=oom_year_slopes,
        intercepts=oom_intercepts,
        break_points_list=break_points_list,
        break_points_dt_list=[[pd.Timestamp.fromordinal(bp) for bp in break_points] for break_points in break_points_list],
    )

    return fit_result


fit_em_all = lambda df_fit : {
    "Simple" : fit_n_phase_exponential(df_fit, kink_count=0),
    "Simple with same slope": fit_n_phase_exponential(df_fit, kink_count=0, same_slopes=(True,)),
    "Simple with same slope and intercept": fit_n_phase_exponential(df_fit, kink_count=0, same_slopes=(True,), same_intercepts=(True,)),
    # "Discrete acceleration" : fit_n_phase_exponential(df_fit, kink_count=1),
    # "Discontinuity" : fit_n_phase_exponential(df_fit, kink_count=1, allow_discontinuities=True),
    # "Same pre-break different post-break" : fit_n_phase_exponential(
    #     df_fit, kink_count=1, allow_discontinuities=True, same_intercepts=(True, False), same_slopes=(True, False)
    # ),
    # "Same pre-break different intercept post-break" : fit_n_phase_exponential(
    #     df_fit, kink_count=1, allow_discontinuities=True, same_intercepts=(True, False), same_slopes=(True, True)
    # ),
    # "Same pre-break and post-break" : fit_n_phase_exponential(
    #     df_fit, kink_count=1, allow_discontinuities=True, same_intercepts=(True, True), same_slopes=(True, True)
    # ),
}


# K-Fold Cross Validation
def perform_cross_validation(df, k=10, random_state=42):
    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)
    folds_mses = defaultdict(lambda : [])
    for train_index, test_index in kf.split(df):
        train_df, test_df = df.iloc[train_index], df.iloc[test_index]

        # Fit the models on the training set
        fold_models = fit_em_all(train_df)

        # Predict on the test set
        for name,model in fold_models.items():
            try:
                predicted_log_flop = model.predict(test_df["date"], test_df["category"])
            except AttributeError:
                continue
            test_rss = np.sum((predicted_log_flop - test_df["log_flop"])**2)
            test_mse = test_rss / len(test_df)
            folds_mses[name].append(test_mse)

    # Compute mean MSE
    folds_mses = {name: np.mean(folds_mses[name]) for name in folds_mses}

    return folds_mses


def calculate_lag(df, fit_result):
    # Get the final predictions for 'open' and 'closed' categories
    final_date = df['date'].max()
    y_open = fit_result.predict(pd.Series([final_date]), pd.Series(['Open']))[0]
    y_closed = fit_result.predict(pd.Series([final_date]), pd.Series(['Closed']))[0]
    
    # Get the final slope for the 'closed' category
    slope_closed = fit_result.oom_year_slopes['closed'][-1]
    
    # Calculate lag
    lag = (y_closed - y_open) / slope_closed
    
    return lag

In [61]:
# Best model fits
models = fit_em_all(df_filtered)

# K-fold cross validation
folds_mses = perform_cross_validation(df_filtered)

# Bootstrap
bootstrap_sample_size = 1000

rng = np.random.default_rng(43)
bootstrap_bics = defaultdict(lambda : [])
bootstrap_mses = defaultdict(lambda : [])
bootstrap_bic_score_diff = defaultdict(lambda : [])
bootstrap_slopes = defaultdict(lambda : defaultdict(lambda : []))
bootstrap_intercepts = defaultdict(lambda : defaultdict(lambda : []))
bootstrap_breaks = defaultdict(lambda : [])
bootstrap_lag_months = defaultdict(lambda : [])
for bootstrap_index in tqdm(range(bootstrap_sample_size)):
    sample = df_filtered.sample(len(df_filtered), replace=True, random_state=rng)
    sample = sample.sort_values('date')

    # Compute BICs
    boot_models = fit_em_all(sample)

    # Compute K fold validation
    boot_folds_mses = perform_cross_validation(sample)

    # Store results
    for name, model in boot_models.items():
        # It might be None if the hyperbolic fails to fit
        if model is None: continue

        bootstrap_bics[name].append(model.bic)
        bootstrap_mses[name].append(boot_folds_mses[name])
        bootstrap_bic_score_diff[name].append(model.bic - boot_models["Simple"].bic)

        if isinstance(model, KinkedFitResult):
            if (len(model.oom_year_slopes['open']) > 0):
                bootstrap_slopes[name]['open'].append(10**model.oom_year_slopes['open'][-1])
            if (len(model.oom_year_slopes['closed']) > 0):
                bootstrap_slopes[name]['closed'].append(10**model.oom_year_slopes['closed'][-1])
            if (len(model.break_points_dt) > 0):
                bootstrap_breaks[name].append(model.break_points_dt[-1])

            # Calculate the lag between predictions for the open and closed categories
            lag_months = calculate_lag(sample, model) * 12  # Convert years to months
            bootstrap_lag_months[name].append(lag_months)


ci_width = 0.90
qs = [(1 - ci_width)/2, (1 + ci_width)/2]
bootstrap_preferred_percent = {}
bootstrap_slopes_ci = defaultdict(lambda : defaultdict(lambda : []))
for name in models:
    bootstrap_preferred_percent[name] = np.mean(np.array(bootstrap_bic_score_diff[name])<0)
    bootstrap_bics[name] = np.quantile(np.array(bootstrap_bics[name]), qs)
    bootstrap_mses[name] = np.quantile(np.array(bootstrap_mses[name]), qs)
    bootstrap_bic_score_diff[name] = np.quantile(np.array(bootstrap_bic_score_diff[name]), qs)
    bootstrap_slopes_ci[name]['open'] = np.quantile(np.array(bootstrap_slopes[name]['open']), qs)
    bootstrap_slopes_ci[name]['closed'] = np.quantile(np.array(bootstrap_slopes[name]['closed']), qs)
    bootstrap_lag_months[name] = np.quantile(np.array(bootstrap_lag_months[name]), qs)
    if len(bootstrap_breaks[name]) > 0:
        bootstrap_breaks[name] = np.quantile(np.array(bootstrap_breaks[name]), qs)

# Models with lower BIC score / MSE are preferred.

results = []
for name, model in models.items():
    param_count = model.p
    log_likelihood = (np.log(len(df_filtered))*param_count - model.bic)/2

    param_count_simple = models['Simple'].p
    log_likelihood_simple = (np.log(len(df_filtered))*param_count_simple - models['Simple'].bic)/2

    c2 = chi2.sf(2*(log_likelihood - log_likelihood_simple), df=(param_count - param_count_simple))

    result = {
        "Model": name,
        "BIC" : np.round(model.bic, 2),
        "BIC 90% CI" : np.round(bootstrap_bics[name], 2),
        "BIC score diff": np.round(model.bic - models["Simple"].bic, 2),
        "BIC score diff 90% CI": np.round(bootstrap_bic_score_diff[name], 2),
        "Xi²": c2,
        "% times preferred over simple": f"{bootstrap_preferred_percent[name]:.0%}",
        "K-fold mean MSE" : np.round(folds_mses[name], 2),
        "K-fold mean MSE 90% CI" : np.round(bootstrap_mses[name], 2),
    }

    result["Recent slope for closed models (Nx/year)"] = np.round(10**model.oom_year_slopes['closed'][-1], 2)
    result["Recent slope for closed models 90% CI"] = np.round(bootstrap_slopes_ci[name]['closed'], 2)
    result["Recent slope for open models (Nx/year)"] = np.round(10**model.oom_year_slopes['open'][-1], 2)
    result["Recent slope for open models 90% CI"] = np.round(bootstrap_slopes_ci[name]['open'], 2)
    result["Lag (months)"] = np.round(calculate_lag(df_filtered, model) * 12, 2)  # Convert years to months
    result["Lag 90% CI (months)"] = np.round(bootstrap_lag_months[name], 2)
    if len(model.break_points_dt) > 0:
        result["Break point"] = model.break_points_dt[-1].strftime('%Y-%m')
        result["Break point 90% CI"] = [date.strftime('%Y-%m') for date in bootstrap_breaks[name]]
    results.append(result)

results_df = pd.DataFrame(results)

print("Results")
results_df

100%|██████████| 1000/1000 [00:40<00:00, 24.44it/s]

Results





Unnamed: 0,Model,BIC,BIC 90% CI,BIC score diff,BIC score diff 90% CI,Xi²,% times preferred over simple,K-fold mean MSE,K-fold mean MSE 90% CI,Recent slope for closed models (Nx/year),Recent slope for closed models 90% CI,Recent slope for open models (Nx/year),Recent slope for open models 90% CI,Lag (months),Lag 90% CI (months)
0,Simple,22.31,"[3.77, 25.64]",0.0,"[0.0, 0.0]",,0%,0.12,"[0.05, 0.22]",4.56,"[3.34, 5.44]",4.65,"[3.28, 8.79]",15.37,"[6.45, 21.9]"
1,Simple with same slope,19.33,"[4.64, 23.41]",-2.99,"[-2.99, 4.94]",,82%,0.11,"[0.06, 0.14]",4.6,"[3.78, 5.61]",4.6,"[3.78, 5.61]",15.61,"[11.55, 19.82]"
2,Simple with same slope and intercept,36.45,"[23.34, 42.68]",14.14,"[7.76, 30.45]",,0%,0.29,"[0.15, 0.38]",3.41,"[2.7, 4.32]",3.41,"[2.7, 4.32]",0.0,"[0.0, 0.0]"


In [62]:
# Save results_df
regression_fname = f'compute_regression_analysis_{model_selection}_frontier={frontier_selection}_top{top_n}_cutoff={cutoff_date}.csv'
results_df.to_csv(os.path.join(results_dir, regression_fname), index=False)

# Save bootstrap_slopes as JSON
slopes_fname = f'bootstrap_slopes_{model_selection}_frontier={frontier_selection}_top{top_n}_cutoff={cutoff_date}.json'
with open(os.path.join(results_dir, slopes_fname), 'w') as f:
    json.dump(bootstrap_slopes, f, indent=4)

## Significant difference between regression slopes

### All data

In [63]:
df_filtered['date_float'] = datetime_to_float_year(df_filtered['date'])
open_df = df_filtered[df_filtered['category'] == 'Open']
closed_df = df_filtered[df_filtered['category'] == 'Closed']
regression_slope_t_test(open_df, closed_df, ['date_float'], 'log_flop', logy=False, adj_corr=True)

Slope 1: 0.67 (SE: 0.09)
Slope 2: 0.66 (SE: 0.06)
Correlation of residuals: 0.00
Test statistic: 0.07
p-value: 0.94


### Bootstrap distributions

In [64]:
open_slopes = bootstrap_slopes['Simple']['open']
closed_slopes = bootstrap_slopes['Simple']['closed']

In [65]:
# Plot a histogram of the slopes

# Create a DataFrame for the slopes
slopes_df = pd.DataFrame({
    'slope': np.log10(open_slopes + closed_slopes),
    'category': ['Open'] * len(open_slopes) + ['Closed'] * len(closed_slopes)
})

# Plot the histogram using plotly
fig = px.histogram(slopes_df, x='slope', color='category', barmode='overlay', 
                   title='Distribution of Bootstrap Slopes', 
                   labels={'Slope': 'Slope (OOMs/year)', 'count': 'Frequency'},
                   opacity=0.5, color_discrete_map={'Open': colors['open'], 'Closed': colors['closed']})

fig.update_layout(
    width=800,
    height=600,
)

fig.show()


In [66]:
# Shapiro-Wilk test for normality
_, p_value_open = stats.shapiro(np.log10(open_slopes))
print(f"Shapiro-Wilk test p-value for Open Models: {p_value_open}")

_, p_value_closed = stats.shapiro(np.log10(closed_slopes))
print(f"Shapiro-Wilk test p-value for Closed Models: {p_value_closed}")

# Anderson-Darling test for normality
result_open = stats.anderson(np.log10(open_slopes))
print(f"Anderson-Darling test statistic for Open Models: {result_open.statistic}")

result_closed = stats.anderson(np.log10(closed_slopes))
print(f"Anderson-Darling test statistic for Closed Models: {result_closed.statistic}")

Shapiro-Wilk test p-value for Open Models: 1.6350610639784377e-26
Shapiro-Wilk test p-value for Closed Models: 6.800706046263335e-24
Anderson-Darling test statistic for Open Models: 24.891432094172615
Anderson-Darling test statistic for Closed Models: 14.877443835765916


In [67]:
# Use Mann-Whitney U test (if any test above rejects normality, p < 0.05)
statistic, p_value = stats.mannwhitneyu(np.log10(open_slopes), np.log10(closed_slopes))
print(f"Mann-Whitney U test: statistic={statistic}, p-value={p_value}")

# Use t-test otherwise
statistic, p_value = stats.ttest_ind(np.log10(open_slopes), np.log10(closed_slopes), equal_var=False)
print(f"t-test: statistic={statistic}, p-value={p_value}")

Mann-Whitney U test: statistic=574037.0, p-value=9.84316221822043e-09
t-test: statistic=7.921474002437873, p-value=4.535990084309112e-15


In [68]:
np.percentile(np.log10(closed_slopes) - np.log10(open_slopes), [2.5, 97.5])

array([-0.37281489,  0.21280456])

## Plot predictions

In [69]:
# Graph of the different model fits using plotly

model = 'simple'  # ['simple', 'kinked']
colors = {'open': '#1f77b4', 'closed': '#ff7f0e'}  # Using default plotly colors

# Parameters for the simple model
kink_count = 0
allow_discontinuities = False
same_intercepts = (False,)
same_slopes = (False,)

def plot_model(df, model_type, kink_count=1, allow_discontinuities=False):
    if model_type == 'simple':
        fit_result = fit_n_phase_exponential(df, 0, same_intercepts, same_slopes)
    else:
        fit_result = fit_n_phase_exponential(df, kink_count, allow_discontinuities, same_intercepts, same_slopes)

    df_open = df[df['category'] == 'Open']
    df_closed = df[df['category'] == 'Closed']

    fig = go.Figure()

    # Plot the original data points
    fig.add_trace(go.Scatter(
        x=df_open['date'], y=df_open['log_flop'],
        mode='markers', name='Open models', text=df_open['System'],
        marker=dict(color=colors['open'], opacity=0.3, size=10)
    ))
    fig.add_trace(go.Scatter(
        x=df_closed['date'], y=df_closed['log_flop'],
        mode='markers', name='Closed models', text=df_closed['System'],
        marker=dict(color=colors['closed'], opacity=0.3, size=10)
    ))

    # Plot the fit lines
    date_grid = pd.date_range(start=df['date'].min(), end=df['date'].max(), freq='D')
    log_flop_open = fit_result.predict(pd.Series(date_grid), pd.Series(['Open'] * len(date_grid)))
    log_flop_closed = fit_result.predict(pd.Series(date_grid), pd.Series(['Closed'] * len(date_grid)))
    
    fig.add_trace(go.Scatter(
        x=date_grid, y=log_flop_open,
        mode='lines', name='Best Fit Line (Open)',
        line=dict(color=colors['open'])
    ))
    fig.add_trace(go.Scatter(
        x=date_grid, y=log_flop_closed,
        mode='lines', name='Best Fit Line (Closed)',
        line=dict(color=colors['closed'])
    ))

    # Add slope labels
    points = [df['date'].min()] + fit_result.break_points_dt + [df['date'].max()]
    for i in range(len(points) - 1):
        for category in ['open', 'closed']:
            mid = points[i] + (points[i+1] - points[i]) / 2
            y = fit_result.predict(pd.Series([mid]), pd.Series([category]))[0]
            fig.add_annotation(
                x=mid, y=y + 1.2 * (0.4 if category == 'closed' else -1),
                text=f'{10**fit_result.oom_year_slopes[category][i]:0.1f}x/year',
                showarrow=False,
                font=dict(size=12, color=colors[category])
            )

    # Plot horizontal line segment showing the lead time
    lag_months = calculate_lag(df, fit_result) * 12  # Convert years to months
    end_date = df['date'].max()
    start_date = end_date - pd.DateOffset(days=int(lag_months * 30.4375))  # Approximate months to days conversion
    y_value = fit_result.predict(pd.Series([end_date]), pd.Series(['Open']))[0]
    fig.add_shape(
        type="line",
        x0=start_date, y0=y_value, x1=end_date, y1=y_value,
        line=dict(color="black", width=1, dash="dash")
    )
    fig.add_annotation(
        x=(start_date + (end_date - start_date) * 0.5), y=y_value + 0.15,
        text=f'Lag: {lag_months:.0f} months',
        showarrow=False,
    )

    # Annotate some key models with text
    
    key_models = ['GPT-4'] if (exclude_big_llama or exclude_all_llamas) else ['GPT-4', 'Llama 3.1-405B']
    for model_name in key_models:
        model_row = df_filtered[df_filtered['System'] == model_name]
        fig.add_annotation(
            x=model_row['date'].iloc[0], y=model_row['log_flop'].iloc[0],
            text=model_name,
            showarrow=True,
            font=dict(size=12, color='black'),
            xanchor='right', yanchor='bottom'
        )

    # Update layout
    fig.update_layout(
        template='plotly_white',
        width=800,
        height=400,
        title=f'Compute trends for top-{top_n} open and closed models',
        xaxis_title='Model publication date',
        yaxis_title='Training compute (FLOP, log-scale)',
        legend_title='Model Category',
        legend=dict(
            x=0.7,
            y=0.05
        ),
        margin=dict(l=10, r=10, t=40, b=10),
        xaxis=dict(
            tickformat='%Y',
            dtick='M12',
        ),
        yaxis=dict(
            tickmode='array',
            tickvals=list(range(int(df['log_flop'].min()), int(df['log_flop'].max())+2)),
            ticktext=[f'10<sup>{i}</sup>' for i in range(int(df['log_flop'].min()), int(df['log_flop'].max())+2)]
        )
    )
    footnote = None
    if exclude_all_llamas:
        footnote = '*All Llama models excluded'
    elif exclude_big_llama:
        footnote = '*Llama 3.1 405B excluded'

    if footnote:
        fig.add_annotation(
            showarrow=False,
            text=footnote,
            font=dict(size=10), 
            xref="paper",
            x=0,
            yref="paper",
            y=-0.1,
            xanchor="left",
            yanchor="top",
        )

    fname = f'compute_regression_{model_selection}_frontier={frontier_selection}_top{top_n}_cutoff={cutoff_date}_{model_type}_kinks={kink_count}'
    save_plot(fig, results_dir, fname)

    fig.show()

plot_model(df_filtered, model, kink_count)

## Model selection based on backtesting

In [70]:
len(df_filtered)

20

In [71]:
def backtest_model(train_df, test_df, same_intercepts, same_slopes):
    kink_count = 0
    allow_discontinuities = False
    fit_result = fit_n_phase_exponential(
        train_df, kink_count, allow_discontinuities, same_intercepts, same_slopes
    )
    test_df.loc[:, 'predicted_log_flop'] = fit_result.predict(test_df['date'], test_df['category'])
    return test_df


def rmse(predictions, targets):
    return np.sqrt(np.mean((np.array(predictions) - np.array(targets))**2))


def backtest_model_selection(df, error_metric=rmse):
    # Construct a list of dates to split the data into training and testing sets
    # From halfway through the data to the second last data point
    split_dates = df['date'][len(df['date'])//2:]
    same_intercepts = (False,)
    errors = {}
    for same_slopes in [(False,), (True,)]:
        print(f"Same slopes: {same_slopes}")
        targets = []
        predictions = []
        errors_this_model = []
        for split_date in split_dates:
            train_df = df.loc[df['date'] < split_date]
            test_df = df.loc[df['date'] >= split_date]
            backtest_result = backtest_model(train_df, test_df, same_intercepts, same_slopes)
            error = error_metric(backtest_result['predicted_log_flop'], backtest_result['log_flop'])
            print(f"{error_metric.__name__} in log-FLOP from {split_date}: {error}")
            predictions.extend(backtest_result['predicted_log_flop'])
            targets.extend(backtest_result['log_flop'])
            errors_this_model.append(error)
        errors[f"same_slopes={same_slopes}"] = errors_this_model
        mean = np.mean(errors_this_model)
        p5 = np.percentile(errors_this_model, 5)
        p95 = np.percentile(errors_this_model, 95) 
        print(f'{error_metric.__name__} in log-FLOP for same_slopes={same_slopes}: {mean} (90% CI: {p5} to {p95})')
    return split_dates,errors

In [72]:
same_slopes = (False,)
split_date = pd.Timestamp('2024-07-01')
train_df = df_filtered.loc[df_filtered['date'] < split_date]
test_df = df_filtered.loc[df_filtered['date'] >= split_date]
backtest_result = backtest_model(train_df, test_df, same_intercepts, same_slopes)
error = rmse(backtest_result['predicted_log_flop'], backtest_result['log_flop'])
closed_error = rmse(backtest_result[backtest_result['category'] == 'Closed']['predicted_log_flop'], backtest_result[backtest_result['category'] == 'Closed']['log_flop'])
open_error = rmse(backtest_result[backtest_result['category'] == 'Open']['predicted_log_flop'], backtest_result[backtest_result['category'] == 'Open']['log_flop'])
print(f"RMSE in log-FLOP from {split_date}: {error}")
print(f"RMSE in log-FLOP from {split_date} for closed models: {closed_error}")
print(f"RMSE in log-FLOP from {split_date} for open models: {open_error}")

fig = go.Figure()

# Plot raw data, separating by category
df_open = df_filtered[df_filtered['category'] == 'Open']
df_closed = df_filtered[df_filtered['category'] == 'Closed']
# Plot the original data points
fig.add_trace(go.Scatter(
    x=df_open['date'], y=df_open['log_flop'],
    mode='markers', name='Open models',
    marker=dict(color=colors['open'], opacity=0.3, size=10)
))
fig.add_trace(go.Scatter(
    x=df_closed['date'], y=df_closed['log_flop'],
    mode='markers', name='Closed models',
    marker=dict(color=colors['closed'], opacity=0.3, size=10)
))

# Plot predictions as lines for each category
test_df_open = backtest_result[backtest_result['category'] == 'Open']
test_df_closed = backtest_result[backtest_result['category'] == 'Closed']

fig.add_trace(go.Scatter(
    x=test_df_open['date'], y=test_df_open['predicted_log_flop'],
    mode='lines', name='Predicted open models',
    line=dict(color=colors['open'])
))
fig.add_trace(go.Scatter(
    x=test_df_closed['date'], y=test_df_closed['predicted_log_flop'],
    mode='lines', name='Predicted closed models',
    line=dict(color=colors['closed'])
))

# Vertical line at the split date
fig.add_shape(
    type="line",
    x0=split_date, y0=22, x1=split_date, y1=26,
    line=dict(color="black", width=1, dash="dash"),
    name='Training cutoff',
    legendgroup='Training Cutoff',  # Grouping for legend
    showlegend=True  # Ensure it appears in the legend
)

fig.update_layout(
    template='plotly_white',
    width=800,
    height=400,
    title=f'Backtest of regression with same_slopes={same_slopes}: RMSE={error:.2f}',
    xaxis_title='Model publication date',
    yaxis_title='Training compute (FLOP, log-scale)',
    legend_title='Model Category',
)

save_plot(fig, results_dir, f'backtest_example_{model_selection}_frontier={frontier_selection}_top{top_n}_cutoff={cutoff_date}_same_slopes={same_slopes}_split={split_date}')

fig.show()

RMSE in log-FLOP from 2024-07-01 00:00:00: 0.6708808081411668
RMSE in log-FLOP from 2024-07-01 00:00:00 for closed models: nan
RMSE in log-FLOP from 2024-07-01 00:00:00 for open models: 0.6708808081411668




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Mean of empty slice.


invalid value encountered in scalar divide



In [73]:
split_dates,errors = backtest_model_selection(df_filtered)

Same slopes: (False,)
rmse in log-FLOP from 2022-05-02 00:00:00: 1.3636662111073252
rmse in log-FLOP from 2022-11-28 00:00:00: 0.7725445367998696
rmse in log-FLOP from 2023-02-24 00:00:00: 0.8381066906405717
rmse in log-FLOP from 2023-03-15 00:00:00: 0.8182624373848103
rmse in log-FLOP from 2023-07-18 00:00:00: 0.8207854660093162
rmse in log-FLOP from 2023-09-06 00:00:00: 0.8627907918565291
rmse in log-FLOP from 2023-12-06 00:00:00: 0.6570131279723145
rmse in log-FLOP from 2024-04-18 00:00:00: 0.7364113852588473
rmse in log-FLOP from 2024-06-14 00:00:00: 0.7312568614157056
rmse in log-FLOP from 2024-07-23 00:00:00: 0.6708808081411668
rmse in log-FLOP for same_slopes=(False,): 0.8271718316586456 (90% CI: 0.663253584048298 to 1.1382722724444665)
Same slopes: (True,)
rmse in log-FLOP from 2022-05-02 00:00:00: 0.5286828306613991
rmse in log-FLOP from 2022-11-28 00:00:00: 0.5017896740354243
rmse in log-FLOP from 2023-02-24 00:00:00: 0.5759765082108136
rmse in log-FLOP from 2023-03-15 00:00:



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [74]:
errors_diff = np.array(errors['same_slopes=(True,)']) - np.array(errors['same_slopes=(False,)'])
np.percentile(errors_diff, [5, 95])

# Plot the errors diff
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=split_dates, y=errors_diff,
    mode='lines', name='Error difference',
    line=dict(color='blue')
))
fig.update_layout(
    template='plotly_white',
    width=800,
    height=400,
    title='Test error of the same slope model minus the different slope model' + ('<br>with hypothetical Llama 4' if include_llama_4 else ''),
    xaxis_title='Train-test split date',
    yaxis_title='Difference in RMSE',
)

save_plot(fig, results_dir, f'compute_regression_backtest_error_diff_{model_selection}_frontier={frontier_selection}_top{top_n}_cutoff={cutoff_date}')

fig.show()


# Open and closed model compute by Organization

In [75]:
# Scatter plot of open and closed models using plotly
open_df = df_filtered[df_filtered['category'] == 'Open']
closed_df = df_filtered[df_filtered['category'] == 'Closed']

marker_to_org = {
    'bowtie': 'Meta',
    'cross': 'Google',
    'hexagon-open': 'OpenAI',
    'star': 'Anthropic',
    'square': 'Microsoft',
    'circle': 'Other',
}
closed_added_to_legend = defaultdict(bool)
open_added_to_legend = defaultdict(bool)

org_model_counts = defaultdict(lambda: defaultdict(int))

fig = go.Figure()
for org in df_filtered['Organization'].unique():
    open_df_org = open_df[open_df['Organization'] == org]
    closed_df_org = closed_df[closed_df['Organization'] == org]
    if any([kw in org.lower() for kw in ['meta', 'facebook']]):
        marker = 'bowtie'
        org_model_counts['Open']['Meta'] += len(open_df_org)
        org_model_counts['Closed']['Meta'] += len(closed_df_org)
    elif any([kw in org.lower() for kw in ['google', 'deepmind']]):
        marker = 'cross'
        org_model_counts['Open']['Google/DeepMind'] += len(open_df_org)
        org_model_counts['Closed']['Google/DeepMind'] += len(closed_df_org)
    elif any([kw in org.lower() for kw in ['openai']]):
        marker = 'hexagon-open'
        org_model_counts['Open']['OpenAI'] += len(open_df_org)
        org_model_counts['Closed']['OpenAI'] += len(closed_df_org)
    elif any([kw in org.lower() for kw in ['anthropic']]):
        marker = 'star'
        org_model_counts['Open']['Anthropic'] += len(open_df_org)
        org_model_counts['Closed']['Anthropic'] += len(closed_df_org)
    elif any([kw in org.lower() for kw in ['microsoft']]):
        marker = 'square'
        org_model_counts['Open']['Microsoft'] += len(open_df_org)
        org_model_counts['Closed']['Microsoft'] += len(closed_df_org)
    else:
        marker = 'circle'
        org_model_counts['Open']['Other'] += len(open_df_org)
        org_model_counts['Closed']['Other'] += len(closed_df_org)
    fig.add_trace(go.Scatter(
        x=open_df_org['date'],
        y=open_df_org['log_flop'],
        text=open_df_org['System'],
        mode='markers',
        name=marker_to_org[marker] + ', open',
        showlegend=not open_added_to_legend[marker],
        marker=dict(
            color=colors['open'],
            opacity=0.5,
            symbol=marker
        )
    ))
    fig.add_trace(go.Scatter(
        x=closed_df_org['date'],
        y=closed_df_org['log_flop'],
        text=closed_df_org['System'],
        mode='markers',
        name=marker_to_org[marker] + ', closed',
        showlegend=not closed_added_to_legend[marker],
        marker=dict(
            color=colors['closed'],
            opacity=0.5,
            symbol=marker
        )
    ))
    if len(closed_df_org) > 0:
        closed_added_to_legend[marker] = True
    if len(open_df_org) > 0:
        open_added_to_legend[marker] = True

# Axis titles
fig.update_layout(xaxis_title='Model publication date')
fig.update_layout(yaxis_title='Training compute (FLOP, log-scale)')

# Format the y-axis labels as 10^N
yvals = list(range(20, 27))
fig.update_yaxes(
    tickmode = 'array',
    tickvals = yvals,
    ticktext = [f'10<sup>{x}</sup>' for x in yvals],
    # ticks="",
    # tickfont=dict(size=20)
)

# Legend title
fig.update_layout(legend_title='Organization, access')

# Margins
fig.update_layout(margin=dict(l=10, r=10, t=40, b=10))

# plotly-white
fig.update_layout(template='plotly_white')

# Sizing
fig.update_layout(
    width=600,
    height=400,
    title='Open and closed models by organization'
)

# Save
save_plot(fig, results_dir, f'compute_open_closed_by_org_{model_selection}_frontier={frontier_selection}_top{top_n}_cutoff={cutoff_date}')

fig.show()

In [76]:
for category in ['Open', 'Closed']:
    print(f"{category}:")
    for org in org_model_counts[category]:
        print(f"    {org}: {org_model_counts[category][org]}")

print(f"Open: Other / Total = {org_model_counts['Open']['Other'] / sum(org_model_counts['Open'].values()):.2%}")
print(f"Closed: Other / Total = {org_model_counts['Closed']['Other'] / sum(org_model_counts['Closed'].values()):.2%}")


Open:
    Google/DeepMind: 2
    OpenAI: 0
    Other: 3
    Microsoft: 0
    Meta: 5
Closed:
    Google/DeepMind: 4
    OpenAI: 4
    Other: 1
    Microsoft: 1
    Meta: 0
Open: Other / Total = 30.00%
Closed: Other / Total = 10.00%


# Llama trend and extrapolation

In [77]:
df_llamas = df_filtered[df_filtered['System'].str.contains('llama', case=False) & 
                        (df_filtered['System'].str.contains('65B') | 
                         df_filtered['System'].str.contains('70B') | 
                         df_filtered['System'].str.contains('405B') |
                         df_filtered['System'].str.contains('hypothetical'))]
df_llamas

Unnamed: 0,System,flop,date,Organization,Notability criteria,Domain,Base model,category,log_flop,date_float
1261,LLaMA-65B,5.5e+23,2023-02-24,Meta AI,"Historical significance,Highly cited",Language,,Open,23.740363,2023.146305
1380,Llama 2-70B,8.1e+23,2023-07-18,Meta AI,"Historical significance,Significant use,Highly...",Language,,Open,23.908485,2023.546544
1639,Llama 3-70B,6.3e+24,2024-04-18,Meta AI,Significant use,Language,,Open,24.799341,2024.296544
1690,Llama 3.1-405B,3.8e+25,2024-07-23,Meta AI,"SOTA improvement,Training cost",Language,,Open,25.579784,2024.560234


In [78]:
llama_fit_result = fit_ols_regression(df_llamas, ['date_float'], 'log_flop')
llama_fit_result.summary()


omni_normtest is not valid with less than 8 observations; 4 samples were given.



0,1,2,3
Dep. Variable:,y,R-squared:,0.928
Model:,OLS,Adj. R-squared:,0.892
Method:,Least Squares,F-statistic:,25.81
Date:,"Mon, 14 Oct 2024",Prob (F-statistic):,0.0366
Time:,11:49:15,Log-Likelihood:,0.80086
No. Observations:,4,AIC:,2.398
Df Residuals:,2,BIC:,1.171
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2515.8801,500.005,-5.032,0.037,-4667.229,-364.531
x1,1.2552,0.247,5.081,0.037,0.192,2.318

0,1,2,3
Omnibus:,,Durbin-Watson:,2.016
Prob(Omnibus):,,Jarque-Bera (JB):,0.612
Skew:,0.015,Prob(JB):,0.736
Kurtosis:,1.084,Cond. No.,7230000.0


In [79]:
llama_log_flop = get_predictions(llama_fit_result, df_llamas, ['date_float'])

In [80]:
# Plot the trend of Llama models
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=df_llamas['date'],
    y=df_llamas['flop'],
    mode='markers',
    text=df_llamas['System'],
    name='Largest Llama models',
    marker=dict(color='blue', size=10, opacity=0.5)
))
fig.add_trace(go.Scatter(
    x=df_llamas['date'],
    y=10**llama_log_flop,
    mode='lines',
    name='Llama trend (including #4)',
    line=dict(color='blue')
))
# Annotate Llama 4
fig.add_annotation(
    x=pd.Timestamp('2025-07-23'),
    y=np.log10(llama_3_405_compute * 10),
    text='Llama 4 (hypothetical)',
    showarrow=True,
    arrowhead=2,
    ax=0,
    ay=-40,
    font=dict(color='black', size=12)
)

# Add closed models with trendline
fit_result = fit_n_phase_exponential(df_filtered, 0, same_intercepts=(False,), same_slopes=(False,))

df_open = df_filtered[df_filtered['category'] == 'Open']
df_closed = df_filtered[df_filtered['category'] == 'Closed']

# Plot the original data points
# fig.add_trace(go.Scatter(
#     x=df_open['date'], y=df_open['log_flop'],
#     mode='markers', name='Open models',
#     marker=dict(color=colors['open'], opacity=0.3, size=10)
# ))
fig.add_trace(go.Scatter(
    x=df_closed['date'], y=df_closed['flop'],
    mode='markers', name='Largest closed models',
    marker=dict(color=colors['closed'], opacity=0.3, size=10)
))

# Plot the fit lines
date_grid = pd.date_range(start=df_filtered['date'].min(), end=df_llamas['date'].max(), freq='D')
log_flop_open = fit_result.predict(pd.Series(date_grid), pd.Series(['Open'] * len(date_grid)))
log_flop_closed = fit_result.predict(pd.Series(date_grid), pd.Series(['Closed'] * len(date_grid)))

# fig.add_trace(go.Scatter(
#     x=date_grid, y=log_flop_open,
#     mode='lines', name='Best Fit Line (Open)',
#     line=dict(color=colors['open'])
# ))
fig.add_trace(go.Scatter(
    x=date_grid, y=10**log_flop_closed,
    mode='lines', name='Closed trend',
    line=dict(color=colors['closed'])
))
fig.update_yaxes(type='log')

fig.update_layout(
    width=800,
    height=400,
    template='plotly_white',
    title='The training compute of Llama models may catch up to top closed models in 2025',
    xaxis_title='Model publication date',
    yaxis_title='Training compute (FLOP)',
)

save_plot(fig, results_dir, f'llama_compute_trend_with_hypothetical_4_{model_selection}_frontier={frontier_selection}_top{top_n}_cutoff={cutoff_date}')

fig.show()