In [305]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [306]:
import numpy as np
import os
import pandas as pd
import plotly.express as px

from data import *
from plotting import *
from regression import *
from utils import *

In [307]:
results_dir = 'results/updated-schema/'
os.makedirs(results_dir, exist_ok=True)

# Data

In [308]:
# Load data
pcd_df = load_pcd_df()

In [309]:
pcd_df

Unnamed: 0,System,Domain,Task,Authors,Notability criteria,Notability criteria notes,Model accessibility,Link,Citations,Reference,...,Organization categorization,Foundation model,Training compute lower bound,Training compute upper bound,Training chip-hours,Code accessibility,Dataset accessibility,Accessibility notes,Field 57,Organization categorization (from Organization)
0,Llama 3,,,,,,,https://engineering.fb.com/2024/03/12/data-cen...,,,...,,,,,,,,,,
1,DBRX,Language,"Chat,Code generation",Mosaic Research Team,,,Open access (restricted use),https://www.databricks.com/blog/introducing-db...,,Introducing DBRX: A New State-of-the-Art Open LLM,...,Industry,,,,,Unreleased,Unreleased,license: https://www.databricks.com/legal/open...,,Industry
2,Claude 3 Haiku,"Multimodal,Language,Vision",Chat,,,,API access,https://www-cdn.anthropic.com/de8ba9b01c9ab7cb...,,"The Claude 3 Model Family: Opus, Sonnet, Haiku",...,Industry,,,,,,,,,Industry
3,Claude 3 Sonnet,"Multimodal,Language,Vision",Chat,,,,API access,https://www-cdn.anthropic.com/de8ba9b01c9ab7cb...,,"The Claude 3 Model Family: Opus, Sonnet, Haiku",...,Industry,,,,,,,,,Industry
4,Claude 3 Opus,"Multimodal,Language,Vision",Chat,,SOTA improvement,,API access,https://www-cdn.anthropic.com/de8ba9b01c9ab7cb...,,"The Claude 3 Model Family: Opus, Sonnet, Haiku",...,Industry,,,,,,,,,Industry
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1408,Self Organizing System,Other,Pattern recognition,W. A. Clark and B. G. Farley,Historical significance,,,https://dl.acm.org/doi/10.1145/1455292.1455309,93.0,Generalization of pattern recognition in a sel...,...,Academia,,,,,,,,,Academia
1409,Genetic algorithm,Other,,NA Barricelli,Historical significance,Possibly first computer simulation of a geneti...,,https://link.springer.com/article/10.1007/BF01...,266.0,Numerical testing of evolution theories,...,Academia,,,,,,,,,Academia
1410,SNARC,Robotics,Maze solving,Marvin Minsky,Historical significance,,,https://en.wikipedia.org/wiki/Stochastic_neura...,33.0,A Neural-Analogue Calculator Based upon a Prob...,...,Academia,,,,,,,,,Academia
1411,Theseus,Robotics,Maze solving,Claude Shannon,Historical significance,,,https://www.technologyreview.com/2018/12/19/13...,0.0,Mighty Mouse,...,Industry,,,,,,,,,Industry


In [310]:
pcd_df.loc[pcd_df['System'] == 'Megatron-BERT']['Model accessibility']

809    Unreleased
Name: Model accessibility, dtype: object

In [311]:
access_df_with_deprecated = pcd_df.dropna(subset=['Publication date', 'Training compute (FLOP)', 'Model accessibility'])
len(access_df_with_deprecated)

239

In [312]:
access_df_with_deprecated['Model accessibility'].unique()

array(['Open access (restricted use)', 'API access', 'Open source',
       'Open access (non-commercial)', 'Hosted access (no API)',
       'Permissive license (depr.)', 'Fully open-source (depr.)',
       'Unreleased', 'Weights available (depr.)'], dtype=object)

In [313]:
for cat in access_df_with_deprecated['Model accessibility'].unique():
    print(cat, len(access_df_with_deprecated.loc[access_df_with_deprecated['Model accessibility'] == cat]))

Open access (restricted use) 17
API access 15
Open source 55
Open access (non-commercial) 13
Hosted access (no API) 4
Permissive license (depr.) 39
Fully open-source (depr.) 52
Unreleased 40
Weights available (depr.) 4


In [314]:
open_access_categories = ['Open source', 'Open access (restricted use)', 'Open access (non-commercial)']
closed_access_categories = ['API access', 'Hosted access (no API)', 'Unreleased']

In [315]:
access_df = access_df_with_deprecated.loc[
    access_df_with_deprecated['Model accessibility'].isin(open_access_categories + closed_access_categories)
]

In [316]:
open_access_df = access_df.loc[access_df['Model accessibility'].isin(open_access_categories)]
len(open_access_df)

85

In [317]:
closed_access_df = access_df.loc[access_df['Model accessibility'].isin(closed_access_categories)]
len(closed_access_df)

59

In [318]:
# Add column with binary access label
access_df['Model open/closed'] = access_df['Model accessibility'].apply(
    lambda x: 'Open' if x in open_access_categories else 'Closed'
)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



## Compute percentile filtering

In [319]:
outlier_window_size = 2  # years
start_large_scale_era = '2015-09-01'

In [320]:
pcd_df['Publication date'] = pd.to_datetime(pcd_df['Publication date'])
pcd_df.sort_values('Publication date', inplace=True)
pcd_df.dropna(subset=['Publication date', 'Notability criteria', 'Training compute (FLOP)'], inplace=True)

In [321]:
systems_by_percentile = {}
percentile_interval = 5
for percentile in range(95, -5, -percentile_interval):
  print(percentile)
  percentile_compute_low = np.zeros(len(pcd_df))
  percentile_compute_high = np.zeros(len(pcd_df))
  # Iterate through each row and calculate the 2-year moving average for each date
  for i, (index, row) in enumerate(pcd_df.iterrows()):
    # Define the 2-year window
    start_date = row['Publication date'] - pd.DateOffset(years=outlier_window_size/2)
    end_date = row['Publication date'] + pd.DateOffset(years=outlier_window_size/2)

    # Filter the DataFrame for this window
    window_df = pcd_df[(pcd_df['Publication date'] >= start_date) & (pcd_df['Publication date'] <= end_date)]

    percentile_compute_low[i] = np.percentile(window_df['Training compute (FLOP)'], percentile)
    percentile_compute_high[i] = np.percentile(window_df['Training compute (FLOP)'], percentile + percentile_interval)

  systems_flag = pcd_df['Training compute (FLOP)'] > np.array(percentile_compute_low)
  extra_systems_flag = pcd_df['Training compute (FLOP)'] <= np.array(percentile_compute_high)

  # raise Exception("Edit the following line if you want to consider models released after 2023-12-31.")
  extra_systems = pcd_df['System'][systems_flag & extra_systems_flag & (pcd_df['Publication date'] > pd.to_datetime('2015-09-30'))].values

  systems_by_percentile[percentile] = list(extra_systems)

95
90
85
80
75
70
65
60
55
50
45
40
35
30
25
20
15
10
5
0


In [322]:
systems_by_percentile

{95: ['GNMT',
  'AlphaGo Master',
  'AlphaGo Zero',
  'AlphaZero',
  'ResNeXt-101 32x48d',
  'Megatron-BERT',
  'OpenAI Five',
  'Meena',
  'GPT-3 175B (davinci)',
  'Megatron-Turing NLG 530B',
  'ERNIE 3.0 Titan',
  'PaLM (540B)',
  'Minerva (540B)',
  'GPT-4',
  'PaLM 2',
  'Inflection-2',
  'Gemini Ultra'],
 90: ['NASv3 (CIFAR-10)',
  'FTW',
  'T5-11B',
  'AlphaStar',
  'mT5-XXL',
  'Switch',
  'Gopher (280B)',
  'Chinchilla',
  'U-PaLM (540B)',
  'GPT-3.5 (text-davinci-003)',
  'Claude 2',
  'Falcon 180B'],
 85: ['AlphaGo Fan',
  'AlphaGo Lee',
  'BigGAN-deep 512x512',
  'Megatron-LM (8.3B)',
  'OpenAI Five Rerun',
  'Turing-NLG',
  'Yuan 1.0',
  'GLaM',
  'LaMDA',
  'OPT-175B',
  'BLOOM-176B',
  'Llama 2-70B',
  'ChatGLM3',
  'Qwen-72B'],
 80: ['JFT',
  'OpenAI TI7 DOTA 1v1',
  'AmoebaNet-A (F=448)',
  'GPT-2 (1.5B)',
  'iGPT-XL',
  'DALL-E',
  'Meta Pseudo Labels',
  'ProtT5-XXL',
  'ByT5-XXL',
  'GOAT',
  'HyperCLOVA',
  'AlphaCode',
  'Flamingo',
  'Parti',
  'BlenderBot 3',
  

In [323]:
compute_percentile_threshold = 50
selected_systems = []
for percentile in range(compute_percentile_threshold, 100, 5):
    selected_systems.extend(systems_by_percentile[percentile])
selected_systems

['BIDAF',
 'Transformer',
 'GPT',
 'GBERT-Large',
 'wave2vec 2.0 LARGE',
 'AlphaFold 2',
 'DeBERTa',
 'HuBERT',
 'XGLM',
 'Imagen',
 'NLLB',
 'ESM2-3B',
 'LLaMA-7B',
 'LLaMA-13B',
 'WizardLM-7B',
 'Pangu-Weather',
 'LSTM (Hebbian, Cache, MbPA)',
 'SciBERT',
 'DD-PPO',
 'ViT-Huge/14',
 'MSA Transformer',
 'M6-T',
 'T0-XXL',
 'Whisper',
 'XGen-7B',
 'DeepStack',
 'PNASNet-5',
 'Population-based DRL',
 'Mesh-TensorFlow Transformer 2.9B (translation)',
 'Megatron-LM (355M)',
 'T5-3B',
 'CamemBERT',
 'Noisy Student (L2)',
 'Once for All',
 'CLIP (ViT L/14@336px)',
 'ERNIE 3.0',
 'BASIC-L',
 'XGLM-7.5B',
 'ESM2-15B',
 'PaLI',
 'Taiyi-Stable Diffusion',
 'Llama 2-7B',
 'Nemotron-3-8B',
 'ResNet-152 (ImageNet)',
 'PolyNet',
 'MoE',
 'YOLOv3',
 'Transformer (Adaptive Input Embeddings)',
 'Mesh-TensorFlow Transformer 4.9B (language modelling)',
 'BERT-Large-CAS (PTB+WT2+WT103)',
 'Conformer + Wav2vec 2.0 + Noisy Student',
 'ProtT5-XXL-BFD',
 'CogView',
 'ALIGN',
 'Florence',
 'Stable Diffusion (

# Regression

In [324]:
access_df['Publication date (float)'] = datetime_to_float_year(
    pd.to_datetime(access_df['Publication date'])
)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [325]:
filtered_access_df = access_df.loc[access_df['System'].isin(selected_systems)]

In [326]:
reg_results = fit_ols_regression(
    filtered_access_df,
    ['Publication date (float)'],
    'Training compute (FLOP)',
    logy=True
)
reg_results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.576
Model:,OLS,Adj. R-squared:,0.569
Method:,Least Squares,F-statistic:,90.94
Date:,"Fri, 05 Apr 2024",Prob (F-statistic):,4.23e-14
Time:,15:38:01,Log-Likelihood:,-81.836
No. Observations:,69,AIC:,167.7
Df Residuals:,67,BIC:,172.1
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1104.7897,118.274,-9.341,0.000,-1340.866,-868.713
x1,0.5578,0.058,9.536,0.000,0.441,0.675

0,1,2,3
Omnibus:,0.956,Durbin-Watson:,1.673
Prob(Omnibus):,0.62,Jarque-Bera (JB):,0.889
Skew:,-0.001,Prob(JB):,0.641
Kurtosis:,2.444,Cond. No.,2470000.0


In [327]:
open_reg_results = fit_ols_regression(
    filtered_access_df[filtered_access_df['Model open/closed'] == 'Open'],
    ['Publication date (float)'],
    'Training compute (FLOP)',
    logy=True
)
open_reg_results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.696
Model:,OLS,Adj. R-squared:,0.686
Method:,Least Squares,F-statistic:,68.7
Date:,"Fri, 05 Apr 2024",Prob (F-statistic):,2.99e-09
Time:,15:38:01,Log-Likelihood:,-26.927
No. Observations:,32,AIC:,57.85
Df Residuals:,30,BIC:,60.79
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1175.2664,144.549,-8.131,0.000,-1470.476,-880.057
x1,0.5924,0.071,8.288,0.000,0.446,0.738

0,1,2,3
Omnibus:,0.248,Durbin-Watson:,1.572
Prob(Omnibus):,0.883,Jarque-Bera (JB):,0.443
Skew:,-0.013,Prob(JB):,0.801
Kurtosis:,2.424,Cond. No.,2850000.0


In [328]:
closed_reg_results = fit_ols_regression(
    filtered_access_df[filtered_access_df['Model open/closed'] == 'Closed'],
    ['Publication date (float)'],
    'Training compute (FLOP)',
    logy=True
)
closed_reg_results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.676
Model:,OLS,Adj. R-squared:,0.667
Method:,Least Squares,F-statistic:,73.08
Date:,"Fri, 05 Apr 2024",Prob (F-statistic):,4.33e-10
Time:,15:38:01,Log-Likelihood:,-41.676
No. Observations:,37,AIC:,87.35
Df Residuals:,35,BIC:,90.57
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1195.4736,142.574,-8.385,0.000,-1484.915,-906.032
x1,0.6029,0.071,8.549,0.000,0.460,0.746

0,1,2,3
Omnibus:,1.533,Durbin-Watson:,1.71
Prob(Omnibus):,0.465,Jarque-Bera (JB):,1.155
Skew:,-0.43,Prob(JB):,0.561
Kurtosis:,2.907,Cond. No.,2280000.0


In [329]:
print('All:')
print_growth_rates(reg_results)
print('Open:')
print_growth_rates(open_reg_results)
print('Closed:')
print_growth_rates(closed_reg_results)

All:
Adj. R^2=0.57
0.56 OOMs/year (95% CI: 0.44, 0.67)
3.6x/year (95% CI: 2.8x, 4.7x)
doubling time of 6 months (95% CI: 5, 8)
Open:
Adj. R^2=0.69
0.59 OOMs/year (95% CI: 0.45, 0.74)
3.9x/year (95% CI: 2.8x, 5.5x)
doubling time of 6 months (95% CI: 5, 8)
Closed:
Adj. R^2=0.67
0.60 OOMs/year (95% CI: 0.46, 0.75)
4.0x/year (95% CI: 2.9x, 5.6x)
doubling time of 6 months (95% CI: 5, 8)


## Predictions

In [330]:
pred_start_year = 2015
pred_end_year = 2025
pred_start_date = f'{pred_start_year}-01-01'
pred_end_date = f'{pred_end_year}-01-01'
num_preds = (pred_end_year - pred_start_year) * 12 + 1

pred_years = pd.DataFrame({'Publication date (float)': np.linspace(pred_start_year, pred_end_year, num_preds)})
pred_years

Unnamed: 0,Publication date (float)
0,2015.000000
1,2015.083333
2,2015.166667
3,2015.250000
4,2015.333333
...,...
116,2024.666667
117,2024.750000
118,2024.833333
119,2024.916667


In [331]:
predicted_open_df = get_predictions(open_reg_results, pred_years, ['Publication date (float)'])
predicted_open_df['Publication date'] = predicted_open_df['Publication date (float)'].apply(float_year_to_datetime)
predicted_open_df

Unnamed: 0,mean,mean_se,mean_ci_lower,mean_ci_upper,obs_ci_lower,obs_ci_upper,Publication date (float),Publication date
0,18.490950,0.530567,17.407388,19.574513,16.886007,20.095893,2015.000000,2015-01-01
1,18.540320,0.524724,17.468691,19.611949,16.943409,20.137231,2015.083333,2015-01-31
2,18.589689,0.518883,17.529988,19.649391,17.000758,20.178621,2015.166667,2015-03-02
3,18.639059,0.513046,17.591280,19.686838,17.058054,20.220064,2015.250000,2015-04-02
4,18.688429,0.507210,17.652567,19.724291,17.115296,20.261561,2015.333333,2015-05-02
...,...,...,...,...,...,...,...,...
116,24.217826,0.198827,23.811767,24.623885,22.966179,25.469472,2024.666667,2024-09-01
117,24.267195,0.203954,23.850665,24.683726,23.012112,25.522278,2024.750000,2024-10-01
118,24.316565,0.209126,23.889473,24.743657,23.057938,25.575192,2024.833333,2024-10-31
119,24.365935,0.214338,23.928198,24.803671,23.103655,25.628214,2024.916667,2024-12-01


In [332]:
predicted_open_df.set_index('Publication date', inplace=True)

In [333]:
predicted_closed_df = get_predictions(closed_reg_results, pred_years, ['Publication date (float)'])
predicted_closed_df['Publication date'] = predicted_closed_df['Publication date (float)'].apply(float_year_to_datetime)
predicted_closed_df

Unnamed: 0,mean,mean_se,mean_ci_lower,mean_ci_upper,obs_ci_lower,obs_ci_upper,Publication date (float),Publication date
0,19.285054,0.493049,18.284111,20.285997,17.433332,21.136776,2015.000000,2015-01-01
1,19.335292,0.487370,18.345878,20.324707,17.489776,21.180808,2015.083333,2015-01-31
2,19.385530,0.481696,18.407635,20.363426,17.546164,21.224896,2015.166667,2015-03-02
3,19.435768,0.476027,18.469381,20.402155,17.602495,21.269042,2015.250000,2015-04-02
4,19.486007,0.470363,18.531118,20.440895,17.658768,21.313245,2015.333333,2015-05-02
...,...,...,...,...,...,...,...,...
116,25.112680,0.240757,24.623917,25.601443,23.479928,26.745432,2024.666667,2024-09-01
117,25.162918,0.245782,24.663954,25.661882,23.527084,26.798752,2024.750000,2024-10-01
118,25.213156,0.250843,24.703917,25.722396,23.574159,26.852154,2024.833333,2024-10-31
119,25.263395,0.255940,24.743809,25.782980,23.621153,26.905636,2024.916667,2024-12-01


In [334]:
predicted_closed_df.set_index('Publication date', inplace=True)

## Differences between trends

In [335]:
NUM_SAMPLES = 1000

In [336]:
open_reg_results.conf_int()

array([[-1.47047555e+03, -8.80057212e+02],
       [ 4.46457267e-01,  7.38413534e-01]])

In [337]:
median_date = datetime_to_float_year(pd.to_datetime(filtered_access_df['Publication date'])).median()
median_date

2022.4522594700363

In [338]:
compute_refs = {}
for key, value in {'open': open_reg_results, 'closed': closed_reg_results}.items():
    X = np.array([median_date, 1])  # dummy 1 value makes this work
    X = sm.add_constant(X)
    ref_preds = value.get_prediction(X).summary_frame()
    compute_ref_low = ref_preds['mean_ci_lower'][0]
    compute_ref_high = ref_preds['mean_ci_upper'][0]
    compute_refs[key] = compute_ref_low, compute_ref_high
compute_refs

{'open': (22.695184337746053, 23.116680787246505),
 'closed': (23.503029133104835, 24.052385347745233)}

In [339]:
# Open
open_ci = open_reg_results.conf_int()
open_slope = lognorm_from_90_ci(
    10**open_ci[1][0],
    10**open_ci[1][1],
    NUM_SAMPLES,
)
# Closed
closed_ci = closed_reg_results.conf_int()
closed_slope = lognorm_from_90_ci(
    10**closed_ci[1][0],
    10**closed_ci[1][1],
    NUM_SAMPLES,
)

In [340]:
print_median_and_ci(open_slope)
print_median_and_ci(closed_slope)

Median: 4 [90% CI: 2.8, 5.4]
Median: 4 [90% CI: 3, 5.5]


In [341]:
# Calculate when the closed compute trend was equal to the open compute trend today
current_date = '2024-04-01'
current_open_compute_df = predicted_open_df.loc[current_date]
current_open_compute = lognorm_from_90_ci(
    10**current_open_compute_df['mean_ci_lower'],
    10**current_open_compute_df['mean_ci_upper'],
    NUM_SAMPLES,
)
print_median_and_ci(current_open_compute)

Median: 9.3e+23 [90% CI: 4.1e+23, 2.1e+24]


In [342]:
# Same for closed
current_closed_compute_df = predicted_closed_df.loc[current_date]
current_closed_compute = lognorm_from_90_ci(
    10**current_closed_compute_df['mean_ci_lower'],
    10**current_closed_compute_df['mean_ci_upper'],
    NUM_SAMPLES,
)
print_median_and_ci(current_closed_compute)

Median: 7.7e+24 [90% CI: 3.1e+24, 2.1e+25]


In [343]:
compute_gap = np.log10(current_closed_compute) - np.log10(current_open_compute)
print(
    f'Open access models are {np.median(compute_gap):.1f} OOMs ' + 
    f'[{np.percentile(compute_gap, 2.5):.1f}; {np.percentile(compute_gap, 97.5):.1f}] ' + 
    'behind closed access models'
)

Open access models are 0.9 OOMs [0.3; 1.6] behind closed access models


In [344]:
# Calculate when the closed compute trend was equal to the open compute trend today
# TODO: do this with confidence interval



In [345]:
current_date = 2024.25
current_open_compute = open_reg_results.params[1] * current_date + open_reg_results.params[0]
current_open_compute

23.97097769370089

In [346]:
closed_compute_match_date = (current_open_compute - closed_reg_results.params[0]) / closed_reg_results.params[1]
closed_compute_match_date

2022.772849881161

In [347]:
open_time_lag = current_date - closed_compute_match_date
print(f'Open access models are {open_time_lag:.1f} years behind closed access models')

Open access models are 1.5 years behind closed access models


# Plots

In [348]:
fig = px.scatter(
    access_df_with_deprecated,
    x='Publication date',
    y='Training compute (FLOP)',
    color='Model accessibility',
    hover_data=['System'],
    log_y=True,
)

save_plot(fig, results_dir, 'training_compute_by_model_accessibility_depr')

fig.show()

In [349]:
fig = px.scatter(
    access_df,
    x='Publication date',
    y='Training compute (FLOP)',
    color='Model accessibility',
    hover_data=['System'],
    log_y=True,
)

save_plot(fig, results_dir, 'training_compute_by_model_accessibility')

fig.show()

In [350]:
fig = px.scatter(
    # access_df,
    access_df.loc[access_df['System'].isin(selected_systems)],
    x='Publication date',
    y='Training compute (FLOP)',
    color='Model open/closed',
    hover_data=['System'],
    log_y=True,
)

save_plot(fig, results_dir, 'training_compute_by_open_closed')

fig.show()

In [351]:
fig = px.scatter(
    # access_df,
    access_df.loc[access_df['System'].isin(selected_systems)],
    x='Publication date',
    y='Training compute (FLOP)',
    color='Model open/closed',
    hover_data=['System'],
    log_y=True,
)

# Marker color
# fig.update_traces(
#     marker=dict(
#         color='rgb(0,100,200)',
#     ),
#     selector=dict(mode='markers'),
# )

# Shade in CI
fig.add_scatter(
    x=predicted_open_df['Publication date'],
    y=10**predicted_open_df['mean_ci_lower'],
    mode='lines',
    line=dict(width=0),
    showlegend=False,
)
fig.add_scatter(
    x=predicted_open_df['Publication date'],
    y=10**predicted_open_df['mean_ci_upper'],
    mode='lines',
    fill='tonexty',
    fillcolor='rgba(230,100,0,0.2)',
    line=dict(width=0),
    name='95% CI of mean',
)
fig.add_scatter(
    x=predicted_open_df['Publication date'],
    y=10**predicted_open_df['mean'],
    mode='lines',
    line=dict(color='rgb(230,100,0)'),
    name=f'Regression mean (growth rate: {10**reg_results.params[1]:.1f}x per year)',
)

fig.add_scatter(
    x=predicted_closed_df['Publication date'],
    y=10**predicted_closed_df['mean_ci_lower'],
    mode='lines',
    line=dict(width=0),
    showlegend=False,
)
fig.add_scatter(
    x=predicted_closed_df['Publication date'],
    y=10**predicted_closed_df['mean_ci_upper'],
    mode='lines',
    fill='tonexty',
    fillcolor='rgba(0,100,200,0.2)',
    line=dict(width=0),
    name='95% CI of mean',
)
fig.add_scatter(
    x=predicted_closed_df['Publication date'],
    y=10**predicted_closed_df['mean'],
    mode='lines',
    line=dict(color='rgb(0,100,200)'),
    name=f'Regression mean (growth rate: {10**reg_results.params[1]:.1f}x per year)',
)

fig.update_traces(textposition='top center')

# legend on top
fig.update_layout(legend=dict(
    orientation='h',
    yanchor='top',
    y=-0.15,
    xanchor='center',
    x=0.5,
))

# axis labels
# fig.update_xaxes(title_text='Publication date')
# fig.update_yaxes(title_text='Cost (2023 USD)')

# title


# update size
fig.update_layout(
    autosize=False,
    width=800,
    height=600,
    title_x=0.5,
)

# font size
fig.update_layout(
    font=dict(
        size=14,
    )
)

# axis limits
fig.update_xaxes(range=[pred_start_date, pred_end_date])

# margins
fig.update_layout(margin=dict(l=10, r=10, t=40, b=10))

save_plot(fig, results_dir, 'open_closed_regression')

fig.show()

KeyError: 'Publication date'

In [None]:
access_df.columns

Index(['System', 'Domain', 'Task', 'Authors', 'Notability criteria',
       'Notability criteria notes', 'Model accessibility', 'Link', 'Citations',
       'Reference', 'Publication date', 'Organization', 'Parameters',
       'Parameters notes', 'Training compute (FLOP)', 'Training compute notes',
       'Training dataset', 'Training dataset notes',
       'Training dataset size (datapoints)', 'Dataset size notes', 'Epochs',
       'Inference compute (FLOP)', 'Inference compute notes',
       'Training time (hours)', 'Training time notes', 'Training hardware',
       'Approach', 'Training compute cost (2020 USD)', 'Compute cost notes',
       'Compute sponsor categorization', 'Confidence', 'Abstract',
       'Last modified', 'Created By', 'Benchmark data', 'Exclude',
       'Country (from Organization)', 'Base model', 'Finetune compute (FLOP)',
       'Finetune compute notes', 'Hardware quantity', 'Hardware utilization',
       'Training cost trends', 'Training cloud compute vendor',
 