# Setup

In [44]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [45]:
from contextlib import redirect_stdout
import numpy as np
import os
import pandas as pd
import plotly.express as px

from cost import *
from plotting import *
from prices import *
from imputation import *
from inflation import *
from regression import *
from utils import *

In [46]:
# --- Run configuration -------------------------------------------------------
# Which estimator to run; must be one of the keys of `estimation_method_lookup`.
estimation_method = 'hardware-capex-energy'
# Passed to load_data_for_cost_estimation together with `compute_threshold`.
compute_threshold_method = 'top_n'  # top_n, window_percentile
compute_threshold = 10  # e.g. 10 to select top 10; 75 to select top 25%
# Free-form label; in this notebook it is only used to name the results directory.
variant = 'original'
# Substrings of 'System' names to drop from cost_df in the "Exclusion" section.
exclude_models_containing = []  # e.g. ['AlphaGo Master', 'AlphaGo Zero']

estimation_method_lookup = {
    'hardware-capex-energy': estimate_hardware_capex_energy,
    'hardware-acquisition': estimate_hardware_acquisition_cost,
    'cloud': estimate_cloud_costs,
}
# Fail with an actionable message instead of a bare KeyError on a typo.
if estimation_method not in estimation_method_lookup:
    raise ValueError(
        f'Unknown estimation_method {estimation_method!r}; '
        f'expected one of {sorted(estimation_method_lookup)}'
    )
cost_estimation_function = estimation_method_lookup[estimation_method]

# NOTE: the trailing slash is relied upon by cells that build paths by concatenation.
results_dir = f'results/{estimation_method}-{compute_threshold_method}={compute_threshold}-{variant}/'
os.makedirs(results_dir, exist_ok=True)

# Load data

In [47]:
# Load the three input tables, selecting models according to the configured
# compute threshold settings.
frontier_pcd_df, hardware_df, price_df = load_data_for_cost_estimation(
    compute_threshold=compute_threshold,
    compute_threshold_method=compute_threshold_method,
)

In [48]:
len(frontier_pcd_df), len(hardware_df), len(price_df)

(64, 5510, 379)

# Cost estimation

In [49]:
# Run the selected estimator. Anything it prints is redirected into a log file
# in the results directory rather than shown in the notebook.
# os.path.join avoids the doubled slash produced by f'{results_dir}/...'
# (results_dir already ends with '/').
cost_estimation_log_path = os.path.join(results_dir, 'cost_estimation.out')
with open(cost_estimation_log_path, 'w') as f, redirect_stdout(f):
    cost_df = cost_estimation_function(frontier_pcd_df, hardware_df, price_df)

In [50]:
# Per-component cost breakdown, only produced by the 'hardware-capex-energy' estimator.
# NOTE: `component_cost_df` is left undefined for the other estimation methods, so
# the "Cost components" section further down can only run with this method.
if estimation_method == 'hardware-capex-energy':
    # Work on a copy so this second estimation pass cannot alter frontier_pcd_df.
    frontier_pcd_df_copy = frontier_pcd_df.copy()
    # os.path.join avoids the doubled slash (results_dir already ends with '/').
    component_log_path = os.path.join(results_dir, 'component_cost_estimation.out')
    with open(component_log_path, 'w') as f, redirect_stdout(f):
        component_cost_df = cost_estimation_function(
            frontier_pcd_df_copy, hardware_df, price_df, separate_components=True
        )

In [51]:
cost_df

Unnamed: 0,System,Domain,Task,Authors,Notability criteria,Notability criteria notes,Model accessibility,Link,Citations,Reference,...,Training compute upper bound,Training chip-hours,Code accessibility,Dataset accessibility,Accessibility notes,Organization categorization (from Organization),Possibly over 1e23 FLOP,Training cost trends 2,Training cost trends 3,Cost
88,Gemini 1.0 Ultra,"Multimodal,Language,Vision","Language modelling,Visual question answering,C...",Gemini Team,SOTA improvement,""" Evaluation on a broad range of benchmarks sh...",Hosted access (no API),https://storage.googleapis.com/deepmind-media/...,633.0,Gemini: A Family of Highly Capable Multimodal ...,...,,132000000.0,,,,Industry,,,,2.982734e+07
104,Inflection-2,Language,Language modelling,,Significant use,Inflection-2 either already powers Pi or soon ...,Hosted access (no API),https://inflection.ai/inflection-2,,Inflection-2: The Next Step Up,...,,,,,"via Pi, no API",Industry,,,,1.293179e+07
130,Grok-1,Language,"Language modelling,Chat",,SOTA improvement,"""On these benchmarks, Grok-1 displayed strong ...",Open source,"https://x.ai/model-card/, https://x.ai/blog/gr...",,Announcing Grok,...,7.0,,Unreleased,Unreleased,apache 2.0,Industry,checked,,,
193,Falcon-180B,Language,Language modelling,"Ebtesam Almazrouei, Hamza Alobeidli, Abdulaziz...",SOTA improvement,"""It's currently at the top of the Hugging Face...",Open access (restricted use),https://falconllm.tii.ae/falcon-180b.html; htt...,86.0,The Falcon Series of Open Language Models,...,,17694720.0,,,"""Falcon 180b can be commercially used but unde...",Government,checked,,,1.027732e+07
243,Claude 2,Language,"Language modelling,Chat",,Historical significance,,API access,"https://www.anthropic.com/index/claude-2, http...",0.0,,...,,,,,,Industry,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1179,Xception,Vision,Image classification,François Chollet,Highly cited,,,https://arxiv.org/abs/1610.02357,11578.0,Xception: Deep Learning with Depthwise Separab...,...,,43200.0,,,,Industry,,,,1.155451e+04
1180,GNMT,Language,Translation,"Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc ...",Highly cited,,Hosted access (no API),https://arxiv.org/abs/1609.08144,6196.0,Google's Neural Machine Translation System: Br...,...,,655730.0,,,presumably deployed via Google translate,Industry,,,,1.774592e+05
1222,AlphaGo Lee,Games,Go,"David Silver, Aja Huang, Chris J. Maddison, Ar...",Highly cited,,,https://www.nature.com/articles/nature16961,14887.0,Mastering the game of Go with deep neural netw...,...,,,,,,Industry,,,,
1226,ResNet-152 (ImageNet),Vision,Image classification,"Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun",Highly cited,,,https://arxiv.org/abs/1512.03385,156882.0,Deep Residual Learning for Image Recognition,...,,,,,,Industry,,,,


In [52]:
cost_df['Cost'].notna().sum()

45

In [53]:
cost_df.dropna(subset=['Cost'])['Training time (hours)'].notna().sum()

30

In [54]:
cost_df.dropna(subset=['Cost'])['Hardware utilization'].notna().sum()

18

Exclusion

In [55]:
cost_df[['System', 'Publication date']].tail(15)

Unnamed: 0,System,Publication date
1045,ResNeXt-101 32x48d,2018-05-02
1076,AlphaZero,2017-12-05
1092,AlphaGo Zero,2017-10-18
1114,OpenAI TI7 DOTA 1v1,2017-08-11
1126,JFT,2017-07-10
1148,MoE,2017-01-23
1151,Libratus,2017-01-01
1152,AlphaGo Master,2017-01-01
1168,PolyNet,2016-11-17
1171,NASv3 (CIFAR-10),2016-11-05


In [56]:
# Drop every system whose name contains one of the configured keywords.
# - regex=False: keywords are matched literally, so names containing regex
#   metacharacters (e.g. 'PaLM (540B)') behave as expected.
# - na=False: rows with a missing name never match a keyword and are therefore
#   kept (the previous `== False` comparison silently dropped them).
for kw in exclude_models_containing:
    cost_df = cost_df[~cost_df['System'].str.contains(kw, regex=False, na=False)]
cost_df[['System', 'Publication date']].tail(15)

Unnamed: 0,System,Publication date
1045,ResNeXt-101 32x48d,2018-05-02
1076,AlphaZero,2017-12-05
1092,AlphaGo Zero,2017-10-18
1114,OpenAI TI7 DOTA 1v1,2017-08-11
1126,JFT,2017-07-10
1148,MoE,2017-01-23
1151,Libratus,2017-01-01
1152,AlphaGo Master,2017-01-01
1168,PolyNet,2016-11-17
1171,NASv3 (CIFAR-10),2016-11-05


Use the cell below to check data availability for a specific system.

In [57]:
# system = 'WizardLM-7B'
# row = cost_df.loc[cost_df['System'] == system]
# print('Cost:', row['Cost'].values[0])
# print('Training hardware:', row['Training hardware'].values[0])
# print('Training time (hours):', row['Training time (hours)'].values[0])
# print('Hardware quantity:', row['Hardware quantity'].values[0])
# print('Hardware utilization:', row['Hardware utilization'].values[0])

# Apply inflation adjustment

In [58]:
cost_df['Cost'].dropna()

88      2.982734e+07
104     1.293179e+07
193     1.027732e+07
294     4.816068e+06
336     4.015593e+07
396     4.534422e+06
408     8.833166e+05
423     2.915239e+06
425     2.903292e+06
448     1.436066e+06
449     8.042599e+05
471     3.121761e+06
477     3.384675e+05
514     7.202773e+05
524     2.897985e+06
546     2.224292e+05
576     5.246842e+05
580     5.975319e+05
622     3.590762e+06
629     5.674315e+05
650     8.219443e+04
672     8.940160e+04
680     8.287237e+04
718     5.211218e+04
734     1.345841e+05
737     1.224800e+05
799     1.010278e+05
806     2.107048e+06
841     5.298256e+04
846     1.968088e+05
873     1.180148e+05
877     7.087411e+04
891     6.216056e+05
892     1.070241e+05
917     8.276500e+04
1003    4.799969e+03
1076    2.126712e+05
1092    5.674602e+05
1126    1.596084e+04
1148    3.538189e+03
1152    4.317229e+05
1168    5.636840e+02
1179    1.155451e+04
1180    1.774592e+05
1228    1.854566e+02
Name: Cost, dtype: float64

In [59]:
cost_df = adjust_column_for_inflation(cost_df, 'Cost', 'data/PCU518210518210.csv', '2023-12-01')

In [60]:
cost_df['Cost (inflation-adjusted)'].dropna()

88      2.982734e+07
104     1.296196e+07
193     1.034091e+07
294     4.865570e+06
336     4.058659e+07
396     4.625551e+06
408     9.010687e+05
423     2.973979e+06
425     2.961791e+06
448     1.464701e+06
449     8.202966e+05
471     3.180656e+06
477     3.448529e+05
514     7.316676e+05
524     2.945950e+06
546     2.299500e+05
576     5.414374e+05
580     6.166111e+05
622     3.704291e+06
629     5.864088e+05
650     8.479979e+04
672     9.245338e+04
680     8.570126e+04
718     5.384428e+04
734     1.396636e+05
737     1.271026e+05
799     1.053921e+05
806     2.211639e+06
841     5.556354e+04
846     2.067604e+05
873     1.257581e+05
877     7.552439e+04
891     6.629848e+05
892     1.141485e+05
917     8.907265e+04
1003    5.170457e+03
1076    2.299186e+05
1092    6.134806e+05
1126    1.723959e+04
1148    3.863735e+03
1152    4.714453e+05
1168    6.155480e+02
1179    1.261763e+04
1180    1.937871e+05
1228    2.063058e+02
Name: Cost (inflation-adjusted), dtype: float64

In [61]:
assert cost_df['Cost (inflation-adjusted)'].notna().sum() == cost_df['Cost'].notna().sum()

# Regression

In [62]:
cost_df['Publication date (float)'] = datetime_to_float_year(pd.to_datetime(cost_df['Publication date']))

In [63]:
reg_results = fit_ols_regression(cost_df, ['Publication date (float)'], 'Cost (inflation-adjusted)', logy=True)
reg_results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.607
Model:,OLS,Adj. R-squared:,0.598
Method:,Least Squares,F-statistic:,66.34
Date:,"Mon, 27 May 2024",Prob (F-statistic):,2.96e-10
Time:,00:34:56,Log-Likelihood:,-47.264
No. Observations:,45,AIC:,98.53
Df Residuals:,43,BIC:,102.1
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-778.9320,96.309,-8.088,0.000,-973.157,-584.707
x1,0.3882,0.048,8.145,0.000,0.292,0.484

0,1,2,3
Omnibus:,1.469,Durbin-Watson:,2.05
Prob(Omnibus):,0.48,Jarque-Bera (JB):,1.361
Skew:,0.403,Prob(JB):,0.506
Kurtosis:,2.723,Cond. No.,1850000.0


In [64]:
# Write the growth-rate summary to the results directory, then print it again
# so it is also visible in the notebook.
# os.path.join avoids the doubled slash (results_dir already ends with '/').
with open(os.path.join(results_dir, 'regression_results.out'), 'w') as f, redirect_stdout(f):
    print_growth_rates(reg_results)
print_growth_rates(reg_results)

N=45.0
R^2=0.61
0.39 OOMs/year (95% CI: 0.29, 0.48)
2.4x/year (95% CI: 2.0x, 3.1x)
doubling time of 9 months (95% CI: 7, 12)


In [65]:
# Prediction window for the fitted trend, as years and as ISO date strings
# (the strings are reused as x-axis limits when plotting).
pred_start_year, pred_end_year = 2015, 2025
pred_start_date = f'{pred_start_year}-01-01'
pred_end_date = f'{pred_end_year}-01-01'

# 100 evenly spaced fractional years spanning the window (both ends included).
pred_years = pd.DataFrame(
    {'Publication date (float)': np.linspace(pred_start_year, pred_end_year, 100)}
)
pred_years

Unnamed: 0,Publication date (float)
0,2015.00000
1,2015.10101
2,2015.20202
3,2015.30303
4,2015.40404
...,...
95,2024.59596
96,2024.69697
97,2024.79798
98,2024.89899


In [66]:

# Evaluate the fitted cost regression on the prediction grid.
# The regression was fitted with logy=True and the plotting cell applies 10** to
# these columns, so the values here are in log10 units.
predicted_cost_df = get_predictions(reg_results, pred_years, ['Publication date (float)'])
# Add a datetime version of the fractional year for use as the plot's x values.
predicted_cost_df['Publication date'] = predicted_cost_df['Publication date (float)'].apply(float_year_to_datetime)
predicted_cost_df

Unnamed: 0,mean,mean_se,mean_ci_lower,mean_ci_upper,obs_ci_lower,obs_ci_upper,Publication date (float),Publication date
0,3.262606,0.293062,2.671590,3.853621,1.718075,4.807136,2015.00000,2015-01-01
1,3.301816,0.288575,2.719849,3.883784,1.760725,4.842908,2015.10101,2015-02-06
2,3.341027,0.284100,2.768085,3.913969,1.803321,4.878733,2015.20202,2015-03-15
3,3.380238,0.279635,2.816299,3.944176,1.845864,4.914612,2015.30303,2015-04-21
4,3.419448,0.275183,2.864489,3.974408,1.888352,4.950545,2015.40404,2015-05-28
...,...,...,...,...,...,...,...,...
95,6.987622,0.212023,6.560036,7.415208,5.497955,8.477288,2024.59596,2024-08-06
96,7.026833,0.216213,6.590798,7.462867,5.534719,8.518946,2024.69697,2024-09-12
97,7.066043,0.220428,6.621508,7.510578,5.571424,8.560663,2024.79798,2024-10-19
98,7.105254,0.224667,6.652170,7.558338,5.608070,8.602438,2024.89899,2024-11-25


# Export data

In [67]:
# Columns written to the exported cost dataset: identifying metadata, the
# training inputs, and the two cost columns (raw and inflation-adjusted).
keep_cols = [
    'System',
    'Domain',
    'Task',
    'Model accessibility',
    'Reference',
    'Publication date',
    'Organization',
    'Parameters',
    'Training compute (FLOP)',
    'Training dataset size (datapoints)',
    'Epochs',
    'Training time (hours)',
    'Training hardware',
    'Country (from Organization)',
    'Base model',
    'Finetune compute (FLOP)',
    'Hardware quantity',
    'Hardware utilization',
    'Training cloud compute vendor',
    'Training data center',
    # NOTE(review): disabled; cost_df shows a 'Training chip-hours' column instead -- confirm the intended name.
    # 'Training time (chip hours)',
    'Cost',
    'Cost (inflation-adjusted)',
]
# Preview the selection that will be exported.
cost_df[keep_cols]

Unnamed: 0,System,Domain,Task,Model accessibility,Reference,Publication date,Organization,Parameters,Training compute (FLOP),Training dataset size (datapoints),...,Training hardware,Country (from Organization),Base model,Finetune compute (FLOP),Hardware quantity,Hardware utilization,Training cloud compute vendor,Training data center,Cost,Cost (inflation-adjusted)
88,Gemini 1.0 Ultra,"Multimodal,Language,Vision","Language modelling,Visual question answering,C...",Hosted access (no API),Gemini: A Family of Highly Capable Multimodal ...,2023-12-06,Google DeepMind,,5.000000e+25,,...,Google TPU v4,Multinational,,,55000.0,,,,2.982734e+07,2.982734e+07
104,Inflection-2,Language,Language modelling,Hosted access (no API),Inflection-2: The Next Step Up,2023-11-22,Inflection AI,,1.001000e+25,,...,NVIDIA H100 SXM5,United States of America,,,5000.0,,,,1.293179e+07,1.296196e+07
130,Grok-1,Language,"Language modelling,Chat",Open source,Announcing Grok,2023-11-04,xAI,3.140000e+11,2.900000e+24,,...,,United States of America,,,,,,,,
193,Falcon-180B,Language,Language modelling,Open access (restricted use),The Falcon Series of Open Language Models,2023-09-06,Technology Innovation Institute,1.800000e+11,3.760000e+24,3.500000e+12,...,NVIDIA A100 SXM4 40 GB,United Arab Emirates,,,4096.0,0.1876,Amazon Web Services,,1.027732e+07,1.034091e+07
243,Claude 2,Language,"Language modelling,Chat",API access,,2023-07-11,Anthropic,,3.866000e+24,,...,,United States of America,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1179,Xception,Vision,Image classification,,Xception: Deep Learning with Depthwise Separab...,2016-10-07,Google,2.285595e+07,4.360000e+20,3.500000e+08,...,NVIDIA Tesla K80,United States of America,,,60.0,,,,1.155451e+04,1.261763e+04
1180,GNMT,Language,Translation,Hosted access (no API),Google's Neural Machine Translation System: Br...,2016-09-26,Google,2.780000e+08,6.620000e+21,1.230000e+09,...,NVIDIA Tesla K80,United States of America,,,,,,,1.774592e+05,1.937871e+05
1222,AlphaGo Lee,Games,Go,,Mastering the game of Go with deep neural netw...,2016-01-27,DeepMind,,1.900000e+21,2.940000e+07,...,,United Kingdom of Great Britain and Northern I...,,,,,,,,
1226,ResNet-152 (ImageNet),Vision,Image classification,,Deep Residual Learning for Image Recognition,2015-12-10,Microsoft,6.000000e+07,1.210000e+19,1.280000e+06,...,,United States of America,,,,,,,,


In [68]:
# Export the selected columns. os.path.join (rather than string concatenation)
# keeps the path correct whether or not results_dir ends with a slash.
cost_df[keep_cols].to_csv(os.path.join(results_dir, 'cost_dataset.csv'), index=False)

# Plots

In [69]:
# Labelled scatter of every system's inflation-adjusted cost (log-scale y-axis).
fig = px.scatter(
    cost_df,
    x='Publication date',
    y='Cost (inflation-adjusted)',
    text='System',
    log_y=True,
)
fig.update_traces(textposition='top center')

# Axes: titles, plus a fixed date window on x.
fig.update_xaxes(title_text='Publication date', range=['2015-01-01', '2025-01-01'])
fig.update_yaxes(title_text='Cost (2023 USD, log scale)')

# Layout: no legend, title, fixed figure size, fonts and tight margins.
fig.update_layout(
    showlegend=False,
    title_text=get_cost_plot_title(estimation_method, compute_threshold_method, compute_threshold),
    title_font=dict(size=16),
    font=dict(size=14),
    autosize=False,
    width=800,
    height=600,
    margin=dict(l=10, r=10, t=40, b=10),
)

save_plot(fig, results_dir, 'cost_scatter')

fig.show()

In [70]:
# Systems to annotate on the plot. Names must match the 'System' column exactly:
# the dataset spells it 'Falcon-180B' (hyphenated), so the previous 'Falcon 180B'
# entry never matched and that point was left unlabelled.
label_systems = ['GNMT', 'AlphaGo Master', 'AlphaGo Zero', 'AlphaZero', 'DALL-E', 'GPT-3 175B (davinci)', 'PaLM (540B)', 'Llama 2-70B', 'Falcon-180B', 'GPT-4', 'Gemini 1.0 Ultra', 'Inflection-2']

# Separate TPU-trained systems so they can get their own marker and legend entry.
# Rows with missing hardware fall into the non-TPU frame (na=False).
tpu_mask = cost_df['Training hardware'].str.contains('TPU', na=False)
tpu_cost_df = cost_df.loc[tpu_mask]
gpu_cost_df = cost_df.loc[~tpu_mask]

# Rows that receive a text label, computed once per frame.
gpu_labelled_df = gpu_cost_df.loc[gpu_cost_df['System'].isin(label_systems)]
tpu_labelled_df = tpu_cost_df.loc[tpu_cost_df['System'].isin(label_systems)]

# For the 'cloud' method, TPU points are drawn like all other points.
is_cloud = estimation_method == 'cloud'

fig = px.scatter(
    gpu_cost_df,
    x='Publication date',
    y='Cost (inflation-adjusted)',
    log_y=True,
)
fig.add_scatter(
    x=tpu_cost_df['Publication date'],
    y=tpu_cost_df['Cost (inflation-adjusted)'],
    mode='markers',
    marker_symbol='circle' if is_cloud else 'circle-open',
    name='' if is_cloud else 'Using estimated cost of TPU',
    showlegend=not is_cloud,
)

# Text-only traces carrying the labels for the selected systems.
for labelled_df in (gpu_labelled_df, tpu_labelled_df):
    fig.add_scatter(
        x=labelled_df['Publication date'],
        y=labelled_df['Cost (inflation-adjusted)'],
        text=labelled_df['System'],
        mode='text',
        showlegend=False,
    )

# Marker color
fig.update_traces(
    marker=dict(
        color='rgb(0,100,200)',
    ),
    selector=dict(mode='markers'),
)

# Shade in CI: an invisible lower bound, then the upper bound filled down to it.
# Predictions are in log10 units, hence the 10** back-transform.
fig.add_scatter(
    x=predicted_cost_df['Publication date'],
    y=10**predicted_cost_df['mean_ci_lower'],
    mode='lines',
    line=dict(width=0),
    showlegend=False,
)
fig.add_scatter(
    x=predicted_cost_df['Publication date'],
    y=10**predicted_cost_df['mean_ci_upper'],
    mode='lines',
    fill='tonexty',
    fillcolor='rgba(0,100,200,0.2)',
    line=dict(width=0),
    name='95% CI of mean',
)
fig.add_scatter(
    x=predicted_cost_df['Publication date'],
    y=10**predicted_cost_df['mean'],
    mode='lines',
    line=dict(color='rgb(0,100,200)'),
    name=f'Regression mean (growth rate: {10**reg_results.params[1]:.1f}x per year)',
)

fig.update_traces(textposition='top center')

# y-axis limits; on a log axis plotly expects the range as log10 exponents.
if estimation_method == 'hardware-acquisition':
    fig.update_yaxes(range=[4, 10])
else:
    fig.update_yaxes(range=[1, 9])

# legend on bottom-right of the axes
fig.update_layout(
    legend=dict(
        x=0.45,
        y=0.05,
    )
)

# axis labels
fig.update_xaxes(title_text='Publication date')
fig.update_yaxes(title_text='Cost (2023 USD, log scale)')

# title
fig.update_layout(title_text=get_cost_plot_title(estimation_method, compute_threshold_method, compute_threshold))

# update size
fig.update_layout(
    autosize=False,
    width=800,
    height=600,
    title_font=dict(
        size=16,
    ),
    title_x=0.5,
)

# font size
fig.update_layout(
    font=dict(
        size=14,
    )
)

# x-axis limits: match the prediction window (set once; the earlier hard-coded
# duplicate of this range was always overridden by this call).
fig.update_xaxes(range=[pred_start_date, pred_end_date])

# margins
fig.update_layout(margin=dict(l=10, r=10, t=60, b=10))

save_plot(fig, results_dir, 'cost_regression')

fig.show()

# Cost components

In [71]:
cost_component_names = [
    'AI accelerator chip cost',
    'Other server components cost',
    'Cluster-level interconnect cost',
    'Energy cost',
]

In [72]:
# Express each cost component as a percentage of the total cost per system.
total_cost = component_cost_df['Cost']
for component in cost_component_names:
    pc_col = f"{component} (%)"
    component_cost_df[pc_col] = component_cost_df[component] / total_cost * 100
component_cost_df['AI accelerator chip cost (%)']

88      44.541490
104     46.631554
130           NaN
193     43.637309
243           NaN
          ...    
1179    43.829051
1180    43.894050
1222          NaN
1226          NaN
1228    34.484219
Name: AI accelerator chip cost (%), Length: 64, dtype: float64

In [73]:
# Names of the percentage columns, in the same order as cost_component_names.
cost_component_pc_names = [f'{name} (%)' for name in cost_component_names]
# Keep only systems with a complete breakdown, ordered chronologically.
filtered_component_cost_df = (
    component_cost_df
    .dropna(subset=cost_component_pc_names)
    .sort_values(by='Publication date')
)

In [74]:
# Stacked bars: one bar per system, split into the percentage cost components.
fig = px.bar(
    filtered_component_cost_df,
    x='System',
    y=cost_component_pc_names,
    barmode='stack',
)

# Axes: titles, and a 0-100 % y-axis with a tick every 10 points.
fig.update_xaxes(title_text='ML model')
fig.update_yaxes(
    title_text='% of amortized hardware CapEx + energy',
    range=[0, 100],
    tickvals=list(range(0, 101, 10)),
)

# Layout: legend placement, fixed figure size and tight margins.
fig.update_layout(
    legend=dict(
        title_text='Cost component',
        x=0.60,
        y=0.05,
    ),
    autosize=False,
    width=800,
    height=600,
    margin=dict(l=10, r=10, t=40, b=10),
)

save_plot(fig, results_dir, 'cost_component_percentage')

fig.show()

In [75]:
# Average percentage for each component
filtered_component_cost_df[cost_component_pc_names].mean()

AI accelerator chip cost (%)           44.535050
Other server components cost (%)       29.036318
Cluster-level interconnect cost (%)    17.257481
Energy cost (%)                         9.171151
dtype: float64

In [76]:
# Bar chart of the energy share of total cost, one bar per system.
fig = px.bar(
    filtered_component_cost_df,
    x='System',
    y='Energy cost (%)',
    barmode='stack',
)

# Axes: titles, with the y-axis limited to 0-30 %.
fig.update_xaxes(title_text='System')
fig.update_yaxes(
    title_text='Energy cost (% of amortized hardware CapEx + energy)',
    range=[0, 30],
)

# Layout: fixed figure size and tight margins.
fig.update_layout(
    autosize=False,
    width=800,
    height=600,
    margin=dict(l=10, r=10, t=40, b=10),
)

save_plot(fig, results_dir, 'energy_percentage')

fig.show()

In [77]:
# Labelled scatter of each system's energy cost over time (log-scale y-axis).
fig = px.scatter(
    filtered_component_cost_df,
    x='Publication date',
    y='Energy cost',
    text='System',
)
# axis labels -- x plots the publication date (it was previously mislabelled 'System')
fig.update_xaxes(title_text='Publication date')
fig.update_yaxes(title_text='Energy cost')
# log y
fig.update_yaxes(type='log')
# size
fig.update_layout(
    autosize=False,
    width=800,
    height=600,
)

# margins
fig.update_layout(margin=dict(l=10, r=10, t=40, b=10))

save_plot(fig, results_dir, 'energy_cost')

fig.show()

In [78]:
from energy import energy_price

# Convert each system's energy cost to energy (kWh) by dividing by the value of
# energy_price for its publication year (presumably a price per kWh -- confirm
# in energy.py), then plot it over time.
energy_kwh = [
    energy_cost / energy_price(publication_date.year)
    for energy_cost, publication_date in zip(
        filtered_component_cost_df['Energy cost'],
        filtered_component_cost_df['Publication date'],
    )
]
filtered_component_cost_df.loc[:, 'Energy (kWh)'] = energy_kwh

fig = px.scatter(
    filtered_component_cost_df,
    x='Publication date',
    y='Energy (kWh)',
    text='System',
)

# Log-scale y-axis, fixed figure size, tight margins.
fig.update_yaxes(type='log')
fig.update_layout(
    autosize=False,
    width=800,
    height=600,
    margin=dict(l=10, r=10, t=40, b=10),
)

save_plot(fig, results_dir, 'energy_kwh')

fig.show()

In [79]:
filtered_component_cost_df.columns

Index(['System', 'Domain', 'Task', 'Authors', 'Notability criteria',
       'Notability criteria notes', 'Model accessibility', 'Link', 'Citations',
       'Reference', 'Publication date', 'Organization', 'Parameters',
       'Parameters notes', 'Training compute (FLOP)', 'Training compute notes',
       'Training dataset', 'Training dataset notes',
       'Training dataset size (datapoints)', 'Dataset size notes', 'Epochs',
       'Inference compute (FLOP)', 'Inference compute notes',
       'Training time (hours)', 'Training time notes', 'Training hardware',
       'Approach', 'Training compute cost (2020 USD)', 'Compute cost notes',
       'Compute sponsor categorization', 'Confidence', 'Abstract',
       'Last modified', 'Created By', 'Benchmark data', 'Exclude',
       'Country (from Organization)', 'Base model', 'Finetune compute (FLOP)',
       'Finetune compute notes', 'Hardware quantity', 'Hardware utilization',
       'Training cost trends', 'Training cloud compute vendor',
 

In [80]:
# Keep only systems with known training hardware. The explicit .copy() makes the
# column assignments below (and in later cells) write to an independent frame,
# which avoids pandas' SettingWithCopyWarning on the result of dropna.
filtered_component_cost_df = filtered_component_cost_df.dropna(subset=['Training hardware']).copy()
power_col = 'Power capacity for final training run (kW)'
# One cluster_power_capacity call per system, in row order.
filtered_component_cost_df.loc[:, power_col] = [
    cluster_power_capacity(row['Training hardware'], row['Hardware quantity'], hardware_df, row['Organization'])
    for _, row in filtered_component_cost_df.iterrows()
]

fig = px.scatter(
    filtered_component_cost_df,
    x='Publication date',
    y=power_col,
    text='System',
)
# log y
fig.update_yaxes(type='log')
# size
fig.update_layout(
    autosize=False,
    width=800,
    height=600,
)

# margins
fig.update_layout(margin=dict(l=10, r=10, t=40, b=10))

save_plot(fig, results_dir, 'power_capacity_kw')

fig.show()

In [81]:
filtered_component_cost_df['Publication date (float)'] = datetime_to_float_year(
    pd.to_datetime(filtered_component_cost_df['Publication date'])
)

In [82]:
power_reg_results = fit_ols_regression(
    filtered_component_cost_df,
    ['Publication date (float)'],
    power_col,
    logy=True
)
power_reg_results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.603
Model:,OLS,Adj. R-squared:,0.592
Method:,Least Squares,F-statistic:,53.22
Date:,"Mon, 27 May 2024",Prob (F-statistic):,1.59e-08
Time:,00:34:57,Log-Likelihood:,-28.478
No. Observations:,37,AIC:,60.96
Df Residuals:,35,BIC:,64.18
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-611.8235,84.231,-7.264,0.000,-782.822,-440.825
x1,0.3041,0.042,7.295,0.000,0.219,0.389

0,1,2,3
Omnibus:,0.922,Durbin-Watson:,1.45
Prob(Omnibus):,0.631,Jarque-Bera (JB):,0.215
Skew:,-0.022,Prob(JB):,0.898
Kurtosis:,3.371,Cond. No.,1930000.0


In [83]:
# Write the power growth-rate summary to the results directory, then print it
# again so it is also visible in the notebook.
# os.path.join avoids the doubled slash (results_dir already ends with '/').
with open(os.path.join(results_dir, 'power_regression_results.out'), 'w') as f, redirect_stdout(f):
    print_growth_rates(power_reg_results)
print_growth_rates(power_reg_results)

N=37.0
R^2=0.60
0.30 OOMs/year (95% CI: 0.22, 0.39)
2.0x/year (95% CI: 1.7x, 2.4x)
doubling time of 12 months (95% CI: 9, 16)


In [84]:
# Prediction window for the power trend, as years and as ISO date strings
# (the strings are reused as x-axis limits when plotting).
pred_start_year, pred_end_year = 2015, 2025
pred_start_date = f'{pred_start_year}-01-01'
pred_end_date = f'{pred_end_year}-01-01'

# 100 evenly spaced fractional years spanning the window (both ends included).
pred_years = pd.DataFrame(
    {'Publication date (float)': np.linspace(pred_start_year, pred_end_year, 100)}
)
pred_years

Unnamed: 0,Publication date (float)
0,2015.00000
1,2015.10101
2,2015.20202
3,2015.30303
4,2015.40404
...,...
95,2024.59596
96,2024.69697
97,2024.79798
98,2024.89899


In [85]:
# Evaluate the fitted power regression on the prediction grid.
# The regression was fitted with logy=True and the power plot applies 10** to
# these columns, so the values here are in log10 units.
predicted_power_df = get_predictions(power_reg_results, pred_years, ['Publication date (float)'])
# Add a datetime version of the fractional year for use as the plot's x values.
predicted_power_df['Publication date'] = predicted_power_df['Publication date (float)'].apply(float_year_to_datetime)
predicted_power_df

Unnamed: 0,mean,mean_se,mean_ci_lower,mean_ci_upper,obs_ci_lower,obs_ci_upper,Publication date (float),Publication date
0,0.901975,0.258706,0.376774,1.427175,-0.308376,2.112325,2015.00000,2015-01-01
1,0.932690,0.254752,0.415515,1.449865,-0.274200,2.139580,2015.10101,2015-02-06
2,0.963406,0.250807,0.454239,1.472572,-0.240074,2.166885,2015.20202,2015-03-15
3,0.994121,0.246871,0.492946,1.495296,-0.206000,2.194242,2015.30303,2015-04-21
4,1.024836,0.242944,0.531633,1.518039,-0.171977,2.221650,2015.40404,2015-05-28
...,...,...,...,...,...,...,...,...
95,3.819934,0.179960,3.454596,4.185272,2.669897,4.969971,2024.59596,2024-08-06
96,3.850650,0.183640,3.477841,4.223459,2.698218,5.003082,2024.69697,2024-09-12
97,3.881365,0.187342,3.501040,4.261690,2.726480,5.036251,2024.79798,2024-10-19
98,3.912080,0.191066,3.524196,4.299965,2.754684,5.069477,2024.89899,2024-11-25


## Power plot

In [86]:
# Systems to annotate on the plot. Names must match the 'System' column exactly:
# the dataset spells it 'Falcon-180B' (hyphenated), so the previous 'Falcon 180B'
# entry never matched and that point was left unlabelled.
label_systems = ['GNMT', 'AlphaGo Master', 'AlphaGo Zero', 'AlphaZero', 'DALL-E', 'GPT-3 175B (davinci)', 'PaLM (540B)', 'Llama 2-70B', 'Falcon-180B', 'GPT-4', 'Gemini 1.0 Ultra', 'Inflection-2']

# Rows that receive a text label, computed once.
labelled_power_df = filtered_component_cost_df.loc[filtered_component_cost_df['System'].isin(label_systems)]

fig = px.scatter(
    filtered_component_cost_df,
    x='Publication date',
    y=power_col,
    log_y=True,
)

# Marker color
fig.update_traces(
    marker=dict(
        color='rgb(0,100,200)',
    ),
    selector=dict(mode='markers'),
)

# Text-only trace carrying the labels for the selected systems.
fig.add_scatter(
    x=labelled_power_df['Publication date'],
    y=labelled_power_df[power_col],
    text=labelled_power_df['System'],
    mode='text',
    showlegend=False,
)

# Shade in CI: an invisible lower bound, then the upper bound filled down to it.
# Predictions are in log10 units, hence the 10** back-transform.
fig.add_scatter(
    x=predicted_power_df['Publication date'],
    y=10**predicted_power_df['mean_ci_lower'],
    mode='lines',
    line=dict(width=0),
    showlegend=False,
)
fig.add_scatter(
    x=predicted_power_df['Publication date'],
    y=10**predicted_power_df['mean_ci_upper'],
    mode='lines',
    fill='tonexty',
    fillcolor='rgba(0,100,200,0.2)',
    line=dict(width=0),
    name='95% CI of mean',
)
fig.add_scatter(
    x=predicted_power_df['Publication date'],
    y=10**predicted_power_df['mean'],
    mode='lines',
    line=dict(color='rgb(0,100,200)'),
    name=f'Regression mean (growth rate: {10**power_reg_results.params[1]:.1f}x per year)',
)

fig.update_traces(textposition='top center')

# fig.update_yaxes(range=[1, 6])

# legend on bottom-right of the axes
fig.update_layout(
    legend=dict(
        x=0.45,
        y=0.05,
    )
)

# axis labels
fig.update_xaxes(title_text='Publication date')
fig.update_yaxes(title_text='Power (kW, log scale)')

# title
fig.update_layout(title_text='Cluster power required for final training run')

# update size
fig.update_layout(
    autosize=False,
    width=800,
    height=600,
    title_font=dict(
        size=16,
    ),
    title_x=0.5,
)

# font size
fig.update_layout(
    font=dict(
        size=14,
    )
)

# x-axis limits: match the prediction window (set once; the earlier hard-coded
# duplicate of this range was always overridden by this call).
fig.update_xaxes(range=[pred_start_date, pred_end_date])

# margins
fig.update_layout(margin=dict(l=10, r=10, t=60, b=10))

save_plot(fig, results_dir, 'power_regression')

fig.show()