# Setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from contextlib import redirect_stdout
import numpy as np
import os
import pandas as pd
import plotly.express as px

from cost import *
from plotting import *
from prices import *
from inflation import *
from regression import *
from utils import *

In [3]:
estimation_method = 'cloud'  # hardware-capex-energy, hardware-acquisition, cloud
compute_threshold_method = 'top_n'  # top_n, window_percentile
compute_threshold = 10  # e.g. 10 to select top 10; 75 to select top 25%
variant = '2025-03-14_all_vendors'  # whatever else distinguishes this run, e.g. 'excluding-AlphaGo'
exclude_models_containing = []  # ['GNMT', 'AlphaZero', 'AlphaGo Master', 'AlphaGo Zero']

estimation_method_lookup = {
    'hardware-capex-energy': estimate_hardware_capex_energy,
    'hardware-acquisition': estimate_hardware_acquisition_cost,
    'cloud': estimate_cloud_costs,
}
cost_estimation_function = estimation_method_lookup[estimation_method]

results_dir = f'results/{estimation_method}-{compute_threshold_method}={compute_threshold}-{variant}/'
os.makedirs(results_dir, exist_ok=True)

# Load data

In [4]:
frontier_pcd_df, hardware_df, price_df = load_data_for_cost_estimation(
    compute_threshold_method=compute_threshold_method, compute_threshold=compute_threshold,
)

In [5]:
len(frontier_pcd_df), len(hardware_df), len(price_df)

(89, 5703, 576)

# Cost estimation

In [6]:
with open(f'{results_dir}/cost_estimation.out', 'w') as f:
    with redirect_stdout(f):
        cost_df = cost_estimation_function(frontier_pcd_df, hardware_df, price_df)

In [7]:
if estimation_method == 'hardware-capex-energy':
    frontier_pcd_df_copy = frontier_pcd_df.copy()
    with open(f'{results_dir}/component_cost_estimation.out', 'w') as f:
        with redirect_stdout(f):
            component_cost_df = cost_estimation_function(frontier_pcd_df_copy, hardware_df, price_df, separate_components=True)

In [8]:
cost_df

Unnamed: 0,Model,Domain,Task,Authors,Notability criteria,Notability criteria notes,Model accessibility,Link,Citations,Reference,...,Post-training compute (FLOP),Post-training compute notes,Hardware maker,benchmarks/models,Maybe over 1e25 FLOP,Updated dataset size,WT103 ppl,WT2 ppl,PTB ppl,Cost
109,Doubao-pro,Language,"Language modeling/generation,Question answerin...",,Training cost,,API access,https://www.volcengine.com/docs/6360/1264663,,Doubao General Model Pro (Doubao-pro),...,,,,doubao-pro-32k,,,,,,
312,GLM-4-Plus,Language,Language modeling,Zhipu AI,Training cost,,API access,https://bigmodel.cn/dev/howuse/glm-4,,GLM-4-Plus,...,,,,,,,,,,
335,Grok-2,"Language,Vision,Multimodal","Chat,Language modeling/generation,Question ans...",,Training cost,,Hosted access (no API),https://x.ai/blog/grok-2,,Grok-2 Beta Release,...,,,NVIDIA,"grok-2-1212,grok-2-vision-1212,grok-2-0813",,,,,,5.739075e+07
367,Mistral Large 2,Language,"Language modeling/generation,Translation,Code ...","Albert Jiang, Alexandre Sablayrolles, Alexis T...",Training cost,likely high training cost since previous Mistr...,Open weights (non-commercial),https://mistral.ai/news/mistral-large-2407/,,"Top-tier reasoning for high-complexity tasks, ...",...,,,,"Mistral-Large-Instruct-2407,Mistral-Large-Inst...",,,,,,
370,Llama 3.1-405B,Language,Language modeling/generation,"Abhimanyu Dubey, Abhinav Jauhri, Abhinav Pande...","SOTA improvement,Training cost","High training compute, exceeds 4o and Claude 3...",Open weights (restricted use),https://ai.meta.com/research/publications/the-...,,The Llama 3 Herd of Models,...,9.400000e+22,Section 4 gives detail about the post-training...,NVIDIA,Llama-3.1-405B-Instruct,,,,,,9.089483e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2054,AlphaGo Lee,Games,Go,"David Silver, Aja Huang, Chris J. Maddison, Ar...",Highly cited,,Unreleased,https://www.nature.com/articles/nature16961,16057.0,Mastering the game of Go with deep neural netw...,...,,,,,,0 checked out of 1,,,,
2058,ResNet-152 (ImageNet),Vision,Image classification,"Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun",Highly cited,,,https://arxiv.org/abs/1512.03385,175697.0,Deep Residual Learning for Image Recognition,...,,,,,,0 checked out of 1,,,,
2060,ResNet-101 (ImageNet),Vision,Image classification,"Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun",Highly cited,,Open weights (unrestricted),https://arxiv.org/abs/1512.03385,175697.0,Deep Residual Learning for Image Recognition,...,,,,,,,,,,
2061,DeepSpeech2 (English),Speech,Speech recognition,"Dario Amodei, Rishita Anubhai, Eric Battenberg...",Highly cited,,,https://arxiv.org/abs/1512.02595,2853.0,Deep Speech 2: End-to-End Speech Recognition i...,...,,,NVIDIA,,,0 checked out of 1,,,,


In [9]:
cost_df['Cost'].notna().sum()

50

In [10]:
cost_df.dropna(subset=['Cost'])['Training time (hours)'].notna().sum()

32

In [11]:
cost_df.dropna(subset=['Cost'])['Hardware utilization'].notna().sum()

19

Exclusion

In [12]:
cost_df[['Model', 'Publication date']].tail(15)

Unnamed: 0,Model,Publication date
1920,Libratus,2017-08-19
1929,OpenAI TI7 DOTA 1v1,2017-08-11
1935,ConvS2S (ensemble of 8 models),2017-07-25
1941,JFT,2017-07-10
1965,MoE-Multi,2017-01-23
1985,PolyNet,2016-11-17
1991,NASv3 (CIFAR-10),2016-11-05
2000,Xception,2016-10-07
2001,GNMT,2016-09-26
2013,ResNet-200,2016-09-17


In [13]:
for kw in exclude_models_containing:
    cost_df = cost_df[cost_df['Model'].str.contains(kw) == False]
cost_df[['Model', 'Publication date']].tail(15)

Unnamed: 0,Model,Publication date
1920,Libratus,2017-08-19
1929,OpenAI TI7 DOTA 1v1,2017-08-11
1935,ConvS2S (ensemble of 8 models),2017-07-25
1941,JFT,2017-07-10
1965,MoE-Multi,2017-01-23
1985,PolyNet,2016-11-17
1991,NASv3 (CIFAR-10),2016-11-05
2000,Xception,2016-10-07
2001,GNMT,2016-09-26
2013,ResNet-200,2016-09-17


Use the below to check data availability for specific systems

In [14]:
# system = 'WizardLM-7B'
# row = cost_df.loc[cost_df['Model'] == system]
# print('Cost:', row['Cost'].values[0])
# print('Training hardware:', row['Training hardware'].values[0])
# print('Training time (hours):', row['Training time (hours)'].values[0])
# print('Hardware quantity:', row['Hardware quantity'].values[0])
# print('Hardware utilization:', row['Hardware utilization'].values[0])

# Apply inflation adjustment

In [15]:
cost_df['Cost'].dropna()

335     5.739075e+07
370     9.089483e+07
417     3.500851e+07
592     1.939071e+07
618     2.171542e+07
624     8.980070e+06
633     6.171048e+07
732     1.914000e+08
757     1.940816e+07
843     2.298470e+07
865     2.565734e+07
1010    2.866801e+07
1075    7.752000e+07
1152    8.324027e+06
1249    1.547798e+06
1288    1.462579e+06
1299    1.218724e+07
1322    1.276416e+06
1355    2.028237e+06
1359    3.391488e+06
1403    6.209280e+06
1411    9.550237e+05
1427    1.273742e+06
1456    4.390244e+05
1465    3.994580e+05
1505    2.433024e+05
1522    5.971968e+05
1525    3.425778e+05
1590    9.289728e+05
1599    2.268444e+05
1606    4.120320e+06
1647    6.635520e+05
1654    1.003880e+07
1655    9.526085e+05
1669    1.327104e+05
1673    1.488569e+05
1675    3.568435e+05
1678    4.769648e+04
1679    2.171250e+05
1693    2.355302e+05
1694    2.025830e+05
1720    1.486848e+05
1741    3.280458e+04
1746    2.433024e+04
1780    1.017525e+04
1812    1.081344e+04
1854    2.049869e+05
1941    3.096

In [16]:
cost_df = adjust_column_for_inflation(cost_df, 'Cost', 'data/PCU518210518210.csv', '2024-12-01')

In [17]:
cost_df['Cost (inflation-adjusted)'].dropna()

335     5.719284e+07
370     9.050505e+07
417     3.492219e+07
592     1.940713e+07
618     2.168453e+07
624     8.967297e+06
633     6.162270e+07
732     1.919146e+08
757     1.950573e+07
843     2.318910e+07
865     2.588551e+07
1010    2.904054e+07
1075    7.856202e+07
1152    8.514143e+06
1249    1.581238e+06
1288    1.489702e+06
1299    1.242226e+07
1322    1.323122e+06
1355    2.098625e+06
1359    3.509187e+06
1403    6.422820e+06
1411    9.896170e+05
1427    1.318117e+06
1456    4.552313e+05
1465    4.142043e+05
1505    2.520651e+05
1522    6.214023e+05
1525    3.564631e+05
1590    9.717088e+05
1599    2.372801e+05
1606    4.336473e+06
1647    6.989785e+05
1654    1.069751e+07
1655    1.015115e+06
1669    1.417982e+05
1673    1.590503e+05
1675    3.812795e+05
1678    5.096264e+04
1679    2.319933e+05
1693    2.518844e+05
1694    2.166495e+05
1720    1.604465e+05
1741    3.546371e+04
1746    2.635018e+04
1780    1.101002e+04
1812    1.167940e+04
1854    2.216031e+05
1941    3.353

In [18]:
# Equal number of non-null values
assert cost_df['Cost (inflation-adjusted)'].notna().sum() == cost_df['Cost'].notna().sum()

# Regression

In [19]:
cost_df['Publication date (float)'] = datetime_to_float_year(pd.to_datetime(cost_df['Publication date']))

In [20]:
reg_results = fit_ols_regression(cost_df, ['Publication date (float)'], 'Cost (inflation-adjusted)', logy=True)
reg_results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.756
Model:,OLS,Adj. R-squared:,0.751
Method:,Least Squares,F-statistic:,149.0
Date:,"Fri, 14 Mar 2025",Prob (F-statistic):,2.52e-16
Time:,14:23:12,Log-Likelihood:,-40.799
No. Observations:,50,AIC:,85.6
Df Residuals:,48,BIC:,89.42
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-934.5754,77.074,-12.126,0.000,-1089.544,-779.607
x1,0.4654,0.038,12.205,0.000,0.389,0.542

0,1,2,3
Omnibus:,5.689,Durbin-Watson:,1.23
Prob(Omnibus):,0.058,Jarque-Bera (JB):,4.556
Skew:,0.67,Prob(JB):,0.102
Kurtosis:,3.626,Cond. No.,1970000.0


In [21]:
with open(f'{results_dir}/regression_results.out', 'w') as f:
    with redirect_stdout(f):
        print_growth_rates(reg_results, round_digits=None)
print_growth_rates(reg_results, ci=90, round_digits=5)

N=50.0
R^2=0.76
0.4654 OOMs/year (90% CI: 0.40144, 0.52935)
2.92009x/year (90% CI: 2.52024x, 3.38338x)
doubling time of 7.76191 months (90% CI: 6.82413, 8.99848)


In [22]:
pred_start_year = 2015
pred_end_year = 2025
pred_start_date = f'{pred_start_year}-01-01'
pred_end_date = f'{pred_end_year}-01-01'

pred_years = pd.DataFrame({'Publication date (float)': np.linspace(pred_start_year, pred_end_year, 100)})
pred_years

Unnamed: 0,Publication date (float)
0,2015.00000
1,2015.10101
2,2015.20202
3,2015.30303
4,2015.40404
...,...
95,2024.59596
96,2024.69697
97,2024.79798
98,2024.89899


In [23]:

predicted_cost_df = get_predictions(reg_results, pred_years, ['Publication date (float)'])
predicted_cost_df['Publication date'] = predicted_cost_df['Publication date (float)'].apply(float_year_to_datetime)
predicted_cost_df

Unnamed: 0,mean,mean_se,mean_ci_lower,mean_ci_upper,obs_ci_lower,obs_ci_upper,Publication date (float),Publication date
0,3.197524,0.252514,2.774001,3.621047,2.169538,4.225511,2015.00000,2015-01-01
1,3.244534,0.248859,2.827142,3.661926,2.219059,4.270010,2015.10101,2015-02-06
2,3.291544,0.245209,2.880273,3.702815,2.268544,4.314543,2015.20202,2015-03-15
3,3.338553,0.241566,2.933393,3.743714,2.317995,4.359112,2015.30303,2015-04-21
4,3.385563,0.237929,2.986502,3.784624,2.367411,4.403715,2015.40404,2015-05-28
...,...,...,...,...,...,...,...,...
95,7.663445,0.148760,7.413942,7.912949,6.694097,8.632794,2024.59596,2024-08-06
96,7.710455,0.152037,7.455454,7.965456,6.739677,8.681233,2024.69697,2024-09-12
97,7.757465,0.155342,7.496922,8.018007,6.785217,8.729713,2024.79798,2024-10-19
98,7.804475,0.158670,7.538349,8.070600,6.830715,8.778234,2024.89899,2024-11-25


In [24]:
predicted_cost_df.to_csv(results_dir + 'predicted_cost_dataset.csv', index=False)

# Export data

In [25]:
keep_cols = [
    'Model',
    'Domain',
    'Task',
    'Model accessibility',
    'Reference',
    'Publication date',
    'Organization',
    'Parameters',
    'Training compute (FLOP)',
    'Training dataset size (datapoints)',
    'Epochs',
    'Training time (hours)',
    'Training hardware',
    'Country (from Organization)',
    'Base model',
    'Finetune compute (FLOP)',
    'Hardware quantity',
    'Hardware utilization',
    'Training cloud compute vendor',
    'Training data center',
    # 'Training time (chip hours)',
    'Cost',
    'Cost (inflation-adjusted)',
]
cost_df[keep_cols]

Unnamed: 0,Model,Domain,Task,Model accessibility,Reference,Publication date,Organization,Parameters,Training compute (FLOP),Training dataset size (datapoints),...,Training hardware,Country (from Organization),Base model,Finetune compute (FLOP),Hardware quantity,Hardware utilization,Training cloud compute vendor,Training data center,Cost,Cost (inflation-adjusted)
109,Doubao-pro,Language,"Language modeling/generation,Question answerin...",API access,Doubao General Model Pro (Doubao-pro),2024-10-28,ByteDance,5.000000e+11,2.505000e+25,8.350000e+12,...,,China,,,,,,"There is no paper to reference, also no inform...",,
312,GLM-4-Plus,Language,Language modeling,API access,GLM-4-Plus,2024-08-29,Zhipu AI,,3.600000e+25,,...,,China,,,,,,Check references for hardware details.,,
335,Grok-2,"Language,Vision,Multimodal","Chat,Language modeling/generation,Question ans...",Hosted access (no API),Grok-2 Beta Release,2024-08-13,xAI,,2.960000e+25,,...,NVIDIA H100 SXM5 80GB,United States of America,,,,,,,5.739075e+07,5.719284e+07
367,Mistral Large 2,Language,"Language modeling/generation,Translation,Code ...",Open weights (non-commercial),"Top-tier reasoning for high-complexity tasks, ...",2024-07-24,Mistral AI,1.230000e+11,2.130000e+25,,...,,France,,,,,,,,
370,Llama 3.1-405B,Language,Language modeling/generation,Open weights (restricted use),The Llama 3 Herd of Models,2024-07-23,Meta AI,4.050000e+11,3.800000e+25,1.560000e+13,...,NVIDIA H100 SXM5 80GB,United States of America,,,16384.0,0.4042,,,9.089483e+07,9.050505e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2054,AlphaGo Lee,Games,Go,Unreleased,Mastering the game of Go with deep neural netw...,2016-01-27,DeepMind,,1.900000e+21,2.940000e+07,...,,United Kingdom of Great Britain and Northern I...,,,,,,,,
2058,ResNet-152 (ImageNet),Vision,Image classification,,Deep Residual Learning for Image Recognition,2015-12-10,Microsoft,6.020000e+07,1.041408e+19,1.280000e+06,...,,United States of America,,,,,,,,
2060,ResNet-101 (ImageNet),Vision,Image classification,Open weights (unrestricted),Deep Residual Learning for Image Recognition,2015-12-10,Microsoft,4.450000e+07,7.004000e+18,1.280000e+06,...,,United States of America,,,,,,,,
2061,DeepSpeech2 (English),Speech,Speech recognition,,Deep Speech 2: End-to-End Speech Recognition i...,2015-12-08,Baidu Research - Silicon Valley AI Lab,3.800000e+07,2.600000e+19,1.633392e+08,...,NVIDIA GeForce GTX TITAN X,United States of America,,,16.0,0.4484,,,,


In [26]:
cost_df[keep_cols].to_csv(results_dir + 'cost_dataset.csv', index=False)

# Plots

In [27]:
fig = px.scatter(
    cost_df,
    x='Publication date',
    y='Cost (inflation-adjusted)',
    text='Model',
    log_y=True,
)
fig.update_traces(textposition='top center')

# no legend
fig.update_layout(showlegend=False)

# axis labels
fig.update_xaxes(title_text='Publication date')
fig.update_yaxes(title_text='Cost (2024 USD, log scale)')

# title
fig.update_layout(title_text=get_cost_plot_title(estimation_method, compute_threshold_method, compute_threshold))

# update size
fig.update_layout(
    autosize=False,
    width=800,
    height=600,
    title_font=dict(
        size=16,
    )
)

# font size
fig.update_layout(
    font=dict(
        size=14,
    )
)

# margins
fig.update_layout(margin=dict(l=10, r=10, t=40, b=10))

save_plot(fig, results_dir, 'cost_scatter')

fig.show()

In [28]:
label_systems = ['GNMT', 'AlphaGo Zero', 'DALL-E', 'GPT-3 175B (davinci)', 'GPT-4', 'Llama 3.1-405B', 'Grok-2']

tpu_mask = cost_df['Training hardware'].str.contains('TPU', na=False)
tpu_cost_df = cost_df.loc[tpu_mask]
gpu_cost_df = cost_df.loc[~tpu_mask]

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=gpu_cost_df['Publication date'],
    y=gpu_cost_df['Cost (inflation-adjusted)'],
    text=gpu_cost_df['Model'],
    mode='markers',
    showlegend=False,
))
fig.update_yaxes(type='log')
fig.add_trace(go.Scatter(
    x=tpu_cost_df['Publication date'],
    y=tpu_cost_df['Cost (inflation-adjusted)'],
    text=tpu_cost_df['Model'],
    mode='markers',
    marker_symbol='circle-open' if estimation_method != 'cloud' else 'circle',
    name='Using estimated cost of TPU' if estimation_method != 'cloud' else '',
    showlegend=estimation_method != 'cloud',
))
fig.add_trace(go.Scatter(
    x=gpu_cost_df.loc[gpu_cost_df['Model'].isin(label_systems)]['Publication date'],
    y=gpu_cost_df.loc[gpu_cost_df['Model'].isin(label_systems)]['Cost (inflation-adjusted)'],
    text=gpu_cost_df.loc[gpu_cost_df['Model'].isin(label_systems)]['Model'],
    mode='text',
    showlegend=False,
))
fig.add_trace(go.Scatter(
    x=tpu_cost_df.loc[tpu_cost_df['Model'].isin(label_systems)]['Publication date'],
    y=tpu_cost_df.loc[tpu_cost_df['Model'].isin(label_systems)]['Cost (inflation-adjusted)'],
    text=tpu_cost_df.loc[tpu_cost_df['Model'].isin(label_systems)]['Model'],
    mode='text',
    showlegend=False,
))

# Marker color
fig.update_traces(
    marker=dict(
        color='rgb(0,100,200)',
    ),
    selector=dict(mode='markers'),
)

# Shade in CI
fig.add_scatter(
    x=predicted_cost_df['Publication date'],
    y=10**predicted_cost_df['mean_ci_lower'],
    mode='lines',
    line=dict(width=0),
    showlegend=False,
)
fig.add_scatter(
    x=predicted_cost_df['Publication date'],
    y=10**predicted_cost_df['mean_ci_upper'],
    mode='lines',
    fill='tonexty',
    fillcolor='rgba(0,100,200,0.2)',
    line=dict(width=0),
    name='90% CI of mean',
)
fig.add_scatter(
    x=predicted_cost_df['Publication date'],
    y=10**predicted_cost_df['mean'],
    mode='lines',
    line=dict(color='rgb(0,100,200)'),
    name=f'Regression mean (growth rate: {10**reg_results.params[1]:.1f}x per year)',
)

fig.update_traces(textposition='top center')

# axis limits
# fig.update_xaxes(range=[pred_start_date, pred_end_date])
fig.update_xaxes(range=['2015-01-01', '2025-06-01'])  # manual
if estimation_method == 'hardware-acquisition':
    fig.update_yaxes(range=[4, 10])
else:
    fig.update_yaxes(range=[1, 9])

# legend on bottom-right of the axes
fig.update_layout(
    legend=dict(
        x=0.45,
        y=0.05,
    )
)

# axis labels
fig.update_xaxes(title_text='Publication date')
fig.update_yaxes(title_text='Cost (2024 USD, log scale)')

# title
fig.update_layout(title_text=get_cost_plot_title(estimation_method, compute_threshold_method, compute_threshold))

# update size
fig.update_layout(
    autosize=False,
    width=800,
    height=600,
    title_font=dict(
        size=16,
    ),
    title_x=0.5,
)

# font size
fig.update_layout(
    font=dict(
        size=14,
    )
)

# margins
fig.update_layout(margin=dict(l=10, r=10, t=60, b=10))

save_plot(fig, results_dir, 'cost_regression')

fig.show()

# Cost components

In [29]:
cost_component_names = [
    'AI accelerator chip cost',
    'Other server components cost',
    'Cluster-level interconnect cost',
    'Energy cost',
]

In [30]:
for key in cost_component_names:
    component_cost_df[f"{key} (%)"] = component_cost_df[key] / component_cost_df['Cost'] * 100
component_cost_df['AI accelerator chip cost (%)']

NameError: name 'component_cost_df' is not defined

In [31]:
cost_component_pc_names = [name + ' (%)' for name in cost_component_names]
filtered_component_cost_df = component_cost_df.dropna(subset=cost_component_pc_names).sort_values(by='Publication date')

In [32]:
# Stacked bar chart of cost components, using component_cost_df
fig = px.bar(
    filtered_component_cost_df,
    x='Model',
    y=cost_component_pc_names,
    barmode='stack',
)

# axis labels
fig.update_xaxes(title_text='ML model')
fig.update_yaxes(title_text='% of amortized hardware CapEx + energy')
fig.update_layout(
    legend=dict(
        title_text='Cost component',
        x=0.60,
        y=0.05,
    )
)
# limits 0 to 100
fig.update_yaxes(range=[0, 100])

fig.update_yaxes(tickvals=list(range(0, 101, 10)))

# size
fig.update_layout(
    autosize=False,
    width=800,
    height=600,
)

# margins
fig.update_layout(margin=dict(l=10, r=10, t=40, b=10))

save_plot(fig, results_dir, 'cost_component_percentage')

fig.show()

In [33]:
filtered_component_cost_df.head()

Unnamed: 0,Model,Domain,Task,Authors,Notability criteria,Notability criteria notes,Model accessibility,Link,Citations,Reference,...,PTB ppl,Cost,AI accelerator chip cost,Other server components cost,Cluster-level interconnect cost,Energy cost,AI accelerator chip cost (%),Other server components cost (%),Cluster-level interconnect cost (%),Energy cost (%)
2061,DeepSpeech2 (English),Speech,Speech recognition,"Dario Amodei, Rishita Anubhai, Eric Battenberg...",Highly cited,,,https://arxiv.org/abs/1512.02595,2853.0,Deep Speech 2: End-to-End Speech Recognition i...,...,,185.456639,63.953273,40.930095,24.602271,55.971,34.484219,22.0699,13.265781,30.180101
2001,GNMT,Language,Translation,"Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc ...",Highly cited,,Hosted access (no API),https://arxiv.org/abs/1609.08144,6483.0,Google's Neural Machine Translation System: Br...,...,,177459.232941,77894.044829,49852.188691,29965.165887,19747.833534,43.89405,28.092192,16.885662,11.128096
2000,Xception,Vision,Image classification,François Chollet,Highly cited,,,https://arxiv.org/abs/1610.02357,13038.0,Xception: Deep Learning with Depthwise Separab...,...,,11554.506253,5064.230483,3241.107509,1948.165702,1301.00256,43.829051,28.050593,16.860657,11.259698
1985,PolyNet,Vision,Image classification,"X Zhang, Z Li, C Change Loy",SOTA improvement,"""The Very Deep PolyNet, designed following thi...",,https://arxiv.org/abs/1611.05725,282.0,PolyNet: A Pursuit of Structural Diversity in ...,...,,563.599706,178.564122,114.281038,68.692074,202.062472,31.682792,20.276987,12.188096,35.852125
1965,MoE-Multi,Language,"Language modeling,Translation","N Shazeer, A Mirhoseini, K Maziarz, A Davis","Highly cited,SOTA improvement","""On large language modeling and machine transl...",Unreleased,https://arxiv.org/abs/1701.06538,2037.0,Outrageously Large Neural Networks: The Sparse...,...,,3538.189418,1519.646471,972.573741,584.594865,461.374341,42.949834,27.487894,16.52243,13.039843


In [34]:
filtered_component_cost_df.to_csv(results_dir + 'cost_components.csv', index=False)

In [35]:
# Average percentage for each component
filtered_component_cost_df[cost_component_pc_names].mean()

AI accelerator chip cost (%)           45.038389
Other server components cost (%)       29.130089
Cluster-level interconnect cost (%)    17.397544
Energy cost (%)                         8.433978
dtype: float64

In [36]:
fig = px.bar(
    filtered_component_cost_df,
    x='Model',
    y='Energy cost (%)',
    barmode='stack',
    # labels='Cost %',
    # text='Energy cost %',
)
# axis labels
fig.update_xaxes(title_text='Model')
fig.update_yaxes(title_text='Energy cost (% of amortized hardware CapEx + energy)')
# fig.update_layout(
#     legend=dict(
#         title_text='Cost component',
#         x=0.75,
#         y=0.05,
#     )
# )
# limits 0 to 100
fig.update_yaxes(range=[0, 30])
# size
fig.update_layout(
    autosize=False,
    width=800,
    height=600,
)

# margins
fig.update_layout(margin=dict(l=10, r=10, t=40, b=10))

save_plot(fig, results_dir, 'energy_percentage')

fig.show()

In [37]:
fig = px.scatter(
    filtered_component_cost_df,
    x='Publication date',
    y='Energy cost',
    text='Model',
)
# axis labels
fig.update_xaxes(title_text='Model')
fig.update_yaxes(title_text='Energy cost')
# log y
fig.update_yaxes(type='log')
# size
fig.update_layout(
    autosize=False,
    width=800,
    height=600,
)

# margins
fig.update_layout(margin=dict(l=10, r=10, t=40, b=10))

save_plot(fig, results_dir, 'energy_cost')

fig.show()

In [38]:
from energy import energy_price

# Stacked bar chart of cost components, using component_cost_df
filtered_component_cost_df.loc[:, 'Energy (kWh)'] = [
    row['Energy cost'] / energy_price(row['Publication date'].year) 
    for _, row in filtered_component_cost_df.iterrows()
]
fig = px.scatter(
    filtered_component_cost_df,
    x='Publication date',
    y='Energy (kWh)',
    text='Model',
)
# log y
fig.update_yaxes(type='log')
# size
fig.update_layout(
    autosize=False,
    width=800,
    height=600,
)

# margins
fig.update_layout(margin=dict(l=10, r=10, t=40, b=10))

save_plot(fig, results_dir, 'energy_kwh')

fig.show()

In [39]:
filtered_component_cost_df.columns

Index(['Model', 'Domain', 'Task', 'Authors', 'Notability criteria',
       'Notability criteria notes', 'Model accessibility', 'Link', 'Citations',
       'Reference',
       ...
       'Cost', 'AI accelerator chip cost', 'Other server components cost',
       'Cluster-level interconnect cost', 'Energy cost',
       'AI accelerator chip cost (%)', 'Other server components cost (%)',
       'Cluster-level interconnect cost (%)', 'Energy cost (%)',
       'Energy (kWh)'],
      dtype='object', length=108)

In [40]:
filtered_component_cost_df = filtered_component_cost_df.dropna(subset=['Training hardware'])
power_col = 'Power capacity for final training run (kW)'
filtered_component_cost_df.loc[:, power_col] = [
    cluster_power_capacity(row['Training hardware'], row['Hardware quantity'], hardware_df, row['Organization'])
    for _, row in filtered_component_cost_df.iterrows()
]

fig = px.scatter(
    filtered_component_cost_df,
    x='Publication date',
    y=power_col,
    text='Model',
)
# log y
fig.update_yaxes(type='log')
# size
fig.update_layout(
    autosize=False,
    width=800,
    height=600,
)

# margins
fig.update_layout(margin=dict(l=10, r=10, t=40, b=10))

save_plot(fig, results_dir, 'power_capacity_kw')

fig.show()

In [41]:
filtered_component_cost_df['Publication date (float)'] = datetime_to_float_year(
    pd.to_datetime(filtered_component_cost_df['Publication date'])
)

In [42]:
power_reg_results = fit_ols_regression(
    filtered_component_cost_df,
    ['Publication date (float)'],
    power_col,
    logy=True
)
power_reg_results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.863
Model:,OLS,Adj. R-squared:,0.859
Method:,Least Squares,F-statistic:,233.0
Date:,"Thu, 13 Mar 2025",Prob (F-statistic):,1.51e-17
Time:,17:02:39,Log-Likelihood:,-10.518
No. Observations:,39,AIC:,25.04
Df Residuals:,37,BIC:,28.36
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-717.4255,47.192,-15.202,0.000,-813.045,-621.806
x1,0.3564,0.023,15.263,0.000,0.309,0.404

0,1,2,3
Omnibus:,5.566,Durbin-Watson:,1.953
Prob(Omnibus):,0.062,Jarque-Bera (JB):,4.658
Skew:,0.514,Prob(JB):,0.0974
Kurtosis:,4.345,Cond. No.,1830000.0


In [43]:
with open(f'{results_dir}/power_regression_results.out', 'w') as f:
    with redirect_stdout(f):
        print_growth_rates(power_reg_results)
print_growth_rates(power_reg_results)

N=39.0
R^2=0.86
0.3564433491873348 OOMs/year (90% CI: 0.3170440986156087, 0.39584259975906083)
2.2721832248227765x/year (90% CI: 2.075124216561464x, 2.4879554515155515x)
doubling time of 10.13445742837872 months (90% CI: 9.12574834079638, 11.393872220745795)


In [44]:
pred_start_year = 2015
pred_end_year = 2025
pred_start_date = f'{pred_start_year}-01-01'
pred_end_date = f'{pred_end_year}-01-01'

pred_years = pd.DataFrame({'Publication date (float)': np.linspace(pred_start_year, pred_end_year, 100)})
pred_years

Unnamed: 0,Publication date (float)
0,2015.00000
1,2015.10101
2,2015.20202
3,2015.30303
4,2015.40404
...,...
95,2024.59596
96,2024.69697
97,2024.79798
98,2024.89899


In [45]:
predicted_power_df = get_predictions(power_reg_results, pred_years, ['Publication date (float)'])
predicted_power_df['Publication date'] = predicted_power_df['Publication date (float)'].apply(float_year_to_datetime)
predicted_power_df

Unnamed: 0,mean,mean_se,mean_ci_lower,mean_ci_upper,obs_ci_lower,obs_ci_upper,Publication date (float),Publication date
0,0.807834,0.144451,0.564132,1.051537,0.207307,1.408361,2015.00000,2015-01-01
1,0.843838,0.142253,0.603844,1.083833,0.244807,1.442870,2015.10101,2015-02-06
2,0.879843,0.140061,0.643547,1.116139,0.282284,1.477402,2015.20202,2015-03-15
3,0.915847,0.137874,0.683241,1.148454,0.319737,1.511957,2015.30303,2015-04-21
4,0.951852,0.135693,0.722925,1.180778,0.357168,1.546535,2015.40404,2015-05-28
...,...,...,...,...,...,...,...,...
95,4.228250,0.103442,4.053734,4.402766,3.652318,4.804182,2024.59596,2024-08-06
96,4.264254,0.105487,4.086289,4.442220,3.687268,4.841241,2024.69697,2024-09-12
97,4.300259,0.107544,4.118822,4.481696,3.722193,4.878325,2024.79798,2024-10-19
98,4.336263,0.109614,4.151335,4.521192,3.757091,4.915435,2024.89899,2024-11-25


## Power plot

In [46]:
label_systems = ['GNMT', 'AlphaGo Master', 'AlphaGo Zero', 'AlphaZero', 'DALL-E', 'GPT-3 175B (davinci)', 'PaLM (540B)', 'Llama 2-70B', 'Falcon 180B', 'GPT-4', 'Gemini 1.0 Ultra', 'Inflection-2']

fig = px.scatter(
    filtered_component_cost_df,
    x='Publication date',
    y=power_col,
    log_y=True,
)

# Marker color
fig.update_traces(
    marker=dict(
        color='rgb(0,100,200)',
    ),
    selector=dict(mode='markers'),
)

fig.add_scatter(
    x=filtered_component_cost_df.loc[filtered_component_cost_df['Model'].isin(label_systems)]['Publication date'],
    y=filtered_component_cost_df.loc[filtered_component_cost_df['Model'].isin(label_systems)][power_col],
    text=filtered_component_cost_df.loc[filtered_component_cost_df['Model'].isin(label_systems)]['Model'],
    mode='text',
    showlegend=False,
)

# Shade in CI
fig.add_scatter(
    x=predicted_power_df['Publication date'],
    y=10**predicted_power_df['mean_ci_lower'],
    mode='lines',
    line=dict(width=0),
    showlegend=False,
)
fig.add_scatter(
    x=predicted_power_df['Publication date'],
    y=10**predicted_power_df['mean_ci_upper'],
    mode='lines',
    fill='tonexty',
    fillcolor='rgba(0,100,200,0.2)',
    line=dict(width=0),
    name='90% CI of mean',
)
fig.add_scatter(
    x=predicted_power_df['Publication date'],
    y=10**predicted_power_df['mean'],
    mode='lines',
    line=dict(color='rgb(0,100,200)'),
    name=f'Regression mean (growth rate: {10**power_reg_results.params[1]:.1f}x per year)',
)

fig.update_traces(textposition='top center')

# axis limits
fig.update_xaxes(range=[pred_start_date, pred_end_date])
# fig.update_xaxes(range=['2015-01-01', '2025-01-01'])  # manual
# fig.update_yaxes(range=[1, 6])

# legend on bottom-right of the axes
fig.update_layout(
    legend=dict(
        x=0.45,
        y=0.05,
    )
)

# axis labels
fig.update_xaxes(title_text='Publication date')
fig.update_yaxes(title_text='Power (kW, log scale)')

# title
fig.update_layout(title_text='Cluster power required for final training run')

# update size
fig.update_layout(
    autosize=False,
    width=800,
    height=600,
    title_font=dict(
        size=16,
    ),
    title_x=0.5,
)

# font size
fig.update_layout(
    font=dict(
        size=14,
    )
)

# margins
fig.update_layout(margin=dict(l=10, r=10, t=60, b=10))

save_plot(fig, results_dir, 'power_regression')

fig.show()