# Setup

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to the imported
# project modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from contextlib import redirect_stdout
import numpy as np
import os
import pandas as pd
import plotly.express as px

from cost import *
from plotting import *
from prices import *
from inflation import *
from regression import *
from utils import *

In [3]:
# ---- Run configuration: edit these values, then re-run the notebook ----

# Which cost model to apply: 'hardware-capex-energy', 'hardware-acquisition' or 'cloud'.
estimation_method = 'hardware-capex-energy'
# Model-selection rule handed to load_data_for_cost_estimation: 'top_n' or 'window_percentile'.
compute_threshold_method = 'top_n'
# Value for that rule, e.g. 10 to select top 10; 75 to select top 25%.
compute_threshold = 10
# Whatever else distinguishes this run, e.g. 'excluding-AlphaGo'; becomes part of the results path.
variant = '2025-03-17_exclude_finetunes_at_threshold_stage'
# Substrings of model names to drop after estimation,
# e.g. ['GNMT', 'AlphaZero', 'AlphaGo Master', 'AlphaGo Zero'].
exclude_models_containing = []

# Dispatch table from method name to its estimator; an unknown name raises KeyError here.
estimation_method_lookup = {
    'hardware-capex-energy': estimate_hardware_capex_energy,
    'hardware-acquisition': estimate_hardware_acquisition_cost,
    'cloud': estimate_cloud_costs,
}
cost_estimation_function = estimation_method_lookup[estimation_method]

# One output folder per configuration. Note that the path ends with a trailing '/'.
run_label = '-'.join([
    estimation_method,
    f'{compute_threshold_method}={compute_threshold}',
    variant,
])
results_dir = f'results/{run_label}/'
os.makedirs(results_dir, exist_ok=True)

# Load data

In [4]:
# Load the three input tables, applying the model-selection rule chosen in the
# configuration cell. (Judging by the names: model table, hardware table, price table.)
loaded_tables = load_data_for_cost_estimation(
    compute_threshold_method=compute_threshold_method,
    compute_threshold=compute_threshold,
)
frontier_pcd_df, hardware_df, price_df = loaded_tables

In [5]:
len(frontier_pcd_df), len(hardware_df), len(price_df)

(89, 5775, 590)

# Cost estimation

In [6]:
# Run the selected estimator. Everything it prints is captured in a log file in
# the results folder instead of flooding the notebook output.
# results_dir already ends with '/', so os.path.join is used to avoid the doubled
# separator ('...//cost_estimation.out') that f'{results_dir}/...' produced, and
# it stays correct if the trailing slash is ever removed.
cost_log_path = os.path.join(results_dir, 'cost_estimation.out')
with open(cost_log_path, 'w') as f, redirect_stdout(f):
    cost_df = cost_estimation_function(frontier_pcd_df, hardware_df, price_df)

In [7]:
# Second estimation pass that also returns the per-component cost breakdown.
# NOTE: component_cost_df is only defined for the 'hardware-capex-energy' method;
# the "Cost components" section further down uses it unconditionally.
if estimation_method == 'hardware-capex-energy':
    # Work on a copy of the model table (presumably because the estimator
    # modifies its input - TODO confirm).
    frontier_pcd_df_copy = frontier_pcd_df.copy()
    # results_dir already ends with '/', so join the path instead of inserting a
    # second separator.
    component_log_path = os.path.join(results_dir, 'component_cost_estimation.out')
    with open(component_log_path, 'w') as f, redirect_stdout(f):
        component_cost_df = cost_estimation_function(
            frontier_pcd_df_copy,
            hardware_df,
            price_df,
            separate_components=True,
        )

In [8]:
cost_df

Unnamed: 0,Model,Domain,Task,Organization,Authors,Publication date,Reference,Link,Citations,Notability criteria,...,Organization categorization (from Organization),Training compute cost (2023 USD),Utilization notes,Numerical format,Training power draw (W),Training compute estimation method,Hugging Face developer id,Post-training compute (FLOP),Post-training compute notes,Cost
52,Llama 4 Behemoth (preview),"Multimodal,Language,Vision","Chat,Code generation,Visual question answering...",Meta AI,,2025-04-05,The Llama 4 herd: The beginning of a new era o...,https://ai.meta.com/blog/llama-4-multimodal-in...,,Training cost,...,Industry,,,,,Operation counting,,,,
87,GPT-4.5,"Language,Vision,Multimodal","Language modeling/generation,Question answerin...",OpenAI,"Foundational contributors\r\nAlex Paino, Ali K...",2025-02-27,Introducing GPT-4.5,https://openai.com/index/introducing-gpt-4-5/,,Training cost,...,Industry,,,,,Benchmarks,,,,
94,Claude 3.7 Sonnet,"Language,Vision,Multimodal","Language modeling/generation,Question answerin...",Anthropic,,2025-02-24,Claude 3.7 Sonnet,https://www.anthropic.com/news/claude-3-7-sonnet,,Training cost,...,Industry,,,,,,,,,
101,Grok-3,"Language,Vision,Multimodal","Chat,Language modeling/generation,Question ans...",xAI,,2025-02-17,Grok 3 Beta — The Age of Reasoning Agents,https://x.ai/blog/grok-3,,Training cost,...,Industry,,,,1.374358e+08,"Hardware,Comparison with other models",,,,3.014548e+08
206,Doubao-pro,Language,"Language modeling/generation,Question answerin...",ByteDance,,2024-10-28,Doubao General Model Pro (Doubao-pro),https://www.volcengine.com/docs/6360/1264663,,Training cost,...,Industry,,,,,Operation counting,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1632,ResNet-200,Vision,Image classification,Microsoft Research Asia,"Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun",2016-09-17,Identity Mappings in Deep Residual Networks,https://link.springer.com/chapter/10.1007/978-...,9621.0,Highly cited,...,Industry,,,,,Hardware,,,,
1660,AlphaGo Lee,Games,Go,DeepMind,"David Silver, Aja Huang, Chris J. Maddison, Ar...",2016-01-27,Mastering the game of Go with deep neural netw...,https://www.nature.com/articles/nature16961,16057.0,Highly cited,...,Industry,,,,,Comparison with other models,,,,
1664,ResNet-152 (ImageNet),Vision,Image classification,Microsoft,"Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun",2015-12-10,Deep Residual Learning for Image Recognition,https://arxiv.org/abs/1512.03385,175697.0,Highly cited,...,Industry,,,FP32,,"Operation counting,Third-party estimation",,,,
1665,DeepSpeech2 (English),Speech,Speech recognition,Baidu Research - Silicon Valley AI Lab,"Dario Amodei, Rishita Anubhai, Eric Battenberg...",2015-12-08,Deep Speech 2: End-to-End Speech Recognition i...,https://arxiv.org/abs/1512.02595,2853.0,Highly cited,...,Industry,$206.31,"""Overall the system sustains approximately 50 ...",FP32,8.463468e+03,"Operation counting,Third-party estimation",,,,1.854566e+02


In [9]:
cost_df['Cost'].notna().sum()

61

In [10]:
cost_df.dropna(subset=['Cost'])['Training time (hours)'].notna().sum()

40

In [11]:
cost_df.dropna(subset=['Cost'])['Hardware utilization'].notna().sum()

22

## Exclusion

In [12]:
cost_df[['Model', 'Publication date']].tail(15)

Unnamed: 0,Model,Publication date
1566,AlphaGo Master,2017-10-19
1567,AlphaGo Zero,2017-10-18
1572,Libratus,2017-08-19
1577,OpenAI TI7 DOTA 1v1,2017-08-11
1584,JFT,2017-07-10
1604,MoE-Multi,2017-01-23
1615,PolyNet,2016-11-17
1617,NASv3 (CIFAR-10),2016-11-05
1623,Xception,2016-10-07
1624,GNMT,2016-09-26


In [13]:
# Drop every model whose name contains one of the configured substrings.
#  - regex=False: keywords are matched literally. Model names such as
#    'GPT-3 175B (davinci)' or 'NASv3 (CIFAR-10)' contain regex metacharacters
#    and would otherwise be misinterpreted as patterns.
#  - na=False: a missing model name never matches, so such rows are kept rather
#    than being silently dropped (which is what comparing NaN `== False` did).
for kw in exclude_models_containing:
    cost_df = cost_df[~cost_df['Model'].str.contains(kw, regex=False, na=False)]
cost_df[['Model', 'Publication date']].tail(15)

Unnamed: 0,Model,Publication date
1566,AlphaGo Master,2017-10-19
1567,AlphaGo Zero,2017-10-18
1572,Libratus,2017-08-19
1577,OpenAI TI7 DOTA 1v1,2017-08-11
1584,JFT,2017-07-10
1604,MoE-Multi,2017-01-23
1615,PolyNet,2016-11-17
1617,NASv3 (CIFAR-10),2016-11-05
1623,Xception,2016-10-07
1624,GNMT,2016-09-26


Use the cell below to check data availability for a specific system.

In [14]:
# system = 'WizardLM-7B'
# row = cost_df.loc[cost_df['Model'] == system]
# print('Cost:', row['Cost'].values[0])
# print('Training hardware:', row['Training hardware'].values[0])
# print('Training time (hours):', row['Training time (hours)'].values[0])
# print('Hardware quantity:', row['Hardware quantity'].values[0])
# print('Hardware utilization:', row['Hardware utilization'].values[0])

# Apply inflation adjustment

In [15]:
cost_df['Cost'].dropna()

101     3.014548e+08
366     3.060541e+07
403     5.126034e+07
448     2.057972e+07
612     1.179460e+07
            ...     
1604    3.538189e+03
1615    5.635997e+02
1623    1.155451e+04
1624    1.774592e+05
1665    1.854566e+02
Name: Cost, Length: 61, dtype: float64

In [16]:
# Inflation-adjust the 'Cost' column; the next cells read the result from
# 'Cost (inflation-adjusted)'.
inflation_data_csv = 'data/PCU518210518210.csv'
target_date = '2024-12-01'  # presumably the date whose price level costs are expressed in - TODO confirm
cost_df = adjust_column_for_inflation(cost_df, 'Cost', inflation_data_csv, target_date)

In [17]:
cost_df['Cost (inflation-adjusted)'].dropna()

101     3.008724e+08
366     3.049986e+07
403     5.104052e+07
448     2.052898e+07
612     1.180459e+07
            ...     
1604    3.874123e+03
1615    6.171107e+02
1623    1.265155e+04
1624    1.943081e+05
1665    2.068604e+02
Name: Cost (inflation-adjusted), Length: 61, dtype: float64

In [18]:
# Sanity check: the inflation adjustment must neither create nor lose cost estimates.
n_raw_costs = cost_df['Cost'].notna().sum()
n_adjusted_costs = cost_df['Cost (inflation-adjusted)'].notna().sum()
assert n_adjusted_costs == n_raw_costs

# Regression

In [19]:
# The regression needs a numeric time axis: parse the publication dates and
# convert them to float years.
publication_dates = pd.to_datetime(cost_df['Publication date'])
cost_df['Publication date (float)'] = datetime_to_float_year(publication_dates)

In [20]:
# Regress inflation-adjusted cost on publication year. logy=True asks the helper
# to fit on a log scale (the plotting code exponentiates with 10**, so presumably log10).
cost_regressors = ['Publication date (float)']
cost_target = 'Cost (inflation-adjusted)'
reg_results = fit_ols_regression(cost_df, cost_regressors, cost_target, logy=True)
reg_results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.707
Model:,OLS,Adj. R-squared:,0.702
Method:,Least Squares,F-statistic:,142.1
Date:,"Wed, 30 Jul 2025",Prob (F-statistic):,2.36e-17
Time:,15:32:48,Log-Likelihood:,-62.231
No. Observations:,61,AIC:,128.5
Df Residuals:,59,BIC:,132.7
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-887.9037,74.944,-11.848,0.000,-1037.867,-737.940
x1,0.4421,0.037,11.923,0.000,0.368,0.516

0,1,2,3
Omnibus:,4.397,Durbin-Watson:,1.406
Prob(Omnibus):,0.111,Jarque-Bera (JB):,4.037
Skew:,0.63,Prob(JB):,0.133
Kurtosis:,2.962,Cond. No.,1730000.0


In [21]:
# Write the growth-rate summary to the results folder with round_digits=None
# (presumably unrounded), then show a 5-digit rounded version inline.
# results_dir already ends with '/', so os.path.join avoids the doubled
# separator that f'{results_dir}/...' produced.
regression_log_path = os.path.join(results_dir, 'regression_results.out')
with open(regression_log_path, 'w') as f, redirect_stdout(f):
    print_growth_rates(reg_results, round_digits=None)
print_growth_rates(reg_results, ci=90, round_digits=5)

N=61.0
R^2=0.71
0.44209 OOMs/year (90% CI: 0.38012, 0.50405)
2.76748x/year (90% CI: 2.3995x, 3.1919x)
doubling time of 8.17119 months (90% CI: 7.16669, 9.50317)


In [22]:
# Time window over which the fitted trend is evaluated (also used as plot limits).
pred_start_year, pred_end_year = 2015, 2025
pred_start_date, pred_end_date = (
    f'{year}-01-01' for year in (pred_start_year, pred_end_year)
)

# 100 evenly spaced float years spanning the window, both endpoints included.
year_grid = np.linspace(pred_start_year, pred_end_year, 100)
pred_years = pd.DataFrame({'Publication date (float)': year_grid})
pred_years

Unnamed: 0,Publication date (float)
0,2015.00000
1,2015.10101
2,2015.20202
3,2015.30303
4,2015.40404
...,...
95,2024.59596
96,2024.69697
97,2024.79798
98,2024.89899


In [23]:

# Evaluate the fitted cost regression on the prediction grid, then attach a
# datetime version of the float-year axis for plotting and export.
predicted_cost_df = get_predictions(
    reg_results,
    pred_years,
    ['Publication date (float)'],
)
predicted_float_years = predicted_cost_df['Publication date (float)']
predicted_cost_df['Publication date'] = predicted_float_years.apply(float_year_to_datetime)
predicted_cost_df

Unnamed: 0,mean,mean_se,mean_ci_lower,mean_ci_upper,obs_ci_lower,obs_ci_upper,Publication date (float),Publication date
0,2.897729,0.244884,2.488505,3.306952,1.686120,4.109338,2015.00000,2015-01-01
1,2.942384,0.241388,2.539001,3.345767,1.732736,4.152032,2015.10101,2015-02-06
2,2.987039,0.237901,2.589484,3.384594,1.779322,4.194756,2015.20202,2015-03-15
3,3.031694,0.234421,2.639954,3.423434,1.825878,4.237510,2015.30303,2015-04-21
4,3.076349,0.230950,2.690410,3.462288,1.872405,4.280293,2015.40404,2015-05-28
...,...,...,...,...,...,...,...,...
95,7.139960,0.154196,6.882284,7.397636,5.970802,8.309117,2024.59596,2024-08-06
96,7.184615,0.157296,6.921758,7.447472,6.014305,8.354925,2024.69697,2024-09-12
97,7.229270,0.160424,6.961186,7.497354,6.057775,8.400765,2024.79798,2024-10-19
98,7.273925,0.163578,7.000571,7.547279,6.101212,8.446637,2024.89899,2024-11-25


In [24]:
# Save the regression predictions next to the other outputs of this run.
predicted_cost_path = os.path.join(results_dir, 'predicted_cost_dataset.csv')
predicted_cost_df.to_csv(predicted_cost_path, index=False)

# Export data

In [25]:
# Columns kept in the exported cost dataset, in output order.
keep_cols = (
    # Identity and publication metadata
    ['Model', 'Domain', 'Task', 'Model accessibility', 'Reference', 'Publication date', 'Organization']
    # Model and training-run scale
    + ['Parameters', 'Training compute (FLOP)', 'Training dataset size (datapoints)', 'Epochs',
       'Training time (hours)', 'Training hardware']
    # Origin and fine-tuning information
    + ['Country (of organization)', 'Base model', 'Finetune compute (FLOP)']
    # Hardware and infrastructure used for training
    + ['Hardware quantity', 'Hardware utilization', 'Training cloud compute vendor', 'Training data center']
    # ('Training time (chip hours)' is currently disabled)
    # Estimated cost, raw and inflation-adjusted
    + ['Cost', 'Cost (inflation-adjusted)']
)
cost_df[keep_cols]

Unnamed: 0,Model,Domain,Task,Model accessibility,Reference,Publication date,Organization,Parameters,Training compute (FLOP),Training dataset size (datapoints),...,Training hardware,Country (of organization),Base model,Finetune compute (FLOP),Hardware quantity,Hardware utilization,Training cloud compute vendor,Training data center,Cost,Cost (inflation-adjusted)
52,Llama 4 Behemoth (preview),"Multimodal,Language,Vision","Chat,Code generation,Visual question answering...",Unreleased,The Llama 4 herd: The beginning of a new era o...,2025-04-05,Meta AI,2.000000e+12,5.184000e+25,3.000000e+13,...,,United States of America,,,32000.0,,,,,
87,GPT-4.5,"Language,Vision,Multimodal","Language modeling/generation,Question answerin...",API access,Introducing GPT-4.5,2025-02-27,OpenAI,,2.100000e+26,,...,,United States of America,,,,,Azure AI,,,
94,Claude 3.7 Sonnet,"Language,Vision,Multimodal","Language modeling/generation,Question answerin...",API access,Claude 3.7 Sonnet,2025-02-24,Anthropic,,3.350000e+25,,...,,United States of America,,,,,,,,
101,Grok-3,"Language,Vision,Multimodal","Chat,Language modeling/generation,Question ans...",Hosted access (no API),Grok 3 Beta — The Age of Reasoning Agents,2025-02-17,xAI,,4.640000e+26,,...,NVIDIA H100 SXM5 80GB,United States of America,,,100000.0,,,xAI Memphis Colossus,3.014548e+08,3.008724e+08
206,Doubao-pro,Language,"Language modeling/generation,Question answerin...",API access,Doubao General Model Pro (Doubao-pro),2024-10-28,ByteDance,5.000000e+11,2.505000e+25,8.350000e+12,...,,China,,,,,,"There is no paper to reference, also no inform...",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1632,ResNet-200,Vision,Image classification,Unreleased,Identity Mappings in Deep Residual Networks,2016-09-17,Microsoft Research Asia,,2.974164e+19,1.281167e+06,...,,China,,,,,,,,
1660,AlphaGo Lee,Games,Go,Unreleased,Mastering the game of Go with deep neural netw...,2016-01-27,DeepMind,,1.900000e+21,2.940000e+07,...,,United Kingdom of Great Britain and Northern I...,,,,,,,,
1664,ResNet-152 (ImageNet),Vision,Image classification,,Deep Residual Learning for Image Recognition,2015-12-10,Microsoft,6.020000e+07,1.041408e+19,1.280000e+06,...,,"United States of America,Multinational,India,B...",,,,,,,,
1665,DeepSpeech2 (English),Speech,Speech recognition,,Deep Speech 2: End-to-End Speech Recognition i...,2015-12-08,Baidu Research - Silicon Valley AI Lab,3.800000e+07,2.600000e+19,1.633392e+08,...,NVIDIA GeForce GTX TITAN X,United States of America,,,16.0,0.4484,,,1.854566e+02,2.068604e+02


In [26]:
# Export only the selected columns of the cost table for this run.
cost_dataset_path = os.path.join(results_dir, 'cost_dataset.csv')
cost_df[keep_cols].to_csv(cost_dataset_path, index=False)

# Plots

In [27]:
# fig = px.scatter(
#     cost_df,
#     x='Publication date',
#     y='Cost (inflation-adjusted)',
#     text='Model',
#     log_y=True,
# )
# fig.update_traces(textposition='top center')

# # no legend
# fig.update_layout(showlegend=False)

# # axis labels
# fig.update_xaxes(title_text='Publication date')
# fig.update_yaxes(title_text='Cost (2024 USD, log scale)')

# # title
# fig.update_layout(title_text=get_cost_plot_title(estimation_method, compute_threshold_method, compute_threshold))

# # update size
# fig.update_layout(
#     autosize=False,
#     width=800,
#     height=600,
#     title_font=dict(
#         size=16,
#     )
# )

# # font size
# fig.update_layout(
#     font=dict(
#         size=14,
#     )
# )

# # margins
# fig.update_layout(margin=dict(l=10, r=10, t=40, b=10))

# save_plot(fig, results_dir, 'cost_scatter')

# fig.show()

In [28]:
# Models whose names are drawn as text labels by the (currently commented-out)
# regression plot below.
label_systems = [
    'GNMT',
    'AlphaGo Zero',
    'DALL-E',
    'GPT-3 175B (davinci)',
    'GPT-4',
    'Llama 3.1-405B',
    'Grok-2',
]

# Split the cost table by accelerator type. na=False sends rows with no recorded
# training hardware to the non-TPU group.
tpu_mask = cost_df['Training hardware'].str.contains('TPU', na=False)
gpu_cost_df = cost_df.loc[~tpu_mask]
tpu_cost_df = cost_df.loc[tpu_mask]

# fig = go.Figure()

# fig.add_trace(go.Scatter(
#     x=gpu_cost_df['Publication date'],
#     y=gpu_cost_df['Cost (inflation-adjusted)'],
#     text=gpu_cost_df['Model'],
#     mode='markers',
#     showlegend=False,
# ))
# fig.update_yaxes(type='log')
# fig.add_trace(go.Scatter(
#     x=tpu_cost_df['Publication date'],
#     y=tpu_cost_df['Cost (inflation-adjusted)'],
#     text=tpu_cost_df['Model'],
#     mode='markers',
#     marker_symbol='circle-open' if estimation_method != 'cloud' else 'circle',
#     name='Using estimated cost of TPU' if estimation_method != 'cloud' else '',
#     showlegend=estimation_method != 'cloud',
# ))
# fig.add_trace(go.Scatter(
#     x=gpu_cost_df.loc[gpu_cost_df['Model'].isin(label_systems)]['Publication date'],
#     y=gpu_cost_df.loc[gpu_cost_df['Model'].isin(label_systems)]['Cost (inflation-adjusted)'],
#     text=gpu_cost_df.loc[gpu_cost_df['Model'].isin(label_systems)]['Model'],
#     mode='text',
#     showlegend=False,
# ))
# fig.add_trace(go.Scatter(
#     x=tpu_cost_df.loc[tpu_cost_df['Model'].isin(label_systems)]['Publication date'],
#     y=tpu_cost_df.loc[tpu_cost_df['Model'].isin(label_systems)]['Cost (inflation-adjusted)'],
#     text=tpu_cost_df.loc[tpu_cost_df['Model'].isin(label_systems)]['Model'],
#     mode='text',
#     showlegend=False,
# ))

# # Marker color
# fig.update_traces(
#     marker=dict(
#         color='rgb(0,100,200)',
#     ),
#     selector=dict(mode='markers'),
# )

# # Shade in CI
# fig.add_scatter(
#     x=predicted_cost_df['Publication date'],
#     y=10**predicted_cost_df['mean_ci_lower'],
#     mode='lines',
#     line=dict(width=0),
#     showlegend=False,
# )
# fig.add_scatter(
#     x=predicted_cost_df['Publication date'],
#     y=10**predicted_cost_df['mean_ci_upper'],
#     mode='lines',
#     fill='tonexty',
#     fillcolor='rgba(0,100,200,0.2)',
#     line=dict(width=0),
#     name='90% CI of mean',
# )
# fig.add_scatter(
#     x=predicted_cost_df['Publication date'],
#     y=10**predicted_cost_df['mean'],
#     mode='lines',
#     line=dict(color='rgb(0,100,200)'),
#     name=f'Regression mean (growth rate: {10**reg_results.params[1]:.1f}x per year)',
# )

# fig.update_traces(textposition='top center')

# # axis limits
# # fig.update_xaxes(range=[pred_start_date, pred_end_date])
# fig.update_xaxes(range=['2015-01-01', '2025-06-01'])  # manual
# if estimation_method == 'hardware-acquisition':
#     fig.update_yaxes(range=[4, 10])
# else:
#     fig.update_yaxes(range=[1, 9])

# # legend on bottom-right of the axes
# fig.update_layout(
#     legend=dict(
#         x=0.45,
#         y=0.05,
#     )
# )

# # axis labels
# fig.update_xaxes(title_text='Publication date')
# fig.update_yaxes(title_text='Cost (2024 USD, log scale)')

# # title
# fig.update_layout(title_text=get_cost_plot_title(estimation_method, compute_threshold_method, compute_threshold))

# # update size
# fig.update_layout(
#     autosize=False,
#     width=800,
#     height=600,
#     title_font=dict(
#         size=16,
#     ),
#     title_x=0.5,
# )

# # font size
# fig.update_layout(
#     font=dict(
#         size=14,
#     )
# )

# # margins
# fig.update_layout(margin=dict(l=10, r=10, t=60, b=10))

# save_plot(fig, results_dir, 'cost_regression')

# fig.show()

# Cost components

In [29]:
cost_component_names = [
    'AI accelerator chip cost',
    'Other server components cost',
    'Cluster-level interconnect cost',
    'Energy cost',
]

In [30]:
# Express each cost component as a percentage of the model's total 'Cost'.
# component_cost_df only exists when estimation_method == 'hardware-capex-energy'.
total_cost = component_cost_df['Cost']
for component in cost_component_names:
    cost_share = component_cost_df[component] / total_cost
    component_cost_df[f"{component} (%)"] = cost_share * 100
component_cost_df['AI accelerator chip cost (%)']

52            NaN
87            NaN
94            NaN
101     45.507920
206           NaN
          ...    
1632          NaN
1660          NaN
1664          NaN
1665    34.484219
1667          NaN
Name: AI accelerator chip cost (%), Length: 89, dtype: float64

In [31]:
# Names of the percentage columns derived from the component columns.
cost_component_pc_names = [f'{name} (%)' for name in cost_component_names]

# Keep only models with a complete component breakdown, ordered by publication date.
complete_breakdown_df = component_cost_df.dropna(subset=cost_component_pc_names)
filtered_component_cost_df = complete_breakdown_df.sort_values(by='Publication date')

In [32]:
# Stacked bar chart of cost components, using filtered_component_cost_df
# fig = px.bar(
#     filtered_component_cost_df,
#     x='Model',
#     y=cost_component_pc_names,
#     barmode='stack',
# )

# # axis labels
# fig.update_xaxes(title_text='ML model')
# fig.update_yaxes(title_text='% of amortized hardware CapEx + energy')
# fig.update_layout(
#     legend=dict(
#         title_text='Cost component',
#         x=0.60,
#         y=0.05,
#     )
# )
# # limits 0 to 100
# fig.update_yaxes(range=[0, 100])

# fig.update_yaxes(tickvals=list(range(0, 101, 10)))

# # size
# fig.update_layout(
#     autosize=False,
#     width=800,
#     height=600,
# )

# # margins
# fig.update_layout(margin=dict(l=10, r=10, t=40, b=10))

# save_plot(fig, results_dir, 'cost_component_percentage')

# fig.show()

In [33]:
filtered_component_cost_df.head()

Unnamed: 0,Model,Domain,Task,Organization,Authors,Publication date,Reference,Link,Citations,Notability criteria,...,Post-training compute notes,Cost,AI accelerator chip cost,Other server components cost,Cluster-level interconnect cost,Energy cost,AI accelerator chip cost (%),Other server components cost (%),Cluster-level interconnect cost (%),Energy cost (%)
1665,DeepSpeech2 (English),Speech,Speech recognition,Baidu Research - Silicon Valley AI Lab,"Dario Amodei, Rishita Anubhai, Eric Battenberg...",2015-12-08,Deep Speech 2: End-to-End Speech Recognition i...,https://arxiv.org/abs/1512.02595,2853.0,Highly cited,...,,185.456639,63.953273,40.930095,24.602271,55.971,34.484219,22.0699,13.265781,30.180101
1624,GNMT,Language,Translation,Google,"Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc ...",2016-09-26,Google's Neural Machine Translation System: Br...,https://arxiv.org/abs/1609.08144,6483.0,Highly cited,...,,177459.232941,77894.044829,49852.188691,29965.165887,19747.833534,43.89405,28.092192,16.885662,11.128096
1623,Xception,Vision,Image classification,Google,François Chollet,2016-10-07,Xception: Deep Learning with Depthwise Separab...,https://arxiv.org/abs/1610.02357,13038.0,Highly cited,...,,11554.506253,5064.230483,3241.107509,1948.165702,1301.00256,43.829051,28.050593,16.860657,11.259698
1615,PolyNet,Vision,Image classification,Chinese University of Hong Kong (CUHK),"X Zhang, Z Li, C Change Loy",2016-11-17,PolyNet: A Pursuit of Structural Diversity in ...,https://arxiv.org/abs/1611.05725,282.0,SOTA improvement,...,,563.599706,178.564122,114.281038,68.692074,202.062472,31.682792,20.276987,12.188096,35.852125
1604,MoE-Multi,Language,"Language modeling,Translation","Jagiellonian University,Google Brain","N Shazeer, A Mirhoseini, K Maziarz, A Davis",2017-01-23,Outrageously Large Neural Networks: The Sparse...,https://arxiv.org/abs/1701.06538,2037.0,"Highly cited,SOTA improvement",...,,3538.189418,1519.646471,972.573741,584.594865,461.374341,42.949834,27.487894,16.52243,13.039843


In [34]:
# Save the per-model component breakdown (absolute values and percentages).
cost_components_path = os.path.join(results_dir, 'cost_components.csv')
filtered_component_cost_df.to_csv(cost_components_path, index=False)

In [35]:
# Average percentage for each component
filtered_component_cost_df[cost_component_pc_names].mean()

AI accelerator chip cost (%)           45.618313
Other server components cost (%)       29.544045
Cluster-level interconnect cost (%)    17.630677
Energy cost (%)                         7.206965
dtype: float64

In [36]:
# fig = px.bar(
#     filtered_component_cost_df,
#     x='Model',
#     y='Energy cost (%)',
#     barmode='stack',
#     # labels='Cost %',
#     # text='Energy cost %',
# )
# # axis labels
# fig.update_xaxes(title_text='Model')
# fig.update_yaxes(title_text='Energy cost (% of amortized hardware CapEx + energy)')
# # fig.update_layout(
# #     legend=dict(
# #         title_text='Cost component',
# #         x=0.75,
# #         y=0.05,
# #     )
# # )
# # limits 0 to 30
# fig.update_yaxes(range=[0, 30])
# # size
# fig.update_layout(
#     autosize=False,
#     width=800,
#     height=600,
# )

# # margins
# fig.update_layout(margin=dict(l=10, r=10, t=40, b=10))

# save_plot(fig, results_dir, 'energy_percentage')

# fig.show()

In [37]:
# fig = px.scatter(
#     filtered_component_cost_df,
#     x='Publication date',
#     y='Energy cost',
#     text='Model',
# )
# # axis labels
# fig.update_xaxes(title_text='Model')
# fig.update_yaxes(title_text='Energy cost')
# # log y
# fig.update_yaxes(type='log')
# # size
# fig.update_layout(
#     autosize=False,
#     width=800,
#     height=600,
# )

# # margins
# fig.update_layout(margin=dict(l=10, r=10, t=40, b=10))

# save_plot(fig, results_dir, 'energy_cost')

# fig.show()

In [38]:
from energy import energy_price

# Back out the energy used by each training run: divide the energy cost by the
# energy price for the model's publication year (presumably a price per kWh,
# given the target column name - TODO confirm).
energy_costs = filtered_component_cost_df['Energy cost']
publication_dates = filtered_component_cost_df['Publication date']
filtered_component_cost_df.loc[:, 'Energy (kWh)'] = [
    cost / energy_price(published.year)
    for cost, published in zip(energy_costs, publication_dates)
]
# fig = px.scatter(
#     filtered_component_cost_df,
#     x='Publication date',
#     y='Energy (kWh)',
#     text='Model',
# )
# # log y
# fig.update_yaxes(type='log')
# # size
# fig.update_layout(
#     autosize=False,
#     width=800,
#     height=600,
# )

# # margins
# fig.update_layout(margin=dict(l=10, r=10, t=40, b=10))

# save_plot(fig, results_dir, 'energy_kwh')

# fig.show()

In [39]:
filtered_component_cost_df.columns

Index(['Model', 'Domain', 'Task', 'Organization', 'Authors',
       'Publication date', 'Reference', 'Link', 'Citations',
       'Notability criteria', 'Notability criteria notes', 'Parameters',
       'Parameters notes', 'Training compute (FLOP)', 'Training compute notes',
       'Training dataset', 'Training dataset notes',
       'Training dataset size (datapoints)', 'Dataset size notes',
       'Training time (hours)', 'Training time notes', 'Training hardware',
       'Approach', 'Confidence', 'Abstract', 'Epochs', 'Benchmark data',
       'Model accessibility', 'Country (of organization)', 'Base model',
       'Finetune compute (FLOP)', 'Finetune compute notes',
       'Hardware quantity', 'Hardware utilization', 'Last modified',
       'Training cloud compute vendor', 'Training data center',
       'Archived links', 'Batch size', 'Batch size notes',
       'Organization categorization', 'Foundation model',
       'Training compute lower bound', 'Training compute upper bound',
  

In [40]:
# Only rows with known training hardware are passed to cluster_power_capacity.
filtered_component_cost_df = filtered_component_cost_df.dropna(subset=['Training hardware'])
power_col = 'Power capacity for final training run (kW)'

# Per-model inputs for the helper: hardware name, hardware quantity and organization.
power_inputs = zip(
    filtered_component_cost_df['Training hardware'],
    filtered_component_cost_df['Hardware quantity'],
    filtered_component_cost_df['Organization'],
)
filtered_component_cost_df.loc[:, power_col] = [
    cluster_power_capacity(hardware, quantity, hardware_df, organization)
    for hardware, quantity, organization in power_inputs
]

# fig = px.scatter(
#     filtered_component_cost_df,
#     x='Publication date',
#     y=power_col,
#     text='Model',
# )
# # log y
# fig.update_yaxes(type='log')
# # size
# fig.update_layout(
#     autosize=False,
#     width=800,
#     height=600,
# )

# # margins
# fig.update_layout(margin=dict(l=10, r=10, t=40, b=10))

# save_plot(fig, results_dir, 'power_capacity_kw')

# fig.show()

In [41]:
# Numeric time axis for the power regression: parse the publication dates and
# convert them to float years.
component_publication_dates = pd.to_datetime(filtered_component_cost_df['Publication date'])
filtered_component_cost_df['Publication date (float)'] = datetime_to_float_year(component_publication_dates)

In [42]:
# Regress cluster power capacity on publication year, fitted on a log scale
# (logy=True), mirroring the cost regression above.
power_regressors = ['Publication date (float)']
power_reg_results = fit_ols_regression(filtered_component_cost_df, power_regressors, power_col, logy=True)
power_reg_results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.785
Model:,OLS,Adj. R-squared:,0.78
Method:,Least Squares,F-statistic:,149.9
Date:,"Wed, 30 Jul 2025",Prob (F-statistic):,2.8e-15
Time:,15:32:50,Log-Likelihood:,-24.222
No. Observations:,43,AIC:,52.44
Df Residuals:,41,BIC:,55.97
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-703.9984,57.729,-12.195,0.000,-820.584,-587.412
x1,0.3497,0.029,12.244,0.000,0.292,0.407

0,1,2,3
Omnibus:,1.823,Durbin-Watson:,1.654
Prob(Omnibus):,0.402,Jarque-Bera (JB):,0.989
Skew:,0.327,Prob(JB):,0.61
Kurtosis:,3.353,Cond. No.,1760000.0


In [43]:
# Write the power growth-rate summary to the results folder, then print the same
# summary inline.
# results_dir already ends with '/', so os.path.join avoids the doubled
# separator that f'{results_dir}/...' produced.
power_log_path = os.path.join(results_dir, 'power_regression_results.out')
with open(power_log_path, 'w') as f, redirect_stdout(f):
    print_growth_rates(power_reg_results)
print_growth_rates(power_reg_results)

N=43.0
R^2=0.79
0.34972925639193325 OOMs/year (90% CI: 0.301660382440203, 0.3977981303436635)
2.23732593192286x/year (90% CI: 2.002905146315243x, 2.499183416081074x)
doubling time of 10.32901846770145 months (90% CI: 9.08088719483675, 11.974923318556222)


In [44]:
# Prediction window for the power trend; same values as the grid built for the
# cost regression earlier in the notebook.
pred_start_year, pred_end_year = 2015, 2025
pred_start_date, pred_end_date = (
    f'{year}-01-01' for year in (pred_start_year, pred_end_year)
)

# 100 evenly spaced float years spanning the window, both endpoints included.
year_grid = np.linspace(pred_start_year, pred_end_year, 100)
pred_years = pd.DataFrame({'Publication date (float)': year_grid})
pred_years

Unnamed: 0,Publication date (float)
0,2015.00000
1,2015.10101
2,2015.20202
3,2015.30303
4,2015.40404
...,...
95,2024.59596
96,2024.69697
97,2024.79798
98,2024.89899


In [45]:
# Evaluate the fitted power regression on the prediction grid, then attach a
# datetime version of the float-year axis for plotting.
predicted_power_df = get_predictions(
    power_reg_results,
    pred_years,
    ['Publication date (float)'],
)
predicted_power_years = predicted_power_df['Publication date (float)']
predicted_power_df['Publication date'] = predicted_power_years.apply(float_year_to_datetime)
predicted_power_df

Unnamed: 0,mean,mean_se,mean_ci_lower,mean_ci_upper,obs_ci_lower,obs_ci_upper,Publication date (float),Publication date
0,0.706065,0.185751,0.393470,1.018661,-0.090333,1.502463,2015.00000,2015-01-01
1,0.741392,0.183059,0.433326,1.049457,-0.053239,1.536022,2015.10101,2015-02-06
2,0.776718,0.180373,0.473172,1.080264,-0.016172,1.569607,2015.20202,2015-03-15
3,0.812044,0.177694,0.513007,1.111081,0.020870,1.603218,2015.30303,2015-04-21
4,0.847370,0.175020,0.552832,1.141908,0.057885,1.636855,2015.40404,2015-05-28
...,...,...,...,...,...,...,...,...
95,4.062053,0.120531,3.859215,4.264892,3.302002,4.822104,2024.59596,2024-08-06
96,4.097379,0.122949,3.890471,4.304288,3.336232,4.858527,2024.69697,2024-09-12
97,4.132706,0.125388,3.921693,4.343718,3.370433,4.894978,2024.79798,2024-10-19
98,4.168032,0.127845,3.952885,4.383179,3.404604,4.931459,2024.89899,2024-11-25


## Power plot

In [46]:
label_systems = ['GNMT', 'AlphaGo Master', 'AlphaGo Zero', 'AlphaZero', 'DALL-E', 'GPT-3 175B (davinci)', 'PaLM (540B)', 'Llama 2-70B', 'Falcon 180B', 'GPT-4', 'Gemini 1.0 Ultra', 'Inflection-2']

# fig = px.scatter(
#     filtered_component_cost_df,
#     x='Publication date',
#     y=power_col,
#     log_y=True,
# )

# # Marker color
# fig.update_traces(
#     marker=dict(
#         color='rgb(0,100,200)',
#     ),
#     selector=dict(mode='markers'),
# )

# fig.add_scatter(
#     x=filtered_component_cost_df.loc[filtered_component_cost_df['Model'].isin(label_systems)]['Publication date'],
#     y=filtered_component_cost_df.loc[filtered_component_cost_df['Model'].isin(label_systems)][power_col],
#     text=filtered_component_cost_df.loc[filtered_component_cost_df['Model'].isin(label_systems)]['Model'],
#     mode='text',
#     showlegend=False,
# )

# # Shade in CI
# fig.add_scatter(
#     x=predicted_power_df['Publication date'],
#     y=10**predicted_power_df['mean_ci_lower'],
#     mode='lines',
#     line=dict(width=0),
#     showlegend=False,
# )
# fig.add_scatter(
#     x=predicted_power_df['Publication date'],
#     y=10**predicted_power_df['mean_ci_upper'],
#     mode='lines',
#     fill='tonexty',
#     fillcolor='rgba(0,100,200,0.2)',
#     line=dict(width=0),
#     name='90% CI of mean',
# )
# fig.add_scatter(
#     x=predicted_power_df['Publication date'],
#     y=10**predicted_power_df['mean'],
#     mode='lines',
#     line=dict(color='rgb(0,100,200)'),
#     name=f'Regression mean (growth rate: {10**power_reg_results.params[1]:.1f}x per year)',
# )

# fig.update_traces(textposition='top center')

# # axis limits
# fig.update_xaxes(range=[pred_start_date, pred_end_date])
# # fig.update_xaxes(range=['2015-01-01', '2025-01-01'])  # manual
# # fig.update_yaxes(range=[1, 6])

# # legend on bottom-right of the axes
# fig.update_layout(
#     legend=dict(
#         x=0.45,
#         y=0.05,
#     )
# )

# # axis labels
# fig.update_xaxes(title_text='Publication date')
# fig.update_yaxes(title_text='Power (kW, log scale)')

# # title
# fig.update_layout(title_text='Cluster power required for final training run')

# # update size
# fig.update_layout(
#     autosize=False,
#     width=800,
#     height=600,
#     title_font=dict(
#         size=16,
#     ),
#     title_x=0.5,
# )

# # font size
# fig.update_layout(
#     font=dict(
#         size=14,
#     )
# )

# # margins
# fig.update_layout(margin=dict(l=10, r=10, t=60, b=10))

# save_plot(fig, results_dir, 'power_regression')

# fig.show()