# Setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import os
import pandas as pd
import plotly.express as px

from cost import *
from plotting import *
from prices import *
from imputation import *
from inflation import *
from regression import *
from utils import *

In [3]:
# --- Run configuration -------------------------------------------------------
# Cost model to apply; must be one of the keys of `estimation_method_lookup`.
estimation_method = 'hardware-capex-opex'
# Select systems at or above this percentile of training compute.
compute_percentile_threshold = 50
# Label for this run; in this section it is only used to name the results directory.
variant = 'interconnect-constant-overhead-15pc'

# Dispatch table: method name -> estimator function implementing that method.
estimation_method_lookup = {
    'cloud': estimate_costs,
    'amortized': estimate_amortized_hardware_costs,
    'up-front-server-capex': estimate_upfront_server_capex,
    'hardware-capex-opex': estimate_hardware_capex_opex,
}
# An unknown `estimation_method` fails here with a KeyError.
cost_estimation_function = estimation_method_lookup[estimation_method]

# One output directory per (method, threshold, variant) combination;
# created up front so later cells can write plots and CSVs into it.
results_dir = f'results/{estimation_method}-{compute_percentile_threshold}th-{variant}/'
os.makedirs(results_dir, exist_ok=True)

# Load data

In [4]:
# Load the three input tables; the compute threshold chosen in the
# configuration cell is forwarded to the loader.
loaded_tables = load_data_for_cost_estimation(compute_percentile_threshold=compute_percentile_threshold)
frontier_pcd_df, hardware_df, price_df = loaded_tables

In [5]:
# Row counts of the three loaded tables, in load order.
tuple(len(table) for table in (frontier_pcd_df, hardware_df, price_df))

(148, 5510, 142)

# Cost estimation

In [6]:
# Run the estimator selected in the configuration cell on the loaded tables.
# The call prints a per-system log of the hardware and price matching (see output).
cost_df = cost_estimation_function(frontier_pcd_df, hardware_df, price_df)

==== System: Gemini Ultra ====
Trying Google TPU v4
Could not find hardware model after soft matching: Google TPU v4


==== System: Qwen-72B ====
Could not find hardware model for Qwen-72B


==== System: Inflection-2 ====
Trying NVIDIA H100 SXM5
Found price: 39546.976

==== System: Nemotron-3-8B ====
Trying NVIDIA A100
Soft matching NVIDIA A100 to NVIDIA A100
Soft matching NVIDIA A100 to NVIDIA A100 PCIe
Found price: 22000.0

==== System: Yi-34B ====
Could not find hardware model for Yi-34B


==== System: Skywork-13B ====
Trying NVIDIA A800
Could not find hardware model after soft matching: NVIDIA A800


==== System: ChatGLM3 ====
Could not find hardware model for ChatGLM3


==== System: XGen-7B ====
Trying Google TPU v4
Could not find hardware model after soft matching: Google TPU v4


==== System: Falcon 180B ====
Trying NVIDIA A100 SXM4 40 GB
Found price: 22958.333333333332

==== System: Llama 2-70B ====
Trying NVIDIA A100 SXM4 80 GB
Found price: 25641.25

==== System: Llama 2-34B =

In [7]:
# Preview the result; the estimate is in the last column, 'Cost' (NaN where none was produced).
cost_df

Unnamed: 0,System,Domain,Task,Authors,Notability criteria,Notability criteria notes,Model accessibility,Link,Citations,Reference,...,Batch size notes,Organization categorization,Foundation model,Training compute lower bound,Training compute upper bound,Training chip-hours,Code accessibility,Dataset accessibility,Accessibility notes,Cost
39,Gemini Ultra,Multimodal,"Language modelling,Visual question answering,C...",Gemini Team,SOTA improvement,""" Evaluation on a broad range of benchmarks sh...",Hosted access (no API),https://storage.googleapis.com/deepmind-media/...,252.0,Gemini: A Family of Highly Capable Multimodal ...,...,,Industry,,,,132000000.0,,,,
46,Qwen-72B,Language,"Chat,Code generation","Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Ka...",SOTA improvement,"SOTA on several Chinese benchmarks, with highe...",Permissive license (depr.),https://huggingface.co/Qwen/Qwen-72B,,,...,Table 1 https://arxiv.org/abs/2309.16609\n(thi...,Industry,,,,,,,,
54,Inflection-2,Language,Language modelling,,Significant use,Inflection-2 either already powers Pi or soon ...,API access,https://inflection.ai/inflection-2,,Inflection-2: The Next Step Up,...,,Industry,checked,,,,,,,1.441020e+07
59,Nemotron-3-8B,Language,"Chat,Language generation",,SOTA improvement,"""The Nemotron-3-8B-QA model offers state-of-th...",Permissive license (depr.),https://developer.nvidia.com/blog/nvidia-ai-fo...,,NVIDIA AI Foundation Models: Build Custom Ente...,...,,Industry,,,,,,,,5.065593e+05
83,Yi-34B,Language,Chat,,Significant use,2nd most popular model on HuggingFace: https:/...,Permissive license (depr.),https://arxiv.org/abs/2403.04652,,,...,,Industry,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,GNMT,Language,Translation,"Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc ...",Highly cited,,,https://arxiv.org/abs/1609.08144,6105.0,Google's Neural Machine Translation System: Br...,...,,Industry,,,,414720.0,,,,9.795660e+04
1131,AlphaGo Lee,Games,Go,"David Silver, Aja Huang, Chris J. Maddison, Ar...",Highly cited,,,https://www.nature.com/articles/nature16961,14733.0,Mastering the game of Go with deep neural netw...,...,,Industry,,,,,,,,
1135,ResNet-152 (ImageNet),Vision,Image classification,"Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun",Highly cited,,,https://arxiv.org/abs/1512.03385,154061.0,Deep Residual Learning for Image Recognition,...,,Industry,,,,,,,,
1137,DeepSpeech2 (English),Speech,Speech recognition,"Dario Amodei, Rishita Anubhai, Eric Battenberg...",Highly cited,,,https://arxiv.org/abs/1512.02595,2741.0,Deep Speech 2: End-to-End Speech Recognition i...,...,,Industry,,,,301.0,,,,3.075543e+02


In [8]:
# Number of systems for which a cost estimate was produced (non-null 'Cost').
cost_df['Cost'].count()

50

Uncomment and run the cell below to check data availability for a specific system.

In [9]:
# system = 'WizardLM-7B'
# row = cost_df.loc[cost_df['System'] == system]
# print('Cost:', row['Cost'].values[0])
# print('Training hardware:', row['Training hardware'].values[0])
# print('Training time (hours):', row['Training time (hours)'].values[0])
# print('Hardware quantity:', row['Hardware quantity'].values[0])
# print('Hardware utilization:', row['Hardware utilization'].values[0])

# Apply inflation adjustment

In [10]:
# TODO move to function

In [11]:
# Inspect the publication dates before conversion (datetime64[ns] at this point).
cost_df['Publication date']

39     2023-12-06
46     2023-11-30
54     2023-11-22
59     2023-11-15
83     2023-11-02
          ...    
1095   2016-09-26
1131   2016-01-27
1135   2015-12-10
1137   2015-12-08
1141   2015-10-01
Name: Publication date, Length: 148, dtype: datetime64[ns]

In [12]:
# Replace the datetime column with its string form ('YYYY-MM-DD HH:MM:SS').
# NOTE(review): presumably adjust_column_for_inflation expects string dates - confirm.
cost_df['Publication date'] = cost_df['Publication date'].apply(str)
# Keep the original alias pointing at the stringified column.
from_year_month = cost_df['Publication date']

In [13]:
# Confirm the column now holds strings (dtype: object).
cost_df['Publication date']

39      2023-12-06 00:00:00
46      2023-11-30 00:00:00
54      2023-11-22 00:00:00
59      2023-11-15 00:00:00
83      2023-11-02 00:00:00
               ...         
1095    2016-09-26 00:00:00
1131    2016-01-27 00:00:00
1135    2015-12-10 00:00:00
1137    2015-12-08 00:00:00
1141    2015-10-01 00:00:00
Name: Publication date, Length: 148, dtype: object

In [14]:
# Adjust 'Cost' for inflation; later cells read the result from 'Cost (inflation-adjusted)'.
# NOTE(review): the CSV is presumably the price index series and '2023-12-01' the
# target date (plots label the axis as 2023 USD) - confirm against adjust_column_for_inflation.
cost_df = adjust_column_for_inflation(cost_df, 'Cost', 'data/PCU518210518210.csv', '2023-12-01')

In [15]:
# Inflation-adjusted costs for the systems that have an estimate.
cost_df['Cost (inflation-adjusted)'].dropna()

54      1.444394e+07
59      5.077451e+05
135     2.006101e+07
174     2.670508e+06
175     1.197600e+06
176     2.465648e+05
177     4.696471e+05
186     3.404523e+06
225     4.025194e+05
256     7.129772e+05
263     6.491391e+07
264     6.063113e+05
277     1.170741e+05
279     1.109782e+06
280     9.020657e+04
323     7.059762e+06
329     9.673901e+05
334     1.372773e+06
340     1.701560e+05
374     2.183224e+06
375     1.275417e+06
376     4.036632e+05
389     5.224218e+04
439     1.017571e+06
443     1.660140e+05
472     2.423933e+05
495     1.409082e+05
514     1.394772e+05
542     4.332846e+06
549     7.292925e+05
593     1.398829e+05
608     1.191339e+05
630     3.436159e+04
646     3.414615e+04
655     2.461725e+05
682     1.178885e+04
717     2.071817e+05
718     5.593907e+04
724     3.786403e+06
759     9.835721e+04
809     6.367064e+05
810     1.815856e+05
835     1.334657e+05
982     2.313175e+04
1041    1.845217e+04
1083    9.822699e+02
1088    6.653528e+01
1094    1.114

In [16]:
# Sanity check: the adjustment should keep the same number of non-null costs as 'Cost'.
cost_df['Cost (inflation-adjusted)'].count()

50

# Regression

In [17]:
# The date column was stringified above, so parse it back to datetimes first,
# then convert to a fractional year to use as the regression input.
publication_datetimes = pd.to_datetime(cost_df['Publication date'])
cost_df['Publication date (float)'] = datetime_to_float_year(publication_datetimes)

In [18]:
# OLS fit of inflation-adjusted cost against publication year.
# logy=True: downstream cells treat the fitted values as base-10 logs (they plot 10**prediction).
reg_results = fit_ols_regression(
    cost_df,
    ['Publication date (float)'],
    'Cost (inflation-adjusted)',
    logy=True,
)
reg_results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.548
Model:,OLS,Adj. R-squared:,0.539
Method:,Least Squares,F-statistic:,58.2
Date:,"Tue, 30 Apr 2024",Prob (F-statistic):,8.04e-10
Time:,18:25:00,Log-Likelihood:,-55.897
No. Observations:,50,AIC:,115.8
Df Residuals:,48,BIC:,119.6
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-754.8357,99.647,-7.575,0.000,-955.190,-554.481
x1,0.3761,0.049,7.629,0.000,0.277,0.475

0,1,2,3
Omnibus:,0.696,Durbin-Watson:,2.089
Prob(Omnibus):,0.706,Jarque-Bera (JB):,0.491
Skew:,0.241,Prob(JB):,0.782
Kurtosis:,2.942,Cond. No.,1890000.0


In [19]:
# Report the fitted trend as OOMs/year, a yearly multiplier and a doubling time (see output).
print_growth_rates(reg_results)

0.38 OOMs/year (95% CI: 0.28, 0.48)
2.4x/year (95% CI: 1.9x, 3.0x)
doubling time of 10 months (95% CI: 8, 13)


In [20]:
# Prediction window: also used as the x-axis range of the regression plot.
pred_start_year, pred_end_year = 2015, 2025
pred_start_date, pred_end_date = (
    f'{boundary_year}-01-01' for boundary_year in (pred_start_year, pred_end_year)
)

# 100 evenly spaced fractional years across the window, endpoints included.
year_grid = np.linspace(pred_start_year, pred_end_year, 100)
pred_years = pd.DataFrame({'Publication date (float)': year_grid})
pred_years

Unnamed: 0,Publication date (float)
0,2015.00000
1,2015.10101
2,2015.20202
3,2015.30303
4,2015.40404
...,...
95,2024.59596
96,2024.69697
97,2024.79798
98,2024.89899


In [21]:

# Evaluate the fitted model on the prediction grid (mean, standard error and interval columns).
predicted_cost_df = get_predictions(reg_results, pred_years, ['Publication date (float)'])
# Add a datetime version of the fractional years so predictions share the plots' date axis.
grid_float_years = predicted_cost_df['Publication date (float)']
predicted_cost_df['Publication date'] = grid_float_years.apply(float_year_to_datetime)
predicted_cost_df

Unnamed: 0,mean,mean_se,mean_ci_lower,mean_ci_upper,obs_ci_lower,obs_ci_upper,Publication date (float),Publication date
0,2.955732,0.336233,2.279690,3.631775,1.293355,4.618109,2015.00000,2015-01-01
1,2.993720,0.331516,2.327162,3.660277,1.335178,4.652262,2015.10101,2015-02-06
2,3.031707,0.326806,2.374619,3.688795,1.376948,4.686466,2015.20202,2015-03-15
3,3.069695,0.322105,2.422060,3.717329,1.418666,4.720723,2015.30303,2015-04-21
4,3.107682,0.317411,2.469483,3.745880,1.460332,4.755032,2015.40404,2015-05-28
...,...,...,...,...,...,...,...,...
95,6.564534,0.187597,6.187345,6.941723,4.999691,8.129378,2024.59596,2024-08-06
96,6.602522,0.191711,6.217060,6.987983,5.035664,8.169379,2024.69697,2024-09-12
97,6.640509,0.195866,6.246695,7.034323,5.071575,8.209443,2024.79798,2024-10-19
98,6.678496,0.200058,6.276254,7.080739,5.107426,8.249567,2024.89899,2024-11-25


# Plots

In [22]:
# Figure title for each estimation method; keys mirror `estimation_method_lookup`.
plot_title_lookup = dict(
    [
        ('cloud', 'Cloud compute cost to train large ML systems'),
        ('amortized', 'Amortized cost of hardware to train large ML systems'),
        ('up-front-server-capex', 'Acquisition cost of hardware to train large ML systems'),
        ('hardware-capex-opex', 'Hardware CapEx + OpEx to train large ML systems'),
    ]
)

In [23]:
# Scatter of inflation-adjusted cost against publication date, with every
# point labelled by its system name; cost is shown on a log axis.
fig = px.scatter(
    cost_df,
    x='Publication date',
    y='Cost (inflation-adjusted)',
    text='System',
    log_y=True,
)
fig.update_traces(textposition='top center')

# Axis labels. The x-range reuses the prediction window (pred_start_date /
# pred_end_date) instead of repeating the literal dates, so this figure and
# the regression figure stay aligned if the window is ever changed.
fig.update_xaxes(
    title_text='Publication date',
    range=[pred_start_date, pred_end_date],
)
fig.update_yaxes(title_text='Cost (2023 USD)')

# Title, no legend, fixed figure size, font size and tight margins.
fig.update_layout(
    title_text=plot_title_lookup[estimation_method],
    showlegend=False,
    autosize=False,
    width=800,
    height=600,
    font=dict(size=14),
    margin=dict(l=10, r=10, t=40, b=10),
)

save_plot(fig, results_dir, 'cost_scatter')

fig.show()

In [24]:
# Only these systems get a text label, so the regression figure stays readable.
label_systems = ['GNMT', 'Megatron-BERT', 'GPT-3 175B (davinci)', 'PaLM (540B)', 'Llama 2-70B', 'Falcon 180B', 'GPT-4', 'Gemini Ultra']

# Base trace: the highlighted systems, drawn with their names as text.
fig = px.scatter(
    cost_df.loc[cost_df['System'].isin(label_systems)],
    x='Publication date',
    y='Cost (inflation-adjusted)',
    text='System',
    log_y=True,
)

# All systems as unlabelled markers.
fig.add_scatter(
    x=cost_df['Publication date'],
    y=cost_df['Cost (inflation-adjusted)'],
    mode='markers',
    showlegend=False,
)

# Marker colour, applied to traces whose mode is exactly 'markers'.
fig.update_traces(
    marker=dict(
        color='rgb(0,100,200)',
    ),
    selector=dict(mode='markers'),
)

# Shade the 95% confidence band of the regression mean: an invisible lower
# bound first, then the upper bound filled down to it ('tonexty').
# Predictions are raised as powers of 10 before plotting against the log axis.
fig.add_scatter(
    x=predicted_cost_df['Publication date'],
    y=10**predicted_cost_df['mean_ci_lower'],
    mode='lines',
    line=dict(width=0),
    showlegend=False,
)
fig.add_scatter(
    x=predicted_cost_df['Publication date'],
    y=10**predicted_cost_df['mean_ci_upper'],
    mode='lines',
    fill='tonexty',
    fillcolor='rgba(0,100,200,0.2)',
    line=dict(width=0),
    name='95% CI of mean',
)

# Slope of the fit: position 1 follows the intercept (`const`) in the parameters.
# np.asarray keeps the lookup positional whether `params` is an ndarray or a
# label-indexed Series (integer keys on a labelled Series are deprecated).
growth_factor_per_year = 10 ** np.asarray(reg_results.params)[1]
fig.add_scatter(
    x=predicted_cost_df['Publication date'],
    y=10**predicted_cost_df['mean'],
    mode='lines',
    line=dict(color='rgb(0,100,200)'),
    name=f'Regression mean (growth rate: {growth_factor_per_year:.1f}x per year)',
)

fig.update_traces(textposition='top center')

# Axis labels; the x-range matches the prediction window.
fig.update_xaxes(
    title_text='Publication date',
    range=[pred_start_date, pred_end_date],
)
fig.update_yaxes(title_text='Cost (2023 USD)')

fig.update_layout(
    # horizontal legend, centred below the plot area
    legend=dict(
        orientation='h',
        yanchor='top',
        y=-0.15,
        xanchor='center',
        x=0.5,
    ),
    # centred title
    title_text=plot_title_lookup[estimation_method],
    title_x=0.5,
    # fixed figure size
    autosize=False,
    width=800,
    height=600,
    # font size
    font=dict(size=14),
    # margins
    margin=dict(l=10, r=10, t=40, b=10),
)

save_plot(fig, results_dir, 'cost_regression')

fig.show()

# Export data

In [25]:
# Columns written to the exported dataset, in output order.
keep_cols = [
    # identification and metadata
    'System',
    'Domain',
    'Task',
    'Model accessibility',
    'Reference',
    'Publication date',
    'Organization',
    # model and training details
    'Parameters',
    'Training compute (FLOP)',
    'Training dataset size (datapoints)',
    'Epochs',
    'Training time (hours)',
    'Training hardware',
    'Country (from Organization)',
    'Base model',
    'Finetune compute (FLOP)',
    'Hardware quantity',
    'Hardware utilization',
    'Training cloud compute vendor',
    'Training data center',
    # 'Training time (chip hours)',
    # cost estimates
    'Cost',
    'Cost (inflation-adjusted)',
]
# Preview the export; a missing column raises a KeyError here.
cost_df.loc[:, keep_cols]

Unnamed: 0,System,Domain,Task,Model accessibility,Reference,Publication date,Organization,Parameters,Training compute (FLOP),Training dataset size (datapoints),...,Training hardware,Country (from Organization),Base model,Finetune compute (FLOP),Hardware quantity,Hardware utilization,Training cloud compute vendor,Training data center,Cost,Cost (inflation-adjusted)
39,Gemini Ultra,Multimodal,"Language modelling,Visual question answering,C...",Hosted access (no API),Gemini: A Family of Highly Capable Multimodal ...,2023-12-06 00:00:00,Google DeepMind,,5.000000e+25,,...,Google TPU v4,Multinational,,,55000.0,,,,,
46,Qwen-72B,Language,"Chat,Code generation",Permissive license (depr.),,2023-11-30 00:00:00,Alibaba,7.200000e+10,1.300000e+24,,...,,China,,,,,,,,
54,Inflection-2,Language,Language modelling,API access,Inflection-2: The Next Step Up,2023-11-22 00:00:00,Inflection AI,,1.001000e+25,,...,NVIDIA H100 SXM5,United States of America,,,5000.0,,,,1.441020e+07,1.444394e+07
59,Nemotron-3-8B,Language,"Chat,Language generation",Permissive license (depr.),NVIDIA AI Foundation Models: Build Custom Ente...,2023-11-15 00:00:00,NVIDIA,8.000000e+09,1.800000e+23,,...,NVIDIA A100,United States of America,,,1024.0,0.34,,,5.065593e+05,5.077451e+05
83,Yi-34B,Language,Chat,Permissive license (depr.),,2023-11-02 00:00:00,01.AI,3.400000e+10,6.100000e+23,,...,,China,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,GNMT,Language,Translation,,Google's Neural Machine Translation System: Br...,2016-09-26 00:00:00,Google,2.780000e+08,6.900000e+21,360000000.0,...,NVIDIA Tesla K80,United States of America,,,96.0,,,,9.795660e+04,1.069704e+05
1131,AlphaGo Lee,Games,Go,,Mastering the game of Go with deep neural netw...,2016-01-27 00:00:00,DeepMind,,1.900000e+21,29400000.0,...,,United Kingdom of Great Britain and Northern I...,,,,,,,,
1135,ResNet-152 (ImageNet),Vision,Image classification,,Deep Residual Learning for Image Recognition,2015-12-10 00:00:00,Microsoft,6.000000e+07,1.210000e+19,1280000.0,...,,United States of America,,,,,,,,
1137,DeepSpeech2 (English),Speech,Speech recognition,,Deep Speech 2: End-to-End Speech Recognition i...,2015-12-08 00:00:00,Baidu Research - Silicon Valley AI Lab,3.800000e+07,2.600000e+19,163339200.0,...,NVIDIA GTX Titan X,United States of America,,,,0.45,,,3.075543e+02,3.421326e+02


In [26]:
# Write the selected columns next to the plots; the DataFrame index is not exported.
export_path = os.path.join(results_dir, 'price dataset.csv')
cost_df[keep_cols].to_csv(export_path, index=False)