# Setup

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to the local
# helper modules imported below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import os
import pandas as pd
import plotly.express as px

from cost import *
from plotting import *
from prices import *
from imputation import *
from inflation import *
from regression import *
from utils import *

In [3]:
# Output directory for everything this notebook writes (saved plots and the
# exported CSV). Created up front; exist_ok=True makes re-running the cell safe.
results_dir = 'results/ai-index-final/'
os.makedirs(results_dir, exist_ok=True)

# Load data

In [4]:
# Load the three input tables consumed by the cost estimation step.
# NOTE(review): the recorded output shows a pandas SettingWithCopyWarning raised
# while assigning 'Training compute (FLOP)'. It appears to originate inside the
# loader rather than in this cell, so the fix belongs there — confirm.
frontier_pcd_df, hardware_df, price_df = load_data_for_cost_estimation()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frontier_pcd_df.loc[:, 'Training compute (FLOP)'] = pd.to_numeric(


In [5]:
# Row counts of the three loaded tables, as a quick sanity check on the load.
len(frontier_pcd_df), len(hardware_df), len(price_df)

(138, 5509, 73)

# Cost estimation

In [6]:
# Estimate a training cost per system from the three input tables.
# The call prints a per-system log of its hardware/price lookups (see output).
# Rows for which no estimate was produced show up with a missing 'Cost' value.
cost_df = estimate_costs(frontier_pcd_df, hardware_df, price_df)

==== System: Gemini Ultra ====
Trying Google TPU v4 at 2023-06-29 00:00:00
Trying Google Cloud, Price per chip-hour (3-year CUD)
Found price: 1.45 at 2023-06-01 00:00:00
Difference between purchase time and price date: 28 days 00:00:00 

==== System: Qwen-72B ====
Training time is required but no value found

==== System: Inflection-2 ====
Training time is required but no value found

==== System: Nemotron-3-8B ====
Trying NVIDIA A100 at 2023-08-28 00:00:00
Trying Amazon Web Services, Price per chip-hour (3-year CUD)
Found price: 1.45 at 2022-12-09 00:00:00
Difference between purchase time and price date: 262 days 00:00:00 

==== System: CogVLM ====
Training time is required but no value found

==== System: Yi-34B ====
Training time is required but no value found

==== System: Skywork-13B ====
Trying NVIDIA A800 at 2023-07-22 20:00:00
Trying Amazon Web Services, Price per chip-hour (3-year CUD)
Could not find price

Trying Microsoft Azure, Price per chip-hour (3-year CUD)
Could not fin

In [7]:
# Inspect the estimation result (pandas truncates the display of the full frame).
cost_df

Unnamed: 0,System,Domain,Task,Authors,Notability criteria,Notability criteria notes,Open-source,Link,Citations,Reference,...,Archived links,Batch size,Batch size notes,Organization categorization,Foundation model,Training compute lower bound,Training compute upper bound,Training chip-hours,Training time (chip hours),Cost
38,Gemini Ultra,Multimodal,"Language modelling,Visual question answering,C...",Gemini Team,SOTA improvement,""" Evaluation on a broad range of benchmarks sh...",,https://storage.googleapis.com/deepmind-media/...,252.0,Gemini: A Family of Highly Capable Multimodal ...,...,,,,Industry,,,,132000000,132000000.0,191400000.0
45,Qwen-72B,Language,"Chat,Code generation","Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Ka...",SOTA improvement,"SOTA on several Chinese benchmarks, with highe...",Permissive license,https://huggingface.co/Qwen/Qwen-72B,,,...,,4000000.0,Table 1 https://arxiv.org/abs/2309.16609\n(thi...,Industry,,,,0,,
53,Inflection-2,Language,Language modelling,,Significant use,Inflection-2 either already powers Pi or soon ...,API accessible,https://inflection.ai/inflection-2,,Inflection-2: The Next Step Up,...,,,,Industry,checked,,,0,,
58,Nemotron-3-8B,Language,"Chat,Language generation",,SOTA improvement,"""The Nemotron-3-8B-QA model offers state-of-th...",Permissive license,https://developer.nvidia.com/blog/nvidia-ai-fo...,,NVIDIA AI Foundation Models: Build Custom Ente...,...,,,,Industry,,,,0,,
75,CogVLM,"Multimodal,Vision,Language","Image captioning,Visual question answering,Chat","Weihan Wang, Qingsong Lv, Wenmeng Yu, Wenyi Ho...",SOTA improvement,"""CogVLM-17B\nachieves state-of-the-art perform...",Permissive license,https://arxiv.org/abs/2311.03079\nhttps://hugg...,43.0,CogVLM: Visual Expert for Pretrained Language ...,...,,,,"Academia,Industry,Academia",checked,,,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1094,GNMT,Language,Translation,"Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc ...",Highly cited,,,https://arxiv.org/abs/1609.08144,6105.0,Google's Neural Machine Translation System: Br...,...,,,,Industry,,,,414720,414720.0,178329.6
1130,AlphaGo Lee,Games,Go,"David Silver, Aja Huang, Chris J. Maddison, Ar...",Highly cited,,,https://www.nature.com/articles/nature16961,14733.0,Mastering the game of Go with deep neural netw...,...,,,,Industry,,,,0,,
1134,ResNet-152 (ImageNet),Vision,Image classification,"Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun",Highly cited,,,https://arxiv.org/abs/1512.03385,154061.0,Deep Residual Learning for Image Recognition,...,,,,Industry,,,,0,,
1136,DeepSpeech2 (English),Speech,Speech recognition,"Dario Amodei, Rishita Anubhai, Eric Battenberg...",Highly cited,,,https://arxiv.org/abs/1512.02595,2741.0,Deep Speech 2: End-to-End Speech Recognition i...,...,,,,Industry,,,,0,,


In [8]:
# Number of systems that received a cost estimate (non-missing 'Cost').
cost_df['Cost'].count()

48

Use the cell below to check data availability for a specific system.

In [9]:
# Uncomment to inspect the inputs and result for one system.
# cost_df is indexed by row number (the system name lives in the 'System'
# column), so rows are selected by matching that column rather than by label.
# system = 'AlphaGo Fan'
# cols = ['Cost', 'Training hardware', 'Training time (hours)', 'Hardware quantity', 'Hardware utilization']
# cost_df.loc[cost_df['System'] == system, cols]

# Apply inflation adjustment

In [10]:
# TODO move to function

In [11]:
# Publication dates before conversion: the output shows dtype datetime64[ns].
cost_df['Publication date']

38     2023-12-06
45     2023-11-30
53     2023-11-22
58     2023-11-15
75     2023-11-06
          ...    
1094   2016-09-26
1130   2016-01-27
1134   2015-12-10
1136   2015-12-08
1140   2015-10-01
Name: Publication date, Length: 138, dtype: datetime64[ns]

In [12]:
# Replace the datetime64 column with its element-wise str() form, i.e. full
# 'YYYY-MM-DD HH:MM:SS' strings (despite the variable name, day and time are kept).
# NOTE(review): presumably adjust_column_for_inflation expects string dates — confirm.
# The column stays a string from here on: it is parsed back with pd.to_datetime
# for the regression, and the string form is what ends up in the exported CSV.
from_year_month = cost_df['Publication date'].apply(str)
cost_df['Publication date'] = from_year_month

In [13]:
# Publication dates after conversion: the output shows strings (dtype object).
cost_df['Publication date']

38      2023-12-06 00:00:00
45      2023-11-30 00:00:00
53      2023-11-22 00:00:00
58      2023-11-15 00:00:00
75      2023-11-06 00:00:00
               ...         
1094    2016-09-26 00:00:00
1130    2016-01-27 00:00:00
1134    2015-12-10 00:00:00
1136    2015-12-08 00:00:00
1140    2015-10-01 00:00:00
Name: Publication date, Length: 138, dtype: object

In [14]:
# Inflation-adjust the 'Cost' column; the result is read back below from the
# 'Cost (inflation-adjusted)' column.
# NOTE(review): 'data/PCU518210518210.csv' is presumably the price-index series
# used as the deflator and '2023-12-01' the reference date (the plots label the
# axis as 2023 USD) — confirm against adjust_column_for_inflation.
cost_df = adjust_column_for_inflation(cost_df, 'Cost', 'data/PCU518210518210.csv', '2023-12-01')

In [15]:
# Inflation-adjusted costs, restricted to the systems that have an estimate.
cost_df['Cost (inflation-adjusted)'].dropna()

38      1.914000e+08
134     2.581632e+07
173     3.931897e+06
224     5.856264e+05
255     9.529677e+05
262     7.835203e+07
263     8.103978e+05
278     1.503763e+06
333     1.990912e+06
348     1.252379e+07
369     2.547193e+05
374     1.635573e+06
395     1.347019e+07
431     3.619903e+04
435     5.610850e+05
438     1.493975e+06
439     8.150695e+05
447     1.238906e+07
469     1.319586e+06
471     3.108410e+05
494     1.930600e+05
498     2.093016e+06
502     3.499808e+06
513     1.724109e+05
537     6.417537e+04
541     6.405653e+06
548     9.869716e+05
578     1.655001e+05
636     2.513914e+05
651     6.197414e+05
714     9.691116e+05
723     4.324883e+06
763     6.971103e+05
790     3.889027e+05
794     2.366316e+05
808     8.817783e+05
809     2.160704e+05
834     1.600176e+05
884     6.387439e+02
917     3.287635e+03
920     1.164818e+04
921     5.358052e+03
931     4.434402e+03
983     1.381309e+02
1040    3.344073e+04
1045    9.299259e+02
1093    2.028533e+04
1094    1.947

In [16]:
# Number of systems with an inflation-adjusted cost; should match the number
# of raw cost estimates counted earlier.
cost_df['Cost (inflation-adjusted)'].count()

48

# Regression

In [17]:
# Numeric predictor for the regression. 'Publication date' holds strings at this
# point, hence the pd.to_datetime round-trip before the conversion.
# NOTE(review): datetime_to_float_year presumably returns fractional years
# (e.g. 2015.10101 pairs with 2015-02-06 in the prediction table) — confirm.
cost_df['Publication date (float)'] = datetime_to_float_year(pd.to_datetime(cost_df['Publication date']))

In [18]:
# OLS fit of inflation-adjusted cost against publication year, with logy=True.
# The fitted values are later exponentiated with 10**, and the slope is reported
# in orders of magnitude per year, so the response is evidently log10(cost).
reg_results = fit_ols_regression(cost_df, ['Publication date (float)'], 'Cost (inflation-adjusted)', logy=True)
reg_results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.533
Model:,OLS,Adj. R-squared:,0.523
Method:,Least Squares,F-statistic:,52.48
Date:,"Thu, 07 Mar 2024",Prob (F-statistic):,3.93e-09
Time:,16:07:05,Log-Likelihood:,-60.947
No. Observations:,48,AIC:,125.9
Df Residuals:,46,BIC:,129.6
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-953.2855,132.365,-7.202,0.000,-1219.723,-686.848
x1,0.4744,0.065,7.244,0.000,0.343,0.606

0,1,2,3
Omnibus:,0.557,Durbin-Watson:,1.592
Prob(Omnibus):,0.757,Jarque-Bera (JB):,0.693
Skew:,-0.187,Prob(JB):,0.707
Kurtosis:,2.546,Cond. No.,2110000.0


In [19]:
# Report the fitted slope as OOMs/year, a yearly multiplier and a doubling time
# (each with its 95% CI), as shown in the output.
print_growth_rates(reg_results)

0.47 OOMs/year (95% CI: 0.34, 0.61)
3.0x/year (95% CI: 2.2x, 4.0x)
doubling time of 8 months (95% CI: 6, 11)


In [20]:
# Prediction window for the regression line; the date strings double as the
# x-axis limits of the regression plot.
pred_start_year, pred_end_year = 2015, 2025
pred_start_date, pred_end_date = (
    f'{year}-01-01' for year in (pred_start_year, pred_end_year)
)

# 100 evenly spaced fractional years spanning the window (both ends included).
pred_years = pd.DataFrame(
    np.linspace(pred_start_year, pred_end_year, num=100),
    columns=['Publication date (float)'],
)
pred_years

Unnamed: 0,Publication date (float)
0,2015.00000
1,2015.10101
2,2015.20202
3,2015.30303
4,2015.40404
...,...
95,2024.59596
96,2024.69697
97,2024.79798
98,2024.89899


In [21]:

# Evaluate the fitted regression on the prediction grid. The resulting
# mean / CI columns are in log10(cost) units (the plotting code applies 10**).
predicted_cost_df = get_predictions(reg_results, pred_years, ['Publication date (float)'])
# Add a datetime version of the grid so the predictions share the plots' date axis.
predicted_cost_df['Publication date'] = predicted_cost_df['Publication date (float)'].apply(float_year_to_datetime)
predicted_cost_df

Unnamed: 0,mean,mean_se,mean_ci_lower,mean_ci_upper,obs_ci_lower,obs_ci_upper,Publication date (float),Publication date
0,2.673192,0.420160,1.827453,3.518931,0.710444,4.635939,2015.00000,2015-01-01
1,2.721113,0.413859,1.888057,3.554169,0.763798,4.678429,2015.10101,2015-02-06
2,2.769034,0.407568,1.948642,3.589427,0.817075,4.720994,2015.20202,2015-03-15
3,2.816956,0.401288,2.009205,3.624706,0.870276,4.763636,2015.30303,2015-04-21
4,2.864877,0.395018,2.069747,3.660007,0.923400,4.806354,2015.40404,2015-05-28
...,...,...,...,...,...,...,...,...
95,7.225718,0.260940,6.700474,7.750963,5.378292,9.073144,2024.59596,2024-08-06
96,7.273639,0.266738,6.736724,7.810555,5.422861,9.124418,2024.69697,2024-09-12
97,7.321561,0.272573,6.772899,7.870223,5.467341,9.175781,2024.79798,2024-10-19
98,7.369482,0.278444,6.809004,7.929960,5.511731,9.227233,2024.89899,2024-11-25


# Plots

In [22]:
# Exploratory scatter of every system with a cost estimate, each point labelled
# with its system name. The y-axis is logarithmic.
fig = px.scatter(
    cost_df,
    x='Publication date',
    y='Cost (inflation-adjusted)',
    text='System',
    log_y=True,
)
fig.update_traces(textposition='top center')

# Title, fixed figure size, font, tight margins; no legend (single trace).
fig.update_layout(
    title_text='Cost of cloud compute to train large ML systems',
    showlegend=False,
    autosize=False,
    width=800,
    height=600,
    font=dict(size=14),
    margin=dict(l=10, r=10, t=40, b=10),
)

# Axis labels; the x-range reuses the prediction window so that this figure and
# the regression figure always cover the same period.
fig.update_xaxes(
    title_text='Publication date',
    range=[pred_start_date, pred_end_date],
)
fig.update_yaxes(title_text='Cost (2023 USD)')

# Saved under its own name: the regression figure further down is saved as
# 'cost_scatter', which previously overwrote this plot on every run.
save_plot(fig, results_dir, 'cost_scatter_all_labels')

fig.show()

In [23]:
# Systems that get a text label in the figure; every other system is drawn as
# an unlabelled marker.
label_systems = [
    'GNMT',
    'Megatron-BERT',
    'GPT-3 175B (davinci)',
    'PaLM (540B)',
    'Llama 2-70B',
    'Falcon 180B',
    'GPT-4',
    'Gemini Ultra',
]

# Base trace: only the labelled systems, with their names as text.
fig = px.scatter(
    cost_df[cost_df['System'].isin(label_systems)],
    x='Publication date',
    y='Cost (inflation-adjusted)',
    text='System',
    log_y=True,
)

# All systems as plain markers.
fig.add_scatter(
    x=cost_df['Publication date'],
    y=cost_df['Cost (inflation-adjusted)'],
    mode='markers',
    showlegend=False,
)

# Colour the marker-only trace(s); the selector matches on mode == 'markers'.
fig.update_traces(
    marker_color='rgb(0,100,200)',
    selector={'mode': 'markers'},
)

# Confidence band of the regression mean: an invisible lower-bound line, then
# the upper bound filled down to it. Predictions are in log10 units, hence 10**.
fig.add_scatter(
    x=predicted_cost_df['Publication date'],
    y=10**predicted_cost_df['mean_ci_lower'],
    mode='lines',
    line={'width': 0},
    showlegend=False,
)
fig.add_scatter(
    x=predicted_cost_df['Publication date'],
    y=10**predicted_cost_df['mean_ci_upper'],
    mode='lines',
    fill='tonexty',
    fillcolor='rgba(0,100,200,0.2)',
    line={'width': 0},
    name='95% CI of mean',
)

# Regression mean, with the yearly growth factor quoted in the legend entry.
fig.add_scatter(
    x=predicted_cost_df['Publication date'],
    y=10**predicted_cost_df['mean'],
    mode='lines',
    line={'color': 'rgb(0,100,200)'},
    name=f'Regression mean (growth rate: {10**reg_results.params[1]:.1f}x per year)',
)

fig.update_traces(textposition='top center')

# Layout in one call: horizontal legend centred just below the plot area
# (y=-0.15), centred title, fixed size, font and tight margins.
fig.update_layout(
    legend={
        'orientation': 'h',
        'yanchor': 'top',
        'y': -0.15,
        'xanchor': 'center',
        'x': 0.5,
    },
    title_text='Cost of cloud compute to train large ML systems',
    title_x=0.5,
    autosize=False,
    width=800,
    height=600,
    font={'size': 14},
    margin={'l': 10, 'r': 10, 't': 40, 'b': 10},
)

# Axis labels; the x-axis is limited to the prediction window.
fig.update_xaxes(
    title_text='Publication date',
    range=[pred_start_date, pred_end_date],
)
fig.update_yaxes(title_text='Cost (2023 USD)')

save_plot(fig, results_dir, 'cost_scatter')

fig.show()

# Export data

In [24]:
# Columns to export: identifying metadata, the training/hardware inputs, and
# the raw and inflation-adjusted cost. All other columns of cost_df are dropped
# from the export.
keep_cols = [
    'System',
    'Domain',
    'Task',
    'Open-source',
    'Reference',
    'Publication date',
    'Organization',
    'Parameters',
    'Training compute (FLOP)',
    'Training dataset size (datapoints)',
    'Epochs',
    'Training time (hours)',
    'Training hardware',
    'Country (from Organization)',
    'Base model',
    'Finetune compute (FLOP)',
    'Hardware quantity',
    'Hardware utilization',
    'Training cloud compute vendor',
    'Training data center',
    'Training time (chip hours)',
    'Cost',
    'Cost (inflation-adjusted)',
]
# Preview the selection before writing it out.
cost_df[keep_cols]

Unnamed: 0,System,Domain,Task,Open-source,Reference,Publication date,Organization,Parameters,Training compute (FLOP),Training dataset size (datapoints),...,Country (from Organization),Base model,Finetune compute (FLOP),Hardware quantity,Hardware utilization,Training cloud compute vendor,Training data center,Training time (chip hours),Cost,Cost (inflation-adjusted)
38,Gemini Ultra,Multimodal,"Language modelling,Visual question answering,C...",,Gemini: A Family of Highly Capable Multimodal ...,2023-12-06 00:00:00,Google DeepMind,,5.000000e+25,,...,Multinational,,,55000.0,,,,132000000.0,191400000.0,1.914000e+08
45,Qwen-72B,Language,"Chat,Code generation",Permissive license,,2023-11-30 00:00:00,Alibaba,7.200000e+10,1.300000e+24,,...,China,,,,,,,,,
53,Inflection-2,Language,Language modelling,API accessible,Inflection-2: The Next Step Up,2023-11-22 00:00:00,Inflection AI,,1.001000e+25,,...,United States of America,,,5000.0,,,,,,
58,Nemotron-3-8B,Language,"Chat,Language generation",Permissive license,NVIDIA AI Foundation Models: Build Custom Ente...,2023-11-15 00:00:00,NVIDIA,8.000000e+09,1.800000e+23,,...,United States of America,,,,,,,,,
75,CogVLM,"Multimodal,Vision,Language","Image captioning,Visual question answering,Chat",Permissive license,CogVLM: Visual Expert for Pretrained Language ...,2023-11-06 00:00:00,"Tsinghua University,Zhipu AI,Beihang University",1.700000e+10,1.988064e+22,,...,"China,China,China",,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1094,GNMT,Language,Translation,,Google's Neural Machine Translation System: Br...,2016-09-26 00:00:00,Google,2.780000e+08,6.900000e+21,360000000.0,...,Multinational,,,96.0,,,,414720.0,178329.6,1.947392e+05
1130,AlphaGo Lee,Games,Go,,Mastering the game of Go with deep neural netw...,2016-01-27 00:00:00,DeepMind,,1.900000e+21,29400000.0,...,United Kingdom of Great Britain and Northern I...,,,,,,,,,
1134,ResNet-152 (ImageNet),Vision,Image classification,,Deep Residual Learning for Image Recognition,2015-12-10 00:00:00,Microsoft,6.000000e+07,1.210000e+19,1280000.0,...,Multinational,,,,,,,,,
1136,DeepSpeech2 (English),Speech,Speech recognition,,Deep Speech 2: End-to-End Speech Recognition i...,2015-12-08 00:00:00,Baidu Research - Silicon Valley AI Lab,3.800000e+07,2.600000e+19,163339200.0,...,United States of America,,,,,,,,,


In [25]:
# Write the selected columns to the results directory without the row index.
# os.path.join keeps the path valid whether or not results_dir ends in a slash.
cost_df[keep_cols].to_csv(os.path.join(results_dir, 'price dataset.csv'), index=False)