# Miscellaneous data exploration

In [1]:
from collections import defaultdict
import numpy as np
import pandas as pd
import plotly.graph_objects as go

from data import *
from hardware import *
from parameters import *
from regression import *

In [2]:
# Load the three DataFrames returned by load_data_for_cost_estimation and display the first one.
# NOTE(review): with compute_threshold=0 the "window_percentile" filter presumably keeps every
# model -- confirm against the loader's implementation.
frontier_pcd_df, hardware_df, price_df = load_data_for_cost_estimation(compute_threshold_method="window_percentile", compute_threshold=0)
frontier_pcd_df

Unnamed: 0,System,Domain,Task,Authors,Notability criteria,Notability criteria notes,Model accessibility,Link,Citations,Reference,...,Training compute lower bound,Training compute upper bound,Training chip-hours,Code accessibility,Dataset accessibility,Accessibility notes,Organization categorization (from Organization),Possibly over 1e23 FLOP,Training cost trends 2,Training cost trends 3
77,CogAgent,"Vision,Language","Instruction interpretation,Visual question ans...","Wenyi Hong, Weihan Wang, Qingsong Lv, Jiazheng...",SOTA improvement,See Table 1,Open access (restricted use),https://arxiv.org/abs/2312.08914,50.0,CogAgent: A Visual Language Model for GUI Agents,...,,,,Open source,,Code is Apache License 2.0; model is under a m...,"Academia,Industry",,,
78,FunSearch,"Language,Search",Code generation,"Bernardino Romera-Paredes, Mohammadamin Bareka...","SOTA improvement,Historical significance",Improved SOTA for the cap set problem. Can pla...,Open source,https://www.nature.com/articles/s41586-023-069...,67.0,Mathematical discoveries from program search w...,...,,,,,,Code to run FunSearch with an LLM of your choi...,Industry,,,
88,Gemini 1.0 Ultra,"Multimodal,Language,Vision","Language modelling,Visual question answering,C...",Gemini Team,SOTA improvement,""" Evaluation on a broad range of benchmarks sh...",Hosted access (no API),https://storage.googleapis.com/deepmind-media/...,633.0,Gemini: A Family of Highly Capable Multimodal ...,...,,,132000000.0,,,,Industry,,,
96,Qwen-72B,Language,"Chat,Code generation","Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Ka...",SOTA improvement,"SOTA on several Chinese benchmarks, with highe...",Open access (restricted use),https://huggingface.co/Qwen/Qwen-72B,,,...,,,,Unreleased,Unreleased,up to 100m active users:\nhttps://github.com/Q...,Industry,,,
104,Inflection-2,Language,Language modelling,,Significant use,Inflection-2 either already powers Pi or soon ...,Hosted access (no API),https://inflection.ai/inflection-2,,Inflection-2: The Next Step Up,...,,,,,,"via Pi, no API",Industry,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1222,AlphaGo Lee,Games,Go,"David Silver, Aja Huang, Chris J. Maddison, Ar...",Highly cited,,,https://www.nature.com/articles/nature16961,14887.0,Mastering the game of Go with deep neural netw...,...,,,,,,,Industry,,,
1223,"Variational (untied weights, MC) LSTM (Large)",Language,,"Yarin Gal, Zoubin Ghahramani","Highly cited,SOTA improvement","""The new approach outperforms existing techniq...",,https://arxiv.org/abs/1512.05287?context=stat,1838.0,A Theoretically Grounded Application of Dropou...,...,,,,,,,Academia,,,
1226,ResNet-152 (ImageNet),Vision,Image classification,"Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun",Highly cited,,,https://arxiv.org/abs/1512.03385,156882.0,Deep Residual Learning for Image Recognition,...,,,,,,,Industry,,,
1228,DeepSpeech2 (English),Speech,Speech recognition,"Dario Amodei, Rishita Anubhai, Eric Battenberg...",Highly cited,,,https://arxiv.org/abs/1512.02595,2749.0,Deep Speech 2: End-to-End Speech Recognition i...,...,,,1920.0,,,,Industry,,,


In [3]:
# Tally how often each training chip appears in the dataset.
# Entries are grouped by case-sensitive substring match against the aliases below;
# anything that matches no alias is counted under its full name.
hardware_aliases = ['A100', 'H100', 'P100', 'V100', 'TPU v4', 'TPU v3', 'TPU v2', 'TPU v1', 'K80', 'K40']
hardware_counts = defaultdict(int)
hardware_release_dates = {}

# Entries containing a comma (several chips listed together) are left out of this tally.
single_chip_entries = (
    entry for entry in frontier_pcd_df['Training hardware'].dropna() if "," not in entry
)
for hardware in single_chip_entries:
    matched_aliases = [alias for alias in hardware_aliases if alias in hardware]
    for alias in matched_aliases:
        # Later entries overwrite earlier ones, so the stored date comes from the
        # last entry in the column that matched this alias.
        hardware_release_dates[alias] = get_release_date(hardware, hardware_df)
        hardware_counts[alias] += 1
    if not matched_aliases:
        hardware_counts[hardware] += 1

# Print counts in descending order
ranked_counts = sorted(hardware_counts.items(), key=lambda item: item[1], reverse=True)
for hardware, count in ranked_counts:
    print(f'{hardware}: {count}')

A100: 52
V100: 48
TPU v3: 38
TPU v4: 18
TPU v2: 4
P100: 4
K80: 3
NVIDIA GTX Titan X: 3
NVIDIA Geforce GTX 1080 Ti: 2
K40: 2
TPU v1: 2
NVIDIA M40: 2
NVIDIA GeForce GTX TITAN X: 2
H100: 1
NVIDIA A800: 1
NVIDIA GeForce RTX 3090: 1
Huawei Ascend 910: 1
NVIDIA Quadro RTX 4000: 1
NVIDIA RTX A6000: 1
NVIDIA Quadro RTX 8000: 1
NVIDIA Quadro RTX 5000: 1
NVIDIA GeForce RTX 2080 Ti: 1
NVIDIA TITAN Xp: 1
NVIDIA GeForce GTX 1080 Ti: 1
NVIDIA Quadro P600: 1


In [4]:
# Release date recorded per alias by the counting loop in the previous cell.
hardware_release_dates

{'TPU v4': Timestamp('2021-05-20 00:00:00'),
 'H100': Timestamp('2022-09-20 00:00:00'),
 'A100': Timestamp('2020-03-01 00:00:00'),
 'V100': Timestamp('2017-06-21 00:00:00'),
 'TPU v3': Timestamp('2018-05-18 00:00:00'),
 'TPU v2': Timestamp('2017-05-01 00:00:00'),
 'P100': Timestamp('2016-04-05 00:00:00'),
 'K40': Timestamp('2013-11-22 00:00:00'),
 'TPU v1': Timestamp('2015-05-15 00:00:00'),
 'K80': Timestamp('2014-11-17 00:00:00')}

In [5]:
# Same tally as before, but skipping every entry that mentions a TPU.
# Unlike the previous tally, comma-separated (multi-chip) entries are NOT skipped here.
# The alias list is extended with 'Titan X'; matching is a case-sensitive substring test,
# so e.g. 'NVIDIA GeForce GTX TITAN X' is not folded into the 'Titan X' bucket.
# hardware_aliases is rebound here and reused by the plotting cells further down.
hardware_aliases = ['A100', 'H100', 'P100', 'V100', 'TPU v4', 'TPU v3', 'TPU v2', 'TPU v1', 'K80', 'K40', 'Titan X']
hardware_counts = defaultdict(int)

non_tpu_entries = (
    entry for entry in frontier_pcd_df['Training hardware'].dropna() if 'TPU' not in entry
)
for hardware in non_tpu_entries:
    matched_aliases = [alias for alias in hardware_aliases if alias in hardware]
    for alias in matched_aliases:
        hardware_counts[alias] += 1
    if not matched_aliases:
        hardware_counts[hardware] += 1

# Print counts in descending order
ranked_counts = sorted(hardware_counts.items(), key=lambda item: item[1], reverse=True)
for hardware, count in ranked_counts:
    print(f'{hardware}: {count}')

A100: 52
V100: 49
P100: 4
Titan X: 4
K80: 3
NVIDIA Geforce GTX 1080 Ti: 2
K40: 2
NVIDIA M40: 2
NVIDIA GeForce GTX TITAN X: 2
H100: 1
NVIDIA A800: 1
NVIDIA GeForce RTX 3090: 1
Huawei Ascend 910: 1
NVIDIA Quadro RTX 4000: 1
NVIDIA RTX A6000: 1
NVIDIA Quadro RTX 8000: 1
NVIDIA Quadro RTX 5000: 1
NVIDIA GeForce RTX 2080 Ti: 1
NVIDIA TITAN Xp: 1
NVIDIA GeForce GTX 1080 Ti: 1
NVIDIA Quadro P600: 1


# Purchase prices

In [6]:
# Column holding hardware purchase prices; reused by the cells below.
price_colname = 'Price (hardware purchase)'
# Keep only the price records that actually have a purchase price.
purchase_price_df = price_df.dropna(subset=[price_colname])
purchase_price_df

Unnamed: 0,Price source,Price date,Hardware model,Manufacturer (from Hardware model),Vendor,Location,Price per chip-hour (on-demand),Price per chip-hour (1-year CUD),Price per chip-hour (3-year CUD),Price (hardware purchase),Notes
25,https://www.nextplatform.com/2022/05/09/how-mu...,2020-07-01,NVIDIA A100 PCIe,NVIDIA,,,,,,15000.0,Single-unit list price
26,https://www.nextplatform.com/2022/05/09/how-mu...,2022-07-01,NVIDIA A100 PCIe,NVIDIA,,,,,,12500.0,Single-unit list price
38,https://web.archive.org/web/20200521074015/htt...,2020-05-21,NVIDIA A100 SXM4 40 GB,NVIDIA,,,,,,24875.0,"DGX A100, 8 GPU, 320GB. Release price."
39,https://www.nextplatform.com/2022/05/09/how-mu...,2020-07-01,NVIDIA A100 SXM4 40 GB,NVIDIA,,,,,,15000.0,Single-unit list price
40,https://www.nextplatform.com/2022/05/09/how-mu...,2022-07-01,NVIDIA A100 SXM4 40 GB,NVIDIA,,,,,,12500.0,Single-unit list price
48,https://web.archive.org/web/20210630170623/htt...,2021-06-30,NVIDIA A100 SXM4 80 GB,NVIDIA,,,,,,37500.0,"DGX A100, 8 GPU, 640GB"
49,https://web.archive.org/web/20220120191032/htt...,2022-01-20,NVIDIA A100 SXM4 80 GB,NVIDIA,,,,,,20875.0,"DGX A100, 8 GPU, 640GB"
50,https://web.archive.org/web/20230923154035/htt...,2023-09-23,NVIDIA A100 SXM4 80 GB,NVIDIA,,,,,,18548.75,"HGX A100, 8 GPU, 640GB"
57,https://www.techpowerup.com/gpu-specs/geforce-...,2015-03-17,NVIDIA GTX Titan X,NVIDIA,,,,,,999.0,Single-unit release price
59,https://web.archive.org/web/20220929115124/htt...,2022-09-29,NVIDIA H100 SXM5,NVIDIA,,,,,,44489.88,"DGX, 8 GPU, 640GB. Convert Euro to USD: https:..."


In [7]:
# Convert single-unit prices to server-equivalent prices.
#
# Rows whose Notes mention "single-unit" (case-insensitive) have their price multiplied by
# get_server_cost_overhead(<hardware model>), and their note is lower-cased with
# "single-unit" replaced by "server-equivalent". All other rows are copied unchanged.
# Rows with a missing note are treated as not single-unit (previously they raised
# AttributeError on `.lower()`).
#
# Working on a copy keeps purchase_price_df untouched and preserves column dtypes, which
# building the frame row by row from an empty DataFrame did not.
server_adjusted_purchase_price_df = purchase_price_df.copy()

_notes_lower = server_adjusted_purchase_price_df['Notes'].str.lower()
_is_single_unit = _notes_lower.str.contains('single-unit', regex=False, na=False)

# The overhead factor is looked up per row from the hardware model name.
_overhead = server_adjusted_purchase_price_df.loc[_is_single_unit, 'Hardware model'].map(get_server_cost_overhead)
server_adjusted_purchase_price_df.loc[_is_single_unit, price_colname] = (
    server_adjusted_purchase_price_df.loc[_is_single_unit, price_colname] * _overhead
)
server_adjusted_purchase_price_df.loc[_is_single_unit, 'Notes'] = (
    _notes_lower[_is_single_unit].str.replace('single-unit', 'server-equivalent', regex=False)
)
server_adjusted_purchase_price_df

Unnamed: 0,Price source,Price date,Hardware model,Manufacturer (from Hardware model),Vendor,Location,Price per chip-hour (on-demand),Price per chip-hour (1-year CUD),Price per chip-hour (3-year CUD),Price (hardware purchase),Notes
25,https://www.nextplatform.com/2022/05/09/how-mu...,2020-07-01,NVIDIA A100 PCIe,NVIDIA,,,,,,24900.0,server-equivalent list price
26,https://www.nextplatform.com/2022/05/09/how-mu...,2022-07-01,NVIDIA A100 PCIe,NVIDIA,,,,,,20750.0,server-equivalent list price
38,https://web.archive.org/web/20200521074015/htt...,2020-05-21,NVIDIA A100 SXM4 40 GB,NVIDIA,,,,,,24875.0,"DGX A100, 8 GPU, 320GB. Release price."
39,https://www.nextplatform.com/2022/05/09/how-mu...,2020-07-01,NVIDIA A100 SXM4 40 GB,NVIDIA,,,,,,24900.0,server-equivalent list price
40,https://www.nextplatform.com/2022/05/09/how-mu...,2022-07-01,NVIDIA A100 SXM4 40 GB,NVIDIA,,,,,,20750.0,server-equivalent list price
48,https://web.archive.org/web/20210630170623/htt...,2021-06-30,NVIDIA A100 SXM4 80 GB,NVIDIA,,,,,,37500.0,"DGX A100, 8 GPU, 640GB"
49,https://web.archive.org/web/20220120191032/htt...,2022-01-20,NVIDIA A100 SXM4 80 GB,NVIDIA,,,,,,20875.0,"DGX A100, 8 GPU, 640GB"
50,https://web.archive.org/web/20230923154035/htt...,2023-09-23,NVIDIA A100 SXM4 80 GB,NVIDIA,,,,,,18548.75,"HGX A100, 8 GPU, 640GB"
57,https://www.techpowerup.com/gpu-specs/geforce-...,2015-03-17,NVIDIA GTX Titan X,NVIDIA,,,,,,1638.36,server-equivalent release price
59,https://web.archive.org/web/20220929115124/htt...,2022-09-29,NVIDIA H100 SXM5,NVIDIA,,,,,,44489.88,"DGX, 8 GPU, 640GB. Convert Euro to USD: https:..."


In [8]:
# Log-linear reference contours decaying at 0.1 OOMs (orders of magnitude) per year.
# (The slope used here is -0.1 in log10 space, matching the '-0.1 OOMs/year' legend entry
# in the plot below; the earlier comment said -0.14, which did not match the code.)
#
# t        : integer years 2012..2025
# contours : one array per starting price; starting prices are 10**3, 10**3.1, ... (< 10**10)
#            at year 2012, each multiplied by 10**(-0.1 * years since 2012).
t = np.arange(2012, 2026)
contours = [
    10**initial_price * 10**(-0.1 * (t - 2012))
    for initial_price in np.arange(3, 10, 0.1)
]

In [9]:
# Scatter of server-equivalent purchase prices over time, one trace per hardware alias,
# overlaid with the grey log-linear reference contours.
fig = go.Figure()
for hardware_alias in hardware_aliases:
    # Plain (non-regex) substring match; rows with a missing model name never match.
    _hardware_df = server_adjusted_purchase_price_df[
        server_adjusted_purchase_price_df['Hardware model'].str.contains(hardware_alias, regex=False, na=False)
    ]
    fig.add_trace(go.Scatter(
        x=_hardware_df['Price date'],
        y=_hardware_df[price_colname],
        mode='markers',
        name=hardware_alias,
        text=_hardware_df['Hardware model'] + ' | ' + _hardware_df['Notes'],
    ))

contour_dates = [pd.to_datetime(f"{year}-01-01") for year in t]
for i, contour in enumerate(contours):
    fig.add_trace(go.Scatter(
        x=contour_dates,
        y=contour,
        mode='lines',
        line=dict(color='rgb(200, 200, 200)', width=1),
        name='-0.1 OOMs/year',
        # Only the first contour gets a legend entry; the rest would be duplicates.
        showlegend=(i == 0),
    ))

# y limits: on a log axis plotly expects the range as log10 exponents, i.e. $1e3 to $1e5.
fig.update_yaxes(range=[3, 5])

fig.update_layout(
    # x is the price date, so this is a price-over-time plot (the title previously said
    # "vs. performance").
    title='Server-equivalent purchase price over time',
    xaxis_title='Price date',
    yaxis_title='Server-equivalent purchase price ($)',
    yaxis_type='log',
    width=800,
    height=600,
)
fig.show()

A100
H100
P100
V100
TPU v4
TPU v3
TPU v2
TPU v1
K80
K40
Titan X


In [10]:
# Drop the rows whose 'Price source' contains "nextplatform".
# na=False keeps rows with a missing source instead of failing on `~` with NaN in the mask;
# regex=False makes this a plain substring test.
_is_nextplatform = server_adjusted_purchase_price_df['Price source'].str.contains('nextplatform', regex=False, na=False)
no_nextplatform_df = server_adjusted_purchase_price_df[~_is_nextplatform]

In [11]:
# Same scatter as above, restricted to the rows that survived the nextplatform filter.
fig = go.Figure()
for hardware_alias in hardware_aliases:
    # Plain (non-regex) substring match; rows with a missing model name never match.
    _hardware_df = no_nextplatform_df[
        no_nextplatform_df['Hardware model'].str.contains(hardware_alias, regex=False, na=False)
    ]
    fig.add_trace(go.Scatter(
        x=_hardware_df['Price date'],
        y=_hardware_df[price_colname],
        mode='markers',
        name=hardware_alias,
        # Hover text must come from the same subset as x/y. Building it from the full
        # frame index-aligns to a longer, NaN-padded Series, so labels no longer line up
        # with the plotted points.
        text=_hardware_df['Hardware model'] + ' | ' + _hardware_df['Notes'],
    ))
fig.update_layout(
    # x is the price date, so this is a price-over-time plot.
    title='Server-equivalent purchase price over time',
    xaxis_title='Price date',
    yaxis_title='Server-equivalent purchase price ($)',
    # The y axis is intentionally left linear here (log scaling disabled).
    width=800,
    height=600,
)
fig.show()

In [12]:
# Earliest recorded purchase price for each hardware model that matches a known alias,
# expressed as a single-unit price.
#
# Steps:
#   1. Sort purchase prices by 'Price date'.
#   2. Keep rows whose 'Hardware model' contains any entry of hardware_aliases
#      (case-sensitive substring match; rows with a missing model name are skipped).
#   3. Keep only the first (earliest) row per exact 'Hardware model' string.
#   4. Rows whose Notes do NOT mention "single-unit" (case-insensitive) are divided by
#      get_server_cost_overhead(<hardware model>). Rows with a missing note are treated
#      as not single-unit.
#
# This replaces row-by-row growth of an empty DataFrame and removes the commented-out
# alternative that de-duplicated per alias instead of per model.
_prices_by_date = purchase_price_df.sort_values(by='Price date')
_has_known_alias = _prices_by_date['Hardware model'].map(
    lambda model: isinstance(model, str) and any(alias in model for alias in hardware_aliases)
).astype(bool)
release_price_df = (
    _prices_by_date[_has_known_alias]
    .drop_duplicates(subset='Hardware model', keep='first')
    .copy()
)

_is_single_unit = release_price_df['Notes'].str.lower().str.contains('single-unit', regex=False, na=False)
_overhead = release_price_df.loc[~_is_single_unit, 'Hardware model'].map(get_server_cost_overhead)
release_price_df.loc[~_is_single_unit, price_colname] = (
    release_price_df.loc[~_is_single_unit, price_colname] / _overhead
)
release_price_df[price_colname] = release_price_df[price_colname].astype(float)
release_price_df

Unnamed: 0,Price source,Price date,Hardware model,Manufacturer (from Hardware model),Vendor,Location,Price per chip-hour (on-demand),Price per chip-hour (1-year CUD),Price per chip-hour (3-year CUD),Price (hardware purchase),Notes
74,https://www.nextplatform.com/2022/05/09/how-mu...,2012-07-01,NVIDIA Tesla K40s,NVIDIA,,,,,,5400.0,Single-unit release price
81,https://www.anandtech.com/show/8729/nvidia-lau...,2014-11-17,NVIDIA Tesla K80,NVIDIA,,,,,,5000.0,Single-unit release price
57,https://www.techpowerup.com/gpu-specs/geforce-...,2015-03-17,NVIDIA GTX Titan X,NVIDIA,,,,,,999.0,Single-unit release price
97,https://www.techpowerup.com/gpu-specs/tesla-p1...,2016-06-20,NVIDIA Tesla P100 PCIe 16 GB,NVIDIA,,,,,,5699.0,Single-unit release price
104,https://www.nextplatform.com/2022/05/09/how-mu...,2016-07-01,NVIDIA Tesla P100 SXM2,NVIDIA,,,,,,10500.0,Single-unit list price
128,https://web.archive.org/web/20170513081017/htt...,2017-07-01,NVIDIA V100,NVIDIA,,,,,,11020.710059,Price at release (2017 Q3) for a DGX-1 with 8x...
122,https://web.archive.org/web/20190118082125/htt...,2018-05-08,NVIDIA Tesla V100 SXM2 16 GB,NVIDIA,,,,,,10664.0,Single-unit list price
118,https://web.archive.org/web/20190118082125/htt...,2018-05-08,NVIDIA Tesla V100 PCIe 16 GB,NVIDIA,,,,,,10664.0,Single-unit list price
124,https://web.archive.org/web/20190118082125/htt...,2018-05-08,NVIDIA Tesla V100 SXM2 32 GB,NVIDIA,,,,,,11458.0,Single-unit list price
125,https://www.nextplatform.com/2022/05/09/how-mu...,2018-07-01,NVIDIA Tesla V100 SXM3 32 GB,NVIDIA,,,,,,11500.0,Single-unit list price


In [13]:
# Add the price date as a fractional year, then regress price on it with logy=True
# (the log-linear / exponential trend fit) and show the OLS summary.
release_price_df['Price date (float)'] = datetime_to_float_year(release_price_df['Price date'])
log_release_price_reg_results = fit_ols_regression(release_price_df, ['Price date (float)'], 'Price (hardware purchase)', logy=True)
log_release_price_reg_results.summary()


kurtosistest only valid for n>=20 ... continuing anyway, n=16



0,1,2,3
Dep. Variable:,y,R-squared:,0.592
Model:,OLS,Adj. R-squared:,0.562
Method:,Least Squares,F-statistic:,20.28
Date:,"Thu, 06 Jun 2024",Prob (F-statistic):,0.000496
Time:,17:04:51,Log-Likelihood:,2.8745
No. Observations:,16,AIC:,-1.749
Df Residuals:,14,BIC:,-0.2039
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-191.3604,43.374,-4.412,0.001,-284.388,-98.332
x1,0.0968,0.021,4.503,0.000,0.051,0.143

0,1,2,3
Omnibus:,25.779,Durbin-Watson:,1.715
Prob(Omnibus):,0.0,Jarque-Bera (JB):,37.681
Skew:,-2.238,Prob(JB):,6.57e-09
Kurtosis:,9.041,Cond. No.,1620000.0


In [14]:
# Same regression as the previous cell but with logy=False (linear trend in price).
# The first line recomputes the fractional-year column, so this cell also runs on its own.
release_price_df['Price date (float)'] = datetime_to_float_year(release_price_df['Price date'])
linear_release_price_reg_results = fit_ols_regression(release_price_df, ['Price date (float)'], 'Price (hardware purchase)', logy=False)
linear_release_price_reg_results.summary()


kurtosistest only valid for n>=20 ... continuing anyway, n=16



0,1,2,3
Dep. Variable:,y,R-squared:,0.772
Model:,OLS,Adj. R-squared:,0.756
Method:,Least Squares,F-statistic:,47.39
Date:,"Thu, 06 Jun 2024",Prob (F-statistic):,7.51e-06
Time:,17:04:51,Log-Likelihood:,-150.74
No. Observations:,16,AIC:,305.5
Df Residuals:,14,BIC:,307.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-4.403e+06,6.41e+05,-6.866,0.000,-5.78e+06,-3.03e+06
x1,2187.2154,317.721,6.884,0.000,1505.773,2868.658

0,1,2,3
Omnibus:,2.237,Durbin-Watson:,0.929
Prob(Omnibus):,0.327,Jarque-Bera (JB):,1.627
Skew:,0.749,Prob(JB):,0.443
Kurtosis:,2.557,Cond. No.,1620000.0


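The exponential (log-linear) model has the lower BIC, so we'll go with that. Caveat: the two fits use different response scales (log price vs. price), so their BICs are not directly comparable without a change-of-variables correction; treat this comparison as indicative only.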

In [15]:
# Use the log-linear fit as the datacenter GPU price trend in the sampling cell below.
datacenter_gpu_price_reg_results = log_release_price_reg_results

In [16]:
# Scatter of the earliest recorded price per GPU model over time, one trace per alias.
fig = go.Figure()
for hardware_alias in hardware_aliases:
    # Plain (non-regex) substring match; rows with a missing model name never match.
    _hardware_df = release_price_df[
        release_price_df['Hardware model'].str.contains(hardware_alias, regex=False, na=False)
    ]
    fig.add_trace(go.Scatter(
        x=_hardware_df['Price date'],
        y=_hardware_df[price_colname],
        mode='markers',
        name=hardware_alias,
        # Hover text must come from the same subset as x/y. Building it from the full
        # frame index-aligns to a longer, NaN-padded Series, so labels no longer line up
        # with the plotted points.
        text=_hardware_df['Hardware model'] + ' | ' + _hardware_df['Notes'],
    ))
fig.update_layout(
    title='ML datacenter GPU prices over time (earliest price for each GPU model)',
    xaxis_title='Price date',
    yaxis_title='Price ($)',
    yaxis_type='log',
    width=800,
    height=600,
)
fig.show()

In [17]:
# Walk the hardware table in release-date order and record the 'Hardware name' of every
# chip whose FP32 throughput strictly exceeds the best value seen so far.
perf_col = 'FP32 (single precision) Performance (FLOP/s)'
best_gpu, best_perf = None, 0
best_gpus = []

# Rows missing either the release date or the FP32 figure are excluded up front.
hardware_by_release = hardware_df.dropna(subset=['Release date', perf_col]).sort_values(by='Release date')
for name, perf in zip(hardware_by_release['Hardware name'], hardware_by_release[perf_col]):
    if not perf > best_perf:
        continue
    best_gpu, best_perf = name, perf
    best_gpus.append(best_gpu)
best_gpus

['NVIDIA GeForce 2 GTS Pro',
 'NVIDIA GeForce 3 Ti500',
 'NVIDIA GeForce FX 5200 Ultra',
 'NVIDIA GeForce FX 5800',
 'NVIDIA GeForce FX 5800 Ultra',
 'ATI Xbox 360 GPU 90nm',
 'NVIDIA GeForce 8800 GTX',
 'NVIDIA Quadro FX 5600 Mac Edition',
 'NVIDIA GeForce 8800 Ultra',
 'ATI Radeon HD 2900 XT',
 'AMD FireStream 9170',
 'ATI Radeon HD 3870 X2',
 'Radeon HD 3870 X2',
 'ATI Radeon HD 4870',
 'Radeon HD 4870 X2',
 'Radeon HD 5870',
 'Radeon HD 5970',
 'Radeon HD 6990',
 'Radeon HD 7990',
 'AMD Radeon R9 FURY X',
 'NVIDIA TESLA M60',
 'NVIDIA Tesla P100 DGXS',
 'NVIDIA TITAN X Pascal',
 'NVIDIA P40',
 'NVIDIA Quadro P6000',
 'NVIDIA Tesla V100 PCIe 16 GB',
 'NVIDIA Tesla V100 SXM2 16 GB',
 'NVIDIA Quadro GV100',
 'NVIDIA A100',
 'NVIDIA GeForce RTX 3080',
 'NVIDIA GeForce RTX 3090',
 'NVIDIA RTX A6000',
 'NVIDIA GeForce RTX 3090 Ti',
 'NVIDIA H100 PCIe',
 'NVIDIA GeForce RTX 4090']

In [18]:
# Hardware rows for the record-setting GPUs, restricted to those with a release price.
best_gpu_df = hardware_df[hardware_df['Hardware name'].isin(best_gpus)].dropna(subset=['Release price (USD)'])

In [19]:
# Fit a log-linear trend of release price against release date for the record-setting GPUs.
#
# Work on an explicit copy so the column assignments below do not write to a slice of
# hardware_df.
best_gpu_df = best_gpu_df.copy()
best_gpu_df['Release date (float)'] = datetime_to_float_year(pd.to_datetime(best_gpu_df['Release date']))

# Parse prices such as "$999" into floats. Compared with slicing off the first character
# inside a filtered list comprehension, this:
#   * always yields one value per row (a filtered list can be shorter than the frame and
#     then fails on assignment),
#   * tolerates thousands separators and values without a "$",
#   * is safe to re-run once the column already holds floats.
# Values that cannot be parsed become NaN and are dropped before the fit.
_price_text = (
    best_gpu_df['Release price (USD)']
    .astype(str)
    .str.replace(r'[$,]', '', regex=True)
    .str.strip()
)
best_gpu_df['Release price (USD)'] = pd.to_numeric(_price_text, errors='coerce')
best_gpu_df = best_gpu_df.dropna(subset=['Release price (USD)'])

best_gpu_price_reg_results = fit_ols_regression(best_gpu_df, ['Release date (float)'], 'Release price (USD)', logy=True)
best_gpu_price_reg_results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.588
Model:,OLS,Adj. R-squared:,0.566
Method:,Least Squares,F-statistic:,27.06
Date:,"Thu, 06 Jun 2024",Prob (F-statistic):,5.07e-05
Time:,17:04:51,Log-Likelihood:,-1.9904
No. Observations:,21,AIC:,7.981
Df Residuals:,19,BIC:,10.07
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-87.4992,17.380,-5.034,0.000,-123.876,-51.122
x1,0.0449,0.009,5.202,0.000,0.027,0.063

0,1,2,3
Omnibus:,5.262,Durbin-Watson:,1.989
Prob(Omnibus):,0.072,Jarque-Bera (JB):,3.206
Skew:,0.901,Prob(JB):,0.201
Kurtosis:,3.648,Cond. No.,573000.0


In [20]:
# Plot the 'Release price (USD)' of the best GPUs over time
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=best_gpu_df['Release date'],
    y=best_gpu_df['Release price (USD)'],
    mode='markers',
    text=best_gpu_df['Hardware name'],
))
fig.update_layout(
    title='Release price of the best GPUs (by FP32 FLOP/s) over time',
    # The x values are release dates (the label previously read 'Price date').
    xaxis_title='Release date',
    yaxis_title='Price ($)',
    yaxis_type='log',
    width=800,
    height=600,
)
fig.show()

In [26]:
# Monte Carlo mixture of the two fitted price-growth slopes.
#
# Each of the num_samples draws picks the datacenter-GPU fit with probability
# datacenter_gpu_weight (otherwise the best-GPU fit) and then samples one value via
# norm_from_ci from that fit's slope interval.
# NOTE(review): no random seed is set in this cell, so the printed numbers change from
# run to run unless seeding happens elsewhere.
num_samples = 1000
gpu_price_growth = np.zeros(num_samples)
datacenter_gpu_weight = 0.5

# The intervals do not depend on the loop variable, so compute them once.
# conf_int()[1] is taken as the [lower, upper] bounds of the second coefficient (the
# slope) -- this assumes conf_int() returns an array with one row per coefficient.
# The 95 passed to norm_from_ci presumably matches conf_int()'s default level -- confirm.
datacenter_growth_ci = datacenter_gpu_price_reg_results.conf_int()[1]
best_gpu_growth_ci = best_gpu_price_reg_results.conf_int()[1]

for i in range(num_samples):
    # generate uniform random number between 0 and 1
    rand = np.random.rand()
    growth_ci = datacenter_growth_ci if rand < datacenter_gpu_weight else best_gpu_growth_ci
    gpu_price_growth[i] = norm_from_ci(growth_ci[0], growth_ci[1], 95, num_samples=1)[0]
print_median_and_ci(gpu_price_growth, ci=[10, 90])

Median: 0.06 [80% CI: 0.037, 0.11]


In [27]:
# Mean of the sampled price-growth values (the cell above prints the median and interval).
gpu_price_growth.mean()

0.0708330371221071

# Training time

In [None]:
# Reload the data with the "top_n" threshold method (compute_threshold=10).
# This rebinds frontier_pcd_df, hardware_df and price_df, replacing the frames loaded
# at the top of the notebook for every cell that runs after this one.
frontier_pcd_df, hardware_df, price_df = load_data_for_cost_estimation(compute_threshold_method="top_n", compute_threshold=10)
frontier_pcd_df

Unnamed: 0,System,Domain,Task,Authors,Notability criteria,Notability criteria notes,Model accessibility,Link,Citations,Reference,...,Training compute lower bound,Training compute upper bound,Training chip-hours,Code accessibility,Dataset accessibility,Accessibility notes,Organization categorization (from Organization),Possibly over 1e23 FLOP,Training cost trends 2,Training cost trends 3
88,Gemini 1.0 Ultra,"Multimodal,Language,Vision","Language modelling,Visual question answering,C...",Gemini Team,SOTA improvement,""" Evaluation on a broad range of benchmarks sh...",Hosted access (no API),https://storage.googleapis.com/deepmind-media/...,633.0,Gemini: A Family of Highly Capable Multimodal ...,...,,,132000000.0,,,,Industry,,,
104,Inflection-2,Language,Language modelling,,Significant use,Inflection-2 either already powers Pi or soon ...,Hosted access (no API),https://inflection.ai/inflection-2,,Inflection-2: The Next Step Up,...,,,,,,"via Pi, no API",Industry,,,
130,Grok-1,Language,"Language modelling,Chat",,SOTA improvement,"""On these benchmarks, Grok-1 displayed strong ...",Open source,"https://x.ai/model-card/, https://x.ai/blog/gr...",,Announcing Grok,...,2,7.0,,Unreleased,Unreleased,apache 2.0,Industry,checked,,
193,Falcon-180B,Language,Language modelling,"Ebtesam Almazrouei, Hamza Alobeidli, Abdulaziz...",SOTA improvement,"""It's currently at the top of the Hugging Face...",Open access (restricted use),https://falconllm.tii.ae/falcon-180b.html; htt...,86.0,The Falcon Series of Open Language Models,...,,,17694720.0,,,"""Falcon 180b can be commercially used but unde...",Government,checked,,
243,Claude 2,Language,"Language modelling,Chat",,Historical significance,,API access,"https://www.anthropic.com/index/claude-2, http...",0.0,,...,,,,,,,Industry,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1179,Xception,Vision,Image classification,François Chollet,Highly cited,,,https://arxiv.org/abs/1610.02357,11578.0,Xception: Deep Learning with Depthwise Separab...,...,,,43200.0,,,,Industry,,,
1180,GNMT,Language,Translation,"Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc ...",Highly cited,,Hosted access (no API),https://arxiv.org/abs/1609.08144,6196.0,Google's Neural Machine Translation System: Br...,...,,,655730.0,,,presumably deployed via Google translate,Industry,,,
1222,AlphaGo Lee,Games,Go,"David Silver, Aja Huang, Chris J. Maddison, Ar...",Highly cited,,,https://www.nature.com/articles/nature16961,14887.0,Mastering the game of Go with deep neural netw...,...,,,,,,,Industry,,,
1226,ResNet-152 (ImageNet),Vision,Image classification,"Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun",Highly cited,,,https://arxiv.org/abs/1512.03385,156882.0,Deep Residual Learning for Image Recognition,...,,,,,,,Industry,,,


In [None]:
# Keep only the models that report a training time.
training_time_df = frontier_pcd_df.dropna(subset=['Training time (hours)'])
training_time_df

Unnamed: 0,System,Domain,Task,Authors,Notability criteria,Notability criteria notes,Model accessibility,Link,Citations,Reference,...,Training compute lower bound,Training compute upper bound,Training chip-hours,Code accessibility,Dataset accessibility,Accessibility notes,Organization categorization (from Organization),Possibly over 1e23 FLOP,Training cost trends 2,Training cost trends 3
88,Gemini 1.0 Ultra,"Multimodal,Language,Vision","Language modelling,Visual question answering,C...",Gemini Team,SOTA improvement,""" Evaluation on a broad range of benchmarks sh...",Hosted access (no API),https://storage.googleapis.com/deepmind-media/...,633.0,Gemini: A Family of Highly Capable Multimodal ...,...,,,132000000.0,,,,Industry,,,
193,Falcon-180B,Language,Language modelling,"Ebtesam Almazrouei, Hamza Alobeidli, Abdulaziz...",SOTA improvement,"""It's currently at the top of the Hugging Face...",Open access (restricted use),https://falconllm.tii.ae/falcon-180b.html; htt...,86.0,The Falcon Series of Open Language Models,...,,,17694720.0,,,"""Falcon 180b can be commercially used but unde...",Government,checked,,
336,GPT-4,"Multimodal,Language,Vision,Image generation",Language modelling,"OpenAI, Josh Achiam, Steven Adler, Sandhini Ag...","Highly cited,SOTA improvement","See the paper, p.1: ""On a suite of traditional...",API access,https://arxiv.org/abs/2303.08774,3280.0,GPT-4 Technical Report,...,,,57000000.0,,,,Industry,,,
408,BLOOM-176B,Language,Language modelling,"Margaret Mitchell, Giada Pistilli, Yacine Jern...","Historical significance,Highly cited",Was the largest open-source model at the time....,Open access (restricted use),https://arxiv.org/abs/2211.05100,1313.0,BLOOM: A 176B-Parameter Open-Access Multilingu...,...,,,1078272.0,,,responsible use restrictions: https://bigscien...,"Industry,Research collective",,,
423,U-PaLM (540B),Language,Language generation,"Yi Tay, Jason Wei, Hyung Won Chung, Vinh Q. Tr...",SOTA improvement,"""We show that U-PaLM 540B outperforms PaLM 540...",Unreleased,https://arxiv.org/abs/2210.11399,45.0,Transcending Scaling Laws with 0.1% Extra Compute,...,,,61440.0,,,,Industry,,,
425,Flan-PaLM 540B,Language,Language modelling/generation,"Hyung Won Chung, Le Hou, Shayne Longpre, Barre...","Highly cited,SOTA improvement",">1k cites\n\n""Flan-PaLM 540B achieves state-of...",Unreleased,https://arxiv.org/abs/2210.11416,1615.0,Scaling Instruction-Finetuned Language Models,...,,,18944.0,,,,Industry,,,
449,GLM-130B,Language,,"Aohan Zeng, Xiao Liu, Zhengxiao Du, Zihan Wang...",SOTA improvement,"""GLM-130B achieves an accuracy of 80.2% on zer...",Open access (non-commercial),https://keg.cs.tsinghua.edu.cn/glm-130b/posts/...,641.0,GLM-130B: An Open Bilingual Pre-trained Model,...,,,1105920.0,,,non commercial: https://github.com/THUDM/GLM-1...,Academia,,,
471,Minerva (540B),Language,Quantitative reasoning,"Aitor Lewkowycz, Anders Andreassen, David Doha...",SOTA improvement,,Unreleased,https://arxiv.org/abs/2206.14858,427.0,Solving Quantitative Reasoning Problems with L...,...,,,712704.0,Unreleased,Unreleased,,Industry,,,
514,OPT-175B,Language,Language modelling,"Susan Zhang∗ , Stephen Roller∗ , Naman Goyal∗ ...","Significant use,Highly cited",https://ai.meta.com/blog/opt-175b-large-langua...,Open access (non-commercial),https://ai.facebook.com/blog/democratizing-acc...,1987.0,OPT: Open Pre-trained Transformer Language Models,...,,,812544.0,Open source,,non-commercial for weights:\nhttps://github.co...,Industry,,,
524,PaLM (540B),Language,Language modelling,"Aakanksha Chowdhery, Sharan Narang, Jacob Devl...","Highly cited,SOTA improvement","Demonstrates continued benefits of scaling, as...",Unreleased,https://arxiv.org/abs/2204.02311,3532.0,PaLM: Scaling Language Modeling with Pathways,...,,,8404992.0,,,,Industry,,,


In [None]:
# Regress training time on publication date with logy=True and show the OLS summary.
#
# training_time_df comes from a dropna() on frontier_pcd_df, so adding a column to it
# directly triggers pandas' SettingWithCopyWarning. Take an explicit copy first.
training_time_df = training_time_df.copy()
training_time_df['Publication date (float)'] = datetime_to_float_year(pd.to_datetime(training_time_df['Publication date']))
reg_results = fit_ols_regression(training_time_df, ['Publication date (float)'], 'Training time (hours)', logy=True)
reg_results.summary()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



0,1,2,3
Dep. Variable:,y,R-squared:,0.11
Model:,OLS,Adj. R-squared:,0.079
Method:,Least Squares,F-statistic:,3.572
Date:,"Thu, 06 Jun 2024",Prob (F-statistic):,0.0688
Time:,16:02:21,Log-Likelihood:,-22.026
No. Observations:,31,AIC:,48.05
Df Residuals:,29,BIC:,50.92
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-162.5951,87.525,-1.858,0.073,-341.603,16.413
x1,0.0819,0.043,1.890,0.069,-0.007,0.170

0,1,2,3
Omnibus:,6.392,Durbin-Watson:,1.425
Prob(Omnibus):,0.041,Jarque-Bera (JB):,4.739
Skew:,-0.752,Prob(JB):,0.0935
Kurtosis:,4.186,Cond. No.,1930000.0


In [None]:
# Scatter of training time against publication date (log-scaled y axis),
# with the model name as hover text.
training_time_trace = go.Scatter(
    x=training_time_df['Publication date'],
    y=training_time_df['Training time (hours)'],
    mode='markers',
    text=training_time_df['Model']
)
training_time_layout = dict(
    title='Training times of notable models',
    xaxis_title='Publication date',
    yaxis_title='Training time (hours)',
    yaxis_type='log',
    width=800,
    height=600,
)

fig = go.Figure()
fig.add_trace(training_time_trace)
fig.update_layout(**training_time_layout)
fig.show()

In [None]:
# Median training time in hours.
np.median(training_time_df['Training time (hours)'])

720.0

In [None]:
# 25th percentile of training time in hours.
np.percentile(training_time_df['Training time (hours)'], 25)

341.1

In [None]:
# 75th percentile of training time in hours.
np.percentile(training_time_df['Training time (hours)'], 75)

1388.5

In [None]:
# Number of models with a training time. training_time_df was built by dropping rows
# where this column is missing, so this equals len(training_time_df).
training_time_df['Training time (hours)'].notna().sum()

31

In [None]:
# 25th percentile of 'Hardware utilization' over the models that report it.
np.percentile(frontier_pcd_df['Hardware utilization'].dropna(), 25)

0.25345

In [None]:
# 75th percentile of 'Hardware utilization' over the models that report it.
np.percentile(frontier_pcd_df['Hardware utilization'].dropna(), 75)

0.45

# Hardware depreciation

In [None]:
# Compare exponential and linear hardware depreciation curves over six years.
# (plotly.graph_objects is already imported as `go` in the notebook's import cell, so the
# duplicate import that used to sit here has been dropped.)

# Values of x: years 0.0 to 6.0 in steps of 0.1
x_values = [x * 0.1 for x in range(0, 61)]

# Exponential decay in log10 space: -0.14 OOMs/year for the central curve,
# with -0.10 and -0.18 OOMs/year bounding the shaded band.
exponential_values = [10**(-0.14*x) for x in x_values]
exponential_slow = [10**(-0.10*x) for x in x_values]
exponential_fast = [10**(-0.18*x) for x in x_values]

# Creating the plot
fig = go.Figure()

# Central exponential curve.
fig.add_trace(go.Scatter(x=x_values, y=exponential_values, mode='lines', name='Exponential: price performance trend'))
# Shaded band: an invisible lower edge, then the upper edge filled down to it ('tonexty'
# fills towards the previously added trace, so these two must stay in this order).
fig.add_trace(go.Scatter(x=x_values, y=exponential_fast, mode='lines', line=dict(width=0), showlegend=False,))
fig.add_trace(go.Scatter(x=x_values, y=exponential_slow, mode='lines', name='90% CI', line=dict(width=0), fill='tonexty', fillcolor='rgba(0,100,200,0.2)',))

# Straight-line depreciation reaching zero after N years. Values go negative past the
# lifetime and are cut off by the y-axis range set below.
for lifetime_years in (3, 4, 5, 6):
    fig.add_trace(go.Scatter(
        x=x_values,
        y=[1 - x / lifetime_years for x in x_values],
        mode='lines',
        line=dict(dash='dot'),
        name=f'Linear: {lifetime_years}-year lifetime',
    ))

# Setting the axes limits
fig.update_layout(
    width=800,
    height=600,
    xaxis=dict(range=[0, 6]),
    yaxis=dict(range=[0, 1]),
    title='Comparison of exponential and linear depreciation functions',
    xaxis_title='Years',
    yaxis_title='Value (normalized to 1 at year 0)'
)

# Display the plot
fig.show()

In [None]:
# Reload with the "top_n" threshold (compute_threshold=10). The hardware table is bound to
# _hardware_df here, a name the plotting cells above also use as a per-alias loop temporary.
frontier_pcd_df, _hardware_df, price_df = load_data_for_cost_estimation(compute_threshold_method="top_n", compute_threshold=10)

In [None]:
# Days between each model's training-hardware release date and its publication date.
# Models whose hardware release date cannot be looked up are skipped (best effort).
gaps = []
for _, row in frontier_pcd_df.iterrows():
    hardware_model = row['Training hardware']
    try:
        hardware_release_date = get_release_date(hardware_model, _hardware_df)
    except Exception:
        # Skip on lookup failure, but let KeyboardInterrupt / SystemExit propagate
        # (a bare `except:` would swallow those too).
        continue
    pub_date = pd.to_datetime(row['Publication date'])
    gap = pub_date - hardware_release_date
    gaps.append(gap.days)
    print(f"{row['Model']}, {hardware_model}: {gap.days} days")


Gemini 1.0 Ultra, Google TPU v4: 930 days
Inflection-2, NVIDIA H100 SXM5: 428 days
Falcon-180B, NVIDIA A100 SXM4 40 GB: 1210 days
PaLM 2, Google TPU v4: 720 days
GPT-4, NVIDIA A100 SXM4 40 GB: 1035 days
GPT-3.5 (text-davinci-003), NVIDIA A100 SXM4 40 GB: 928 days
BLOOM-176B, NVIDIA A100 SXM4 80 GB: 722 days
U-PaLM (540B), Google TPU v4: 518 days
Flan-PaLM 540B, Google TPU v4: 518 days
BlenderBot 3, NVIDIA A100 SXM4 40 GB: 818 days
GLM-130B, NVIDIA A100 SXM4 40 GB: 812 days
Minerva (540B), Google TPU v4: 405 days
Parti, Google TPU v4: 398 days
OPT-175B, NVIDIA A100 SXM4 80 GB: 532 days
PaLM (540B), Google TPU v4: 319 days
LaMDA, Google TPU v3: 1364 days
GLaM, Google TPU v4: 207 days
Gopher (280B), Google TPU v3: 1300 days
Megatron-Turing NLG 530B, NVIDIA A100 SXM4 80 GB: 329 days
HyperCLOVA 82B, NVIDIA A100: 558 days
GOAT, Google TPU v3: 1166 days
ByT5-XXL, Google TPU v3: 1106 days
ProtT5-XXL, Google TPU v3: 1082 days
Meta Pseudo Labels, Google TPU v3: 1018 days
Switch, Google TPU v3: 9

In [None]:
# Convert to an array (needed for the boolean-mask counts below) and show the mean gap in days.
gaps = np.array(gaps)
gaps.mean()

700.2

In [None]:
# Number of models for which a gap could be computed.
len(gaps)

45

In [None]:
# Number of gaps shorter than two years (DAYS_PER_YEAR is not defined in this notebook;
# presumably it comes from one of the star imports).
len(gaps[gaps < 2 * DAYS_PER_YEAR])

26

In [None]:
# Number of gaps shorter than three years.
len(gaps[gaps < 3 * DAYS_PER_YEAR])

39

In [None]:
# Number of gaps shorter than four years.
len(gaps[gaps < 4 * DAYS_PER_YEAR])

45