In [40]:
from collections import defaultdict
import numpy as np
import pandas as pd
import plotly.graph_objects as go

from data import *
from hardware import *
from parameters import *
from regression import *

In [24]:
# Load the three frames used for cost estimation: the frontier model table,
# the hardware table and the price table.
# NOTE(review): a "window_percentile" threshold of 0 presumably keeps every
# model in each window -- confirm against load_data_for_cost_estimation.
frontier_pcd_df, hardware_df, price_df = load_data_for_cost_estimation(
    compute_threshold_method="window_percentile",
    compute_threshold=0,
)
frontier_pcd_df

Unnamed: 0,System,Domain,Task,Authors,Notability criteria,Notability criteria notes,Model accessibility,Link,Citations,Reference,...,Organization categorization,Foundation model,Training compute lower bound,Training compute upper bound,Training chip-hours,Code accessibility,Dataset accessibility,Accessibility notes,Organization categorization (from Organization),Possibly over 1e23 FLOP
65,CogAgent,"Vision,Language","Instruction interpretation,Visual question ans...","Wenyi Hong, Weihan Wang, Qingsong Lv, Jiazheng...",SOTA improvement,See Table 1,Open access (restricted use),"https://arxiv.org/abs/2312.08914Wenyi Hong, We...",50.0,CogAgent: A Visual Language Model for GUI Agents,...,,,,,,Open source,,Code is Apache License 2.0; model is under a m...,,
66,FunSearch,"Language,Search",Code generation,"Bernardino Romera-Paredes, Mohammadamin Bareka...","SOTA improvement,Historical significance",Improved SOTA for the cap set problem. Can pla...,Open source,https://www.nature.com/articles/s41586-023-069...,67.0,Mathematical discoveries from program search w...,...,Industry,,,,,,,Code to run FunSearch with an LLM of your choi...,Industry,
75,Gemini Ultra,Multimodal,"Language modelling,Visual question answering,C...",Gemini Team,SOTA improvement,""" Evaluation on a broad range of benchmarks sh...",Hosted access (no API),https://storage.googleapis.com/deepmind-media/...,633.0,Gemini: A Family of Highly Capable Multimodal ...,...,Industry,,,,132000000.0,,,,Industry,
83,Qwen-72B,Language,"Chat,Code generation","Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Ka...",SOTA improvement,"SOTA on several Chinese benchmarks, with highe...",Open access (restricted use),https://huggingface.co/Qwen/Qwen-72B,,,...,Industry,,,,,Unreleased,Unreleased,up to 100m active users:\nhttps://github.com/Q...,Industry,
91,Inflection-2,Language,Language modelling,,Significant use,Inflection-2 either already powers Pi or soon ...,Hosted access (no API),https://inflection.ai/inflection-2,,Inflection-2: The Next Step Up,...,Industry,checked,,,,,,"via Pi, no API",Industry,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1182,AlphaGo Lee,Games,Go,"David Silver, Aja Huang, Chris J. Maddison, Ar...",Highly cited,,,https://www.nature.com/articles/nature16961,14887.0,Mastering the game of Go with deep neural netw...,...,Industry,,,,,,,,Industry,
1183,"Variational (untied weights, MC) LSTM (Large)",Language,,"Yarin Gal, Zoubin Ghahramani","Highly cited,SOTA improvement","""The new approach outperforms existing techniq...",,https://arxiv.org/abs/1512.05287?context=stat,1838.0,A Theoretically Grounded Application of Dropou...,...,Academia,,,,,,,,Academia,
1186,ResNet-152 (ImageNet),Vision,Image classification,"Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun",Highly cited,,,https://arxiv.org/abs/1512.03385,156882.0,Deep Residual Learning for Image Recognition,...,Industry,,,,,,,,Industry,
1188,DeepSpeech2 (English),Speech,Speech recognition,"Dario Amodei, Rishita Anubhai, Eric Battenberg...",Highly cited,,,https://arxiv.org/abs/1512.02595,2749.0,Deep Speech 2: End-to-End Speech Recognition i...,...,Industry,,,,301.0,,,,Industry,


In [25]:
# Tally how often each accelerator family appears in 'Training hardware'.
# An entry is credited to every alias it contains (so an entry naming two
# aliased chips counts once for each); entries matching no alias are counted
# under their full, verbatim hardware string.
hardware_aliases = ['A100', 'H100', 'P100', 'V100', 'TPU v4', 'TPU v3', 'TPU v2', 'TPU v1', 'K80', 'K40']
hardware_counts = defaultdict(int)
for hardware_entry in frontier_pcd_df['Training hardware'].dropna():
    matching_aliases = [alias for alias in hardware_aliases if alias in hardware_entry]
    # Fall back to the raw entry when none of the aliases is a substring of it.
    for count_key in (matching_aliases or [hardware_entry]):
        hardware_counts[count_key] += 1

# Print counts in descending order (ties keep first-seen order: sorted() is stable)
ranked_counts = sorted(hardware_counts.items(), key=lambda item: item[1], reverse=True)
for hardware_name, n_models in ranked_counts:
    print(f'{hardware_name}: {n_models}')

A100: 47
V100: 43
TPU v3: 39
TPU v4: 18
TPU v2: 4
P100: 4
K80: 3
NVIDIA GTX Titan X: 3
NVIDIA Geforce GTX1080 Ti: 2
K40: 2
TPU v1: 2
NVIDIA M40: 2
NVIDIA GeForce GTX TITAN X: 2
H100: 1
NVIDIA A800: 1
Huawei Ascend 910: 1
NVIDIA Quadro RTX 4000: 1
NVIDIA RTX A6000: 1
NVIDIA Quadro RTX 8000: 1
NVIDIA Quadro RTX 5000: 1
NVIDIA GeForce RTX 2080 Ti: 1
NVIDIA TITAN Xp: 1
NVIDIA GeForce GTX 1080 Ti: 1
NVIDIA Quadro P600: 1
NVIDIA M40,NVIDIA GTX Titan X: 1


In [26]:
# No TPUs: same tally as the full hardware count, but any entry whose
# hardware string mentions 'TPU' is skipped entirely.
hardware_aliases = ['A100', 'H100', 'P100', 'V100', 'TPU v4', 'TPU v3', 'TPU v2', 'TPU v1', 'K80', 'K40']
hardware_counts = defaultdict(int)
non_tpu_entries = (
    entry for entry in frontier_pcd_df['Training hardware'].dropna()
    if 'TPU' not in entry
)
for hardware_entry in non_tpu_entries:
    matching_aliases = [alias for alias in hardware_aliases if alias in hardware_entry]
    # Entries matching no alias are counted under their full hardware string.
    for count_key in (matching_aliases or [hardware_entry]):
        hardware_counts[count_key] += 1

# Print counts in descending order (ties keep first-seen order: sorted() is stable)
ranked_counts = sorted(hardware_counts.items(), key=lambda item: item[1], reverse=True)
for hardware_name, n_models in ranked_counts:
    print(f'{hardware_name}: {n_models}')

A100: 47
V100: 43
P100: 4
K80: 3
NVIDIA GTX Titan X: 3
NVIDIA Geforce GTX1080 Ti: 2
K40: 2
NVIDIA M40: 2
NVIDIA GeForce GTX TITAN X: 2
H100: 1
NVIDIA A800: 1
Huawei Ascend 910: 1
NVIDIA Quadro RTX 4000: 1
NVIDIA RTX A6000: 1
NVIDIA Quadro RTX 8000: 1
NVIDIA Quadro RTX 5000: 1
NVIDIA GeForce RTX 2080 Ti: 1
NVIDIA TITAN Xp: 1
NVIDIA GeForce GTX 1080 Ti: 1
NVIDIA Quadro P600: 1
NVIDIA M40,NVIDIA GTX Titan X: 1


# Purchase prices

In [27]:
# Restrict the price table to records that actually report a purchase price.
price_colname = 'Price (hardware purchase)'
has_purchase_price = price_df[price_colname].notna()
purchase_price_df = price_df[has_purchase_price]
purchase_price_df

Unnamed: 0,Price source,Price date,Hardware model,Manufacturer (from Hardware model),Vendor,Location,Price per chip-hour (on-demand),Price per chip-hour (1-year CUD),Price per chip-hour (3-year CUD),Price (hardware purchase),Notes
25,https://www.nextplatform.com/2022/05/09/how-mu...,2020-07-01,NVIDIA A100 PCIe,NVIDIA,,,,,,15000.0,Single-unit list price
26,https://www.nextplatform.com/2022/05/09/how-mu...,2022-07-01,NVIDIA A100 PCIe,NVIDIA,,,,,,12500.0,Single-unit list price
38,https://web.archive.org/web/20200521074015/htt...,2020-05-21,NVIDIA A100 SXM4 40 GB,NVIDIA,,,,,,24875.0,"DGX A100, 8 GPU, 320GB. Release price."
39,https://www.nextplatform.com/2022/05/09/how-mu...,2020-07-01,NVIDIA A100 SXM4 40 GB,NVIDIA,,,,,,15000.0,Single-unit list price
40,https://www.nextplatform.com/2022/05/09/how-mu...,2022-07-01,NVIDIA A100 SXM4 40 GB,NVIDIA,,,,,,12500.0,Single-unit list price
48,https://web.archive.org/web/20210630170623/htt...,2021-06-30,NVIDIA A100 SXM4 80 GB,NVIDIA,,,,,,37500.0,"DGX A100, 8 GPU, 640GB"
49,https://web.archive.org/web/20220120191032/htt...,2022-01-20,NVIDIA A100 SXM4 80 GB,NVIDIA,,,,,,20875.0,"DGX A100, 8 GPU, 640GB"
50,https://web.archive.org/web/20230923154035/htt...,2023-09-23,NVIDIA A100 SXM4 80 GB,NVIDIA,,,,,,18548.75,"HGX A100, 8 GPU, 640GB"
57,https://www.techpowerup.com/gpu-specs/geforce-...,2015-03-17,NVIDIA GTX Titan X,NVIDIA,,,,,,999.0,Single-unit release price
59,https://web.archive.org/web/20220929115124/htt...,2022-09-29,NVIDIA H100 SXM5,NVIDIA,,,,,,44489.88,"DGX, 8 GPU, 640GB. Convert Euro to USD: https:..."


In [42]:
# Multiply single unit prices by overhead factor.
#
# Rows whose note mentions 'single-unit' (case-insensitive) have their price
# scaled by get_server_cost_overhead(<hardware model>) and their note
# lowercased with 'single-unit' replaced by 'server-equivalent'. All other
# rows are carried over unchanged.
#
# Working on a copy with boolean masks (instead of appending rows one at a
# time to an empty frame) keeps the original column dtypes, and missing notes
# are treated as "not single-unit" rather than raising on `.lower()`.
server_adjusted_purchase_price_df = purchase_price_df.copy()

# Lowercased notes; missing notes become '' so they can never match.
notes_lower = server_adjusted_purchase_price_df['Notes'].fillna('').astype(str).str.lower()
is_single_unit = notes_lower.str.contains('single-unit', regex=False)

# Overhead factor per single-unit row, looked up from the hardware model name.
overhead_factors = (
    server_adjusted_purchase_price_df.loc[is_single_unit, 'Hardware model']
    .map(get_server_cost_overhead)
)
# The product is NaN outside the mask (index alignment); `where` keeps the
# original price for those rows and takes the scaled price for masked rows.
server_adjusted_purchase_price_df[price_colname] = (
    server_adjusted_purchase_price_df[price_colname].where(
        ~is_single_unit,
        server_adjusted_purchase_price_df[price_colname] * overhead_factors,
    )
)
server_adjusted_purchase_price_df.loc[is_single_unit, 'Notes'] = (
    notes_lower[is_single_unit].str.replace('single-unit', 'server-equivalent', regex=False)
)
server_adjusted_purchase_price_df

Unnamed: 0,Price source,Price date,Hardware model,Manufacturer (from Hardware model),Vendor,Location,Price per chip-hour (on-demand),Price per chip-hour (1-year CUD),Price per chip-hour (3-year CUD),Price (hardware purchase),Notes
25,https://www.nextplatform.com/2022/05/09/how-mu...,2020-07-01,NVIDIA A100 PCIe,NVIDIA,,,,,,24900.0,server-equivalent list price
26,https://www.nextplatform.com/2022/05/09/how-mu...,2022-07-01,NVIDIA A100 PCIe,NVIDIA,,,,,,20750.0,server-equivalent list price
38,https://web.archive.org/web/20200521074015/htt...,2020-05-21,NVIDIA A100 SXM4 40 GB,NVIDIA,,,,,,24875.0,"DGX A100, 8 GPU, 320GB. Release price."
39,https://www.nextplatform.com/2022/05/09/how-mu...,2020-07-01,NVIDIA A100 SXM4 40 GB,NVIDIA,,,,,,24900.0,server-equivalent list price
40,https://www.nextplatform.com/2022/05/09/how-mu...,2022-07-01,NVIDIA A100 SXM4 40 GB,NVIDIA,,,,,,20750.0,server-equivalent list price
48,https://web.archive.org/web/20210630170623/htt...,2021-06-30,NVIDIA A100 SXM4 80 GB,NVIDIA,,,,,,37500.0,"DGX A100, 8 GPU, 640GB"
49,https://web.archive.org/web/20220120191032/htt...,2022-01-20,NVIDIA A100 SXM4 80 GB,NVIDIA,,,,,,20875.0,"DGX A100, 8 GPU, 640GB"
50,https://web.archive.org/web/20230923154035/htt...,2023-09-23,NVIDIA A100 SXM4 80 GB,NVIDIA,,,,,,18548.75,"HGX A100, 8 GPU, 640GB"
57,https://www.techpowerup.com/gpu-specs/geforce-...,2015-03-17,NVIDIA GTX Titan X,NVIDIA,,,,,,1638.36,server-equivalent release price
59,https://web.archive.org/web/20220929115124/htt...,2022-09-29,NVIDIA H100 SXM5,NVIDIA,,,,,,44489.88,"DGX, 8 GPU, 640GB. Convert Euro to USD: https:..."


In [29]:
# Scatter of server-equivalent purchase prices over time, one trace per
# hardware alias (aliases with no matching price rows still get an empty
# trace, so they appear in the legend).
fig = go.Figure()
for hardware_alias in hardware_aliases:
    # Use a dedicated name for the per-alias subset so the `hardware_df`
    # returned by load_data_for_cost_estimation is not overwritten.
    alias_price_df = server_adjusted_purchase_price_df[
        server_adjusted_purchase_price_df['Hardware model'].str.contains(
            hardware_alias, regex=False, na=False
        )
    ]
    fig.add_trace(go.Scatter(
        x=alias_price_df['Price date'],
        y=alias_price_df[price_colname],
        mode='markers',
        name=hardware_alias,
        # Build the hover text from the SAME filtered rows as x/y. Mixing in
        # the unfiltered frame aligns on index and yields a longer, NaN-padded
        # Series whose positions no longer correspond to the plotted points.
        text=alias_price_df['Hardware model'] + ' | ' + alias_price_df['Notes'],
    ))
fig.update_layout(
    # The x axis is the price date, not performance.
    title='Server-equivalent purchase price over time',
    xaxis_title='Price date',
    yaxis_title='Server-equivalent purchase price ($)',
    # yaxis_type='log',
    width=800,
    height=600,
)
fig.show()

In [30]:
# Drop rows whose price source is The Next Platform ('nextplatform' in the URL).
# `na=False` keeps rows with a missing source instead of failing on `~NaN`;
# `regex=False` makes the match a plain substring test.
from_nextplatform = server_adjusted_purchase_price_df['Price source'].str.contains(
    'nextplatform', regex=False, na=False
)
server_adjusted_purchase_price_df = server_adjusted_purchase_price_df[~from_nextplatform]

In [31]:
# Scatter of the current server-equivalent purchase prices over time, one
# trace per hardware alias (aliases with no matching price rows still get an
# empty trace, so they appear in the legend).
fig = go.Figure()
for hardware_alias in hardware_aliases:
    # Use a dedicated name for the per-alias subset so the `hardware_df`
    # returned by load_data_for_cost_estimation is not overwritten.
    alias_price_df = server_adjusted_purchase_price_df[
        server_adjusted_purchase_price_df['Hardware model'].str.contains(
            hardware_alias, regex=False, na=False
        )
    ]
    fig.add_trace(go.Scatter(
        x=alias_price_df['Price date'],
        y=alias_price_df[price_colname],
        mode='markers',
        name=hardware_alias,
        # Build the hover text from the SAME filtered rows as x/y. Mixing in
        # the unfiltered frame aligns on index and yields a longer, NaN-padded
        # Series whose positions no longer correspond to the plotted points.
        text=alias_price_df['Hardware model'] + ' | ' + alias_price_df['Notes'],
    ))
fig.update_layout(
    # The x axis is the price date, not performance.
    title='Server-equivalent purchase price over time',
    xaxis_title='Price date',
    yaxis_title='Server-equivalent purchase price ($)',
    # yaxis_type='log',
    width=800,
    height=600,
)
fig.show()

# Training time

In [32]:
# Reload the cost-estimation frames with the "top_n" compute threshold
# (n = 10). This rebinds frontier_pcd_df, hardware_df and price_df, replacing
# the frames loaded earlier in the notebook.
frontier_pcd_df, hardware_df, price_df = load_data_for_cost_estimation(
    compute_threshold_method="top_n",
    compute_threshold=10,
)
frontier_pcd_df

Unnamed: 0,System,Domain,Task,Authors,Notability criteria,Notability criteria notes,Model accessibility,Link,Citations,Reference,...,Organization categorization,Foundation model,Training compute lower bound,Training compute upper bound,Training chip-hours,Code accessibility,Dataset accessibility,Accessibility notes,Organization categorization (from Organization),Possibly over 1e23 FLOP
75,Gemini Ultra,Multimodal,"Language modelling,Visual question answering,C...",Gemini Team,SOTA improvement,""" Evaluation on a broad range of benchmarks sh...",Hosted access (no API),https://storage.googleapis.com/deepmind-media/...,633.0,Gemini: A Family of Highly Capable Multimodal ...,...,Industry,,,,132000000.0,,,,Industry,
91,Inflection-2,Language,Language modelling,,Significant use,Inflection-2 either already powers Pi or soon ...,Hosted access (no API),https://inflection.ai/inflection-2,,Inflection-2: The Next Step Up,...,Industry,checked,,,,,,"via Pi, no API",Industry,
116,Grok-1,Language,"Language modelling,Chat",,SOTA improvement,"""On these benchmarks, Grok-1 displayed strong ...",Open source,"https://x.ai/model-card/, https://x.ai/blog/gr...",,Announcing Grok,...,Industry,checked,2,7.0,,Unreleased,Unreleased,apache 2.0,Industry,checked
130,ChatGLM3,Multimodal,"Chat,Visual question answering",,SOTA improvement,"Aiming at GPT-4V, ChatGLM3 has implemented ite...",,https://www.zhipuai.cn/en/news/76,,Zhipu AI launches third-generation base model,...,Industry,checked,,,,,,,Industry,
176,Falcon-180B,Language,Language modelling,"Ebtesam Almazrouei, Hamza Alobeidli, Abdulaziz...",SOTA improvement,"""It's currently at the top of the Hugging Face...",Open access (restricted use),https://falconllm.tii.ae/falcon-180b.html; htt...,86.0,The Falcon Series of Open Language Models,...,Government,,,,17694720.0,,,"""Falcon 180b can be commercially used but unde...",Government,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1145,Xception,Vision,Image classification,François Chollet,Highly cited,,,https://arxiv.org/abs/1610.02357,11578.0,Xception: Deep Learning with Depthwise Separab...,...,Industry,,,,43200.0,,,,Industry,
1146,GNMT,Language,Translation,"Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc ...",Highly cited,,Hosted access (no API),https://arxiv.org/abs/1609.08144,6196.0,Google's Neural Machine Translation System: Br...,...,Industry,,,,414720.0,,,presumably deployed via Google translate,Industry,
1182,AlphaGo Lee,Games,Go,"David Silver, Aja Huang, Chris J. Maddison, Ar...",Highly cited,,,https://www.nature.com/articles/nature16961,14887.0,Mastering the game of Go with deep neural netw...,...,Industry,,,,,,,,Industry,
1186,ResNet-152 (ImageNet),Vision,Image classification,"Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun",Highly cited,,,https://arxiv.org/abs/1512.03385,156882.0,Deep Residual Learning for Image Recognition,...,Industry,,,,,,,,Industry,


In [33]:
# Keep only the models with a reported training time. The explicit copy makes
# this an independent frame, so adding columns to it later does not raise
# pandas' SettingWithCopyWarning.
training_time_df = frontier_pcd_df.dropna(subset=['Training time (hours)']).copy()
training_time_df

Unnamed: 0,System,Domain,Task,Authors,Notability criteria,Notability criteria notes,Model accessibility,Link,Citations,Reference,...,Organization categorization,Foundation model,Training compute lower bound,Training compute upper bound,Training chip-hours,Code accessibility,Dataset accessibility,Accessibility notes,Organization categorization (from Organization),Possibly over 1e23 FLOP
75,Gemini Ultra,Multimodal,"Language modelling,Visual question answering,C...",Gemini Team,SOTA improvement,""" Evaluation on a broad range of benchmarks sh...",Hosted access (no API),https://storage.googleapis.com/deepmind-media/...,633.0,Gemini: A Family of Highly Capable Multimodal ...,...,Industry,,,,132000000.0,,,,Industry,
176,Falcon-180B,Language,Language modelling,"Ebtesam Almazrouei, Hamza Alobeidli, Abdulaziz...",SOTA improvement,"""It's currently at the top of the Hugging Face...",Open access (restricted use),https://falconllm.tii.ae/falcon-180b.html; htt...,86.0,The Falcon Series of Open Language Models,...,Government,,,,17694720.0,,,"""Falcon 180b can be commercially used but unde...",Government,
215,Llama 2-70B,Language,Language modelling,"Hugo Touvron, Louis Martin, Kevin Stone, Peter...","Historical significance,Significant use,Highly...",Model has been open-sourced and frequently dow...,Open access (restricted use),https://ai.meta.com/research/publications/llam...,3131.0,Llama 2: Open Foundation and Fine-Tuned Chat M...,...,Industry,,,,1720320.0,,,Llama 2 license. can't use outputs to train mo...,Industry,
227,xTrimoPGLM -100B,Biology,Proteins,"Bo Chen, Xingyi Cheng, Yangli-ao Geng, Shen Li...",SOTA improvement,"""Our extensive experiments reveal that xTrimoP...",Unreleased,https://www.biorxiv.org/content/10.1101/2023.0...,32.0,xTrimoPGLM: Unified 100B-Scale Pre-trained Tra...,...,"Academia,Industry",,,,2352.0,Unreleased,,,"Academia,Industry",
309,GPT-4,Multimodal,Language modelling,OpenAI,"Highly cited,SOTA improvement","See the paper, p.1: ""On a suite of traditional...",API access,https://arxiv.org/abs/2303.08774,3280.0,GPT-4 Technical Report,...,Industry,checked,,,57000000.0,,,,Industry,
325,LLaMA-65B,Language,Language modelling,"Hugo Touvron, Thibaut Lavril, Gautier Izacard,...","Historical significance,Highly cited",Widely-used foundation model that has been ada...,Open access (non-commercial),https://arxiv.org/abs/2302.13971,4640.0,LLaMA: Open and Efficient Foundation Language ...,...,Industry,checked,,,1024000.0,Unreleased,,"""we are releasing our model under a noncommerc...",Industry,
380,BLOOM-176B,Language,Language modelling,"Margaret Mitchell, Giada Pistilli, Yacine Jern...","Historical significance,Highly cited",Was the largest open-source model at the time....,Open access (restricted use),https://arxiv.org/abs/2211.05100,1313.0,BLOOM: A 176B-Parameter Open-Access Multilingu...,...,"Industry,Research collective",checked,,,1078272.0,,,responsible use restrictions: https://bigscien...,"Industry,Research collective",
395,U-PaLM (540B),Language,Language generation,"Yi Tay, Jason Wei, Hyung Won Chung, Vinh Q. Tr...",SOTA improvement,"""We show that U-PaLM 540B outperforms PaLM 540...",Unreleased,https://arxiv.org/abs/2210.11399,45.0,Transcending Scaling Laws with 0.1% Extra Compute,...,Industry,,,,61440.0,,,,Industry,
421,GLM-130B,Language,,"Aohan Zeng, Xiao Liu, Zhengxiao Du, Zihan Wang...",SOTA improvement,"""GLM-130B achieves an accuracy of 80.2% on zer...",Open access (non-commercial),https://keg.cs.tsinghua.edu.cn/glm-130b/posts/...,641.0,GLM-130B: An Open Bilingual Pre-trained Model,...,Academia,checked,,,1105920.0,,,non commercial: https://github.com/THUDM/GLM-1...,Academia,
442,Minerva (540B),Language,Quantitative reasoning,"Aitor Lewkowycz, Anders Andreassen, David Doha...",SOTA improvement,,Unreleased,https://arxiv.org/abs/2206.14858,427.0,Solving Quantitative Reasoning Problems with L...,...,Industry,checked,,,712704.0,Unreleased,Unreleased,,Industry,


In [34]:
# Regress training time on publication date (passed with logy=True).
# The fractional-year column is added with `.assign`, which returns a new
# frame, instead of writing into a frame derived from another DataFrame
# (which triggers SettingWithCopyWarning).
training_time_df = training_time_df.assign(**{
    'Publication date (float)': datetime_to_float_year(
        pd.to_datetime(training_time_df['Publication date'])
    ),
})
reg_results = fit_ols_regression(
    training_time_df,
    ['Publication date (float)'],
    'Training time (hours)',
    logy=True,
)
reg_results.summary()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



0,1,2,3
Dep. Variable:,y,R-squared:,0.08
Model:,OLS,Adj. R-squared:,0.05
Method:,Least Squares,F-statistic:,2.683
Date:,"Mon, 06 May 2024",Prob (F-statistic):,0.112
Time:,18:16:08,Log-Likelihood:,-20.678
No. Observations:,33,AIC:,45.36
Df Residuals:,31,BIC:,48.35
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-124.7684,77.959,-1.600,0.120,-283.767,34.230
x1,0.0632,0.039,1.638,0.112,-0.015,0.142

0,1,2,3
Omnibus:,1.691,Durbin-Watson:,1.54
Prob(Omnibus):,0.429,Jarque-Bera (JB):,0.688
Skew:,-0.224,Prob(JB):,0.709
Kurtosis:,3.547,Cond. No.,1940000.0


In [35]:
# Plot training time against publication date, one marker per model,
# with the system name as hover text and a log-scaled y axis.
training_time_trace = go.Scatter(
    x=training_time_df['Publication date'],
    y=training_time_df['Training time (hours)'],
    mode='markers',
    text=training_time_df['System']
)
training_time_layout = dict(
    title='Training times of notable models',
    xaxis_title='Publication date',
    yaxis_title='Training time (hours)',
    yaxis_type='log',
    width=800,
    height=600,
)

fig = go.Figure()
fig.add_trace(training_time_trace)
fig.update_layout(**training_time_layout)
fig.show()

In [36]:
# Median training time (hours) over the models with a reported training time.
training_hours = training_time_df['Training time (hours)']
np.median(training_hours)

793.5

In [37]:
# 5th percentile of training time (hours): lower end of the typical range.
training_hours = training_time_df['Training time (hours)']
np.percentile(training_hours, 5)

120.0

In [38]:
# 95th percentile of training time (hours): upper end of the typical range.
training_hours = training_time_df['Training time (hours)']
np.percentile(training_hours, 95)

4320.0