# Setup

In [40]:
import numpy as np
import pandas as pd

from plotting import *
from pprint import pprint

In [41]:
SECONDS_PER_HOUR = 60 * 60
SECONDS_PER_YEAR = SECONDS_PER_HOUR * 24 * 365.25

# Dataset

## ML systems

In [42]:
pcd_df = pd.read_csv('data/All ML Systems - full view.csv')
pcd_df.head()

Unnamed: 0,System,Domain,Task,Authors,Notability criteria,Notability criteria notes,Open-source,Link,Citations,Reference,...,Organization (from Organization),Base model,Finetune compute (FLOP),Finetune compute notes,Authors by country,Hardware quantity,Hardware utilization,Training cost trends,Training cloud compute vendor,Training data center
0,Cohere Command,Language,,,,,,https://cohere.com/models/command,,"World-class AI, at your command",...,Cohere,,,,,,,,,
1,Theseus,Other,Maze solving,Claude Shannon,Historical significance,,,https://www.technologyreview.com/2018/12/19/13...,0.0,Mighty Mouse,...,Bell Laboratories,,,,Theseus,,,,,
2,SNARC,Other,Maze solving,Marvin Minsky,Historical significance,,,https://en.wikipedia.org/wiki/Stochastic_neura...,33.0,A Neural-Analogue Calculator Based upon a Prob...,...,Harvard University,,,,SNARC,,,,,
3,Genetic algorithm,,,NA Barricelli,Historical significance,Possibly first computer simulation of a geneti...,,https://link.springer.com/article/10.1007/BF01...,266.0,Numerical testing of evolution theories,...,Institute for Advanced Study,,,,Genetic algorithm,,,,,
4,Sequence-based pattern recognition,Vision,Character recognition,O. G. Selfridge,Historical significance,,,https://dl.acm.org/doi/10.1145/1455292.1455310,290.0,Pattern recognition and modern computers,...,Massachusetts Institute of Technology (MIT),,,,Sequence-based pattern recognition,,,,,


In [43]:
# Publication date in datetime format
pcd_df.dropna(subset=['Publication date'], inplace=True)
pcd_df['Publication date'] = pd.to_datetime(pcd_df['Publication date'])

In [44]:
# Manually copied from "Training cost trends" Airtable
frontier_systems = [
    "PaLM 2",
    "GPT-4",
    "Minerva (540B)",
    "Megatron-Turing NLG 530B",
    "GPT-3 175B (davinci)",
    "Meena",
    "AlphaStar",
    "AlphaGo Zero",
    "AlphaGo Master",
    "GNMT",
    "Claude 2",
    "PaLM (540B)",
    "ERNIE 3.0 Titan",
    "Gopher (280B)",
    "OpenAI Five",
    "T5-11B",
    "Megatron-BERT",
    "ResNeXt-101 32x48d",
    "AlphaZero",
    "Falcon 180B",
    "GPT-3.5 (text-davinci-003)",
    "Chinchilla",
    "Yuan 1.0",
    "Turing-NLG",
    "BigGAN-deep 512x512",
    "NASv3 (CIFAR-10)",
    "AlphaGo Lee",
    "AlphaGo Fan",
    "OPT-175B",
    "AlphaCode",
    "GLaM",
    "OpenAI Five Rerun",
    "T5-3B",
    "Megatron-LM (8.3B)",
    "FTW",
    "AmoebaNet-A (F=448)",
    "OpenAI TI7 DOTA 1v1",
    "JFT",
    "Llama 2-70B",
    "LLaMA-65B",
    "LaMDA",
    "ALIGN",
    "GShard (dense)",
    "RoBERTa Large",
    "IMPALA",
]

In [45]:
frontier_pcd_df = pcd_df[pcd_df['System'].isin(frontier_systems)]
frontier_pcd_df.head()

Unnamed: 0,System,Domain,Task,Authors,Notability criteria,Notability criteria notes,Open-source,Link,Citations,Reference,...,Organization (from Organization),Base model,Finetune compute (FLOP),Finetune compute notes,Authors by country,Hardware quantity,Hardware utilization,Training cost trends,Training cloud compute vendor,Training data center
265,AlphaGo Fan,Games,Go,"David Silver, Aja Huang, Chris J. Maddison, Ar...",SOTA improvement,,,https://www.nature.com/articles/nature24270.ep...,14389.0,Mastering the game of Go with deep neural netw...,...,Google DeepMind,,,,AlphaGo Fan,,,AlphaGo Fan,,
275,AlphaGo Lee,Games,Go,"David Silver, Aja Huang, Chris J. Maddison, Ar...",Highly cited,,,https://www.nature.com/articles/nature16961,14389.0,Mastering the game of Go with deep neural netw...,...,DeepMind,,,,AlphaGo Lee,,,AlphaGo Lee,,
306,GNMT,Language,Translation,"Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc ...",Highly cited,,,https://arxiv.org/abs/1609.08144,5948.0,Google's Neural Machine Translation System: Br...,...,Google,,,,GNMT,96.0,,GNMT,,
317,NASv3 (CIFAR-10),Vision,,"Barret Zoph, Quoc V. Le",Highly cited,,,https://arxiv.org/abs/1611.01578,4569.0,Neural Architecture Search with Reinforcement ...,...,Google Brain,,,,NASv3 (CIFAR-10),800.0,,NASv3 (CIFAR-10),,
337,AlphaGo Master,Games,Go,"D Silver, J Schrittwieser, K Simonyan, I Anton...",Highly cited,,,https://www.researchgate.net/publication/32047...,7831.0,Mastering the game of Go without human knowledge,...,DeepMind,,,,AlphaGo Master,,,AlphaGo Master,,


In [46]:
assert len(frontier_pcd_df) == len(frontier_systems)

## Prices

In [47]:
price_df = pd.read_csv('data/Hardware prices.csv')
price_df.head()

Unnamed: 0,Price source,Price date,Hardware model,Manufacturer (from Hardware model),Vendor,Location,Price per chip-hour (on-demand),Price per chip-hour (1-year CUD),Price per chip-hour (3-year CUD),Price (hardware purchase)
0,https://web.archive.org/web/20181009102635/htt...,2018-10-09,Google TPU v2,Google,Google Cloud,US,$1.13,,,
1,https://web.archive.org/web/20181011013513/htt...,2018-10-11,Google TPU v3,Google,Google Cloud,US,$2.00,,,
2,https://web.archive.org/web/20181011013513/htt...,2018-10-11,Google TPU v3,Google,Google Cloud,Europe,$2.20,,,
3,https://web.archive.org/web/20190701021000/htt...,2019-07-01,Google TPU v3,Google,Google Cloud,Iowa (us-central1),$2.00,$1.26,$0.90,
4,https://web.archive.org/web/20190728061708/htt...,2019-07-28,Google TPU v3,Google,Google Cloud,Netherlands (europe-west4),$2.00,$1.26,$0.90,


In [48]:
# Price date in datetime format
price_df.dropna(subset=['Price date'], inplace=True)
price_df['Price date'] = pd.to_datetime(price_df['Price date'])

In [49]:
pcd_hardware_model_colname = 'Name of the hardware (from Training hardware)'

# Imputation

Training hardware: "best" available

FLOP/second (flops) utilization: mean of known values

TODO: random sample from distribution, combined with bootstrapping

Number of chip-hours

If num_chips AND training_time_hours known: chip_hours = num_chips * training_time_hours

Else if num_chips unknown: 

Else if training_time_hours unknown: 

Else (both unknown): chip_hours = training_compute_flop / (chip_flops * flops_utilization * SECONDS_PER_HOUR)

In [50]:
chip_hours = []
for i, row in frontier_pcd_df.iterrows():
    if pd.isna(row['Hardware quantity']) or pd.isna(row['Training time (hours)']):
        # TODO impute missing values
        chip_hours.append(np.nan)
    else:
        chip_hours.append(row['Hardware quantity'] * row['Training time (hours)'])

frontier_pcd_df['Training time (chip hours)'] = chip_hours



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



# Price selection

1. Use a fixed mapping from Organization to cloud provider. If no mapping found, default to "Amazon Web Services".
2. If there's a match for the hardware model, use that. Else, discard the ML system from the dataset.
3. Use the price that is nearest to, but prior to, training time + 2 months before the publication date
4. If there are no prices prior to that time, use the nearest price after that time
5. If there are no prices for that hardware model and cloud provider at all, repeat steps 3 and 4 for "Microsoft Azure", then "Google Cloud" as the cloud provider.
6. If there are no prices found from step 5, discard the ML system from the dataset.

In [51]:
def find_closest_price_dates(vendor, hardware_model, date, df):
    """
    Finds the row in the DataFrame with the closest 'Price date' to the given date
    which has a 'Vendor' equal to the specified vendor and a 'Hardware model' equal to the specified hardware_model.
    
    :param vendor: The vendor to match.
    :param hardware_model: The hardware model to match.
    :param date: The target date to find the closest 'Price date' to.
    :param df: The DataFrame containing the hardware price data.
    :return: The row from the DataFrame that matches the criteria.
    """
    # Filter the DataFrame based on vendor and hardware model
    filtered_df = df[(df['Vendor'] == vendor) & (df['Hardware model'] == hardware_model)]

    # Convert the target date to datetime
    target_date = pd.to_datetime(date)

    # Find the row with the smallest date difference
    closest_row_df = filtered_df.iloc[(filtered_df['Price date'] - target_date).abs().argsort()]

    return closest_row_df

# Example usage
example_vendor = "Google Cloud"
example_hardware_model = "Google TPU v3"
example_date = "2019-07-15" # Example date, format should be YYYY-MM-DD

# Find the row
closest_row_df = find_closest_price_dates(example_vendor, example_hardware_model, example_date, price_df)

for i, row in closest_row_df.iterrows():
    if row['Price date'] <= pd.to_datetime(example_date):
        print(f"Price date: {row['Price date']}")
        print(f"Price: {row['Price per chip-hour (1-year CUD)']}")
        break

Price date: 2019-07-01 00:00:00
Price: $1.26


In [52]:
org_to_cloud_vendor = {
    'google': 'Google Cloud',
    'deepmind': 'Google Cloud',
    'microsoft': 'Microsoft Azure',
    'openai': 'Microsoft Azure',
}

In [53]:
price_colname = 'Price per chip-hour (1-year CUD)'
system_to_price = {}

for i, row in frontier_pcd_df.iterrows():
    # Subtract training time plus 2 months from publication date
    if pd.isna(row['Training time (hours)']):
        continue

    training_time_offset = pd.Timedelta(hours=int(row['Training time (hours)']))
    low_buffer_time_offset = pd.Timedelta(days=30)
    mid_buffer_time_offset = pd.Timedelta(days=60)
    high_buffer_time_offset = pd.Timedelta(days=150)

    high_purchase_time = row['Publication date'] - (training_time_offset + low_buffer_time_offset)
    mid_purchase_time = row['Publication date'] - (training_time_offset + mid_buffer_time_offset)
    low_purchase_time = row['Publication date'] - (training_time_offset + high_buffer_time_offset)

    hardware_model = row[pcd_hardware_model_colname]

    orgs = row['Organization'].split(',')
    vendor = None
    for org in orgs:
        for key in org_to_cloud_vendor:
            if key in org.lower():
                vendor = org_to_cloud_vendor[key]
                break

    if vendor is None:
        vendor = 'Amazon Web Services'  # default

    print(f"{row['System']}, {vendor}, {hardware_model}, at {mid_purchase_time}")

    # Find the price of the hardware at the time of purchase
    closest_price_dates_df = find_closest_price_dates(vendor, hardware_model, mid_purchase_time, price_df)
    price_per_chip_hour = None
    for i, price_row in closest_price_dates_df.iterrows():
        if price_row['Price date'] <= mid_purchase_time:
            price_per_chip_hour = price_row[price_colname]
            break
    if price_per_chip_hour is None:
        for i, price_row in closest_price_dates_df.iterrows():
            if price_row['Price date'] > mid_purchase_time:
                price_per_chip_hour = price_row[price_colname]
                break
    if type(price_per_chip_hour) is str:
        system_to_price[row['System']] = float(price_per_chip_hour.strip('$'))
    else:
        print(f"Could not find price")
        continue

system_to_price

GNMT, Google Cloud, NVIDIA Tesla K80, at 2016-01-30 00:00:00
Could not find price
JFT, Google Cloud, NVIDIA Tesla K80, at 2017-03-12 00:00:00
Could not find price
AlphaGo Zero, Google Cloud, Google TPU v1, at 2017-07-30 00:00:00
Could not find price
AmoebaNet-A (F=448), Google Cloud, NVIDIA Tesla K40s, at 2017-11-30 00:00:00
Could not find price
IMPALA, Google Cloud, NVIDIA P100, at 2017-12-02 20:00:00
Could not find price
BigGAN-deep 512x512, Google Cloud, Google TPU v3, at 2018-07-28 00:00:00
Could not find price
RoBERTa Large, Amazon Web Services, NVIDIA Tesla V100 DGXS 32 GB, at 2019-04-27 00:00:00
Megatron-BERT, Amazon Web Services, NVIDIA Tesla V100S PCIe 32 GB, at 2019-05-22 00:00:00
Could not find price
Megatron-LM (8.3B), Amazon Web Services, NVIDIA Tesla V100 DGXS 32 GB, at 2019-07-05 09:00:00
T5-11B, Google Cloud, Google TPU v3, at 2019-08-03 23:00:00
AlphaStar, Google Cloud, Google TPU v3, at 2019-07-18 00:00:00
OpenAI Five, Microsoft Azure, nan, at 2018-12-22 00:00:00
Coul

{'RoBERTa Large': 2.29,
 'Megatron-LM (8.3B)': 2.29,
 'T5-11B': 1.26,
 'AlphaStar': 1.26,
 'Meena': 1.26,
 'GShard (dense)': 1.26,
 'ALIGN': 1.26,
 'Megatron-Turing NLG 530B': 2.62,
 'Gopher (280B)': 1.26,
 'GLaM': 2.03,
 'LaMDA': 1.26,
 'PaLM (540B)': 2.03,
 'OPT-175B': 3.0,
 'Minerva (540B)': 2.03,
 'Llama 2-70B': 3.0,
 'Falcon 180B': 2.4}

# Cost estimation

TODO: inflation adjustment

cost = price_per_chip_hour * chip_hours

In [54]:
def estimate_cost(row, system_to_price):
    system = row['System']
    price = system_to_price.get(system)
    if price is None:
        return None

    chip_hours = row['Training time (chip hours)']
    if np.isnan(chip_hours):
        return None

    cost = price * chip_hours

    # Check for base model
    if not pd.isna(row['Base model']):
        base_model_name = row['Base model']
        base_model = frontier_pcd_df[frontier_pcd_df['System'] == base_model_name].squeeze()
        base_cost = estimate_cost(base_model, system_to_price)
        if base_cost is None:
            return None
        else:
            cost += base_cost

    return cost
    

In [55]:
system_to_cost = {}
for i, row in frontier_pcd_df.iterrows():
    cost = estimate_cost(row, system_to_price)
    if cost is None:
        continue
    system_to_cost[row['System']] = cost

system_to_cost

{'RoBERTa Large': 281395.2,
 'Megatron-LM (8.3B)': 383400.96,
 'T5-11B': 310883.328,
 'AlphaStar': 510935.04,
 'Meena': 928972.8,
 'GShard (dense)': 1300561.92,
 'ALIGN': 224050.176,
 'Megatron-Turing NLG 530B': 9037952.0,
 'Gopher (280B)': 4748083.2,
 'GLaM': 2839531.5199999996,
 'LaMDA': 1786982.4,
 'PaLM (540B)': 17062133.759999998,
 'OPT-175B': 2437632.0,
 'Minerva (540B)': 18508922.88,
 'Falcon 180B': 42467328.0}

# Regression

# Plots

In [56]:
import plotly.graph_objects as go

fig = go.Figure()
for i, row in frontier_pcd_df.iterrows():
    system = row['System']
    cost = system_to_cost.get(system)
    if cost is None:
        continue
    publication_date = row['Publication date']

    fig.add_trace(go.Scatter(
        x=[publication_date],
        y=[cost],
        name=system,
        text=system,
        textposition='top center',
        line=dict(color='#034752'),
        mode='markers+text',
    ))

# log y axis
fig.update_yaxes(type="log")
# no legend
fig.update_layout(showlegend=False)

# axis labels
fig.update_xaxes(title_text='Publication date')
fig.update_yaxes(title_text='Cost (USD, nominal)')

# title
fig.update_layout(title_text='Cost of cloud compute to train frontier ML systems')

# update size
fig.update_layout(
    autosize=False,
    width=800,
    height=600,
)

# font size
fig.update_layout(
    font=dict(
        size=14,
    )
)

# axis limits
fig.update_xaxes(range=['2018-01-01', '2025-01-01'])
fig.update_yaxes(range=[5, 8])

# margins
fig.update_layout(margin=dict(l=10, r=10, t=40, b=10))

save_plot(fig, 'results/', 'cost_scatter')

fig.show()

In [57]:
import plotly.graph_objects as go

fig = go.Figure()
for i, row in frontier_pcd_df.iterrows():
    system = row['System']
    cost = system_to_cost.get(system)
    if cost is None:
        continue
    publication_date = row['Publication date']

    fig.add_trace(go.Bar(
        x=[system],
        y=[cost],
        name=system,
        # nice blue color
        marker_color='#034752',
        # text=system,
        # textposition='auto',
    ))

# log y axis
fig.update_yaxes(type="log")
# no legend
fig.update_layout(showlegend=False)

# axis labels
fig.update_xaxes(title_text='System')
fig.update_yaxes(title_text='Cost (USD, nominal)')

# title
fig.update_layout(title_text='Cost of cloud compute to train frontier ML systems')

# update size
fig.update_layout(
    autosize=False,
    width=800,
    height=600,
)

# font size
fig.update_layout(
    font=dict(
        size=14,
    )
)

# axis limits
fig.update_yaxes(range=[0, 8])

# margins
fig.update_layout(margin=dict(l=10, r=10, t=40, b=10))

fig.show()


## kNN imputation

In [58]:
# drop unneeded columns from frontier_pcd_df
irrelevant_columns = ['Notability criteria', 'Notability criteria notes', 'Link', 'Citations', 'Parameters notes',
                      'Training compute notes', 'Training dataset notes', 'Dataset size notes',
                      'Inference compute notes', 'Approach', 'Confidence', 'Last modified', 'Created By', 'Benchmark data',
                      'Exclude', 'Authors by country', 'Training cost trends', 'Abstract', 'Compute cost notes',
                      'Training time notes', 'Authors', 'Name of the hardware (from Training hardware)',
                      'Training time (chip hours)', 'Training compute cost (2020 USD)']
frontier_pcd_df = frontier_pcd_df.drop(columns=irrelevant_columns)

# fill column 'Training cloud compute vendor' using org_to_cloud_vendor dictionary
org_to_cloud_vendor = {
    'Google': 'Google Cloud',
    'DeepMind': 'Google Cloud',
    'Google DeepMind': 'Google Cloud',
    'Google Brain': 'Google Cloud',
    'Microsoft': 'Microsoft Azure',
    'OpenAI': 'Microsoft Azure',
}
frontier_pcd_df['Training cloud compute vendor'] = frontier_pcd_df['Organization (from Organization)'].map(org_to_cloud_vendor)
frontier_pcd_df['Training cloud compute vendor'] = frontier_pcd_df['Training cloud compute vendor'].fillna('Amazon Web Services')

In [59]:
frontier_pcd_df.head()

Unnamed: 0,System,Domain,Task,Open-source,Reference,Publication date,Organization,Organization categorization,Parameters,Training compute (FLOP),...,Compute sponsor categorization,Country (from Organization),Organization (from Organization),Base model,Finetune compute (FLOP),Finetune compute notes,Hardware quantity,Hardware utilization,Training cloud compute vendor,Training data center
265,AlphaGo Fan,Games,Go,,Mastering the game of Go with deep neural netw...,2015-10-01,Google DeepMind,Industry,8209984.0,3.8e+20,...,Industry,Multinational,Google DeepMind,,,,,,Google Cloud,
275,AlphaGo Lee,Games,Go,,Mastering the game of Go with deep neural netw...,2016-01-27,DeepMind,Industry,,1.9e+21,...,Industry,United Kingdom of Great Britain and Northern I...,DeepMind,,,,,,Google Cloud,
306,GNMT,Language,Translation,,Google's Neural Machine Translation System: Br...,2016-09-26,Google,Industry,278000000.0,6.9e+21,...,Industry,Multinational,Google,,,,96.0,,Google Cloud,
317,NASv3 (CIFAR-10),Vision,,,Neural Architecture Search with Reinforcement ...,2016-11-05,Google Brain,Industry,37400000.0,2.2e+21,...,Industry,Multinational,Google Brain,,,,800.0,,Google Cloud,
337,AlphaGo Master,Games,Go,,Mastering the game of Go without human knowledge,2017-01-01,DeepMind,Industry,,1.5e+23,...,Industry,United Kingdom of Great Britain and Northern I...,DeepMind,,,,,,Google Cloud,


In [60]:
frontier_pcd_df.columns.tolist()

['System',
 'Domain',
 'Task',
 'Open-source',
 'Reference',
 'Publication date',
 'Organization',
 'Organization categorization',
 'Parameters',
 'Training compute (FLOP)',
 'Training dataset',
 'Training dataset size (datapoints)',
 'Epochs',
 'Inference compute (FLOP)',
 'Training time (hours)',
 'Training hardware',
 'Compute sponsor categorization',
 'Country (from Organization)',
 'Organization (from Organization)',
 'Base model',
 'Finetune compute (FLOP)',
 'Finetune compute notes',
 'Hardware quantity',
 'Hardware utilization',
 'Training cloud compute vendor',
 'Training data center']

In [61]:
# Import KNNImputer from sklearn
from sklearn.impute import KNNImputer

In [62]:
# instantiate the imputer
imputer = KNNImputer(n_neighbors=5)

In [63]:
# convert datetime to float
frontier_pcd_df['Publication date'] = frontier_pcd_df['Publication date'].dt.year + (frontier_pcd_df['Publication date'].dt.month-1) / 12 + (frontier_pcd_df['Publication date'].dt.day-1) / 365

# set the System column as the index
frontier_pcd_df = frontier_pcd_df.set_index('System')

# Identify categorical columns
categorical_cols = frontier_pcd_df.select_dtypes(include=['object', 'category']).columns.tolist()

# one-hot encode all categorical columns
one_hot_pcd_df = pd.get_dummies(frontier_pcd_df, columns=categorical_cols)

In [64]:
# impute the missing values in Training hardware, hardware quantity, Training time (hours)
imputed = imputer.fit_transform(one_hot_pcd_df)

In [65]:
# convert the numpy array back to a dataframe
imputed_pcd_df = pd.DataFrame(imputed, columns=one_hot_pcd_df.columns)

In [66]:
# convert Training hardware back to categorical
imputed_pcd_df['Training hardware'] = ''
for col in imputed_pcd_df.columns:
    if col.startswith('Training hardware_'):
        training_hardware = col.split('Training hardware_')[1]
        imputed_pcd_df['Training hardware'] = imputed_pcd_df['Training hardware'] + pd.Series([int(_) * training_hardware for _ in imputed_pcd_df[col]])

# replace all '' with np.nan
imputed_pcd_df['Training hardware'] = imputed_pcd_df['Training hardware'].replace('', np.nan)

In [67]:
# use KNeighborsClassifier to impute the missing values in Training hardware
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder

# Separate the target and features
target_col = 'Training hardware'
features = imputed_pcd_df.drop(target_col, axis=1)
target = imputed_pcd_df[target_col]

# Encode the target column
label_encoder = LabelEncoder()
target_filled = target.fillna('Missing')  # Temporarily fill missing values
target_encoded = label_encoder.fit_transform(target_filled)

# Train a KNeighborsClassifier
knc = KNeighborsClassifier(n_neighbors=5)
knc.fit(features, target_encoded)

# Predict the missing values
missing_values = features[target.isna()]
predicted = knc.predict(missing_values)

# Decode the predictions
predicted_labels = label_encoder.inverse_transform(predicted)

# Replace the missing values with the predictions
imputed_pcd_df.loc[target.isna(), target_col] = predicted_labels





In [68]:
# restore the System column
imputed_pcd_df['System'] = one_hot_pcd_df.index

In [69]:
# set the System column as the index
imputed_pcd_df = imputed_pcd_df.set_index('System')

In [70]:
# insert imputed values into frontier_pcd_df
frontier_pcd_df['Training hardware'] = imputed_pcd_df['Training hardware']
frontier_pcd_df['Hardware quantity'] = imputed_pcd_df['Hardware quantity']
frontier_pcd_df['Hardware utilization'] = imputed_pcd_df['Hardware utilization']
frontier_pcd_df['Training time (hours)'] = imputed_pcd_df['Training time (hours)']

In [71]:
# calculate training time (chip hours) from training time and hardware quantity
frontier_pcd_df['Training time (chip hours)'] = frontier_pcd_df['Training time (hours)'] * frontier_pcd_df['Hardware quantity']

In [72]:
frontier_pcd_df.head()

Unnamed: 0_level_0,Domain,Task,Open-source,Reference,Publication date,Organization,Organization categorization,Parameters,Training compute (FLOP),Training dataset,...,Country (from Organization),Organization (from Organization),Base model,Finetune compute (FLOP),Finetune compute notes,Hardware quantity,Hardware utilization,Training cloud compute vendor,Training data center,Training time (chip hours)
System,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AlphaGo Fan,Games,Go,,Mastering the game of Go with deep neural netw...,2015.75,Google DeepMind,Industry,8209984.0,3.8e+20,,...,Multinational,Google DeepMind,,,,311.4,,Google Cloud,,116837.28
AlphaGo Lee,Games,Go,,Mastering the game of Go with deep neural netw...,2016.071233,DeepMind,Industry,,1.9e+21,,...,United Kingdom of Great Britain and Northern I...,DeepMind,,,,311.4,,Google Cloud,,116837.28
GNMT,Language,Translation,,Google's Neural Machine Translation System: Br...,2016.73516,Google,Industry,278000000.0,6.9e+21,,...,Multinational,Google,,,,96.0,,Google Cloud,,414720.0
NASv3 (CIFAR-10),Vision,,,Neural Architecture Search with Reinforcement ...,2016.844292,Google Brain,Industry,37400000.0,2.2e+21,,...,Multinational,Google Brain,,,,800.0,,Google Cloud,,300160.0
AlphaGo Master,Games,Go,,Mastering the game of Go without human knowledge,2017.0,DeepMind,Industry,,1.5e+23,,...,United Kingdom of Great Britain and Northern I...,DeepMind,,,,1543.6,,Google Cloud,,3201395.528


In [73]:
frontier_pcd_df['System'] = frontier_pcd_df.index

In [74]:
# calculate training compute cost from training time and hardware
system_to_cost = {}
for i, row in frontier_pcd_df.iterrows():
    cost = estimate_cost(row, system_to_price)
    if cost is None:
        continue
    system_to_cost[row['System']] = cost

system_to_cost

{'RoBERTa Large': 281395.2,
 'Megatron-LM (8.3B)': 383400.96,
 'T5-11B': 310883.328,
 'AlphaStar': 510935.04,
 'Meena': 928972.8,
 'GShard (dense)': 1300561.92,
 'ALIGN': 224050.176,
 'Megatron-Turing NLG 530B': 9037952.0,
 'Gopher (280B)': 4748083.2,
 'GLaM': 2839531.5199999996,
 'LaMDA': 1786982.4,
 'PaLM (540B)': 17062133.759999998,
 'OPT-175B': 2437632.0,
 'Minerva (540B)': 18508922.88,
 'Llama 2-70B': 33841152.0,
 'Falcon 180B': 42467328.0}