# Setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd

from plotting import *
from prices import *
from inflation import *

# Dataset

## ML systems

In [3]:
pcd_df = pd.read_csv('data/All ML Systems - full view.csv')
pcd_df.head()

Unnamed: 0,System,Domain,Task,Authors,Notability criteria,Notability criteria notes,Open-source,Link,Citations,Reference,...,Organization (from Organization),Base model,Finetune compute (FLOP),Finetune compute notes,Authors by country,Hardware quantity,Hardware utilization,Training cost trends,Training cloud compute vendor,Training data center
0,Cohere Command,Language,,,,,,https://cohere.com/models/command,,"World-class AI, at your command",...,Cohere,,,,,,,,,
1,Theseus,Other,Maze solving,Claude Shannon,Historical significance,,,https://www.technologyreview.com/2018/12/19/13...,0.0,Mighty Mouse,...,Bell Laboratories,,,,Theseus,,,,,
2,SNARC,Other,Maze solving,Marvin Minsky,Historical significance,,,https://en.wikipedia.org/wiki/Stochastic_neura...,33.0,A Neural-Analogue Calculator Based upon a Prob...,...,Harvard University,,,,SNARC,,,,,
3,Genetic algorithm,,,NA Barricelli,Historical significance,Possibly first computer simulation of a geneti...,,https://link.springer.com/article/10.1007/BF01...,266.0,Numerical testing of evolution theories,...,Institute for Advanced Study,,,,Genetic algorithm,,,,,
4,Sequence-based pattern recognition,Vision,Character recognition,O. G. Selfridge,Historical significance,,,https://dl.acm.org/doi/10.1145/1455292.1455310,290.0,Pattern recognition and modern computers,...,Massachusetts Institute of Technology (MIT),,,,Sequence-based pattern recognition,,,,,


In [4]:
# Publication date in datetime format
pcd_df.dropna(subset=['Publication date'], inplace=True)
pcd_df['Publication date'] = pd.to_datetime(pcd_df['Publication date'])

In [5]:
# Manually copied from "Training cost trends" Airtable
frontier_systems = [
    "PaLM 2",
    "GPT-4",
    "Minerva (540B)",
    "Megatron-Turing NLG 530B",
    "GPT-3 175B (davinci)",
    "Meena",
    "AlphaStar",
    "AlphaGo Zero",
    "AlphaGo Master",
    "GNMT",
    "Claude 2",
    "PaLM (540B)",
    "ERNIE 3.0 Titan",
    "Gopher (280B)",
    "OpenAI Five",
    "T5-11B",
    "Megatron-BERT",
    "ResNeXt-101 32x48d",
    "AlphaZero",
    "Falcon 180B",
    "GPT-3.5 (text-davinci-003)",
    "Chinchilla",
    "Yuan 1.0",
    "Turing-NLG",
    "BigGAN-deep 512x512",
    "NASv3 (CIFAR-10)",
    "AlphaGo Lee",
    "AlphaGo Fan",
    "OPT-175B",
    "AlphaCode",
    "GLaM",
    "OpenAI Five Rerun",
    "T5-3B",
    "Megatron-LM (8.3B)",
    "FTW",
    "AmoebaNet-A (F=448)",
    "OpenAI TI7 DOTA 1v1",
    "JFT",
    "Llama 2-70B",
    "LLaMA-65B",
    "LaMDA",
    "ALIGN",
    "GShard (dense)",
    "RoBERTa Large",
    "IMPALA",
]

In [6]:
frontier_pcd_df = pcd_df[pcd_df['System'].isin(frontier_systems)]
frontier_pcd_df.head()

Unnamed: 0,System,Domain,Task,Authors,Notability criteria,Notability criteria notes,Open-source,Link,Citations,Reference,...,Organization (from Organization),Base model,Finetune compute (FLOP),Finetune compute notes,Authors by country,Hardware quantity,Hardware utilization,Training cost trends,Training cloud compute vendor,Training data center
265,AlphaGo Fan,Games,Go,"David Silver, Aja Huang, Chris J. Maddison, Ar...",SOTA improvement,,,https://www.nature.com/articles/nature24270.ep...,14389.0,Mastering the game of Go with deep neural netw...,...,Google DeepMind,,,,AlphaGo Fan,,,AlphaGo Fan,,
275,AlphaGo Lee,Games,Go,"David Silver, Aja Huang, Chris J. Maddison, Ar...",Highly cited,,,https://www.nature.com/articles/nature16961,14389.0,Mastering the game of Go with deep neural netw...,...,DeepMind,,,,AlphaGo Lee,,,AlphaGo Lee,,
306,GNMT,Language,Translation,"Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc ...",Highly cited,,,https://arxiv.org/abs/1609.08144,5948.0,Google's Neural Machine Translation System: Br...,...,Google,,,,GNMT,96.0,,GNMT,,
317,NASv3 (CIFAR-10),Vision,,"Barret Zoph, Quoc V. Le",Highly cited,,,https://arxiv.org/abs/1611.01578,4569.0,Neural Architecture Search with Reinforcement ...,...,Google Brain,,,,NASv3 (CIFAR-10),800.0,,NASv3 (CIFAR-10),,
337,AlphaGo Master,Games,Go,"D Silver, J Schrittwieser, K Simonyan, I Anton...",Highly cited,,,https://www.researchgate.net/publication/32047...,7831.0,Mastering the game of Go without human knowledge,...,DeepMind,,,,AlphaGo Master,,,AlphaGo Master,,


In [7]:
assert len(frontier_pcd_df) == len(frontier_systems)

## Prices

In [8]:
price_df = pd.read_csv('data/Hardware prices.csv')
price_df.head()

Unnamed: 0,Price source,Price date,Hardware model,Manufacturer (from Hardware model),Vendor,Location,Price per chip-hour (on-demand),Price per chip-hour (1-year CUD),Price per chip-hour (3-year CUD),Price (hardware purchase)
0,https://web.archive.org/web/20181009102635/htt...,2018-10-09,Google TPU v2,Google,Google Cloud,US,$1.13,,,
1,https://web.archive.org/web/20181011013513/htt...,2018-10-11,Google TPU v3,Google,Google Cloud,US,$2.00,,,
2,https://web.archive.org/web/20181011013513/htt...,2018-10-11,Google TPU v3,Google,Google Cloud,Europe,$2.20,,,
3,https://web.archive.org/web/20190701021000/htt...,2019-07-01,Google TPU v3,Google,Google Cloud,Iowa (us-central1),$2.00,$1.26,$0.90,
4,https://web.archive.org/web/20190728061708/htt...,2019-07-28,Google TPU v3,Google,Google Cloud,Netherlands (europe-west4),$2.00,$1.26,$0.90,


In [9]:
# Price date in datetime format
price_df.dropna(subset=['Price date'], inplace=True)
price_df['Price date'] = pd.to_datetime(price_df['Price date'])

In [10]:
pcd_hardware_model_colname = 'Name of the hardware (from Training hardware)'

## Hardware data

In [11]:
hardware_df = pd.read_csv('data/Chip dataset-Grid view.csv')
hardware_df.head()

Unnamed: 0,Name of the hardware,Manufacturer,Type,Release date,Release price (USD),FP64 (double precision) Performance (FLOP/s),FP32 (single precision) Performance (FLOP/s),FP16 (half precision) Performance (FLOP/s),Tensor Float 32 (TF32),FP16 Tensor Core,...,Foundry,Number of transistors in million,Prominent Years of usage,Google Cloud pricing ($ per hour) data from 17 dec 2022,Link to datasheet,Source for the Price,ALL ML SYSTEMS,All ML Systems copy,All ML Systems copy.1,Hardware prices
0,3dfx Spectre 1000,Other,GPU,,,,,,,,...,TSMC,30.0,,,,,,,,
1,3dfx Spectre 2000,Other,GPU,,,,,,,,...,TSMC,30.0,,,,,,,,
2,3dfx Spectre 3000,Other,GPU,,,,,,,,...,TSMC,30.0,,,,,,,,
3,3dfx Voodoo4 4000 AGP,Other,GPU,,,,,,,,...,TSMC,14.0,,,,,,,,
4,3dfx Voodoo4 4500 AGP,Other,GPU,2000-10-13,,,,,,,...,TSMC,14.0,,,,,,,,


# Imputation

Training hardware: "best" available

FLOP/second (flops) utilization: mean of known values

TODO: random sample from distribution, combined with bootstrapping

Number of chip-hours

If num_chips AND training_time_hours known: chip_hours = num_chips * training_time_hours

Else if num_chips unknown: 

Else if training_time_hours unknown: 

Else (both unknown): chip_hours = training_compute_flop / (chip_flops * flops_utilization * SECONDS_PER_HOUR)

In [12]:
chip_hours = []
for i, row in frontier_pcd_df.iterrows():
    if pd.isna(row['Hardware quantity']) or pd.isna(row['Training time (hours)']):
        # TODO impute missing values
        chip_hours.append(np.nan)
    else:
        chip_hours.append(row['Hardware quantity'] * row['Training time (hours)'])

frontier_pcd_df['Training time (chip hours)'] = chip_hours

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frontier_pcd_df['Training time (chip hours)'] = chip_hours


# Price selection

1. Use a fixed mapping from Organization to cloud provider. If no mapping found, default to "Amazon Web Services".
2. If there's a match for the hardware model, use that. Else, discard the ML system from the dataset.
3. Use the price that is nearest to, but prior to, training time + 2 months before the publication date
4. If there are no prices prior to that time, use the nearest price after that time
5. If there are no prices for that hardware model and cloud provider at all, repeat steps 3 and 4 for "Microsoft Azure", then "Google Cloud" as the cloud provider.
6. If there are no prices found from step 5, discard the ML system from the dataset.

In [13]:
# Example usage
example_vendor = "Google Cloud"
example_hardware_model = "Google TPU v3"
example_date = "2019-07-15" # Example date, format should be YYYY-MM-DD

# Find the row
closest_row_df = find_closest_price_dates(example_vendor, example_hardware_model, example_date, price_df)

for i, row in closest_row_df.iterrows():
    if row['Price date'] <= pd.to_datetime(example_date):
        print(f"Price date: {row['Price date']}")
        print(f"Price: {row['Price per chip-hour (1-year CUD)']}")
        break    

Price date: 2019-07-01 00:00:00
Price: $1.26


In [14]:
org_to_cloud_vendor = {
    'google': 'Google Cloud',
    'deepmind': 'Google Cloud',
    'microsoft': 'Microsoft Azure',
    'openai': 'Microsoft Azure',
}

In [15]:
price_colname = 'Price per chip-hour (1-year CUD)'
system_to_price = {}

for i, row in frontier_pcd_df.iterrows():
    price = find_price(row, price_df, hardware_df, pcd_hardware_model_colname, price_colname, org_to_cloud_vendor)
    if price is None:
        continue
    else:
        system_to_price[row['System']] = price

system_to_price

==== System: AlphaGo Fan ====
Training time is required but no value found

==== System: AlphaGo Lee ====
Training time is required but no value found

==== System: GNMT ====
Trying NVIDIA Tesla K80 at 2016-01-30 00:00:00
Trying Google Cloud, Price per chip-hour (1-year CUD)
Could not find price
Trying Google Cloud, Price per chip-hour (3-year CUD)
Could not find price
Trying Google Cloud, Price per chip-hour (on-demand)
Found price: 1.4

==== System: NASv3 (CIFAR-10) ====
Training time is required but no value found

==== System: AlphaGo Master ====
Training time is required but no value found

==== System: JFT ====
Trying NVIDIA Tesla K80 at 2017-03-12 00:00:00
Trying Google Cloud, Price per chip-hour (1-year CUD)
Could not find price
Trying Google Cloud, Price per chip-hour (3-year CUD)
Could not find price
Trying Google Cloud, Price per chip-hour (on-demand)
Found price: 1.4

==== System: OpenAI TI7 DOTA 1v1 ====
Training time is required but no value found

==== System: AlphaGo Ze

{'GNMT': 1.4,
 'JFT': 1.4,
 'AmoebaNet-A (F=448)': 0.05271257202460601,
 'IMPALA': 1.46,
 'BigGAN-deep 512x512': 2.0,
 'RoBERTa Large': 2.29,
 'Megatron-BERT': 0.1051277082983274,
 'Megatron-LM (8.3B)': 2.29,
 'T5-11B': 1.26,
 'AlphaStar': 1.26,
 'Meena': 1.26,
 'GPT-3 175B (davinci)': 2.29,
 'GShard (dense)': 1.26,
 'ALIGN': 1.26,
 'Megatron-Turing NLG 530B': 2.62,
 'Gopher (280B)': 1.26,
 'GLaM': 2.03,
 'LaMDA': 1.26,
 'PaLM (540B)': 2.03,
 'OPT-175B': 3.0,
 'Minerva (540B)': 2.03,
 'LLaMA-65B': 2.35,
 'GPT-4': 2.4,
 'Llama 2-70B': 3.0,
 'Falcon 180B': 2.4}

In [16]:
len(system_to_price)

25

# Cost estimation

TODO: inflation adjustment

cost = price_per_chip_hour * chip_hours

In [17]:
def estimate_cost(row, system_to_price):
    system = row['System']
    price = system_to_price.get(system)
    if price is None:
        return None

    chip_hours = row['Training time (chip hours)']
    if np.isnan(chip_hours):
        return None

    cost = price * chip_hours

    # Check for base model
    if not pd.isna(row['Base model']):
        base_model_name = row['Base model']
        base_model = frontier_pcd_df[frontier_pcd_df['System'] == base_model_name].squeeze()
        base_cost = estimate_cost(base_model, system_to_price)
        if base_cost is None:
            return None
        else:
            cost += base_cost

    return cost
    

In [18]:
system_to_cost = {}
for i, row in frontier_pcd_df.iterrows():
    cost = estimate_cost(row, system_to_price)
    if cost is None and row['System'] in system_to_price:
        print(f"Could not estimate cost for {row['System']}")
        continue
    system_to_cost[row['System']] = cost

system_to_cost

Could not estimate cost for Llama 2-70B


{'AlphaGo Fan': None,
 'AlphaGo Lee': None,
 'GNMT': 580608.0,
 'NASv3 (CIFAR-10)': None,
 'AlphaGo Master': None,
 'JFT': 100800.0,
 'OpenAI TI7 DOTA 1v1': None,
 'AlphaGo Zero': None,
 'AlphaZero': None,
 'AmoebaNet-A (F=448)': 3985.0704450602143,
 'IMPALA': 146.0,
 'ResNeXt-101 32x48d': None,
 'FTW': None,
 'BigGAN-deep 512x512': 24576.0,
 'RoBERTa Large': 281395.2,
 'Megatron-BERT': 74924.93821505114,
 'Megatron-LM (8.3B)': 383400.96,
 'T5-3B': None,
 'T5-11B': 310883.328,
 'AlphaStar': 510935.04,
 'OpenAI Five': None,
 'OpenAI Five Rerun': None,
 'Meena': 928972.8,
 'Turing-NLG': None,
 'GPT-3 175B (davinci)': 8134080.0,
 'GShard (dense)': 1300561.92,
 'ALIGN': 224050.176,
 'Megatron-Turing NLG 530B': 9037952.0,
 'Yuan 1.0': None,
 'Gopher (280B)': 4748083.2,
 'GLaM': 2839531.5199999996,
 'ERNIE 3.0 Titan': None,
 'AlphaCode': None,
 'LaMDA': 1786982.4,
 'Chinchilla': None,
 'PaLM (540B)': 17062133.759999998,
 'OPT-175B': 2437632.0,
 'Minerva (540B)': 18508922.88,
 'GPT-3.5 (text-

In [19]:
len(system_to_cost)

44

# Regression

# Plots

In [20]:
import plotly.graph_objects as go

fig = go.Figure()
for i, row in frontier_pcd_df.iterrows():
    system = row['System']
    cost = system_to_cost.get(system)
    if cost is None:
        continue
    publication_date = row['Publication date']

    fig.add_trace(go.Scatter(
        x=[publication_date],
        y=[cost],
        name=system,
        text=system,
        textposition='top center',
        line=dict(color='#034752'),
        mode='markers+text',
    ))

# log y axis
fig.update_yaxes(type="log")
# no legend
fig.update_layout(showlegend=False)

# axis labels
fig.update_xaxes(title_text='Publication date')
fig.update_yaxes(title_text='Cost (USD, nominal)')

# title
fig.update_layout(title_text='Cost of cloud compute to train frontier ML systems')

# update size
fig.update_layout(
    autosize=False,
    width=800,
    height=600,
)

# font size
fig.update_layout(
    font=dict(
        size=14,
    )
)

# axis limits
# fig.update_xaxes(range=['2017-01-01', '2025-01-01'])
# fig.update_yaxes(range=[5, 8])

# margins
fig.update_layout(margin=dict(l=10, r=10, t=40, b=10))

save_plot(fig, 'results/', 'cost_scatter')

fig.show()

In [45]:
import plotly.graph_objects as go

fig = go.Figure()
for i, row in frontier_pcd_df.iterrows():
    system = row['System']
    cost = system_to_cost.get(system)
    if cost is None:
        continue
    publication_date = row['Publication date']

    fig.add_trace(go.Bar(
        x=[system],
        y=[cost],
        name=system,
        # nice blue color
        marker_color='#034752',
        # text=system,
        # textposition='auto',
    ))

# log y axis
fig.update_yaxes(type="log")
# no legend
fig.update_layout(showlegend=False)

# axis labels
fig.update_xaxes(title_text='System')
fig.update_yaxes(title_text='Cost (USD, nominal)')

# title
fig.update_layout(title_text='Cost of cloud compute to train frontier ML systems')

# update size
fig.update_layout(
    autosize=False,
    width=800,
    height=600,
)

# font size
fig.update_layout(
    font=dict(
        size=14,
    )
)

# axis limits
fig.update_yaxes(range=[0, 8])

# margins
fig.update_layout(margin=dict(l=10, r=10, t=40, b=10))

fig.show()


# Price imputation

In [160]:
from sklearn.impute import KNNImputer

In [161]:
price_df.columns

Index(['Price source', 'Price date', 'Hardware model',
       'Manufacturer (from Hardware model)', 'Vendor', 'Location',
       'Price per chip-hour (on-demand)', 'Price per chip-hour (1-year CUD)',
       'Price per chip-hour (3-year CUD)', 'Price (hardware purchase)'],
      dtype='object')

In [162]:
# drop unneeded columns from price_df
irrelevant_columns = ['Price source', 'Price (hardware purchase)']
price_df = price_df.drop(columns=irrelevant_columns)

In [163]:
price_df.columns

Index(['Price date', 'Hardware model', 'Manufacturer (from Hardware model)',
       'Vendor', 'Location', 'Price per chip-hour (on-demand)',
       'Price per chip-hour (1-year CUD)', 'Price per chip-hour (3-year CUD)'],
      dtype='object')

In [164]:
# instantiate the imputer
imputer = KNNImputer(n_neighbors=1)

In [165]:
# Convert price columns to float - drop the $ sign
for col in ['Price per chip-hour (1-year CUD)', 'Price per chip-hour (3-year CUD)', 'Price per chip-hour (on-demand)']:
    price_df[col] = price_df[col].str.replace('$', '').astype(float)


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.



In [166]:
# convert datetime to float
price_df['Price date'] = price_df['Price date'].dt.year + (price_df['Price date'].dt.month-1) / 12 + (price_df['Price date'].dt.day-1) / 365

# set the price date column as the index
# price_df = price_df.set_index('Price date')

# Identify categorical columns
categorical_cols = price_df.select_dtypes(include=['object', 'category']).columns.tolist()

# one-hot encode all categorical columns
one_hot_price_df = pd.get_dummies(price_df, columns=categorical_cols)

In [167]:
one_hot_price_df

Unnamed: 0,Price date,Price per chip-hour (on-demand),Price per chip-hour (1-year CUD),Price per chip-hour (3-year CUD),Hardware model_Google TPU v2,Hardware model_Google TPU v3,Hardware model_Google TPU v4,Hardware model_NVIDIA A100,Hardware model_NVIDIA A100 SXM4 40 GB,Hardware model_NVIDIA A100 SXM4 80 GB,...,Vendor_Google Cloud,Vendor_Microsoft Azure,Location_East US,Location_Europe,Location_Iowa (us-central1),Location_Netherlands (europe-west4),Location_Oregon,Location_US,Location_US East (Northern Virginia),Location_us-central2-b
0,2018.771918,1.13,,,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
1,2018.777397,2.0,,,0,1,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
2,2018.777397,2.2,,,0,1,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
3,2019.5,2.0,1.26,0.9,0,1,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
4,2019.573973,2.0,1.26,0.9,0,1,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
5,2019.665525,2.0,1.26,0.9,0,1,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
6,2019.930365,2.0,1.26,0.9,0,1,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
7,2020.919406,2.0,1.26,0.9,0,1,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
8,2021.183105,2.0,1.26,0.9,0,1,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
9,2021.769178,2.0,1.26,0.9,0,1,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0


In [168]:
categorical_cols

['Hardware model', 'Manufacturer (from Hardware model)', 'Vendor', 'Location']

In [169]:
# impute the missing values in Training hardware, hardware quantity, Training time (hours)
imputed = imputer.fit_transform(one_hot_price_df)

In [170]:
# convert the numpy array back to a dataframe
imputed_price_df = pd.DataFrame(imputed, columns=one_hot_price_df.columns)

In [171]:
imputed_price_df

Unnamed: 0,Price date,Price per chip-hour (on-demand),Price per chip-hour (1-year CUD),Price per chip-hour (3-year CUD),Hardware model_Google TPU v2,Hardware model_Google TPU v3,Hardware model_Google TPU v4,Hardware model_NVIDIA A100,Hardware model_NVIDIA A100 SXM4 40 GB,Hardware model_NVIDIA A100 SXM4 80 GB,...,Vendor_Google Cloud,Vendor_Microsoft Azure,Location_East US,Location_Europe,Location_Iowa (us-central1),Location_Netherlands (europe-west4),Location_Oregon,Location_US,Location_US East (Northern Virginia),Location_us-central2-b
0,2018.771918,1.13,1.26,0.9,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,2018.777397,2.0,1.26,0.9,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,2018.777397,2.2,1.26,0.9,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2019.5,2.0,1.26,0.9,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,2019.573973,2.0,1.26,0.9,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5,2019.665525,2.0,1.26,0.9,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6,2019.930365,2.0,1.26,0.9,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
7,2020.919406,2.0,1.26,0.9,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8,2021.183105,2.0,1.26,0.9,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9,2021.769178,2.0,1.26,0.9,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# convert Training hardware back to categorical
imputed_pcd_df['Training hardware'] = ''
for col in imputed_pcd_df.columns:
    if col.startswith('Training hardware_'):
        training_hardware = col.split('Training hardware_')[1]
        imputed_pcd_df['Training hardware'] = imputed_pcd_df['Training hardware'] + pd.Series([int(_) * training_hardware for _ in imputed_pcd_df[col]])

# replace all '' with np.nan
imputed_pcd_df['Training hardware'] = imputed_pcd_df['Training hardware'].replace('', np.nan)