# Check accuracy of kNN imputation

In [1]:
import numpy as np
import pandas as pd

from plotting import *
from pprint import pprint

from sklearn.impute import KNNImputer

In [2]:
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning, module='sklearn.neighbors')

## Load the known data

In [3]:
pcd_df = pd.read_csv('data/All ML Systems - full view.csv')
pcd_df.head()

Unnamed: 0,System,Domain,Task,Authors,Notability criteria,Notability criteria notes,Open-source,Link,Citations,Reference,...,Organization (from Organization),Base model,Finetune compute (FLOP),Finetune compute notes,Authors by country,Hardware quantity,Hardware utilization,Training cost trends,Training cloud compute vendor,Training data center
0,Cohere Command,Language,,,,,,https://cohere.com/models/command,,"World-class AI, at your command",...,Cohere,,,,,,,,,
1,Theseus,Other,Maze solving,Claude Shannon,Historical significance,,,https://www.technologyreview.com/2018/12/19/13...,0.0,Mighty Mouse,...,Bell Laboratories,,,,Theseus,,,,,
2,SNARC,Other,Maze solving,Marvin Minsky,Historical significance,,,https://en.wikipedia.org/wiki/Stochastic_neura...,33.0,A Neural-Analogue Calculator Based upon a Prob...,...,Harvard University,,,,SNARC,,,,,
3,Genetic algorithm,,,NA Barricelli,Historical significance,Possibly first computer simulation of a geneti...,,https://link.springer.com/article/10.1007/BF01...,266.0,Numerical testing of evolution theories,...,Institute for Advanced Study,,,,Genetic algorithm,,,,,
4,Sequence-based pattern recognition,Vision,Character recognition,O. G. Selfridge,Historical significance,,,https://dl.acm.org/doi/10.1145/1455292.1455310,290.0,Pattern recognition and modern computers,...,Massachusetts Institute of Technology (MIT),,,,Sequence-based pattern recognition,,,,,


In [4]:
# Publication date in float format
pcd_df.dropna(subset=['Publication date'], inplace=True)
pcd_df['Publication date'] = pd.to_datetime(pcd_df['Publication date'])
pcd_df['Publication date'] = pcd_df['Publication date'].dt.year + (pcd_df['Publication date'].dt.month-1)/12 + (pcd_df['Publication date'].dt.day-1)/365

In [5]:
# Manually copied from "Training cost trends" Airtable
frontier_systems = [
    "PaLM 2",
    "GPT-4",
    "Minerva (540B)",
    "Megatron-Turing NLG 530B",
    "GPT-3 175B (davinci)",
    "Meena",
    "AlphaStar",
    "AlphaGo Zero",
    "AlphaGo Master",
    "GNMT",
    "Claude 2",
    "PaLM (540B)",
    "ERNIE 3.0 Titan",
    "Gopher (280B)",
    "OpenAI Five",
    "T5-11B",
    "Megatron-BERT",
    "ResNeXt-101 32x48d",
    "AlphaZero",
    "Falcon 180B",
    "GPT-3.5 (text-davinci-003)",
    "Chinchilla",
    "Yuan 1.0",
    "Turing-NLG",
    "BigGAN-deep 512x512",
    "NASv3 (CIFAR-10)",
    "AlphaGo Lee",
    "AlphaGo Fan",
    "OPT-175B",
    "AlphaCode",
    "GLaM",
    "OpenAI Five Rerun",
    "T5-3B",
    "Megatron-LM (8.3B)",
    "FTW",
    "AmoebaNet-A (F=448)",
    "OpenAI TI7 DOTA 1v1",
    "JFT",
    "Llama 2-70B",
    "LLaMA-65B",
    "LaMDA",
    "ALIGN",
    "GShard (dense)",
    "RoBERTa Large",
    "IMPALA",
]

In [6]:
frontier_pcd_df = pcd_df[pcd_df['System'].isin(frontier_systems)]
frontier_pcd_df.head()

Unnamed: 0,System,Domain,Task,Authors,Notability criteria,Notability criteria notes,Open-source,Link,Citations,Reference,...,Organization (from Organization),Base model,Finetune compute (FLOP),Finetune compute notes,Authors by country,Hardware quantity,Hardware utilization,Training cost trends,Training cloud compute vendor,Training data center
265,AlphaGo Fan,Games,Go,"David Silver, Aja Huang, Chris J. Maddison, Ar...",SOTA improvement,,,https://www.nature.com/articles/nature24270.ep...,14389.0,Mastering the game of Go with deep neural netw...,...,Google DeepMind,,,,AlphaGo Fan,,,AlphaGo Fan,,
275,AlphaGo Lee,Games,Go,"David Silver, Aja Huang, Chris J. Maddison, Ar...",Highly cited,,,https://www.nature.com/articles/nature16961,14389.0,Mastering the game of Go with deep neural netw...,...,DeepMind,,,,AlphaGo Lee,,,AlphaGo Lee,,
306,GNMT,Language,Translation,"Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc ...",Highly cited,,,https://arxiv.org/abs/1609.08144,5948.0,Google's Neural Machine Translation System: Br...,...,Google,,,,GNMT,96.0,,GNMT,,
317,NASv3 (CIFAR-10),Vision,,"Barret Zoph, Quoc V. Le",Highly cited,,,https://arxiv.org/abs/1611.01578,4569.0,Neural Architecture Search with Reinforcement ...,...,Google Brain,,,,NASv3 (CIFAR-10),800.0,,NASv3 (CIFAR-10),,
337,AlphaGo Master,Games,Go,"D Silver, J Schrittwieser, K Simonyan, I Anton...",Highly cited,,,https://www.researchgate.net/publication/32047...,7831.0,Mastering the game of Go without human knowledge,...,DeepMind,,,,AlphaGo Master,,,AlphaGo Master,,


In [7]:
pcd_hardware_model_colname = 'Name of the hardware (from Training hardware)'

In [8]:
chip_hours = []
for i, row in frontier_pcd_df.iterrows():
    if pd.isna(row['Hardware quantity']) or pd.isna(row['Training time (hours)']):
        chip_hours.append(np.nan)
    else:
        chip_hours.append(row['Hardware quantity'] * row['Training time (hours)'])

frontier_pcd_df['Training time (chip hours)'] = chip_hours

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frontier_pcd_df['Training time (chip hours)'] = chip_hours


## Prepare for imputation

In [9]:
frontier_pcd_df = frontier_pcd_df.set_index('System')

In [10]:
# drop unneeded columns from frontier_pcd_df
irrelevant_columns = ['Notability criteria', 'Notability criteria notes', 'Link', 'Citations', 'Parameters notes',
                      'Training compute notes', 'Training dataset notes', 'Dataset size notes',
                      'Inference compute notes', 'Approach', 'Confidence', 'Last modified', 'Created By', 'Benchmark data',
                      'Exclude', 'Authors by country', 'Training cost trends', 'Abstract', 'Compute cost notes',
                      'Training time notes', 'Authors', 'Name of the hardware (from Training hardware)',
                      'Training time (chip hours)', 'Training compute cost (2020 USD)', 'Organization categorization',
                      'Training dataset', 'Inference compute (FLOP)', 'Compute sponsor categorization',
                      'Finetune compute (FLOP)', 'Finetune compute notes', ]
frontier_pcd_df = frontier_pcd_df.drop(columns=irrelevant_columns)

# fill column 'Training cloud compute vendor' using org_to_cloud_vendor dictionary
org_to_cloud_vendor = {
    'Google': 'Google Cloud',
    'DeepMind': 'Google Cloud',
    'Google DeepMind': 'Google Cloud',
    'Google Brain': 'Google Cloud',
    'Microsoft': 'Microsoft Azure',
    'OpenAI': 'Microsoft Azure',
}
frontier_pcd_df['Training cloud compute vendor'] = frontier_pcd_df['Organization (from Organization)'].map(org_to_cloud_vendor)
frontier_pcd_df['Training cloud compute vendor'] = frontier_pcd_df['Training cloud compute vendor'].fillna('Amazon Web Services')

In [11]:
# convert large number columns to logarithmic
frontier_pcd_df['log_params'] = np.log10(frontier_pcd_df['Parameters'])
frontier_pcd_df['log_compute'] = np.log10(frontier_pcd_df['Training compute (FLOP)'])
frontier_pcd_df['log_dataset'] = np.log10(frontier_pcd_df['Training dataset size (datapoints)'])
# drop raw columns
raw_columns = ['Parameters', 'Training compute (FLOP)', 'Training dataset size (datapoints)']
frontier_pcd_df.drop(columns=raw_columns, inplace=True)

In [12]:
frontier_pcd_df.columns

Index(['Domain', 'Task', 'Open-source', 'Reference', 'Publication date',
       'Organization', 'Epochs', 'Training time (hours)', 'Training hardware',
       'Country (from Organization)', 'Organization (from Organization)',
       'Base model', 'Hardware quantity', 'Hardware utilization',
       'Training cloud compute vendor', 'Training data center', 'log_params',
       'log_compute', 'log_dataset'],
      dtype='object')

In [13]:
print(
    "Known hardware models:", frontier_pcd_df['Training hardware'].notna().sum(), "\n",
    "Known hardware utilization:", frontier_pcd_df['Hardware utilization'].notna().sum(), "\n",
    "Known hardware quantity:", frontier_pcd_df['Hardware quantity'].notna().sum(), "\n",
    "Known training time:", frontier_pcd_df['Training time (hours)'].notna().sum(), "\n",
    "Total rows:", frontier_pcd_df.shape[0]
)

Known hardware models: 35 
 Known hardware utilization: 14 
 Known hardware quantity: 33 
 Known training time: 27 
 Total rows: 45


Next, for different values of k, perform imputation and calculate the MSE for imputed quantitative data and accuracy for categorical data.

## Run imputation many times and evaluate

In [14]:
known_hardware_models = frontier_pcd_df['Training hardware'].notna()

In [19]:
num_trials = 1000
k_values = [1, 2, 4, 9, 16, 25, 36, 45]
results = {k: [] for k in k_values}

In [16]:
def impute_training_hardware(dataframe, n=5):
    # use KNeighborsClassifier to impute the missing values in Training hardware
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.preprocessing import LabelEncoder

    # Separate the target and features
    target_col = 'Training hardware'
    features = dataframe.drop(target_col, axis=1)
    target = dataframe[target_col]

    # Encode the target column
    label_encoder = LabelEncoder()
    target_filled = target.fillna('Unknown')  # Temporarily fill missing values
    target_encoded = label_encoder.fit_transform(target_filled)

    # Train a KNeighborsClassifier
    knc = KNeighborsClassifier(n_neighbors=n)
    knc.fit(features, target_encoded)

    # Predict the missing values
    missing_values = features[target.isna()]
    predicted = knc.predict(missing_values)

    # Decode the predictions
    predicted_labels = label_encoder.inverse_transform(predicted)

    # Replace the missing values with the predictions
    dataframe.loc[target.isna(), target_col] = predicted_labels

    # replace all 'Unknown' with np.nan
    dataframe['Training hardware'] = dataframe['Training hardware'].replace('Unknown', np.nan)

In [20]:
for i in range(num_trials):
    for k in k_values:
        # select 5 random rows with known hardware model
        filtered_df = frontier_pcd_df[known_hardware_models]
        holdout_models = filtered_df.sample(n=5)
        val_df = frontier_pcd_df.copy()
        val_df.loc[holdout_models.index, 'Training hardware'] = np.nan

        # impute hardware
        imputer = KNNImputer(n_neighbors=k)
        # Identify categorical columns
        categorical_cols = val_df.select_dtypes(include=['object', 'category']).columns.tolist()
        # one-hot encode all categorical columns
        one_hot_pcd_df = pd.get_dummies(val_df, columns=categorical_cols)
        # impute the missing values in Training hardware, hardware quantity, Training time (hours)
        imputed = imputer.fit_transform(one_hot_pcd_df)
        # convert the numpy array back to a dataframe
        imputed_pcd_df = pd.DataFrame(imputed, columns=one_hot_pcd_df.columns)
        # convert Training hardware back to categorical
        imputed_pcd_df['Training hardware'] = ''
        for col in imputed_pcd_df.columns:
            if col.startswith('Training hardware_'):
                training_hardware = col.split('Training hardware_')[1]
                imputed_pcd_df['Training hardware'] = imputed_pcd_df['Training hardware'] + pd.Series([int(_) * training_hardware for _ in imputed_pcd_df[col]])
        # replace all '' with np.nan
        imputed_pcd_df['Training hardware'] = imputed_pcd_df['Training hardware'].replace('', np.nan)
        missing_values = imputed_pcd_df['Training hardware'].isna().sum()
        N = k
        while missing_values > 0:
            try:
                impute_training_hardware(imputed_pcd_df, n=N)
            except ValueError:
                most_common_hardware = imputed_pcd_df['Training hardware'].value_counts().index[0]
                imputed_pcd_df.loc[imputed_pcd_df['Training hardware'].isna(), 'Training hardware'] = most_common_hardware
            print(imputed_pcd_df['Training hardware'].isna().sum())
            if imputed_pcd_df['Training hardware'].isna().sum() == missing_values:
                N += 5
            else:
                missing_values = imputed_pcd_df['Training hardware'].isna().sum()
        # restore the System column
        imputed_pcd_df['System'] = one_hot_pcd_df.index
        # set the System column as the index
        imputed_pcd_df = imputed_pcd_df.set_index('System')
        # insert imputed values into val_df
        val_df['Training hardware'] = imputed_pcd_df['Training hardware']
        val_df['Hardware quantity'] = imputed_pcd_df['Hardware quantity']
        val_df['Hardware utilization'] = imputed_pcd_df['Hardware utilization']
        val_df['Training time (hours)'] = imputed_pcd_df['Training time (hours)']

        # calculate accuracy
        true_hardware = holdout_models['Training hardware']
        imputed_hardware = val_df.loc[holdout_models.index, 'Training hardware']
        accuracy = np.sum(true_hardware == imputed_hardware)/5
        results[k].append(accuracy)

15
8
5
5
5
5
2
0
6
6
6
5
3
0
9
8
8
6
6
6
0
12
12
10
8
6
5
1
0
14
13
13
11
2
0
15
15
15
15
15
0
15
15
0
15
0
15
10
10
10
10
0
6
6
4
4
3
1
0
8
7
7
6
6
2
0
14
14
12
10
2
0
14
14
14
14
14
14
14
0
15
15
15
15
15
0
15
15
0
15
0
15
11
9
9
6
6
0
4
4
4
4
4
0
9
8
8
5
5
5
0
12
8
5
4
4
0
15
13
13
13
13
13
13
0
15
15
15
15
15
0
15
15
0
15
0
15
11
11
11
11
7
6
6
0
6
6
6
5
3
0
6
6
6
5
1
0
13
10
6
4
4
0
14
13
13
12
12
4
0
15
15
15
15
15
0
15
15
0
15
0
15
11
11
11
11
11
5
0
6
6
5
4
4
4
4
0
9
9
8
8
7
7
2
0
12
8
3
2
0
15
15
15
15
15
15
0
15
15
15
15
15
0
15
15
0
15
0
15
9
7
5
5
5
2
0
7
6
6
6
5
3
0
12
12
6
6
6
0
11
6
3
3
0
15
15
15
15
15
15
0
15
15
15
15
15
0
15
15
0
15
0
15
10
10
10
10
9
5
5
0
6
6
4
4
4
1
0
8
7
7
6
6
5
5
0
13
12
9
9
9
9
9
9
9
0
14
14
13
13
13
13
13
13
0
13
6
0
15
15
0
15
0
15
11
7
6
6
6
5
5
0
9
8
8
7
6
6
6
3
0
9
7
7
6
6
6
2
0
9
3
0
13
12
11
9
9
8
2
0
15
15
15
15
15
0
15
15
0
15
0
15
9
7
7
7
7
6
1
0
6
6
4
4
4
0
10
10
8
6
6
6
1
0
11
8
5
4
4
0
14
14
14
14
14
14
14
0
15
15
15
15
15
0
15
15
0

In [24]:
results = {k: round(np.mean(v), 4) for k, v in results.items()}
print(results)

{1: 0.164, 2: 0.117, 4: 0.199, 9: 0.228, 16: 0.227, 25: 0.252, 36: 0.254, 45: 0.261}
