In [1]:
import tqdm
import json
import random
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import cm
import matplotlib.pyplot as plt

import warnings
# Suppress every warning for the remainder of the session.
# NOTE(review): this blanket filter also hides deprecation/FutureWarnings emitted
# by the cells below — consider narrowing it once the notebook is stable.
warnings.filterwarnings('ignore')

from zeo_amd.plotting import savefig

# NOTE(review): "jupyter" does not look like one of matplotlib's built-in styles,
# so this presumably relies on a custom style sheet being installed — confirm.
plt.style.use("jupyter")

## Loading the data

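We load the hyperparameter-search results of the balanced classifiers for both the AMD and the SOAP representations, keeping only the `XGBClassifier` runs, and then aggregate the repeated runs of each hyperparameter set into their mean and standard deviation.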

In [2]:
# Column names of interest in the hyperparameter-search results.
COLUMNS = [
    # identification and class counts
    "label",
    "n_pos",
    "n_neg",
    # validation metrics
    "val_accuracy",
    "val_precision",
    "val_recall",
    "val_F1-score",
    "val_roc_auc",
    "val_pr_auc",
    # test metrics
    "test_accuracy",
    "test_precision",
    "test_recall",
    "test_F1-score",
    "test_roc_auc",
    "test_pr_auc",
    # model identification
    "classifier",
    "params_str",
]

In [3]:
# Read the AMD search results, add a JSON-serialised copy of each `params`
# entry (usable as a group key), and keep only the XGBClassifier rows.
amd = (
    pd.read_json("../data/hparams_rnd_balanced.json")
    .assign(params_str=lambda frame: frame["params"].apply(json.dumps))
    .loc[lambda frame: frame["classifier"] == "XGBClassifier"]
)

In [4]:
# Aggregate the repeated runs of every (label, classifier, hyperparameter set)
# into the mean and the standard deviation of each metric.
group_keys = ["label", "classifier", "params_str"]
bookkeeping_cols = ["run", "n_pos", "n_neg", "seed", "params_index"]

# Aggregate numeric columns only. `amd` still carries the raw `params` column,
# and pandas >= 2.0 raises a TypeError on non-numeric columns instead of
# silently dropping them. Selecting the columns explicitly (rather than passing
# `numeric_only=True`) also works on pandas versions whose `GroupBy.std` does
# not accept that argument.
numeric_cols = [
    col
    for col in amd.select_dtypes(include="number").columns
    if col not in group_keys
]
amd_by_params = amd.groupby(group_keys)[numeric_cols]

# Per-run bookkeeping columns are meaningless once averaged, so drop them.
mean_amd = amd_by_params.mean().reset_index().drop(columns=bookkeeping_cols)
std_amd = amd_by_params.std().reset_index().drop(columns=bookkeeping_cols)

In [5]:
# Read the SOAP search results, add a JSON-serialised copy of each `params`
# entry (usable as a group key), and keep only the XGBClassifier rows.
soap = (
    pd.read_json("../data/hparams_rnd_soap_balanced.json")
    .assign(params_str=lambda frame: frame["params"].apply(json.dumps))
    .loc[lambda frame: frame["classifier"] == "XGBClassifier"]
)

In [6]:
# Aggregate the repeated runs of every (label, classifier, hyperparameter set)
# into the mean and the standard deviation of each metric.
group_keys = ["label", "classifier", "params_str"]
bookkeeping_cols = ["run", "n_pos", "n_neg", "seed", "params_index"]

# Aggregate numeric columns only. `soap` still carries the raw `params` column,
# and pandas >= 2.0 raises a TypeError on non-numeric columns instead of
# silently dropping them. Selecting the columns explicitly (rather than passing
# `numeric_only=True`) also works on pandas versions whose `GroupBy.std` does
# not accept that argument.
numeric_cols = [
    col
    for col in soap.select_dtypes(include="number").columns
    if col not in group_keys
]
soap_by_params = soap.groupby(group_keys)[numeric_cols]

# Per-run bookkeeping columns are meaningless once averaged, so drop them.
mean_soap = soap_by_params.mean().reset_index().drop(columns=bookkeeping_cols)
std_soap = soap_by_params.std().reset_index().drop(columns=bookkeeping_cols)

## Comparing the best classifiers achievable with SOAP and AMD

In [7]:
def create_table(mean, std):
    """Combine mean and standard-deviation frames into "mean ± std" strings.

    Parameters
    ----------
    mean, std : pd.DataFrame
        Numeric frames. They are combined with ``+``, so they are aligned on
        both index and columns.

    Returns
    -------
    pd.DataFrame
        Frame of strings such as ``"0.90 ± 0.04"``, with both values rounded
        to two decimals.
    """
    def _two_decimals(frame):
        # Column-wise Series.map instead of DataFrame.applymap: applymap is
        # deprecated since pandas 2.1, while Series.map exists in every version.
        return np.round(frame, 2).apply(lambda col: col.map("{:.2f}".format))

    return _two_decimals(mean) + " ± " + _two_decimals(std)

# Subset of COLUMNS whose names carry the "test_" prefix.
TEST_COLS = list(filter(lambda name: name.startswith("test_"), COLUMNS))

In [8]:
# One hyperparameter set written as a JSON string.
# NOTE(review): presumably meant to be matched against `params_str` — confirm.
params = (
    '{"colsample_bytree": 0.5, '
    '"learning_rate": 0.1, '
    '"max_depth": 4, '
    '"min_child_weight": 1, '
    '"n_estimators": 200, '
    '"subsample": 0.5}'
)

In [9]:
# Columns kept for the comparison: the label plus the two test metrics.
COLS = [
    "label",
    "test_roc_auc",
    "test_pr_auc",
]

In [10]:
def _select_best(mean, std, by="val_roc_auc"):
    """Pick, for every label, the hyperparameter set with the highest `by`.

    Parameters
    ----------
    mean, std : pd.DataFrame
        Aggregated results that share the same row index; both must contain
        "label", "classifier", "params_str" and the columns listed in the
        notebook-level `COLS`.
    by : str
        Column of `mean` that is maximised within each label.

    Returns
    -------
    (best_idx, best_mean, best_std)
        `best_idx` holds, per label, the row label of the maximum of every
        numeric column; `best_mean` / `best_std` hold the `COLS` values of the
        selected rows, indexed by label.
    """
    # idxmax must not see the string columns: pandas >= 2.0 raises a TypeError
    # on them instead of silently skipping them.
    metrics = mean.drop(columns=["classifier", "params_str"])
    best_idx = metrics.groupby("label").idxmax()

    # The same row labels index both frames, so mean and std stay paired.
    best_rows = best_idx[by]
    best_mean = mean.loc[best_rows, COLS].set_index("label")
    best_std = std.loc[best_rows, COLS].set_index("label")
    return best_idx, best_mean, best_std


best_soap_idx, best_soap_mean, best_soap_std = _select_best(mean_soap, std_soap)
best_amd_idx, best_amd_mean, best_amd_std = _select_best(mean_amd, std_amd)

In [11]:
# Build the "mean ± std" tables and tag every column with its source.
table_amd = create_table(best_amd_mean, best_amd_std).add_prefix("amd_")
table_soap = create_table(best_soap_mean, best_soap_std).add_prefix("soap_")

In [12]:
# Interleave the two sources so that each metric's AMD and SOAP columns sit
# next to each other.
ORDER_COLS = [
    "amd_test_roc_auc",
    "soap_test_roc_auc",
    "amd_test_pr_auc",
    "soap_test_pr_auc",
]
table = pd.concat([table_amd, table_soap], axis=1)[ORDER_COLS]

In [13]:
# Display the combined AMD/SOAP table (one row per label, "mean ± std" strings).
table

Unnamed: 0_level_0,amd_test_roc_auc,soap_test_roc_auc,amd_test_pr_auc,soap_test_pr_auc
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Al,0.90 ± 0.04,0.83 ± 0.02,0.87 ± 0.04,0.82 ± 0.04
B,0.75 ± 0.18,0.74 ± 0.20,0.79 ± 0.18,0.78 ± 0.18
Be,0.80 ± 0.23,0.80 ± 0.21,0.86 ± 0.15,0.83 ± 0.20
Ca,0.61 ± 0.18,0.68 ± 0.20,0.69 ± 0.12,0.76 ± 0.14
Co,0.49 ± 0.25,0.49 ± 0.20,0.55 ± 0.20,0.58 ± 0.19
F,0.77 ± 0.12,0.73 ± 0.12,0.81 ± 0.11,0.76 ± 0.10
Ga,0.59 ± 0.25,0.77 ± 0.16,0.58 ± 0.22,0.81 ± 0.12
Ge,0.73 ± 0.08,0.73 ± 0.13,0.72 ± 0.09,0.73 ± 0.13
K,0.71 ± 0.15,0.67 ± 0.09,0.74 ± 0.15,0.66 ± 0.09
Mg,0.76 ± 0.14,0.49 ± 0.30,0.76 ± 0.18,0.55 ± 0.24


In [14]:
# Emit the LaTeX source of the table.
latex_source = table.to_latex()
print(latex_source)

\begin{tabular}{lllll}
\toprule
{} & amd\_test\_roc\_auc & soap\_test\_roc\_auc & amd\_test\_pr\_auc & soap\_test\_pr\_auc \\
label &                  &                   &                 &                  \\
\midrule
Al    &      0.90 ± 0.04 &       0.83 ± 0.02 &     0.87 ± 0.04 &      0.82 ± 0.04 \\
B     &      0.75 ± 0.18 &       0.74 ± 0.20 &     0.79 ± 0.18 &      0.78 ± 0.18 \\
Be    &      0.80 ± 0.23 &       0.80 ± 0.21 &     0.86 ± 0.15 &      0.83 ± 0.20 \\
Ca    &      0.61 ± 0.18 &       0.68 ± 0.20 &     0.69 ± 0.12 &      0.76 ± 0.14 \\
Co    &      0.49 ± 0.25 &       0.49 ± 0.20 &     0.55 ± 0.20 &      0.58 ± 0.19 \\
F     &      0.77 ± 0.12 &       0.73 ± 0.12 &     0.81 ± 0.11 &      0.76 ± 0.10 \\
Ga    &      0.59 ± 0.25 &       0.77 ± 0.16 &     0.58 ± 0.22 &      0.81 ± 0.12 \\
Ge    &      0.73 ± 0.08 &       0.73 ± 0.13 &     0.72 ± 0.09 &      0.73 ± 0.13 \\
K     &      0.71 ± 0.15 &       0.67 ± 0.09 &     0.74 ± 0.15 &      0.66 ± 0.09 \\
Mg    &      0.