In [8]:
import pandas as pd
import wandb
from tqdm.notebook import tqdm
import pickle
from os.path import exists
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import math
from matplotlib.ticker import MaxNLocator

from data.get_uci import all_datasets
from analysis.util import fetch, init_uci_dict, get_uci_info

uci_info = get_uci_info()

In [7]:
filters = {
    "group": "benchmark6"
}
raw_orig = fetch("soft-gp-2", filters)

100%|██████████| 231/231 [01:46<00:00,  2.18it/s]


In [9]:
runs = {}
uci_dict2 = {}
for exp in raw_orig:
    model = exp.config["model.name"]
    dataset = exp.config["dataset.name"]
    num_inducing = exp.config["model.num_inducing"]
    dtype = exp.config["model.dtype"]
    seed = exp.config["training.seed"]
    train_frac = float(exp.config["dataset.train_frac"])
    if model == "svi-gp" and not exp.config["model.learn_noise"]:
        continue 
    if model == "sv-gp" and not exp.config["model.learn_noise"]:
        continue 
    uci_dict2[(dataset, seed, num_inducing, train_frac, model)] = exp.history
    runs[(dataset, seed, num_inducing, train_frac, model)] = exp.run.id

In [21]:
seeds = [6535, 8830, 92357]
num_inducings = [512, 1024]
fracs = [0.9]
# fracs = [0.44, 0.89]


UCI_INFO = {
    "N": [int(np.floor(N * 0.9)) for _, N, _, _ in uci_info],
    "D": [D for _, _, D, _ in uci_info],
    # "exact-rep": [e for _, _, _, e in uci_info],
}
KZZ = {}
all_bins = {}

models = ["soft-gp"]

# models = ["sv-gp"]
for seed in seeds:
    for model in models:
        for num_inducing in num_inducings:
            for frac in fracs:
                xs = []
                ts = []
                K_zzs = []
                bins1 = []
                bins2 = []
                bins3 = []
                bins4 = []
                bins5 = []
                bins6 = []
                for dataset, _, _, _ in uci_info:
                    try:
                        xs += [float(uci_dict2[(dataset, seed, num_inducing, frac, model)]["test_rmse"][49])]
                        ts += [float(np.array(uci_dict2[(dataset, seed, num_inducing, frac, model)]["epoch_time"][49]).mean())]
                        K_zzs += [uci_dict2[(dataset, seed, num_inducing, frac, model)]["K_zz"][i] for i in range(5)]
                        bins1 += [uci_dict2[(dataset, seed, num_inducing, frac, model)]["K_zz_bin_0.0"][49]]
                        bins2 += [uci_dict2[(dataset, seed, num_inducing, frac, model)]["K_zz_bin_1e-20"][49]]
                        bins3 += [uci_dict2[(dataset, seed, num_inducing, frac, model)]["K_zz_bin_1e-10"][49]]
                        bins4 += [uci_dict2[(dataset, seed, num_inducing, frac, model)]["K_zz_bin_1e-05"][49]]
                        bins5 += [uci_dict2[(dataset, seed, num_inducing, frac, model)]["K_zz_bin_0.01"][49]]
                        bins6 += [uci_dict2[(dataset, seed, num_inducing, frac, model)]["K_zz_bin_0.5"][49]]
                    except Exception as e:
                        xs += [np.nan]
                        ts += [np.nan]
                        bins1 += [np.nan]
                        bins2 += [np.nan]
                        bins3 += [np.nan]
                        bins4 += [np.nan]
                        bins5 += [np.nan]
                        bins6 += [np.nan]
                        print("Exception", e, model, dataset)

                UCI_INFO[f"{model}-{num_inducing}-{frac}-{seed}"] = xs
                # UCI_INFO[f"time-{model}-{num_inducing}-{frac}-{seed}"] = ts
                all_bins[f"0.0-{model}-{num_inducing}-{frac}-{seed}"] = bins1
                all_bins[f"1e-20-{model}-{num_inducing}-{frac}-{seed}"] = bins2
                all_bins[f"1e-10-{model}-{num_inducing}-{frac}-{seed}"] = bins3
                all_bins[f"1e-05-{model}-{num_inducing}-{frac}-{seed}"] = bins4
                all_bins[f"0.01-{model}-{num_inducing}-{frac}-{seed}"] = bins5
                all_bins[f"0.5-{model}-{num_inducing}-{frac}-{seed}"] = bins6
                KZZ[f"kzz-{model}-{num_inducing}-{frac}-{seed}"] = K_zzs
df = pd.DataFrame(data=UCI_INFO)
df.index = [name.capitalize().replace("_", "-") for name, _, _, _ in uci_info]
df

Unnamed: 0,N,D,soft-gp-512-0.9-6535,soft-gp-1024-0.9-6535,soft-gp-512-0.9-8830,soft-gp-1024-0.9-8830,soft-gp-512-0.9-92357,soft-gp-1024-0.9-92357
Pol,13500,26,0.18909,0.188739,0.201634,0.200055,0.194141,0.198044
Elevators,14939,18,0.391094,0.389692,0.397342,0.400878,0.377828,0.381116
Bike,15641,17,0.207342,0.21364,0.20769,0.216379,0.197691,0.207423
Kin40k,36000,8,0.242823,0.235801,0.22854,0.221071,0.240351,0.234103
Protein,41157,9,0.657614,0.656349,0.637196,0.640186,0.652701,0.651852
Keggdirected,43944,20,0.078741,0.079589,0.077793,0.079131,0.0868,0.086169
Slice,48150,385,0.05091,0.04477,0.045886,0.041128,0.050083,0.048265
Keggundirected,57247,27,0.117767,0.116152,0.111164,0.111012,0.118161,0.117762
3droad,391386,3,0.605115,0.607919,0.600449,0.609718,0.604705,0.60672
Song,270000,90,0.798819,0.809756,0.79033,0.803228,0.789414,0.802923


In [22]:
filters = {
    "group": "noise"
}
raw = fetch("soft-gp-2", filters)

100%|██████████| 46/46 [00:20<00:00,  2.30it/s]


In [23]:
runs = {}
uci_dict = {}
for exp in raw:
    model = exp.config["model.name"]
    dataset = exp.config["dataset.name"]
    num_inducing = exp.config["model.num_inducing"]
    dtype = exp.config["model.dtype"]
    seed = exp.config["training.seed"]
    learn_noise = exp.config["model.learn_noise"]
    train_frac = float(exp.config["dataset.train_frac"])
    uci_dict[(dataset, seed, num_inducing, train_frac, model, learn_noise)] = exp.history
    runs[(dataset, seed, num_inducing, train_frac, model, learn_noise)] = exp.run.id

print(uci_dict.keys())

dict_keys([('buzz', 92357, 512, 0.9, 'soft-gp', True), ('buzz', 8830, 512, 0.9, 'soft-gp', True), ('song', 92357, 512, 0.9, 'soft-gp', True), ('song', 8830, 512, 0.9, 'soft-gp', True), ('3droad', 92357, 512, 0.9, 'soft-gp', True), ('3droad', 8830, 512, 0.9, 'soft-gp', True), ('keggundirected', 92357, 512, 0.9, 'soft-gp', True), ('keggundirected', 8830, 512, 0.9, 'soft-gp', True), ('slice', 92357, 512, 0.9, 'soft-gp', True), ('slice', 8830, 512, 0.9, 'soft-gp', True), ('keggdirected', 92357, 512, 0.9, 'soft-gp', True), ('keggdirected', 8830, 512, 0.9, 'soft-gp', True), ('protein', 92357, 512, 0.9, 'soft-gp', True), ('protein', 8830, 512, 0.9, 'soft-gp', True), ('kin40k', 92357, 512, 0.9, 'soft-gp', True), ('kin40k', 8830, 512, 0.9, 'soft-gp', True), ('bike', 92357, 512, 0.9, 'soft-gp', True), ('bike', 8830, 512, 0.9, 'soft-gp', True), ('elevators', 92357, 512, 0.9, 'soft-gp', True), ('elevators', 8830, 512, 0.9, 'soft-gp', True), ('pol', 92357, 512, 0.9, 'soft-gp', True), ('pol', 8830, 

In [26]:
# UCI_INFO = {
#     "N": [int(np.floor(N * 0.9)) for _, N, _, _ in uci_info],
#     "D": [D for _, _, D, _ in uci_info],
#     # "exact-rep": [e for _, _, _, e in uci_info],
# }
# KZZ = {}
# all_bins = {}

models = ["soft-gp"]
seeds = [6535, 8830, 92357]

# models = ["sv-gp"]
for ln in [True]:
    model = "soft-gp"
    for seed in seeds:
        frac = 0.9
        num_inducing = 512
        xs = []
        ts = []
        K_zzs = []
        bins1 = []
        bins2 = []
        bins3 = []
        bins4 = []
        bins5 = []
        bins6 = []
        for dataset, _, _, _ in uci_info:
            try:
                xs += [float(uci_dict[(dataset, seed, num_inducing, frac, model, ln)]["test_rmse"][49])]
                ts += [float(np.array(uci_dict[(dataset, seed, num_inducing, frac, model, ln)]["epoch_time"][49]).mean())]
                K_zzs += [uci_dict[(dataset, seed, num_inducing, frac, model, ln)]["K_zz"][i] for i in range(5)]
                bins1 += [uci_dict[(dataset, seed, num_inducing, frac, model, ln)]["K_zz_bin_0.0"][49]]
                bins2 += [uci_dict[(dataset, seed, num_inducing, frac, model, ln)]["K_zz_bin_1e-20"][49]]
                bins3 += [uci_dict[(dataset, seed, num_inducing, frac, model, ln)]["K_zz_bin_1e-10"][49]]
                bins4 += [uci_dict[(dataset, seed, num_inducing, frac, model, ln)]["K_zz_bin_1e-05"][49]]
                bins5 += [uci_dict[(dataset, seed, num_inducing, frac, model, ln)]["K_zz_bin_0.01"][49]]
                bins6 += [uci_dict[(dataset, seed, num_inducing, frac, model, ln)]["K_zz_bin_0.5"][49]]
            except Exception as e:
                xs += [np.nan]
                ts += [np.nan]
                bins1 += [np.nan]
                bins2 += [np.nan]
                bins3 += [np.nan]
                bins4 += [np.nan]
                bins5 += [np.nan]
                bins6 += [np.nan]
                print("Exception", e, model, dataset)

        UCI_INFO[f"{model}-{num_inducing}-{frac}-{ln}-{seed}"] = xs
        # UCI_INFO[f"time-{model}-{num_inducing}-{frac}-{seed}-{ln}"] = ts
        all_bins[f"0.0-{model}-{num_inducing}-{frac}-{ln}-{seed}"] = bins1
        all_bins[f"1e-20-{model}-{num_inducing}-{frac}-{ln}-{seed}"] = bins2
        all_bins[f"1e-10-{model}-{num_inducing}-{frac}-{ln}-{seed}"] = bins3
        all_bins[f"1e-05-{model}-{num_inducing}-{frac}-{ln}-{seed}"] = bins4
        all_bins[f"0.01-{model}-{num_inducing}-{frac}-{ln}-{seed}"] = bins5
        all_bins[f"0.5-{model}-{num_inducing}-{frac}-{ln}-{seed}"] = bins6
        KZZ[f"kzz-{model}-{num_inducing}-{ln}-{frac}-{seed}"] = K_zzs
df = pd.DataFrame(data=UCI_INFO)
df.index = [name.capitalize().replace("_", "-") for name, _, _, _ in uci_info]
df

Exception ('houseelectric', 8830, 512, 0.9, 'soft-gp', True) soft-gp houseelectric
Exception 49 soft-gp buzz
Exception ('houseelectric', 92357, 512, 0.9, 'soft-gp', True) soft-gp houseelectric


Unnamed: 0,N,D,soft-gp-512-0.9-6535,soft-gp-1024-0.9-6535,soft-gp-512-0.9-8830,soft-gp-1024-0.9-8830,soft-gp-512-0.9-92357,soft-gp-1024-0.9-92357,soft-gp-512-0.9-6535-True,soft-gp-512-0.9-8830-True,soft-gp-512-0.9-92357-True,soft-gp-512-0.9-True-6535,soft-gp-512-0.9-True-8830,soft-gp-512-0.9-True-92357
Pol,13500,26,0.18909,0.188739,0.201634,0.200055,0.194141,0.198044,0.228467,0.237021,0.240419,0.228467,0.237021,0.240419
Elevators,14939,18,0.391094,0.389692,0.397342,0.400878,0.377828,0.381116,0.389617,0.398407,0.389898,0.389617,0.398407,0.389898
Bike,15641,17,0.207342,0.21364,0.20769,0.216379,0.197691,0.207423,0.244963,0.247469,0.243926,0.244963,0.247469,0.243926
Kin40k,36000,8,0.242823,0.235801,0.22854,0.221071,0.240351,0.234103,0.389934,0.381906,0.384263,0.389934,0.381906,0.384263
Protein,41157,9,0.657614,0.656349,0.637196,0.640186,0.652701,0.651852,0.726131,0.72138,0.728298,0.726131,0.72138,0.728298
Keggdirected,43944,20,0.078741,0.079589,0.077793,0.079131,0.0868,0.086169,0.081146,0.079418,0.088499,0.081146,0.079418,0.088499
Slice,48150,385,0.05091,0.04477,0.045886,0.041128,0.050083,0.048265,0.052266,0.047804,0.053476,0.052266,0.047804,0.053476
Keggundirected,57247,27,0.117767,0.116152,0.111164,0.111012,0.118161,0.117762,0.120411,0.112902,0.120932,0.120411,0.112902,0.120932
3droad,391386,3,0.605115,0.607919,0.600449,0.609718,0.604705,0.60672,0.727352,0.729022,0.727345,0.727352,0.729022,0.727345
Song,270000,90,0.798819,0.809756,0.79033,0.803228,0.789414,0.802923,0.79705,0.78819,0.787715,0.79705,0.78819,0.787715


In [36]:
def pm_var(df, model):
    m = df[[f'{model}-6535', f'{model}-8830', f'{model}-92357']].mean(axis=1).round(3).astype(str)
    v = df[[f'{model}-6535', f'{model}-8830', f'{model}-92357']].std(axis=1).round(3).astype(str).apply(lambda x: f" $\pm$ {x}")
    return (m + v).apply(lambda x: x.replace("nan $\pm$ nan", "-"))
df_rmse = pd.DataFrame()
df_rmse[['N', 'D']] = df[['N', 'D']]
for model in models:
    for num_inducing in [512]:
        for frac in fracs:
            df_rmse[f'{model}-{num_inducing}-{frac}'] = pm_var(df, f'{model}-{num_inducing}-{frac}')
            df_rmse[f'{model}-{num_inducing}-{frac}-{True}'] = pm_var(df, f'{model}-{num_inducing}-{frac}-{True}')

df_rmse = df_rmse.sort_values(by=['D'], ascending=[True])
df_rmse = df_rmse.drop(columns=['D', 'N'])
print("RMSE")
df_rmse

RMSE


Unnamed: 0,soft-gp-512-0.9,soft-gp-512-0.9-True
3droad,0.603 $\pm$ 0.003,0.728 $\pm$ 0.001
Kin40k,0.237 $\pm$ 0.008,0.385 $\pm$ 0.004
Protein,0.649 $\pm$ 0.011,0.725 $\pm$ 0.004
Houseelectric,0.064 $\pm$ 0.001,0.071 $\pm$ nan
Bike,0.204 $\pm$ 0.006,0.245 $\pm$ 0.002
Elevators,0.389 $\pm$ 0.01,0.393 $\pm$ 0.005
Keggdirected,0.081 $\pm$ 0.005,0.083 $\pm$ 0.005
Pol,0.195 $\pm$ 0.006,0.235 $\pm$ 0.006
Keggundirected,0.116 $\pm$ 0.004,0.118 $\pm$ 0.004
Buzz,0.253 $\pm$ 0.002,0.26 $\pm$ 0.001


In [37]:
latex_table = df_rmse.to_latex(
    index=True,
    escape=False,
    float_format="{:0.3f}".format,
)
print(latex_table)

\begin{tabular}{lll}
\toprule
 & soft-gp-512-0.9 & soft-gp-512-0.9-True \\
\midrule
3droad & 0.603 $\pm$ 0.003 & 0.728 $\pm$ 0.001 \\
Kin40k & 0.237 $\pm$ 0.008 & 0.385 $\pm$ 0.004 \\
Protein & 0.649 $\pm$ 0.011 & 0.725 $\pm$ 0.004 \\
Houseelectric & 0.064 $\pm$ 0.001 & 0.071 $\pm$ nan \\
Bike & 0.204 $\pm$ 0.006 & 0.245 $\pm$ 0.002 \\
Elevators & 0.389 $\pm$ 0.01 & 0.393 $\pm$ 0.005 \\
Keggdirected & 0.081 $\pm$ 0.005 & 0.083 $\pm$ 0.005 \\
Pol & 0.195 $\pm$ 0.006 & 0.235 $\pm$ 0.006 \\
Keggundirected & 0.116 $\pm$ 0.004 & 0.118 $\pm$ 0.004 \\
Buzz & 0.253 $\pm$ 0.002 & 0.26 $\pm$ 0.001 \\
Song & 0.793 $\pm$ 0.005 & 0.791 $\pm$ 0.005 \\
Slice & 0.049 $\pm$ 0.003 & 0.051 $\pm$ 0.003 \\
\bottomrule
\end{tabular}

