In [5]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xarray as xr

# Root folder that holds one sub-folder per method setting.
main_experiments_path = "../experiments"

# Grids used to enumerate the experiments.
data_dims = list(range(1, 4))
repetition_ids = list(range(8))
learning_rates = ["0.01", "0.001", "0.0001"]

# Name fragments of the method families. The first numeric field of the
# ffn/uffn names ("0.1", "1", "10") is a second hyper-parameter whose meaning
# is not defined in this notebook.
_rbf_variants = ["full", "diag", "scal", "univ"]
_ffn_families = ["ffn", "uffn"]
_ffn_first_params = ["0.1", "1", "10"]

# One entry per trained configuration: the three methods without a learning
# rate first, then every family expanded over its hyper-parameter grid.
basic_method_names = (
    ["svr", "pls", "gradboost"]
    + [f"mlp_{lr}" for lr in learning_rates]
    + [f"rbf_{variant}_{lr}" for variant in _rbf_variants for lr in learning_rates]
    + [
        f"{family}_{first_param}_{lr}"
        for family in _ffn_families
        for first_param in _ffn_first_params
        for lr in learning_rates
    ]
)

# One entry per method family, without any hyper-parameter suffix.
basic_condensed_method_names = (
    ["svr", "pls", "gradboost", "mlp"]
    + [f"rbf_{variant}" for variant in _rbf_variants]
    + _ffn_families
)

# Full names carry a "<data dimension>_" prefix; all entries of dimension 1
# come first, then dimension 2, then dimension 3.
method_names = [f"{dim}_{method}" for dim in data_dims for method in basic_method_names]
condensed_method_names = [
    f"{dim}_{method}" for dim in data_dims for method in basic_condensed_method_names
]

# Test loss for every (data dimension, method, repetition, learning rate)
# combination. Cells start as NaN so that combinations without a stored result
# stay missing; no sentinel value has to be filtered out afterwards.
experiments_data = xr.DataArray(
    np.full(
        (len(data_dims), len(basic_condensed_method_names), len(repetition_ids), len(learning_rates)),
        np.nan,
    ),
    coords=[data_dims, basic_condensed_method_names, repetition_ids, learning_rates],
    dims=['data_dims', 'method', 'repetition', 'lr'],
)

# Entries of the experiments directory, visited in lexicographic name order.
method_settings = sorted(os.listdir(main_experiments_path))

for method_folder in method_settings:
    method_path = os.path.join(main_experiments_path, method_folder)

    # Skip hidden entries and stray files: only directories can hold repetitions,
    # and os.listdir would raise on anything else.
    if method_folder.startswith(".") or not os.path.isdir(method_path):
        continue

    for repetition_folder in os.listdir(method_path):
        if repetition_folder.startswith("."):
            continue

        data_path = os.path.join(method_path, repetition_folder, "data")

        try:
            # train_loss is not used below; it is still loaded so that a run with
            # any unreadable loss file is skipped exactly as before.
            np.load(os.path.join(data_path, "train_loss.npy"))
            val_loss = np.load(os.path.join(data_path, "val_loss.npy"))
            test_loss = np.load(os.path.join(data_path, "test_loss.npy"))
        except Exception as e:
            # Best effort: report the unreadable run and carry on with the others.
            print("Error reading files for dataset: ", method_folder, repetition_folder, e)
            continue

        # Model selection: report the test loss at the position where the
        # validation loss is lowest.
        min_val_loss_index = np.argmin(val_loss)
        min_test_loss = test_loss[min_val_loss_index]

        # Folder names are parsed as "<prefix>_<id>", where <id> is a 1-based
        # index into method_names.
        method_folder_id = int(method_folder.split("_")[1])
        method_parts = method_names[method_folder_id - 1].split("_")

        data_dim = int(method_parts[0])
        method_name = method_parts[1]

        # rbf names carry their variant in the next field ("rbf_full", ...).
        if method_name == "rbf":
            method_name = method_name + "_" + method_parts[2]

        # Repetition folders are parsed as "<prefix>_<repetition id>".
        repetition_id = int(repetition_folder.split("_")[1])

        if len(method_parts) == 2:
            # No learning rate in the name: the same value fills every lr slot.
            experiments_data.loc[data_dim, method_name, repetition_id, :] = min_test_loss
        else:
            # NOTE(review): for ffn/uffn the field between the family name and the
            # learning rate is ignored here, so runs that differ only in that field
            # write to the same cell and the one processed last wins. Confirm
            # whether that is intended or whether the best setting should be kept.
            lr = method_parts[-1]
            experiments_data.loc[data_dim, method_name, repetition_id, lr] = min_test_loss

print(experiments_data.loc[1,"svr",:,:])


experiments_data



<xarray.DataArray (repetition: 8, lr: 3)> Size: 192B
array([[ 2310.53222444,  2310.53222444,  2310.53222444],
       [10907.80513673, 10907.80513673, 10907.80513673],
       [ 5606.95879561,  5606.95879561,  5606.95879561],
       [22360.14822796, 22360.14822796, 22360.14822796],
       [21951.72935318, 21951.72935318, 21951.72935318],
       [25112.42429179, 25112.42429179, 25112.42429179],
       [ 7671.04355497,  7671.04355497,  7671.04355497],
       [ 8113.56069236,  8113.56069236,  8113.56069236]])
Coordinates:
    data_dims   int64 8B 1
    method      <U9 36B 'svr'
  * repetition  (repetition) int64 64B 0 1 2 3 4 5 6 7
  * lr          (lr) <U6 72B '0.01' '0.001' '0.0001'


In [6]:
# Pooled summary: statistics are taken jointly over all data dimensions and
# repetitions, leaving one mean and one std per (method, learning rate).
repetition_dim_mean_data = experiments_data.mean(dim=['data_dims', 'repetition'])

# Reference scale: the lowest pooled mean of the MLP over its learning rates.
# Dividing by it puts the best MLP setting at exactly 1.
min_mlp_mean = repetition_dim_mean_data.sel(method="mlp").min()
repetition_dim_mean_data = repetition_dim_mean_data / min_mlp_mean

# The spread is computed on the normalised values so that it shares the
# scale of the normalised means.
# NOTE(review): a single scalar normalises all data dimensions here, so
# differences in loss scale between dimensions also enter this std - confirm
# that this is the intended statistic.
_experiments_data = experiments_data / min_mlp_mean
repetition_dim_std_data = _experiments_data.std(dim=['data_dims', 'repetition'])

# Stack both statistics along a new leading "summary" dimension.
repetition_dim_summary_data = xr.concat(
    [repetition_dim_mean_data, repetition_dim_std_data],
    dim=pd.Index(["mean", "std"], name='summary'),
)

# For each method, find the learning rate with the lowest mean. The scalar
# "summary" coordinate is dropped so the indexer can be applied to both the
# mean and the std entries.
means = repetition_dim_summary_data.sel(summary="mean")
min_mean_index = means.argmin(dim="lr").drop_vars('summary').squeeze()

# Keep the mean and the matching std at that learning rate only.
collapsed_repetition_dim_summary_data = (
    repetition_dim_summary_data
    .isel(lr=min_mean_index, missing_dims="ignore")
    .drop_vars('lr')
)

collapsed_repetition_dim_summary_data

In [7]:
# Per-data-dimension summary: average over repetitions only, leaving one mean
# and one std per (data dimension, method, learning rate).
repetition_mean_data = experiments_data.mean(dim='repetition')

# Reference scale per data dimension: the lowest MLP mean over its learning
# rates. Dividing by it puts the best MLP setting of every dimension at 1.
min_mlp_mean = repetition_mean_data.sel(method="mlp").min(dim='lr')
print("min mlp ",min_mlp_mean.shape, "min mlp", min_mlp_mean)

repetition_mean_data = repetition_mean_data / min_mlp_mean

# The spread is computed on the normalised values so that it shares the
# scale of the normalised means.
_experiments_data = experiments_data / min_mlp_mean
repetition_std_data = _experiments_data.std(dim='repetition')

# Stack both statistics along a new leading "summary" dimension.
repetition_summary_data = xr.concat(
    [repetition_mean_data, repetition_std_data],
    dim=pd.Index(["mean", "std"], name='summary'),
)

# For each (data dimension, method) pair, find the learning rate with the
# lowest mean.
means = repetition_summary_data.sel(summary="mean")
min_mean_index = means.argmin(dim="lr")
print("min mean index: ",min_mean_index)

# Drop the scalar "summary" coordinate so the indexer can be applied to both
# the mean and the std entries.
min_mean_index = min_mean_index.drop_vars('summary').squeeze()

# Keep the mean and the matching std at that learning rate only.
collapsed_repetition_summary_data = (
    repetition_summary_data
    .isel(lr=min_mean_index, missing_dims="ignore")
    .drop_vars('lr')
)
collapsed_repetition_summary_data

min mlp  (3,) min mlp <xarray.DataArray (data_dims: 3)> Size: 24B
array([ 9206.10632324,  5548.06118822, 59633.24827566])
Coordinates:
  * data_dims  (data_dims) int64 24B 1 2 3
    method     <U9 36B 'mlp'
min mean index:  <xarray.DataArray (data_dims: 3, method: 10)> Size: 240B
array([[0, 0, 0, 1, 0, 0, 0, 0, 1, 1],
       [0, 0, 0, 0, 0, 0, 1, 0, 1, 1],
       [0, 0, 0, 1, 0, 0, 0, 0, 2, 1]])
Coordinates:
  * data_dims  (data_dims) int64 24B 1 2 3
  * method     (method) <U9 360B 'svr' 'pls' 'gradboost' ... 'ffn' 'uffn'
    summary    <U4 16B 'mean'


In [8]:
# Display name for every method that should appear in the final table.
# Methods without an entry here (e.g. "pls") are left out of the table.
method_renaming = {
    "svr": "SVR",
    "gradboost": "GradBoost",
    "mlp": "MLP",
    "rbf_full": "RBF Full",
    "rbf_diag": "RBF Diag",
    "rbf_scal": "RBF Scal",
    "rbf_univ": "U-RBF",
    "ffn": "FFM",
    "uffn": "U-FFM",
}


def _rename_known_methods(summary, renaming):
    """Keep only the methods listed in `renaming` and label them with display names.

    The `method` coordinate may hold either the raw keys or the display names.
    Accepting both makes the step idempotent: this cell rebinds the variables it
    reads, and re-running it used to select nothing because the raw keys no
    longer matched the already-renamed coordinate.

    Parameters:
        summary: DataArray with a `method` dimension.
        renaming: dict mapping raw method key -> display name.

    Returns:
        A new DataArray restricted to the known methods, ordered like `renaming`.
    """
    present = set(summary.method.values.tolist())
    selected = []
    for raw_name, display_name in renaming.items():
        if raw_name in present:
            selected.append(raw_name)
        elif display_name in present:
            selected.append(display_name)

    renamed = summary.sel(method=selected)
    renamed.coords['method'] = [renaming.get(name, name) for name in selected]
    return renamed


def _to_wide_frame(statistic_data, row_suffix, index=None):
    """Pivot one statistic into a table with one column per method.

    Parameters:
        statistic_data: DataArray holding a single statistic (mean or std).
        row_suffix: text appended to every row label (" m" or " s").
        index: column used as row index; with None, pandas labels the single
            resulting row with the name of the values column ("value").

    Returns:
        DataFrame with relabelled rows and one column per method.
    """
    long_df = statistic_data.to_dataframe(name='value').reset_index()
    # Every (row, method) group holds one value; the aggregation just picks it.
    wide_df = long_df.pivot_table(index=index, columns='method', values='value', aggfunc=lambda x: list(x)[0])
    wide_df.index = [str(label) + row_suffix for label in wide_df.index]
    return wide_df


### per data dimension: rename the method coords and only keep the methods that have a mapping
collapsed_repetition_summary_data = _rename_known_methods(collapsed_repetition_summary_data, method_renaming)

mean_summary_data = collapsed_repetition_summary_data.sel(summary="mean")
std_summary_data = collapsed_repetition_summary_data.sel(summary="std")

mean_df = _to_wide_frame(mean_summary_data, " m", index='data_dims')
std_df = _to_wide_frame(std_summary_data, " s", index='data_dims')

### merge the mean and std dataframes into one; sorting the "<dim> m" / "<dim> s"
### labels alternates the mean and std rows of each data dimension
final_df = pd.concat([mean_df, std_df], axis=0)
final_df = final_df.sort_index()


### pooled over all data dimensions: same treatment, a single row per statistic
collapsed_repetition_dim_summary_data = _rename_known_methods(collapsed_repetition_dim_summary_data, method_renaming)

mean_datasets_summary_data = _to_wide_frame(collapsed_repetition_dim_summary_data.sel(summary="mean"), " m")
std_datasets_summary_data = _to_wide_frame(collapsed_repetition_dim_summary_data.sel(summary="std"), " s")

final_datasets_summary_df = pd.concat([mean_datasets_summary_data, std_datasets_summary_data], axis=0)
final_datasets_summary_df = final_datasets_summary_df.sort_index()

### append the pooled rows below the per-dimension rows
final_df = pd.concat([final_df, final_datasets_summary_df], axis=0)


final_df


method,FFM,GradBoost,MLP,RBF Diag,RBF Full,RBF Scal,SVR,U-FFM,U-RBF
1 m,1.00283,1.548095,1.0,0.854234,0.854234,0.853799,1.412571,1.00283,0.858014
1 s,0.670492,0.973234,0.663087,0.737322,0.737322,0.737545,0.892459,0.670492,0.683838
2 m,1.067577,1.174838,1.0,0.964061,0.964061,0.988634,1.042753,1.016116,0.92521
2 s,0.684937,0.899472,0.747993,0.751146,0.751146,0.735714,0.674185,0.701181,0.786894
3 m,1.008353,0.078135,1.0,0.985568,0.989905,0.989338,0.077782,0.99523,0.971961
3 s,0.858827,0.068065,0.878774,0.881624,0.879113,0.880366,0.066703,0.870039,0.889005
value m,1.011187,0.341333,1.0,0.966242,0.969714,0.971334,0.314467,0.996215,0.952925
value s,1.562728,0.306158,1.580697,1.584483,1.584878,1.585147,0.273563,1.567924,1.583218
