In [9]:
import os, json
import numpy as np
import pandas as pd
import itertools as it
from scipy.stats import wilcoxon

In [10]:
def flatten_dict(d, parent_key='', sep='.'):
    items = []
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, dict):
            items.extend(flatten_dict(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)

In [11]:
def convert_batch_metrics_to_df(df):
    # Find all rows with same name and concatenate, drop Nones
    metrics_names = df.columns.unique()
    metrics = {}
    for metric_name in metrics_names:
        numpy_array = np.concatenate(df[metric_name].values, axis=0)
        # delete nans
        numpy_array = numpy_array[~np.isnan(numpy_array)]
        metrics[metric_name] = numpy_array
    return pd.DataFrame(metrics)

In [12]:
exp_name = "small-10-gen"
exp_ids_to_drop = ["Y7IGw7GruwZud8VT"]
# Decide how to subset
subset_keys = ["data_params.name"]
sep_str_converter = {"model.rec_model.name": lambda x:x,
                     "init.method": lambda x: x if (x is not None and isinstance(x,str)) else "Random",
                     "init.training.cutoff": lambda x: "cutoff" if x is not None and x is True else ""}
sep_keys = list(sep_str_converter.keys())
metrics_to_test = []

In [13]:
sep_keys

['model.rec_model.name', 'init.method', 'init.training.cutoff']

In [14]:
exp_folder = f"../out/exp/{exp_name}"
log_folder = f"../out/log/{exp_name}"
results_folder = f"../out/results/{exp_name}"

In [15]:
exp_ids = []

In [16]:
# Find all re-run exp_ids of log_folder
exp_logs = {}
for exp_id in os.listdir(log_folder):
    for version_str in os.listdir(f"{log_folder}/{exp_id}/lightning_logs"):
        if version_str == "version_1":
            exp_ids.append(exp_id)
            exp_logs[exp_id] = convert_batch_metrics_to_df(pd.read_csv(f"{log_folder}/{exp_id}/lightning_logs/version_1/metrics_per_sample.csv", header=None, index_col=0).T)

In [17]:
# Load the configuration of each exp_id
exp_confs = {}
for exp_id in exp_ids:
    if exp_id in exp_ids_to_drop: continue
    exp_id_config_path = f"{exp_folder}/{exp_id}.json"
    with open(exp_id_config_path, "r") as f:
        exp_confs[exp_id] = flatten_dict(json.load(f))
exp_confs_df = pd.DataFrame(exp_confs).T # concatenate exp_confs with exp_ids as row_idx

In [18]:
len(exp_confs)

11

In [19]:
def f1_function(prec, rec):
    app = 2*prec*rec/(prec+rec)
    app[np.isnan(app)] = 0
    return app

for exp_id,df in exp_logs.items():
    for k in [5,10,20]:
        exp_logs[exp_id][f"test_F1_@{k}"] = f1_function(df[f"test_Precision_@{k}"], df[f"test_Recall_@{k}"])

In [20]:
exp_id_to_conf_name = {}
group_subsets = {}
for group_subset_key_values, group_df in exp_confs_df.groupby(subset_keys):
    group_ids = group_df.index
    group_subsets[group_subset_key_values] = group_ids
    for sep_values,sep_df in group_df.groupby(sep_keys, dropna=False):
        if len(sep_df.index) > 1:
            raise ValueError((f"Multiple exps for {sep_values}"))
        conf_name = [*group_subset_key_values]
        for sep_key,sep_values in zip(sep_keys,sep_values):
            app = sep_str_converter[sep_key](sep_values)
            if app != "" and (isinstance(app,str) or not np.isnan(app)): conf_name.append(app)
        conf_name = "_".join(conf_name)
        exp_id_to_conf_name[sep_df.index[0]] = conf_name

In [21]:
# avg_per_metric_per_exp_id = {}
# for metric_name in metrics_to_test:
#     for exp_id, df in exp_logs.items():
#         avg_per_metric_per_exp_id[exp_id] = df[metric_name].mean()

In [22]:
metrics_to_test = [f"test_{metric}_@{k}" for metric in ["F1","NDCG"] for k in [5,10,20]]
stat_test = lambda x,y: np.round(wilcoxon(x, y, alternative="greater").pvalue,3)

In [23]:
stat_tests = {}
for group_subset_values, group_subset_exp_ids in group_subsets.items():
    for exp_id1, exp_id2 in it.permutations(group_subset_exp_ids,2):
        for metric_name in metrics_to_test:
            arr1 = exp_logs[exp_id1][metric_name]
            arr2 = exp_logs[exp_id2][metric_name]
            pvalue = stat_test(arr1, arr2)
            #stat_tests[(exp_id_to_conf_name[exp_id1], exp_id_to_conf_name[exp_id2], metric_name)] = pvalue
            stat_tests[(exp_id_to_conf_name[exp_id1], exp_id_to_conf_name[exp_id2], metric_name)] = f"{np.mean(arr1)} vs {np.mean(arr2)}"

In [24]:
#if pvalue < alpha, arr1 is greater than arr2

In [25]:
# Save stat tests in a csv
stat_tests_df = pd.DataFrame(stat_tests, index=["pvalue"]).T

In [26]:
stat_tests_df

Unnamed: 0,Unnamed: 1,Unnamed: 2,pvalue
behance_BERT4Rec_svd,behance_BERT4Rec_leporid,test_F1_@5,0.16636029618344836 vs 0.17422385837259843
behance_BERT4Rec_svd,behance_BERT4Rec_leporid,test_F1_@10,0.11250835713906884 vs 0.1183990657750171
behance_BERT4Rec_svd,behance_BERT4Rec_leporid,test_F1_@20,0.07121702948874009 vs 0.07395979329983982
behance_BERT4Rec_svd,behance_BERT4Rec_leporid,test_NDCG_@5,0.3748891823194629 vs 0.38620753600672986
behance_BERT4Rec_svd,behance_BERT4Rec_leporid,test_NDCG_@10,0.4135545370210072 vs 0.42793945013545454
...,...,...,...
bookcrossing_SASRec_svd,bookcrossing_GRU4Rec_leporid,test_F1_@10,0.017094435968247064 vs 0.017290360735791443
bookcrossing_SASRec_svd,bookcrossing_GRU4Rec_leporid,test_F1_@20,0.01785714311056394 vs 0.0176390601846447
bookcrossing_SASRec_svd,bookcrossing_GRU4Rec_leporid,test_NDCG_@5,0.02528112373266626 vs 0.028195357114751022
bookcrossing_SASRec_svd,bookcrossing_GRU4Rec_leporid,test_NDCG_@10,0.04127926222080815 vs 0.04304827090577576


In [27]:
if not os.path.exists(results_folder):
    os.makedirs(results_folder)

In [28]:
# Save stat tests in a csv
stat_tests_df.to_csv(f"{results_folder}/prova_last_stat_tests_palue.csv")

In [29]:
f#"{results_folder}/stat_tests_means.csv"

<_io.TextIOWrapper name='../out/exp/small-10-gen/GUY0Cs8x6YG5GC20.json' mode='r' encoding='UTF-8'>