In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import scipy.stats as stats

# Resetting to the default seaborn theme
sb.set_theme()

In [None]:
# Name of the results directory; read_df_from_csv scans it as '../' + out_dir.
out_dir = "out"

In [None]:
# Dataset identifiers, kept in alphabetical order.
datasets = sorted(['amzms', 'ml100k', 'mlls'])

# Raw algorithm identifier -> display label (labels may carry `$...$` markup).
col_name = dict(
    opf="OPF",
    opf_snn="OPF$_{SNN}$",
    kmeans="$k$-Means",
    user_knn="User-KNN",
    dbscan="DBSCAN",
)

# Known algorithm identifiers, in the same order as the label table above.
algorithms = list(col_name)

# Raw distance identifier -> display label.
dist_name = dict(
    cosine="Cosine",
    euclidean="Euclidean",
    jaccard="Jaccard",
    pearson="Pearson",
    squared_euclidean="Squared Euclidean",
)

In [None]:
def csv_to_df(f_path: str, alg: str = ''):
    """Load a results CSV and replace raw identifiers with display labels.

    Parameters
    ----------
    f_path : str
        Path of the CSV file. It must contain an ``'Unnamed: 0'`` column
        (dropped here) plus ``'algorithm'`` and ``'distance'`` columns.
    alg : str, optional
        Raw algorithm identifier. When it is one of the module-level
        ``algorithms``, only the rows of that algorithm are returned; any
        other value (including the default ``''``) returns every row.

    Returns
    -------
    pandas.DataFrame
        The loaded table with ``'algorithm'`` / ``'distance'`` translated
        through the module-level ``col_name`` / ``dist_name`` dictionaries.
        Identifiers missing from those dictionaries become ``None``.

    Raises
    ------
    KeyError
        If the CSV has no ``'Unnamed: 0'`` column.
    """
    df = pd.read_csv(f_path, header=0).drop(columns=['Unnamed: 0'])

    # Series.map replaces DataFrame.applymap, which is deprecated since
    # pandas 2.1; mapping through `.get` keeps the "unknown -> None" behaviour.
    df['algorithm'] = df['algorithm'].map(col_name.get)
    df['distance'] = df['distance'].map(dist_name.get)

    if alg in algorithms:
        df = df[df['algorithm'] == col_name.get(alg)]

    return df

### Plot evaluation measure x distance function

In [None]:
# Figure-wide styling: "poster" context with enlarged fonts on a dashed white grid.
sb.set_context("poster", font_scale=1.2)
# set_style only honours style parameters in `rc`; the former "context" and
# "font_scale" entries were silently ignored there (set_context above handles them).
sb.set_style(style="whitegrid", rc={"grid.linestyle": "--"})

metric = 'rmse'
plt_name = f"opf_snn_dist_{metric}"

# Make sure the output folder exists before the first savefig call.
figs_dir = "figs"
os.makedirs(figs_dir, exist_ok=True)

for ds in datasets:

    # Results of the OPF_SNN approach only, for the current dataset.
    df_snn = csv_to_df(f"../{out_dir}/{ds}_results.csv", alg='opf_snn')

    # One line per distance function: `metric` as a function of n_neighbors.
    g = sb.relplot(
        kind="line",
        data=df_snn,
        x="n_neighbors",
        y=metric,
        hue="distance",
        style="distance",
        markers=True,
        palette="Set1",
        height=6,
        aspect=1.6,
        legend='brief',
        facet_kws={'legend_out': False},
        errorbar=('ci', 50),
    )

    g.set(xlabel="Maximum number of neighbors ($k_{max}$)", ylabel=metric.upper())

    sb.despine(offset=5, trim=False)

    # plt.title(ds.upper())
    plt.tight_layout(pad=.4)
    # Re-draw the legend above the axes, in two columns and without a frame.
    plt.legend(frameon=False, ncol=2, bbox_to_anchor=(0.48, 1.4), loc='upper center', borderaxespad=0)
    plt.savefig(f"./{figs_dir}/{ds}_{plt_name}.pdf", format='pdf', bbox_inches='tight')
    plt.show()
    # Release the figure so repeated runs do not accumulate open figures.
    plt.close(g.figure)

In [None]:
def read_df_from_csv(ds_name: str, alg: str = ''):
    """Load the results CSV of one dataset from the ``'../' + out_dir`` folder.

    A file is a candidate when its name contains ``'.csv'`` and ``ds_name``
    and does not contain ``'old'``. If several files qualify, the last one in
    directory-scan order is used.

    NOTE(review): a function with this same name is defined again in a later
    cell of the notebook; whichever cell runs last wins.

    Parameters
    ----------
    ds_name : str
        Dataset identifier searched for in the file names.
    alg : str, optional
        Raw algorithm identifier. When it is one of the module-level
        ``algorithms``, only the rows of that algorithm are returned.

    Returns
    -------
    pandas.DataFrame
        The loaded table without the ``'Unnamed: 0'`` column, with the
        ``'algorithm'`` and ``'distance'`` values replaced by display labels
        (unknown identifiers become ``None``).

    Raises
    ------
    FileNotFoundError
        If no matching CSV file exists in the results folder.
    """
    col_name = {"opf": "OPF",
                "opf_snn": "OPF$_{SNN}$",
                "user_knn": "User-KNN",
                "kmeans": "$k$-Means",
                "dbscan": "DBSCAN"}

    dist_name = {"cosine": "Cosine", "euclidean": "Euclidean",
                 "jaccard": "Jaccard", "pearson": "Pearson",
                 "squared_euclidean": "Squared Euclidean"}

    results_dir = '/'.join(['..', out_dir])

    # Filter on the `ds_name` argument (the previous code read a global `ds`,
    # which only worked when the caller's loop variable had that exact name).
    with os.scandir(results_dir) as entries:
        matches = [f.path for f in entries
                   if '.csv' in f.name and 'old' not in f.name and ds_name in f.name]

    if not matches:
        raise FileNotFoundError(
            f"No results CSV for dataset '{ds_name}' found in '{results_dir}'")

    # Only the last match was ever kept, so parse just that one.
    _df = pd.read_csv(matches[-1], header=0).drop(columns=['Unnamed: 0'])

    # Series.map replaces DataFrame.applymap (deprecated since pandas 2.1).
    _df['algorithm'] = _df['algorithm'].map(col_name.get)
    _df['distance'] = _df['distance'].map(dist_name.get)

    if alg in algorithms:
        _df = _df[_df['algorithm'] == col_name.get(alg)]

    return _df

## Recommendation phase evaluation
#### Generating statistics regarding the results from sparse data

In [None]:
def create_new_entry(cols='alg'):
    """Build an empty table row: a dict whose values are all ``""``.

    With ``cols == 'metric'`` the keys are the labels in the module-level
    ``metric_map``; for any other value they are the labels in ``algo_map``.
    """
    labels = metric_map if cols == 'metric' else algo_map
    return dict.fromkeys(labels.values(), "")

In [None]:
# Results directory name (joined onto '..' when the result files are scanned).
out_dir = "out"

# Algorithm identifier -> label used for the "Approach" rows of the results table.
algo_map = dict(
    user_knn="User-KNN",
    kmeans="$k$-Means",
    opf="OPF",
    opf_snn="OPF$_{SNN}$",
    dbscan="DBSCAN",
)

# Evaluation-measure column -> table header (the upper-cased column name).
metric_map = {
    name: name.upper()
    for name in (
        "mae", "rmse",
        "ndcg@1", "ndcg@5", "ndcg@10",
        "precision@1", "precision@5", "precision@10",
        "recall@1", "recall@5", "recall@10",
    )
}

# Dataset identifier -> label used for the "Dataset" rows of the results table.
dataset_map = dict(amzms="AMZ-MS", ml100k="ML-100K", mlls="ML-LS")

In [None]:
approaches = list(algo_map.values())
measures = list(metric_map.values())

# Per-dataset settings, listed in the same (alphabetical) order as `datasets`.
kmax = [30, 50, 60]
distances = ['Jaccard', 'Cosine', 'Squared Euclidean']
assert len(datasets) == len(kmax) == len(distances), "one (kmax, distance) pair per dataset expected"
alg_prm = {name: [k, d] for name, k, d in zip(datasets, kmax, distances)}

# Empty results table: one row per (dataset, approach), one column per measure.
tb_index = pd.MultiIndex.from_product([list(dataset_map.values()), approaches], names=["Dataset", "Approach"])
tb = pd.DataFrame(index=tb_index, columns=list(metric_map.values()))

for ds in datasets:

    df = read_df_from_csv(ds)

    # Keep only the runs obtained with the settings selected for this dataset.
    search_query = f"n_neighbors == {alg_prm.get(ds)[0]} and distance == '{alg_prm.get(ds)[1]}'"
    query = df.query(search_query).groupby('algorithm')[list(metric_map.keys())]

    # The aggregates do not depend on the approach or the measure, so they are
    # computed once per dataset instead of once per table cell.
    means = query.mean()
    stds = query.std()

    for alg in approaches:

        # Create an empty dataframe row
        row = create_new_entry(cols='metric')

        for m_key, m_value in metric_map.items():

            mean = means.loc[alg, m_key]
            std = stds.loc[alg, m_key]
            # Raw f-string: "\p" is not a valid escape sequence in a normal string.
            row[m_value] = rf"{mean:.4f} \pm {std:.4f}"

        tb.loc[dataset_map.get(ds), alg] = row

print("Done...")

In [None]:
# Rating prediction average results: the first two measure columns of `tb`
# (MAE and RMSE, following the key order of `metric_map`).
df_rp = tb[measures[:2]]
# df_rp

In [None]:
# Ranking average results: measure columns 3 to 5 of `tb`
# (NDCG@1, NDCG@5 and NDCG@10, following the key order of `metric_map`).
df_rank = tb[measures[2:5]]
# df_rank

In [None]:
# Decision support average results: the remaining measure columns of `tb`
# (PRECISION@k and RECALL@k for k = 1, 5, 10).
df_ds = tb[measures[5:]]
# df_ds

In [None]:
# Average results considering all evaluation measures and datasets
# (one row per dataset/approach pair; cells are "mean \pm std" strings).
tb

In [None]:
def read_df_from_csv(ds_name: str, alg: str = ''):
    """Load the results CSV of one dataset from the ``'../' + out_dir`` folder.

    A file is a candidate when its name contains ``'.csv'`` and ``ds_name``
    and does not contain ``'old'``. If several files qualify, the last one in
    directory-scan order is used.

    NOTE(review): a function with this same name is also defined in an earlier
    cell of the notebook; this later definition replaces it once executed.

    Parameters
    ----------
    ds_name : str
        Dataset identifier searched for in the file names.
    alg : str, optional
        Raw algorithm identifier. When it is one of the module-level
        ``algorithms``, only the rows of that algorithm are returned.

    Returns
    -------
    pandas.DataFrame
        The loaded table without the ``'Unnamed: 0'`` column, with the
        ``'algorithm'`` and ``'distance'`` values replaced by display labels
        (unknown identifiers become ``None``).

    Raises
    ------
    FileNotFoundError
        If no matching CSV file exists in the results folder.
    """
    col_name = {"opf": "OPF", "opf_snn": "OPF$_{SNN}$",
                "user_knn": "User-KNN", "kmeans": "$k$-Means",
                "dbscan": "DBSCAN"}
    dist_name = {"cosine": "Cosine", "euclidean": "Euclidean",
                 "jaccard": "Jaccard", "pearson": "Pearson",
                 "squared_euclidean": "Squared Euclidean"}

    results_dir = '/'.join(['..', out_dir])

    # Filter on the `ds_name` argument (the previous code read a global `ds`,
    # which only worked when the caller's loop variable had that exact name).
    with os.scandir(results_dir) as entries:
        matches = [f.path for f in entries
                   if '.csv' in f.name and 'old' not in f.name and ds_name in f.name]

    if not matches:
        raise FileNotFoundError(
            f"No results CSV for dataset '{ds_name}' found in '{results_dir}'")

    # Only the last match was ever kept, so parse just that one.
    _df = pd.read_csv(matches[-1], header=0).drop(columns=['Unnamed: 0'])

    # Series.map replaces DataFrame.applymap (deprecated since pandas 2.1).
    _df['algorithm'] = _df['algorithm'].map(col_name.get)
    _df['distance'] = _df['distance'].map(dist_name.get)

    if alg in algorithms:
        _df = _df[_df['algorithm'] == col_name.get(alg)]

    return _df

In [None]:
def _as_math(cell: str) -> str:
    """Wrap a table cell in ``$...$`` unless it already contains a ``$``."""
    return cell if "$" in cell else "$" + cell + "$"


def _as_bold(cell: str) -> str:
    r"""Wrap a table cell in ``$\mathbf{...}$`` unless it is already bold.

    A cell that already contains a ``$`` has its first and last character
    stripped before being wrapped.
    """
    if "mathbf" in cell:
        return cell
    if "$" in cell:
        cell = cell[1:-1]
    return r"$\mathbf{" + cell + "}$"


key_to_metric = {value: key for key, value in metric_map.items()}
col_measures = list(key_to_metric.values())  # i.e. the raw measure column names
np_algs = np.array(list(algo_map.keys()))

alpha = 0.05 # Significance level of the test
alt_hipothesis = 'greater' # Better result = Maximum result
zmethod = 'wilcox'

for ds in datasets:

    # Get the optimal parameters regarding dataset ds
    kmax = alg_prm.get(ds)[0]
    dist = alg_prm.get(ds)[1]

    print("\nDataset:", ds, "  kmax:", kmax, "  Distance:", dist)

    df = read_df_from_csv(ds)

    # Measure columns of every algorithm, restricted to the selected settings.
    search_query = f"n_neighbors == {kmax} and distance == '{dist}'"
    df_algs = {key: df.query(f"{search_query} and algorithm == '{algo_map[key]}'")[col_measures]
               for key in algo_map.keys()}

    ds_label = dataset_map.get(ds)

    # NOTE: `metric` is the raw column name, `metric_key` the table header.
    for metric, metric_key in metric_map.items():

        # Error measures are minimised, every other measure is maximised.
        lower_is_better = metric in ['mae', 'rmse']
        avg_scores = [df_algs[i][metric].mean() for i in np_algs]
        idx_best = np.argmin(avg_scores) if lower_is_better else np.argmax(avg_scores)

        alg_best = np_algs[idx_best]
        print("Best:", alg_best)

        # Sets the control group (i.e., the best average result)
        cg = df_algs[alg_best][metric].values
        best_row = (ds_label, algo_map.get(alg_best))

        test_groups = np.delete(np_algs, idx_best)

        for alg_test in test_groups:

            print("Test", alg_test, "against", alg_best, "...")

            tg = df_algs[alg_test][metric].values
            test_row = (ds_label, algo_map.get(alg_test))

            stat, pvalue = stats.wilcoxon(cg, tg, zero_method=zmethod, alternative='two-sided')

            if pvalue > alpha:
                # No significant difference between the two groups: nothing is bolded.
                bold_row = None
            else:
                # Significant difference: a one-sided test decides which group to bold.
                # Better result = minimum result for the error measures.
                alt_hipothesis = 'less' if lower_is_better else 'greater'

                stat, pvalue = stats.wilcoxon(cg, tg, zero_method=zmethod, alternative=alt_hipothesis)

                # One-sided null not rejected -> bold the test group,
                # otherwise -> bold the control group.
                bold_row = test_row if pvalue > alpha else best_row

            # Write back with a single .loc[row, column] call. The former chained
            # form `tb.loc[a, b][col] = value` assigns to a temporary row object
            # and leaves `tb` untouched when pandas Copy-on-Write is active.
            for row in (best_row, test_row):
                mark = _as_bold if row == bold_row else _as_math
                tb.loc[row, metric_key] = mark(tb.loc[row, metric_key])

print("Done...")

# Saving the dataframe as .csv (will be converted to latex table further)
save_path = "/".join(["tables", "sparse", "sparse_data_stats_22.06.2023.csv"])
os.makedirs(os.path.dirname(save_path), exist_ok=True)
tb.to_csv(save_path)

print("Results latex table saved in '", save_path)