# MovieLens 1M - Results of Frisch et al. co-clustering

Dataset with different matching user/movie ids

In [8]:
import os
import pickle
import pandas as pd
import numpy as np
import re

from sklearn.cluster import KMeans
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.metrics.cluster import normalized_mutual_info_score

current_path = os.getcwd()
while not os.path.exists(os.path.join(current_path, '.git')):
    current_path = os.path.dirname(current_path)

import sys
sys.path.append(f'{current_path}/tauCC/src')
from fairness_metrics import balance_gen, balance_chierichetti, KL_fairness_error

root = os.getcwd()

In [2]:
# Dataset
DATASET = "movielens-1m"
SENSITIVE = "gender"
TRUE_LABEL = "genres"
TRUE_LABEL_DIM = "cols"

if "movielens" in DATASET:
    dataset_path = f"./fair_taucc/datasets/movielens/{DATASET}"
else:
    dataset_path = f"./fair_taucc/datasets/{DATASET}"
    
Sx = np.load(dataset_path + f"/{SENSITIVE}.npy", allow_pickle=True).astype(int)

if TRUE_LABEL != " ":
    true_labels = np.load(dataset_path + f"/{TRUE_LABEL}.npy", allow_pickle=True).astype(int)
    if true_labels.ndim != 1:
        true_labels = true_labels.reshape(-1)

# Frisch et al. results
if DATASET == "movielens-1m":
    result_path = root + f"/results/movielens-1m/{SENSITIVE}/lbm_fair"
    baseline_path = root + f"/results/movielens-1m/{SENSITIVE}/lbm_baseline"
else:
    result_path = root + f"/results/{DATASET}/{SENSITIVE}/lbm_fair"
    baseline_path = root + f"/results/{DATASET}/{SENSITIVE}/lbm_baseline"

In [3]:
# movies/users mapping of Frisch et al.
df_movies = pd.read_csv(root + f"/data/movielens_1m/movielens-1M_mapping_movies.csv")
df_users = pd.read_csv(root + f"/data/movielens_1m/movielens-1M_mapping_users.csv")

# movies/users mapping of TauCC
df_movies_tau = pd.read_csv("./fair_taucc/datasets/movielens/movielens-1m/mapping_movies.csv")
df_users_tau = pd.read_csv("./fair_taucc/datasets/movielens/movielens-1m/mapping_users.csv")

def get_pkl_files_with_os(directory):
    absolute_path = []
    filenames = []
    for file in os.listdir(directory):
        if file.endswith(".pkl"):
            filenames.append(file)
            absolute_path.append(os.path.join(directory, file))
    return absolute_path, filenames

def get_row_labels(row_clus):
    row_labels = np.full(row_clus.shape, -1, dtype=int)
    for id_row, clus in enumerate(row_clus):
        user = df_users[df_users["id_row"] == id_row]["id_user"].values[0]
        true_id_row = df_users_tau[df_users_tau["id_user"] == user]["id_row"].values[0]
        row_labels[true_id_row] = clus
        #print(f"Frisch et al.: row {id_row} - user {user} - cluster {clus}")
        #print(f"Battaglia et al.: row {true_id_row} - user {user} - cluster {clus}")
    return row_labels

def get_col_labels(col_clus):
    col_labels = np.full(col_clus.shape, -1, dtype=int)
    for id_col, clus in enumerate(col_clus):
        movie = df_movies[df_movies["id_col"] == id_col]["id_movie"].values[0]
        true_id_col = df_movies_tau[df_movies_tau["id_movie"] == movie]["id_col"].values[0]
        col_labels[true_id_col] = clus
        #print(f"Frisch et al.: col {id_col} - movie {movie} - cluster {clus}")
        #print(f"Battaglia et al.: col {true_id_col} - movie {movie} - cluster {clus}")
    return col_labels

# Best run of Vanilla LBM

### Results of LBM baseline

In [4]:
baseline_path

'./fair_taucc/algorithms/C-Fairness-RecSys/reproducibility_study/Frisch_et_al/results/movielens-1m/gender/lbm_baseline'

In [5]:
# Find the best run of Vanilla LBM
pkl_abspath, pkl_filenames = get_pkl_files_with_os(baseline_path)
exec_times = pd.read_csv(baseline_path + "/time.csv", sep=",")["time"].to_numpy()
total_runs = len(pkl_filenames)

if not os.path.exists(baseline_path + "/results.csv"):
    with open(baseline_path + "/results.csv", "a") as file:
        file.write(f"run;row_clus;col_clus;NLL;NMI_true_labels;AMI_true_labels;ARI_true_labels;balance_chierichetti;balance_bera;KL_fairness_error;time\n")


print("***LBM Ordinal***")
for filename, path in zip(pkl_filenames, pkl_abspath):
    match = re.search(r"run_(\d+)", filename)
    if match:
        run = int(match.group(1))
        print("run: ", run)
    else:
        raise ValueError(f"Run not found in {filename}")
        
    with open(path, "rb") as f:
        data = pickle.load(f)
        
    nll = data["nll"]
    K_rows = data["nq"]
    K_cols = data["nl"]
    
    print("Run Kmeans on tau1...")
    kmeans_rows = KMeans(n_clusters=K_rows).fit(data["model"]["tau_1"])
    print("Kmeans on tau1 terminated.")
    print("Run Kmeans on tau2...")
    kmeans_cols = KMeans(n_clusters=K_cols).fit(data["model"]["tau_2"])
    print("Kmeans on tau2 terminated.")
    
    row_clus = kmeans_rows.labels_
    col_clus = kmeans_cols.labels_
    
    row_labels = get_row_labels(row_clus)
    col_labels = get_col_labels(col_clus)
    
    np.save(baseline_path + f"/run_{run}_row_clustering.npy", row_labels)
    np.save(baseline_path + f"/run_{run}_col_clustering.npy", col_labels)
    
    bera = balance_gen(Sx, row_labels)
    chierichetti = balance_chierichetti(Sx, row_labels)
    kl_error = KL_fairness_error(row_labels, K_rows, Sx)
    
    time = exec_times[run-1]
    
    NMI_true_labels = normalized_mutual_info_score(true_labels, col_labels)
    AMI_true_labels = adjusted_mutual_info_score(true_labels, col_labels)
    ARI_true_labels = adjusted_rand_score(true_labels, col_labels)
    
    with open(baseline_path + "/results.csv", "a") as file:
        file.write(f"{run};{K_rows};{K_cols};{nll};{NMI_true_labels};{AMI_true_labels};{ARI_true_labels};{chierichetti};{bera};{kl_error};{time}\n")
    
    

***LBM Ordinal***
run:  6
Run Kmeans on tau1...
Kmeans on tau1 terminated.
Run Kmeans on tau2...
Kmeans on tau2 terminated.
run:  4
Run Kmeans on tau1...
Kmeans on tau1 terminated.
Run Kmeans on tau2...
Kmeans on tau2 terminated.
run:  5
Run Kmeans on tau1...
Kmeans on tau1 terminated.
Run Kmeans on tau2...
Kmeans on tau2 terminated.
run:  2
Run Kmeans on tau1...
Kmeans on tau1 terminated.
Run Kmeans on tau2...
Kmeans on tau2 terminated.
run:  10
Run Kmeans on tau1...
Kmeans on tau1 terminated.
Run Kmeans on tau2...
Kmeans on tau2 terminated.
run:  8
Run Kmeans on tau1...
Kmeans on tau1 terminated.
Run Kmeans on tau2...
Kmeans on tau2 terminated.
run:  3
Run Kmeans on tau1...
Kmeans on tau1 terminated.
Run Kmeans on tau2...
Kmeans on tau2 terminated.
run:  1
Run Kmeans on tau1...
Kmeans on tau1 terminated.
Run Kmeans on tau2...
Kmeans on tau2 terminated.
run:  9
Run Kmeans on tau1...
Kmeans on tau1 terminated.
Run Kmeans on tau2...
Kmeans on tau2 terminated.
run:  7
Run Kmeans on tau1.

### Best run of LBM baselin

In [6]:
df_vanilla = pd.read_csv(baseline_path + "/results.csv", sep=";")
df_vanilla

Unnamed: 0,run,row_clus,col_clus,NLL,NMI_true_labels,AMI_true_labels,ARI_true_labels,balance_chierichetti,balance_bera,KL_fairness_error,time
0,6,25,25,3986005.0,0.090629,0.070098,0.028452,0.206897,0.605868,0.343406,5045.175394
1,4,25,25,3986684.0,0.094484,0.073973,0.02651,0.22619,0.647908,0.242136,5443.626268
2,5,25,25,3985971.0,0.091658,0.071175,0.022266,0.177419,0.532555,0.339765,5139.602992
3,2,25,25,3987253.0,0.087713,0.06701,0.024104,0.205128,0.601571,0.299057,5207.207937
4,10,25,25,3984633.0,0.098922,0.078428,0.02642,0.042553,0.144254,0.789284,4937.501157
5,8,25,25,3987266.0,0.089462,0.068963,0.022083,0.223077,0.643627,0.284208,4749.995351
6,3,25,25,3987711.0,0.08089,0.060004,0.020289,0.212766,0.619618,0.303019,5495.473327
7,1,25,25,3988156.0,0.081704,0.060931,0.021881,0.169643,0.512598,0.42359,5176.061762
8,9,25,25,3984776.0,0.097685,0.077019,0.02926,0.142857,0.441779,0.57477,4766.815741
9,7,25,25,3985428.0,0.093269,0.072628,0.026031,0.212121,0.603393,0.319006,5138.986692


In [7]:
# Best run
id_row = df_vanilla["NLL"].argmin()
best_run_row = df_vanilla.iloc[id_row]
best_run = int(best_run_row["run"])
print("best run of LBM baseline: ", best_run)

best run of LBM baseline:  10


In [9]:
mean_vanilla = df_vanilla.groupby(["row_clus", "col_clus"]).mean().drop(["run"],axis=1)
mean_vanilla

Unnamed: 0_level_0,Unnamed: 1_level_0,NLL,NMI_true_labels,AMI_true_labels,ARI_true_labels,balance_chierichetti,balance_bera,KL_fairness_error,time
row_clus,col_clus,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
25,25,3986388.0,0.090642,0.070023,0.02473,0.181865,0.535317,0.391824,5110.044662


In [10]:
var_vanilla = df_vanilla.groupby(["row_clus", "col_clus"]).var().drop(["run"],axis=1)
var_vanilla

Unnamed: 0_level_0,Unnamed: 1_level_0,NLL,NMI_true_labels,AMI_true_labels,ARI_true_labels,balance_chierichetti,balance_bera,KL_fairness_error,time
row_clus,col_clus,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
25,25,1489122.0,3.6e-05,3.7e-05,9e-06,0.003089,0.02308,0.028143,61862.568915


In [11]:
std_vanilla = df_vanilla.groupby(["row_clus", "col_clus"]).std().drop(["run"],axis=1)
std_vanilla

Unnamed: 0_level_0,Unnamed: 1_level_0,NLL,NMI_true_labels,AMI_true_labels,ARI_true_labels,balance_chierichetti,balance_bera,KL_fairness_error,time
row_clus,col_clus,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
25,25,1220.296004,0.006017,0.006115,0.003044,0.055578,0.151921,0.16776,248.721871


In [12]:
aggregated_vanilla = pd.DataFrame()

for key in mean_vanilla.keys():
    aggregated_vanilla[f"{key}_mean"] = mean_vanilla[key].values

for key in std_vanilla.keys():
    aggregated_vanilla[f"{key}_std"] = std_vanilla[key].values
    
for key in var_vanilla.keys():
    aggregated_vanilla[f"{key}_var"] = var_vanilla[key].values
    
aggregated_vanilla

Unnamed: 0,NLL_mean,NMI_true_labels_mean,AMI_true_labels_mean,ARI_true_labels_mean,balance_chierichetti_mean,balance_bera_mean,KL_fairness_error_mean,time_mean,NLL_std,NMI_true_labels_std,...,KL_fairness_error_std,time_std,NLL_var,NMI_true_labels_var,AMI_true_labels_var,ARI_true_labels_var,balance_chierichetti_var,balance_bera_var,KL_fairness_error_var,time_var
0,3986388.0,0.090642,0.070023,0.02473,0.181865,0.535317,0.391824,5110.044662,1220.296004,0.006017,...,0.16776,248.721871,1489122.0,3.6e-05,3.7e-05,9e-06,0.003089,0.02308,0.028143,61862.568915


In [13]:
aggregated_vanilla.to_csv(baseline_path + "/aggregated.csv", index=False)

# Parity LBM vs LBM baseline

In [14]:
result_path

'./fair_taucc/algorithms/C-Fairness-RecSys/reproducibility_study/Frisch_et_al/results/movielens-1m/gender/lbm_fair'

In [15]:
baseline_row_labels = np.load(baseline_path + f"/run_{best_run}_row_clustering.npy")
baseline_col_labels = np.load(baseline_path + f"/run_{best_run}_col_clustering.npy")

pkl_abspath, pkl_filenames = get_pkl_files_with_os(result_path)
exec_times = pd.read_csv(result_path + "/time.csv", sep=",")["time"].to_numpy()
total_runs = len(pkl_filenames)

if not os.path.exists(result_path + "/results.csv"):
    with open(result_path + "/results.csv", "a") as file:
        file.write(f"run;row_clus;col_clus;NLL;NMI_true_labels;AMI_true_labels;ARI_true_labels;NMI_rows;AMI_rows;ARI_rows;NMI_cols;AMI_cols;ARI_cols;balance_chierichetti;balance_bera;KL_fairness_error;time\n")

print("***Parity LBM***")

for filename, path in zip(pkl_filenames, pkl_abspath):

    match = re.search(r"run_(\d+)", filename)
    if match:
        run = int(match.group(1))
        print("run: ", run)
    else:
        raise ValueError(f"Run not found in {filename}")
        
    with open(path, "rb") as f:
        data = pickle.load(f)
    
    nll = data["nll"]
    K_rows = data["nq"]
    K_cols = data["nl"]
    
    print("Run Kmeans on tau1...")
    kmeans_rows = KMeans(n_clusters=K_rows).fit(data["model"]["tau_1"])
    print("Kmeans on tau1 terminated.")
    print("Run Kmeans on tau2...")
    kmeans_cols = KMeans(n_clusters=K_cols).fit(data["model"]["tau_2"])
    print("Kmeans on tau2 terminated.")
    
    row_clus = kmeans_rows.labels_
    col_clus = kmeans_cols.labels_
    
    row_labels = get_row_labels(row_clus)
    col_labels = get_col_labels(col_clus)
    
    np.save(result_path + f"/run_{run}_row_clustering.npy", row_labels)
    np.save(result_path + f"/run_{run}_col_clustering.npy", col_labels)
    
    bera = balance_gen(Sx, row_labels)
    chierichetti = balance_chierichetti(Sx, row_labels)
    kl_error = KL_fairness_error(row_labels, K_rows, Sx)
    
    time = exec_times[run-1]
    
    if TRUE_LABEL_DIM == "cols":
        NMI_true_labels = normalized_mutual_info_score(true_labels, col_labels)
        AMI_true_labels = adjusted_mutual_info_score(true_labels, col_labels)
        ARI_true_labels = adjusted_rand_score(true_labels, col_labels)
    else:
        NMI_true_labels = normalized_mutual_info_score(true_labels, row_labels)
        AMI_true_labels = adjusted_mutual_info_score(true_labels, row_labels)
        ARI_true_labels = adjusted_rand_score(true_labels, row_labels)
    
    NMI_rows = normalized_mutual_info_score(baseline_row_labels, row_labels)
    AMI_rows = adjusted_mutual_info_score(baseline_row_labels, row_labels)
    ARI_rows = adjusted_rand_score(baseline_row_labels, row_labels)
    
    NMI_cols = normalized_mutual_info_score(baseline_col_labels, col_labels)
    AMI_cols = adjusted_mutual_info_score(baseline_col_labels, col_labels)
    ARI_cols = adjusted_rand_score(baseline_col_labels, col_labels)
        
    with open(result_path + "/results.csv", "a") as file:
        file.write(f"{run};{K_rows};{K_cols};{nll};{NMI_true_labels};{AMI_true_labels};{ARI_true_labels};{NMI_rows};{AMI_rows};{ARI_rows};{NMI_cols};{AMI_cols};{ARI_cols};{chierichetti};{bera};{kl_error};{time}\n")
    

***Parity LBM***
run:  7
Run Kmeans on tau1...
Kmeans on tau1 terminated.
Run Kmeans on tau2...
Kmeans on tau2 terminated.
run:  8
Run Kmeans on tau1...
Kmeans on tau1 terminated.
Run Kmeans on tau2...
Kmeans on tau2 terminated.
run:  2
Run Kmeans on tau1...
Kmeans on tau1 terminated.
Run Kmeans on tau2...
Kmeans on tau2 terminated.
run:  10
Run Kmeans on tau1...
Kmeans on tau1 terminated.
Run Kmeans on tau2...
Kmeans on tau2 terminated.
run:  5
Run Kmeans on tau1...
Kmeans on tau1 terminated.
Run Kmeans on tau2...
Kmeans on tau2 terminated.
run:  3
Run Kmeans on tau1...
Kmeans on tau1 terminated.
Run Kmeans on tau2...
Kmeans on tau2 terminated.
run:  6
Run Kmeans on tau1...
Kmeans on tau1 terminated.
Run Kmeans on tau2...
Kmeans on tau2 terminated.
run:  1
Run Kmeans on tau1...
Kmeans on tau1 terminated.
Run Kmeans on tau2...
Kmeans on tau2 terminated.
run:  4
Run Kmeans on tau1...
Kmeans on tau1 terminated.
Run Kmeans on tau2...
Kmeans on tau2 terminated.
run:  9
Run Kmeans on tau1..

In [16]:
df_fair = pd.read_csv(result_path + "/results.csv", sep=";")
df_fair

Unnamed: 0,run,row_clus,col_clus,NLL,NMI_true_labels,AMI_true_labels,ARI_true_labels,NMI_rows,AMI_rows,ARI_rows,NMI_cols,AMI_cols,ARI_cols,balance_chierichetti,balance_bera,KL_fairness_error,time
0,7,25,25,3984804.0,0.091641,0.071128,0.021653,0.459592,0.449958,0.230007,0.558026,0.54496,0.456917,0.236559,0.676114,0.150128,5032.348135
1,8,25,25,3984188.0,0.092138,0.071549,0.025701,0.484446,0.475368,0.267283,0.565572,0.552609,0.534894,0.165138,0.500915,0.189265,5040.653415
2,2,25,25,3985372.0,0.092294,0.071897,0.023427,0.468148,0.45885,0.239284,0.555363,0.542148,0.446064,0.242424,0.689606,0.142907,5469.025649
3,10,25,25,3985860.0,0.099683,0.079277,0.029875,0.492767,0.483856,0.261134,0.581568,0.569048,0.540696,0.23,0.660872,0.155068,5035.211016
4,5,25,25,3983999.0,0.097283,0.076742,0.03328,0.508402,0.499749,0.286754,0.606648,0.594615,0.497517,0.223602,0.645849,0.157725,5172.110274
5,3,25,25,3986908.0,0.081881,0.061076,0.024936,0.443871,0.434196,0.215078,0.541476,0.527667,0.50195,0.230769,0.662668,0.161417,5457.330221
6,6,25,25,3984960.0,0.091677,0.071016,0.024984,0.483373,0.474237,0.269331,0.564221,0.551103,0.513932,0.163636,0.497001,0.225913,5142.353296
7,1,25,25,3984845.0,0.094724,0.074135,0.026849,0.47305,0.463765,0.24581,0.572905,0.560019,0.54269,0.227642,0.655354,0.148218,5759.248202
8,4,25,25,3984838.0,0.088205,0.067617,0.024728,0.48372,0.474647,0.275289,0.582112,0.569673,0.536206,0.186441,0.555379,0.17284,5472.791174
9,9,25,25,3986329.0,0.086163,0.065484,0.021877,0.475581,0.466387,0.266771,0.577758,0.56512,0.529642,0.15,0.460987,0.189871,5038.973672


In [17]:
df_fair.drop(["run"], axis=1, inplace=True)
df_fair

Unnamed: 0,row_clus,col_clus,NLL,NMI_true_labels,AMI_true_labels,ARI_true_labels,NMI_rows,AMI_rows,ARI_rows,NMI_cols,AMI_cols,ARI_cols,balance_chierichetti,balance_bera,KL_fairness_error,time
0,25,25,3984804.0,0.091641,0.071128,0.021653,0.459592,0.449958,0.230007,0.558026,0.54496,0.456917,0.236559,0.676114,0.150128,5032.348135
1,25,25,3984188.0,0.092138,0.071549,0.025701,0.484446,0.475368,0.267283,0.565572,0.552609,0.534894,0.165138,0.500915,0.189265,5040.653415
2,25,25,3985372.0,0.092294,0.071897,0.023427,0.468148,0.45885,0.239284,0.555363,0.542148,0.446064,0.242424,0.689606,0.142907,5469.025649
3,25,25,3985860.0,0.099683,0.079277,0.029875,0.492767,0.483856,0.261134,0.581568,0.569048,0.540696,0.23,0.660872,0.155068,5035.211016
4,25,25,3983999.0,0.097283,0.076742,0.03328,0.508402,0.499749,0.286754,0.606648,0.594615,0.497517,0.223602,0.645849,0.157725,5172.110274
5,25,25,3986908.0,0.081881,0.061076,0.024936,0.443871,0.434196,0.215078,0.541476,0.527667,0.50195,0.230769,0.662668,0.161417,5457.330221
6,25,25,3984960.0,0.091677,0.071016,0.024984,0.483373,0.474237,0.269331,0.564221,0.551103,0.513932,0.163636,0.497001,0.225913,5142.353296
7,25,25,3984845.0,0.094724,0.074135,0.026849,0.47305,0.463765,0.24581,0.572905,0.560019,0.54269,0.227642,0.655354,0.148218,5759.248202
8,25,25,3984838.0,0.088205,0.067617,0.024728,0.48372,0.474647,0.275289,0.582112,0.569673,0.536206,0.186441,0.555379,0.17284,5472.791174
9,25,25,3986329.0,0.086163,0.065484,0.021877,0.475581,0.466387,0.266771,0.577758,0.56512,0.529642,0.15,0.460987,0.189871,5038.973672


In [18]:
mean_fair = df_fair.groupby(["row_clus", "col_clus"]).mean()
mean_fair

Unnamed: 0_level_0,Unnamed: 1_level_0,NLL,NMI_true_labels,AMI_true_labels,ARI_true_labels,NMI_rows,AMI_rows,ARI_rows,NMI_cols,AMI_cols,ARI_cols,balance_chierichetti,balance_bera,KL_fairness_error,time
row_clus,col_clus,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
25,25,3985210.0,0.091569,0.070992,0.025731,0.477295,0.468101,0.255674,0.570565,0.557696,0.510051,0.205621,0.600474,0.169335,5262.004506


In [19]:
std_fair = df_fair.groupby(["row_clus", "col_clus"]).std()
std_fair

Unnamed: 0_level_0,Unnamed: 1_level_0,NLL,NMI_true_labels,AMI_true_labels,ARI_true_labels,NMI_rows,AMI_rows,ARI_rows,NMI_cols,AMI_cols,ARI_cols,balance_chierichetti,balance_bera,KL_fairness_error,time
row_clus,col_clus,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
25,25,918.508167,0.005204,0.0053,0.003565,0.017862,0.018163,0.022355,0.017992,0.018472,0.034713,0.035292,0.087154,0.025705,257.769488


In [20]:
var_fair = df_fair.groupby(["row_clus", "col_clus"]).var()
var_fair

Unnamed: 0_level_0,Unnamed: 1_level_0,NLL,NMI_true_labels,AMI_true_labels,ARI_true_labels,NMI_rows,AMI_rows,ARI_rows,NMI_cols,AMI_cols,ARI_cols,balance_chierichetti,balance_bera,KL_fairness_error,time
row_clus,col_clus,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
25,25,843657.252678,2.7e-05,2.8e-05,1.3e-05,0.000319,0.00033,0.0005,0.000324,0.000341,0.001205,0.001246,0.007596,0.000661,66445.108853


In [21]:
aggregated = pd.DataFrame()
aggregated

In [22]:
for key in mean_fair.keys():
    aggregated[f"{key}_mean"] = mean_fair[key].values

for key in std_fair.keys():
    aggregated[f"{key}_std"] = std_fair[key].values
    
for key in var_fair.keys():
    aggregated[f"{key}_var"] = var_fair[key].values

In [23]:
aggregated

Unnamed: 0,NLL_mean,NMI_true_labels_mean,AMI_true_labels_mean,ARI_true_labels_mean,NMI_rows_mean,AMI_rows_mean,ARI_rows_mean,NMI_cols_mean,AMI_cols_mean,ARI_cols_mean,...,NMI_rows_var,AMI_rows_var,ARI_rows_var,NMI_cols_var,AMI_cols_var,ARI_cols_var,balance_chierichetti_var,balance_bera_var,KL_fairness_error_var,time_var
0,3985210.0,0.091569,0.070992,0.025731,0.477295,0.468101,0.255674,0.570565,0.557696,0.510051,...,0.000319,0.00033,0.0005,0.000324,0.000341,0.001205,0.001246,0.007596,0.000661,66445.108853


In [24]:
aggregated.to_csv(result_path + "/aggregated.csv", index=False)