In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
import random

In [3]:
def load_results_complete(cancer, base):
    """Load the MLP, GAT and GCN result tables for every strategy/percentile pair.

    Parameters
    ----------
    cancer : currently unused by this function.
    base : str
        Directory that contains ``mlp/mlp_results.csv`` and one
        ``<strategy>/<percentile>/`` folder per combination, each holding
        ``gat_results.csv`` and ``gcn_results.csv``.

    Returns
    -------
    dict
        ``results[strategy][percentile]`` is a DataFrame with the MLP, GAT and
        GCN rows stacked in that order, an ``Algorithm`` label column, and the
        metric columns renamed to their display names. The original row index
        of each CSV is kept, so index labels repeat across algorithms.
    """
    display_names = {
        "loss": "Loss",
        "acc": "Accuracy",
        "auc_roc": "AUC-ROC",
        "auc_pr": "AUC-PR",
        "precision": "Precision",
        "recall": "Recall"
    }
    network_strategies = ("snf", "correlation", "correlation_multi_omics")
    percentile_codes = ("001", "005", "01", "025", "05", "075", "09", "095", "099")

    # The MLP table is read once and repeated in every strategy/percentile entry.
    baseline = pd.read_csv(base+"/mlp/mlp_results.csv")
    baseline['Algorithm'] = "MLP"

    def _stacked(folder):
        # Read both GNN tables from one folder, label them, and stack them under the MLP rows.
        attention = pd.read_csv(folder+"gat_results.csv")
        convolution = pd.read_csv(folder+"gcn_results.csv")
        attention['Algorithm'] = "GAT"
        convolution['Algorithm'] = "GCN"
        return pd.concat([baseline, attention, convolution]).rename(columns=display_names)

    return {
        strategy: {
            code: _stacked(f"{base}/{strategy}/{code}/")
            for code in percentile_codes
        }
        for strategy in network_strategies
    }


def load_means_accross_thresholds(cancer, base):
    """Compute the mean GAT and GCN metrics at every percentile, per strategy.

    Parameters
    ----------
    cancer : currently unused by this function.
    base : str
        Directory containing one ``<strategy>/<percentile>/`` folder per
        combination, each with ``gat_results.csv`` and ``gcn_results.csv``.

    Returns
    -------
    dict
        ``results[strategy]`` is a DataFrame with one row per
        (algorithm, percentile): all GAT rows first, then all GCN rows.
        ``Threshold`` holds the percentile code rendered as a string
        ("001" -> "0.01"). The leading ``index`` column is the row position
        within each algorithm's sub-table, left over from the final
        ``reset_index()``.
    """
    display_names = {
        "loss": "Loss",
        "acc": "Accuracy",
        "auc_roc": "AUC-ROC",
        "auc_pr": "AUC-PR",
        "precision": "Precision",
        "recall": "Recall"
    }
    # (label, file name) pairs, in the order the sub-tables are stacked.
    sources = (("GAT", "gat_results.csv"), ("GCN", "gcn_results.csv"))

    per_strategy = {}
    for strategy in ("snf", "correlation", "correlation_multi_omics"):
        # One table per algorithm; each percentile contributes one column of means.
        mean_tables = {label: pd.DataFrame() for label, _ in sources}
        for code in ("001", "005", "01", "025", "05", "075", "09", "095", "099"):
            folder = f"{base}/{strategy}/{code}/"
            threshold = code[0]+"."+code[1:]
            for label, file_name in sources:
                mean_tables[label][threshold] = pd.read_csv(folder+file_name).mean()

        stacked = []
        for label, _ in sources:
            # Transpose so that percentiles become rows and metrics become columns.
            table = mean_tables[label].T
            table["Algorithm"] = label
            stacked.append(table.reset_index().rename(columns={'index':'Threshold'}))

        per_strategy[strategy] = pd.concat(stacked).reset_index().rename(columns=display_names)

    return per_strategy



# Maps each network strategy's directory name (as used when building result paths)
# to the short label used in plot titles and output file names.
strategy_translation = {
    "snf": "SNF",
    "correlation": "CGEN",   # Correlation Gene Expression Network
    "correlation_multi_omics": "CMON"   # Correlation Multi-Omics Network
}
def plot_results(base):
    """Save one violin+box plot per cancer, strategy, threshold and metric.

    Results for each cancer are read from ``<base>plots/../<cancer>`` and every
    figure is written to
    ``<base>plots/<cancer>/<metric>/<cancer>_<label>_<threshold>.pdf``.
    The target folders are not created here, so they must already exist.

    Parameters
    ----------
    base : str
        Experiments root; it is concatenated directly with ``"plots/"``, so it
        is expected to end with a path separator.
    """
    plots_root = base+"plots/"
    metric_names = ["AUC-ROC", "AUC-PR", "Precision", "Recall", "Accuracy"]
    # Colour of the dashed median line drawn for each algorithm.
    median_colors = {"MLP": "#41AFD3", "GAT": "#A2E454", "GCN": "#E0823C"}

    for cancer in ["COAD", "KIRC", "LUAD"]:
        print(cancer)
        per_strategy = load_results_complete(cancer, plots_root+f"../{cancer}")
        output_dir = f"{plots_root}{cancer}/"

        for strategy, per_threshold in per_strategy.items():
            for threshold, df in per_threshold.items():
                network = strategy_translation[strategy]
                threshold_str = threshold[0]+"."+threshold[1:]
                title = f"{cancer}: {network} - {threshold_str}"

                for metric in metric_names:
                    # Both RNGs are re-seeded before every figure.
                    random.seed(42)
                    np.random.seed(42)
                    target = f"{output_dir}{metric}/{cancer}_{network}_{threshold_str}.pdf"

                    ax = sns.violinplot(x='Algorithm', y=metric, data=df, palette='turbo',
                                        inner=None, linewidth=0, saturation=0.4, scale='count')
                    ax.set(ylim=(0, 1))
                    box_ax = sns.boxplot(data=df, x='Algorithm', y=metric, palette='turbo',
                                         width=0.3, boxprops={'zorder': 2}, ax=ax)
                    box_ax.set(title=title)

                    # One horizontal dashed line at each algorithm's median.
                    for algorithm, color in median_colors.items():
                        median = df.loc[df['Algorithm'] == algorithm, metric].median()
                        plt.axhline(y=median, color=color, linestyle="--")

                    plt.savefig(target)
                    # Clear the current figure so the next plot starts empty.
                    plt.clf()
    return


def plot_results_lines(base):
    """Save one GAT-vs-GCN line plot of mean metric against threshold per cancer, strategy and metric.

    Mean tables for each cancer are read from ``<base>plots/../<cancer>`` and
    figures are written to
    ``<base>plots/<cancer>/<metric>/<cancer>_<label>_growth.pdf``.
    The target folders are not created here, so they must already exist.

    Parameters
    ----------
    base : str
        Experiments root; it is concatenated directly with ``"plots/"``, so it
        is expected to end with a path separator.
    """
    plots_root = base+"plots/"
    line_colors = ["#A2E454", "#E0823C"]

    for cancer in ["COAD", "KIRC", "LUAD"]:
        print(cancer)
        means_by_strategy = load_means_accross_thresholds(cancer, plots_root+f"../{cancer}")
        output_dir = f"{plots_root}{cancer}/"

        for strategy, mean_table in means_by_strategy.items():
            network = strategy_translation[strategy]
            title = f"{cancer}: {network}"

            for metric in ("AUC-ROC", "AUC-PR", "Precision", "Recall", "Accuracy"):
                target = f"{output_dir}{metric}/{cancer}_{network}_growth.pdf"
                axes = sns.lineplot(data=mean_table, x="Threshold", y=metric, hue="Algorithm",
                                    palette=line_colors, marker="o")
                axes.set(title=title)
                plt.savefig(target)
                # Clear the current figure so the next plot starts empty.
                plt.clf()
    return

In [28]:
import numpy as np
import pandas as pd
from scipy import stats
import scikit_posthocs as sp
import itertools
from contextlib import redirect_stdout


def stats_test(df, p_target, out_file_name):
    """Run a Kruskal-Wallis test per metric and, when significant, a post-hoc Wilcoxon test.

    The report is written to ``<out_file_name>_kw.txt``; nothing is returned.

    Samples are grouped by the labels actually present in ``df['Algorithm']``.
    The previous implementation compared that column against the integers
    1..3, which never match the string labels used in this notebook, so every
    sample passed to Kruskal-Wallis was empty. It was also limited to three groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``Algorithm`` column with the group label of each row.
        Every other column is treated as a metric to test.
    p_target : float
        Significance level. The null hypothesis is reported as rejected when
        the Kruskal-Wallis p-value is not greater than this value.
    out_file_name : str
        Path prefix of the report file.

    Raises
    ------
    ValueError
        If ``df`` contains fewer than two distinct ``Algorithm`` groups.
    """
    # One sub-frame per distinct label, in order of first appearance.
    groups = [group for _, group in df.groupby('Algorithm', sort=False)]
    if len(groups) < 2:
        raise ValueError("stats_test needs at least two distinct 'Algorithm' groups")

    file = out_file_name + '_kw.txt'
    print("Saving results to ", file)
    # Everything printed inside this block goes to the report file.
    with open(file, 'w') as f, redirect_stdout(f):
        for column in df.columns:
            if column == 'Algorithm':
                continue
            print(f'Kruskal Wallis test for {column}')
            # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.kruskal.html
            w, p = stats.kruskal(*(group[column] for group in groups))

            if p > p_target:
                print('It does not rejects the Null Hypothesis that data belongs the same distribution.')
                print(f'The Kruskal-Wallis H statistic: {w}')
                print(f'p-value: {p:.4f}')
            else:
                print('It rejects the Null Hypothesis that data belongs the same distribution.')
                print(f'The Kruskal-Wallis H statistic: {w}')
                print(f'p-value: {p:.4f}')

                # NOTE(review): the signed-rank test is a paired test, so this presumably
                # requires every group to have the same number of rows — confirm.
                phdt = sp.posthoc_wilcoxon(a=df, val_col=column, group_col='Algorithm', p_adjust='bonferroni')
                print('Post-Hoc Wilcoxon signed-rank test with Bonferroni correction (p-values)')
                print(phdt)

                print(f'Pairs with p < {p_target}:')
                for c1, c2 in itertools.combinations(phdt.columns, 2):
                    pair_p = phdt.loc[c2, c1]
                    if pair_p < p_target:
                        print(c1, ',', c2, ':', pair_p)

            print('\n')

In [9]:
def get_algorithm_df(all_data, network, threshold, algorithm, setup):
    """Extract one algorithm's rows from a network/threshold table and relabel them.

    Parameters
    ----------
    all_data : dict
        Nested mapping ``all_data[network][threshold] -> DataFrame``.
    network, threshold :
        Keys selecting the table.
    algorithm :
        Value of the ``Algorithm`` column to keep.
    setup :
        Replacement for every cell equal to ``algorithm`` in the selected rows.

    Returns
    -------
    pandas.DataFrame
        The selected rows, restricted to the ``Algorithm``, ``Accuracy``,
        ``AUC-ROC`` and ``AUC-PR`` columns. The source table is not modified.
    """
    table = all_data[network][threshold]
    is_selected = table['Algorithm'] == algorithm
    relabelled = table.loc[is_selected].replace(algorithm, setup)
    return relabelled.loc[:, ['Algorithm', 'Accuracy', 'AUC-ROC', 'AUC-PR']]


def perform_stats_test(cancer_cohort, snf_best, cgen_best, cmon_best, base=None):
    """Compare the MLP with one chosen configuration per network strategy.

    Parameters
    ----------
    cancer_cohort : str
        Passed to ``load_results_complete`` and used as the report file prefix.
    snf_best, cgen_best, cmon_best : sequence
        ``(threshold, GNN)`` pair for each strategy, e.g. ``('099', 'GAT')``.
    base : str
        Results directory handed to ``load_results_complete``.

    Returns
    -------
    pandas.DataFrame
        Rows of the four selected setups, labelled ``MLP``, ``SNF-Best``,
        ``CGEN-Best`` and ``CMON-Best``, with a fresh 0..n-1 index.
    """
    all_results = load_results_complete(cancer_cohort, base)

    # (network, threshold, algorithm, label). The MLP rows are taken from the
    # ('snf', '001') table.
    selections = [
        ('snf', '001', 'MLP', 'MLP'),
        ('snf', snf_best[0], snf_best[1], 'SNF-Best'),
        ('correlation', cgen_best[0], cgen_best[1], 'CGEN-Best'),
        ('correlation_multi_omics', cmon_best[0], cmon_best[1], 'CMON-Best'),
    ]
    selected = [get_algorithm_df(all_results, *selection) for selection in selections]
    best_cfgs = pd.concat(selected).reset_index(drop=True)

    stats_test(best_cfgs, 0.05, cancer_cohort)
    return best_cfgs

In [18]:
def show_metrics_mean_and_std(best_cfgs):
    """Display the mean and standard deviation of each metric for every setup.

    Side effect: sets the pandas float display format to two decimals for the
    whole session.

    Parameters
    ----------
    best_cfgs : pandas.DataFrame
        Table with an ``Algorithm`` column holding the labels ``MLP``,
        ``CGEN-Best``, ``CMON-Best`` and ``SNF-Best``.
    """
    pd.set_option("display.float_format", "{:,.2f}".format)
    for label in ("MLP", "CGEN-Best", "CMON-Best", "SNF-Best"):
        print("Algorithm: ", label)
        is_label = best_cfgs["Algorithm"] == label
        summary = best_cfgs.loc[is_label].describe()
        # Rows 1 and 2 of describe() are the mean and the standard deviation.
        display(summary.iloc[1:3])

In [29]:
# KIRC cohort: pick one [threshold code, GNN] pair per network strategy, run the
# statistical comparison together with the MLP rows, then show mean/std per setup.
cancer = 'KIRC'
cgen_best = ['099', 'GAT']
cmon_best = ['099', 'GAT']
snf_best = ['099', 'GAT']

# NOTE(review): absolute, machine-specific path — this cell will not run elsewhere as-is.
base = f"C:/Users/colombelli/Desktop/TCC/experiments_extra_40/{cancer}" 
best_cfgs = perform_stats_test(cancer, snf_best, cgen_best, cmon_best, base)
show_metrics_mean_and_std(best_cfgs)

Saving results to  KIRC_kw.txt
Algorithm:  MLP


Unnamed: 0,Accuracy,AUC-ROC,AUC-PR
mean,0.51,0.74,0.54
std,0.06,0.05,0.08


Algorithm:  CGEN-Best


Unnamed: 0,Accuracy,AUC-ROC,AUC-PR
mean,0.47,0.69,0.45
std,0.07,0.04,0.06


Algorithm:  CMON-Best


Unnamed: 0,Accuracy,AUC-ROC,AUC-PR
mean,0.51,0.73,0.52
std,0.08,0.06,0.08


Algorithm:  SNF-Best


Unnamed: 0,Accuracy,AUC-ROC,AUC-PR
mean,0.48,0.71,0.49
std,0.07,0.05,0.07


In [32]:
# COAD cohort: pick one [threshold code, GNN] pair per network strategy, run the
# statistical comparison together with the MLP rows, then show mean/std per setup.
cancer = 'COAD'
cgen_best = ['099', 'GAT']
cmon_best = ['099', 'GAT']
snf_best = ['099', 'GCN']

# NOTE(review): absolute, machine-specific path — this cell will not run elsewhere as-is.
base = f"C:/Users/colombelli/Desktop/TCC/experiments_extra_40/{cancer}" 
best_cfgs = perform_stats_test(cancer, snf_best, cgen_best, cmon_best, base)
show_metrics_mean_and_std(best_cfgs)

Saving results to  COAD_kw.txt
Algorithm:  MLP


Unnamed: 0,Accuracy,AUC-ROC,AUC-PR
mean,0.41,0.66,0.39
std,0.09,0.06,0.08


Algorithm:  CGEN-Best


Unnamed: 0,Accuracy,AUC-ROC,AUC-PR
mean,0.41,0.65,0.39
std,0.08,0.06,0.07


Algorithm:  CMON-Best


Unnamed: 0,Accuracy,AUC-ROC,AUC-PR
mean,0.41,0.65,0.38
std,0.09,0.07,0.08


Algorithm:  SNF-Best


Unnamed: 0,Accuracy,AUC-ROC,AUC-PR
mean,0.37,0.63,0.36
std,0.1,0.07,0.07


In [34]:
# LUAD cohort: pick one [threshold code, GNN] pair per network strategy, run the
# statistical comparison together with the MLP rows, then show mean/std per setup.
cancer = 'LUAD'
cgen_best = ['099', 'GAT']
cmon_best = ['099', 'GAT']
snf_best = ['099', 'GAT']

# NOTE(review): absolute, machine-specific path — this cell will not run elsewhere as-is.
base = f"C:/Users/colombelli/Desktop/TCC/experiments_extra_40/{cancer}" 
best_cfgs = perform_stats_test(cancer, snf_best, cgen_best, cmon_best, base)
show_metrics_mean_and_std(best_cfgs)

Saving results to  LUAD_kw.txt
Algorithm:  MLP


Unnamed: 0,Accuracy,AUC-ROC,AUC-PR
mean,0.45,0.71,0.46
std,0.06,0.04,0.07


Algorithm:  CGEN-Best


Unnamed: 0,Accuracy,AUC-ROC,AUC-PR
mean,0.46,0.7,0.43
std,0.07,0.04,0.07


Algorithm:  CMON-Best


Unnamed: 0,Accuracy,AUC-ROC,AUC-PR
mean,0.43,0.7,0.42
std,0.07,0.04,0.06


Algorithm:  SNF-Best


Unnamed: 0,Accuracy,AUC-ROC,AUC-PR
mean,0.45,0.7,0.44
std,0.07,0.05,0.07


In [33]:
# Experiments root with a trailing slash: plot_results appends "plots/" directly to it.
# NOTE(review): absolute, machine-specific path — this cell will not run elsewhere as-is.
base = "C:/Users/colombelli/Desktop/TCC/experiments_extra_40/"
plot_results(base)

COAD
KIRC
LUAD


<Figure size 432x288 with 0 Axes>

In [34]:
# Relies on `base` still holding the experiments root (with trailing slash) from an earlier cell.
plot_results_lines(base)

COAD
KIRC
LUAD


<Figure size 432x288 with 0 Axes>

In [48]:
# Load the per-threshold mean metrics of GAT and GCN for a single cohort.
# Note that `base` is rebound here to the cohort folder (no trailing slash).
cancer = "COAD"
base = f"C:/Users/colombelli/Desktop/TCC/experiments_extra_40/{cancer}" 
results=load_means_accross_thresholds(cancer, base) 

In [49]:
# Show, for every strategy, the GAT and GCN mean tables transposed
# (metrics as rows, one column per threshold).
# Side effect: switches the session-wide float display format to four decimals.
pd.options.display.float_format = "{:,.4f}".format
for strategy, df in results.items():
    print("\n\n", strategy, "\n")
    
    print("GAT")
    gat = df.loc[df['Algorithm'] == "GAT"]
    display(gat.T)
    
    print("\nGCN")
    gcn = df.loc[df['Algorithm'] == "GCN"]
    display(gcn.T)



 snf 

GAT


Unnamed: 0,0,1,2,3,4,5,6,7,8
index,0,1,2,3,4,5,6,7,8
Threshold,0.01,0.05,0.1,0.25,0.5,0.75,0.9,0.95,0.99
Loss,0.9009,11.2272,1.0149,0.8650,1.0999,2.0159,8.5848,4.2092,3.6328
Accuracy,0.2262,0.2910,0.2600,0.2428,0.2724,0.3028,0.2503,0.3234,0.3690
AUC-ROC,0.4811,0.5244,0.5102,0.5133,0.5195,0.5733,0.5068,0.5868,0.6314
AUC-PR,0.2503,0.2799,0.2673,0.2725,0.2819,0.3071,0.2603,0.3242,0.3562
Precision,0.0221,0.2910,0.0000,0.0226,0.0459,0.3122,0.2621,0.3322,0.3723
Recall,0.0221,0.2910,0.0000,0.0159,0.0269,0.1800,0.2379,0.3131,0.3545
Algorithm,GAT,GAT,GAT,GAT,GAT,GAT,GAT,GAT,GAT



GCN


Unnamed: 0,9,10,11,12,13,14,15,16,17
index,0,1,2,3,4,5,6,7,8
Threshold,0.01,0.05,0.1,0.25,0.5,0.75,0.9,0.95,0.99
Loss,1.0577,0.7661,1.3561,1.0124,1.0057,1.3811,0.7974,0.9001,2.8563
Accuracy,0.1745,0.2462,0.2931,0.2503,0.2545,0.2462,0.2428,0.3379,0.3676
AUC-ROC,0.4439,0.4990,0.5575,0.5325,0.5542,0.5257,0.5426,0.6141,0.6333
AUC-PR,0.2284,0.2542,0.2911,0.2816,0.2807,0.2694,0.2846,0.3303,0.3648
Precision,0.0000,0.0000,0.1567,0.0000,0.0000,0.0697,0.0100,0.3410,0.3758
Recall,0.0000,0.0000,0.0103,0.0000,0.0000,0.0048,0.0007,0.0897,0.3421
Algorithm,GCN,GCN,GCN,GCN,GCN,GCN,GCN,GCN,GCN




 correlation 

GAT


Unnamed: 0,0,1,2,3,4,5,6,7,8
index,0,1,2,3,4,5,6,7,8
Threshold,0.01,0.05,0.1,0.25,0.5,0.75,0.9,0.95,0.99
Loss,11.7870,11.7475,10.6421,1.2816,6.5698,2.0469,5.0311,2.2876,2.7058
Accuracy,0.2497,0.2469,0.2414,0.2586,0.2545,0.3200,0.2828,0.3110,0.4090
AUC-ROC,0.4998,0.4979,0.4921,0.5434,0.5042,0.6018,0.5574,0.5799,0.6529
AUC-PR,0.2584,0.2577,0.2555,0.2775,0.2659,0.3245,0.3016,0.3141,0.3919
Precision,0.2497,0.2469,0.2414,0.1610,0.2576,0.3214,0.2836,0.3128,0.4178
Recall,0.2497,0.2469,0.2414,0.0234,0.2497,0.2186,0.2490,0.2510,0.3772
Algorithm,GAT,GAT,GAT,GAT,GAT,GAT,GAT,GAT,GAT



GCN


Unnamed: 0,9,10,11,12,13,14,15,16,17
index,0,1,2,3,4,5,6,7,8
Threshold,0.01,0.05,0.1,0.25,0.5,0.75,0.9,0.95,0.99
Loss,0.7620,1.0213,1.3646,0.7567,1.0126,1.3643,1.0604,2.9406,3.4559
Accuracy,0.2379,0.2241,0.2793,0.2428,0.2483,0.2724,0.3379,0.3262,0.3676
AUC-ROC,0.5152,0.5071,0.5508,0.5607,0.5579,0.5851,0.6386,0.5963,0.6168
AUC-PR,0.2732,0.2537,0.2871,0.2812,0.2820,0.2951,0.3494,0.3297,0.3558
Precision,0.0000,0.0000,0.0865,0.0000,0.0000,0.2276,0.3360,0.3328,0.3630
Recall,0.0000,0.0000,0.0076,0.0000,0.0000,0.0345,0.0986,0.2731,0.3221
Algorithm,GCN,GCN,GCN,GCN,GCN,GCN,GCN,GCN,GCN




 correlation_multi_omics 

GAT


Unnamed: 0,0,1,2,3,4,5,6,7,8
index,0,1,2,3,4,5,6,7,8
Threshold,0.01,0.05,0.1,0.25,0.5,0.75,0.9,0.95,0.99
Loss,1.0168,3.2550,1.0117,11.7971,2.7947,11.5728,2.8092,1.5895,2.6431
Accuracy,0.2393,0.2572,0.2386,0.2524,0.2634,0.2821,0.2793,0.2724,0.4076
AUC-ROC,0.4975,0.5080,0.5455,0.5014,0.5215,0.5197,0.5532,0.5296,0.6476
AUC-PR,0.2572,0.2709,0.2680,0.2614,0.2795,0.2720,0.2953,0.2741,0.3811
Precision,0.0000,0.2297,0.0000,0.2524,0.1656,0.2821,0.2855,0.2496,0.4111
Recall,0.0000,0.2248,0.0000,0.2524,0.1503,0.2821,0.2152,0.1462,0.3745
Algorithm,GAT,GAT,GAT,GAT,GAT,GAT,GAT,GAT,GAT



GCN


Unnamed: 0,9,10,11,12,13,14,15,16,17
index,0,1,2,3,4,5,6,7,8
Threshold,0.01,0.05,0.1,0.25,0.5,0.75,0.9,0.95,0.99
Loss,0.7582,0.7554,0.7555,0.7599,1.3346,1.0360,1.5544,2.1376,6.6231
Accuracy,0.2352,0.2510,0.2421,0.2545,0.2393,0.2462,0.3269,0.3048,0.3979
AUC-ROC,0.5455,0.5644,0.5604,0.5720,0.5649,0.5568,0.5883,0.5561,0.6270
AUC-PR,0.2708,0.2827,0.2745,0.2910,0.2780,0.2768,0.3182,0.2976,0.3616
Precision,0.0000,0.0000,0.0000,0.0000,0.0000,0.0317,0.2849,0.2904,0.4000
Recall,0.0000,0.0000,0.0000,0.0000,0.0000,0.0028,0.1152,0.1752,0.3917
Algorithm,GCN,GCN,GCN,GCN,GCN,GCN,GCN,GCN,GCN


In [24]:
# Column-wise means of the MLP result table for one cohort, for comparison with the GNN tables.
cancer = "COAD"
base = f"C:/Users/colombelli/Desktop/TCC/experiments_extra_40/{cancer}" 
mlp = pd.read_csv(base+"/mlp/mlp_results.csv")
mlp.mean()

loss        2.49
acc         0.41
auc_roc     0.66
auc_pr      0.39
precision   0.42
recall      0.38
dtype: float64

In [36]:
# Collect the mean AUC-PR values of each "<strategy>-<algorithm>" pair, one value per threshold row.
# NOTE(review): this reads whatever `results` currently holds in the kernel; the execution
# counts in this notebook are out of order, so confirm which cohort it was loaded for.
table_dict = {}
for s in ['correlation', 'correlation_multi_omics', 'snf']:
    for alg in ["GAT", "GCN"]:
        
        table_dict[s+"-"+alg] = results[s].loc[results[s]['Algorithm'] == alg]["AUC-PR"].values

In [37]:
# One row per "<strategy>-<algorithm>" key; columns are positional (0..n-1), following the row order of `results`.
pd.DataFrame.from_dict(table_dict, orient='index')

Unnamed: 0,0,1,2,3,4,5,6,7,8
correlation-GAT,0.27,0.3,0.29,0.34,0.33,0.31,0.29,0.34,0.43
correlation-GCN,0.29,0.31,0.31,0.36,0.32,0.31,0.33,0.34,0.39
correlation_multi_omics-GAT,0.3,0.28,0.27,0.3,0.29,0.31,0.31,0.27,0.42
correlation_multi_omics-GCN,0.32,0.35,0.33,0.32,0.32,0.29,0.31,0.31,0.39
snf-GAT,0.3,0.28,0.29,0.3,0.29,0.29,0.29,0.31,0.44
snf-GCN,0.27,0.27,0.29,0.28,0.29,0.29,0.31,0.33,0.4
