In [1]:
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display, HTML

In [28]:
def load_results_complete(cancer, base):
    """Load the per-run result tables for every network strategy and threshold.

    The MLP results are read once from ``{base}/mlp/mlp_results.csv`` and
    prepended to the GAT and GCN results found under
    ``{base}/{strategy}/{percentile}/``.

    Parameters
    ----------
    cancer : str
        Cohort name. Not used by this function; kept so existing callers
        keep working.
    base : str
        Directory that holds the ``mlp`` folder and one folder per strategy.

    Returns
    -------
    dict
        ``results[strategy][percentile]`` is a DataFrame with the MLP, GAT and
        GCN rows stacked in that order (original row indices are kept), an
        ``Algorithm`` label column, and metric columns renamed to their
        display names.
    """
    display_names = {
        "loss": "Loss",
        "acc": "Accuracy",
        "auc_roc": "AUC-ROC",
        "auc_pr": "AUC-PR",
        "precision": "Precision",
        "recall": "Recall"
    }

    def _labelled(csv_path, label):
        # Read one results file and tag every row with the algorithm name.
        frame = pd.read_csv(csv_path)
        frame['Algorithm'] = label
        return frame

    # The MLP table does not live under a strategy/threshold folder, so it is
    # read a single time and reused in every combined table.
    mlp_runs = _labelled(base+"/mlp/mlp_results.csv", "MLP")

    results = {}
    for strategy in ("snf", "correlation", "correlation_multi_omics"):
        by_threshold = results.setdefault(strategy, {})
        for percentile in ("001", "005", "01", "025", "05", "075", "09", "095", "099"):
            folder = f"{base}/{strategy}/{percentile}/"
            stacked = pd.concat([
                mlp_runs,
                _labelled(folder+"gat_results.csv", "GAT"),
                _labelled(folder+"gcn_results.csv", "GCN"),
            ])
            by_threshold[percentile] = stacked.rename(columns=display_names)

    return results


def load_means_accross_thresholds(cancer, base):
    """Build, per network strategy, a table of mean GAT/GCN metrics per threshold.

    For every strategy and percentile folder the GAT and GCN result files are
    read and averaged column-wise. The percentile folder name is turned into a
    threshold label by inserting a dot after its first character
    (``"005"`` becomes ``"0.05"``).

    Parameters
    ----------
    cancer : str
        Cohort name. Not used by this function; kept so existing callers
        keep working.
    base : str
        Directory that holds one folder per strategy.

    Returns
    -------
    dict
        ``results[strategy]`` is a DataFrame with the GAT rows followed by the
        GCN rows. Columns are ``index`` (row position inside the per-algorithm
        table), ``Threshold`` (string label), the renamed metric columns and
        ``Algorithm``.
    """
    display_names = {
        "loss": "Loss",
        "acc": "Accuracy",
        "auc_roc": "AUC-ROC",
        "auc_pr": "AUC-PR",
        "precision": "Precision",
        "recall": "Recall"
    }

    def _mean_table(strategy, csv_name, label):
        # One column of metric means per threshold, then flipped so that each
        # threshold becomes a row.
        means = pd.DataFrame()
        for percentile in ("001", "005", "01", "025", "05", "075", "09", "095", "099"):
            runs = pd.read_csv(f"{base}/{strategy}/{percentile}/"+csv_name)
            means[percentile[0]+"."+percentile[1:]] = runs.mean()
        per_threshold = means.T
        per_threshold["Algorithm"] = label
        return per_threshold.reset_index().rename(columns={'index':'Threshold'})

    results = {}
    for strategy in ("snf", "correlation", "correlation_multi_omics"):
        both = pd.concat([
            _mean_table(strategy, "gat_results.csv", "GAT"),
            _mean_table(strategy, "gcn_results.csv", "GCN"),
        ])
        # reset_index() without drop keeps the per-algorithm row position as
        # an "index" column, exactly as callers currently receive it.
        results[strategy] = both.reset_index().rename(columns=display_names)

    return results



# Short display names for each network-construction strategy. The keys are the
# strategy folder names used when loading results; the values are used in plot
# titles and in the names of the saved PDF files.
strategy_translation = {
    "snf": "SNF",   # presumably Similarity Network Fusion -- TODO confirm
    "correlation": "CGEN",   # Correlation Gene Expression Network
    "correlation_multi_omics": "CMON"   # Correlation Multi-Omics Network
}
def plot_results(base):
    """Save one violin+box plot per cancer, network, threshold and metric.

    For each cohort (COAD, KIRC, LUAD) the per-run results are loaded with
    ``load_results_complete`` from ``{base}plots/../{cancer}``. Every metric is
    drawn as a violin plot with an overlaid box plot, grouped by algorithm,
    with a dashed horizontal line at each algorithm's median. Figures are
    written to
    ``{base}plots/{cancer}/{metric}/{cancer}_{network}_{threshold}.pdf``;
    the output directory is created when it does not exist yet.

    Parameters
    ----------
    base : str
        Root directory of the experiments. Sub-paths are appended by plain
        string concatenation, so it must end with a path separator.
    """
    base = base+"plots/"
    metrics = ["AUC-ROC", "AUC-PR", "Precision", "Recall", "Accuracy"]
    cancers = ["COAD", "KIRC", "LUAD"]
    # colors[i] is the colour of the median line drawn for algorithms[i].
    algorithms = ["MLP", "GAT", "GCN"]
    colors = ["#41AFD3", "#A2E454", "#E0823C"]

    for cancer in cancers:
        print(cancer)
        results_complete = load_results_complete(cancer, base+f"../{cancer}")

        for strategy, results in results_complete.items():
            network = strategy_translation[strategy]
            for threshold, df in results.items():
                # Folder name "005" -> label "0.05".
                threshold_str = threshold[0]+"."+threshold[1:]
                title = f"{cancer}: {network} - {threshold_str}"

                for metric in metrics:
                    # Re-seed both RNGs before every figure (kept from the
                    # original; presumably for reproducible rendering).
                    random.seed(42)
                    np.random.seed(42)

                    # savefig does not create missing folders, so make sure
                    # the per-metric output directory exists first.
                    out_dir = f"{base}{cancer}/{metric}/"
                    os.makedirs(out_dir, exist_ok=True)
                    file_name = f"{out_dir}{cancer}_{network}_{threshold_str}.pdf"

                    ax = sns.violinplot(data=df, x='Algorithm', y=metric, palette='turbo',
                                        inner=None, linewidth=0, saturation=0.4, scale='count')
                    ax.set(ylim=(0, 1))
                    sns.boxplot(x='Algorithm', y=metric, data=df, palette='turbo', width=0.3,
                                boxprops={'zorder': 2}, ax=ax).set(title=title)

                    for algorithm, color in zip(algorithms, colors):
                        median = df.loc[df['Algorithm'] == algorithm, metric].median()
                        plt.axhline(y=median, color=color, linestyle="--")

                    plt.savefig(file_name)
                    # Clear the shared figure so the next plot starts empty.
                    plt.clf()


def plot_results_lines(base):
    """Save one line plot per cancer, network and metric across thresholds.

    For each cohort (COAD, KIRC, LUAD) the mean metrics are loaded with
    ``load_means_accross_thresholds`` from ``{base}plots/../{cancer}``. Each
    metric is drawn against the threshold label with one line per algorithm
    (GAT and GCN). Figures are written to
    ``{base}plots/{cancer}/{metric}/{cancer}_{network}_growth.pdf``; the
    output directory is created when it does not exist yet.

    Parameters
    ----------
    base : str
        Root directory of the experiments. Sub-paths are appended by plain
        string concatenation, so it must end with a path separator.
    """
    base = base+"plots/"
    metrics = ["AUC-ROC", "AUC-PR", "Precision", "Recall", "Accuracy"]
    cancers = ["COAD", "KIRC", "LUAD"]
    for cancer in cancers:
        print(cancer)
        results_complete = load_means_accross_thresholds(cancer, base+f"../{cancer}")

        for strategy, results in results_complete.items():
            network = strategy_translation[strategy]
            title = f"{cancer}: {network}"

            for metric in metrics:
                # savefig does not create missing folders, so make sure the
                # per-metric output directory exists first.
                out_dir = f"{base}{cancer}/{metric}/"
                os.makedirs(out_dir, exist_ok=True)
                file_name = f"{out_dir}{cancer}_{network}_growth.pdf"

                sns.lineplot(x="Threshold", y=metric, hue="Algorithm", data=results,
                             palette=["#A2E454", "#E0823C"], marker="o").set(title=title)
                plt.savefig(file_name)
                # Clear the shared figure so the next plot starts empty.
                plt.clf()

In [29]:
import numpy as np
import pandas as pd
from scipy import stats
import scikit_posthocs as sp
import itertools
from contextlib import redirect_stdout


def stats_test(df, p_target, out_file_name):
    """Run Kruskal-Wallis (plus post-hoc t-tests) per metric and write a report.

    The rows of ``df`` are split into one group per distinct value of the
    ``Algorithm`` column. For every other column a Kruskal-Wallis H-test is
    run across all groups. When the null hypothesis is rejected, pairwise
    t-tests with Bonferroni correction are run and the pairs whose adjusted
    p-value is below ``p_target`` are listed.

    The groups used to be selected with ``df['Algorithm'] == i`` for
    ``i in 1..3``. That matches nothing when the labels are strings
    ('MLP', 'SNF-Best', ...), so every group was empty and the test statistic
    was NaN. Groups are now derived from the labels actually present, which
    also covers any number of groups.

    Parameters
    ----------
    df : pandas.DataFrame
        One ``Algorithm`` label column plus one numeric column per metric.
    p_target : float
        Significance level.
    out_file_name : str
        Path prefix of the report; ``'_kw.txt'`` is appended.

    Raises
    ------
    ValueError
        If fewer than two distinct ``Algorithm`` labels are present.
    """
    labels = df['Algorithm'].dropna().unique()
    if len(labels) < 2:
        raise ValueError(
            "stats_test needs at least two distinct 'Algorithm' groups, "
            f"got {len(labels)}"
        )
    # One sub-frame per label, in order of first appearance.
    groups = [df[df['Algorithm'] == label] for label in labels]

    file = out_file_name + '_kw.txt'
    print("Saving results to ", file)
    # Everything printed inside this block goes to the report file.
    with open(file, 'w') as f, redirect_stdout(f):
        for column in df.columns:
            if column == 'Algorithm':
                continue
            print(f'Kruskal Wallis test for {column}')
            # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.kruskal.html
            w, p = stats.kruskal(*(group[column] for group in groups))

            # Written as "not p > p_target" so a NaN p-value still takes the
            # same branch it took before.
            rejected = not p > p_target
            if rejected:
                print('It rejects the Null Hypothesis that data belongs the same distribution.')
            else:
                print('It does not rejects the Null Hypothesis that data belongs the same distribution.')
            print(f'The Kruskal-Wallis H statistic: {w}')
            print(f'p-value: {p:.4f}')

            if rejected:
                phdt = sp.posthoc_ttest(a=df, val_col=column, group_col='Algorithm', p_adjust='bonferroni')
                # The label names the test that is actually run (it used to
                # say "Wilcoxon signed-rank").
                print('Post-Hoc pairwise t-test with Bonferroni correction (p-values)')
                print(phdt)

                print(f'Pairs with p < {p_target}:')
                for c1, c2 in itertools.combinations(phdt.columns, 2):
                    pair_p = phdt.loc[c2, c1]
                    if pair_p < p_target:
                        print(c1, ',', c2, ':', pair_p)

            print('\n')

In [30]:
def get_algorithm_df(all_data, network, threshold, algorithm, setup):
    """Extract one algorithm's runs from a results table and relabel them.

    Parameters
    ----------
    all_data : dict
        Nested mapping ``all_data[network][threshold] -> DataFrame``.
    network : str
        First-level key of ``all_data``.
    threshold : str
        Second-level key of ``all_data``.
    algorithm : str
        Value of the ``Algorithm`` column to keep.
    setup : str
        Replacement for ``algorithm`` in the returned frame.

    Returns
    -------
    pandas.DataFrame
        The matching rows, restricted to the ``Algorithm``, ``Accuracy``,
        ``AUC-ROC`` and ``AUC-PR`` columns, with ``algorithm`` replaced by
        ``setup``. The source table is not modified.
    """
    kept_columns = ['Algorithm', 'Accuracy', 'AUC-ROC', 'AUC-PR']
    runs = all_data[network][threshold]
    matching = runs.loc[runs['Algorithm'] == algorithm, kept_columns]
    return matching.replace(algorithm, setup)


# *_best: tuple ('threshold', 'GNN')
def perform_stats_test(cancer_cohort, snf_best, cgen_best, cmon_best, base=None):
    """Compare the MLP against the best configuration of each network strategy.

    Loads every result table of the cohort, picks the MLP runs and the runs of
    the chosen (threshold, GNN) pair for SNF, CGEN and CMON, and passes the
    stacked table to ``stats_test`` with a 0.05 significance level. The report
    is written to ``{cancer_cohort}_kw.txt``.

    Parameters
    ----------
    cancer_cohort : str
        Cohort name; also used as the report file prefix.
    snf_best, cgen_best, cmon_best : sequence
        ``(threshold, GNN)`` pairs, e.g. ``('099', 'GCN')``.
    base : str
        Directory holding the cohort's results. It is required even though it
        has a ``None`` default (kept for signature compatibility).

    Raises
    ------
    TypeError
        If ``base`` is not given. The loader builds paths by string
        concatenation, so ``None`` used to fail there with an opaque message.
    """
    if base is None:
        raise TypeError(
            "perform_stats_test() requires `base`: the directory that holds "
            f"the results of cohort {cancer_cohort!r}"
        )

    cancer = load_results_complete(cancer_cohort, base)

    # MLP rows are identical in every table, so any network/threshold works.
    mlp = get_algorithm_df(cancer, 'snf', '001', 'MLP', 'MLP')
    snf = get_algorithm_df(cancer, 'snf', snf_best[0], snf_best[1], 'SNF-Best')
    cgen = get_algorithm_df(cancer, 'correlation', cgen_best[0], cgen_best[1], 'CGEN-Best')
    cmon = get_algorithm_df(cancer, 'correlation_multi_omics', cmon_best[0], cmon_best[1], 'CMON-Best')

    best_cfgs = pd.concat([mlp, snf, cgen, cmon]).reset_index(drop=True)

    stats_test(best_cfgs, 0.05, cancer_cohort)

In [5]:
# Best (threshold, GNN) pair per network strategy for the KIRC cohort.
cancer = 'KIRC'
snf_best = ['095', 'GCN']
cgen_best = ['099', 'GCN']
cmon_best = ['099', 'GAT']

# Results directory of this cohort.
base = f"C:/Users/colombelli/Desktop/TCC/experiments_extra_40/{cancer}"
perform_stats_test(
    cancer,
    snf_best=snf_best,
    cgen_best=cgen_best,
    cmon_best=cmon_best,
    base=base,
)

Saving results to  KIRC_kw.txt


In [125]:
# Best (threshold, GNN) pair per network strategy for the COAD cohort.
cancer = 'COAD'
cgen_best = ['09', 'GCN']
cmon_best = ['099', 'GAT']
snf_best = ['099', 'GCN']

# perform_stats_test defaults `base` to None, which cannot be turned into a
# results path, so the cohort directory is built and passed explicitly.
base = f"C:/Users/colombelli/Desktop/TCC/experiments_extra_40/{cancer}"
perform_stats_test(cancer, snf_best, cgen_best, cmon_best, base)

Saving results to  COAD_kw.txt


In [126]:
# Best (threshold, GNN) pair per network strategy for the LUAD cohort.
cancer = 'LUAD'
cgen_best = ['099', 'GAT']
cmon_best = ['099', 'GAT']
snf_best = ['099', 'GAT']

# perform_stats_test defaults `base` to None, which cannot be turned into a
# results path, so the cohort directory is built and passed explicitly.
base = f"C:/Users/colombelli/Desktop/TCC/experiments_extra_40/{cancer}"
perform_stats_test(cancer, snf_best, cgen_best, cmon_best, base)

Saving results to  LUAD_kw.txt


In [33]:
# Experiments root (note the trailing slash: plot_results appends "plots/" to
# it by string concatenation). Writes the violin/box PDFs for every cohort.
base = "C:/Users/colombelli/Desktop/TCC/experiments_extra_40/"
plot_results(base)

COAD
KIRC
LUAD


<Figure size 432x288 with 0 Axes>

In [34]:
# Writes the metric-vs-threshold line plots; `base` must still be the
# experiments root (with trailing slash), not a per-cohort directory.
plot_results_lines(base)

COAD
KIRC
LUAD


<Figure size 432x288 with 0 Axes>

In [25]:
# Mean GAT/GCN metrics per threshold for the COAD cohort, keyed by network
# strategy. Note that this rebinds `base` to the per-cohort directory.
cancer = "COAD"
base = f"C:/Users/colombelli/Desktop/TCC/experiments_extra_40/{cancer}" 
results=load_means_accross_thresholds(cancer, base) 

In [26]:
# Render every float below with two decimals.
pd.set_option("display.float_format", "{:,.2f}".format)

for strategy, df in results.items():
    print("\n\n", strategy, "\n")

    # One transposed table per GNN: metrics as rows, one column per threshold row.
    print("GAT")
    gat = df[df['Algorithm'].eq("GAT")]
    display(gat.T)

    print("\nGCN")
    gcn = df[df['Algorithm'].eq("GCN")]
    display(gcn.T)



 snf 

GAT


Unnamed: 0,0,1,2,3,4,5,6,7,8
index,0,1,2,3,4,5,6,7,8
Threshold,0.01,0.05,0.1,0.25,0.5,0.75,0.9,0.95,0.99
Loss,0.90,11.23,1.01,0.87,1.10,2.02,8.58,4.21,3.63
Accuracy,0.23,0.29,0.26,0.24,0.27,0.30,0.25,0.32,0.37
AUC-ROC,0.48,0.52,0.51,0.51,0.52,0.57,0.51,0.59,0.63
AUC-PR,0.25,0.28,0.27,0.27,0.28,0.31,0.26,0.32,0.36
Precision,0.02,0.29,0.00,0.02,0.05,0.31,0.26,0.33,0.37
Recall,0.02,0.29,0.00,0.02,0.03,0.18,0.24,0.31,0.35
Algorithm,GAT,GAT,GAT,GAT,GAT,GAT,GAT,GAT,GAT



GCN


Unnamed: 0,9,10,11,12,13,14,15,16,17
index,0,1,2,3,4,5,6,7,8
Threshold,0.01,0.05,0.1,0.25,0.5,0.75,0.9,0.95,0.99
Loss,1.06,0.77,1.36,1.01,1.01,1.38,0.80,0.90,2.86
Accuracy,0.17,0.25,0.29,0.25,0.25,0.25,0.24,0.34,0.37
AUC-ROC,0.44,0.50,0.56,0.53,0.55,0.53,0.54,0.61,0.63
AUC-PR,0.23,0.25,0.29,0.28,0.28,0.27,0.28,0.33,0.36
Precision,0.00,0.00,0.16,0.00,0.00,0.07,0.01,0.34,0.38
Recall,0.00,0.00,0.01,0.00,0.00,0.00,0.00,0.09,0.34
Algorithm,GCN,GCN,GCN,GCN,GCN,GCN,GCN,GCN,GCN




 correlation 

GAT


Unnamed: 0,0,1,2,3,4,5,6,7,8
index,0,1,2,3,4,5,6,7,8
Threshold,0.01,0.05,0.1,0.25,0.5,0.75,0.9,0.95,0.99
Loss,11.79,11.75,10.64,1.28,6.57,2.05,5.03,2.29,2.71
Accuracy,0.25,0.25,0.24,0.26,0.25,0.32,0.28,0.31,0.41
AUC-ROC,0.50,0.50,0.49,0.54,0.50,0.60,0.56,0.58,0.65
AUC-PR,0.26,0.26,0.26,0.28,0.27,0.32,0.30,0.31,0.39
Precision,0.25,0.25,0.24,0.16,0.26,0.32,0.28,0.31,0.42
Recall,0.25,0.25,0.24,0.02,0.25,0.22,0.25,0.25,0.38
Algorithm,GAT,GAT,GAT,GAT,GAT,GAT,GAT,GAT,GAT



GCN


Unnamed: 0,9,10,11,12,13,14,15,16,17
index,0,1,2,3,4,5,6,7,8
Threshold,0.01,0.05,0.1,0.25,0.5,0.75,0.9,0.95,0.99
Loss,0.76,1.02,1.36,0.76,1.01,1.36,1.06,2.94,3.46
Accuracy,0.24,0.22,0.28,0.24,0.25,0.27,0.34,0.33,0.37
AUC-ROC,0.52,0.51,0.55,0.56,0.56,0.59,0.64,0.60,0.62
AUC-PR,0.27,0.25,0.29,0.28,0.28,0.30,0.35,0.33,0.36
Precision,0.00,0.00,0.09,0.00,0.00,0.23,0.34,0.33,0.36
Recall,0.00,0.00,0.01,0.00,0.00,0.03,0.10,0.27,0.32
Algorithm,GCN,GCN,GCN,GCN,GCN,GCN,GCN,GCN,GCN




 correlation_multi_omics 

GAT


Unnamed: 0,0,1,2,3,4,5,6,7,8
index,0,1,2,3,4,5,6,7,8
Threshold,0.01,0.05,0.1,0.25,0.5,0.75,0.9,0.95,0.99
Loss,1.02,3.25,1.01,11.80,2.79,11.57,2.81,1.59,2.64
Accuracy,0.24,0.26,0.24,0.25,0.26,0.28,0.28,0.27,0.41
AUC-ROC,0.50,0.51,0.55,0.50,0.52,0.52,0.55,0.53,0.65
AUC-PR,0.26,0.27,0.27,0.26,0.28,0.27,0.30,0.27,0.38
Precision,0.00,0.23,0.00,0.25,0.17,0.28,0.29,0.25,0.41
Recall,0.00,0.22,0.00,0.25,0.15,0.28,0.22,0.15,0.37
Algorithm,GAT,GAT,GAT,GAT,GAT,GAT,GAT,GAT,GAT



GCN


Unnamed: 0,9,10,11,12,13,14,15,16,17
index,0,1,2,3,4,5,6,7,8
Threshold,0.01,0.05,0.1,0.25,0.5,0.75,0.9,0.95,0.99
Loss,0.76,0.76,0.76,0.76,1.33,1.04,1.55,2.14,6.62
Accuracy,0.24,0.25,0.24,0.25,0.24,0.25,0.33,0.30,0.40
AUC-ROC,0.55,0.56,0.56,0.57,0.56,0.56,0.59,0.56,0.63
AUC-PR,0.27,0.28,0.27,0.29,0.28,0.28,0.32,0.30,0.36
Precision,0.00,0.00,0.00,0.00,0.00,0.03,0.28,0.29,0.40
Recall,0.00,0.00,0.00,0.00,0.00,0.00,0.12,0.18,0.39
Algorithm,GCN,GCN,GCN,GCN,GCN,GCN,GCN,GCN,GCN


In [24]:
# Column-wise mean of the COAD MLP results, shown as the cell output for
# comparison with the GAT/GCN means above.
cancer = "COAD"
base = f"C:/Users/colombelli/Desktop/TCC/experiments_extra_40/{cancer}" 
mlp = pd.read_csv(base+"/mlp/mlp_results.csv")
mlp.mean()

loss        2.49
acc         0.41
auc_roc     0.66
auc_pr      0.39
precision   0.42
recall      0.38
dtype: float64