# Compute the metrics for DAG based models: 
### A. TropiGAT
### B. TropiSAGE
***

> Importations :
    

In [None]:
rsync -avzhe ssh \
conchae@garnatxa.srv.cpd:/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/train_nn/TropiGAT_ensemble_27_11_2024_log \
/media/concha-eloko/Linux/PPT_clean/reviewed_models

rsync -avzhe ssh \
conchae@garnatxa.srv.cpd:/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/train_nn/TropiGAT_ensemble_ultraF_27_11_2024_log \
/media/concha-eloko/Linux/PPT_clean/reviewed_models

rsync -avzhe ssh \
conchae@garnatxa.srv.cpd:/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/train_nn/TropiSAGE_ensemble_27_11_2024_log \
/media/concha-eloko/Linux/PPT_clean/reviewed_models

rsync -avzhe ssh \
conchae@garnatxa.srv.cpd:/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/train_nn/TropiSAGE_ensemble_ultraF_27_11_2024_log \
/media/concha-eloko/Linux/PPT_clean/reviewed_models

> Relevant functions: 

In [3]:
def make_table(log_file) :
    import pandas as pd
    lines_data = [line for line in open(log_file).read().split("\n") if line[0:6] == "Epoch:" if len(line.split("\t")) == 6] # if len(line.split("\t")) == 6*
    lines_split = [line.split("\t") for line in lines_data]
    df_raw = pd.DataFrame(lines_split , columns = ["Epoch","Train_loss","Test_loss","MCC","AUC","Acc"])
    df = df_raw.applymap(lambda x: float(x.split(":")[1]))
    df.set_index("Epoch", inplace = True)
    return df
    
def plot_loss(df) : 
    import matplotlib.pyplot as plt
    import seaborn as sns
    sns.set(style="whitegrid")
    plt.figure(figsize=(10, 6))
    plt.plot(df.index, df['Train_loss'], label='train loss', marker='o', linestyle='-', color ="red")
    plt.plot(df.index, df['Test_loss'], label='test loss', marker='s', linestyle='--', color = "blue")
    plt.title('Loss over epochs')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)   
    plt.show()  
    

### A. TropiGAT

In [1]:
import os 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

path_metrics = "/media/concha-eloko/Linux/PPT_clean/reviewed_models"

path_tropigat_uf = f"{path_metrics}/TropiGAT_ensemble_ultraF_27_11_2024_log"
path_tropigat = f"{path_metrics}/TropiGAT_ensemble_27_11_2024_log"

> Regular filtration: 

In [2]:
# Open the data frame
metrics_names = ["KL_type" , "n_prophages", "F1_score", "precision" , "recall", "MCC", "Accuracy", "AUC"]
df_metrics = pd.read_csv(f"{path_tropigat}/Metric_Report.27_11_2024.tsv", sep = "\t", names = metrics_names)
df_metrics = df_metrics.drop_duplicates(subset = ["KL_type"], keep = "last")
df_metrics_sorted = df_metrics.sort_values(by='KL_type', key=lambda x: x.str.split("KL").str[1].str.split("__").str[0].astype(int))
df_metrics_sorted.reset_index(drop=True, inplace=True)

# Add the KL_clean column : 
df_metrics_sorted["KL_clean"] = df_metrics_sorted["KL_type"].apply(lambda x : x.split("__")[0])
df_metrics_sorted['F1_score'] = df_metrics_sorted['F1_score'].astype(float)

average_metrics = []

for kltype in df_metrics_sorted["KL_clean"].unique().tolist() :
    df_kl = df_metrics_sorted[df_metrics_sorted["KL_clean"] == kltype]
    # get the optimum version :  
    max_mcc_index = df_kl['MCC'].idxmax()
    v_model = df_kl.loc[max_mcc_index, 'KL_type']
    # get the average metrics :
    av_f1 = np.round(np.mean(df_kl["F1_score"].values), 4)
    av_preci = np.round(np.mean(df_kl["precision"].values), 4)
    av_recall = np.round(np.mean(df_kl["recall"].values), 4)
    av_mcc = np.round(np.mean(df_kl["MCC"].values), 4)
    av_acc = np.round(np.mean(df_kl["Accuracy"].values), 4)
    av_auc = np.round(np.mean(df_kl["AUC"].values), 4)
    # make the line :
    line = [kltype, v_model ,df_kl["n_prophages"].values[0], av_preci, av_recall, av_acc, av_f1, av_auc, av_mcc]
    average_metrics.append(line)

names_col = ["KL_type","model_version","n_prophages","precision","recall","Accuracy","F1_score","AUC","MCC"]
average_metric_df = pd.DataFrame(average_metrics, columns = names_col)
average_metric_df.to_csv(f"/media/concha-eloko/Linux/PPT_clean/ficheros_28032023/Metric_Report.review.GAT.F.tsv", sep = "\t", index = False, header = True)

average_metric_df

Unnamed: 0,KL_type,model_version,n_prophages,precision,recall,Accuracy,F1_score,AUC,MCC
0,KL1,KL1__4,190,0.8537,0.5473,0.9105,0.6428,0.8888,0.6295
1,KL2,KL2__3,351,0.9384,0.7486,0.9495,0.8323,0.9697,0.8104
2,KL3,KL3__5,145,0.8362,0.9857,0.9651,0.9034,0.9857,0.8879
3,KL4,KL4__5,13,0.2000,0.4000,0.8000,0.2667,0.8000,0.1915
4,KL5,KL5__4,28,0.7800,0.7000,0.9000,0.6476,0.8072,0.6529
...,...,...,...,...,...,...,...,...,...
87,KL153,KL153__4,18,0.3000,0.4000,0.8400,0.3333,0.5556,0.2556
88,KL155,KL155__2,13,0.4000,0.4000,0.8857,0.4000,0.9667,0.3667
89,KL157,KL157__4,13,0.5000,0.6000,0.8286,0.5333,0.7667,0.4441
90,KL166,KL166__3,10,0.2000,0.2000,0.7333,0.2000,0.7200,0.0735


> Ultrafiltration:

In [3]:
# Open the data frame
metrics_names = ["KL_type" , "n_prophages", "F1_score", "precision" , "recall", "MCC", "Accuracy", "AUC"]
df_metrics = pd.read_csv(f"{path_tropigat_uf}/Metric_Report.27_11_2024.tsv", sep = "\t", names = metrics_names)
df_metrics = df_metrics.drop_duplicates(subset = ["KL_type"], keep = "last")
df_metrics_sorted = df_metrics.sort_values(by='KL_type', key=lambda x: x.str.split("KL").str[1].str.split("__").str[0].astype(int))
df_metrics_sorted.reset_index(drop=True, inplace=True)

# Add the KL_clean column : 
df_metrics_sorted["KL_clean"] = df_metrics_sorted["KL_type"].apply(lambda x : x.split("__")[0])
df_metrics_sorted['F1_score'] = df_metrics_sorted['F1_score'].astype(float)

average_metrics = []

for kltype in df_metrics_sorted["KL_clean"].unique().tolist() :
    df_kl = df_metrics_sorted[df_metrics_sorted["KL_clean"] == kltype]
    # get the optimum version :  
    max_mcc_index = df_kl['MCC'].idxmax()
    v_model = df_kl.loc[max_mcc_index, 'KL_type']
    # get the average metrics :
    av_f1 = np.round(np.mean(df_kl["F1_score"].values), 4)
    av_preci = np.round(np.mean(df_kl["precision"].values), 4)
    av_recall = np.round(np.mean(df_kl["recall"].values), 4)
    av_mcc = np.round(np.mean(df_kl["MCC"].values), 4)
    av_acc = np.round(np.mean(df_kl["Accuracy"].values), 4)
    av_auc = np.round(np.mean(df_kl["AUC"].values), 4)
    # make the line :
    line = [kltype, v_model ,df_kl["n_prophages"].values[0], av_preci, av_recall, av_acc, av_f1, av_auc, av_mcc]
    average_metrics.append(line)

names_col = ["KL_type","model_version","n_prophages","precision","recall","Accuracy","F1_score","AUC","MCC"]
average_metric_df = pd.DataFrame(average_metrics, columns = names_col)
average_metric_df.to_csv(f"/media/concha-eloko/Linux/PPT_clean/ficheros_28032023/Metric_Report.review.GAT.UF.tsv", sep = "\t", index = False, header = True)

UF_tropigat = average_metric_df

In [4]:
# UF_tropigat.sort_values(by='MCC', ascending=False).to_csv(f"/media/concha-eloko/Linux/PPT_clean/ficheros_28032023/Metric_Report.review.GAT.UF.ordered.tsv", sep = "\t", index = False, header = True)
UF_tropigat.sort_values(by='MCC', ascending=True)

Unnamed: 0,KL_type,model_version,n_prophages,precision,recall,Accuracy,F1_score,AUC,MCC
29,KL31,KL31__4,11,0.0000,0.0000,0.6667,0.0000,0.4800,-0.1832
5,KL6,KL6__1,11,0.0000,0.0000,0.7333,0.0000,0.6000,-0.1032
8,KL9,KL9__3,18,0.0000,0.0000,0.8400,0.0000,0.6444,-0.0436
56,KL103,KL103__3,11,0.0000,0.0000,0.8000,0.0000,0.7600,-0.0400
43,KL55,KL55__4,17,0.0667,0.2000,0.7778,0.1000,0.7000,0.0122
...,...,...,...,...,...,...,...,...,...
44,KL56,KL56__2,13,0.7000,1.0000,0.9143,0.8000,0.9333,0.7873
2,KL3,KL3__5,90,0.8292,0.8445,0.9444,0.8299,0.9531,0.8019
55,KL102,KL102__5,111,0.8570,0.8182,0.9454,0.8323,0.9504,0.8034
20,KL22,KL22__4,43,0.8643,0.8500,0.9440,0.8288,0.9238,0.8156


In [8]:
n_uf = sum(UF_tropigat["n_prophages"])
weighted_mcc_tropigat = [] 
for _, row in UF_tropigat.iterrows() : 
    w_mcc = row["MCC"] * row["n_prophages"] / n_uf
    weighted_mcc_tropigat.append(w_mcc)

MCC_weigth_tropigat = sum(weighted_mcc_tropigat)
MCC_weigth_tropigat


0.5468401081081082

***
### B. TropiSAGE

In [6]:
import os 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

path_metrics = "/media/concha-eloko/Linux/PPT_clean/reviewed_models"

path_tropisage_uf = f"{path_metrics}/TropiSAGE_ensemble_ultraF_27_11_2024_log"
path_tropisage = f"{path_metrics}/TropiSAGE_ensemble_27_11_2024_log"

> Regular filtration: 

In [15]:
# Open the data frame
metrics_names = ["KL_type" , "n_prophages", "F1_score", "precision" , "recall", "MCC", "Accuracy", "AUC"]
df_metrics = pd.read_csv(f"{path_tropisage}/Metric_Report.27_11_2024.tsv", sep = "\t", names = metrics_names)
df_metrics = df_metrics.drop_duplicates(subset = ["KL_type"], keep = "last")
df_metrics_sorted = df_metrics.sort_values(by='KL_type', key=lambda x: x.str.split("KL").str[1].str.split("__").str[0].astype(int))
df_metrics_sorted.reset_index(drop=True, inplace=True)

# Add the KL_clean column : 
df_metrics_sorted["KL_clean"] = df_metrics_sorted["KL_type"].apply(lambda x : x.split("__")[0])
df_metrics_sorted['F1_score'] = df_metrics_sorted['F1_score'].astype(float)

average_metrics = []

for kltype in df_metrics_sorted["KL_clean"].unique().tolist() :
    df_kl = df_metrics_sorted[df_metrics_sorted["KL_clean"] == kltype]
    # get the optimum version :  
    max_mcc_index = df_kl['MCC'].idxmax()
    v_model = df_kl.loc[max_mcc_index, 'KL_type']
    # get the average metrics :
    av_f1 = np.round(np.mean(df_kl["F1_score"].values), 4)
    av_preci = np.round(np.mean(df_kl["precision"].values), 4)
    av_recall = np.round(np.mean(df_kl["recall"].values), 4)
    av_mcc = np.round(np.mean(df_kl["MCC"].values), 4)
    av_acc = np.round(np.mean(df_kl["Accuracy"].values), 4)
    av_auc = np.round(np.mean(df_kl["AUC"].values), 4)
    # make the line :
    line = [kltype, v_model ,df_kl["n_prophages"].values[0], av_preci, av_recall, av_acc, av_f1, av_auc, av_mcc]
    average_metrics.append(line)

names_col = ["KL_type","model_version","n_prophages","precision","recall","Accuracy","F1_score","AUC","MCC"]
average_metric_df = pd.DataFrame(average_metrics, columns = names_col)
average_metric_df.to_csv(f"/media/concha-eloko/Linux/PPT_clean/ficheros_28032023/Metric_Report.review.SAGE.F.tsv", sep = "\t", index = False, header = True)

average_metric_df

Unnamed: 0,KL_type,model_version,n_prophages,precision,recall,Accuracy,F1_score,AUC,MCC
0,KL1,KL1__4,190,0.8853,0.7895,0.9474,0.8338,0.8871,0.8051
1,KL2,KL2__4,351,0.9162,0.8743,0.9657,0.8946,0.9616,0.8746
2,KL3,KL3__5,145,0.9349,0.9572,0.9814,0.9448,0.9974,0.9345
3,KL4,KL4__5,13,0.2000,0.4000,0.8571,0.2667,0.8666,0.2582
4,KL5,KL5__4,28,0.6000,0.8000,0.8875,0.6334,0.8572,0.6152
...,...,...,...,...,...,...,...,...,...
87,KL153,KL153__4,18,0.1167,0.4000,0.7200,0.1800,0.4444,0.0843
88,KL155,KL155__2,13,0.7000,0.8000,0.9143,0.7333,0.9000,0.6958
89,KL157,KL157__4,13,0.5000,0.8000,0.8571,0.6000,0.8000,0.5540
90,KL166,KL166__3,10,0.2333,0.6000,0.6667,0.3333,0.6400,0.2021


> Ultrafiltration:

In [7]:
# Open the data frame
metrics_names = ["KL_type" , "n_prophages", "F1_score", "precision" , "recall", "MCC", "Accuracy", "AUC"]
df_metrics = pd.read_csv(f"{path_tropisage_uf}/Metric_Report.27_11_2024.tsv", sep = "\t", names = metrics_names)
df_metrics = df_metrics.drop_duplicates(subset = ["KL_type"], keep = "last")
df_metrics_sorted = df_metrics.sort_values(by='KL_type', key=lambda x: x.str.split("KL").str[1].str.split("__").str[0].astype(int))
df_metrics_sorted.reset_index(drop=True, inplace=True)

# Add the KL_clean column : 
df_metrics_sorted["KL_clean"] = df_metrics_sorted["KL_type"].apply(lambda x : x.split("__")[0])
df_metrics_sorted['F1_score'] = df_metrics_sorted['F1_score'].astype(float)

average_metrics = []

for kltype in df_metrics_sorted["KL_clean"].unique().tolist() :
    df_kl = df_metrics_sorted[df_metrics_sorted["KL_clean"] == kltype]
    # get the optimum version :  
    max_mcc_index = df_kl['MCC'].idxmax()
    v_model = df_kl.loc[max_mcc_index, 'KL_type']
    # get the average metrics :
    av_f1 = np.round(np.mean(df_kl["F1_score"].values), 4)
    av_preci = np.round(np.mean(df_kl["precision"].values), 4)
    av_recall = np.round(np.mean(df_kl["recall"].values), 4)
    av_mcc = np.round(np.mean(df_kl["MCC"].values), 4)
    av_acc = np.round(np.mean(df_kl["Accuracy"].values), 4)
    av_auc = np.round(np.mean(df_kl["AUC"].values), 4)
    # make the line :
    line = [kltype, v_model ,df_kl["n_prophages"].values[0], av_preci, av_recall, av_acc, av_f1, av_auc, av_mcc]
    average_metrics.append(line)

names_col = ["KL_type","model_version","n_prophages","precision","recall","Accuracy","F1_score","AUC","MCC"]
average_metric_df = pd.DataFrame(average_metrics, columns = names_col)
average_metric_df.to_csv(f"/media/concha-eloko/Linux/PPT_clean/ficheros_28032023/Metric_Report.review.SAGE.UF.tsv", sep = "\t", index = False, header = True)

UF_tropisage = average_metric_df

In [13]:
n_uf = sum(UF_tropisage["n_prophages"])
weighted_mcc_tropisage = [] 
for _, row in UF_tropisage.iterrows() : 
    w_mcc = row["MCC"] * row["n_prophages"] / n_uf
    weighted_mcc_tropisage.append(w_mcc)

MCC_weigth_tropisage = sum(weighted_mcc_tropisage)
MCC_weigth_tropisage

0.5277113783783783

***
### Weighted MCC and P-value:  

In [None]:
# TropiGAT : 
n_uf = sum(UF_tropigat["n_prophages"])
weighted_mcc_tropigat = [] 
for _, row in UF_tropigat.iterrows() : 
    w_mcc = row["MCC"] * row["n_prophages"] / n_uf
    weighted_mcc_tropigat.append(w_mcc)

MCC_weigth_tropigat = sum(weighted_mcc_tropigat)
MCC_weigth_tropigat

# TropiSAGE : 
n_uf = sum(UF_tropisage["n_prophages"])
weighted_mcc_tropisage = [] 
for _, row in UF_tropisage.iterrows() : 
    w_mcc = row["MCC"] * row["n_prophages"] / n_uf
    weighted_mcc_tropisage.append(w_mcc)

MCC_weigth_tropisage = sum(weighted_mcc_tropisage)
MCC_weigth_tropisage



In [14]:
import numpy as np
import scipy.stats as stats

# Assuming you already have the following lists of weighted MCC values:
# weighted_mcc_tropigat and weighted_mcc_tropisage

# Compute the variances of the two groups
var_tropigat = np.var(weighted_mcc_tropigat, ddof=1)  # ddof=1 for sample variance
var_tropisage = np.var(weighted_mcc_tropisage, ddof=1)

# Perform the F-test
f_stat = var_tropigat / var_tropisage
df1 = len(weighted_mcc_tropigat) - 1  # Degrees of freedom for TropiGAT
df2 = len(weighted_mcc_tropisage) - 1  # Degrees of freedom for TropiSAGE

# Two-tailed p-value from the F-distribution
p_value = 2 * min(
    stats.f.cdf(f_stat, df1, df2), 
    1 - stats.f.cdf(f_stat, df1, df2)
)

# Display the results
print(f"F-statistic: {f_stat}")
print(f"Degrees of Freedom: df1 = {df1}, df2 = {df2}")
print(f"P-value: {p_value}")

# Interpretation
if p_value < 0.05:
    print("The variances are significantly different (reject H0).")
else:
    print("The variances are not significantly different (fail to reject H0).")

F-statistic: 1.0911656291380998
Degrees of Freedom: df1 = 84, df2 = 84
P-value: 0.6902155166323276
The variances are not significantly different (fail to reject H0).


In [15]:
if len(weighted_mcc_tropigat) != len(weighted_mcc_tropisage):
    raise ValueError("The two groups must have the same number of models for a paired t-test.")

# Compute the paired t-test
t_stat, p_value = stats.ttest_rel(weighted_mcc_tropigat, weighted_mcc_tropisage)

# Display the results
print(f"T-statistic: {t_stat}")
print(f"P-value: {p_value}")

# Interpretation
alpha = 0.05
if p_value < alpha:
    print("There is a significant difference between the two ensembles (reject H0).")
else:
    print("There is no significant difference between the two ensembles (fail to reject H0).")


T-statistic: 2.0090322708830795
P-value: 0.04774511095121014
There is a significant difference between the two ensembles (reject H0).


***
### Compare metrics between each: 

In [10]:
UF_tropigat = pd.read_csv(f"/media/concha-eloko/Linux/PPT_clean/ficheros_28032023/Metric_Report.review.GAT.UF.tsv", sep = "\t", header = 0)
UF_tropisage = pd.read_csv(f"/media/concha-eloko/Linux/PPT_clean/ficheros_28032023/Metric_Report.review.SAGE.UF.tsv", sep = "\t", header = 0)

In [9]:
dico_compare_dag = {}
kltypes = UF_tropigat["KL_type"].tolist()

for _, kltype in 


Unnamed: 0,KL_type,model_version,n_prophages,precision,recall,Accuracy,F1_score,AUC,MCC
0,KL1,KL1__3,58,0.2824,0.3600,0.7882,0.3102,0.8414,0.1943
1,KL2,KL2__3,98,0.5376,0.5556,0.8586,0.5369,0.7873,0.4607
2,KL3,KL3__5,90,0.8200,0.9111,0.9444,0.8508,0.9481,0.8286
3,KL4,KL4__3,12,0.2667,1.0000,0.6000,0.4200,0.9000,0.3772
4,KL5,KL5__2,20,0.5667,0.8000,0.8167,0.6433,0.9000,0.5614
...,...,...,...,...,...,...,...,...,...
80,KL151,KL151__1,21,0.5500,0.7000,0.8333,0.6067,0.7800,0.5182
81,KL153,KL153__4,16,0.2000,0.4000,0.7334,0.2667,0.7250,0.1390
82,KL155,KL155__5,12,0.2667,0.8000,0.6571,0.3933,0.8000,0.3132
83,KL157,KL157__3,13,0.5000,0.6000,0.9143,0.5333,0.6667,0.5291


In [11]:
UF_tropigat

Unnamed: 0,KL_type,model_version,n_prophages,precision,recall,Accuracy,F1_score,AUC,MCC
0,KL1,KL1__3,58,0.3667,0.2400,0.8177,0.2864,0.7628,0.1946
1,KL2,KL2__2,98,0.5973,0.5778,0.8759,0.5743,0.8000,0.5109
2,KL3,KL3__5,90,0.8292,0.8445,0.9444,0.8299,0.9531,0.8019
3,KL4,KL4__1,12,0.4833,1.0000,0.7714,0.6133,0.9667,0.5884
4,KL5,KL5__1,20,0.5333,0.5000,0.9000,0.4933,0.9000,0.4898
...,...,...,...,...,...,...,...,...,...
80,KL151,KL151__1,21,0.6000,0.6000,0.8667,0.5733,0.8400,0.5144
81,KL153,KL153__1,16,0.4000,0.4000,0.8889,0.4000,0.8250,0.3622
82,KL155,KL155__2,12,0.5000,0.6000,0.9143,0.5333,0.8333,0.5291
83,KL157,KL157__4,13,0.4667,0.6000,0.8857,0.5000,0.8333,0.4943


In [12]:
UF_tropisage

Unnamed: 0,KL_type,model_version,n_prophages,precision,recall,Accuracy,F1_score,AUC,MCC
0,KL1,KL1__3,58,0.2824,0.3600,0.7882,0.3102,0.8414,0.1943
1,KL2,KL2__3,98,0.5376,0.5556,0.8586,0.5369,0.7873,0.4607
2,KL3,KL3__5,90,0.8200,0.9111,0.9444,0.8508,0.9481,0.8286
3,KL4,KL4__3,12,0.2667,1.0000,0.6000,0.4200,0.9000,0.3772
4,KL5,KL5__2,20,0.5667,0.8000,0.8167,0.6433,0.9000,0.5614
...,...,...,...,...,...,...,...,...,...
80,KL151,KL151__1,21,0.5500,0.7000,0.8333,0.6067,0.7800,0.5182
81,KL153,KL153__4,16,0.2000,0.4000,0.7334,0.2667,0.7250,0.1390
82,KL155,KL155__5,12,0.2667,0.8000,0.6571,0.3933,0.8000,0.3132
83,KL157,KL157__3,13,0.5000,0.6000,0.9143,0.5333,0.6667,0.5291


***
### Make Figures : 