In [2]:
from torch_geometric.data import HeteroData, DataLoader
import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv, to_hetero , SAGEConv
from torch_geometric.utils import negative_sampling
from torch_geometric.loader import LinkNeighborLoader

import torch
from torch import nn 
import torch.nn.functional as F
import torch.optim as optim

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder , label_binarize , OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

import os 
import pandas as pd
import numpy as np
from tqdm import tqdm
from itertools import product
import random
from collections import Counter
import warnings
warnings.filterwarnings("ignore") 


In [4]:
# *****************************************************************************
# Load the Dataframes :
path_work = "/media/concha-eloko/Linux/PPT_clean"
# Alternative location of the same files:
#path_work = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023"

# Read the tab-separated table and discard every row that belongs to the
# "anubis_return" dataset in a single chained expression.
DF_info = (
    pd.read_csv(f"{path_work}/TropiGATv2.final_df.tsv", sep="\t", header=0)
    .loc[lambda table: table["Dataset"] != "anubis_return"]
)
DF_info

Unnamed: 0,Phage,Protein_name,KL_type_LCA,Infected_ancestor,index,Dataset,seq,domain_seq,1,2,...,1272,1273,1274,1275,1276,1277,1278,1279,1280,prophage_id
0,GCF_016651625.1__phage29,GCF_016651625.1__phage29__142,KL43,GCF_016651625.1,ppt__1,ppt,MSVPNQTPYNIYTANGLTTVFTYEFYIISASDLRVSINGDVVTSGY...,KDFVNINDYWFPTDGDDFYPALNKALSVSPHVLIPPGKHYLKSTVS...,-0.018416,0.022387,...,0.004437,0.087907,0.015800,0.025778,0.065790,0.034045,-0.070899,0.016068,0.065339,prophage_12186
1,GCF_016651625.1__phage29,GCF_016651625.1__phage29__150,KL43,GCF_016651625.1,anubis__0,anubis,MRANLIKTNFTAGEISPRLMGRVDIARYANGAKIIENAVCVVQGGV...,QAASPGAWTREDTVWTEEFGYPGAVTLYQQRLVLAGSPQYPQTIWW...,0.036016,0.005938,...,-0.037612,0.008772,0.010556,-0.049738,-0.012549,0.092624,-0.136602,-0.191378,0.135658,prophage_12186
2,GCF_016651625.1__phage12,GCF_016651625.1__phage12__59,KL43,GCF_016651625.1,ppt__4,ppt,MSISKRNFLKAVSCAYFFYSFKALTKVNQPIEDYISTKDKNTWPSK...,NTWPSKVHRVEEFYTSTDRDYSDAILRGINYCSLNNCVLFFSDKYK...,0.026004,0.024372,...,-0.026018,0.018206,0.036751,-0.032549,0.064112,0.061520,-0.024423,-0.027998,0.028089,prophage_924
3,GCF_019928025.1__phage0,GCF_019928025.1__phage0__10,KL43,n1471,ppt__4,ppt,MSISKRNFLKAVSCAYFFYSFKALTKVNQPIEDYISTKDKNTWPSK...,NTWPSKVHRVEEFYTSTDRDYSDAILRGINYCSLNNCVLFFSDKYK...,0.026004,0.024372,...,-0.026018,0.018206,0.036751,-0.032549,0.064112,0.061520,-0.024423,-0.027998,0.028089,prophage_2929
4,GCF_004313505.1__phage4,GCF_004313505.1__phage4__113,KL14,GCF_004313505.1,anubis__5,anubis,MSEYDTGNPVPSASMPDAWDNMQSIDKFVNSSDETITTRTGQQLDT...,KAIFDAWLDFGIDWNGNESISLQLQTAVNYVSKLPYGGEIVLRPGV...,-0.023648,0.052674,...,-0.025991,0.068538,-0.051192,0.026481,0.069100,0.017813,-0.103797,0.018961,0.117058,prophage_11091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17712,GCF_015209405.1__phage17,GCF_015209405.1__phage17__0,KL2,81.9/1004099,anubis__1643,anubis,MASIKELPRWEDEVYQIARGDKVEGGVGGIANMQAKTLAERTRYLK...,GERNKPRDRVLYREFSRIGKNGALTERIVKDIPDGTIGYAAIIQKE...,-0.014377,0.119705,...,-0.001167,0.108840,0.008335,0.026747,0.046616,0.033648,-0.092224,0.048720,0.034757,prophage_12
17713,GCF_004311345.1__phage11,GCF_004311345.1__phage11__99,KL34,GCF_004311345.1,anubis__1644,anubis,MTANYPASILPPNATAVERAIDRASAAALERLPVYLIRWVKDPDSC...,LMAIRPGAFDDLPNVNNCKNIFTNCSSLTGIPASLFSRMKIEDFSD...,-0.001691,-0.067717,...,0.073706,0.057788,0.004341,0.007389,-0.081588,0.100995,-0.045545,0.021685,0.013167,prophage_11944
17714,GCF_900172635.1__phage3,GCF_900172635.1__phage3__1811,KL124,GCF_900172635.1,anubis__1647,anubis,MTANYPASILPPNATAVERAIDRASAAALERLPVYLIRWVKDPDSC...,LMAIRPGAFDDLPNVNNCKNIFTNCSSLAGIPASLFSRMKIEDFSD...,-0.002816,-0.055336,...,0.056314,0.059700,0.004641,0.016676,-0.071711,0.095491,-0.055784,0.015615,0.000201,prophage_11522
17715,GCF_013604525.1__phage18,GCF_013604525.1__phage18__52,KL30,n49324932,anubis__1649,anubis,MANIEKLGSSSPEVLLKNATNLDKLVNGRESESLPDRFGVLRKTWH...,LRYCTPEMMEAAGDGVADDSLAFQAALDEAASRTIMVNGSYAPQVV...,-0.000031,0.046085,...,-0.006530,0.077754,-0.024675,0.026408,0.016113,0.043904,-0.091418,0.078778,0.105842,prophage_2576


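> Ambiguity levels: level 0 keeps only rows with a single KL type; levels 1 and 2 also accept rows whose `KL_type_LCA` lists up to 2 or 3 KL types, respectively.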

In [6]:
# No ambiguity lvl_0: keep only rows whose KL_type_LCA holds a single KL type
# (no "|" separator), then drop repeated (ancestor, depolymerase, prophage) rows.
DF_info_lvl_0 = DF_info[~DF_info["KL_type_LCA"].str.contains("\\|")]
DF_info_lvl_0 = DF_info_lvl_0.drop_duplicates(subset = ["Infected_ancestor","index","prophage_id"] , keep = "first").reset_index(drop=True)


def write_ambiguity_table(df_info, out_path, max_kltypes):
    """Write ``df_info`` to ``out_path``, expanding ambiguous KL types.

    Rows whose ``KL_type_LCA`` holds a single KL type are written unchanged.
    Rows listing several "|"-separated KL types are written once per KL type,
    provided they list at most ``max_kltypes`` of them; each copy gets that
    single KL type and an ``Infected_ancestor`` suffixed with ``_<position>``
    so that every copy refers to a distinct ancestor. Rows listing more than
    ``max_kltypes`` KL types are skipped.

    Parameters
    ----------
    df_info : pandas.DataFrame
        Table with at least the ``KL_type_LCA`` and ``Infected_ancestor`` columns.
    out_path : str
        Destination file. Fields are joined with "," and are NOT quoted, so a
        value that itself contains a comma would corrupt the row.
    max_kltypes : int
        Largest number of KL types tolerated in one ``KL_type_LCA`` value.
    """
    with open(out_path, "w") as outfile:
        outfile.write(f"{','.join(map(str, df_info.columns))}\n")
        for row in tqdm(df_info.to_dict(orient='index').values()):
            kltypes = row["KL_type_LCA"].split("|")
            if len(kltypes) > max_kltypes:
                continue
            if len(kltypes) == 1:
                records = [row]
            else:
                # Always suffix the ORIGINAL ancestor name. Mutating the row in
                # place made the suffixes pile up ("X_0", "X_0_1", "X_0_1_2").
                ancestor = row["Infected_ancestor"]
                records = [
                    {**row, "KL_type_LCA": kltype, "Infected_ancestor": f"{ancestor}_{index_kl}"}
                    for index_kl, kltype in enumerate(kltypes)
                ]
            for record in records:
                outfile.write(f"{','.join(str(value) for value in record.values())}\n")


# NOTE: the files keep their ".tsv" name although they are comma-separated,
# because they are read back from these exact paths with sep=",".
# ambiguity lvl_1 : tolerate 2 different KLtypes :
write_ambiguity_table(DF_info, f"{path_work}/TropiGATv2.ambiguity.lvl_1.tsv", max_kltypes=2)

# ambiguity lvl_2 : tolerate 3 different KLtypes :
write_ambiguity_table(DF_info, f"{path_work}/TropiGATv2.ambiguity.lvl_2.tsv", max_kltypes=3)


100%|███████████████████████████████████| 17717/17717 [00:13<00:00, 1326.22it/s]
100%|███████████████████████████████████| 17717/17717 [00:12<00:00, 1411.47it/s]


In [7]:
# Rows sharing these three columns are treated as duplicates; only the first is kept.
dedup_keys = ["Infected_ancestor", "index", "prophage_id"]

# level 0 : rows whose KL_type_LCA contains no "|" separator
has_single_kltype = ~DF_info["KL_type_LCA"].str.contains("\\|")
DF_info_lvl_0 = (
    DF_info[has_single_kltype]
    .drop_duplicates(subset=dedup_keys, keep="first")
    .reset_index(drop=True)
)

# level 1 : table written for ambiguity level 1 (comma-separated file)
DF_info_lvl_1 = (
    pd.read_csv(f"{path_work}/TropiGATv2.ambiguity.lvl_1.tsv", sep=",", header=0)
    .drop_duplicates(subset=dedup_keys, keep="first")
    .reset_index(drop=True)
)

# level 2 : table written for ambiguity level 2 (comma-separated file)
DF_info_lvl_2 = (
    pd.read_csv(f"{path_work}/TropiGATv2.ambiguity.lvl_2.tsv", sep=",", header=0)
    .drop_duplicates(subset=dedup_keys, keep="first")
    .reset_index(drop=True)
)


In [8]:
# Display the level-0 table (single KL type per row, duplicates removed).
DF_info_lvl_0

Unnamed: 0,Phage,Protein_name,KL_type_LCA,Infected_ancestor,index,Dataset,seq,domain_seq,1,2,...,1272,1273,1274,1275,1276,1277,1278,1279,1280,prophage_id
0,GCF_016651625.1__phage29,GCF_016651625.1__phage29__142,KL43,GCF_016651625.1,ppt__1,ppt,MSVPNQTPYNIYTANGLTTVFTYEFYIISASDLRVSINGDVVTSGY...,KDFVNINDYWFPTDGDDFYPALNKALSVSPHVLIPPGKHYLKSTVS...,-0.018416,0.022387,...,0.004437,0.087907,0.015800,0.025778,0.065790,0.034045,-0.070899,0.016068,0.065339,prophage_12186
1,GCF_016651625.1__phage29,GCF_016651625.1__phage29__150,KL43,GCF_016651625.1,anubis__0,anubis,MRANLIKTNFTAGEISPRLMGRVDIARYANGAKIIENAVCVVQGGV...,QAASPGAWTREDTVWTEEFGYPGAVTLYQQRLVLAGSPQYPQTIWW...,0.036016,0.005938,...,-0.037612,0.008772,0.010556,-0.049738,-0.012549,0.092624,-0.136602,-0.191378,0.135658,prophage_12186
2,GCF_016651625.1__phage12,GCF_016651625.1__phage12__59,KL43,GCF_016651625.1,ppt__4,ppt,MSISKRNFLKAVSCAYFFYSFKALTKVNQPIEDYISTKDKNTWPSK...,NTWPSKVHRVEEFYTSTDRDYSDAILRGINYCSLNNCVLFFSDKYK...,0.026004,0.024372,...,-0.026018,0.018206,0.036751,-0.032549,0.064112,0.061520,-0.024423,-0.027998,0.028089,prophage_924
3,GCF_019928025.1__phage0,GCF_019928025.1__phage0__10,KL43,n1471,ppt__4,ppt,MSISKRNFLKAVSCAYFFYSFKALTKVNQPIEDYISTKDKNTWPSK...,NTWPSKVHRVEEFYTSTDRDYSDAILRGINYCSLNNCVLFFSDKYK...,0.026004,0.024372,...,-0.026018,0.018206,0.036751,-0.032549,0.064112,0.061520,-0.024423,-0.027998,0.028089,prophage_2929
4,GCF_004313505.1__phage4,GCF_004313505.1__phage4__113,KL14,GCF_004313505.1,anubis__5,anubis,MSEYDTGNPVPSASMPDAWDNMQSIDKFVNSSDETITTRTGQQLDT...,KAIFDAWLDFGIDWNGNESISLQLQTAVNYVSKLPYGGEIVLRPGV...,-0.023648,0.052674,...,-0.025991,0.068538,-0.051192,0.026481,0.069100,0.017813,-0.103797,0.018961,0.117058,prophage_11091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9471,GCF_015209405.1__phage17,GCF_015209405.1__phage17__0,KL2,81.9/1004099,anubis__1643,anubis,MASIKELPRWEDEVYQIARGDKVEGGVGGIANMQAKTLAERTRYLK...,GERNKPRDRVLYREFSRIGKNGALTERIVKDIPDGTIGYAAIIQKE...,-0.014377,0.119705,...,-0.001167,0.108840,0.008335,0.026747,0.046616,0.033648,-0.092224,0.048720,0.034757,prophage_12
9472,GCF_004311345.1__phage11,GCF_004311345.1__phage11__99,KL34,GCF_004311345.1,anubis__1644,anubis,MTANYPASILPPNATAVERAIDRASAAALERLPVYLIRWVKDPDSC...,LMAIRPGAFDDLPNVNNCKNIFTNCSSLTGIPASLFSRMKIEDFSD...,-0.001691,-0.067717,...,0.073706,0.057788,0.004341,0.007389,-0.081588,0.100995,-0.045545,0.021685,0.013167,prophage_11944
9473,GCF_900172635.1__phage3,GCF_900172635.1__phage3__1811,KL124,GCF_900172635.1,anubis__1647,anubis,MTANYPASILPPNATAVERAIDRASAAALERLPVYLIRWVKDPDSC...,LMAIRPGAFDDLPNVNNCKNIFTNCSSLAGIPASLFSRMKIEDFSD...,-0.002816,-0.055336,...,0.056314,0.059700,0.004641,0.016676,-0.071711,0.095491,-0.055784,0.015615,0.000201,prophage_11522
9474,GCF_013604525.1__phage18,GCF_013604525.1__phage18__52,KL30,n49324932,anubis__1649,anubis,MANIEKLGSSSPEVLLKNATNLDKLVNGRESESLPDRFGVLRKTWH...,LRYCTPEMMEAAGDGVADDSLAFQAALDEAASRTIMVNGSYAPQVV...,-0.000031,0.046085,...,-0.006530,0.077754,-0.024675,0.026408,0.016113,0.043904,-0.091418,0.078778,0.105842,prophage_2576


***
# Build the graph :

In [9]:
def _encode_nodes(values, column_name, sort=False):
    """Return ``(codes, labels)`` where ``labels[codes[i]] == values[i]``.

    Labels are numbered in order of first appearance (or sorted when
    ``sort=True``). Raises ``ValueError`` if ``values`` contains missing entries,
    since those cannot be assigned a node index.
    """
    codes, labels = pd.factorize(values, sort=sort)
    if (codes < 0).any():
        raise ValueError(f"Column {column_name!r} contains missing values; cannot index nodes.")
    return codes, labels


def make_graph(df_info, embedding_dim=1280) : 
    """Build the heterogeneous graph described by ``df_info``.

    Node types
    ----------
    A  : one node per unique ``Infected_ancestor``; feature = one-hot vector of
         the ``KL_type_LCA`` found on the first row of that ancestor.
    B1 : one node per unique ``Phage``; zero-width feature matrix.
    B2 : one node per unique ``index``; feature = the embedding columns
         ``"1" .. str(embedding_dim)`` of the first row carrying that ``index``.

    Edge types (one edge per row of ``df_info``, so repeated pairs yield
    repeated edges; every edge gets a label ``y = 1``)
    ----------
    (B1, infects, A), (B2, expressed, B1) and the reverse (A, harbors, B1).

    Parameters
    ----------
    df_info : pandas.DataFrame
        Must contain ``Infected_ancestor``, ``Phage``, ``index``, ``KL_type_LCA``
        and the embedding columns.
    embedding_dim : int, default 1280
        Number of embedding columns, named ``"1"`` to ``str(embedding_dim)``.

    Returns
    -------
    HeteroData
        The populated graph.

    Raises
    ------
    ValueError
        If one of the node or label columns contains missing values.
    """
    # Work on a positional index so row positions and labels coincide.
    df_info = df_info.reset_index(drop=True)
    n_rows = len(df_info)
    # **************************************************************
    # initialize the graph
    graph_data = HeteroData()
    # Indexation process: node ids follow the order of first appearance.
    codes_A, labels_A = _encode_nodes(df_info["Infected_ancestor"], "Infected_ancestor")
    codes_B1, labels_B1 = _encode_nodes(df_info["Phage"], "Phage")
    codes_B2, labels_B2 = _encode_nodes(df_info["index"], "index")
    # Position of the first row of every node (codes are 0..n-1, so the result
    # is aligned with the node ids).
    first_rows_A = np.unique(codes_A, return_index=True)[1]
    first_rows_B2 = np.unique(codes_B2, return_index=True)[1]
    # **************************************************************
    # Make the node feature file : 
    # One-hot column k stands for the k-th KL type in sorted order. The previous
    # mapping paired category k with the encoding of data ROW k, so unrelated
    # KL types shared the same vector.
    codes_KL, labels_KL = _encode_nodes(df_info["KL_type_LCA"], "KL_type_LCA", sort=True)
    kl_of_A = torch.as_tensor(codes_KL[first_rows_A], dtype=torch.long)
    node_feature_A = torch.zeros((len(labels_A), len(labels_KL)), dtype=torch.float)
    node_feature_A[torch.arange(len(labels_A)), kl_of_A] = 1.0
    node_feature_B1 = torch.zeros((len(labels_B1), 0), dtype=torch.float)
    embeddings_columns = [str(i) for i in range(1, embedding_dim + 1)]
    node_feature_B2 = torch.tensor(
        df_info.iloc[first_rows_B2][embeddings_columns].to_numpy(dtype=float),
        dtype=torch.float,
    )
    # feed the graph
    graph_data["A"].x = node_feature_A
    graph_data["B1"].x = node_feature_B1
    graph_data["B2"].x = node_feature_B2
    # **************************************************************
    # Make edge file (each tensor already has shape [2, n_edges])
    # Node B1 (prophage) - Node A (bacteria) : one edge per row, in row order
    edge_index_B1_A = torch.tensor(np.stack([codes_B1, codes_A]), dtype=torch.long)
    # Node A (bacteria) - Node B1 (prophage) : same edges, reversed
    edge_index_A_B1 = torch.tensor(np.stack([codes_A, codes_B1]), dtype=torch.long)
    # Node B2 (depolymerase) - Node B1 (prophage) : rows grouped by phage, in
    # order of first phage appearance (stable sort keeps the row order inside a phage)
    by_phage = np.argsort(codes_B1, kind="stable")
    edge_index_B2_B1 = torch.tensor(np.stack([codes_B2[by_phage], codes_B1[by_phage]]), dtype=torch.long)
    # feed the graph
    graph_data['B1', 'infects', 'A'].edge_index = edge_index_B1_A.contiguous()
    graph_data['B2', 'expressed', 'B1'].edge_index = edge_index_B2_B1.contiguous()
    # That one is optional  
    graph_data['A', 'harbors', 'B1'].edge_index = edge_index_A_B1.contiguous()
    # **************************************************************
    # Make the Y file : every listed edge is a positive example
    graph_data['B1', 'infects', 'A'].y = torch.ones(n_rows)
    graph_data['B2', 'expressed', 'B1'].y = torch.ones(n_rows)
    # That one is optional  
    graph_data['A', 'harbors', 'B1'].y = torch.ones(n_rows)
    
    return graph_data
    

In [10]:
# Build the graph from the level-0 table and serialise it next to the input data.
graph_lvl_0 = make_graph(df_info=DF_info_lvl_0)
torch.save(obj=graph_lvl_0, f=f'{path_work}/Tropi_graph.lvl_0.woAR.1309.pt')

In [11]:
# Build the graph from the level-1 table and serialise it next to the input data.
graph_lvl_1 = make_graph(df_info=DF_info_lvl_1)
torch.save(obj=graph_lvl_1, f=f'{path_work}/Tropi_graph.lvl_1.woAR.1309.pt')

In [12]:
# Build the graph from the level-2 table and serialise it next to the input data.
graph_lvl_2 = make_graph(df_info=DF_info_lvl_2)
torch.save(obj=graph_lvl_2, f=f'{path_work}/Tropi_graph.lvl_2.woAR.1309.pt')

In [13]:
# Display the summary (node feature shapes, edge counts) of the level-0 graph.
graph_lvl_0

HeteroData(
  [1mA[0m={ x=[4499, 127] },
  [1mB1[0m={ x=[7573, 0] },
  [1mB2[0m={ x=[3426, 1280] },
  [1m(B1, infects, A)[0m={
    edge_index=[2, 9476],
    y=[9476]
  },
  [1m(B2, expressed, B1)[0m={
    edge_index=[2, 9476],
    y=[9476]
  },
  [1m(A, harbors, B1)[0m={
    edge_index=[2, 9476],
    y=[9476]
  }
)

In [14]:
# Display the summary (node feature shapes, edge counts) of the level-1 graph.
graph_lvl_1

HeteroData(
  [1mA[0m={ x=[4585, 127] },
  [1mB1[0m={ x=[7640, 0] },
  [1mB2[0m={ x=[3449, 1280] },
  [1m(B1, infects, A)[0m={
    edge_index=[2, 9626],
    y=[9626]
  },
  [1m(B2, expressed, B1)[0m={
    edge_index=[2, 9626],
    y=[9626]
  },
  [1m(A, harbors, B1)[0m={
    edge_index=[2, 9626],
    y=[9626]
  }
)

In [15]:
# Display the summary (node feature shapes, edge counts) of the level-2 graph.
graph_lvl_2

HeteroData(
  [1mA[0m={ x=[4687, 127] },
  [1mB1[0m={ x=[7688, 0] },
  [1mB2[0m={ x=[3470, 1280] },
  [1m(B1, infects, A)[0m={
    edge_index=[2, 9803],
    y=[9803]
  },
  [1m(B2, expressed, B1)[0m={
    edge_index=[2, 9803],
    y=[9803]
  },
  [1m(A, harbors, B1)[0m={
    edge_index=[2, 9803],
    y=[9803]
  }
)

In [None]:
# Shell commands (not Python): copy three saved graph files to the remote train_nn directory over ssh.
# NOTE(review): these file names have no ".woAR" infix, unlike the files written by torch.save
# in this notebook — confirm they exist before running.
rsync -avzhe ssh \
/media/concha-eloko/Linux/PPT_clean/Tropi_graph.lvl_0.1309.pt \
conchae@garnatxa.srv.cpd:/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/train_nn

rsync -avzhe ssh \
/media/concha-eloko/Linux/PPT_clean/Tropi_graph.lvl_1.1309.pt \
conchae@garnatxa.srv.cpd:/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/train_nn

rsync -avzhe ssh \
/media/concha-eloko/Linux/PPT_clean/Tropi_graph.lvl_2.1309.pt \
conchae@garnatxa.srv.cpd:/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/train_nn


In [None]:
# Shell commands (not Python): copy the three ".woAR" graph files saved by this notebook
# to the remote train_nn directory over ssh.
rsync -avzhe ssh \
/media/concha-eloko/Linux/PPT_clean/Tropi_graph.lvl_0.woAR.1309.pt \
conchae@garnatxa.srv.cpd:/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/train_nn

rsync -avzhe ssh \
/media/concha-eloko/Linux/PPT_clean/Tropi_graph.lvl_1.woAR.1309.pt \
conchae@garnatxa.srv.cpd:/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/train_nn

rsync -avzhe ssh \
/media/concha-eloko/Linux/PPT_clean/Tropi_graph.lvl_2.woAR.1309.pt \
conchae@garnatxa.srv.cpd:/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/train_nn
