In [1]:
from torch_geometric.data import HeteroData, DataLoader
import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv, to_hetero , SAGEConv
from torch_geometric.utils import negative_sampling
from torch_geometric.loader import LinkNeighborLoader

import torch
from torch import nn 
import torch.nn.functional as F
import torch.optim as optim

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder , label_binarize , OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

import os 
import pandas as pd
import numpy as np
from tqdm import tqdm
from itertools import product
import random
from collections import Counter
import warnings
warnings.filterwarnings("ignore") 


In [6]:
# *****************************************************************************
# Load the dataframes :
path_work = "/media/concha-eloko/Linux/PPT_clean"
#path_work = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023"

# Metadata table: tab-separated, first row is the header.
DF_info = pd.read_csv(f"{path_work}/DF_Dpo.final.2705.tsv", sep="\t", header=0)

# Embedding table: comma-separated, no header row. Its first column (0) is
# renamed to 'index' so it can serve as the merge key with DF_info.
DF_embeddings = pd.read_csv(
    f"{path_work}/Dpo.2705.embeddings.ultimate.csv", sep=",", header=None
).rename(columns={0: 'index'})

# Split DF_info on whether KL_type_LCA contains a literal "|" character:
# rows with a pipe are set aside in DF_info_ToReLabel, the others are kept.
has_pipe = DF_info["KL_type_LCA"].str.contains("\\|")
DF_info_ToReLabel = DF_info[has_pipe]
DF_info_filtered = DF_info[~has_pipe]

# Attach the embeddings, then mind the over-representation of outbreaks:
# keep only the first row for each (Infected_ancestor, index, prophage_id).
all_data = (
    DF_info_filtered.merge(DF_embeddings, on="index")
    .drop_duplicates(subset=["Infected_ancestor", "index", "prophage_id"], keep="first")
    .reset_index(drop=True)
)


In [3]:
# Display the merged, de-duplicated table produced by the loading cell.
all_data

Unnamed: 0,Phage,KL_type_LCA,Infected_ancestor,Protein_name,Dataset,index,seq,prophage_id,1,2,...,1271,1272,1273,1274,1275,1276,1277,1278,1279,1280
0,GCF_016651625.1__phage29,KL43,GCF_016651625.1,GCF_016651625.1__phage29__142,ppt,ppt__1,MSVPNQTPYNIYTANGLTTVFTYEFYIISASDLRVSINGDVVTSGY...,prophage_12186,-0.018416,0.022387,...,0.030484,0.004437,0.087907,0.015800,0.025778,0.065790,0.034045,-0.070899,0.016068,0.065339
1,GCF_016651625.1__phage29,KL43,GCF_016651625.1,GCF_016651625.1__phage29__150,anubis,anubis__0,MRANLIKTNFTAGEISPRLMGRVDIARYANGAKIIENAVCVVQGGV...,prophage_12186,0.036016,0.005938,...,0.008465,-0.037612,0.008772,0.010556,-0.049738,-0.012549,0.092624,-0.136602,-0.191378,0.135658
2,GCF_016651625.1__phage12,KL43,GCF_016651625.1,GCF_016651625.1__phage12__59,ppt,ppt__4,MSISKRNFLKAVSCAYFFYSFKALTKVNQPIEDYISTKDKNTWPSK...,prophage_924,0.026004,0.024372,...,0.054964,-0.026018,0.018206,0.036751,-0.032549,0.064112,0.061520,-0.024423,-0.027998,0.028089
3,GCF_019928025.1__phage0,KL43,n1471,GCF_019928025.1__phage0__10,ppt,ppt__4,MSISKRNFLKAVSCAYFFYSFKALTKVNQPIEDYISTKDKNTWPSK...,prophage_2929,0.026004,0.024372,...,0.054964,-0.026018,0.018206,0.036751,-0.032549,0.064112,0.061520,-0.024423,-0.027998,0.028089
4,GCF_004313505.1__phage4,KL14,GCF_004313505.1,GCF_004313505.1__phage4__113,anubis,anubis__5,MSEYDTGNPVPSASMPDAWDNMQSIDKFVNSSDETITTRTGQQLDT...,prophage_11091,-0.023648,0.052674,...,-0.003019,-0.025991,0.068538,-0.051192,0.026481,0.069100,0.017813,-0.103797,0.018961,0.117058
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9672,GCF_015209405.1__phage17,KL2,81.9/1004099,GCF_015209405.1__phage17__0,anubis,anubis__1643,MASIKELPRWEDEVYQIARGDKVEGGVGGIANMQAKTLAERTRYLK...,prophage_12,-0.014377,0.119705,...,0.040801,-0.001167,0.108840,0.008335,0.026747,0.046616,0.033648,-0.092224,0.048720,0.034757
9673,GCF_004311345.1__phage11,KL34,GCF_004311345.1,GCF_004311345.1__phage11__99,anubis,anubis__1644,MTANYPASILPPNATAVERAIDRASAAALERLPVYLIRWVKDPDSC...,prophage_11944,-0.001691,-0.067717,...,0.119783,0.073706,0.057788,0.004341,0.007389,-0.081588,0.100995,-0.045545,0.021685,0.013167
9674,GCF_900172635.1__phage3,KL124,GCF_900172635.1,GCF_900172635.1__phage3__1811,anubis,anubis__1647,MTANYPASILPPNATAVERAIDRASAAALERLPVYLIRWVKDPDSC...,prophage_11522,-0.002816,-0.055336,...,0.110956,0.056314,0.059700,0.004641,0.016676,-0.071711,0.095491,-0.055784,0.015615,0.000201
9675,GCF_013604525.1__phage18,KL30,n49324932,GCF_013604525.1__phage18__52,anubis,anubis__1649,MANIEKLGSSSPEVLLKNATNLDKLVNGRESESLPDRFGVLRKTWH...,prophage_2576,-0.000031,0.046085,...,-0.016720,-0.006530,0.077754,-0.024675,0.026408,0.016113,0.043904,-0.091418,0.078778,0.105842


In [None]:
# Optional shortcut: reload a previously saved graph instead of rebuilding it.
# NOTE(review): both loads are commented out, so `graph_data` below is only
# defined if the kernel has already run the graph-building cell
# (`graph_data = HeteroData()`); on a fresh kernel this cell raises NameError.
#graph_data = torch.load(f'{path_work}/graph_file.1107.pt')
#graph_data = torch.load(f'{path_work}/train_nn/graph_file.1107.pt')

graph_data

In [7]:
# Show the row(s) of all_data whose `index` column equals "anubis__1644".
all_data.loc[all_data["index"].eq("anubis__1644")]

Unnamed: 0,Phage,KL_type_LCA,Infected_ancestor,Protein_name,Dataset,index,seq,prophage_id,1,2,...,1271,1272,1273,1274,1275,1276,1277,1278,1279,1280
9673,GCF_004311345.1__phage11,KL34,GCF_004311345.1,GCF_004311345.1__phage11__99,anubis,anubis__1644,MTANYPASILPPNATAVERAIDRASAAALERLPVYLIRWVKDPDSC...,prophage_11944,-0.001691,-0.067717,...,0.119783,0.073706,0.057788,0.004341,0.007389,-0.081588,0.100995,-0.045545,0.021685,0.013167


***
# Build the graph :

In [8]:
# *****************************************************************************
# Node types of the heterogeneous graph :
#   A  : the bacteria (ancestors)  - KLtype feature
#   B1 : the prophage (phage)      - No feature
#   B2 : the depo (index_seq)      - 1280-d embeddings

# Build the Graph Data :
graph_data = HeteroData()

# Indexation process (shall add the N phages to predict).
# Each list holds the distinct node names in order of first appearance;
# a node's integer id is its position in that list.
indexation_nodes_A = all_data["Infected_ancestor"].unique().tolist()
indexation_nodes_B1 = all_data["Phage"].unique().tolist() #+ [f"Dpo_to_predict_{n}" for n in DF_embeddings["index"].unique().tolist()]
indexation_nodes_B2 = DF_embeddings["index"].unique().tolist()

# For every node type: `*_r` maps id -> name, the un-suffixed dict maps name -> id.
ID_nodes_A_r = dict(enumerate(indexation_nodes_A))
ID_nodes_A = {name: node_id for node_id, name in ID_nodes_A_r.items()}

ID_nodes_B1_r = dict(enumerate(indexation_nodes_B1))
ID_nodes_B1 = {name: node_id for node_id, name in ID_nodes_B1_r.items()}

ID_nodes_B2_r = dict(enumerate(indexation_nodes_B2))
ID_nodes_B2 = {name: node_id for node_id, name in ID_nodes_B2_r.items()}

In [9]:
# Make the node feature file :

# One-hot encoder for the KL types. The `sparse` keyword was renamed to
# `sparse_output` in scikit-learn 1.2 and later removed, so try the new
# spelling first and fall back to the old one on older installations.
try:
    OHE = OneHotEncoder(sparse_output=False)
except TypeError:
    OHE = OneHotEncoder(sparse=False)
one_hot_encoded = OHE.fit_transform(all_data[["KL_type_LCA"]])

# Column i of the encoder output corresponds to OHE.categories_[0][i], so the
# one-hot vector of category i is row i of the identity matrix.
# (Using `one_hot_encoded[i]` here would return the encoding of the i-th *row*
# of all_data instead, which gives several KL types the same vector.)
kl_categories = OHE.categories_[0]
kl_identity = np.eye(len(kl_categories))
label_mapping = {label: kl_identity[i] for i, label in enumerate(kl_categories)}

# Nodes A: one-hot KL type taken from the first row recorded for each ancestor.
# A single lookup table replaces one full DataFrame filter per node.
first_kl_by_ancestor = (
    all_data.drop_duplicates(subset="Infected_ancestor", keep="first")
    .set_index("Infected_ancestor")["KL_type_LCA"]
    .to_dict()
)
node_feature_A = torch.tensor(
    np.array([label_mapping[first_kl_by_ancestor[ID_nodes_A_r[i]]] for i in range(len(ID_nodes_A_r))]),
    dtype=torch.float,
)

# Nodes B1: no features, i.e. a (num_B1, 0) tensor.
node_feature_B1 = torch.zeros((len(ID_nodes_B1), 0), dtype=torch.float)

# Nodes B2: the 1280 embedding columns that follow 'index' in DF_embeddings,
# with rows arranged in node-id order (first row kept if an id repeats).
embedding_by_id = DF_embeddings.drop_duplicates(subset="index", keep="first").set_index("index")
ordered_B2_names = [ID_nodes_B2_r[i] for i in range(len(ID_nodes_B2_r))]
node_feature_B2 = torch.tensor(
    embedding_by_id.loc[ordered_B2_names].iloc[:, :1280].to_numpy(dtype=float),
    dtype=torch.float,
)

# feed the graph
graph_data["A"].x = node_feature_A
graph_data["B1"].x = node_feature_B1
graph_data["B2"].x = node_feature_B2

# Write files : 
#node_feature_A_array = node_feature_A.numpy()
#node_feature_B1_array = node_feature_B1.numpy()
#node_feature_B2_array = node_feature_B2.numpy()

#df_node_feature_A_array = pd.DataFrame(node_feature_A_array)
#df_node_feature_B1_array = pd.DataFrame(node_feature_B1_array)
#df_node_feature_B2_array = pd.DataFrame(node_feature_B2_array)

#df_node_feature_A_array.to_csv(f"{path_work}/node_features.A.csv", index=False, header=False)
#df_node_feature_B1_array.to_csv(f"{path_work}/node_features.B1.csv", index=False, header=False)
#df_node_feature_B2_array.to_csv(f"{path_work}/node_features.B2.csv", index=False, header=False)

> Original : 

In [8]:
# Make edge file
# NOTE(review): the de-duplicating edge cell further down reassigns the same
# three edge_index attributes; whichever of the two cells runs last wins.

# One (phage, ancestor) pair per row of all_data, in row order.
phage_ancestor_pairs = list(zip(all_data["Phage"], all_data["Infected_ancestor"]))

# Node B1 (prophage) - Node A (bacteria) :
edge_index_B1_A = torch.tensor(
    [[ID_nodes_B1[phage_name], ID_nodes_A[ancestor_name]] for phage_name, ancestor_name in phage_ancestor_pairs],
    dtype=torch.long,
)

# Node A (bacteria) - Node B1 (prophage) : same pairs, direction reversed.
edge_index_A_B1 = torch.tensor(
    [[ID_nodes_A[ancestor_name], ID_nodes_B1[phage_name]] for phage_name, ancestor_name in phage_ancestor_pairs],
    dtype=torch.long,
)

# Node B2 (depolymerase) - Node B1 (prophage) :
# edges are emitted phage by phage, phages in order of first appearance.
edge_index_B2_B1 = []
for phage, phage_rows in all_data.groupby("Phage", sort=False):
    phage_id = ID_nodes_B1[phage]
    edge_index_B2_B1.extend([ID_nodes_B2[dpo], phage_id] for dpo in phage_rows["index"])
# Add in there the edges between the fake prophages and the each Dpos :
#for prophage , index in ID_nodes_B1.items() :
#    if prophage.count("Dpo_to_predict_") > 0 : 
#        id_dpo = prophage.split("Dpo_to_predict_")[1]
#        edge_index_B2_B1.append([ID_nodes_B2[id_dpo], index])
edge_index_B2_B1 = torch.tensor(edge_index_B2_B1, dtype=torch.long)

# feed the graph: the (num_edges, 2) lists are transposed to (2, num_edges).
graph_data['B1', 'infects', 'A'].edge_index = edge_index_B1_A.t().contiguous()
graph_data['B2', 'expressed', 'B1'].edge_index = edge_index_B2_B1.t().contiguous()
# That one is optional
graph_data['A', 'harbors', 'B1'].edge_index = edge_index_A_B1.t().contiguous()

# Write files : 
#edge_index_B1_A_array = edge_index_B1_A.numpy()
#edge_index_A_B1_array = edge_index_A_B1.numpy()
#edge_index_B2_B1_array = edge_index_B2_B1.numpy()

#df_edge_index_B1_A_array = pd.DataFrame(edge_index_B1_A_array)
#df_edge_index_A_B1_array = pd.DataFrame(edge_index_A_B1_array)
#df_edge_index_B2_B1_array = pd.DataFrame(edge_index_B2_B1_array)

#df_edge_index_B1_A_array.to_csv(f"{path_work}/edge_index_B1_A_array.csv", index=False, header=False)
#df_edge_index_A_B1_array.to_csv(f"{path_work}/edge_index_A_B1_array.csv", index=False, header=False)
#df_edge_index_B2_B1_array.to_csv(f"{path_work}/edge_index_B2_B1_array.csv", index=False, header=False)

> Without duplicates : 

In [11]:
# **************************************************************
# Make edge file
# Node B1 (prophage) - Node A (bacteria) and its reverse, Node A - Node B1 :
# a (phage, ancestor) pair is kept only the first time it is seen, so both
# directions end up with the same edges in the same order.
edge_index_B1_A, edge_index_A_B1 = [], []
track_B1_A, track_A_B1 = set(), set()
for phage_name, ancestor_name in zip(all_data["Phage"], all_data["Infected_ancestor"]):
    phage_id, ancestor_id = ID_nodes_B1[phage_name], ID_nodes_A[ancestor_name]
    if (phage_id, ancestor_id) in track_B1_A:
        continue
    track_B1_A.add((phage_id, ancestor_id))
    track_A_B1.add((ancestor_id, phage_id))
    edge_index_B1_A.append([phage_id, ancestor_id])
    edge_index_A_B1.append([ancestor_id, phage_id])
edge_index_B1_A = torch.tensor(edge_index_B1_A, dtype=torch.long)
edge_index_A_B1 = torch.tensor(edge_index_A_B1, dtype=torch.long)

# Node B2 (depolymerase) - Node B1 (prophage) :
# NOTE(review): unlike the two lists above, these edges are not de-duplicated
# here - confirm that this is intended.
edge_index_B2_B1 = []
for phage, phage_rows in all_data.groupby("Phage", sort=False):
    phage_id = ID_nodes_B1[phage]
    edge_index_B2_B1.extend([ID_nodes_B2[dpo], phage_id] for dpo in phage_rows["index"])
edge_index_B2_B1 = torch.tensor(edge_index_B2_B1, dtype=torch.long)

# feed the graph: the (num_edges, 2) lists are transposed to (2, num_edges).
graph_data['B1', 'infects', 'A'].edge_index = edge_index_B1_A.t().contiguous()
graph_data['B2', 'expressed', 'B1'].edge_index = edge_index_B2_B1.t().contiguous()
# That one is optional
graph_data['A', 'harbors', 'B1'].edge_index = edge_index_A_B1.t().contiguous()

In [12]:
    # Make the Y file : 
# Every stored edge gets the label 1.0: `y` is a vector of ones with one entry
# per column of that edge type's edge_index. The ('A', 'harbors', 'B1') entry
# is the optional one.
for edge_type in [('B1', 'infects', 'A'), ('B2', 'expressed', 'B1'), ('A', 'harbors', 'B1')]:
    edge_store = graph_data[edge_type]
    edge_store.y = torch.ones(edge_store.edge_index.size(1))

***
# Work on the GNN

In [13]:
# *****************************************************************************
# Data instance :
# Serialise the assembled graph under path_work.

#graph_data
graph_file_path = f'{path_work}/graph_file.debug_clean.1909.pt'
torch.save(graph_data, graph_file_path)
#test_graph = torch.load(f'{path_work}/graph_data/graph_file.2607.OHE.pt')

In [10]:
# Display the summary of the assembled HeteroData object.
graph_data 

HeteroData(
  [1mA[0m={ x=[4530, 127] },
  [1mB1[0m={ x=[7731, 0] },
  [1mB2[0m={ x=[3608, 1280] },
  [1m(B1, infects, A)[0m={
    edge_index=[2, 9677],
    y=[9677]
  },
  [1m(B2, expressed, B1)[0m={
    edge_index=[2, 9677],
    y=[9677]
  },
  [1m(A, harbors, B1)[0m={
    edge_index=[2, 9677],
    y=[9677]
  }
)

In [5]:
# NOTE(review): the only assignment to `test_graph` in the visible notebook is
# the commented-out torch.load in the saving cell, so this cell raises NameError
# on a fresh kernel unless that line is restored.
test_graph

HeteroData(
  [1mA[0m={ x=[4530, 127] },
  [1mB1[0m={ x=[11339, 0] },
  [1mB2[0m={ x=[3608, 1280] },
  [1m(B1, infects, A)[0m={
    edge_index=[2, 7731],
    y=[7731]
  },
  [1m(B2, expressed, B1)[0m={
    edge_index=[2, 13285],
    y=[13285]
  },
  [1m(A, harbors, B1)[0m={
    edge_index=[2, 7731],
    y=[7731]
  }
)

In [None]:
# Shell commands (not Python): copy the saved graph files to the remote
# train_nn directory over ssh.
# NOTE(review): the first command sends graph_file.debug.1909.pt, which is not
# written by any visible cell (only graph_file.debug_clean.1909.pt is) - confirm
# that file exists before running.
rsync -avzhe ssh \
/media/concha-eloko/Linux/PPT_clean/graph_file.debug.1909.pt \
conchae@garnatxa.srv.cpd:/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/train_nn


rsync -avzhe ssh \
/media/concha-eloko/Linux/PPT_clean/graph_file.debug_clean.1909.pt \
conchae@garnatxa.srv.cpd:/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/train_nn





***
## Check the integrity of the DF info files : 

Do the ESM2 representations stored in the info table match the ones in the embeddings file?

In [14]:
path_work = "/media/concha-eloko/Linux/PPT_clean"

# NOTE: this rebinds DF_info and DF_embeddings, replacing the frames created
# by the loading cell at the top of the notebook.
# Info table: tab-separated, first row is the header.
DF_info = pd.read_csv(f"{path_work}/TropiGATv2.final_df.tsv", sep="\t", header=0)
# Embedding table: comma-separated, no header row; first column (0) renamed to 'index'.
DF_embeddings = pd.read_csv(
    f"{path_work}/Dpo.2705.embeddings.ultimate.csv", sep=",", header=None
).rename(columns={0: 'index'})


In [18]:
# Keep one row per distinct `index` value of DF_info (first occurrence wins).
DF_info_index = DF_info.drop_duplicates(subset = ["index"], keep = "first")

# TODO: the row-wise check was never written. The original cell ended with the
# body-less loop header below, which is a SyntaxError and stops the whole cell
# from running, so it is kept only as a commented-out reminder:
#for _,row in DF_info_index.iterrows() :

# Last expression of the cell, so the de-duplicated table is what gets displayed.
DF_info_index

Unnamed: 0,Phage,Protein_name,KL_type_LCA,Infected_ancestor,index,Dataset,seq,domain_seq,1,2,...,1272,1273,1274,1275,1276,1277,1278,1279,1280,prophage_id
0,GCF_016651625.1__phage29,GCF_016651625.1__phage29__142,KL43,GCF_016651625.1,ppt__1,ppt,MSVPNQTPYNIYTANGLTTVFTYEFYIISASDLRVSINGDVVTSGY...,KDFVNINDYWFPTDGDDFYPALNKALSVSPHVLIPPGKHYLKSTVS...,-0.018416,0.022387,...,0.004437,0.087907,0.015800,0.025778,0.065790,0.034045,-0.070899,0.016068,0.065339,prophage_12186
1,GCF_016651625.1__phage29,GCF_016651625.1__phage29__150,KL43,GCF_016651625.1,anubis__0,anubis,MRANLIKTNFTAGEISPRLMGRVDIARYANGAKIIENAVCVVQGGV...,QAASPGAWTREDTVWTEEFGYPGAVTLYQQRLVLAGSPQYPQTIWW...,0.036016,0.005938,...,-0.037612,0.008772,0.010556,-0.049738,-0.012549,0.092624,-0.136602,-0.191378,0.135658,prophage_12186
2,GCF_016651625.1__phage12,GCF_016651625.1__phage12__59,KL43,GCF_016651625.1,ppt__4,ppt,MSISKRNFLKAVSCAYFFYSFKALTKVNQPIEDYISTKDKNTWPSK...,NTWPSKVHRVEEFYTSTDRDYSDAILRGINYCSLNNCVLFFSDKYK...,0.026004,0.024372,...,-0.026018,0.018206,0.036751,-0.032549,0.064112,0.061520,-0.024423,-0.027998,0.028089,prophage_924
4,GCF_004313505.1__phage4,GCF_004313505.1__phage4__113,KL14,GCF_004313505.1,anubis__5,anubis,MSEYDTGNPVPSASMPDAWDNMQSIDKFVNSSDETITTRTGQQLDT...,KAIFDAWLDFGIDWNGNESISLQLQTAVNYVSKLPYGGEIVLRPGV...,-0.023648,0.052674,...,-0.025991,0.068538,-0.051192,0.026481,0.069100,0.017813,-0.103797,0.018961,0.117058,prophage_11091
5,GCF_004313505.1__phage1,GCF_004313505.1__phage1__233,KL14,n830830,anubis__7,anubis,MAFNPELGSTSPAVLLDNAERLDKLVNGPAADVPDRGGDPLYSWRQ...,KAIFDAWLDFGIDWNGNESVSLQLQTAVNYVSKLPYGGEIVCRPGV...,-0.028778,0.060945,...,-0.020966,0.073692,-0.047220,0.027809,0.073462,0.025343,-0.100966,0.013383,0.116881,prophage_18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20134,GCF_002248635.1__phage4,GCF_002248635.1__phage4__44,KL102,n320,anubis_return__4264,anubis_return,MVSLKGMGSTFRDCTALISLPSGLLDGCINLTSLTLTFSGCTSLAL...,MVSLKGMGSTFRDCTALISLPSGLLDGCINLTSLTLTFSGCTSLAL...,-0.000585,-0.087093,...,0.036749,0.048489,0.020484,0.023950,-0.048109,0.134457,-0.101326,0.088485,0.037368,prophage_3054
20135,GCF_001905235.1__phage21,GCF_001905235.1__phage21__0,KL107,n35403540,anubis_return__4272,anubis_return,MLKHSLAIATCLAFSSSVMGNEANLLYTNTMQFPYKHNADGYMVFD...,VMGNEANLLYTNTMQFPYKHNADGYMVFDIHGKLVVPPEGHFDTLN...,0.076721,0.027635,...,0.042391,-0.004292,-0.004047,-0.011631,-0.026469,0.070159,-0.077212,-0.077950,-0.034630,prophage_313
20136,GCF_004312845.1__phage3,GCF_004312845.1__phage3__38,KL9,GCF_004312845.1,anubis_return__4275,anubis_return,MAILITGKSMTRLPESSSWEEEIELITRSERVAGGLDGPANRPLKS...,DAVIRRDLASDKGTSGVGKLGDKPLVAISYYKSKGQSDQDAVQAAF...,0.032196,0.048856,...,-0.016331,0.084711,0.056063,0.001793,0.073958,0.090169,-0.060105,0.023726,0.086452,prophage_12656
20137,GCF_900172635.1__phage2,GCF_900172635.1__phage2__1608,KL124,GCF_900172635.1,anubis_return__4287,anubis_return,MADLSISVISDQASESNQAGWWHPLDSFQGVEYYGLCKEYGTAGYH...,MADLSISVISDQASESNQAGWWHPLDSFQGVEYYGLCKEYGTAGYH...,-0.011089,-0.005328,...,0.034656,0.046130,0.012586,-0.021702,-0.023386,0.105700,-0.099147,-0.057367,0.091427,prophage_12780


In [15]:
# Display the info table loaded from TropiGATv2.final_df.tsv.
DF_info

Unnamed: 0,Phage,Protein_name,KL_type_LCA,Infected_ancestor,index,Dataset,seq,domain_seq,1,2,...,1272,1273,1274,1275,1276,1277,1278,1279,1280,prophage_id
0,GCF_016651625.1__phage29,GCF_016651625.1__phage29__142,KL43,GCF_016651625.1,ppt__1,ppt,MSVPNQTPYNIYTANGLTTVFTYEFYIISASDLRVSINGDVVTSGY...,KDFVNINDYWFPTDGDDFYPALNKALSVSPHVLIPPGKHYLKSTVS...,-0.018416,0.022387,...,0.004437,0.087907,0.015800,0.025778,0.065790,0.034045,-0.070899,0.016068,0.065339,prophage_12186
1,GCF_016651625.1__phage29,GCF_016651625.1__phage29__150,KL43,GCF_016651625.1,anubis__0,anubis,MRANLIKTNFTAGEISPRLMGRVDIARYANGAKIIENAVCVVQGGV...,QAASPGAWTREDTVWTEEFGYPGAVTLYQQRLVLAGSPQYPQTIWW...,0.036016,0.005938,...,-0.037612,0.008772,0.010556,-0.049738,-0.012549,0.092624,-0.136602,-0.191378,0.135658,prophage_12186
2,GCF_016651625.1__phage12,GCF_016651625.1__phage12__59,KL43,GCF_016651625.1,ppt__4,ppt,MSISKRNFLKAVSCAYFFYSFKALTKVNQPIEDYISTKDKNTWPSK...,NTWPSKVHRVEEFYTSTDRDYSDAILRGINYCSLNNCVLFFSDKYK...,0.026004,0.024372,...,-0.026018,0.018206,0.036751,-0.032549,0.064112,0.061520,-0.024423,-0.027998,0.028089,prophage_924
3,GCF_019928025.1__phage0,GCF_019928025.1__phage0__10,KL43,n1471,ppt__4,ppt,MSISKRNFLKAVSCAYFFYSFKALTKVNQPIEDYISTKDKNTWPSK...,NTWPSKVHRVEEFYTSTDRDYSDAILRGINYCSLNNCVLFFSDKYK...,0.026004,0.024372,...,-0.026018,0.018206,0.036751,-0.032549,0.064112,0.061520,-0.024423,-0.027998,0.028089,prophage_2929
4,GCF_004313505.1__phage4,GCF_004313505.1__phage4__113,KL14,GCF_004313505.1,anubis__5,anubis,MSEYDTGNPVPSASMPDAWDNMQSIDKFVNSSDETITTRTGQQLDT...,KAIFDAWLDFGIDWNGNESISLQLQTAVNYVSKLPYGGEIVLRPGV...,-0.023648,0.052674,...,-0.025991,0.068538,-0.051192,0.026481,0.069100,0.017813,-0.103797,0.018961,0.117058,prophage_11091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20134,GCF_002248635.1__phage4,GCF_002248635.1__phage4__44,KL102,n320,anubis_return__4264,anubis_return,MVSLKGMGSTFRDCTALISLPSGLLDGCINLTSLTLTFSGCTSLAL...,MVSLKGMGSTFRDCTALISLPSGLLDGCINLTSLTLTFSGCTSLAL...,-0.000585,-0.087093,...,0.036749,0.048489,0.020484,0.023950,-0.048109,0.134457,-0.101326,0.088485,0.037368,prophage_3054
20135,GCF_001905235.1__phage21,GCF_001905235.1__phage21__0,KL107,n35403540,anubis_return__4272,anubis_return,MLKHSLAIATCLAFSSSVMGNEANLLYTNTMQFPYKHNADGYMVFD...,VMGNEANLLYTNTMQFPYKHNADGYMVFDIHGKLVVPPEGHFDTLN...,0.076721,0.027635,...,0.042391,-0.004292,-0.004047,-0.011631,-0.026469,0.070159,-0.077212,-0.077950,-0.034630,prophage_313
20136,GCF_004312845.1__phage3,GCF_004312845.1__phage3__38,KL9,GCF_004312845.1,anubis_return__4275,anubis_return,MAILITGKSMTRLPESSSWEEEIELITRSERVAGGLDGPANRPLKS...,DAVIRRDLASDKGTSGVGKLGDKPLVAISYYKSKGQSDQDAVQAAF...,0.032196,0.048856,...,-0.016331,0.084711,0.056063,0.001793,0.073958,0.090169,-0.060105,0.023726,0.086452,prophage_12656
20137,GCF_900172635.1__phage2,GCF_900172635.1__phage2__1608,KL124,GCF_900172635.1,anubis_return__4287,anubis_return,MADLSISVISDQASESNQAGWWHPLDSFQGVEYYGLCKEYGTAGYH...,MADLSISVISDQASESNQAGWWHPLDSFQGVEYYGLCKEYGTAGYH...,-0.011089,-0.005328,...,0.034656,0.046130,0.012586,-0.021702,-0.023386,0.105700,-0.099147,-0.057367,0.091427,prophage_12780


In [16]:
# Display the embedding table (first column renamed to 'index').
DF_embeddings

Unnamed: 0,index,1,2,3,4,5,6,7,8,9,...,1271,1272,1273,1274,1275,1276,1277,1278,1279,1280
0,ppt__2930,-0.000061,-0.017329,0.012884,0.037123,-0.123747,0.004186,-0.061367,-0.056718,-0.037215,...,0.098806,0.012989,-0.001155,0.139749,-0.030987,0.059306,0.107041,-0.041463,-0.085581,0.114973
1,ppt__3300,0.004044,0.040011,-0.001234,-0.095745,-0.058056,-0.002394,0.007648,-0.059740,0.060850,...,-0.020369,0.016287,0.062586,-0.024336,0.019276,0.069623,0.035261,-0.118962,0.035672,0.085582
2,ppt__1182,0.018767,0.068116,-0.009109,-0.012598,-0.107001,0.011569,-0.030943,-0.045359,0.048923,...,0.014524,-0.024645,0.071878,0.018206,0.042790,0.088410,0.031970,-0.124592,0.070040,0.065348
3,ppt__3540,-0.028261,-0.047253,-0.027340,-0.052824,-0.089644,-0.023079,0.094861,0.026104,0.024001,...,0.051728,0.005634,-0.077874,0.030336,-0.037648,0.050625,0.046142,-0.158841,-0.007670,0.034556
4,ppt__942,0.014863,0.028030,0.014927,-0.025997,-0.096138,0.016290,0.015008,-0.066254,0.077959,...,0.008521,-0.019820,0.123201,-0.040306,0.030893,0.051362,0.047316,-0.102698,0.044830,0.084530
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3603,anubis__304,0.006264,0.006471,-0.031665,0.078502,-0.131247,0.077167,0.043005,-0.183636,-0.022181,...,0.044299,-0.061847,0.017696,0.054798,-0.035830,-0.030202,0.039051,-0.127020,-0.113630,0.211258
3604,anubis__1273,-0.019114,0.063302,0.006635,-0.060343,-0.034054,-0.003895,0.033920,-0.080352,0.073579,...,-0.004504,-0.007906,0.075141,-0.052423,0.027127,0.073984,0.030664,-0.096409,0.011906,0.124885
3605,anubis__1311,0.051261,0.067942,0.005061,-0.019131,-0.060296,0.000984,0.037515,-0.033887,0.091774,...,0.044678,0.052609,0.112994,-0.000592,0.027122,0.086020,0.013660,-0.055491,0.021665,0.049301
3606,anubis__1525,-0.010655,0.083864,0.009084,-0.042220,-0.066479,0.008724,0.010109,-0.078033,0.065285,...,0.020752,0.024543,0.071302,0.035980,0.012171,0.054399,0.032167,-0.151018,0.042541,0.035221
