In [1]:
from torch_geometric.data import HeteroData, DataLoader
import torch_geometric.transforms as T
from torch_geometric.nn import to_hetero , HeteroConv , GATv2Conv
from torch_geometric.utils import negative_sampling
from torch_geometric.loader import LinkNeighborLoader
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder , label_binarize , OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score , matthews_corrcoef

import TropiGAT_functions 
#from TropiGAT_functions import get_top_n_kltypes ,clean_print 

import os
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
from itertools import product
import random
from collections import Counter, defaultdict
import warnings
import logging
from multiprocessing.pool import ThreadPool
warnings.filterwarnings("ignore")

# -----------------------------------------------------------------------------
# Working directories :
# Root folder of the project.
path_work = "/media/concha-eloko/Linux/PPT_clean"
# Folder of the ensemble that is handed to TropiGAT_functions.make_ensemble_TropiGAT.
# Previously tried alternative : <path_work>/ficheros_28032023/ensemble_tailored_0612
path_ensemble = "/".join((path_work, "ficheros_28032023", "winning_ensemble_0702"))


> Build the ensemble model :

In [2]:
# Build the ensemble from the files stored under path_ensemble.
# NOTE(review): downstream code iterates dico_models as {KL type: model}; `errors`
# presumably collects the models that failed to load — confirm in TropiGAT_functions.
dico_models, errors = TropiGAT_functions.make_ensemble_TropiGAT(path_ensemble)

***
### Predictions at the phage level :

In [3]:
def phage_lvl_query_graph(df_data) :
    """
    Build the heterogeneous query graph used by the ensemble model.

    Inputs : a dataframe with one row per protein and the columns :
        - "Phage"        : name of the phage carrying the protein (B1 node)
        - "Protein_name" : name of the protein (B2 node)
        - "1" ... "1280" : the embedding of the protein (string column labels)
    Output : a HeteroData graph with
        - "B1" nodes (phages), in order of first appearance, with 0-width features
        - "B2" nodes (proteins), in order of first appearance, carrying the embedding
        - ('B2', 'expressed', 'B1') edges linking each protein to its phage

    When a protein name appears on several rows, the embedding of its first row is used.
    """
    query_graph = HeteroData()
    # Indexation process : node ids follow the order of first appearance
    indexation_nodes_B1 = df_data["Phage"].unique().tolist()
    indexation_nodes_B2 = df_data["Protein_name"].unique().tolist()
    ID_nodes_B1 = {item:index for index, item in enumerate(indexation_nodes_B1)}
    ID_nodes_B2 = {item:index for index, item in enumerate(indexation_nodes_B2)}
    # Make the feature files :
    embeddings_columns = [str(i) for i in range(1, 1281)]
    node_feature_B1 = torch.zeros((len(ID_nodes_B1), 0), dtype=torch.float)
    # drop_duplicates(keep="first") keeps the rows in the same order as unique() above,
    # so row i of the matrix belongs to node i ; one pass instead of one filter per protein
    first_rows = df_data.drop_duplicates(subset="Protein_name", keep="first")
    node_feature_B2 = torch.tensor(first_rows[embeddings_columns].to_numpy(dtype=float), dtype=torch.float)
    # feed the graph
    query_graph["B1"].x = node_feature_B1
    query_graph["B2"].x = node_feature_B2
    # Node B2 (depolymerase) - Node B1 (prophage) :
    # The protein is looked up through "Protein_name" (the column the node ids were built
    # from) rather than through an extra "index" column. Edges stay grouped by phage.
    edge_index_B2_B1 = [
        [ID_nodes_B2[protein], ID_nodes_B1[phage]]
        for phage, proteins_of_phage in df_data.groupby("Phage", sort=False)["Protein_name"]
        for protein in proteins_of_phage
    ]
    # reshape(-1, 2) keeps a valid (2, 0) edge_index when there is no edge at all
    edge_index_B2_B1 = torch.tensor(edge_index_B2_B1 , dtype=torch.long).reshape(-1, 2)
    # feed the graph
    query_graph['B2', 'expressed', 'B1'].edge_index = edge_index_B2_B1.t().contiguous()

    return query_graph



In [4]:
def round_probabilities(x):
    """Round a tensor of probabilities to 4 decimal places."""
    return torch.round(x * 10**4) / (10**4)


@torch.no_grad()
def phage_lvl_make_predictions(model, data):
    """
    Run one model of the ensemble on the query graph, without tracking gradients.
    Inputs : the model and the data it is called on
    Output : (predictions, probabilities) ; the probabilities are the sigmoid of the
             model output rounded to 4 decimals, the predictions are the unrounded
             probabilities rounded to 0 / 1.
    """
    model.eval()
    output = model(data)
    probabilities = torch.sigmoid(output)
    predictions = probabilities.round()
    return predictions, round_probabilities(probabilities)


def phage_lvl_run_prediction(query_graph, df_data, dico_ensemble) :
    """
    Run every model of the ensemble on the query graph and collect the positive calls.
    Inputs :
        - query_graph   : the graph built from df_data
        - df_data       : the dataframe the graph was built from ; its "Phage" column
                          (order of first appearance) names the outputs of the models
        - dico_ensemble : dictionary {KL type : model}
    Output : defaultdict {phage : [(KL type, probability), ...]}, positive calls only
    Raises : ValueError when a model does not return exactly one value per phage of
             df_data, which is what happens when df_data is not the dataframe the
             graph was built from.
    """
    dico_predictions = defaultdict(list)
    indexation_nodes_B1 = df_data["Phage"].unique().tolist()
    for KL_type, model in dico_ensemble.items() :
        prediction, probabilities = phage_lvl_make_predictions(model, query_graph)
        # Flatten so that (N,), (N, 1) and single-value outputs are handled alike
        prediction = prediction.reshape(-1)
        probabilities = probabilities.reshape(-1)
        if prediction.numel() != len(indexation_nodes_B1) :
            raise ValueError(
                f"Model {KL_type} returned {prediction.numel()} values for "
                f"{len(indexation_nodes_B1)} phages : df_data must be the dataframe "
                "the query graph was built from"
            )
        for index_phage, output_phage in enumerate(prediction) :
            if int(output_phage) == 1 :
                hit = (KL_type, float(probabilities[index_phage]))
                dico_predictions[indexation_nodes_B1[index_phage]].append(hit)
    return dico_predictions

#### Ferriol :

> Prepare the Ferriol files

In [5]:
import os
import pandas as pd

# Locations of the inputs of the 77-phage (Ferriol) dataset.
path_project = "/media/concha-eloko/Linux/77_strains_phage_project"
path_Dpo_domain_org = "/media/concha-eloko/Linux/depolymerase_building/clean_77_phages_depo"


def _harmonise_dpo_name(name):
    """Apply the three naming fixes to one raw identifier of the embedding file."""
    # 1. the cds tag must be introduced by a double underscore
    if "__cds" not in name:
        name = name.replace("_cds", "__cds")
    # 2. the cds tag is written in lower case
    name = name.replace("__CDS", "__cds")
    # 3. names without the suffix lose their last "__" field and get the suffix appended
    if "_Dpo_domain" not in name:
        name = "__".join(name.split("__")[:-1]) + "_Dpo_domain"
    return name


def _phage_of_dpo(protein_name):
    """Phage part of a harmonised name ; one name has no "__" and is split on "_"."""
    if protein_name == "K42PH8_Dpo_domain":
        return protein_name.split("_")[0]
    return protein_name.split("__")[0]


# Headerless file : column 0 is the identifier, column 1281 is dropped.
dpo_embeddings = pd.read_csv(
    f"{path_project}/rbp_work/Dpo_domains_77.esm2.embedding.1512.csv",
    sep=",",
    header=None,
)
dpo_embeddings = dpo_embeddings.drop(columns=[1281]).set_index([0])
dpo_embeddings.index = [_harmonise_dpo_name(name) for name in dpo_embeddings.index]

# Back to a flat frame : the harmonised names land in the "index" column.
dpo_embeddings = dpo_embeddings.reset_index()
dpo_embeddings["Phage"] = dpo_embeddings["index"].map(_phage_of_dpo)
dpo_embeddings["Protein_name"] = dpo_embeddings["index"].copy()
# String labels, as expected by phage_lvl_query_graph ("1" ... "1280").
dpo_embeddings.columns = [str(label) for label in dpo_embeddings.columns]

In [22]:
# Display the prepared frame : "index", "1" ... "1280", "Phage", "Protein_name".
dpo_embeddings

Unnamed: 0,index,1,2,3,4,5,6,7,8,9,...,1273,1274,1275,1276,1277,1278,1279,1280,Phage,Protein_name
0,K15PH90__cds_55_Dpo_domain,-0.028760,0.046677,-0.010773,0.028452,-0.090442,0.027041,0.004249,-0.083708,0.022172,...,0.079159,0.012298,0.027317,0.037254,0.069599,-0.097522,0.067495,0.062502,K15PH90,K15PH90__cds_55_Dpo_domain
1,K80PH1317b__cds_54_Dpo_domain,0.007689,0.036850,-0.006928,-0.056424,-0.090723,0.018707,0.014913,-0.070090,0.073792,...,0.059184,-0.006782,0.023955,0.035585,0.048035,-0.081247,0.043776,0.118674,K80PH1317b,K80PH1317b__cds_54_Dpo_domain
2,K64PH164C4__cds_24_Dpo_domain,0.015762,0.062429,-0.003427,-0.003609,-0.101109,0.028121,0.004342,-0.096114,0.062562,...,0.095159,0.024894,0.002837,0.046701,0.062497,-0.084956,0.027426,0.051051,K64PH164C4,K64PH164C4__cds_24_Dpo_domain
3,K5lambda5__cds_196_Dpo_domain,0.040111,0.046436,-0.012045,-0.043877,-0.100054,-0.028328,0.028640,-0.047144,0.065727,...,0.096149,-0.031008,0.040423,0.082593,0.050161,-0.105612,0.023642,0.081104,K5lambda5,K5lambda5__cds_196_Dpo_domain
4,K11PH164C1__cds_46_Dpo_domain,0.017319,0.077582,-0.001212,-0.030026,-0.070916,-0.011639,0.006673,-0.078486,0.072836,...,0.099658,-0.028466,0.020794,0.082009,0.030658,-0.091195,0.047744,0.105303,K11PH164C1,K11PH164C1__cds_46_Dpo_domain
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119,K24PH164C1__cds_55_Dpo_domain,0.039957,-0.007577,-0.037035,0.054355,-0.128855,0.044301,0.099503,-0.163834,0.076119,...,0.027130,0.023492,-0.033999,-0.025833,0.030128,-0.042748,-0.122199,0.124239,K24PH164C1,K24PH164C1__cds_55_Dpo_domain
120,K42PH8__cds_43_Dpo_domain,0.015222,-0.034378,-0.049853,0.063044,-0.158600,0.051105,0.079608,-0.183104,0.016171,...,0.006023,0.016808,-0.034763,-0.044617,0.048723,-0.071603,-0.118990,0.130576,K42PH8,K42PH8__cds_43_Dpo_domain
121,K10PH82C1__cds_45_Dpo_domain,0.017037,-0.028448,-0.066188,0.038587,-0.124722,0.051205,0.086921,-0.141901,0.024835,...,0.013690,0.011025,-0.067530,-0.004214,0.050276,-0.072483,-0.137275,0.142341,K10PH82C1,K10PH82C1__cds_45_Dpo_domain
122,K44PH129C1__cds_45_Dpo_domain,0.010439,-0.023608,-0.016569,0.054134,-0.148860,0.043717,0.065837,-0.149970,0.020374,...,-0.003943,0.033981,-0.043663,-0.035262,0.058799,-0.032766,-0.110497,0.138307,K44PH129C1,K44PH129C1__cds_45_Dpo_domain


In [6]:
# Phage-level query graph of the Ferriol dataset (phages = B1 nodes, proteins = B2 nodes).
ferriol_graph = phage_lvl_query_graph(dpo_embeddings)

> Make predictions :

In [7]:
# NOTE(review): this repeats the graph construction of the previous cell with the same
# input ; one of the two cells can be removed.
ferriol_graph = phage_lvl_query_graph(dpo_embeddings)

In [8]:
# Run every model of the ensemble on the Ferriol graph. The dataframe passed here must
# be the one the graph was built from : it provides the phage names of the outputs.
pred = phage_lvl_run_prediction(ferriol_graph, dpo_embeddings ,dico_models)

In [9]:
# {phage : [(KL type, probability), ...]} for the positive calls of the ensemble.
pred

defaultdict(list,
            {'K41P2': [('KL105', 0.9872999787330627),
              ('KL46', 0.8062999844551086),
              ('KL22', 0.6342999935150146),
              ('KL106', 0.978600025177002),
              ('KL48', 0.9126999974250793),
              ('KL38', 0.9663000106811523),
              ('KL21', 0.748199999332428),
              ('KL27', 0.9868000149726868),
              ('KL43', 0.8090000152587891),
              ('KL15', 0.7889000177383423),
              ('KL39', 0.5526000261306763),
              ('KL14', 0.89410001039505),
              ('KL123', 0.9979000091552734),
              ('KL125', 0.9728000164031982),
              ('KL136', 0.830299973487854),
              ('KL18', 0.6672999858856201),
              ('KL128', 0.8391000032424927)],
             'K65PH164': [('KL105', 0.927299976348877),
              ('KL53', 0.8654999732971191),
              ('KL17', 0.9186000227928162),
              ('KL46', 0.5012999773025513),
              ('KL22', 0.5993999838

> Bea

In [27]:
import os
import pandas as pd

# Folder of the in vitro datasets.
path_project = "/media/concha-eloko/Linux/PPT_clean/in_vitro"

# Headerless file : column 0 is the protein identifier, column 1281 is dropped.
# The phage name is the part of the identifier before the first "_".
bea_embeddings = (
    pd.read_csv(f"{path_project}/Bea_phages.esm2.embedding.csv", sep=",", header=None)
    .drop(columns=[1281])
    .assign(
        Phage=lambda frame: frame[0].map(lambda name: name.split("_")[0]),
        Protein_name=lambda frame: frame[0].copy(),
    )
)

# String labels for every column ; the identifier column (0) is called "index".
bea_embeddings.columns = [
    "index" if str(label) == "0" else str(label) for label in bea_embeddings.columns
]

In [28]:
# Display the prepared frame : "index", "1" ... "1280", "Phage", "Protein_name".
bea_embeddings

Unnamed: 0,index,1,2,3,4,5,6,7,8,9,...,1273,1274,1275,1276,1277,1278,1279,1280,Phage,Protein_name
0,A1c_00034,-0.031228,0.018990,0.011914,-0.036915,-0.085279,0.074510,0.038491,-0.092385,0.041953,...,0.086334,0.015169,-0.009941,0.070748,0.061202,-0.088183,0.029006,0.092003,A1c,A1c_00034
1,A1h_00013,0.003407,0.058613,0.004211,-0.032270,-0.080894,0.003224,0.026575,-0.049651,0.068412,...,0.103554,-0.011835,0.040593,0.037963,0.031057,-0.102158,0.077059,0.088456,A1h,A1h_00013
2,S11a_00036,-0.000087,0.046707,0.027543,-0.037515,-0.073041,0.012802,0.033908,-0.055600,0.117382,...,0.144737,-0.022651,0.048932,0.046084,0.025735,-0.100239,0.049737,0.135533,S11a,S11a_00036
3,A1q_00023,0.035220,-0.004737,-0.047403,0.058447,-0.127027,0.047977,0.093704,-0.168074,0.073823,...,0.038854,0.031011,-0.032295,-0.027215,0.027464,-0.042393,-0.133780,0.124112,A1q,A1q_00023
4,A1q_00010,0.006108,0.036785,0.033718,-0.061564,-0.102759,0.008589,0.025191,-0.038584,0.046164,...,0.079121,-0.012699,0.048490,0.054487,0.050371,-0.102361,0.034225,0.098482,A1q,A1q_00010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,A1e_00012,0.023357,0.039636,-0.014646,-0.014371,-0.082256,0.020024,-0.017925,-0.063363,0.068452,...,0.106709,-0.003896,0.034324,0.069450,0.058052,-0.072267,0.086228,0.071590,A1e,A1e_00012
67,M5a_00170,-0.021793,0.077967,-0.017637,-0.069029,-0.085250,0.012727,0.054289,-0.040869,0.069873,...,0.092628,-0.031385,0.033944,0.073543,0.025255,-0.100661,0.049953,0.117564,M5a,M5a_00170
68,A1j_00002,0.036885,-0.005403,-0.047889,0.057499,-0.128569,0.041746,0.098268,-0.166326,0.072257,...,0.038163,0.026237,-0.031345,-0.027221,0.030271,-0.040723,-0.132024,0.123907,A1j,A1j_00002
69,A1n_00050,0.041736,-0.007706,-0.042998,0.052101,-0.130531,0.043664,0.096089,-0.163562,0.078218,...,0.038726,0.026058,-0.026446,-0.025789,0.034434,-0.033185,-0.127778,0.121125,A1n,A1n_00050


In [29]:
# Phage-level query graph of the Bea dataset (phages = B1 nodes, proteins = B2 nodes).
bea_graph = phage_lvl_query_graph(bea_embeddings)

In [30]:
# The dataframe must be the one the graph was built from : it provides the phage names
# of the outputs. Passing the Ferriol frame here labelled the Bea phages with Ferriol names.
bea_pred = phage_lvl_run_prediction(bea_graph, bea_embeddings ,dico_models)

> Townsend

In [None]:
import os
import pandas as pd

# Folder of the in vitro datasets.
path_project = "/media/concha-eloko/Linux/PPT_clean/in_vitro"

# Headerless file : column 0 (the protein identifier) becomes the index, column 1281 is
# dropped, which leaves the embedding columns 1 ... 1280.
towndsend_embeddings = (
    pd.read_csv(f"{path_project}/Townsed_phages.esm2.embedding.1112.csv", sep=",", header=None)
    .drop(columns=[1281])
    .set_index([0])
)


***
> Run the predictions

In [None]:
# Run the predictions ferriol :
# dpo_embeddings is a flat frame (RangeIndex, string columns "index" / "Phage" /
# "Protein_name" next to the embedding). Index it by protein name and keep only the
# embedding columns, so that the keys are protein names and only numbers reach the graph.
# NOTE(review): make_query_graph is assumed to take a list of embedding vectors — confirm.
ferriol_embedding_columns = [str(i) for i in range(1, 1281)]
ferriol_by_protein = dpo_embeddings.set_index("Protein_name")[ferriol_embedding_columns]
ferriol_predictions = {}
for dpo, embedding in ferriol_by_protein.iterrows() :
    graph_dpo = TropiGAT_functions.make_query_graph([embedding.values])
    pred = TropiGAT_functions.run_prediction(graph_dpo,dico_models)
    ferriol_predictions[dpo] = pred

In [None]:
# Spot check : predictions of one protein, looked up by its protein name.
ferriol_predictions["K17alfa62__cds_66_Dpo_domain"]

In [None]:
# Format the per-protein results and print them.
# NOTE(review): sep is presumably the separator between the phage name and the rest of
# the protein name ("__" for this dataset) — confirm in TropiGAT_functions.
ferriol_pred_formated = TropiGAT_functions.format_predictions(ferriol_predictions , sep = "__")
TropiGAT_functions.clean_print(ferriol_pred_formated)

***

In [None]:
# Run the predictions Bea :
# bea_embeddings is a flat frame (RangeIndex, string columns "index" / "Phage" /
# "Protein_name" next to the embedding). Index it by protein name and keep only the
# embedding columns, so that the keys are protein names and only numbers reach the graph.
# NOTE(review): make_query_graph is assumed to take a list of embedding vectors — confirm.
bea_embedding_columns = [str(i) for i in range(1, 1281)]
bea_by_protein = bea_embeddings.set_index("Protein_name")[bea_embedding_columns]
bea_predictions = {}
for dpo, embedding in bea_by_protein.iterrows() :
    graph_dpo = TropiGAT_functions.make_query_graph([embedding.values])
    pred = TropiGAT_functions.run_prediction(graph_dpo,dico_models)
    bea_predictions[dpo] = pred

In [None]:
# Format the per-protein results and print them.
# NOTE(review): sep is presumably the separator between the phage name and the rest of
# the protein name ("_" for this dataset) — confirm in TropiGAT_functions.
bea_pred_formated = TropiGAT_functions.format_predictions(bea_predictions , sep = "_")
TropiGAT_functions.clean_print(bea_pred_formated)

In [None]:
# Per-protein predictions of the Townsend dataset : one single-protein query graph per
# row of the frame, keyed by the row label.
towndsend_predictions = {}
for dpo in towndsend_embeddings.index :
    embedding_values = towndsend_embeddings.loc[dpo].values
    graph_dpo = TropiGAT_functions.make_query_graph([embedding_values])
    pred = TropiGAT_functions.run_prediction(graph_dpo,dico_models)
    towndsend_predictions[dpo] = pred

In [None]:
# Format the per-protein results and print them.
# NOTE(review): sep is presumably the separator between the phage name and the rest of
# the protein name ("_" for this dataset) — confirm in TropiGAT_functions.
towndsend_pred_formated = TropiGAT_functions.format_predictions(towndsend_predictions , sep = "_")
TropiGAT_functions.clean_print(towndsend_pred_formated)

***

In [None]:
# NOTE(review): others_embeddings is not created in any of the visible cells ; the cell
# that loads it has to be restored for this notebook to run from a fresh kernel.
others_embeddings.info()

In [None]:
# Row labels of the frame ; they are used as protein identifiers by the prediction loop.
others_embeddings.index

In [None]:
# Per-protein predictions of the "Others" dataset. One identifier is skipped on purpose,
# and a protein whose prediction fails is reported (error, identifier) without stopping.
# NOTE(review): others_embeddings is not created in any of the visible cells.
other_predictions = {}
for index, dpo in enumerate(others_embeddings.index) :
    if dpo in ["MN781108.1_prot_QGZ15323.1_262"] :
        continue
    try :
        embedding_values = others_embeddings.loc[dpo].values
        graph_dpo = TropiGAT_functions.make_query_graph([embedding_values])
        pred = TropiGAT_functions.run_prediction(graph_dpo,dico_models)
        other_predictions[dpo] = pred
    except Exception as e :
        print(e, dpo)

In [None]:
# Format the per-protein results and print them. The formatted dictionary is the one
# printed, as for the other datasets (the raw other_predictions was printed before).
# NOTE(review): sep is presumably the separator between the genome accession and the
# protein part of the name ("_prot_" for this dataset) — confirm in TropiGAT_functions.
others_pred_formated = TropiGAT_functions.format_predictions(other_predictions , sep = "_prot_")
TropiGAT_functions.clean_print(others_pred_formated)

***
# Write the results : 

> Others : 

In [None]:
predictions = [other_predictions]

# One line per protein, proteins in sorted order : "<protein>\t<KL>:<score> ; ..." with
# the scores rounded to 3 decimals and listed from highest to lowest, or
# "<protein>\tNo hits".
with open("/media/concha-eloko/Linux/PPT_clean/TropiGAT.Others.all.results.classic_0802.tsv", "w") as outfile:
    for prediction in predictions:
        for prot, kl_scores in sorted(prediction.items()):
            if kl_scores == "No hits" or len(kl_scores) == 0:
                outfile.write(f"{prot}\tNo hits\n")
                continue
            hits = [f"{kltype}:{round(score, 3)}" for kltype, score in kl_scores.items()]
            # Order on the rounded score that was just written into the string
            hits.sort(key=lambda hit: float(hit.split(":")[1]), reverse=True)
            outfile.write(f"{prot}\t" + " ; ".join(hits) + "\n")

> Predictions : 

In [None]:
predictions = [ferriol_predictions , bea_predictions , towndsend_predictions]

# One line per protein, datasets one after the other : "<protein>\t<KL>:<score> ; ..."
# with the scores rounded to 3 decimals and listed from highest to lowest, or
# "<protein>\tNo hits".
with open("/media/concha-eloko/Linux/PPT_clean/TropiGAT.results.classic_0802.tsv", "w") as outfile:
    for prediction in predictions:
        for prot, kl_scores in prediction.items():
            if kl_scores == "No hits" or len(kl_scores) == 0:
                outfile.write(f"{prot}\tNo hits\n")
                continue
            hits = [f"{kltype}:{round(score, 3)}" for kltype, score in kl_scores.items()]
            # Order on the rounded score that was just written into the string
            hits.sort(key=lambda hit: float(hit.split(":")[1]), reverse=True)
            outfile.write(f"{prot}\t" + " ; ".join(hits) + "\n")

> Parse TropiGAT and Seqbased results :

In [32]:
import os
import pandas as pd

path_project = "/media/concha-eloko/Linux/PPT_clean"


def _read_predictions(file_name, predictions_column):
    """Read a headerless two-column TSV (protein, predictions) of the project folder."""
    return pd.read_csv(
        f"{path_project}/{file_name}",
        sep="\t",
        names=["protein", predictions_column],
    )


tropigat_results = _read_predictions("TropiGAT.results.classic_0802.tsv", "predictions_tropigat")

# Other result files of the sequence-based model that were used at some point :
#   Seqbased_model.results.bit50.0101.tsv
#   Seqbased_model.0101.results.tsv
#   Seqbased_model.1001.results.tsv
seqbased_results = _read_predictions("Seqbased_model.results.bit75.0802.tsv", "predictions_seqbased")

tropigat_results


Unnamed: 0,protein,predictions_tropigat
0,K15PH90__cds_55_Dpo_domain,KL27:0.999 ; KL3:0.998 ; KL60:0.996 ; KL15:0.9...
1,K80PH1317b__cds_54_Dpo_domain,KL60:0.863 ; KL128:0.802 ; KL53:0.59 ; KL48:0.541
2,K64PH164C4__cds_24_Dpo_domain,KL14:0.995 ; KL64:0.979 ; KL13:0.963 ; KL60:0....
3,K5lambda5__cds_196_Dpo_domain,KL18:0.998 ; KL60:0.995 ; KL25:0.903 ; KL127:0...
4,K11PH164C1__cds_46_Dpo_domain,KL123:0.922 ; KL145:0.811 ; KL127:0.71 ; KL57:...
...,...,...
235,NBNDMPCG_00163,KL13:0.958 ; KL2:0.954 ; KL57:0.94 ; KL60:0.92...
236,NJHLHPIG_00061,KL46:0.99 ; KL128:0.975 ; KL18:0.971 ; KL52:0....
237,HIIECEMK_00054,KL60:0.995 ; KL18:0.991 ; KL23:0.943 ; KL14:0....
238,PP187_gp237,KL110:0.955 ; KL114:0.931 ; KL70:0.897 ; KL9:0...


In [43]:
import statistics

# Number of KL types predicted for each protein, in increasing order. A protein written
# as "No hits" in the results file has no prediction and counts as 0, not as 1.
n_predictions = sorted(
    0 if hits == "No hits" else len(hits.split(";"))
    for hits in tropigat_results["predictions_tropigat"]
)

# Only the last expression of a cell is rendered, so the summary is printed explicitly.
print(
    f"mean={statistics.mean(n_predictions):.2f}",
    f"max={max(n_predictions)}",
    f"min={min(n_predictions)}",
    f"stdev={statistics.stdev(n_predictions):.2f}",
    f"median={statistics.median(n_predictions)}",
)

n_predictions


[3,
 4,
 4,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 6,
 6,
 6,
 6,
 6,
 6,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 15,
 15,
 15,
 15,
 15,
 15,
 15,
 15,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 17,
 17,
 17,
 17,
 17,
 17,
 17,
 17,
 18,
 18,
 18,
 18,
 18,
 18,
 18,
 18,
 18,
 18,
 18,
 18,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 20,
 20,
 20,
 20,
 20,
 20,

In [3]:
def _seqbased_protein_id(name):
    """Identifier shared with the TropiGAT results, derived from a seqbased protein name."""
    if "_A" in name:
        # keep what precedes the first "_A"
        return name.split("_A")[0]
    if "," in name:
        # keep what precedes the first comma, spaces turned into "__"
        return "__".join(name.split(",")[0].split(" "))
    return name


def _phage_of_protein(protein_id):
    """Phage part of an identifier : before the first "__" if there is one, else the first "_"."""
    separator = "__" if "__" in protein_id else "_"
    return protein_id.split(separator)[0]


# Common key of the two result tables ; for TropiGAT it is the name without its "_Dpo..." tail.
tropigat_results["protein_id"] = tropigat_results["protein"].map(lambda name: name.split("_Dpo")[0])
seqbased_results["protein_id"] = seqbased_results["protein"].map(_seqbased_protein_id)

# Inner join : only the proteins present in both tables are kept.
merged_df = tropigat_results.merge(seqbased_results, on='protein_id', how='inner')
merged_df["phage"] = merged_df["protein_id"].map(_phage_of_protein)

merged_df_sorted = merged_df.sort_values(by='phage', ascending=True)
merged_df_sorted

Unnamed: 0,protein_x,predictions_tropigat,protein_id,protein_y,predictions_seqbased,phage
173,A1a_00002,KL123:0.977 ; KL14:0.911 ; KL13:0.9 ; KL39:0.8...,A1a_00002,A1a_00002,KL102:0.691,A1a
158,A1a_00014,KL45:0.878 ; KL2:0.741 ; KL74:0.57 ; KL48:0.52...,A1a_00014,A1a_00014,KL151:0.698,A1a
159,A1b_00048,KL128:0.919 ; KL70:0.885 ; KL52:0.862 ; KL46:0...,A1b_00048,A1b_00048,KL157:0.729,A1b
151,A1b_00036,KL7:0.983 ; KL13:0.975 ; KL123:0.971 ; KL14:0....,A1b_00036,A1b_00036,KL102:0.691,A1b
182,A1c_00046,KL13:0.958 ; KL123:0.951 ; KL19:0.92 ; KL14:0....,A1c_00046,A1c_00046,KL102:0.691,A1c
...,...,...,...,...,...,...
181,S13a_00036,KL18:0.959 ; KL27:0.938 ; KL60:0.87 ; KL123:0....,S13a_00036,S13a_00036,KL38:0.822,S13a
143,S13b_00058,KL47:0.988 ; KL14:0.921 ; KL23:0.752 ; KL64:0....,S13b_00058,S13b_00058,KL63:0.867,S13b
175,S13c_00055,KL27:0.958 ; KL38:0.955 ; KL60:0.92 ; KL57:0.9...,S13c_00055,S13c_00055,No predictions,S13c
174,S13d_00057,KL14:0.999 ; KL52:0.973 ; KL38:0.954 ; KL27:0....,S13d_00057,S13d_00057,KL14:0.736,S13d


In [4]:
# Keep the phage, the shared protein identifier and the predictions of both models.
final_df = merged_df_sorted[["phage","protein_id", "predictions_seqbased", "predictions_tropigat"]]



In [5]:
# Write the combined table as a TSV with a header row and without the row labels.
# NOTE(review): path_project is reassigned by several cells ; this line relies on its
# latest value being the PPT_clean folder — confirm before running cells out of order.
final_df.to_csv(f"{path_project}/PPT_results.classic_0802.bit75.tsv", sep = "\t", header = True, index = False)

In [6]:
# Display the table that was written to disk.
final_df

Unnamed: 0,phage,protein_id,predictions_seqbased,predictions_tropigat
173,A1a,A1a_00002,KL102:0.691,KL123:0.977 ; KL14:0.911 ; KL13:0.9 ; KL39:0.8...
158,A1a,A1a_00014,KL151:0.698,KL45:0.878 ; KL2:0.741 ; KL74:0.57 ; KL48:0.52...
159,A1b,A1b_00048,KL157:0.729,KL128:0.919 ; KL70:0.885 ; KL52:0.862 ; KL46:0...
151,A1b,A1b_00036,KL102:0.691,KL7:0.983 ; KL13:0.975 ; KL123:0.971 ; KL14:0....
182,A1c,A1c_00046,KL102:0.691,KL13:0.958 ; KL123:0.951 ; KL19:0.92 ; KL14:0....
...,...,...,...,...
181,S13a,S13a_00036,KL38:0.822,KL18:0.959 ; KL27:0.938 ; KL60:0.87 ; KL123:0....
143,S13b,S13b_00058,KL63:0.867,KL47:0.988 ; KL14:0.921 ; KL23:0.752 ; KL64:0....
175,S13c,S13c_00055,No predictions,KL27:0.958 ; KL38:0.955 ; KL60:0.92 ; KL57:0.9...
174,S13d,S13d_00057,KL14:0.736,KL14:0.999 ; KL52:0.973 ; KL38:0.954 ; KL27:0....
