In [None]:
from torch_geometric.data import HeteroData, DataLoader
import torch_geometric.transforms as T
from torch_geometric.nn import HeteroConv , GATv2Conv 
#from torch_geometric.utils import negative_sampling
#from torch_geometric.loader import LinkNeighborLoader

import torch
from torch import nn 
import torch.nn.functional as F
import torch.optim as optim

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder , label_binarize , OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

import os 
import pandas as pd
import numpy as np
from tqdm import tqdm
from itertools import product
import random
from collections import Counter
import warnings
warnings.filterwarnings("ignore") 



In [None]:
import TropiGAT_models
import TropiGAT_graph

In [None]:
def build_graph_baseline(df_info) : 
    # **************************************************************
    # initialize the graph
    graph_data = HeteroData()
    # Indexation process  
    indexation_nodes_A = df_info["Infected_ancestor"].unique().tolist()  
    indexation_nodes_B1 = df_info["Phage"].unique().tolist()
    indexation_nodes_B2 = df_info["index"].unique().tolist() 
    ID_nodes_A = {item:index for index, item in enumerate(indexation_nodes_A)}
    ID_nodes_A_r = {index:item for index, item in enumerate(indexation_nodes_A)}
    ID_nodes_B1 = {item:index for index, item in enumerate(indexation_nodes_B1)}
    ID_nodes_B1_r = {index:item for index, item in enumerate(indexation_nodes_B1)}
    ID_nodes_B2 = {item:index for index, item in enumerate(indexation_nodes_B2)}
    ID_nodes_B2_r = {index:item for index, item in enumerate(indexation_nodes_B2)}
    # **************************************************************
    # Make the node feature file : 
    OHE = OneHotEncoder(sparse=False)
    one_hot_encoded = OHE.fit_transform(df_info[["KL_type_LCA"]])
    label_mapping = {label: one_hot_encoded[i] for i, label in enumerate(OHE.categories_[0])}
    embeddings_columns = [str(i) for i in range(1, 1281)]
    node_feature_A = torch.tensor([label_mapping[df_info[df_info["Infected_ancestor"] == ID_nodes_A_r[i]]["KL_type_LCA"].values[0]] for i in range(0,len(ID_nodes_A_r))], dtype=torch.float)
    node_feature_B1 = torch.zeros((len(ID_nodes_B1), 0), dtype=torch.float)
    node_feature_B2 = torch.tensor([df_info[df_info["index"] == ID_nodes_B2_r[i]][embeddings_columns].values[0].tolist() for i in range(0,len(ID_nodes_B2_r))] , dtype=torch.float)
    # feed the graph
    graph_data["A"].x = node_feature_A
    graph_data["B1"].x = node_feature_B1
    graph_data["B2"].x = node_feature_B2
    # **************************************************************
    # Make edge file
    # Node B1 (prophage) - Node A (bacteria) :
    edge_index_B1_A = []
    track_B1_A = set()
    for _, row in df_info.iterrows() :
        pair = [ID_nodes_B1[row["Phage"]], ID_nodes_A[row["Infected_ancestor"]]]
        if tuple(pair) not in track_B1_A : 
            track_B1_A.add(tuple(pair))
            edge_index_B1_A.append(pair)
        else :
            continue
    edge_index_B1_A = torch.tensor(edge_index_B1_A , dtype=torch.long)
    # Node A (bacteria) - Node B1 (prophage) :
    edge_index_A_B1 = []
    track_A_B1 = set()
    for _, row in df_info.iterrows() :
        pair = [ID_nodes_A[row["Infected_ancestor"]] , ID_nodes_B1[row["Phage"]]]
        if tuple(pair) not in track_A_B1 :
            track_A_B1.add(tuple(pair))
            edge_index_A_B1.append(pair)
    edge_index_A_B1 = torch.tensor(edge_index_A_B1 , dtype=torch.long)
    # Node B2 (depolymerase) - Node B1 (prophage) :
    edge_index_B2_B1 = []
    for phage in df_info.Phage.unique() :
        all_data_phage = df_info[df_info["Phage"] == phage]
        for _, row in all_data_phage.iterrows() :
            edge_index_B2_B1.append([ID_nodes_B2[row["index"]], ID_nodes_B1[row["Phage"]]])
    edge_index_B2_B1 = torch.tensor(edge_index_B2_B1 , dtype=torch.long)
    # feed the graph
    graph_data['B1', 'infects', 'A'].edge_index = edge_index_B1_A.t().contiguous()
    graph_data['B2', 'expressed', 'B1'].edge_index = edge_index_B2_B1.t().contiguous()
    # That one is optional  
    graph_data['A', 'harbors', 'B1'].edge_index = edge_index_A_B1.t().contiguous()
    dico_prophage_kltype_associated = {}
    for negative_index,phage in tqdm(enumerate(df_info["Phage"].unique().tolist())) :
        kltypes = set()
        dpos = df_info[df_info["Phage"] == phage]["index"]
        for dpo in dpos : 
            tmp_kltypes = df_info[df_info["index"] == dpo]["KL_type_LCA"].values
            kltypes.update(tmp_kltypes)
        dico_prophage_kltype_associated[phage] = kltypes
    return graph_data , dico_prophage_kltype_associated


def build_graph_masking(graph_data, dico_prophage_kltype_associated , df_info, KL_type, ratio , f_train, f_test, f_eval) : 
    # **************************************************************
    # Indexation process  
    indexation_nodes_A = df_info["Infected_ancestor"].unique().tolist()  
    indexation_nodes_B1 = df_info["Phage"].unique().tolist()
    indexation_nodes_B2 = df_info["index"].unique().tolist() 
    ID_nodes_A = {item:index for index, item in enumerate(indexation_nodes_A)}
    ID_nodes_A_r = {index:item for index, item in enumerate(indexation_nodes_A)}
    ID_nodes_B1 = {item:index for index, item in enumerate(indexation_nodes_B1)}
    ID_nodes_B1_r = {index:item for index, item in enumerate(indexation_nodes_B1)}
    ID_nodes_B2 = {item:index for index, item in enumerate(indexation_nodes_B2)}
    ID_nodes_B2_r = {index:item for index, item in enumerate(indexation_nodes_B2)}
    # **************************************************************
    # Make the Y file : 
    B1_labels = df_info.drop_duplicates(subset = ["Phage"], keep = "first")["KL_type_LCA"].apply(lambda x : 1 if x == KL_type else 0).to_list()
    graph_data["B1"].y = torch.tensor(B1_labels)
    # **************************************************************
    # Make mask files :
    # get the positive and negative indices lists :
    positive_indices = [index for index,label in enumerate(B1_labels) if label==1]
    negative_indices = []
    for negative_index,phage in enumerate(df_info["Phage"].unique().tolist()) :
        if KL_type not in dico_prophage_kltype_associated[ID_nodes_B1_r[negative_index]] :
            negative_indices.append(negative_index)
    # make the train, test, val lists : 
    n_samples = len(positive_indices)
    #train_indices, test_indices, val_indices = [],[],[]
    # make train : 
    train_pos = random.sample(positive_indices, int(f_train*n_samples))
    train_neg = random.sample(negative_indices, int(f_train*n_samples*ratio))
    train_indices = train_pos + train_neg
    train_mask = [1 if n in train_indices else 0 for n in range(0,len(B1_labels))]
    # make test : 
    pool_positives_test = list(set(positive_indices) - set(train_pos))
    pool_negatives_test = list(set(negative_indices) - set(train_neg))
    test_pos = random.sample(pool_positives_test, int(f_test*n_samples))
    test_neg = random.sample(pool_negatives_test, int(f_test*n_samples*ratio))
    test_indices = test_pos + test_neg
    test_mask = [1 if n in test_indices else 0 for n in range(0,len(B1_labels))]
    # make eval
    pool_positives_eval = list(set(positive_indices) - set(train_pos) - set(test_pos))
    pool_negatives_eval = list(set(negative_indices) - set(train_neg) - set(test_neg))
    eval_pos = random.sample(pool_positives_eval, int(f_eval*n_samples))
    eval_neg = random.sample(pool_negatives_eval, int(f_eval*n_samples*ratio))
    eval_indices = eval_pos + eval_neg
    eval_mask = [1 if n in eval_indices else 0 for n in range(0,len(B1_labels))]
    # Transfer data to graph :
    graph_data["B1"].train_mask = torch.tensor(train_mask)
    graph_data["B1"].test_mask = torch.tensor(test_mask)
    graph_data["B1"].eval_mask = torch.tensor(eval_mask)

    return graph_data


In [42]:
import os 
import pandas as pd
import numpy as np
from tqdm import tqdm
from itertools import product
import random
from collections import Counter
import warnings

import TropiGAT_functions
warnings.filterwarnings("ignore") 

# *****************************************************************************
# Load the Dataframes :
path_work = "/media/concha-eloko/Linux/PPT_clean"
#path_work = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023"

    # Open the DF
DF_info_old = pd.read_csv(f"{path_work}/TropiGATv2.final_df.tsv", sep = "\t" ,  header = 0)
DF_info_old = DF_info_old.drop_duplicates(subset = ["Protein_name"])

DF_info = pd.read_csv(f"{path_work}/TropiGATv2.final_df_v2.tsv", sep = "\t" ,  header = 0)
DF_info = DF_info.drop_duplicates(subset = ["Protein_name"])


In [None]:
df_prophages = DF_info.drop_duplicates(subset = ["Phage"], keep = "first")
dico_prophage_info = {row["Phage"] : {"prophage_strain" : row["prophage_id"] , "ancestor" : row["Infected_ancestor"]} for _,row in df_prophages.iterrows()}

def get_filtered_prophages(prophage) :
    combinations = []
    to_exclude = set()
    to_keep = set()
    to_keep.add(prophage)
    df_prophage_group = DF_info[(DF_info["prophage_id"] == dico_prophage_info[prophage]["prophage_strain"]) & (DF_info["Infected_ancestor"] == dico_prophage_info[prophage]["ancestor"])]
    if len(df_prophage_group) == 1 : 
        pass
    else :
        depo_set = set(df_prophage_group[df_prophage_group["Phage"] == prophage]["domain_seq"].values)
        for prophage_tmp in df_prophage_group["Phage"].unique().tolist() :
            if prophage_tmp != prophage :
                tmp_depo_set = set(df_prophage_group[df_prophage_group["Phage"] == prophage_tmp]["domain_seq"].values)
                if depo_set == tmp_depo_set :
                    to_exclude.add(prophage_tmp)
                else :
                    if tmp_depo_set not in combinations :
                        to_keep.add(prophage_tmp)
                        combinations.append(tmp_depo_set)
                    else :
                        to_exclude.add(prophage_tmp)
    return df_prophage_group , to_exclude , to_keep

good_prophages = set()
excluded_prophages = set()

for prophage, info_prophage in tqdm(dico_prophage_info.items()) :
    if prophage not in excluded_prophages and prophage not in good_prophages:
        _, excluded_members , kept_members = get_filtered_prophages(prophage) 
        good_prophages.update(kept_members)
        excluded_prophages.update(excluded_members)

DF_info_lvl_0_filtered = DF_info[DF_info["Phage"].isin(good_prophages)]
DF_info_lvl_0_final = DF_info_lvl_0_filtered[~DF_info_lvl_0_filtered["KL_type_LCA"].str.contains("\\|")]

In [44]:
# Ambiguous ones :
# level 0 :
DF_info_lvl_0_old = DF_info_old[~DF_info_old["KL_type_LCA"].str.contains("\\|")]
DF_info_lvl_0 = DF_info[~DF_info["KL_type_LCA"].str.contains("\\|")]


DF_info_lvl_0_filter1 = DF_info_lvl_0.drop_duplicates(subset = ["Infected_ancestor","index","prophage_id"] , keep = "first").reset_index(drop=True)
DF_info_lvl_0_old_filter1 = DF_info_lvl_0_old.drop_duplicates(subset = ["Infected_ancestor","index","prophage_id"] , keep = "first").reset_index(drop=True)

#dico_prophage_kltype = {row["Phage"]:row["KL_type_LCA"] for _,row in DF_info_lvl_0.drop_duplicates(subset = ["Phage"]).iterrows()}


> Regular DF : 

In [45]:
from collections import Counter
from statistics import mean 

Counter(DF_info["Phage"])
n_prophages = [value for key,value in dict(Counter(DF_info["Phage"])).items()]

mean(n_prophages)

1.2800825980852262

In [None]:
1.3359614542268945 ; 1.2800825980852262

In [46]:
Counter(DF_info["Phage"])

Counter({'GCF_003037395.1__phage28': 12,
         'GCF_015278455.1__phage8': 4,
         'GCF_015992305.1__phage9': 4,
         'GCF_018279325.1__phage9': 4,
         'GCF_003111885.2__phage9': 4,
         'GCF_021498005.1__phage9': 4,
         'GCF_003571545.1__phage18': 4,
         'GCF_013376535.2__phage4': 4,
         'GCF_003584585.1__phage9': 4,
         'GCF_009906895.1__phage6': 4,
         'GCF_008572815.1__phage18': 4,
         'GCF_015243235.1__phage6': 4,
         'GCF_002853435.3__phage4': 4,
         'GCF_900513915.1__phage21': 4,
         'GCF_001913175.1__phage4': 4,
         'GCF_900517675.1__phage18': 4,
         'GCF_900506935.1__phage17': 4,
         'GCF_003227135.1__phage13': 4,
         'GCF_900514355.1__phage12': 4,
         'GCF_900514255.1__phage13': 4,
         'GCF_900511545.1__phage16': 4,
         'GCF_003227635.1__phage15': 4,
         'GCF_021136845.1__phage2': 4,
         'GCF_900506945.1__phage10': 4,
         'GCF_003227105.1__phage16': 4,
         'G

In [15]:
DF_info[DF_info["Infected_ancestor"] == "n34153415"]#["prophage_id"].values

Unnamed: 0,Phage,Protein_name,KL_type_LCA,Infected_ancestor,index,Dataset,seq,domain_seq,1,2,...,1272,1273,1274,1275,1276,1277,1278,1279,1280,prophage_id
672,GCF_002970155.2__phage25,GCF_002970155.2__phage25__177,KL107,n34153415,ppt__116,ppt,MKHLFRHWRTSGAVIGSLLKKGSIAVLALLVVFLAGRIYESQRGPA...,MKHLFRHWRTSGAVIGSLLKKGSIAVLALLVVFLAGRIYESQRGPA...,0.044154,-0.082627,...,-0.064292,-0.044367,-0.030648,-0.056637,0.045530,0.120135,-0.101227,-0.000446,0.082778,prophage_15
674,GCF_002969855.2__phage23,GCF_002969855.2__phage23__171,KL107,n34153415,ppt__116,ppt,MKHLFRHWRTSGAVIGSLLKKGSIAVLALLVVFLAGRIYESQRGPA...,MKHLFRHWRTSGAVIGSLLKKGSIAVLALLVVFLAGRIYESQRGPA...,0.044154,-0.082627,...,-0.064292,-0.044367,-0.030648,-0.056637,0.045530,0.120135,-0.101227,-0.000446,0.082778,prophage_15
682,GCF_002969965.2__phage18,GCF_002969965.2__phage18__171,KL107,n34153415,ppt__116,ppt,MKHLFRHWRTSGAVIGSLLKKGSIAVLALLVVFLAGRIYESQRGPA...,MKHLFRHWRTSGAVIGSLLKKGSIAVLALLVVFLAGRIYESQRGPA...,0.044154,-0.082627,...,-0.064292,-0.044367,-0.030648,-0.056637,0.045530,0.120135,-0.101227,-0.000446,0.082778,prophage_15
687,GCF_002970475.2__phage27,GCF_002970475.2__phage27__170,KL107,n34153415,ppt__116,ppt,MKHLFRHWRTSGAVIGSLLKKGSIAVLALLVVFLAGRIYESQRGPA...,MKHLFRHWRTSGAVIGSLLKKGSIAVLALLVVFLAGRIYESQRGPA...,0.044154,-0.082627,...,-0.064292,-0.044367,-0.030648,-0.056637,0.045530,0.120135,-0.101227,-0.000446,0.082778,prophage_15
689,GCF_002184835.2__phage5,GCF_002184835.2__phage5__78,KL107,n34153415,ppt__116,ppt,MKHLFRHWRTSGAVIGSLLKKGSIAVLALLVVFLAGRIYESQRGPA...,MKHLFRHWRTSGAVIGSLLKKGSIAVLALLVVFLAGRIYESQRGPA...,0.044154,-0.082627,...,-0.064292,-0.044367,-0.030648,-0.056637,0.045530,0.120135,-0.101227,-0.000446,0.082778,prophage_15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19928,GCF_002969535.2__phage20,GCF_002969535.2__phage20__0,KL107,n34153415,anubis_return__53,anubis_return,MASIKELPRWEDEVYQIARGDKVEGGVGGIANMQAKTLAERTRYLK...,LLDPSAPEPMRMFNIKGKPRIELTAEELATFNATYSKYLKKGSQYL...,0.020271,0.077943,...,-0.019126,0.039506,0.068645,0.022851,0.172154,0.104167,-0.081774,0.036112,-0.053186,prophage_12
19930,GCF_002185055.2__phage18,GCF_002185055.2__phage18__0,KL107,n34153415,anubis_return__53,anubis_return,MASIKELPRWEDEVYQIARGDKVEGGVGGIANMQAKTLAERTRYLK...,LLDPSAPEPMRMFNIKGKPRIELTAEELATFNATYSKYLKKGSQYL...,0.020271,0.077943,...,-0.019126,0.039506,0.068645,0.022851,0.172154,0.104167,-0.081774,0.036112,-0.053186,prophage_12
19931,GCF_002973975.2__phage16,GCF_002973975.2__phage16__0,KL107,n34153415,anubis_return__53,anubis_return,MASIKELPRWEDEVYQIARGDKVEGGVGGIANMQAKTLAERTRYLK...,LLDPSAPEPMRMFNIKGKPRIELTAEELATFNATYSKYLKKGSQYL...,0.020271,0.077943,...,-0.019126,0.039506,0.068645,0.022851,0.172154,0.104167,-0.081774,0.036112,-0.053186,prophage_12
19932,GCF_002970875.2__phage22,GCF_002970875.2__phage22__0,KL107,n34153415,anubis_return__53,anubis_return,MASIKELPRWEDEVYQIARGDKVEGGVGGIANMQAKTLAERTRYLK...,LLDPSAPEPMRMFNIKGKPRIELTAEELATFNATYSKYLKKGSQYL...,0.020271,0.077943,...,-0.019126,0.039506,0.068645,0.022851,0.172154,0.104167,-0.081774,0.036112,-0.053186,prophage_12


In [None]:
DF_info_old

***

> The prophages DF :

In [47]:
df_prophages_old = DF_info_old.drop_duplicates(subset = ["Phage"], keep = "first")
df_prophages_old

Unnamed: 0,Phage,Protein_name,KL_type_LCA,Infected_ancestor,index,Dataset,seq,domain_seq,1,2,...,1272,1273,1274,1275,1276,1277,1278,1279,1280,prophage_id
0,GCF_016651625.1__phage29,GCF_016651625.1__phage29__142,KL43,GCF_016651625.1,ppt__1,ppt,MSVPNQTPYNIYTANGLTTVFTYEFYIISASDLRVSINGDVVTSGY...,KDFVNINDYWFPTDGDDFYPALNKALSVSPHVLIPPGKHYLKSTVS...,-0.018416,0.022387,...,0.004437,0.087907,0.015800,0.025778,0.065790,0.034045,-0.070899,0.016068,0.065339,prophage_12186
2,GCF_016651625.1__phage12,GCF_016651625.1__phage12__59,KL43,GCF_016651625.1,ppt__4,ppt,MSISKRNFLKAVSCAYFFYSFKALTKVNQPIEDYISTKDKNTWPSK...,NTWPSKVHRVEEFYTSTDRDYSDAILRGINYCSLNNCVLFFSDKYK...,0.026004,0.024372,...,-0.026018,0.018206,0.036751,-0.032549,0.064112,0.061520,-0.024423,-0.027998,0.028089,prophage_924
3,GCF_019928025.1__phage0,GCF_019928025.1__phage0__10,KL43,n1471,ppt__4,ppt,MSISKRNFLKAVSCAYFFYSFKALTKVNQPIEDYISTKDKNTWPSK...,NTWPSKVHRVEEFYTSTDRDYSDAILRGINYCSLNNCVLFFSDKYK...,0.026004,0.024372,...,-0.026018,0.018206,0.036751,-0.032549,0.064112,0.061520,-0.024423,-0.027998,0.028089,prophage_2929
4,GCF_004313505.1__phage4,GCF_004313505.1__phage4__113,KL14,GCF_004313505.1,anubis__5,anubis,MSEYDTGNPVPSASMPDAWDNMQSIDKFVNSSDETITTRTGQQLDT...,KAIFDAWLDFGIDWNGNESISLQLQTAVNYVSKLPYGGEIVLRPGV...,-0.023648,0.052674,...,-0.025991,0.068538,-0.051192,0.026481,0.069100,0.017813,-0.103797,0.018961,0.117058,prophage_11091
5,GCF_004313505.1__phage1,GCF_004313505.1__phage1__233,KL14,n830830,anubis__7,anubis,MAFNPELGSTSPAVLLDNAERLDKLVNGPAADVPDRGGDPLYSWRQ...,KAIFDAWLDFGIDWNGNESVSLQLQTAVNYVSKLPYGGEIVCRPGV...,-0.028778,0.060945,...,-0.020966,0.073692,-0.047220,0.027809,0.073462,0.025343,-0.100966,0.013383,0.116881,prophage_18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20133,GCF_002186895.1__phage9,GCF_002186895.1__phage9__5,KL57,GCF_002186895.1,anubis_return__4260,anubis_return,MRYRFIALALCLLSGSKVAISAGFDCSLANLSPTEKTICSNEYLSG...,ITDSPWLVKKIFSSDSFEGGINLEGMNVSSILTYQEIKNDLYIYIS...,0.073450,0.046651,...,0.035302,0.012151,0.003563,-0.022575,0.014130,0.063376,-0.050646,-0.085156,-0.010849,prophage_6002
20135,GCF_001905235.1__phage21,GCF_001905235.1__phage21__0,KL107,n35403540,anubis_return__4272,anubis_return,MLKHSLAIATCLAFSSSVMGNEANLLYTNTMQFPYKHNADGYMVFD...,VMGNEANLLYTNTMQFPYKHNADGYMVFDIHGKLVVPPEGHFDTLN...,0.076721,0.027635,...,0.042391,-0.004292,-0.004047,-0.011631,-0.026469,0.070159,-0.077212,-0.077950,-0.034630,prophage_313
20136,GCF_004312845.1__phage3,GCF_004312845.1__phage3__38,KL9,GCF_004312845.1,anubis_return__4275,anubis_return,MAILITGKSMTRLPESSSWEEEIELITRSERVAGGLDGPANRPLKS...,DAVIRRDLASDKGTSGVGKLGDKPLVAISYYKSKGQSDQDAVQAAF...,0.032196,0.048856,...,-0.016331,0.084711,0.056063,0.001793,0.073958,0.090169,-0.060105,0.023726,0.086452,prophage_12656
20137,GCF_900172635.1__phage2,GCF_900172635.1__phage2__1608,KL124,GCF_900172635.1,anubis_return__4287,anubis_return,MADLSISVISDQASESNQAGWWHPLDSFQGVEYYGLCKEYGTAGYH...,MADLSISVISDQASESNQAGWWHPLDSFQGVEYYGLCKEYGTAGYH...,-0.011089,-0.005328,...,0.034656,0.046130,0.012586,-0.021702,-0.023386,0.105700,-0.099147,-0.057367,0.091427,prophage_12780


In [48]:
df_prophages = DF_info.drop_duplicates(subset = ["Phage"], keep = "first")
df_prophages

Unnamed: 0,Phage,Protein_name,KL_type_LCA,Infected_ancestor,index,Dataset,seq,domain_seq,1,2,...,1272,1273,1274,1275,1276,1277,1278,1279,1280,prophage_id
0,GCF_902164905.1__phage1,GCF_902164905.1__phage1__34,KL41,GCF_902164905.1,minibatch__460,minibatch,MPATPQDRLYGLTTSVAVKPPVFISVDYDVARFGEQTITSKTPTDE...,QDRLYGLTTSVAVKPPVFISVDYDVARFGEQTITSKTPTDERTITT...,0.025276,0.053137,...,-0.011464,0.081105,0.012011,0.042917,0.009402,0.093175,-0.080562,0.000897,0.111854,prophage_11309
1,GCF_015910145.1__phage5,GCF_015910145.1__phage5__1351,KL122|KL106,n4984,minibatch__1084,minibatch,MTVSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVT...,VSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVTLT...,0.004905,0.040896,...,-0.040657,0.087288,0.022292,0.024434,0.025246,0.083449,-0.123537,0.047648,0.061250,prophage_5
2,GCF_900502315.1__phage13,GCF_900502315.1__phage13__356,KL122|KL106,n4984,minibatch__1084,minibatch,MTVSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVT...,VSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVTLT...,0.004905,0.040896,...,-0.040657,0.087288,0.022292,0.024434,0.025246,0.083449,-0.123537,0.047648,0.061250,prophage_5
3,GCF_004803085.1__phage3,GCF_004803085.1__phage3__24,KL122|KL106,n4984,minibatch__1084,minibatch,MTVSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVT...,VSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVTLT...,0.004905,0.040896,...,-0.040657,0.087288,0.022292,0.024434,0.025246,0.083449,-0.123537,0.047648,0.061250,prophage_5
4,GCF_017310305.1__phage5,GCF_017310305.1__phage5__1353,KL30,n4996,minibatch__1084,minibatch,MTVSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVT...,VSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVTLT...,0.004905,0.040896,...,-0.040657,0.087288,0.022292,0.024434,0.025246,0.083449,-0.123537,0.047648,0.061250,prophage_5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21341,GCF_000349245.1__phage1,GCF_000349245.1__phage1__213,KL53,n56795679,anubis_return__4209,anubis_return,MTRLPESSLWEDEIELISRSERVSGGLDGVANRPLKSLANRTRYLK...,LGRLWKRRFEGNEIRMAWARAKSLKQTSAPQDFAFKNCLKAAASIS...,-0.001302,0.069886,...,0.014471,0.095149,0.058190,0.047760,0.130289,0.055118,-0.061362,0.013399,0.027540,prophage_237
21342,GCF_900506765.1__phage17,GCF_900506765.1__phage17__90,KL149,GCF_900506765.1,anubis_return__4216,anubis_return,MMTTLNEHPQWESDIYLIKRSDLVAGGRGGIANMQAQQLANRTAFL...,NRRWFRRFTGNIRAEWSGIHDLSQSSAPVDSYIYRLLLASAVGSPD...,0.053704,0.083858,...,0.032803,0.109572,0.010032,0.024949,0.094129,0.028693,-0.061396,0.006824,0.046220,prophage_15598
21346,GCF_002186895.1__phage9,GCF_002186895.1__phage9__5,KL57,GCF_002186895.1,anubis_return__4260,anubis_return,MRYRFIALALCLLSGSKVAISAGFDCSLANLSPTEKTICSNEYLSG...,ITDSPWLVKKIFSSDSFEGGINLEGMNVSSILTYQEIKNDLYIYIS...,0.073450,0.046651,...,0.035302,0.012151,0.003563,-0.022575,0.014130,0.063376,-0.050646,-0.085156,-0.010849,prophage_6002
21347,GCF_004312845.1__phage3,GCF_004312845.1__phage3__38,KL9,GCF_004312845.1,anubis_return__4275,anubis_return,MAILITGKSMTRLPESSSWEEEIELITRSERVAGGLDGPANRPLKS...,DAVIRRDLASDKGTSGVGKLGDKPLVAISYYKSKGQSDQDAVQAAF...,0.032196,0.048856,...,-0.016331,0.084711,0.056063,0.001793,0.073958,0.090169,-0.060105,0.023726,0.086452,prophage_12656


***

> Clean KL type target DF :

In [49]:
DF_info_lvl_0_old

Unnamed: 0,Phage,Protein_name,KL_type_LCA,Infected_ancestor,index,Dataset,seq,domain_seq,1,2,...,1272,1273,1274,1275,1276,1277,1278,1279,1280,prophage_id
0,GCF_016651625.1__phage29,GCF_016651625.1__phage29__142,KL43,GCF_016651625.1,ppt__1,ppt,MSVPNQTPYNIYTANGLTTVFTYEFYIISASDLRVSINGDVVTSGY...,KDFVNINDYWFPTDGDDFYPALNKALSVSPHVLIPPGKHYLKSTVS...,-0.018416,0.022387,...,0.004437,0.087907,0.015800,0.025778,0.065790,0.034045,-0.070899,0.016068,0.065339,prophage_12186
1,GCF_016651625.1__phage29,GCF_016651625.1__phage29__150,KL43,GCF_016651625.1,anubis__0,anubis,MRANLIKTNFTAGEISPRLMGRVDIARYANGAKIIENAVCVVQGGV...,QAASPGAWTREDTVWTEEFGYPGAVTLYQQRLVLAGSPQYPQTIWW...,0.036016,0.005938,...,-0.037612,0.008772,0.010556,-0.049738,-0.012549,0.092624,-0.136602,-0.191378,0.135658,prophage_12186
2,GCF_016651625.1__phage12,GCF_016651625.1__phage12__59,KL43,GCF_016651625.1,ppt__4,ppt,MSISKRNFLKAVSCAYFFYSFKALTKVNQPIEDYISTKDKNTWPSK...,NTWPSKVHRVEEFYTSTDRDYSDAILRGINYCSLNNCVLFFSDKYK...,0.026004,0.024372,...,-0.026018,0.018206,0.036751,-0.032549,0.064112,0.061520,-0.024423,-0.027998,0.028089,prophage_924
3,GCF_019928025.1__phage0,GCF_019928025.1__phage0__10,KL43,n1471,ppt__4,ppt,MSISKRNFLKAVSCAYFFYSFKALTKVNQPIEDYISTKDKNTWPSK...,NTWPSKVHRVEEFYTSTDRDYSDAILRGINYCSLNNCVLFFSDKYK...,0.026004,0.024372,...,-0.026018,0.018206,0.036751,-0.032549,0.064112,0.061520,-0.024423,-0.027998,0.028089,prophage_2929
4,GCF_004313505.1__phage4,GCF_004313505.1__phage4__113,KL14,GCF_004313505.1,anubis__5,anubis,MSEYDTGNPVPSASMPDAWDNMQSIDKFVNSSDETITTRTGQQLDT...,KAIFDAWLDFGIDWNGNESISLQLQTAVNYVSKLPYGGEIVLRPGV...,-0.023648,0.052674,...,-0.025991,0.068538,-0.051192,0.026481,0.069100,0.017813,-0.103797,0.018961,0.117058,prophage_11091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20134,GCF_002248635.1__phage4,GCF_002248635.1__phage4__44,KL102,n320,anubis_return__4264,anubis_return,MVSLKGMGSTFRDCTALISLPSGLLDGCINLTSLTLTFSGCTSLAL...,MVSLKGMGSTFRDCTALISLPSGLLDGCINLTSLTLTFSGCTSLAL...,-0.000585,-0.087093,...,0.036749,0.048489,0.020484,0.023950,-0.048109,0.134457,-0.101326,0.088485,0.037368,prophage_3054
20135,GCF_001905235.1__phage21,GCF_001905235.1__phage21__0,KL107,n35403540,anubis_return__4272,anubis_return,MLKHSLAIATCLAFSSSVMGNEANLLYTNTMQFPYKHNADGYMVFD...,VMGNEANLLYTNTMQFPYKHNADGYMVFDIHGKLVVPPEGHFDTLN...,0.076721,0.027635,...,0.042391,-0.004292,-0.004047,-0.011631,-0.026469,0.070159,-0.077212,-0.077950,-0.034630,prophage_313
20136,GCF_004312845.1__phage3,GCF_004312845.1__phage3__38,KL9,GCF_004312845.1,anubis_return__4275,anubis_return,MAILITGKSMTRLPESSSWEEEIELITRSERVAGGLDGPANRPLKS...,DAVIRRDLASDKGTSGVGKLGDKPLVAISYYKSKGQSDQDAVQAAF...,0.032196,0.048856,...,-0.016331,0.084711,0.056063,0.001793,0.073958,0.090169,-0.060105,0.023726,0.086452,prophage_12656
20137,GCF_900172635.1__phage2,GCF_900172635.1__phage2__1608,KL124,GCF_900172635.1,anubis_return__4287,anubis_return,MADLSISVISDQASESNQAGWWHPLDSFQGVEYYGLCKEYGTAGYH...,MADLSISVISDQASESNQAGWWHPLDSFQGVEYYGLCKEYGTAGYH...,-0.011089,-0.005328,...,0.034656,0.046130,0.012586,-0.021702,-0.023386,0.105700,-0.099147,-0.057367,0.091427,prophage_12780


In [50]:
DF_info_lvl_0

Unnamed: 0,Phage,Protein_name,KL_type_LCA,Infected_ancestor,index,Dataset,seq,domain_seq,1,2,...,1272,1273,1274,1275,1276,1277,1278,1279,1280,prophage_id
0,GCF_902164905.1__phage1,GCF_902164905.1__phage1__34,KL41,GCF_902164905.1,minibatch__460,minibatch,MPATPQDRLYGLTTSVAVKPPVFISVDYDVARFGEQTITSKTPTDE...,QDRLYGLTTSVAVKPPVFISVDYDVARFGEQTITSKTPTDERTITT...,0.025276,0.053137,...,-0.011464,0.081105,0.012011,0.042917,0.009402,0.093175,-0.080562,0.000897,0.111854,prophage_11309
4,GCF_017310305.1__phage5,GCF_017310305.1__phage5__1353,KL30,n4996,minibatch__1084,minibatch,MTVSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVT...,VSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVTLT...,0.004905,0.040896,...,-0.040657,0.087288,0.022292,0.024434,0.025246,0.083449,-0.123537,0.047648,0.061250,prophage_5
10,GCF_001701985.1__phage2,GCF_001701985.1__phage2__357,KL30,n4988,minibatch__1084,minibatch,MTVSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVT...,VSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVTLT...,0.004905,0.040896,...,-0.040657,0.087288,0.022292,0.024434,0.025246,0.083449,-0.123537,0.047648,0.061250,prophage_6465
12,GCF_001611095.1__phage5,GCF_001611095.1__phage5__1365,KL30,n49894989,minibatch__1084,minibatch,MTVSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVT...,VSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVTLT...,0.004905,0.040896,...,-0.040657,0.087288,0.022292,0.024434,0.025246,0.083449,-0.123537,0.047648,0.061250,prophage_5
15,GCF_902156555.1__phage3,GCF_902156555.1__phage3__511,KL30,GCF_902156555.1,minibatch__1084,minibatch,MTVSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVT...,VSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVTLT...,0.004905,0.040896,...,-0.040657,0.087288,0.022292,0.024434,0.025246,0.083449,-0.123537,0.047648,0.061250,prophage_1828
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21342,GCF_900506765.1__phage17,GCF_900506765.1__phage17__90,KL149,GCF_900506765.1,anubis_return__4216,anubis_return,MMTTLNEHPQWESDIYLIKRSDLVAGGRGGIANMQAQQLANRTAFL...,NRRWFRRFTGNIRAEWSGIHDLSQSSAPVDSYIYRLLLASAVGSPD...,0.053704,0.083858,...,0.032803,0.109572,0.010032,0.024949,0.094129,0.028693,-0.061396,0.006824,0.046220,prophage_15598
21344,GCF_003255785.1__phage1,GCF_003255785.1__phage1__10,KL127,GCF_003255785.1,anubis_return__4239,anubis_return,MNGLNHNALTCSAVPIPPWERSLQTVEAQPYFSVSQASLVLEGIVF...,MNGLNHNALTCSAVPIPPWERSLQTVEAQPYFSVSQASLVLEGIVF...,0.010626,-0.025389,...,0.045372,0.009262,-0.008319,-0.050856,0.034115,0.101663,-0.108278,-0.135629,0.102486,prophage_3577
21346,GCF_002186895.1__phage9,GCF_002186895.1__phage9__5,KL57,GCF_002186895.1,anubis_return__4260,anubis_return,MRYRFIALALCLLSGSKVAISAGFDCSLANLSPTEKTICSNEYLSG...,ITDSPWLVKKIFSSDSFEGGINLEGMNVSSILTYQEIKNDLYIYIS...,0.073450,0.046651,...,0.035302,0.012151,0.003563,-0.022575,0.014130,0.063376,-0.050646,-0.085156,-0.010849,prophage_6002
21347,GCF_004312845.1__phage3,GCF_004312845.1__phage3__38,KL9,GCF_004312845.1,anubis_return__4275,anubis_return,MAILITGKSMTRLPESSSWEEEIELITRSERVAGGLDGPANRPLKS...,DAVIRRDLASDKGTSGVGKLGDKPLVAISYYKSKGQSDQDAVQAAF...,0.032196,0.048856,...,-0.016331,0.084711,0.056063,0.001793,0.073958,0.090169,-0.060105,0.023726,0.086452,prophage_12656


***
> The filtered DF : 

In [None]:
DF_info_lvl_0_filter1 = DF_info_lvl_0.drop_duplicates(subset = ["Infected_ancestor","index","prophage_id"] , keep = "first").reset_index(drop=True)
DF_info_lvl_0_filter1

In [None]:
DF_info_lvl_0_old_filter1 = DF_info_lvl_0_old.drop_duplicates(subset = ["Infected_ancestor","index","prophage_id"] , keep = "first").reset_index(drop=True)
DF_info_lvl_0_old_filter1.columns

In [71]:
DF_info_lvl_0_filter1.drop_duplicates(subset = ["Phage"])

Unnamed: 0,Phage,Protein_name,KL_type_LCA,Infected_ancestor,index,Dataset,seq,domain_seq,1,2,...,1272,1273,1274,1275,1276,1277,1278,1279,1280,prophage_id
0,GCF_902164905.1__phage1,GCF_902164905.1__phage1__34,KL41,GCF_902164905.1,minibatch__460,minibatch,MPATPQDRLYGLTTSVAVKPPVFISVDYDVARFGEQTITSKTPTDE...,QDRLYGLTTSVAVKPPVFISVDYDVARFGEQTITSKTPTDERTITT...,0.025276,0.053137,...,-0.011464,0.081105,0.012011,0.042917,0.009402,0.093175,-0.080562,0.000897,0.111854,prophage_11309
1,GCF_017310305.1__phage5,GCF_017310305.1__phage5__1353,KL30,n4996,minibatch__1084,minibatch,MTVSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVT...,VSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVTLT...,0.004905,0.040896,...,-0.040657,0.087288,0.022292,0.024434,0.025246,0.083449,-0.123537,0.047648,0.061250,prophage_5
2,GCF_001701985.1__phage2,GCF_001701985.1__phage2__357,KL30,n4988,minibatch__1084,minibatch,MTVSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVT...,VSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVTLT...,0.004905,0.040896,...,-0.040657,0.087288,0.022292,0.024434,0.025246,0.083449,-0.123537,0.047648,0.061250,prophage_6465
3,GCF_001611095.1__phage5,GCF_001611095.1__phage5__1365,KL30,n49894989,minibatch__1084,minibatch,MTVSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVT...,VSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVTLT...,0.004905,0.040896,...,-0.040657,0.087288,0.022292,0.024434,0.025246,0.083449,-0.123537,0.047648,0.061250,prophage_5
4,GCF_902156555.1__phage3,GCF_902156555.1__phage3__511,KL30,GCF_902156555.1,minibatch__1084,minibatch,MTVSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVT...,VSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVTLT...,0.004905,0.040896,...,-0.040657,0.087288,0.022292,0.024434,0.025246,0.083449,-0.123537,0.047648,0.061250,prophage_1828
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11292,GCF_000349245.1__phage1,GCF_000349245.1__phage1__213,KL53,n56795679,anubis_return__4209,anubis_return,MTRLPESSLWEDEIELISRSERVSGGLDGVANRPLKSLANRTRYLK...,LGRLWKRRFEGNEIRMAWARAKSLKQTSAPQDFAFKNCLKAAASIS...,-0.001302,0.069886,...,0.014471,0.095149,0.058190,0.047760,0.130289,0.055118,-0.061362,0.013399,0.027540,prophage_237
11293,GCF_900506765.1__phage17,GCF_900506765.1__phage17__90,KL149,GCF_900506765.1,anubis_return__4216,anubis_return,MMTTLNEHPQWESDIYLIKRSDLVAGGRGGIANMQAQQLANRTAFL...,NRRWFRRFTGNIRAEWSGIHDLSQSSAPVDSYIYRLLLASAVGSPD...,0.053704,0.083858,...,0.032803,0.109572,0.010032,0.024949,0.094129,0.028693,-0.061396,0.006824,0.046220,prophage_15598
11295,GCF_002186895.1__phage9,GCF_002186895.1__phage9__5,KL57,GCF_002186895.1,anubis_return__4260,anubis_return,MRYRFIALALCLLSGSKVAISAGFDCSLANLSPTEKTICSNEYLSG...,ITDSPWLVKKIFSSDSFEGGINLEGMNVSSILTYQEIKNDLYIYIS...,0.073450,0.046651,...,0.035302,0.012151,0.003563,-0.022575,0.014130,0.063376,-0.050646,-0.085156,-0.010849,prophage_6002
11296,GCF_004312845.1__phage3,GCF_004312845.1__phage3__38,KL9,GCF_004312845.1,anubis_return__4275,anubis_return,MAILITGKSMTRLPESSSWEEEIELITRSERVAGGLDGPANRPLKS...,DAVIRRDLASDKGTSGVGKLGDKPLVAISYYKSKGQSDQDAVQAAF...,0.032196,0.048856,...,-0.016331,0.084711,0.056063,0.001793,0.073958,0.090169,-0.060105,0.023726,0.086452,prophage_12656


***
> How can the filtering step be better ?

If remove prophages that are : <br>
1. from the same strain <br>
2. infecting the same ancestor <br>
3. holding the same set of depolymerase

In [101]:
df_prophages = DF_info.drop_duplicates(subset = ["Phage"], keep = "first")
dico_prophage_info = {row["Phage"] : {"prophage_strain" : row["prophage_id"] , "ancestor" : row["Infected_ancestor"]} for _,row in df_prophages.iterrows()}


In [102]:
df_prophages = DF_info.drop_duplicates(subset = ["Phage"], keep = "first")
dico_prophage_info = {row["Phage"] : {"prophage_strain" : row["prophage_id"] , "ancestor" : row["Infected_ancestor"]} for _,row in df_prophages.iterrows()}

def get_filtered_prophages(prophage) :
    combinations = []
    to_exclude = set()
    to_keep = set()
    to_keep.add(prophage)
    df_prophage_group = DF_info[(DF_info["prophage_id"] == dico_prophage_info[prophage]["prophage_strain"]) & (DF_info["Infected_ancestor"] == dico_prophage_info[prophage]["ancestor"])]
    if len(df_prophage_group) == 1 : 
        pass
    else :
        depo_set = set(df_prophage_group[df_prophage_group["Phage"] == prophage]["domain_seq"].values)
        for prophage_tmp in df_prophage_group["Phage"].unique().tolist() :
            if prophage_tmp != prophage :
                tmp_depo_set = set(df_prophage_group[df_prophage_group["Phage"] == prophage_tmp]["domain_seq"].values)
                if depo_set == tmp_depo_set :
                    to_exclude.add(prophage_tmp)
                else :
                    if tmp_depo_set not in combinations :
                        to_keep.add(prophage_tmp)
                        combinations.append(tmp_depo_set)
                    else :
                        to_exclude.add(prophage_tmp)
    return df_prophage_group , to_exclude , to_keep

good_prophages = set()
excluded_prophages = set()

for prophage, info_prophage in tqdm(dico_prophage_info.items()) :
    if prophage not in excluded_prophages and prophage not in good_prophages:
        _, excluded_members , kept_members = get_filtered_prophages(prophage) 
        good_prophages.update(kept_members)
        excluded_prophages.update(excluded_members)

DF_info_lvl_0_filtered = DF_info[DF_info["Phage"].isin(good_prophages)]
DF_info_lvl_0_final = DF_info_lvl_0_filtered[~DF_info_lvl_0_filtered["KL_type_LCA"].str.contains("\\|")]


In [103]:
good_prophages = set()
excluded_prophages = set()

for prophage, info_prophage in tqdm(dico_prophage_info.items()) :
    if prophage not in excluded_prophages and prophage not in good_prophages:
        _, excluded_members , kept_members = get_filtered_prophages(prophage) 
        good_prophages.update(kept_members)
        excluded_prophages.update(excluded_members)
        
        
    

100%|████████████████████████████████████| 15981/15981 [00:23<00:00, 689.77it/s]


In [107]:
DF_info_lvl_0_filtered = DF_info[DF_info["Phage"].isin(good_prophages)]
DF_info_lvl_0_final = DF_info_lvl_0_filtered[~DF_info_lvl_0_filtered["KL_type_LCA"].str.contains("\\|")]


In [108]:
DF_info_lvl_0_final

Unnamed: 0,Phage,Protein_name,KL_type_LCA,Infected_ancestor,index,Dataset,seq,domain_seq,1,2,...,1272,1273,1274,1275,1276,1277,1278,1279,1280,prophage_id
0,GCF_902164905.1__phage1,GCF_902164905.1__phage1__34,KL41,GCF_902164905.1,minibatch__460,minibatch,MPATPQDRLYGLTTSVAVKPPVFISVDYDVARFGEQTITSKTPTDE...,QDRLYGLTTSVAVKPPVFISVDYDVARFGEQTITSKTPTDERTITT...,0.025276,0.053137,...,-0.011464,0.081105,0.012011,0.042917,0.009402,0.093175,-0.080562,0.000897,0.111854,prophage_11309
4,GCF_017310305.1__phage5,GCF_017310305.1__phage5__1353,KL30,n4996,minibatch__1084,minibatch,MTVSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVT...,VSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVTLT...,0.004905,0.040896,...,-0.040657,0.087288,0.022292,0.024434,0.025246,0.083449,-0.123537,0.047648,0.061250,prophage_5
10,GCF_001701985.1__phage2,GCF_001701985.1__phage2__357,KL30,n4988,minibatch__1084,minibatch,MTVSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVT...,VSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVTLT...,0.004905,0.040896,...,-0.040657,0.087288,0.022292,0.024434,0.025246,0.083449,-0.123537,0.047648,0.061250,prophage_6465
12,GCF_001611095.1__phage5,GCF_001611095.1__phage5__1365,KL30,n49894989,minibatch__1084,minibatch,MTVSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVT...,VSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVTLT...,0.004905,0.040896,...,-0.040657,0.087288,0.022292,0.024434,0.025246,0.083449,-0.123537,0.047648,0.061250,prophage_5
15,GCF_902156555.1__phage3,GCF_902156555.1__phage3__511,KL30,GCF_902156555.1,minibatch__1084,minibatch,MTVSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVT...,VSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVTLT...,0.004905,0.040896,...,-0.040657,0.087288,0.022292,0.024434,0.025246,0.083449,-0.123537,0.047648,0.061250,prophage_1828
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21342,GCF_900506765.1__phage17,GCF_900506765.1__phage17__90,KL149,GCF_900506765.1,anubis_return__4216,anubis_return,MMTTLNEHPQWESDIYLIKRSDLVAGGRGGIANMQAQQLANRTAFL...,NRRWFRRFTGNIRAEWSGIHDLSQSSAPVDSYIYRLLLASAVGSPD...,0.053704,0.083858,...,0.032803,0.109572,0.010032,0.024949,0.094129,0.028693,-0.061396,0.006824,0.046220,prophage_15598
21344,GCF_003255785.1__phage1,GCF_003255785.1__phage1__10,KL127,GCF_003255785.1,anubis_return__4239,anubis_return,MNGLNHNALTCSAVPIPPWERSLQTVEAQPYFSVSQASLVLEGIVF...,MNGLNHNALTCSAVPIPPWERSLQTVEAQPYFSVSQASLVLEGIVF...,0.010626,-0.025389,...,0.045372,0.009262,-0.008319,-0.050856,0.034115,0.101663,-0.108278,-0.135629,0.102486,prophage_3577
21346,GCF_002186895.1__phage9,GCF_002186895.1__phage9__5,KL57,GCF_002186895.1,anubis_return__4260,anubis_return,MRYRFIALALCLLSGSKVAISAGFDCSLANLSPTEKTICSNEYLSG...,ITDSPWLVKKIFSSDSFEGGINLEGMNVSSILTYQEIKNDLYIYIS...,0.073450,0.046651,...,0.035302,0.012151,0.003563,-0.022575,0.014130,0.063376,-0.050646,-0.085156,-0.010849,prophage_6002
21347,GCF_004312845.1__phage3,GCF_004312845.1__phage3__38,KL9,GCF_004312845.1,anubis_return__4275,anubis_return,MAILITGKSMTRLPESSSWEEEIELITRSERVAGGLDGPANRPLKS...,DAVIRRDLASDKGTSGVGKLGDKPLVAISYYKSKGQSDQDAVQAAF...,0.032196,0.048856,...,-0.016331,0.084711,0.056063,0.001793,0.073958,0.090169,-0.060105,0.023726,0.086452,prophage_12656


In [104]:
len(good_prophages) , len(excluded_prophages) , len(dico_prophage_info)

(9261, 6723, 15981)

In [96]:
9261 + 6723

15984

In [None]:
{'GCF_011365805.1__phage0',
 'GCF_020520185.1__phage35',
 'GCF_903993195.1__phage23'}

In [98]:
good_prophages_debug = set()
excluded_prophages_debug = set()

decipher_prophage = "GCF_011365805.1__phage0"

for prophage, info_prophage in tqdm(dico_prophage_info.items()) :
    if prophage not in excluded_prophages_debug and prophage not in good_prophages_debug:
        _, excluded_members , kept_members = get_filtered_prophages(prophage)
        good_prophages_debug.update(kept_members)
        excluded_prophages_debug.update(excluded_members)
        if decipher_prophage in excluded_members :
            print("exclusion", prophage, "\n")
            print("excluded members", excluded_members)
            print("kept members", kept_members)
            print(_)
            print("\n")
        if decipher_prophage in kept_members :
            print("inclusion", prophage, "\n")
            print("excluded members", excluded_members)
            print("kept members", kept_members)
            print(_)
            print("\n")
        if decipher_prophage in good_prophages_debug and decipher_prophage in excluded_prophages_debug :
            break
        #good_prophages_debug.update(kept_members)
        #excluded_prophages_debug.update(excluded_members)
        

  7%|██▍                                  | 1060/15981 [00:01<00:25, 589.66it/s]

inclusion GCF_011365805.1__phage0 

excluded members {'GCF_011366085.1__phage4', 'GCF_011366125.1__phage1', 'GCF_011365945.1__phage4'}
kept members {'GCF_011365805.1__phage0'}
                        Phage                  Protein_name KL_type_LCA  \
999   GCF_011365805.1__phage0  GCF_011365805.1__phage0__921        KL55   
1000  GCF_011366085.1__phage4  GCF_011366085.1__phage4__181        KL55   
1001  GCF_011366125.1__phage1  GCF_011366125.1__phage1__921        KL55   
1002  GCF_011365945.1__phage4  GCF_011365945.1__phage4__182        KL55   

     Infected_ancestor     index Dataset  \
999          n57555755  ppt__224     ppt   
1000         n57555755  ppt__224     ppt   
1001         n57555755  ppt__224     ppt   
1002         n57555755  ppt__224     ppt   

                                                    seq  \
999   MFTGLCAFPLTPLHQQNIDEKAFIRILARLTDAGVDSLGILGSTGS...   
1000  MFTGLCAFPLTPLHQQNIDEKAFIRILARLTDAGVDSLGILGSTGS...   
1001  MFTGLCAFPLTPLHQQNIDEKAFIRILARLTDAGVDSLGILGST

 70%|█████████████████████████▎          | 11230/15981 [00:17<00:07, 655.73it/s]

exclusion GCF_011365825.1__phage0 

excluded members {'GCF_011366125.1__phage1', 'GCF_011365805.1__phage0'}
kept members {'GCF_011365825.1__phage0'}
                         Phage                  Protein_name KL_type_LCA  \
14447  GCF_011365805.1__phage0  GCF_011365805.1__phage0__933        KL55   
14453  GCF_011365825.1__phage0  GCF_011365825.1__phage0__933        KL55   
14454  GCF_011366125.1__phage1  GCF_011366125.1__phage1__933        KL55   

      Infected_ancestor        index Dataset  \
14447         n57575757  anubis__462  anubis   
14453         n57575757  anubis__462  anubis   
14454         n57575757  anubis__462  anubis   

                                                     seq  \
14447  MRANLIKTNFTAGEISPRLMGRVDIDRYANGAKTLENSVVVVQGGV...   
14453  MRANLIKTNFTAGEISPRLMGRVDIDRYANGAKTLENSVVVVQGGV...   
14454  MRANLIKTNFTAGEISPRLMGRVDIDRYANGAKTLENSVVVVQGGV...   

                                              domain_seq         1         2  \
14447  ATQAASPGAWTREDSVWTDEFGYPG




In [99]:
DF_info[DF_info["Infected_ancestor"] == "n57575757"]

Unnamed: 0,Phage,Protein_name,KL_type_LCA,Infected_ancestor,index,Dataset,seq,domain_seq,1,2,...,1272,1273,1274,1275,1276,1277,1278,1279,1280,prophage_id
14447,GCF_011365805.1__phage0,GCF_011365805.1__phage0__933,KL55,n57575757,anubis__462,anubis,MRANLIKTNFTAGEISPRLMGRVDIDRYANGAKTLENSVVVVQGGV...,ATQAASPGAWTREDSVWTDEFGYPGAVTLYQQRLVLAGSPQYPQTI...,0.029423,0.015319,...,-0.03547,0.02136,-0.008174,-0.049825,-0.014841,0.093094,-0.147822,-0.187212,0.145852,prophage_1812
14453,GCF_011365825.1__phage0,GCF_011365825.1__phage0__933,KL55,n57575757,anubis__462,anubis,MRANLIKTNFTAGEISPRLMGRVDIDRYANGAKTLENSVVVVQGGV...,ATQAASPGAWTREDSVWTDEFGYPGAVTLYQQRLVLAGSPQYPQTI...,0.029423,0.015319,...,-0.03547,0.02136,-0.008174,-0.049825,-0.014841,0.093094,-0.147822,-0.187212,0.145852,prophage_1812
14454,GCF_011366125.1__phage1,GCF_011366125.1__phage1__933,KL55,n57575757,anubis__462,anubis,MRANLIKTNFTAGEISPRLMGRVDIDRYANGAKTLENSVVVVQGGV...,ATQAASPGAWTREDSVWTDEFGYPGAVTLYQQRLVLAGSPQYPQTI...,0.029423,0.015319,...,-0.03547,0.02136,-0.008174,-0.049825,-0.014841,0.093094,-0.147822,-0.187212,0.145852,prophage_1812


In [100]:
DF_info[DF_info["Infected_ancestor"] == "n57555755"]

Unnamed: 0,Phage,Protein_name,KL_type_LCA,Infected_ancestor,index,Dataset,seq,domain_seq,1,2,...,1272,1273,1274,1275,1276,1277,1278,1279,1280,prophage_id
268,GCF_011365755.1__phage2,GCF_011365755.1__phage2__1092,KL55,n57555755,minibatch__230,minibatch,MAFNPELGSSSPEVLLDNAKRLDELTNGPAATVPDRAGEPLDSWRK...,ELGSSSPEVLLDNAKRLDELTNGPAATVPDRAGEPLDSWRKMQEDN...,-0.003495,-0.032741,...,-0.0759,-0.010408,0.008827,-0.001425,-0.104605,0.118043,-0.125878,0.012201,0.148335,prophage_202
270,GCF_011365825.1__phage4,GCF_011365825.1__phage4__108,KL55,n57555755,minibatch__230,minibatch,MAFNPELGSSSPEVLLDNAKRLDELTNGPAATVPDRAGEPLDSWRK...,ELGSSSPEVLLDNAKRLDELTNGPAATVPDRAGEPLDSWRKMQEDN...,-0.003495,-0.032741,...,-0.0759,-0.010408,0.008827,-0.001425,-0.104605,0.118043,-0.125878,0.012201,0.148335,prophage_202
271,GCF_011365995.1__phage2,GCF_011365995.1__phage2__1089,KL55,n57555755,minibatch__230,minibatch,MAFNPELGSSSPEVLLDNAKRLDELTNGPAATVPDRAGEPLDSWRK...,ELGSSSPEVLLDNAKRLDELTNGPAATVPDRAGEPLDSWRKMQEDN...,-0.003495,-0.032741,...,-0.0759,-0.010408,0.008827,-0.001425,-0.104605,0.118043,-0.125878,0.012201,0.148335,prophage_202
273,GCF_011366055.1__phage2,GCF_011366055.1__phage2__1092,KL55,n57555755,minibatch__230,minibatch,MAFNPELGSSSPEVLLDNAKRLDELTNGPAATVPDRAGEPLDSWRK...,ELGSSSPEVLLDNAKRLDELTNGPAATVPDRAGEPLDSWRKMQEDN...,-0.003495,-0.032741,...,-0.0759,-0.010408,0.008827,-0.001425,-0.104605,0.118043,-0.125878,0.012201,0.148335,prophage_202
274,GCF_011366085.1__phage0,GCF_011366085.1__phage0__145,KL55,n57555755,minibatch__230,minibatch,MAFNPELGSSSPEVLLDNAKRLDELTNGPAATVPDRAGEPLDSWRK...,ELGSSSPEVLLDNAKRLDELTNGPAATVPDRAGEPLDSWRKMQEDN...,-0.003495,-0.032741,...,-0.0759,-0.010408,0.008827,-0.001425,-0.104605,0.118043,-0.125878,0.012201,0.148335,prophage_202
275,GCF_011365985.1__phage0,GCF_011365985.1__phage0__1122,KL55,n57555755,minibatch__230,minibatch,MAFNPELGSSSPEVLLDNAKRLDELTNGPAATVPDRAGEPLDSWRK...,ELGSSSPEVLLDNAKRLDELTNGPAATVPDRAGEPLDSWRKMQEDN...,-0.003495,-0.032741,...,-0.0759,-0.010408,0.008827,-0.001425,-0.104605,0.118043,-0.125878,0.012201,0.148335,prophage_202
276,GCF_011365965.1__phage2,GCF_011365965.1__phage2__1092,KL55,n57555755,minibatch__230,minibatch,MAFNPELGSSSPEVLLDNAKRLDELTNGPAATVPDRAGEPLDSWRK...,ELGSSSPEVLLDNAKRLDELTNGPAATVPDRAGEPLDSWRKMQEDN...,-0.003495,-0.032741,...,-0.0759,-0.010408,0.008827,-0.001425,-0.104605,0.118043,-0.125878,0.012201,0.148335,prophage_202
277,GCF_011365835.1__phage2,GCF_011365835.1__phage2__1092,KL55,n57555755,minibatch__230,minibatch,MAFNPELGSSSPEVLLDNAKRLDELTNGPAATVPDRAGEPLDSWRK...,ELGSSSPEVLLDNAKRLDELTNGPAATVPDRAGEPLDSWRKMQEDN...,-0.003495,-0.032741,...,-0.0759,-0.010408,0.008827,-0.001425,-0.104605,0.118043,-0.125878,0.012201,0.148335,prophage_202
278,GCF_011365805.1__phage4,GCF_011365805.1__phage4__108,KL55,n57555755,minibatch__230,minibatch,MAFNPELGSSSPEVLLDNAKRLDELTNGPAATVPDRAGEPLDSWRK...,ELGSSSPEVLLDNAKRLDELTNGPAATVPDRAGEPLDSWRKMQEDN...,-0.003495,-0.032741,...,-0.0759,-0.010408,0.008827,-0.001425,-0.104605,0.118043,-0.125878,0.012201,0.148335,prophage_202
279,GCF_011366045.1__phage0,GCF_011366045.1__phage0__105,KL55,n57555755,minibatch__230,minibatch,MAFNPELGSSSPEVLLDNAKRLDELTNGPAATVPDRAGEPLDSWRK...,ELGSSSPEVLLDNAKRLDELTNGPAATVPDRAGEPLDSWRKMQEDN...,-0.003495,-0.032741,...,-0.0759,-0.010408,0.008827,-0.001425,-0.104605,0.118043,-0.125878,0.012201,0.148335,prophage_202


In [97]:
decipher_prophages = good_prophages.intersection(excluded_prophages)
decipher_prophages

{'GCF_011365805.1__phage0',
 'GCF_020520185.1__phage35',
 'GCF_903993195.1__phage23'}

***
## test :

In [80]:
test_prophage = "GCF_000281475.1__phage16"

df_test , eclude , keep = get_filtered_prophages(test_prophage) 
df_test

Unnamed: 0,Phage,Protein_name,KL_type_LCA,Infected_ancestor,index,Dataset,seq,domain_seq,1,2,...,1272,1273,1274,1275,1276,1277,1278,1279,1280,prophage_id
4221,GCF_902156275.1__phage6,GCF_902156275.1__phage6__1514,KL107,n2885,anubis__13,anubis,MSEYDTGNPVPSASMPDAWDNMQSIDKFVNSSEETITTRTGEQLDT...,ALSNEVEIYRNGNRDNPRDRVLYREFSRIGRNGALTERIVKDIPTG...,-0.017392,0.117156,...,0.001901,0.111698,-0.000487,0.01816,0.046598,0.024505,-0.104585,0.059359,0.046143,prophage_14
4355,GCF_009867115.1__phage10,GCF_009867115.1__phage10__2901,KL107,n2885,anubis__13,anubis,MSEYDTGNPVPSASMPDAWDNMQSIDKFVNSSEETITTRTGEQLDT...,ALSNEVEIYRNGNRDNPRDRVLYREFSRIGRNGALTERIVKDIPTG...,-0.017392,0.117156,...,0.001901,0.111698,-0.000487,0.01816,0.046598,0.024505,-0.104585,0.059359,0.046143,prophage_14
4580,GCF_000281535.2__phage5,GCF_000281535.2__phage5__2212,KL107,n2885,anubis__13,anubis,MSEYDTGNPVPSASMPDAWDNMQSIDKFVNSSEETITTRTGEQLDT...,ALSNEVEIYRNGNRDNPRDRVLYREFSRIGRNGALTERIVKDIPTG...,-0.017392,0.117156,...,0.001901,0.111698,-0.000487,0.01816,0.046598,0.024505,-0.104585,0.059359,0.046143,prophage_14
7114,GCF_000281475.1__phage16,GCF_000281475.1__phage16__49,KL107,n2885,ppt__128,ppt,MSSAETTNPNAPVSLIEKWDKTFAESSKVDHRKVTFQNRYGITLVG...,MSSAETTNPNAPVSLIEKWDKTFAESSKVDHRKVTFQNRYGITLVG...,0.041171,-0.021027,...,-0.037412,-0.032235,-0.063208,-0.041684,0.041312,0.050056,-0.116315,0.004106,0.06939,prophage_14
7116,GCF_000281495.1__phage10,GCF_000281495.1__phage10__49,KL107,n2885,ppt__128,ppt,MSSAETTNPNAPVSLIEKWDKTFAESSKVDHRKVTFQNRYGITLVG...,MSSAETTNPNAPVSLIEKWDKTFAESSKVDHRKVTFQNRYGITLVG...,0.041171,-0.021027,...,-0.037412,-0.032235,-0.063208,-0.041684,0.041312,0.050056,-0.116315,0.004106,0.06939,prophage_14
7132,GCF_001666005.1__phage13,GCF_001666005.1__phage13__48,KL107,n2885,ppt__128,ppt,MSSAETTNPNAPVSLIEKWDKTFAESSKVDHRKVTFQNRYGITLVG...,MSSAETTNPNAPVSLIEKWDKTFAESSKVDHRKVTFQNRYGITLVG...,0.041171,-0.021027,...,-0.037412,-0.032235,-0.063208,-0.041684,0.041312,0.050056,-0.116315,0.004106,0.06939,prophage_14
15699,GCF_000281515.1__phage26,GCF_000281515.1__phage26__17,KL107,n2885,anubis__807,anubis,MSEYDTGNPVPSASMPDAWDNMQSIDKFVNSSEETITTRTGEQLDT...,ALSNEVEIYRNGNRDNPRDRVLYREFSRIGRNGALTERIVKDIPTG...,0.018501,0.075551,...,0.01014,0.068193,0.070125,0.025948,0.178316,0.060628,-0.08173,0.018712,-0.024282,prophage_14


In [81]:
df_test[df_test["Phage"] == test_prophage]

Unnamed: 0,Phage,Protein_name,KL_type_LCA,Infected_ancestor,index,Dataset,seq,domain_seq,1,2,...,1272,1273,1274,1275,1276,1277,1278,1279,1280,prophage_id
7114,GCF_000281475.1__phage16,GCF_000281475.1__phage16__49,KL107,n2885,ppt__128,ppt,MSSAETTNPNAPVSLIEKWDKTFAESSKVDHRKVTFQNRYGITLVG...,MSSAETTNPNAPVSLIEKWDKTFAESSKVDHRKVTFQNRYGITLVG...,0.041171,-0.021027,...,-0.037412,-0.032235,-0.063208,-0.041684,0.041312,0.050056,-0.116315,0.004106,0.06939,prophage_14


In [62]:
df_test[df_test["Phage"] == "GCF_002180525.1__phage5"]

Unnamed: 0,Phage,Protein_name,KL_type_LCA,Infected_ancestor,index,Dataset,seq,domain_seq,1,2,...,1272,1273,1274,1275,1276,1277,1278,1279,1280,prophage_id
186,GCF_002180525.1__phage5,GCF_002180525.1__phage5__123,KL23,34.2/100,minibatch__1368,minibatch,MVENDTSSVEYQLSTSTGPFSIPFYFIENGHIVAELYTQNGDDFNK...,DTSSVEYQLSTSTGPFSIPFYFIENGHIVAELYTQNGDDFNKTTLN...,0.003456,0.013041,...,-0.030535,0.051649,0.023312,0.019689,4.3e-05,0.093576,-0.108828,0.020945,0.080411,prophage_401
13315,GCF_002180525.1__phage5,GCF_002180525.1__phage5__108,KL23,34.2/100,anubis__233,anubis,MAKIRPIKRSFNAGILSPVMYGQVDFDKWASAVKYMKNFIPLPQGP...,IVTELPPSVRNTVGKTYKYAFGDWSDVLRYPQFAAFFRGRLVFAGR...,0.044393,0.003582,...,-0.029456,0.040838,0.020895,-0.023312,0.013885,0.113177,-0.058832,-0.185573,0.130059,prophage_401


In [63]:
df_test[df_test["Phage"] == "GCF_019702965.1__phage10"]

Unnamed: 0,Phage,Protein_name,KL_type_LCA,Infected_ancestor,index,Dataset,seq,domain_seq,1,2,...,1272,1273,1274,1275,1276,1277,1278,1279,1280,prophage_id
13305,GCF_019702965.1__phage10,GCF_019702965.1__phage10__3183,KL23,34.2/100,ppt__1187,ppt,MVENDTSSVEYQLSTSTGPFSIPFYFIENGHIVAELYTQNGDDFNK...,WDSINLLDFVYATDVVDGFVDYGLGLNRAIAAMSSLGSTSVEHIPR...,0.023919,0.06722,...,-0.015518,0.089377,0.00309,0.02867,0.003768,0.03916,-0.096461,0.061654,0.080934,prophage_401
13330,GCF_019702965.1__phage10,GCF_019702965.1__phage10__3198,KL23,34.2/100,anubis__233,anubis,MAKIRPIKRSFNAGILSPVMYGQVDFDKWASAVKYMKNFIPLPQGP...,IVTELPPSVRNTVGKTYKYAFGDWSDVLRYPQFAAFFRGRLVFAGR...,0.044393,0.003582,...,-0.029456,0.040838,0.020895,-0.023312,0.013885,0.113177,-0.058832,-0.185573,0.130059,prophage_401


In [64]:
df_test[df_test["Phage"] == "GCF_019703085.1__phage8"]

Unnamed: 0,Phage,Protein_name,KL_type_LCA,Infected_ancestor,index,Dataset,seq,domain_seq,1,2,...,1272,1273,1274,1275,1276,1277,1278,1279,1280,prophage_id
244,GCF_019703085.1__phage8,GCF_019703085.1__phage8__3172,KL23,34.2/100,minibatch__1902,minibatch,MRAHPVGWAAMSHGPVGGGEFIARKGTAVDDGGYICVPTGQTEYYW...,MRAHPVGWAAMSHGPVGGGEFIARKGTAVDDGGYICVPTGQTEYYW...,0.011094,0.055557,...,-0.021085,0.107932,0.006993,0.014995,0.040744,0.068817,-0.083601,0.052713,0.063305,prophage_401
13302,GCF_019703085.1__phage8,GCF_019703085.1__phage8__3175,KL23,34.2/100,ppt__1187,ppt,MVENDTSSVEYQLSTSTGPFSIPFYFIENGHIVAELYTQNGDDFNK...,WDSINLLDFVYATDVVDGFVDYGLGLNRAIAAMSSLGSTSVEHIPR...,0.023919,0.06722,...,-0.015518,0.089377,0.00309,0.02867,0.003768,0.03916,-0.096461,0.061654,0.080934,prophage_401
13318,GCF_019703085.1__phage8,GCF_019703085.1__phage8__3190,KL23,34.2/100,anubis__233,anubis,MAKIRPIKRSFNAGILSPVMYGQVDFDKWASAVKYMKNFIPLPQGP...,IVTELPPSVRNTVGKTYKYAFGDWSDVLRYPQFAAFFRGRLVFAGR...,0.044393,0.003582,...,-0.029456,0.040838,0.020895,-0.023312,0.013885,0.113177,-0.058832,-0.185573,0.130059,prophage_401


In [65]:
df_test[df_test["Phage"] == "GCF_019703165.1__phage10"]

Unnamed: 0,Phage,Protein_name,KL_type_LCA,Infected_ancestor,index,Dataset,seq,domain_seq,1,2,...,1272,1273,1274,1275,1276,1277,1278,1279,1280,prophage_id
13322,GCF_019703165.1__phage10,GCF_019703165.1__phage10__3270,KL23,34.2/100,anubis__233,anubis,MAKIRPIKRSFNAGILSPVMYGQVDFDKWASAVKYMKNFIPLPQGP...,IVTELPPSVRNTVGKTYKYAFGDWSDVLRYPQFAAFFRGRLVFAGR...,0.044393,0.003582,...,-0.029456,0.040838,0.020895,-0.023312,0.013885,0.113177,-0.058832,-0.185573,0.130059,prophage_401
16305,GCF_019703165.1__phage10,GCF_019703165.1__phage10__3255,KL23,34.2/100,ppt__5058,ppt,MVENDTSSVEYQLSTSTGPFSIPFYFIENGHIVAELYTQNGDDFNK...,DATLRSNLGSSEEGMGDTLLAVKQPYTGAVARTQHDKNWDSINLLD...,0.038982,0.066496,...,-0.016902,0.084711,0.004791,0.025544,-0.005567,0.040983,-0.106026,0.057074,0.077234,prophage_401


In [82]:
eclude , keep

({'GCF_000281495.1__phage10', 'GCF_001666005.1__phage13'},
 {'GCF_000281475.1__phage16',
  'GCF_000281515.1__phage26',
  'GCF_902156275.1__phage6'})