# Prepare the data for the ML pipeline

***

## 1. Generate the clean dataframes 

In [None]:
import os
import random
import subprocess
from tqdm import tqdm
from random import sample
import time
from Bio import SeqIO
import pandas as pd

# Absolute, machine-specific locations used by this notebook.
# NOTE(review): path_phageboost_pred and path_database_PFAM are not used in the
# visible cells — confirm they are needed.
path_phageboost_pred="/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_decipher/phageboost/phageboost_prediction"

# Root under which per-strain / per-prophage protein FASTA files are looked up.
path_decipher = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_decipher/phageboost/phageboost_70_20102022"
path_database_PFAM = "/home/conchae/databases/Pfam-A/pfam"
# Working directory holding the PFAM hit tables read below and the exported final table.
path_depolymerse_w = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/20102022_session"
# Directory holding the prophage label table.
path_label = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_labeling/phageboost/info"

# Accession identifiers ('PF…' strings) grouped under four keys:
# 'lipase', 'hydrolase', 'lyase' and 'peptidase'.
# Presumably these are Pfam family accessions and "LLHP" abbreviates the four
# key names — TODO confirm. Some accessions appear under more than one key
# (e.g. 'PF00657' is listed for both 'lipase' and 'hydrolase').
# NOTE(review): this dictionary is not referenced in the visible cells.
LLHP_accession_dico = {'lipase': ['PF00068', 'PF00151', 'PF00387', 'PF00388', 'PF00614', 'PF00657', 'PF00882', 'PF01114', 'PF01674', 'PF01734', 'PF01735', 'PF01764', 'PF02230', 'PF02253', 'PF02740', 'PF02988', 'PF03280', 'PF03583', 'PF03893', 'PF04083', 'PF04916', 'PF05278', 'PF05826', 'PF06350', 'PF06762', 'PF06951', 'PF08398', 'PF09056', 'PF09279', 'PF12262', 'PF12357', 'PF12536', 'PF13396', 'PF13472', 'PF14606', 'PF15083', 'PF16255', 'PF16565', 'PF16670', 'PF18067', 'PF18695'], 'hydrolase': ['PF00150', 'PF00232', 'PF00251', 'PF00295', 'PF00331', 'PF00332', 'PF00443', 'PF00457', 'PF00561', 'PF00657', 'PF00670', 'PF00702', 'PF00703', 'PF00704', 'PF00722', 'PF00723', 'PF00728', 'PF00759', 'PF00763', 'PF00795', 'PF00840', 'PF00925', 'PF00933', 'PF01055', 'PF01074', 'PF01088', 'PF01156', 'PF01183', 'PF01195', 'PF01227', 'PF01229', 'PF01270', 'PF01301', 'PF01341', 'PF01373', 'PF01374', 'PF01502', 'PF01503', 'PF01532', 'PF01557', 'PF01670', 'PF01738', 'PF01915', 'PF01955', 'PF01979', 'PF01981', 'PF02011', 'PF02015', 'PF02055', 'PF02056', 'PF02057', 'PF02148', 'PF02156', 'PF02275', 'PF02289', 'PF02324', 'PF02550', 'PF02633', 'PF02638', 'PF02649', 'PF02836', 'PF02837', 'PF02838', 'PF02882', 'PF02964', 'PF03065', 'PF03200', 'PF03403', 'PF03512', 'PF03537', 'PF03632', 'PF03633', 'PF03636', 'PF03639', 'PF03644', 'PF03648', 'PF03659', 'PF03662', 'PF03663', 'PF03664', 'PF03718', 'PF03747', 'PF03819', 'PF03959', 'PF04083', 'PF04295', 'PF04307', 'PF04616', 'PF04685', 'PF04775', 'PF04909', 'PF04996', 'PF05013', 'PF05028', 'PF05116', 'PF05165', 'PF05221', 'PF05382', 'PF05838', 'PF05908', 'PF05990', 'PF06028', 'PF06259', 'PF06342', 'PF06399', 'PF06441', 'PF06821', 'PF07176', 'PF07335', 'PF07461', 'PF07470', 'PF07477', 'PF07486', 'PF07488', 'PF07521', 'PF07698', 'PF07745', 'PF07748', 'PF07826', 'PF07858', 'PF07859', 'PF07969', 'PF07971', 'PF08244', 'PF08282', 'PF08306', 'PF08307', 'PF08472', 'PF08840', 'PF09127', 'PF09370', 'PF09663', 'PF09752', 'PF09994', 'PF10081', 'PF10118', 
'PF10230', 'PF10287', 'PF10340', 'PF10566', 'PF10605', 'PF11308', 'PF11790', 'PF11975', 'PF12215', 'PF12471', 'PF12535', 'PF12695', 'PF12697', 'PF12710', 'PF12715', 'PF12888', 'PF12891', 'PF12917', 'PF13199', 'PF13200', 'PF13286', 'PF13336', 'PF13344', 'PF13419', 'PF13423', 'PF13472', 'PF13647', 'PF13869', 'PF13872', 'PF14196', 'PF14323', 'PF14498', 'PF14508', 'PF14509', 'PF14587', 'PF14606', 'PF14736', 'PF14741', 'PF14871', 'PF14872', 'PF14883', 'PF14885', 'PF15420', 'PF15979', 'PF16123', 'PF16141', 'PF16255', 'PF16317', 'PF16674', 'PF16822', 'PF16862', 'PF16874', 'PF16875', 'PF16923', 'PF17167', 'PF17189', 'PF17387', 'PF17433', 'PF17451', 'PF17652', 'PF17677', 'PF17678', 'PF17829', 'PF17890', 'PF18031', 'PF18034', 'PF18088', 'PF18089', 'PF18230', 'PF18271', 'PF18290', 'PF18438', 'PF18564', 'PF18565', 'PF19420', 'PF19718', 'PF20091'], 'lyase': ['PF00206', 'PF00221', 'PF00463', 'PF00544', 'PF00875', 'PF01212', 'PF01265', 'PF01977', 'PF02278', 'PF02560', 'PF02884', 'PF02901', 'PF03130', 'PF03211', 'PF03243', 'PF03328', 'PF03441', 'PF03802', 'PF04115', 'PF04223', 'PF04244', 'PF04345', 'PF04431', 'PF05034', 'PF05426', 'PF05985', 'PF06045', 'PF06751', 'PF06838', 'PF06917', 'PF07156', 'PF07450', 'PF07476', 'PF08124', 'PF08218', 'PF08328', 'PF08735', 'PF08787', 'PF09092', 'PF09093', 'PF09171', 'PF09284', 'PF09492', 'PF10397', 'PF12324', 'PF12708', 'PF14099', 'PF14583', 'PF14683', 'PF14686', 'PF14698', 'PF15617', 'PF16114', 'PF16867', 'PF18370'], 'peptidase': ['PF00246', 'PF00326', 'PF00450', 'PF00557', 'PF00675', 'PF00716', 'PF00717', 'PF00768', 'PF00863', 'PF00883', 'PF00905', 'PF00930', 'PF00949', 'PF01019', 'PF01136', 'PF01244', 'PF01252', 'PF01343', 'PF01364', 'PF01400', 'PF01427', 'PF01431', 'PF01432', 'PF01433', 'PF01434', 'PF01435', 'PF01447', 'PF01470', 'PF01478', 'PF01523', 'PF01546', 'PF01551', 'PF01640', 'PF01650', 'PF01707', 'PF01732', 'PF01828', 'PF01829', 'PF01830', 'PF01831', 'PF02016', 'PF02074', 'PF02113', 'PF02122', 'PF02127', 'PF02128', 'PF02129', 
'PF02160', 'PF02163', 'PF02244', 'PF02517', 'PF02557', 'PF02586', 'PF02789', 'PF02868', 'PF02897', 'PF02977', 'PF03051', 'PF03290', 'PF03410', 'PF03411', 'PF03412', 'PF03413', 'PF03416', 'PF03510', 'PF03566', 'PF03568', 'PF03569', 'PF03571', 'PF03572', 'PF03574', 'PF03575', 'PF03576', 'PF03577', 'PF03734', 'PF03785', 'PF03995', 'PF04096', 'PF04151', 'PF04228', 'PF04258', 'PF04298', 'PF04389', 'PF04450', 'PF04573', 'PF04951', 'PF05193', 'PF05195', 'PF05223', 'PF05299', 'PF05342', 'PF05343', 'PF05379', 'PF05380', 'PF05381', 'PF05388', 'PF05407', 'PF05409', 'PF05413', 'PF05415', 'PF05416', 'PF05533', 'PF05543', 'PF05547', 'PF05548', 'PF05550', 'PF05569', 'PF05576', 'PF05577', 'PF05578', 'PF05579', 'PF05580', 'PF05582', 'PF05585', 'PF05649', 'PF05903', 'PF05922', 'PF05929', 'PF06162', 'PF06167', 'PF06262', 'PF06550', 'PF06605', 'PF06645', 'PF06703', 'PF06750', 'PF06819', 'PF06847', 'PF07364', 'PF07580', 'PF07687', 'PF07722', 'PF07910', 'PF07930', 'PF07998', 'PF08127', 'PF08192', 'PF08291', 'PF08367', 'PF08439', 'PF08453', 'PF08496', 'PF08530', 'PF08548', 'PF08715', 'PF08795', 'PF09168', 'PF09394', 'PF09471', 'PF09768', 'PF10023', 'PF10103', 'PF10275', 'PF10459', 'PF10460', 'PF10461', 'PF10462', 'PF10463', 'PF10464', 'PF10465', 'PF10467', 'PF10468', 'PF10502', 'PF11667', 'PF11713', 'PF11720', 'PF11814', 'PF11918', 'PF11962', 'PF12044', 'PF12146', 'PF12380', 'PF12381', 'PF12382', 'PF12384', 'PF12386', 'PF12387', 'PF12388', 'PF12389', 'PF12404', 'PF12528', 'PF12559', 'PF12580', 'PF12583', 'PF12628', 'PF13203', 'PF13365', 'PF13398', 'PF13402', 'PF13485', 'PF13529', 'PF13539', 'PF13574', 'PF13582', 'PF13583', 'PF13611', 'PF13620', 'PF13645', 'PF13670', 'PF13688', 'PF13933', 'PF14247', 'PF14521', 'PF14756', 'PF15270', 'PF15499', 'PF15638', 'PF15639', 'PF15640', 'PF15641', 'PF15887', 'PF15890', 'PF16187', 'PF16188', 'PF16217', 'PF16218', 'PF16325', 'PF16470', 'PF16800', 'PF16850', 'PF17033', 'PF17129', 'PF17130', 'PF17222', 'PF17291', 'PF17676', 'PF17899', 'PF17900', 
'PF17969', 'PF18027', 'PF18294', 'PF18295', 'PF18348', 'PF18421', 'PF18630', 'PF18811', 'PF18818', 'PF18894', 'PF18958', 'PF18994', 'PF19289', 'PF19290', 'PF19310', 'PF19520', 'PF20034']}
# NOTE(review): tab_names looks like the column list of a BLAST-style tabular
# output — confirm; it is not used in the visible cells.
tab_names = ["qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore"]
# Column names applied to the two PFAM hit tables read below.
df_names = ["Protein","PFAM","Pident","Length","Qstart","Qend","Bitscore"]
# Column names applied to the prophage label table read below.
label_names = ["Prophage_name","K_serotype_monophyletic_group", "Ancestor","Number_ancestors","Size_clade","Min_ancestors","Number_Ktypes","K_types","Number_Kswaps"]

# All three tables are tab-separated and get their column names from the lists
# above (passed through names=) rather than from the files themselves.
depolymerase_df = pd.read_csv(f"{path_depolymerse_w}/depolymerase_proteins.PFAM_depo.clean.txt", sep="\t" ,names = df_names)
LLHP_df = pd.read_csv(f"{path_depolymerse_w}/LLHP_proteins.PFAM_depo.clean.txt", sep="\t",names = df_names)
# label_df is read as a global by get_label() and get_ancestors() below.
label_df = pd.read_csv(f"{path_label}/prophage_data.clusters_80.phageboost_70.tsv", sep="\t", names = label_names)


def prot_get_sequence(protein_name_series) :
    """Return the first FASTA sequence of every protein, in input order.

    Each protein name is split on "__"; the first two fields are used as the
    strain and prophage directory names, and the file
    ``<root>/<strain>/tmp/<prophage_id>/<protein_name>.fasta`` is parsed.

    Parameters
    ----------
    protein_name_series : iterable of str
        Protein names containing at least one "__" separator.

    Returns
    -------
    list of str
        One sequence string per protein name.

    Raises
    ------
    IndexError
        If a name has no "__" separator or its FASTA file holds no record.
    """
    decipher_root = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_decipher/phageboost/phageboost_70_20102022"

    def _first_sequence(name):
        # Only the first two "__"-separated fields select the directory.
        fields = name.split("__")
        strain, prophage_id = fields[0], fields[1]
        fasta_path = f"{decipher_root}/{strain}/tmp/{prophage_id}/{name}.fasta"
        # Every record is read; only the first one is kept.
        all_sequences = [str(entry.seq) for entry in SeqIO.parse(fasta_path, 'fasta')]
        return all_sequences[0]

    return [_first_sequence(name) for name in protein_name_series]

def get_label(protein_name_series):
    """Return the "K_serotype_monophyletic_group" value for every protein.

    Each protein name is split on "__"; the first two fields (strain and
    prophage id) are joined into ``"<strain>__<prophage_id>.fasta"`` and
    matched against the "Prophage_name" column of the global ``label_df``.
    When several rows share a prophage name, the first one is used.

    Parameters
    ----------
    protein_name_series : iterable of str
        Protein names containing at least one "__" separator.

    Returns
    -------
    list
        One label per protein name, in input order (empty for empty input).

    Raises
    ------
    IndexError
        If a name has no "__" separator, or if its prophage has no row in
        ``label_df``.
    """
    protein_names = list(protein_name_series)
    if not protein_names:
        return []

    # Build the prophage -> label mapping once instead of scanning the whole
    # frame for every protein. setdefault keeps the first matching row.
    label_by_prophage = {}
    for prophage_name, label in zip(label_df["Prophage_name"].values,
                                    label_df["K_serotype_monophyletic_group"].values):
        label_by_prophage.setdefault(prophage_name, label)

    labels = []
    for protein_name in protein_names:
        fields = protein_name.split("__")
        prophage_key = f"{fields[0]}__{fields[1]}.fasta"
        if prophage_key not in label_by_prophage:
            # Same exception type as indexing an empty selection, but with
            # the offending prophage named in the message.
            raise IndexError(f"no row in label_df for prophage {prophage_key!r}")
        labels.append(label_by_prophage[prophage_key])
    return labels
     
def get_ancestors(protein_name_series):
    """Return the "Ancestor" value for every protein.

    Each protein name is split on "__"; the first two fields (strain and
    prophage id) are joined into ``"<strain>__<prophage_id>.fasta"`` and
    matched against the "Prophage_name" column of the global ``label_df``.
    When several rows share a prophage name, the first one is used.

    Parameters
    ----------
    protein_name_series : iterable of str
        Protein names containing at least one "__" separator.

    Returns
    -------
    list
        One ancestor value per protein name, in input order (empty for
        empty input).

    Raises
    ------
    IndexError
        If a name has no "__" separator, or if its prophage has no row in
        ``label_df``.
    """
    protein_names = list(protein_name_series)
    if not protein_names:
        return []

    # Build the prophage -> ancestor mapping once instead of scanning the
    # whole frame for every protein. setdefault keeps the first matching row.
    ancestor_by_prophage = {}
    for prophage_name, ancestor in zip(label_df["Prophage_name"].values,
                                       label_df["Ancestor"].values):
        ancestor_by_prophage.setdefault(prophage_name, ancestor)

    ancestors = []
    for protein_name in protein_names:
        fields = protein_name.split("__")
        prophage_key = f"{fields[0]}__{fields[1]}.fasta"
        if prophage_key not in ancestor_by_prophage:
            # Same exception type as indexing an empty selection, but with
            # the offending prophage named in the message.
            raise IndexError(f"no row in label_df for prophage {prophage_key!r}")
        ancestors.append(ancestor_by_prophage[prophage_key])
    return ancestors

# Attach to every depolymerase hit its protein sequence, its K-serotype label
# and its ancestor, all derived from the "Protein" column.
depolymerase_df["Sequence"] = prot_get_sequence(depolymerase_df["Protein"])
depolymerase_df["Label"] = get_label(depolymerase_df["Protein"])
depolymerase_df["Ancestor"] = get_ancestors(depolymerase_df["Protein"])

# Keep a single row per (Ancestor, Sequence) pair.
depolymerase_final = depolymerase_df.drop_duplicates(['Ancestor', 'Sequence'])
# The file is tab-separated despite its ".csv" extension.
# NOTE(review): a later cell reads "...PFAM_depo.final.tsv", which is not the
# name written here — confirm which extension is intended.
depolymerase_final.to_csv(f"{path_depolymerse_w}/depolymerase_proteins.PFAM_depo.final.csv", sep="\t", index=False)


# Same enrichment for the LLHP hits: sequence, K-serotype label and ancestor.
# NOTE(review): unlike the depolymerase table, no deduplication or export of
# LLHP_df follows in the visible cells — confirm this is intended.
LLHP_df["Sequence"] = prot_get_sequence(LLHP_df["Protein"])
LLHP_df["Label"] = get_label(LLHP_df["Protein"])
LLHP_df["Ancestor"] = get_ancestors(LLHP_df["Protein"])


> The chosen ones (the final, deduplicated depolymerase table):

In [None]:
import os
import random
import subprocess
from tqdm import tqdm
from random import sample
import time
import pandas as pd

# Same machine-specific locations as in the data-preparation cell, repeated here.
path_phageboost_pred="/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_decipher/phageboost/phageboost_prediction"

path_decipher = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_decipher/phageboost/phageboost_70_20102022"
path_database_PFAM = "/home/conchae/databases/Pfam-A/pfam"
path_depolymerse_w = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/20102022_session"
path_label = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_labeling/phageboost/info"


# Writes depolymerase_final (built in the data-preparation cell, not in this
# one) to the same tab-separated file that cell already writes.
depolymerase_final.to_csv(f"{path_depolymerse_w}/depolymerase_proteins.PFAM_depo.final.csv", sep="\t", index=False)


***
## 2. Basic Information

#### **A. About the most virulent KL types**

In [None]:
import os
import random
import subprocess
from tqdm import tqdm
from random import sample
import time
import pandas as pd
from collections import Counter


# Same machine-specific locations as in the data-preparation cell, repeated here.
# NOTE(review): only path_depolymerse_w is used in the visible part of this cell.
path_phageboost_pred="/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_decipher/phageboost/phageboost_prediction"

path_decipher = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_decipher/phageboost/phageboost_70_20102022"
path_database_PFAM = "/home/conchae/databases/Pfam-A/pfam"
path_depolymerse_w = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/20102022_session"
path_label = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_labeling/phageboost/info"

# Column names of the prophage label table.
label_names = ["Prophage_name","K_serotype_monophyletic_group", "Ancestor","Number_ancestors","Size_clade","Min_ancestors","Number_Ktypes","K_types","Number_Kswaps"]
# Column names of the PFAM hit tables. Defined here as well so that this cell
# does not raise NameError when run without the data-preparation cell.
df_names = ["Protein","PFAM","Pident","Length","Qstart","Qend","Bitscore"]

# The deduplicated depolymerase table is exported by the earlier cells as
# "...PFAM_depo.final.csv" (tab-separated, with a header row), so that exact
# file name is read back here.
depolymerase_F_df = pd.read_csv(f"{path_depolymerse_w}/depolymerase_proteins.PFAM_depo.final.csv", sep="\t" , header = 0)
# The LLHP table is read from the "clean" hit file, with explicit column names.
LLHP_F_df = pd.read_csv(f"{path_depolymerse_w}/LLHP_proteins.PFAM_depo.clean.txt", sep="\t",names = df_names)

# Number of rows of the final depolymerase table carrying each label.
dic_counter = dict(Counter(depolymerase_F_df["Label"]))

# Keep only the labels seen strictly more than 10 times.
info_depolymerase = {}
for K_type, count in dic_counter.items():
    if count <= 10:
        continue
    info_depolymerase[K_type] = count
        
# Hard-coded label -> count mapping. This assignment replaces the
# info_depolymerase dictionary computed from depolymerase_F_df just above.
# NOTE(review): presumably a pasted snapshot of an earlier run of that
# computation — confirm it still matches the current data.
info_depolymerase = {'KL47': 139, 'KL2': 135, 'KL107': 135, 'KL22': 35, 'KL102': 49, 'KL51': 30, 'KL126': 12, 'KL17': 98, 'KL52': 13, 'KL24': 33, 'KL27': 67,
                     'KL39': 13, 'KL3': 55, 'KL64': 169, 'KL25': 34, 'KL60': 23, 'KL30': 48, 'KL46': 28, 'KL137': 11, 'KL106': 37, 'KL21': 25, 'KL45': 35, 'KL13': 18,
                     'KL169': 13, 'KL14': 39, 'KL105': 23, 'KL8': 16, 'KL10': 26, 'KL81': 14, 'KL1': 39, 'KL151': 41, 'KL16': 11, 'KL136': 11, 'KL74': 14, 'KL7': 30,
                     'KL62': 28, 'KL43': 18, 'KL145': 11, 'KL57': 23, 'KL53': 12, 'KL23': 18, 'KL61': 16, 'KL127': 14, 'KL18': 13}
