***
# Prediction with SpikeHunter :
### I- Prepare the model
### II- Run the predictions on matrices
### III- Run the predictions on experimentally validated depolymerases¶
***

###  Make the predictor: 

In [59]:
import os
from Bio import SeqIO
import pandas as pd 
from collections import Counter
import subprocess
from tqdm import tqdm

path_SH = "/media/concha-eloko/Linux/PPT_clean/giae017_supplemental_figures_and_tables"
path_benchmark = "/media/concha-eloko/Linux/PPT_clean/benchmark"

tsp_id_df = pd.read_csv(f"{path_SH}/TSP_ids_and_clusters.txt", sep = "\t", header = 0, index_col = False)
serotype_df = pd.read_csv(f"{path_SH}/table_serotype_specificity.tsv", sep = "\t", header = 0)


In [54]:
tsp_id_klebsiella_df = tsp_id_df[tsp_id_df["Species"] == "Klebsiella"]
serotype_klebsiella_df = serotype_df[serotype_df["Species"] == "Klebsiella"]
clusters_id_interest = serotype_klebsiella_df["TSP cluster at 60%"].unique()

> Dico with the relevant sequences : 

In [30]:
with open(f"{path_SH}/database_SH_depo.fasta", "w") as outfile :
    for record in SeqIO.parse(f"{path_SH}/tailspike.fa", "fasta") :
        if record.description.split("|")[1] in clusters_id_interest :
            cluster_id = record.description.split('|')[1]
            serotype = serotype_klebsiella_df[serotype_klebsiella_df["TSP cluster at 60%"] == cluster_id]["Serotype"].values[0].replace(";", "_").replace("/", "_")
            fasta_rec = f">{cluster_id}__{serotype}\n{record.seq}\n"
            outfile.write(fasta_rec)

***
### Make blast DB :

In [32]:
# Make the blastp DB of all the dpo sequences :

path_SH = "/media/concha-eloko/Linux/PPT_clean/giae017_supplemental_figures_and_tables"
fasta_file = f"{path_SH}/database_SH_depo.fasta"

blast_command = f"makeblastdb -in {fasta_file} -dbtype prot -out {path_SH}/SH_database"
make_blast_process = subprocess.Popen(blast_command, shell =True, stdout = subprocess.PIPE, stderr=subprocess.STDOUT)
mkblast_out, mkblast_err = make_blast_process.communicate()
print(mkblast_out , mkblast_err)


b'\n\nBuilding a new DB, current time: 10/28/2024 22:47:14\nNew DB name:   /media/concha-eloko/Linux/PPT_clean/giae017_supplemental_figures_and_tables/SH_database\nNew DB title:  /media/concha-eloko/Linux/PPT_clean/giae017_supplemental_figures_and_tables/database_SH_depo.fasta\nSequence type: Protein\nKeep MBits: T\nMaximum file size: 1000000000B\nAdding sequences from FASTA; added 7451 sequences in 0.18688 seconds.\n' None


***
### Run predictions on matrices:

In [57]:
path_db = f"{path_SH}/SH_database"
path_tmp =  f"{path_SH}/tmp"
labels_blast=["qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore"]


def tmp_fasta_file(record , path_tmp) :
    name_file = "_".join(record.description.split(" "))
    path_fasta = f"{path_tmp}/{name_file}.fasta"
    length_seq = len(record.seq)
    with open(path_fasta, "w") as outfile :
        outfile.write(f">{record.description}\n{str(record.seq)}")
    return path_fasta , length_seq

def blast_seq(path_fasta, path_DB, path_tmp) :
    file_name = path_fasta.split("/")[-1]
    command = f"blastp -query {path_fasta} -db {path_DB} -out {path_tmp}/{file_name}.blast_out -outfmt 6 -evalue 1e-10"
    blastp_sub = subprocess.Popen(command ,shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT)
    out , err = blastp_sub.communicate()
    return f"{path_tmp}/{file_name}.blast_out"

def get_best_candidate(path_blast_out, length_seq, pident = 0.60) : 
    winner = 0
    labels_blast=["qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore"]
    blast_df = pd.read_csv(path_blast_out, sep = "\t", names = labels_blast)
    if len(blast_df) > 0 :
        row = blast_df.iloc[0] 
        if (row["pident"] > pident) : # and (length_seq/int(row["length"])> 0.50)
            winner = row["sseqid"]
        else :
            winner = "No hits"
    else :
        winner = "No hits"
    return winner

def get_winner(record , path_tmp) :
    path_func , len_func = tmp_fasta_file(record, path_tmp)
    path_blast_out_func = blast_seq(path_func , path_db, path_tmp)
    winner = get_best_candidate(path_blast_out_func, len_func)
    return winner

In [79]:
# ***************************************************************************
# Ferriol inferences : 
path_seq = "/media/concha-eloko/Linux/77_strains_phage_project/rbp_work"
dico_seq = {record.description : record.seq for record in SeqIO.parse(f"{path_seq}/77_phages_Dpo_domains.2406.multi.fasta", "fasta") if len(record.seq) >0}
set_records = [record for record in SeqIO.parse(f"{path_seq}/77_phages_Dpo_domains.2406.multi.fasta", "fasta") if len(record.seq) > 0]

ferriol_winners = []
for record in tqdm(set_records) :
    winner = get_winner(record, path_tmp)
    if winner != "No hits" :
        prediction = winner.split("__")[-1]
    else :
        prediction = "Null"
    a = (record.description.split(",")[0] , winner, prediction.split("_"))
    ferriol_winners.append(a)
    

# ***************************************************************************
# Beamud inferences : 
bea_winners = []

path_bea = "/media/concha-eloko/Linux/PPT_clean/in_vitro/Bea"
path_domains_bea = f"{path_bea}/DepoScope_predictions.bea.domains.0709.fasta"

bea_dico_seq = {record.description : record.seq for record in SeqIO.parse(f"{path_domains_bea}", "fasta") if len(record.seq) >0}
bea_set_records = [record for record in SeqIO.parse(f"{path_domains_bea}", "fasta") if len(record.seq) > 0]

for record in tqdm(bea_set_records) :
    winner = get_winner(record, path_tmp)
    if winner != "No hits" :
        prediction = winner.split("__")[-1]
    else :
        prediction = "Null"
    a = (record.description.split(",")[0] , winner, prediction.split("_"))
    bea_winners.append(a)
    
    
# ***************************************************************************
# Towndsend inferences : 
towndsend_winners = []

path_towndsend = "/media/concha-eloko/Linux/PPT_clean/in_vitro/Townsed"
path_domains_towndsend = f"{path_towndsend}/DepoScope_predictions.Townsed.domains.0909.fasta"

towndsend_dico_seq = {record.description : record.seq for record in SeqIO.parse(f"{path_domains_towndsend}", "fasta") if len(record.seq) >0}
towndsend_set_records = [record for record in SeqIO.parse(f"{path_domains_towndsend}", "fasta") if len(record.seq) > 0]

for record in tqdm(towndsend_set_records) :
    winner = get_winner(record, path_tmp)
    if winner != "No hits" :
        prediction = winner.split("__")[-1]
    else :
        prediction = "Null"
    a = (record.description.split(",")[0] , winner, prediction.split("_"))
    towndsend_winners.append(a)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 145/145 [00:34<00:00,  4.21it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 71/71 [00:13<00:00,  5.15it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 44/44 [00:07<00:00,  5.68it/s]


In [80]:
SH_results = ferriol_winners + towndsend_winners + bea_winners

> Write file : 

In [82]:
with open(f"{path_benchmark}/SpikeHunter_predictions.matrices.tsv" , "w") as outfile :
    for prot in SH_results :
        prot_name = prot[0].split("_A")[0]
        if prot[1] == "No hits" :
            outfile.write(f"{prot_name}\tNo_hits\n")
        else :
            try :
                outfile.write(f"{prot_name}\t")
                outfile.write(" , ".join(prot[2]))
                outfile.write("\n")
            except Exception as e :
                print(prot, e)

***
### Work on experimentally validated depolymerases: 

In [73]:
# ***************************************************************************
# exp_validated inferences : 
exp_validated_winners = []

path_seq = "/media/concha-eloko/Linux/PPT_clean/in_vitro"

dico_seq = {record.description : record.seq for record in SeqIO.parse(f"{path_seq}/Others_all.dpos_domains.multi.fasta", "fasta") if len(record.seq) >0}
exp_validated_set_records = [record for record in SeqIO.parse(f"{path_seq}/Others_all.dpos_domains.multi.fasta", "fasta") if len(record.seq) > 0]

for record in tqdm(exp_validated_set_records) :
    winner = get_winner(record, path_tmp)
    if winner != "No hits" :
        prediction = winner.split("__")[-1]
    else :
        prediction = "Null"
    a = (record.description , winner, prediction.split("_"))
    exp_validated_winners.append(a)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 56/56 [00:10<00:00,  5.20it/s]


> Write file:

In [75]:
with open(f"{path_benchmark}/SpikeHunter_predictions.exp_val_depolymerase.tsv" , "w") as outfile :
    for prot in exp_validated_winners :
        prot_name = prot[0].split("_A")[0]
        if prot[1] == "No hits" :
            outfile.write(f"{prot_name}\tNo_hits\n")
        else :
            try :
                outfile.write(f"{prot_name}\t")
                outfile.write(" , ".join(prot[2]))
                outfile.write("\n")
            except Exception as e :
                print(prot, e)