# Prediction with TropiSEQ :
### I- Prepare the model
### II- Run the predictions on matrices
### III- Run the predictions on experimentally validated depolymerases
***

In [1]:
# Ground modules
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from Bio import SeqIO
from itertools import product
import random
from collections import Counter
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

import logging
import subprocess
from multiprocessing.pool import ThreadPool
import joblib
import json

# Scikit-learn modules :
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report , roc_auc_score

# Scipy modules : 
from scipy.stats import fisher_exact

> Set up predictor 

In [2]:
# Root directory holding the sequence-based model files.
path_seqbased = "/media/concha-eloko/Linux/PPT_clean/Seqbased_model"

# FASTA file that the (commented-out) makeblastdb command below takes as input.
fasta_file = f"{path_seqbased}/cdhit_clusters_2912/0.7.out"

# One-off command that builds the BLAST protein database; kept for reference only.
#blast_command = f"makeblastdb -in {fasta_file} -dbtype prot -out {path_seqbased}/TropiSeq/TropiSeq_0.7.db"
#make_blast_process = subprocess.Popen(blast_command, shell =True, stdout = subprocess.PIPE, stderr=subprocess.STDOUT)
#mkblast_out, mkblast_err = make_blast_process.communicate()
#print(mkblast_out , mkblast_err)

# Relevant files :
# Load the cluster dictionary inside a context manager so the file handle is closed
# as soon as the JSON has been read.
with open(f"{path_seqbased}/dico_cluster.cdhit__0.7.json") as cluster_handle :
    dico_cluster = json.load(cluster_handle)

# Reverse lookup : every element of each value list points back to the key it was listed under.
dico_cluster_r = {ref_dpo : key_dpo for key_dpo,list_dpo in dico_cluster.items() for ref_dpo in list_dpo}

In [3]:
# Project directories : working root first, then the sub-directories used by the notebook.
path_work = "/media/concha-eloko/Linux/PPT_clean"
# Sequence-based model files (BLAST database, cluster and prediction dictionaries).
path_seqbased = "/media/concha-eloko/Linux/PPT_clean/Seqbased_model"
# Destination of the benchmark TSV files written further down.
path_benchmark = "/media/concha-eloko/Linux/PPT_clean/benchmark"


> Load predictor

In [4]:
# BLAST database queried by get_winner(); the 0.7 database is kept as a commented alternative.
#path_db = f"{path_seqbased}/TropiSeq/TropiSeq_0.7.db"
path_db = f"{path_seqbased}/TropiSeq/TropiSeq_0.85.db"

# 50% probability 
#final_annotation = pd.read_csv(f"{path_work}/labeling_depo_clusters.PPT.0804.tsv", sep = "\t", header = 0)

# 5% probability
final_annotation = pd.read_csv(f"{path_work}/labeling_depo_clusters.pred.p_05.tsv", sep = "\t", header = 0)
final_annotation.columns = ['index', 'seq', 'domain_seq', 'depo_cluster', 'Tropiseq_KLtypes','Tropiseq_scores']

# 5% ultrafiltration 
#final_annotation = pd.read_csv(f"/media/concha-eloko/Linux/PPT_clean/Seqbased_model/labeling_depo_clusters.pred.RF_UF_7.p_05.tsv", sep = "\t", header = 0)
#final_annotation.columns = ['index', 'seq', 'domain_seq', 'depo_cluster', 'Tropiseq_KLtypes','Tropiseq_scores']


# Make the dico: 
# Keep only rows that carry KL types. Depending on the pandas version, read_csv may parse
# the literal "None" as NaN, so both the string and the missing value are excluded;
# otherwise NaN rows would survive and break any later string operation on this column.
has_kltypes = final_annotation["Tropiseq_KLtypes"].notna() & (final_annotation["Tropiseq_KLtypes"] != "None")
final_annotation_tropiseq = final_annotation[has_kltypes]
final_annotation_tropiseq = final_annotation_tropiseq.drop_duplicates(subset = ["depo_cluster", "Tropiseq_KLtypes"])
final_annotation_tropiseq = final_annotation_tropiseq[["depo_cluster", "Tropiseq_KLtypes", "Tropiseq_scores"]]

# {depo_cluster : {"KL_types" : ..., "Scores" : ...}} ; when a cluster appears on several
# rows, the last row wins (same as the row-by-row construction it replaces).
dico_tropiseq_data = {
    depo_cluster : {"KL_types" : kl_types, "Scores" : scores}
    for depo_cluster, kl_types, scores in zip(
        final_annotation_tropiseq["depo_cluster"],
        final_annotation_tropiseq["Tropiseq_KLtypes"],
        final_annotation_tropiseq["Tropiseq_scores"],
    )
}


In [14]:
# Flatten the comma-separated KL-type fields into a single list (duplicates kept),
# then display how many distinct KL types are covered.
all_kltypes = [
    kltype
    for _, annotation_row in final_annotation_tropiseq.iterrows()
    for kltype in annotation_row["Tropiseq_KLtypes"].split(",")
]
len(set(all_kltypes))

106

In [7]:
# 50 % probability
#dico_pred = json.load(open("/media/concha-eloko/Linux/PPT_clean/Seqbased_model/prediction_based.labeling.0604.json"))
#dico_cluster = json.load(open(f"{path_seqbased}/dico_cluster.cdhit__0.85.json"))

# 5% probability
# Both JSON files are read inside context managers so their handles are closed right away.
with open("/media/concha-eloko/Linux/PPT_clean/Seqbased_model/prediction_based.labeling.p_05.json") as pred_handle :
    dico_pred = json.load(pred_handle)
with open(f"{path_seqbased}/dico_cluster.cdhit__0.85.json") as cluster_handle :
    dico_cluster = json.load(cluster_handle)

# 5 % UF : 
#dico_pred = json.load(open("/media/concha-eloko/Linux/PPT_clean/Seqbased_model/prediction_based.labeling.RF_UF_7.p_05.json"))
#dico_cluster = json.load(open(f"{path_seqbased}/dico_cluster.cdhit__0.7.json"))




# Reverse lookup : every element of each value list points back to the key it was listed under.
dico_cluster_r = {ref_dpo : key_dpo for key_dpo,list_dpo in dico_cluster.items() for ref_dpo in list_dpo}
# Re-key the predictions as "Dpo_cdhit_<second '_'-separated field of the original key>".
dico_pred_correct_name = {f"Dpo_cdhit_{cluster.split('_')[1]}":hits  for cluster, hits in dico_pred.items()}


In [None]:
# Display the number of entries in the renamed prediction dictionary.
len(dico_pred_correct_name)

***
### Run predictions on matrices: 

In [8]:
# Scratch directory that receives the per-query FASTA files and BLAST outputs.
path_tmp =  "/media/concha-eloko/Linux/PPT_clean/Seqbased_model/tmp"

# Column names used for BLAST tabular output, in column order.
labels_blast = [
    "qseqid",
    "sseqid",
    "pident",
    "length",
    "mismatch",
    "gapopen",
    "qstart",
    "qend",
    "sstart",
    "send",
    "evalue",
    "bitscore",
]

def tmp_fasta_file(record , path_tmp) :
    """Write one sequence record to its own FASTA file.

    Parameters
    ----------
    record : object exposing ``description`` and ``seq`` (the notebook passes
        records produced by ``SeqIO.parse``).
    path_tmp : directory in which the file is created.

    Returns
    -------
    tuple ``(path_fasta, length_seq)`` : path of the written file and the
    length of ``record.seq``.
    """
    header = record.description
    # The file is named after the description, with spaces turned into underscores.
    fasta_path = f"{path_tmp}/{header.replace(' ', '_')}.fasta"
    sequence_length = len(record.seq)
    with open(fasta_path, "w") as handle :
        handle.write(f">{header}\n{str(record.seq)}")
    return fasta_path , sequence_length

def blast_seq(path_fasta, path_DB, path_tmp) :
    """Run ``blastp`` for one query FASTA file and return the path of its output table.

    The command is passed as an argument list (no shell), so query names that
    contain spaces or shell metacharacters cannot break or alter the command.

    Parameters
    ----------
    path_fasta : path of the query FASTA file.
    path_DB : BLAST protein database handed to ``-db``.
    path_tmp : directory in which ``<query file name>.blast_out`` is written.

    Returns
    -------
    str : ``f"{path_tmp}/{file_name}.blast_out"``. The path is returned even when
    blastp could not be started or exited with an error; such failures are
    logged as warnings instead of being discarded silently.
    """
    file_name = os.path.basename(path_fasta)
    path_blast_out = f"{path_tmp}/{file_name}.blast_out"
    command = [
        "blastp",
        "-query", path_fasta,
        "-db", path_DB,
        "-out", path_blast_out,
        "-outfmt", "6",
        "-evalue", "1e-10",
    ]
    try :
        blastp_sub = subprocess.run(command, stdout = subprocess.PIPE, stderr = subprocess.STDOUT)
    except OSError as error :
        # Typically : the blastp executable is not installed or not on the PATH.
        logging.warning("blastp could not be started for %s : %s", path_fasta, error)
    else :
        if blastp_sub.returncode != 0 :
            message = blastp_sub.stdout.decode(errors = "replace").strip()
            logging.warning("blastp exited with code %s for %s : %s", blastp_sub.returncode, path_fasta, message)
    return path_blast_out

def get_best_candidate(path_blast_out, length_seq, bitscore = 75, min_coverage = 0.8) : 
    """Pick the cluster of the top BLAST hit when it is strong and long enough.

    Only the first row of the BLAST table is considered. It is accepted when its
    bitscore exceeds ``bitscore`` and the alignment covers more than
    ``min_coverage`` of the query (alignment length / query length). The ratio
    used to be computed the other way round (query length / alignment length),
    which let arbitrarily short alignments through.

    Parameters
    ----------
    path_blast_out : tab-separated BLAST table with the 12 columns listed below.
    length_seq : length of the query sequence.
    bitscore : minimal bitscore (exclusive) for the top hit.
    min_coverage : minimal fraction (exclusive) of the query covered by the alignment.

    Returns
    -------
    The ``dico_cluster_r`` entry of the hit's ``sseqid``, or ``"No hits"``.

    Raises
    ------
    KeyError : if the accepted ``sseqid`` is missing from ``dico_cluster_r``.
    """
    labels_blast=["qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore"]
    blast_df = pd.read_csv(path_blast_out, sep = "\t", names = labels_blast)
    # No alignment reported, or an empty query for which coverage is undefined.
    if len(blast_df) == 0 or length_seq <= 0 :
        return "No hits"
    best_hit = blast_df.iloc[0]
    coverage = int(best_hit["length"]) / length_seq
    if best_hit["bitscore"] > bitscore and coverage > min_coverage :
        return dico_cluster_r[best_hit["sseqid"]]
    return "No hits"

def get_winner(record , path_tmp) :
    """Return the best cluster candidate for one sequence record.

    Chains the three steps of the pipeline : write the record to a FASTA file in
    ``path_tmp``, BLAST it against the module-level ``path_db``, and evaluate the
    resulting table with ``get_best_candidate`` (default thresholds).
    """
    query_fasta , query_length = tmp_fasta_file(record, path_tmp)
    blast_table = blast_seq(query_fasta , path_db, path_tmp)
    return get_best_candidate(blast_table, query_length)

In [9]:
def _load_domain_records(path_fasta) :
    """Parse a FASTA file once and keep only the records with a non-empty sequence."""
    return [record for record in SeqIO.parse(path_fasta, "fasta") if len(record.seq) > 0]


def _infer_winners(records) :
    """Run the BLAST-based prediction on every record.

    Returns a list of ``(name, winner, results)`` tuples where ``name`` is the
    record description up to its first comma, ``winner`` is the value returned by
    ``get_winner`` and ``results`` is the ``dico_pred_correct_name`` entry of the
    winner ({} when the winner has no entry, "Null" when the winner is "No hits").
    """
    winners = []
    for record in tqdm(records) :
        winner = get_winner(record, path_tmp)
        if winner != "No hits" :
            results = dico_pred_correct_name.get(winner, {})
        else :
            results = "Null"
        winners.append((record.description.split(",")[0] , winner, results))
    return winners


# Ferriol inferences : 
path_seq = "/media/concha-eloko/Linux/77_strains_phage_project/rbp_work"
set_records = _load_domain_records(f"{path_seq}/77_phages_Dpo_domains.2406.multi.fasta")
dico_seq = {record.description : record.seq for record in set_records}

ferriol_winners = _infer_winners(set_records)


# ***************************************************************************
# Beamud inferences : 
path_bea = "/media/concha-eloko/Linux/PPT_clean/in_vitro/Bea"
path_domains_bea = f"{path_bea}/DepoScope_predictions.bea.domains.0709.fasta"

bea_set_records = _load_domain_records(f"{path_domains_bea}")
bea_dico_seq = {record.description : record.seq for record in bea_set_records}

bea_winners = _infer_winners(bea_set_records)


# ***************************************************************************
# Townsend inferences : 
path_towndsend = "/media/concha-eloko/Linux/PPT_clean/in_vitro/Townsed"
path_domains_towndsend = f"{path_towndsend}/DepoScope_predictions.Townsed.domains.0909.fasta"

towndsend_set_records = _load_domain_records(f"{path_domains_towndsend}")
towndsend_dico_seq = {record.description : record.seq for record in towndsend_set_records}

towndsend_winners = _infer_winners(towndsend_set_records)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 145/145 [00:06<00:00, 23.97it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 71/71 [00:02<00:00, 23.80it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 44/44 [00:01<00:00, 25.88it/s]


In [10]:
# Pool the three result lists, keeping the Ferriol, Townsend, Beamud order.
TropiSeq_results = [*ferriol_winners, *towndsend_winners, *bea_winners]


In [None]:
# Write one line per query : name, then either a status flag or the ten best "KLtype:score" pairs.
# NOTE(review): the file name says "UF_70" while the cells above load the 0.85 database and
# the p_05 predictions — confirm this is the intended output name.
with open(f"{path_benchmark}/TropiSEQ_UF_70.p_05.matrices.tsv" , "w") as outfile :
    for prot in TropiSeq_results :
        prot_name = prot[0].split("_A")[0]
        if prot[1] == "No hits" :
            outfile.write(f"{prot_name}\tNo_hits\n")
        elif prot[2] == {} :
            outfile.write(f"{prot_name}\tNo_associations\n")
        else :
            try :
                # Rank on the raw scores and format afterwards : ranking on the rounded,
                # re-parsed strings loses precision at the top-10 cut and breaks if a
                # KL type ever contains ":".
                ranked_hits = sorted(prot[2].items(), key = lambda item : item[1], reverse = True)[0:10]
                sorted_hits = " ; ".join(f"{kltype}:{round(score,3)}" for kltype, score in ranked_hits)
            except Exception as e :
                # Best effort : report the malformed entry and move on to the next protein.
                print(prot, e)
            else :
                outfile.write(f"{prot_name}\t{sorted_hits}\n")

***
### Work on experimentally validated depolymerases :

In [11]:
# ***************************************************************************
# exp_validated inferences : 
path_seq = "/media/concha-eloko/Linux/PPT_clean/in_vitro"

# Parse the FASTA file once; records with an empty sequence are skipped.
exp_validated_set_records = [record for record in SeqIO.parse(f"{path_seq}/exp_validated.multi.fasta", "fasta") if len(record.seq) > 0]
dico_seq = {record.description : record.seq for record in exp_validated_set_records}

exp_validated_winners = []
for record in tqdm(exp_validated_set_records) :
    winner = get_winner(record, path_tmp)
    if winner != "No hits" :
        # {} when the winning cluster has no entry in the prediction dictionary.
        results = dico_pred_correct_name.get(winner, {})
    else :
        results = "Null"
    # (description up to the first comma, winning cluster or "No hits", predictions)
    exp_validated_winners.append((record.description.split(",")[0] , winner, results))
    


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 63/63 [00:02<00:00, 23.85it/s]


In [12]:
# Write one line per query : name, then either a status flag or the ten best "KLtype:score" pairs.
with open(f"{path_benchmark}/TropiSEQ.p_05.exp_val_depolymerase.tsv" , "w") as outfile :
    for prot in exp_validated_winners :
        prot_name = prot[0].split("_A")[0]
        if prot[1] == "No hits" :
            outfile.write(f"{prot_name}\tNo_hits\n")
        elif prot[2] == {} :
            outfile.write(f"{prot_name}\tNo_associations\n")
        else :
            try :
                # Rank on the raw scores and format afterwards : ranking on the rounded,
                # re-parsed strings loses precision at the top-10 cut and breaks if a
                # KL type ever contains ":".
                ranked_hits = sorted(prot[2].items(), key = lambda item : item[1], reverse = True)[0:10]
                sorted_hits = " ; ".join(f"{kltype}:{round(score,3)}" for kltype, score in ranked_hits)
            except Exception as e :
                # Best effort : report the malformed entry and move on to the next protein.
                print(prot, e)
            else :
                outfile.write(f"{prot_name}\t{sorted_hits}\n")