In [14]:
# Ground modules
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from Bio import SeqIO
from itertools import product
import random
from collections import Counter
import json
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

import logging
import subprocess
from multiprocessing.pool import ThreadPool
import joblib

# SCikitlearn modules :
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report , roc_auc_score

# Scipy modules : 
from scipy.stats import fisher_exact

In [15]:
# Make the blastp DB of all the dpo sequences :

path_seqbased = "/media/concha-eloko/Linux/PPT_clean/Seqbased_model"
path_db = f"{path_seqbased}/TropiSeq/TropiSeq_0.85.db"
path_work = "/media/concha-eloko/Linux/PPT_clean"


# Run makeblast command :
fasta_file = f"{path_seqbased}/cdhit_clusters_2912/0.85.out"

blast_command = f"makeblastdb -in {fasta_file} -dbtype prot -out {path_seqbased}/TropiSeq/TropiSeq_0.85.db"
#make_blast_process = subprocess.Popen(blast_command, shell =True, stdout = subprocess.PIPE, stderr=subprocess.STDOUT)
#mkblast_out, mkblast_err = make_blast_process.communicate()
#print(mkblast_out , mkblast_err)

# Relevant files :
dico_cluster = json.load(open(f"{path_seqbased}/dico_cluster.cdhit__0.85.json"))
dico_cluster_r = {ref_dpo : key_dpo for key_dpo,list_dpo in dico_cluster.items() for ref_dpo in list_dpo}

In [16]:
final_annotation = pd.read_csv(f"{path_work}/labeling_depo_clusters.PPT.0804.tsv", sep = "\t", header = 0)
final_annotation_tropiseq = final_annotation[final_annotation["Tropiseq_KLtypes"] != "None"]
final_annotation_tropiseq = final_annotation_tropiseq.drop_duplicates(subset = ["depo_cluster", "Tropiseq_KLtypes"])
final_annotation_tropiseq = final_annotation_tropiseq[["depo_cluster", "Tropiseq_KLtypes", "Tropiseq_scores"]]

dico_tropiseq_data = {row["depo_cluster"] : {"KL_types" : row["Tropiseq_KLtypes"], "Scores" : row["Tropiseq_scores"]} for _, row in final_annotation_tropiseq.iterrows()}
dico_tropiseq_data

{'Dpo_cdhit_12': {'KL_types': 'KL30',
  'Scores': 'KL30 : 0.00017771665698402528'},
 'Dpo_cdhit_233': {'KL_types': 'KL19',
  'Scores': 'KL19 : 0.00013812934086225345'},
 'Dpo_cdhit_280': {'KL_types': 'KL25',
  'Scores': 'KL25 : 0.0008645001322490626'},
 'Dpo_cdhit_182': {'KL_types': 'KL15',
  'Scores': 'KL15 : 2.4704011248342683e-22'},
 'Dpo_cdhit_126': {'KL_types': 'KL64',
  'Scores': 'KL64 : 0.0004611112383402333'},
 'Dpo_cdhit_132': {'KL_types': 'KL106',
  'Scores': 'KL106 : 8.518572858997499e-26'},
 'Dpo_cdhit_74': {'KL_types': 'KL28,KL23',
  'Scores': 'KL28 : 0.0012451648171312574,KL23 : 0.00012303924662079074'},
 'Dpo_cdhit_711': {'KL_types': 'KL1', 'Scores': 'KL1 : 0.0020476851628455045'},
 'Dpo_cdhit_48': {'KL_types': 'KL19',
  'Scores': 'KL19 : 2.937170125714393e-11'},
 'Dpo_cdhit_51': {'KL_types': 'KL47',
  'Scores': 'KL47 : 3.2559635989706166e-08'},
 'Dpo_cdhit_72': {'KL_types': 'KL28',
  'Scores': 'KL28 : 5.227116747174737e-10'},
 'Dpo_cdhit_30': {'KL_types': 'KL60',
  'Sco

***
## Make the predictions :

In [17]:
path_tmp =  "/media/concha-eloko/Linux/PPT_clean/Seqbased_model/tmp"
labels_blast=["qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore"]

def tmp_fasta_file(record , path_tmp) :
    name_file = "_".join(record.description.split(" "))
    path_fasta = f"{path_tmp}/{name_file}.fasta"
    length_seq = len(record.seq)
    with open(path_fasta, "w") as outfile :
        outfile.write(f">{record.description}\n{str(record.seq)}")
    return path_fasta , length_seq

def blast_seq(path_fasta, path_DB, path_tmp) :
    file_name = path_fasta.split("/")[-1]
    command = f"blastp -query {path_fasta} -db {path_DB} -out {path_tmp}/{file_name}.blast_out -outfmt 6 -evalue 1e-10"
    blastp_sub = subprocess.Popen(command ,shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT)
    out , err = blastp_sub.communicate()
    return f"{path_tmp}/{file_name}.blast_out"

def get_best_candidate(path_blast_out, length_seq, bitscore = 75) : 
    winner = 0
    labels_blast=["qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore"]
    blast_df = pd.read_csv(path_blast_out, sep = "\t", names = labels_blast)
    if len(blast_df) > 0 :
        row = blast_df.iloc[0] 
        if row["bitscore"] > bitscore and length_seq/int(row["length"])> 0.8:
            winner = dico_cluster_r[row["sseqid"]]
        else :
            winner = "No hits"
    else :
        winner = "No hits"
    return winner

def get_winner(record , path_tmp) :
    path_func , len_func = tmp_fasta_file(record, path_tmp)
    path_blast_out_func = blast_seq(path_func , path_db, path_tmp)
    winner = get_best_candidate(path_blast_out_func, len_func)
    return winner

> Ferriol : 

In [27]:
path_seq = "/media/concha-eloko/Linux/77_strains_phage_project/rbp_work"

dico_seq = {record.description : record.seq for record in SeqIO.parse(f"{path_seq}/77_phages_Dpo_domains.2512.multi.fasta", "fasta") if len(record.seq) >0}
set_records = [record for record in SeqIO.parse(f"{path_seq}/77_phages_Dpo_domains.2512.multi.fasta", "fasta") if len(record.seq) > 0]


ferriol_winners = []
for record in tqdm(set_records) :
    winner = get_winner(record, path_tmp)
    if winner != "No hits" :
        hit = int(winner.split("_")[-1])
        results = dico_tropiseq_data.get(winner, {}).get("Scores", "No_associations")
        a = (record.description , winner, results)
    else :
        results = "Null"
    a = (record.description , winner, results)
    ferriol_winners.append(a)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 126/126 [00:06<00:00, 18.04it/s]


In [28]:
ferriol_winners

[('K17alfa62__cds_66_A_281-682',
  'Dpo_cdhit_88',
  'KL62 : 0.00010714006315332097'),
 ('K14PH164C1__cds_24_A_4_221_871.pdb', 'Dpo_cdhit_92', 'No_associations'),
 ('K64PH164C4__cds_24_A_4_178_852.pdb',
  'Dpo_cdhit_56',
  'KL64 : 1.3514145354604479e-12'),
 ('K40PH129C1__cds_56_A_4_239_860.pdb', 'No hits', 'Null'),
 ('K51PH129C1__cds_9_A_1_92_787.pdb',
  'Dpo_cdhit_47',
  'KL51 : 4.50741105912156e-19'),
 ('K15PH90__cds_55_A.pdb', 'Dpo_cdhit_182', 'KL15 : 2.4704011248342683e-22'),
 ('K21lambda1__cds_28_A.pdb',
  'Dpo_cdhit_336',
  'KL21 : 9.058690784864949e-17,KL124 : 1.2360184682751167e-05'),
 ('K26PH128C1__cds_50_A_1_97_595.pdb', 'Dpo_cdhit_87', 'No_associations'),
 ('K35PH164C3__cds_48_A_4_282_728.pdb', 'Dpo_cdhit_352', 'No_associations'),
 ('K37PH164C1__cds_47_A_1_1_307.pdb', 'Dpo_cdhit_599', 'No_associations'),
 ('K38PH09C2__cds_24_A_4_178_672.pdb', 'Dpo_cdhit_76', 'No_associations'),
 ('K27PH129C1__cds_48_A_7_200_648.pdb',
  'Dpo_cdhit_268',
  'KL27 : 2.22568076718677e-12'),
 ('K5

> Beamud : 

In [29]:
bea_winners = []

path_bea = "/media/concha-eloko/Linux/PPT_clean/in_vitro/Bea"
path_domains_bea = f"{path_bea}/DepoScope_predictions.bea.domains.0709.fasta"

bea_dico_seq = {record.description : record.seq for record in SeqIO.parse(f"{path_domains_bea}", "fasta") if len(record.seq) >0}
bea_set_records = [record for record in SeqIO.parse(f"{path_domains_bea}", "fasta") if len(record.seq) > 0]

for record in tqdm(bea_set_records) :
    winner = get_winner(record, path_tmp)
    if winner != "No hits" :
        hit = int(winner.split("_")[-1])
        results = dico_tropiseq_data.get(winner, {}).get("Scores", "No_associations")
        a = (record.description , winner, results)
    else :
        results = "Null"
    a = (record.description , winner, results)
    bea_winners.append(a)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 71/71 [00:03<00:00, 19.01it/s]


> Towndsend : 

In [30]:
towndsend_winners = []

path_towndsend = "/media/concha-eloko/Linux/PPT_clean/in_vitro/Townsed"
path_domains_towndsend = f"{path_towndsend}/DepoScope_predictions.Townsed.domains.0909.fasta"

towndsend_dico_seq = {record.description : record.seq for record in SeqIO.parse(f"{path_domains_towndsend}", "fasta") if len(record.seq) >0}
towndsend_set_records = [record for record in SeqIO.parse(f"{path_domains_towndsend}", "fasta") if len(record.seq) > 0]

for record in tqdm(towndsend_set_records) :
    winner = get_winner(record, path_tmp)
    if winner != "No hits" :
        hit = int(winner.split("_")[-1])
        results = dico_tropiseq_data.get(winner, {}).get("Scores", "No_associations")
        a = (record.description , winner, results)
    else :
        results = "Null"
    a = (record.description , winner, results)
    towndsend_winners.append(a)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 44/44 [00:02<00:00, 20.15it/s]


> Compile the results : 

In [32]:
TropiSeq_results = ferriol_winners + towndsend_winners + bea_winners
TropiSeq_results

[('K17alfa62__cds_66_A_281-682',
  'Dpo_cdhit_88',
  'KL62 : 0.00010714006315332097'),
 ('K14PH164C1__cds_24_A_4_221_871.pdb', 'Dpo_cdhit_92', 'No_associations'),
 ('K64PH164C4__cds_24_A_4_178_852.pdb',
  'Dpo_cdhit_56',
  'KL64 : 1.3514145354604479e-12'),
 ('K40PH129C1__cds_56_A_4_239_860.pdb', 'No hits', 'Null'),
 ('K51PH129C1__cds_9_A_1_92_787.pdb',
  'Dpo_cdhit_47',
  'KL51 : 4.50741105912156e-19'),
 ('K15PH90__cds_55_A.pdb', 'Dpo_cdhit_182', 'KL15 : 2.4704011248342683e-22'),
 ('K21lambda1__cds_28_A.pdb',
  'Dpo_cdhit_336',
  'KL21 : 9.058690784864949e-17,KL124 : 1.2360184682751167e-05'),
 ('K26PH128C1__cds_50_A_1_97_595.pdb', 'Dpo_cdhit_87', 'No_associations'),
 ('K35PH164C3__cds_48_A_4_282_728.pdb', 'Dpo_cdhit_352', 'No_associations'),
 ('K37PH164C1__cds_47_A_1_1_307.pdb', 'Dpo_cdhit_599', 'No_associations'),
 ('K38PH09C2__cds_24_A_4_178_672.pdb', 'Dpo_cdhit_76', 'No_associations'),
 ('K27PH129C1__cds_48_A_7_200_648.pdb',
  'Dpo_cdhit_268',
  'KL27 : 2.22568076718677e-12'),
 ('K5

In [45]:
def ordered_list_of_pvalues(tuple_list):
    # Extract the third element of each tuple and split it by comma
    pvalue_list = [element[2].split(",") for element in tuple_list]
    # Split each string to extract KLx and p-value, and create a list of tuples (KLx, p-value)
    pvalue_tuples = []
    for sublist in pvalue_list:
        for item in sublist:
            klx, pvalue = item.split(" : ")
            pvalue = '{:.3e}'.format(float(pvalue))
            pvalue_tuples.append((klx.strip(), pvalue))
    # Sort the list of tuples based on the p-values
    sorted_pvalues = sorted(pvalue_tuples, key=lambda x: x[1])
    # Format the sorted list of tuples into the desired format
    formatted_results = [f"{klx} : {pvalue}" for klx, pvalue in sorted_pvalues]
    return formatted_results

with open("/media/concha-eloko/Linux/PPT_clean/Seqbased_model.results.bit75.0804.tsv" , "w") as outfile :
    for prot in TropiSeq_results :
        prot_name = prot[0].split("_A")[0]
        if prot[1] == "No hits" :
            outfile.write(f"{prot_name}\tNo hits\n")
        elif prot[2] == "No_associations" :
            outfile.write(f"{prot_name}\tNo predictions\n")
        else :
            outfile.write(f"{prot_name}\t")
            try : 
                hits = ordered_list_of_pvalues([prot])
                outfile.write(";".join(hits))
                outfile.write("\n")
            except Exception as e :
                print(prot, e)

***
### Predictions : 

In [48]:
predictions_Seqbased_df = pd.read_csv("/media/concha-eloko/Linux/PPT_clean/Seqbased_model.results.bit75.0804.tsv", sep = "\t", names = ["protein", "predicitons"])
predictions_Seqbased_df

Unnamed: 0,protein,predicitons
0,K17alfa62__cds_66,KL62 : 1.071e-04
1,K14PH164C1__cds_24,No predictions
2,K64PH164C4__cds_24,KL64 : 1.351e-12
3,K40PH129C1__cds_56,No hits
4,K51PH129C1__cds_9,KL51 : 4.507e-19
...,...,...
236,A2a_b_00022,No predictions
237,A2a_b_00036,KL102 : 1.991e-02
238,A1i_00037,KL102 : 1.991e-02
239,A1i_00041,No predictions
