In [1]:
# Ground modules
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from Bio import SeqIO
from itertools import product
import random
from collections import Counter
import json
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

import logging
import subprocess
from multiprocessing.pool import ThreadPool
import joblib

# SCikitlearn modules :
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report , roc_auc_score

# Scipy modules : 
from scipy.stats import fisher_exact

In [2]:
# Make the blastp DB of all the dpo sequences :

path_seqbased = "/media/concha-eloko/Linux/PPT_clean/Seqbased_model"
path_db = f"{path_seqbased}/TropiSeq/TropiSeq_0.85.db"
path_work = "/media/concha-eloko/Linux/PPT_clean"


# Run makeblast command :
fasta_file = f"{path_seqbased}/cdhit_clusters_2912/0.85.out"

blast_command = f"makeblastdb -in {fasta_file} -dbtype prot -out {path_seqbased}/TropiSeq/TropiSeq_0.85.db"
#make_blast_process = subprocess.Popen(blast_command, shell =True, stdout = subprocess.PIPE, stderr=subprocess.STDOUT)
#mkblast_out, mkblast_err = make_blast_process.communicate()
#print(mkblast_out , mkblast_err)

# Relevant files :
dico_cluster = json.load(open(f"{path_seqbased}/dico_cluster.cdhit__0.85.json"))
dico_cluster_r = {ref_dpo : key_dpo for key_dpo,list_dpo in dico_cluster.items() for ref_dpo in list_dpo}

In [9]:
final_annotation = pd.read_csv(f"{path_work}/labeling_depo_clusters.PPT.0804.tsv", sep = "\t", header = 0)
final_annotation_tropiseq = final_annotation[final_annotation["Tropiseq_KLtypes"] != "None"]
final_annotation_tropiseq = final_annotation_tropiseq.drop_duplicates(subset = ["depo_cluster", "Tropiseq_KLtypes"])
final_annotation_tropiseq = final_annotation_tropiseq[["depo_cluster", "Tropiseq_KLtypes", "Tropiseq_scores"]]

dico_tropiseq_data = {row["depo_cluster"] : {"KL_types" : row["Tropiseq_KLtypes"], "Scores" : row["Tropiseq_scores"]} for _, row in final_annotation_tropiseq.iterrows()}
dico_tropiseq_data

{'Dpo_cdhit_12': {'KL_types': 'KL30',
  'Scores': 'KL30 : 0.00017771665698402528'},
 'Dpo_cdhit_233': {'KL_types': 'KL19',
  'Scores': 'KL19 : 0.00013812934086225345'},
 'Dpo_cdhit_280': {'KL_types': 'KL25',
  'Scores': 'KL25 : 0.0008645001322490626'},
 'Dpo_cdhit_182': {'KL_types': 'KL15',
  'Scores': 'KL15 : 2.4704011248342683e-22'},
 'Dpo_cdhit_126': {'KL_types': 'KL64',
  'Scores': 'KL64 : 0.0004611112383402333'},
 'Dpo_cdhit_132': {'KL_types': 'KL106',
  'Scores': 'KL106 : 8.518572858997499e-26'},
 'Dpo_cdhit_74': {'KL_types': 'KL28,KL23',
  'Scores': 'KL28 : 0.0012451648171312574,KL23 : 0.00012303924662079074'},
 'Dpo_cdhit_711': {'KL_types': 'KL1', 'Scores': 'KL1 : 0.0020476851628455045'},
 'Dpo_cdhit_48': {'KL_types': 'KL19',
  'Scores': 'KL19 : 2.937170125714393e-11'},
 'Dpo_cdhit_51': {'KL_types': 'KL47',
  'Scores': 'KL47 : 3.2559635989706166e-08'},
 'Dpo_cdhit_72': {'KL_types': 'KL28',
  'Scores': 'KL28 : 5.227116747174737e-10'},
 'Dpo_cdhit_30': {'KL_types': 'KL60',
  'Sco

In [4]:
dico_pred = json.load(open("/media/concha-eloko/Linux/PPT_clean/Seqbased_model/prediction_based.labeling.0604.json"))
dico_pred_correct_name = {f"Dpo_cdhit_{cluster.split('_')[1]}":hits  for cluster, hits in dico_pred.items()}


***
## Make the predictions :

In [6]:
path_tmp =  "/media/concha-eloko/Linux/PPT_clean/Seqbased_model/tmp"
labels_blast=["qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore"]

def tmp_fasta_file(record , path_tmp) :
    name_file = "_".join(record.description.split(" "))
    path_fasta = f"{path_tmp}/{name_file}.fasta"
    length_seq = len(record.seq)
    with open(path_fasta, "w") as outfile :
        outfile.write(f">{record.description}\n{str(record.seq)}")
    return path_fasta , length_seq

def blast_seq(path_fasta, path_DB, path_tmp) :
    file_name = path_fasta.split("/")[-1]
    command = f"blastp -query {path_fasta} -db {path_DB} -out {path_tmp}/{file_name}.blast_out -outfmt 6 -evalue 1e-10"
    blastp_sub = subprocess.Popen(command ,shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT)
    out , err = blastp_sub.communicate()
    return f"{path_tmp}/{file_name}.blast_out"

def get_best_candidate(path_blast_out, length_seq, bitscore = 75) : 
    winner = 0
    labels_blast=["qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore"]
    blast_df = pd.read_csv(path_blast_out, sep = "\t", names = labels_blast)
    if len(blast_df) > 0 :
        row = blast_df.iloc[0] 
        if row["bitscore"] > bitscore and length_seq/int(row["length"])> 0.8:
            winner = dico_cluster_r[row["sseqid"]]
        else :
            winner = "No hits"
    else :
        winner = "No hits"
    return winner

def get_winner(record , path_tmp) :
    path_func , len_func = tmp_fasta_file(record, path_tmp)
    path_blast_out_func = blast_seq(path_func , path_db, path_tmp)
    winner = get_best_candidate(path_blast_out_func, len_func)
    return winner

> Others : 

In [7]:
path_seq = "/media/concha-eloko/Linux/PPT_clean/in_vitro"

dico_seq = {record.description : record.seq for record in SeqIO.parse(f"{path_seq}/Others_all.dpos_domains.multi.fasta", "fasta") if len(record.seq) >0}
set_records = [record for record in SeqIO.parse(f"{path_seq}/Others_all.dpos_domains.multi.fasta", "fasta") if len(record.seq) > 0]


other_winners = []
for record in tqdm(set_records) :
    winner = get_winner(record, path_tmp)
    if winner != "No hits" :
        hit = int(winner.split("_")[-1])
        results = dico_pred_correct_name.get(winner, {})
        a = (record.description.split(",")[0] , winner, results)
    else :
        results = "Null"
    a = (record.description.split(",")[0] , winner, results)
    other_winners.append(a)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 56/56 [00:02<00:00, 22.09it/s]


> Compile the results : 

In [9]:
TropiSeq_results = other_winners

In [10]:
with open("/media/concha-eloko/Linux/PPT_clean/Seqbased_model.results.bit75.2406.Others.tsv" , "w") as outfile :
    for prot in TropiSeq_results :
        prot_name = prot[0].split("_A")[0]
        if prot[1] == "No hits" :
            outfile.write(f"{prot_name}\tNo_hits\n")
        elif prot[2] == {} :
            outfile.write(f"{prot_name}\tNo_associations\n")
        else :
            try :
                hits = [f"{kltype}:{round(score,3)}" for kltype, score in prot[2].items()]
                outfile.write(f"{prot_name}\t")
                outfile.write(" ; ".join(hits))
                outfile.write("\n")
            except Exception as e :
                print(prot, e)

***
### Predictions : 

In [11]:
predictions_Seqbased_df = pd.read_csv("/media/concha-eloko/Linux/PPT_clean/Seqbased_model.results.bit75.2406.Others.tsv", sep = "\t", names = ["protein", "predicitons"])
predictions_Seqbased_df

Unnamed: 0,protein,predicitons
0,MZ322895.1_prot_QWY13631.1_35,No_associations
1,ON146449.1_prot_UPW35138.1_1,KL2:0.639
2,ON146449.1_prot_UPW35150.1_13,KL102:0.737
3,MZ571831.1_prot_UEW68236.1_80,No_hits
4,MZ612130.1_prot_QYC51043.1_10,No_associations
5,MT197175.1_prot_QIW86415.1_45,KL102:0.737
6,MT197175.1_prot_QIW86419.1_49,KL110:0.725 ; KL37:0.891 ; KL106:0.503 ; KL22:...
7,MT197175.1_prot_QIW86428.1_58,KL29:0.545
8,MZ571832.1_prot_UEP19662.1_17,No_hits
9,MZ571832.1_prot_UEP19667.1_22,KL102:0.737
