In [21]:
# Ground modules
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from Bio import SeqIO
from itertools import product
import random
from collections import Counter
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

import logging
import subprocess
from multiprocessing.pool import ThreadPool
import joblib

# SCikitlearn modules :
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report , roc_auc_score

# Scipy modules : 
from scipy.stats import fisher_exact

***

In [9]:
# Make the blastp DB of all the dpo sequences :

path_seqbased = "/media/concha-eloko/Linux/PPT_clean/Seqbased_model"

fasta_file = f"{path_seqbased}/cdhit_clusters_1710/0.85.out"

blast_command = f"makeblastdb -in {fasta_file} -dbtype prot -out {path_seqbased}/TropiSeq/TropiSeq_0.85.db"
make_blast_process = subprocess.Popen(blast_command, shell =True, stdout = subprocess.PIPE, stderr=subprocess.STDOUT)
mkblast_out, mkblast_err = make_blast_process.communicate()
print(mkblast_out , mkblast_err)

b'\n\nBuilding a new DB, current time: 10/28/2023 05:08:33\nNew DB name:   /media/concha-eloko/Linux/PPT_clean/Seqbased_model/TropiSeq/TropiSeq_0.85.db\nNew DB title:  /media/concha-eloko/Linux/PPT_clean/Seqbased_model/cdhit_clusters_1710/0.85.out\nSequence type: Protein\nKeep MBits: T\nMaximum file size: 1000000000B\nAdding sequences from FASTA; added 822 sequences in 0.0205569 seconds.\n' None


***
# Make function that :
### A : blastp from a Dpo seq
### B : read the results and spot the hits
### C : Build a vector from the presence abscence
### D : Make prediction
***

> 77 phages

In [6]:
import json

path_seqbased = "/media/concha-eloko/Linux/PPT_clean/Seqbased_model"
path_db = f"{path_seqbased}/TropiSeq/TropiSeq_0.85.db"

dico_cluster = json.load(open(f"{path_seqbased}/dico_cluster.cdhit__0.85.json"))
dico_cluster_r = {ref_dpo : key_dpo for key_dpo,list_dpo in dico_cluster.items() for ref_dpo in list_dpo}


In [7]:
path_seq = "/media/concha-eloko/Linux/77_strains_phage_project/rbp_work"
path_tmp =  "/media/concha-eloko/Linux/PPT_clean/Seqbased_model/tmp"
labels_blast=["qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore"]

dico_seq = {record.description : record.seq for record in SeqIO.parse(f"{path_seq}/77_domain_Dpo_cluster_all_seqs.fasta", "fasta") if len(record.seq) >0}
set_records = [record for record in SeqIO.parse(f"{path_seq}/77_domain_Dpo_cluster_all_seqs.fasta", "fasta") if len(record.seq) > 0]


def tmp_fasta_file(record , path_tmp) :
    name_file = "_".join(record.description.split(" "))
    path_fasta = f"{path_tmp}/{name_file}.fasta"
    length_seq = len(record.seq)
    with open(path_fasta, "w") as outfile :
        outfile.write(f">{record.description}\n{str(record.seq)}")
    return path_fasta , length_seq

def blast_seq(path_fasta, path_DB, path_tmp) :
    file_name = path_fasta.split("/")[-1]
    command = f"blastp -query {path_fasta} -db {path_DB} -out {path_tmp}/{file_name}.blast_out -outfmt 6 -evalue 1e-10"
    blastp_sub = subprocess.Popen(command ,shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT)
    out , err = blastp_sub.communicate()
    return f"{path_tmp}/{file_name}.blast_out"

def get_best_candidate(path_blast_out, length_seq) : 
    winner = 0
    labels_blast=["qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore"]
    blast_df = pd.read_csv(path_blast_out, sep = "\t", names = labels_blast)
    if len(blast_df) > 0 :
        row = blast_df.iloc[0] 
        if row["bitscore"] > 150 and length_seq/int(row["length"])> 0.8:
            winner = dico_cluster_r[row["sseqid"]]
        else :
            winner = "No hits"
    else :
        winner = "No hits"
    return winner

def get_winner(record , path_tmp) :
    path_func , len_func = tmp_fasta_file(record, path_tmp)
    path_blast_out_func = blast_seq(path_func , path_db, path_tmp)
    winner = get_best_candidate(path_blast_out_func, len_func)
    return winner

In [53]:
winners = []
for record in tqdm(set_records) :
    winner = get_winner(record, path_tmp)
    if winner != "No hits" :
        hit = int(winner.split("_")[-1])
        vector = [0]*len(dico_cluster)
        vector[hit] = 1
        vector = np.array(vector)
        a = (record.description , winner, vector)
        winners.append(a)
    else :
        vector = "Null"
    a = (record.description , winner, vector)
    winners.append(a)

100%|███████████████████████████████████████████| 74/74 [00:04<00:00, 15.08it/s]


In [46]:
len(winners)

56

In [11]:
len(dico_cluster)

822

***
# Make predictions

In [1]:
import pickle
import os
path_seqbased = "/media/concha-eloko/Linux/PPT_clean/Seqbased_model"

models_TropiSeq = {}

for rf_model in os.listdir(f"{path_seqbased}/RF_models") :
    if rf_model.count("0711.slow") > 0 and rf_model.count("fasta") == 0 :
        kltype = rf_model.split("_RF_")[1].split(".")[0]
        with open(f"{path_seqbased}/RF_models/{rf_model}", 'rb') as file:
            models_TropiSeq[kltype] = pickle.load(file)


In [2]:
models_TropiSeq

{'KL56': RandomForestClassifier(bootstrap=False, max_depth=21, max_features='auto',
                        min_samples_split=9, n_estimators=378, random_state=42),
 'KL106': RandomForestClassifier(bootstrap=False, max_depth=69, max_features='auto',
                        min_samples_leaf=2, n_estimators=200, random_state=42),
 'KL117': RandomForestClassifier(bootstrap=False, max_depth=91, max_features='auto',
                        min_samples_split=6, n_estimators=714, random_state=42),
 'KL46': RandomForestClassifier(bootstrap=False, max_depth=81, max_features='auto',
                        min_samples_split=6, n_estimators=672, random_state=42),
 'KL39': RandomForestClassifier(bootstrap=False, max_depth=87, max_features='auto',
                        min_samples_split=4, n_estimators=249, random_state=42),
 'KL9': RandomForestClassifier(bootstrap=False, max_depth=25, max_features='auto',
                        min_samples_split=8, n_estimators=368, random_state=42),
 'KL42': R

In [54]:
TropiSeq_results = {}

for _,winner in enumerate(winners) :
    if isinstance(winner[2], np.ndarray):
        tmp_positif = {}
        for kltype in models_TropiSeq :
            pred = models_TropiSeq[kltype].predict(np.array(winner[2]).reshape(1, -1))
            if pred[0] == 1 :
                tmp_positif[kltype] = pred[0]
        TropiSeq_results[winner[0]] = tmp_positif
    else : 
        TropiSeq_results[winner[0]] = "No hits"
    

In [27]:
len(winners)

74

In [56]:
import pprint
pp = pprint.PrettyPrinter(width = 150, sort_dicts = True, compact = True)
out = pp.pprint(TropiSeq_results)


{'K10PH82C1__cds_50_A_5_301_819.pdb': 'No hits',
 'K10PH82C1__cds_51_A_2_38_368.pdb': {},
 'K11PH164C1__cds_45_A_5_356_700.pdb': 'No hits',
 'K11PH164C1__cds_46_A_1_1_416.pdb': {},
 'K13PH07C1S__cds_10_A_7_32_375.pdb': 'No hits',
 'K13PH07C1S__cds_11_A_2_93_430.pdb': 'No hits',
 'K14PH164C1__cds_24_A_4_221_871.pdb': {},
 'K15PH90__cds_55_A.pdb': {'KL136': 1, 'KL15': 1},
 'K16PH164C3__cds_48_A_3_292_776.pdb': {'KL16': 1},
 'K17alfa61__cds_23_A_4_179_630.pdb': 'No hits',
 'K17alfa62__cds_64_A_3_129_548.pdb': {'KL17': 1},
 'K1PH164C1__cds_8_A_2_69_559.pdb': 'No hits',
 'K21lambda1__cds_28_A.pdb': {'KL124': 1, 'KL125': 1, 'KL21': 1, 'KL30': 1, 'KL31': 1, 'KL39': 1},
 'K22PH164C1__cds_10_A_1_1_368.pdb': {'KL111': 1},
 'K23PH08C2__cds_233_A_2_76_514.pdb': {'KL23': 1},
 'K24PH164C1__cds_8_A_2_85_402.pdb': {'KL112': 1, 'KL19': 1, 'KL21': 1, 'KL24': 1, 'KL28': 1, 'KL39': 1},
 'K25PH129C1__cds_60_A_5_311_671.pdb': {'KL25': 1},
 'K26PH128C1__cds_49_A_3_291_808.pdb': {'KL74': 1},
 'K26PH128C1__cds

In [None]:
results_clean = {}

for prot in TropiSeq_results :
    phage = prot.split("__")[0]
    if phage not in results_clean :
        results_clean[phage] = []

# 