In [2]:
# Ground modules
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from Bio import SeqIO
from itertools import product
import random
from collections import Counter
import warnings
import logging
import subprocess
from multiprocessing.pool import ThreadPool

# SCikitlearn modules :
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report , roc_auc_score

# Scipy modules : 
from scipy.stats import fisher_exact

***

In [9]:
# Make the blastp DB of all the dpo sequences :

path_seqbased = "/media/concha-eloko/Linux/PPT_clean/Seqbased_model"

fasta_file = f"{path_seqbased}/cdhit_clusters_1710/0.85.out"

blast_command = f"makeblastdb -in {fasta_file} -dbtype prot -out {path_seqbased}/TropiSeq/TropiSeq_0.85.db"
make_blast_process = subprocess.Popen(blast_command, shell =True, stdout = subprocess.PIPE, stderr=subprocess.STDOUT)
mkblast_out, mkblast_err = make_blast_process.communicate()
print(mkblast_out , mkblast_err)

b'\n\nBuilding a new DB, current time: 10/28/2023 05:08:33\nNew DB name:   /media/concha-eloko/Linux/PPT_clean/Seqbased_model/TropiSeq/TropiSeq_0.85.db\nNew DB title:  /media/concha-eloko/Linux/PPT_clean/Seqbased_model/cdhit_clusters_1710/0.85.out\nSequence type: Protein\nKeep MBits: T\nMaximum file size: 1000000000B\nAdding sequences from FASTA; added 822 sequences in 0.0205569 seconds.\n' None


***
# Make function that :
### A : blastp from a Dpo seq
### B : read the results and spot the hits
### C : Build a vector from the presence abscence
### D : Make prediction
***

> 77 phages

In [41]:
import json

path_seqbased = "/media/concha-eloko/Linux/PPT_clean/Seqbased_model"
path_db = f"{path_seqbased}/TropiSeq/TropiSeq_0.85.db"

dico_cluster = json.load(open(f"{path_seqbased}/dico_cluster.cdhit__0.85.json"))
dico_cluster_r = {ref_dpo : key_dpo for key_dpo,list_dpo in dico_cluster.items() for ref_dpo in list_dpo}


In [64]:
path_seq = "/media/concha-eloko/Linux/77_strains_phage_project/rbp_work"
path_tmp =  "/media/concha-eloko/Linux/PPT_clean/Seqbased_model/tmp"
labels_blast=["qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore"]

dico_seq = {record.description : record.seq for record in SeqIO.parse(f"{path_seq}/77_domain_Dpo_cluster_all_seqs.fasta", "fasta") if len(record.seq) >0}
set_records = [record for record in SeqIO.parse(f"{path_seq}/77_domain_Dpo_cluster_all_seqs.fasta", "fasta") if len(record.seq) > 0]


def tmp_fasta_file(record , path_tmp) :
    name_file = "_".join(record.description.split(" "))
    path_fasta = f"{path_tmp}/{name_file}.fasta"
    length_seq = len(record.seq)
    with open(path_fasta, "w") as outfile :
        outfile.write(f">{record.description}\n{str(record.seq)}")
    return path_fasta , length_seq

def blast_seq(path_fasta, path_DB, path_tmp) :
    file_name = path_fasta.split("/")[-1]
    command = f"blastp -query {path_fasta} -db {path_DB} -out {path_tmp}/{file_name}.blast_out -outfmt 6 -evalue 1e-10"
    blastp_sub = subprocess.Popen(command ,shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT)
    out , err = blastp_sub.communicate()
    return f"{path_tmp}/{file_name}.blast_out"

def get_best_candidate(path_blast_out, length_seq) : 
    winner = 0
    labels_blast=["qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore"]
    blast_df = pd.read_csv(path_blast_out, sep = "\t", names = labels_blast)
    if len(blast_df) > 0 :
        row = blast_df.iloc[0] 
        if row["bitscore"] > 150 and length_seq/int(row["length"])> 0.8:
            winner = dico_cluster_r[row["sseqid"]]
        else :
            winner = "No hits"
    else :
        winner = "No hits"
    return winner

def get_winner(record , path_tmp) :
    path_func , len_func = tmp_fasta_file(record, path_tmp)
    path_blast_out_func = blast_seq(path_func , path_db, path_tmp)
    winner = get_best_candidate(path_blast_out_func, len_func)
    return winner

In [65]:
winners = []
for record in set_records :
    winner = get_winner(record, path_tmp)
    a = (record.description , winner)
    winners.append(a)

In [66]:
winners

[('K14PH164C1__cds_24_A_4_221_871.pdb', 'Dpo_cdhit_17'),
 ('K64PH164C4__cds_24_A_4_178_852.pdb', 'Dpo_cdhit_11'),
 ('K40PH129C1__cds_56_A_4_239_860.pdb', 'No hits'),
 ('K51PH129C1__cds_9_A_1_92_787.pdb', 'Dpo_cdhit_13'),
 ('K15PH90__cds_55_A.pdb', 'Dpo_cdhit_74'),
 ('K21lambda1__cds_28_A.pdb', 'Dpo_cdhit_217'),
 ('K26PH128C1__cds_50_A_1_97_595.pdb', 'Dpo_cdhit_48'),
 ('K35PH164C3__cds_48_A_4_282_728.pdb', 'Dpo_cdhit_234'),
 ('K37PH164C1__cds_47_A_1_1_307.pdb', 'Dpo_cdhit_440'),
 ('K38PH09C2__cds_24_A_4_178_672.pdb', 'Dpo_cdhit_142'),
 ('K27PH129C1__cds_48_A_7_200_648.pdb', 'Dpo_cdhit_153'),
 ('K5lambda5__cds_198_A_3_173_674.pdb', 'Dpo_cdhit_156'),
 ('K41P2__cds_11_A_7_188_506.pdb', 'Dpo_cdhit_160'),
 ('K43PH164C1__cds_40_A_3_296_718.pdb', 'Dpo_cdhit_193'),
 ('K11PH164C1__cds_46_A_1_1_416.pdb', 'Dpo_cdhit_301'),
 ('K17alfa62__cds_64_A_3_129_548.pdb', 'Dpo_cdhit_242'),
 ('K2PH164C1__cds_23_A_6_269_664.pdb', 'No hits'),
 ('K2alfa62__cds_23_A_6_269_671.pdb', 'No hits'),
 ('K56PH164C1__cds_

In [68]:
for win in winners : 
    if win[1] == 'No hits' :
        print(win[0])

K40PH129C1__cds_56_A_4_239_860.pdb
K2PH164C1__cds_23_A_6_269_664.pdb
K2alfa62__cds_23_A_6_269_671.pdb
K56PH164C1__cds_48_A_5_293_702.pdb
K82P1__cds_45_A_5_292_704.pdb
K8PH128__cds_46_A_5_294_719.pdb
K39PH122C2__cds_8_A_4_33_387.pdb
K13PH07C1S__cds_10_A_7_32_375.pdb
K82P1__cds_46_A_2_94_449.pdb
K11PH164C1__cds_45_A_5_356_700.pdb
K17alfa61__cds_23_A_4_179_630.pdb
K1PH164C1__cds_8_A_2_69_559.pdb
K5lambda5__cds_199_A_2_109_658.pdb
K10PH82C1__cds_50_A_5_301_819.pdb
K6PH25C3__cds_23_A_3_206_691.pdb
K13PH07C1S__cds_11_A_2_93_430.pdb
K54lambda2__cds_23_A_7_214_582.pdb
K80PH1317b__cds_53_A.pdb


In [62]:
winner = get_winner(set_records[0], path_tmp)
winner

'Dpo_cdhit_17'

In [54]:
pd.read_csv(path_blast_out_test, sep = "\t", names = labels_blast).iloc[0]

qseqid      K14PH164C1__cds_24_A_4_221_871.pdb
sseqid                          minibatch__842
pident                                  26.379
length                                     671
mismatch                                   438
gapopen                                     18
qstart                                       2
qend                                       646
sstart                                       2
send                                       642
evalue                                     0.0
bitscore                                   183
Name: 0, dtype: object

In [57]:
path_test , length_test = tmp_fasta_file(set_records[0], path_tmp)
path_blast_out_test = blast_seq(path_test , path_db, path_tmp)
candidates_test = get_best_candidate(path_blast_out_test, length_test)

In [58]:
candidates_test

'Dpo_cdhit_17'

In [None]:
path_seqbased = "/media/concha-eloko/Linux/PPT_clean/Seqbased_model"
import joblib

# Load the model from the file
loaded_model = joblib.load('path_to_your_saved_model.joblib')

# Now you can use `loaded_model` to make predictions
predictions = loaded_model.predict(X_test)

# 