In [None]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from itertools import product
import random
from collections import Counter
import warnings
import logging
import subprocess
from multiprocessing.pool import ThreadPool
import joblib

# SCikitlearn modules :
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report , roc_auc_score, matthews_corrcoef
from sklearn.metrics import roc_auc_score

# Scipy modules : 
from scipy.stats import fisher_exact
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer


# Working directory of the sequence-based model and the folder receiving the fitted models.
path_work = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/Seqbased_model"
path_models = f"{path_work}/RF_0702_models"

# Load the annotation table, keep only rows whose KL type carries no "|" separator,
# then drop repeated (Infected_ancestor, index, prophage_id) rows.
DF_info = pd.read_csv(f"{path_work}/TropiGATv2.final_df.tsv", sep = "\t" ,  header = 0)
DF_info_lvl_0 = DF_info[~DF_info["KL_type_LCA"].str.contains("\\|")]
DF_info_lvl_0 = DF_info_lvl_0.drop_duplicates(subset = ["Infected_ancestor","index","prophage_id"] , keep = "first").reset_index(drop=True)

# useful dictionary :
# Number of rows per KL type, and the KL types with at least 10 rows.
KLtype_count = Counter(DF_info_lvl_0["KL_type_LCA"])
KLtype_pred = [kltype for kltype, count in KLtype_count.items() if count >= 10]

# For every phage: the set of KL types found on ANY row sharing a depolymerase
# ("index") with that phage. The two lookups below are computed once instead of
# re-filtering the whole frame for every phage and every depolymerase.
kltypes_by_dpo = DF_info_lvl_0.groupby("index")["KL_type_LCA"].agg(set).to_dict()
dpos_by_phage = DF_info_lvl_0.groupby("Phage")["index"].unique().to_dict()

dico_prophage_kltype_associated = {}
for phage in tqdm(DF_info_lvl_0["Phage"].unique().tolist()) :
    kltypes = set()
    # .get(): a phage absent from the lookup (e.g. a missing value) keeps an empty set.
    for dpo in dpos_by_phage.get(phage, ()) :
        kltypes.update(kltypes_by_dpo.get(dpo, ()))
    dico_prophage_kltype_associated[phage] = kltypes

# Export one fasta record per depolymerase index (later duplicates of an index win).
depo_domains_seq = dict(zip(DF_info_lvl_0["index"], DF_info_lvl_0['domain_seq']))
with open(f"{path_work}/Dpo_domains.1710.multi.fasta" , "w") as outfile :
    for index,seq in depo_domains_seq.items() :
        outfile.write(f">{index}\n{seq}\n")


In [None]:
# KL types already present in each threshold's report file, keyed by str(threshold).
# make_prediction_file() appends to these reports, so this lets a re-run skip finished work.
dico_threshold_done = {}

names = ["KLtype", "Count","mcc","F1","recall","accuracy","AUC"]

for threshold in [0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95, 0.975] :
    report_path = f"{path_work}/RF_report.{threshold}.0702.tsv"
    # On a fresh run the report does not exist yet (or is empty): nothing is done.
    if os.path.exists(report_path) and os.path.getsize(report_path) > 0 :
        df_c = pd.read_csv(report_path, sep = "\t", names = names)
        done_kltypes = df_c["KLtype"].tolist()
    else :
        done_kltypes = []
    dico_threshold_done[str(threshold)] = done_kltypes




In [None]:
# ******************************************************
# CD-HIT step : cluster the depolymerase domain sequences.

# Multi-fasta passed to cd-hit as input (-i).
path_multi_fasta = f"{path_work}/Dpo_domains.1710.multi.fasta"
# Directory holding the cd-hit outputs, one "<threshold>.out" (+ ".out.clstr") per identity threshold.
path_tmp_cdhit = f"{path_work}/cdhit_clusters_1710"

def make_cdhit_cluster(threshold) :
    """Run cd-hit on the domain multi-fasta at one sequence-identity threshold.

    Parameters
    ----------
    threshold : float or str
        Value passed to cd-hit's ``-c`` option; also used to name the output
        ``{path_tmp_cdhit}/{threshold}.out``.

    Raises
    ------
    subprocess.CalledProcessError
        If cd-hit exits with a non-zero status (its output is printed first).
    """
    # cd-hit does not create the output directory itself.
    os.makedirs(path_tmp_cdhit, exist_ok=True)
    # Argument list instead of a shell string: no quoting issues with paths.
    # NOTE(review): cd-hit's guide recommends a smaller word size (-n 4) for
    # -c between 0.6 and 0.7; the default word size is used here -- confirm.
    cdhit_command = [
        "cd-hit",
        "-i", path_multi_fasta,
        "-o", f"{path_tmp_cdhit}/{threshold}.out",
        "-c", str(threshold),
        "-G", "0",
        "-aL", "0.8",
    ]
    cdhit_process = subprocess.run(
        cdhit_command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,   # stderr is merged into stdout
        text=True,
    )
    print(cdhit_process.stdout)
    cdhit_process.check_returncode()
    

def make_cluster_dico(cdhit_out) :
    """Parse a cd-hit ``.clstr`` file into ``{cluster_id: [member, ...]}``.

    Parameters
    ----------
    cdhit_out : str
        Path of a cd-hit output named ``<threshold>.out``; the cluster file
        read is ``<cdhit_out>.clstr``.

    Returns
    -------
    (dict, str)
        The cluster dictionary, keyed ``Dpo_cdhit_<n>`` in file order, and the
        threshold string taken from the file name. The dictionary is also
        dumped to ``{path_work}/dico_cluster.cdhit__{threshold}.json``.
    """
    import json
    threshold = cdhit_out.split("/")[-1].split(".out")[0]
    cluster_file = f"{cdhit_out}.clstr"
    # Context manager so the handle is closed even if parsing fails.
    with open(cluster_file) as infile :
        cluster_out = infile.read().split(">Cluster")
    dico_cluster = {}
    # cluster_out[0] is whatever precedes the first ">Cluster" header.
    for index,cluster in enumerate(cluster_out[1:]) :
        tmp_dpo = []
        id_cluster = f"Dpo_cdhit_{index}"
        # First line of each chunk is the cluster number; member lines follow.
        for line in cluster.split("\n")[1:] :
            if ">" not in line :
                continue    # blank/trailing line
            # Member lines look like "0\t250aa, >NAME... *": the name ends at
            # the "..." marker, so names containing a single "." stay intact.
            dpo = line.split(">")[1].split("...")[0]
            tmp_dpo.append(dpo)
        dico_cluster[id_cluster] = tmp_dpo
    with open(f"{path_work}/dico_cluster.cdhit__{threshold}.json", "w") as outfile:
        json.dump(dico_cluster, outfile)
    return dico_cluster , threshold

def make_DF_binaries(df_info , dico_cluster, threshold) :
    """Build the phage x cluster presence/absence matrix and save it as CSV.

    Parameters
    ----------
    df_info : pandas.DataFrame
        Table with at least the columns ``Phage`` and ``index``.
    dico_cluster : dict
        ``{cluster_id: [member, ...]}`` as returned by ``make_cluster_dico``.
    threshold : str
        Only used to name the output ``{path_work}/DF_binaries_{threshold}.csv``.

    Returns
    -------
    pandas.DataFrame
        One row per unique phage (order of first appearance), one column per
        cluster; 1 when the phage has at least one ``index`` in the cluster.
    """
    phages = df_info.Phage.unique()
    # Both lookups are computed once rather than per phage x cluster.
    cluster_members = [set(dpos) for dpos in dico_cluster.values()]
    dpos_by_phage = df_info.groupby("Phage")["index"].agg(set).to_dict()

    all_dpo_binaries = []
    for phage in phages :
        # A phage missing from the lookup (e.g. a missing value) gets an all-zero row.
        phage_dpos = dpos_by_phage.get(phage, set())
        dpo_binary = [0 if members.isdisjoint(phage_dpos) else 1 for members in cluster_members]
        all_dpo_binaries.append(dpo_binary)

    df_dpo_prophages = pd.DataFrame(all_dpo_binaries, index=phages, columns=list(dico_cluster.keys()))
    df_dpo_prophages.to_csv(f"{path_work}/DF_binaries_{threshold}.csv", sep = ",", index = True, header = True)

    return df_dpo_prophages


def make_DF_kltype(df_info, df ,KL_type , dico_cluster,ratio = 5, collapse = False, seed = None) :
    """Assemble the feature matrix and labels for one KL type.

    Parameters
    ----------
    df_info : pandas.DataFrame
        Table with the columns ``KL_type_LCA`` and ``Phage``.
    df : pandas.DataFrame
        Presence/absence matrix indexed by phage (see ``make_DF_binaries``).
    KL_type : str
        KL type treated as the positive class.
    dico_cluster : dict
        Cluster dictionary; its keys become the column names of the result.
    ratio : int or float, default 5
        Number of negative phages drawn per positive phage.
    collapse : bool, default False
        When true, drop the columns that are 0 for every selected phage.
    seed : int or None, default None
        Seed for the negative sampling; ``None`` uses the global ``random`` state.

    Returns
    -------
    (pandas.DataFrame, list)
        Positives first, then negatives, with the matching 1/0 labels.
    """
    # positive data : phages carrying KL_type, one row each.
    positive_phages = df_info[df_info["KL_type_LCA"] == KL_type]["Phage"].unique()
    df_positives = df[df.index.isin(positive_phages)]
    df_positives = df_positives[~df_positives.index.duplicated(keep='first')]

    # negative data : phages never associated with KL_type.
    negative_phages = [
        phage for phage in df_info["Phage"].unique().tolist()
        if KL_type not in dico_prophage_kltype_associated[phage]
    ]
    # Cap the draw: random.sample raises when asked for more than is available.
    n_negatives = min(int(len(df_positives) * ratio), len(negative_phages))
    sampler = random if seed is None else random.Random(seed)
    negative_phages_selected = sampler.sample(negative_phages, n_negatives)
    df_negatives = df[df.index.isin(negative_phages_selected)]

    all_binaries = np.concatenate((df_positives.values, df_negatives.values))
    all_labels = [1] * len(df_positives) + [0] * len(df_negatives)
    # isin() returns rows in df order, not in sampling order, so the row names
    # must come from the selected frames themselves.
    all_indices = list(df_positives.index) + list(df_negatives.index)

    df_kl = pd.DataFrame(all_binaries, index=all_indices, columns=list(dico_cluster.keys()))
    if collapse :
        df_kl = df_kl.loc[:, df_kl.sum(axis=0) > 0]
    return (df_kl , all_labels)


def fit_rf_model_random_search(df_kl , all_labels,KL_type, threshold):
    """Tune a random forest with a Bayesian search, save it and score it.

    Parameters
    ----------
    df_kl : pandas.DataFrame
        Feature matrix (rows = phages, columns = clusters).
    all_labels : list
        1/0 label per row of ``df_kl``.
    KL_type, threshold : str
        Only used to name the saved model
        ``{path_models}/{threshold}_RF_{KL_type}.0702.joblib``.

    Returns
    -------
    (float, dict, float)
        Matthews correlation coefficient, ``classification_report`` as a dict,
        and ROC AUC on the test split (``nan`` when it cannot be computed).
    """
    n_iters = 100
    X_train, X_holdout, y_train, y_holdout = train_test_split(df_kl, all_labels, test_size=0.3, random_state=243)
    # The hold-out is split again; the second part is not used in this function.
    X_test, _X_eval, y_test, _y_eval = train_test_split(X_holdout, y_holdout, test_size=0.33, random_state=243)
    search_space = {
        'bootstrap': Categorical([True, False]),
        'max_depth': Integer(10, 100),
        # 'auto' was an alias of 'sqrt' for classifiers and has been removed
        # from scikit-learn; 'log2' is offered as the second option instead.
        'max_features': Categorical(['sqrt', 'log2']),
        'min_samples_leaf': Integer(1, 4),
        'min_samples_split': Integer(2, 10),
        'n_estimators': Integer(200, 800)
    }
    rf = RandomForestClassifier(random_state=42)
    # random_state makes the search itself repeatable.
    bayes_search = BayesSearchCV(rf, search_space, n_iter=n_iters, cv=5, n_jobs=-1, random_state=42)
    bayes_search.fit(X_train, y_train)
    best_model = bayes_search.best_estimator_
    # Predicting with the best estimator
    predictions = bayes_search.predict(X_test)
    report = classification_report(y_test, predictions, output_dict=True)
    joblib.dump(best_model, f'{path_models}/{threshold}_RF_{KL_type}.0702.joblib')
    mcc = matthews_corrcoef(y_test, predictions)
    # AUC needs a continuous score (probability of class 1), not hard labels,
    # and is undefined when the test split holds a single class.
    classes = list(best_model.classes_)
    if len(set(y_test)) == 2 and 1 in classes :
        positive_scores = best_model.predict_proba(X_test)[:, classes.index(1)]
        auc = roc_auc_score(y_test, positive_scores)
    else :
        auc = float("nan")
    return mcc , report , auc

# Clustering itself is left disabled here; uncomment to (re)run cd-hit at every threshold.
#list(map(make_cdhit_cluster , [0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95, 0.975]))
# Keep the entries of the cd-hit directory whose name ends in "out" (this skips the ".clstr" files).
cdhit_outputs = filter(lambda name: name.endswith("out"), os.listdir(path_tmp_cdhit))
cdhit_files = [f"{path_tmp_cdhit}/{name}" for name in cdhit_outputs]

def make_prediction_file (path_file) :
    """Fit and report one model per KL type for a single cd-hit output.

    For every KL type not already listed in ``dico_threshold_done`` for this
    threshold, one tab-separated line is appended to
    ``{path_work}/RF_report.{threshold}.0702.tsv``: either the metrics of the
    fitted model, or a "No sufficient Data" marker when the KL type has fewer
    than 5 rows.

    Parameters
    ----------
    path_file : str
        Path of a cd-hit ``<threshold>.out`` file.
    """
    dico_cluster , threshold = make_cluster_dico(f"{path_file}")
    df_binaries = make_DF_binaries(DF_info_lvl_0 , dico_cluster, threshold)
    # .get(): a threshold with no preloaded report simply has nothing done yet.
    done_kltypes = set(dico_threshold_done.get(threshold, []))
    report_path = f"{path_work}/RF_report.{threshold}.0702.tsv"
    missing = float("nan")
    for KL_type, count in KLtype_count.items() :
        if KL_type in done_kltypes :
            continue
        if count < 5 :
            line = f"{KL_type}\t{count}\t***No sufficient Data\n"
        else :
            df_kl , all_labels = make_DF_kltype(DF_info_lvl_0 ,df_binaries, KL_type , dico_cluster, collapse = False)
            mcc , report, auc = fit_rf_model_random_search(df_kl , all_labels, KL_type,threshold)
            # The '1' entry is absent when the test split and the predictions hold no positive.
            positive = report.get('1', {})
            line = f"{KL_type}\t{count}\t{mcc}\t{positive.get('f1-score', missing)}\t{positive.get('recall', missing)}\t{report['accuracy']}\t{auc}\n"
        # Open only to append the finished line, not for the whole training.
        with open(report_path, "a+") as outfile :
            outfile.write(line)

if __name__ == '__main__':
    # Process the cd-hit outputs concurrently, one make_prediction_file call per file.
    pool = ThreadPool(20)
    try:
        pool.map(make_prediction_file, cdhit_files)
    finally:
        # Same clean-up the pool performs when leaving a `with` block.
        pool.terminate()


In [None]:
# Shell command: copy the remote RF_1912_models_info directory into the local
# PPT_clean folder with rsync over ssh (archive, verbose, compressed, human-readable).
# NOTE(review): this is not Python -- presumably meant for a terminal, or needs `!` / `%%bash` in a notebook cell; confirm.
rsync -avzhe ssh \
conchae@garnatxa.srv.cpd:/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/Seqbased_model/RF_1912_models_info \
/media/concha-eloko/Linux/PPT_clean/  

In [None]:
/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/Seqbased_model/RF_1912_models
/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/Seqbased_model/RF_2912_models

In [None]:
# Shell command: copy the remote RF_2912_models directory into the local
# PPT_clean folder with rsync over ssh (archive, verbose, compressed, human-readable).
# NOTE(review): this is not Python -- presumably meant for a terminal, or needs `!` / `%%bash` in a notebook cell; confirm.
rsync -avzhe ssh \
conchae@garnatxa.srv.cpd:/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/Seqbased_model/RF_2912_models \
/media/concha-eloko/Linux/PPT_clean/