In [None]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from itertools import product
import random
from collections import Counter
import warnings
import logging
import subprocess
from multiprocessing.pool import ThreadPool
import joblib

# Scikit-learn modules :
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import classification_report , roc_auc_score, matthews_corrcoef
from sklearn.metrics import roc_auc_score

# Scipy modules : 
from scipy.stats import fisher_exact
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from statistics import mean

# Root folder of the sequence-based model and the sub-folders used for models / test data.
path_work = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/Seqbased_model"
path_models = f"{path_work}/RF_1302_models"
path_testing = f"{path_work}/RF_1302_data"

# Master table: tab-separated, first line is the header.
DF_info = pd.read_csv(f"{path_work}/TropiGATv2.final_df_v2.tsv", sep="\t", header=0)
# One row per prophage (first occurrence of each "Phage" value).
df_prophages = DF_info.drop_duplicates(subset=["Phage"], keep="first")
# Phage -> {"prophage_strain": its prophage_id, "ancestor": its Infected_ancestor}
dico_prophage_info = {
    phage_name: {"prophage_strain": strain_id, "ancestor": ancestor_id}
    for phage_name, strain_id, ancestor_id in zip(
        df_prophages["Phage"],
        df_prophages["prophage_id"],
        df_prophages["Infected_ancestor"],
    )
}

def get_filtered_prophages(prophage) :
    """Split the prophages that share `prophage`'s strain and ancestor into kept / excluded.

    The group is every row of DF_info whose "prophage_id" and
    "Infected_ancestor" equal those recorded for `prophage` in
    dico_prophage_info.  Within the group, another prophage is excluded when
    its set of "domain_seq" values equals the one of `prophage`, or equals the
    set of a prophage already kept; otherwise it is kept.

    Returns
    -------
    tuple
        (rows of the group, set of excluded prophages, set of kept prophages).
        `prophage` itself is always in the kept set.
    """
    reference = dico_prophage_info[prophage]
    same_strain = DF_info["prophage_id"] == reference["prophage_strain"]
    same_ancestor = DF_info["Infected_ancestor"] == reference["ancestor"]
    df_prophage_group = DF_info[same_strain & same_ancestor]

    to_keep = {prophage}
    to_exclude = set()
    # A single-row group has nothing to compare against.
    if len(df_prophage_group) == 1 :
        return df_prophage_group , to_exclude , to_keep

    def domains_of(name) :
        # Set of domain sequences carried by one prophage of the group.
        rows = df_prophage_group["Phage"] == name
        return set(df_prophage_group[rows]["domain_seq"].values)

    depo_set = domains_of(prophage)
    seen_sets = []
    for other in df_prophage_group["Phage"].unique().tolist() :
        if other == prophage :
            continue
        other_set = domains_of(other)
        if other_set == depo_set or other_set in seen_sets :
            # Same content as the reference or as an already kept prophage.
            to_exclude.add(other)
        else :
            to_keep.add(other)
            seen_sets.append(other_set)
    return df_prophage_group , to_exclude , to_keep

good_prophages = set()
excluded_prophages = set()

# Classify each prophage once: those already sorted into either set by an
# earlier group are skipped.
for prophage, info_prophage in tqdm(dico_prophage_info.items()) :
    if prophage in excluded_prophages or prophage in good_prophages :
        continue
    _, excluded_members , kept_members = get_filtered_prophages(prophage)
    good_prophages |= kept_members
    excluded_prophages |= excluded_members

# Keep the rows of the retained prophages ...
is_good_prophage = DF_info["Phage"].isin(good_prophages)
DF_info_lvl_0_filtered = DF_info[is_good_prophage]
# ... then drop the rows whose KL type contains a "|" character.
has_pipe = DF_info_lvl_0_filtered["KL_type_LCA"].str.contains("\\|")
DF_info_lvl_0_final = DF_info_lvl_0_filtered[~has_pipe]

# Independent copy used by everything downstream.
DF_info_lvl_0 = DF_info_lvl_0_final.copy()

# Row count per KL type, and the KL types that appear on at least 10 rows.
KLtype_count = Counter(DF_info_lvl_0["KL_type_LCA"])
KLtype_pred = [kltype for kltype, n_rows in KLtype_count.items() if n_rows >= 10]

# Phage -> set of every KL type found on any row that shares one of the
# phage's depolymerase ids ("index" column).
# Built in two linear passes over the table instead of re-filtering the whole
# DataFrame once per phage and once per depolymerase, so no progress bar is
# needed any more.
kltypes_by_dpo = {}
for dpo, kltype in zip(DF_info_lvl_0["index"], DF_info_lvl_0["KL_type_LCA"]) :
    kltypes_by_dpo.setdefault(dpo, set()).add(kltype)

dico_prophage_kltype_associated = {}
for phage, dpo in zip(DF_info_lvl_0["Phage"], DF_info_lvl_0["index"]) :
    # Keys are inserted in order of first appearance, i.e. the order of
    # DF_info_lvl_0["Phage"].unique().
    dico_prophage_kltype_associated.setdefault(phage, set()).update(kltypes_by_dpo.get(dpo, ()))

# Depolymerase id -> domain sequence (for a repeated id the last row wins).
depo_domains_seq = dict(zip(DF_info_lvl_0["index"], DF_info_lvl_0['domain_seq']))
# Write every domain as a two-line FASTA record: ">id" then the sequence.
with open(f"{path_work}/Dpo_domains.2912.multi.fasta" , "w") as outfile :
    outfile.writelines(f">{index}\n{seq}\n" for index, seq in depo_domains_seq.items())
       

# ******************************************************
# CD-HIT clustering step:
# multi-FASTA given to cd-hit as input (same path as the file written above)
path_multi_fasta = f"{path_work}/Dpo_domains.2912.multi.fasta"
# folder holding the cd-hit outputs, one "<threshold>.out" per identity threshold
path_tmp_cdhit = f"{path_work}/cdhit_clusters_2912"

def make_cdhit_cluster(threshold) :
    """Run cd-hit on the domain multi-FASTA at one identity threshold.

    The command reads `path_multi_fasta` and writes
    `{path_tmp_cdhit}/{threshold}.out`.  It is passed as an argument list
    (no shell), so paths containing spaces or shell metacharacters are handled
    safely.  The captured output is printed, and a non-zero exit status is
    reported instead of being silently ignored.

    Parameters
    ----------
    threshold : float or str
        Value given to cd-hit's `-c` option; also used as the output file stem.
    """
    cdhit_command = [
        "cd-hit",
        "-i", path_multi_fasta,
        "-o", f"{path_tmp_cdhit}/{threshold}.out",
        "-c", str(threshold),
        "-G", "0",
        "-aL", "0.8",
    ]
    try :
        # stderr is merged into stdout, so `completed.stderr` is always None.
        completed = subprocess.run(cdhit_command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    except FileNotFoundError as error :
        # Without a shell, a missing executable raises instead of returning 127;
        # keep the best-effort behaviour and just report it.
        print(f"cd-hit could not be started: {error}")
        return
    print(completed.stdout, completed.stderr)
    if completed.returncode != 0 :
        print(f"cd-hit exited with status {completed.returncode} for threshold {threshold}")

def make_cluster_dico(cdhit_out) :
    """Parse the `.clstr` file of a cd-hit run into a cluster -> members dictionary.

    Parameters
    ----------
    cdhit_out : str
        Path of the cd-hit output ("<dir>/<threshold>.out"); the cluster file
        read is `cdhit_out + ".clstr"`.

    Returns
    -------
    tuple
        (dico_cluster, threshold): `dico_cluster` maps "Dpo_cdhit_<i>" to the
        list of sequence ids of the i-th cluster, and `threshold` is the file
        stem before ".out".  The dictionary is also written to
        `{path_work}/dico_cluster.cdhit__{threshold}.json`.
    """
    import json
    threshold = cdhit_out.split("/")[-1].split(".out")[0]
    cluster_file = f"{cdhit_out}.clstr"
    # Context manager so the file handle is closed (it used to be leaked).
    with open(cluster_file) as infile :
        cluster_out = infile.read().split(">Cluster")
    dico_cluster = {}
    # cluster_out[0] is whatever precedes the first ">Cluster" header.
    for index, cluster in enumerate(cluster_out[1:]) :
        tmp_dpo = []
        # The first line of a chunk is the cluster number; member lines carry
        # a ">".  Filtering on ">" instead of slicing off the last line keeps
        # the final member when the file does not end with a newline.
        for line in cluster.split("\n")[1:] :
            if ">" not in line :
                continue
            # In the .clstr format the id sits between ">" and the "..." that
            # follows it; cutting at "..." keeps ids that contain a single ".".
            tmp_dpo.append(line.split(">", 1)[1].rsplit("...", 1)[0])
        dico_cluster[f"Dpo_cdhit_{index}"] = tmp_dpo
    with open(f"{path_work}/dico_cluster.cdhit__{threshold}.json", "w") as outfile :
        json.dump(dico_cluster, outfile)
    return dico_cluster , threshold

def make_DF_binaries(df_info , dico_cluster, threshold) :
    """Build the phage x cluster presence/absence table and save it as CSV.

    Parameters
    ----------
    df_info : pandas.DataFrame
        Table with at least the "Phage" and "index" columns.
    dico_cluster : dict
        Cluster id -> list of depolymerase ids.
    threshold : str or float
        Only used in the output file name `DF_binaries_{threshold}.csv`.

    Returns
    -------
    pandas.DataFrame
        One row per unique phage (order of `df_info.Phage.unique()`), one
        column per cluster; a cell is 1 when the phage owns at least one
        member of the cluster, else 0.
    """
    # Built once instead of once per (phage, cluster) pair.  Ids are compared
    # as strings: the FASTA headers written from the "index" column are text,
    # so ids parsed back from cd-hit are str even when the column is numeric.
    cluster_members = [set(map(str, dpos)) for dpos in dico_cluster.values()]

    # Phage -> set of its depolymerase ids, gathered in a single pass.
    # Rows with a missing phage name are skipped, so such a phage gets an
    # all-zero row, exactly as an equality filter on NaN would produce.
    known = df_info[df_info["Phage"].notna()]
    dpos_by_phage = {}
    for phage, dpo in zip(known["Phage"], known["index"]) :
        dpos_by_phage.setdefault(phage, set()).add(str(dpo))

    phages = df_info.Phage.unique()
    no_dpos = frozenset()
    all_dpo_binaries = [
        [int(not members.isdisjoint(dpos_by_phage.get(phage, no_dpos))) for members in cluster_members]
        for phage in phages
    ]
    df_dpo_prophages = pd.DataFrame(all_dpo_binaries, index=phages, columns=dico_cluster.keys())
    df_dpo_prophages.to_csv(f"{path_work}/DF_binaries_{threshold}.csv", sep = ",", index = True, header = True)
    return df_dpo_prophages

def make_DF_kltype(df_info, df ,KL_type , dico_cluster,ratio = 5, collapse = False, random_state = None) :
    """Assemble the feature table and labels used to train the model of one KL type.

    Parameters
    ----------
    df_info : pandas.DataFrame
        Table with the "Phage" and "KL_type_LCA" columns.
    df : pandas.DataFrame
        Binary phage x cluster table, indexed by phage.
    KL_type : str
        KL type treated as the positive class.
    dico_cluster : dict
        Cluster dictionary; its keys become the column names of the result.
    ratio : int or float
        Number of negatives drawn per positive.
    collapse : bool
        When true, columns whose sum is 0 are dropped.
    random_state : int or None
        Seed for the negative sampling; None keeps using the global `random`
        state, as before.

    Returns
    -------
    tuple
        (df_kl, all_labels): positives first (label 1), then negatives (label 0).
    """
    # positive data : phages annotated with KL_type, one row per phage
    positive_phages = df_info[df_info["KL_type_LCA"] == KL_type]["Phage"].unique()
    df_positives = df[df.index.isin(positive_phages)]
    df_positives = df_positives[~df_positives.index.duplicated(keep='first')]
    n_samples = len(df_positives)

    # negative data : phages with no association at all to KL_type
    negative_phages = [
        phage for phage in df_info["Phage"].unique().tolist()
        if KL_type not in dico_prophage_kltype_associated[phage]
    ]
    # random.sample raises when asked for more items than available; cap the draw.
    n_negatives = min(int(n_samples * ratio), len(negative_phages))
    sampler = random if random_state is None else random.Random(random_state)
    negative_phages_selected = sampler.sample(negative_phages, n_negatives)
    df_negatives = df[df.index.isin(negative_phages_selected)]

    all_binaries = np.concatenate((df_positives.values, df_negatives.values))
    all_labels = [1] * len(df_positives) + [0] * len(df_negatives)
    # The isin() filter returns rows in the order of `df`, not in the order of
    # the random draw, so the labels of the negative rows must come from the
    # filtered frame itself; using the sampled list mislabelled the rows.
    all_indices = list(df_positives.index) + list(df_negatives.index)
    df_kl = pd.DataFrame(all_binaries, index=all_indices, columns=dico_cluster.keys())
    if collapse :
        df_kl = df_kl.loc[:, df_kl.sum(axis=0) > 0]
    return (df_kl , all_labels)


def _average_fold_reports(report_list):
    """Average the per-fold dictionaries from ``classification_report(..., output_dict=True)``.

    Those dictionaries are keyed by row first (one entry per class label plus
    the average rows) and by metric second, with "accuracy" stored as a bare
    number.  The result is keyed by metric first:
    ``{'precision': {row: mean}, 'recall': ..., 'f1-score': ..., 'support': ..., 'accuracy': mean}``.
    Only rows present in every fold are averaged.
    """
    metrics = ('precision', 'recall', 'f1-score', 'support')
    rows = [
        row for row, values in report_list[0].items()
        if isinstance(values, dict) and all(isinstance(r.get(row), dict) for r in report_list)
    ]
    avg_report = {
        metric: {row: mean([float(r[row][metric]) for r in report_list]) for row in rows}
        for metric in metrics
    }
    if all('accuracy' in r for r in report_list):
        avg_report['accuracy'] = mean([float(r['accuracy']) for r in report_list])
    return avg_report


def fit_rf_model_random_search(df_kl, all_labels, KL_type, threshold, path_models, n_splits=5, n_iters=100):
    """Tune and cross-validate a random forest for one KL type, then save the best fold.

    Hyperparameters are searched with BayesSearchCV on the first outer fold
    only and reused for the remaining folds.  The model with the highest MCC
    and its own test set are written to
    `{path_models}/{threshold}_RF_{KL_type}.joblib` and
    `{path_models}/{threshold}_test_data_{KL_type}.joblib`.

    Parameters
    ----------
    df_kl : pandas.DataFrame
        Feature table, one row per phage.
    all_labels : list
        0/1 labels aligned with the rows of `df_kl`.
    KL_type, threshold : used in the output file names.
    path_models : str
        Output folder.
    n_splits : int
        Number of outer stratified folds.
    n_iters : int
        Number of BayesSearchCV iterations.

    Returns
    -------
    tuple
        (mean MCC, averaged classification report, mean ROC AUC) over the folds.
    """
    mcc_scores = []
    auc_scores = []
    report_list = []
    best_models = []
    test_sets = []
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=243)
    # Loop-invariant objects, built once.
    all_labels_series = pd.Series(all_labels)
    # 'auto' is no longer an accepted max_features value in recent
    # scikit-learn releases (for classifiers it was an alias of 'sqrt'),
    # so the search uses 'sqrt' and 'log2' instead.
    param_grid = {
        'bootstrap': Categorical([True, False]),
        'max_depth': Integer(10, 100),
        'max_features': Categorical(['sqrt', 'log2']),
        'min_samples_leaf': Integer(1, 4),
        'min_samples_split': Integer(2, 10),
        'n_estimators': Integer(200, 800)
    }
    best_params = None
    for train_index, test_index in skf.split(df_kl, all_labels):
        X_train, X_test = df_kl.iloc[train_index], df_kl.iloc[test_index]
        y_train = all_labels_series.iloc[train_index]
        y_test = all_labels_series.iloc[test_index]
        if best_params is None:
            # First fold: run the hyperparameter search.
            rf = RandomForestClassifier(random_state=42)
            print(rf , param_grid, n_iters)
            bayes_search = BayesSearchCV(rf, param_grid, n_iter=n_iters, cv=4, n_jobs=-1)
            bayes_search.fit(X_train, y_train)
            best_params = bayes_search.best_params_
            best_model = bayes_search.best_estimator_
        else:
            # Later folds: reuse the hyperparameters found on the first fold.
            best_model = RandomForestClassifier(random_state=42, **best_params)
            best_model.fit(X_train, y_train)
        best_models.append(best_model)
        print("that round worked")
        print(train_index, test_index, best_models)
        # Make predictions on testing data
        predictions = best_model.predict(X_test)
        mcc_scores.append(matthews_corrcoef(y_test, predictions))
        # ROC AUC needs a continuous score: use the predicted probability of
        # class 1 rather than the hard 0/1 predictions.
        positive_column = list(best_model.classes_).index(1)
        positive_scores = best_model.predict_proba(X_test)[:, positive_column]
        auc_scores.append(roc_auc_score(y_test, positive_scores))
        report_list.append(classification_report(y_test, predictions, output_dict=True))
        # Keep the testing set of this fold
        test_sets.append((X_test, y_test))
    mean_mcc = mean(mcc_scores)
    mean_auc = mean(auc_scores)
    # classification_report dictionaries are keyed by row, not by metric;
    # indexing them with 'precision' directly raised KeyError.
    avg_report = _average_fold_reports(report_list)
    # Save the model of the best fold together with that same fold's test set
    # (previously the last fold's model was saved next to the best fold's data).
    best_model_index = mcc_scores.index(max(mcc_scores))
    joblib.dump(best_models[best_model_index], f'{path_models}/{threshold}_RF_{KL_type}.joblib')
    joblib.dump(test_sets[best_model_index], f'{path_models}/{threshold}_test_data_{KL_type}.joblib')
    return mean_mcc, avg_report, mean_auc
    
# The cluster files listed below are produced by this (disabled) call:
#list(map(make_cdhit_cluster , [0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95, 0.975]))
# Full paths of the entries of the cd-hit folder whose name ends with "out".
cdhit_files = [
    f"{path_tmp_cdhit}/{entry}"
    for entry in os.listdir(path_tmp_cdhit)
    if entry.endswith("out")
]

def make_prediction_file (path_file, min_count = 10) :
    """Train the per-KL-type models for one cd-hit clustering output.

    Parameters
    ----------
    path_file : str
        Path of a cd-hit ".out" file; its ".clstr" companion is parsed.
    min_count : int
        Minimum number of rows a KL type must have in KLtype_count to get a
        model.  The default matches the `>= 10` rule used to build KLtype_pred.

    A KL type is skipped when its
    `{path_models}/{threshold}_RF_{KL_type}.full_data.joblib` file already exists.
    """
    dico_cluster , threshold = make_cluster_dico(f"{path_file}")
    df_binaries = make_DF_binaries(DF_info_lvl_0 , dico_cluster, threshold)
    for KL_type in KLtype_count :
        # The previous `< 5` filter kept only KL types with fewer positives
        # than the 5 stratified folds used for training, so some test folds
        # necessarily had no positive example.
        if KLtype_count[KL_type] < min_count :
            continue
        if os.path.isfile(f'{path_models}/{threshold}_RF_{KL_type}.full_data.joblib') :
            continue
        df_kl , all_labels = make_DF_kltype(DF_info_lvl_0 ,df_binaries, KL_type , dico_cluster, collapse = False)
        fit_rf_model_random_search(df_kl , all_labels, KL_type,threshold,path_models)




# Script entry point: handle the cd-hit outputs with a pool of 10 worker threads.
if __name__ == "__main__":
    with ThreadPool(processes=10) as pool:
        pool.map(make_prediction_file, cdhit_files)


In [None]:
def make_prediction_file (path_file, min_count = 10) :
    """Train the per-KL-type models for one cd-hit clustering output.

    Parameters
    ----------
    path_file : str
        Path of a cd-hit ".out" file; its ".clstr" companion is parsed.
    min_count : int
        Minimum number of rows a KL type must have in KLtype_count to get a
        model.  The default matches the `>= 10` rule used to build KLtype_pred.

    A KL type is skipped when its
    `{path_models}/{threshold}_RF_{KL_type}.full_data.joblib` file already exists.
    """
    dico_cluster , threshold = make_cluster_dico(f"{path_file}")
    df_binaries = make_DF_binaries(DF_info_lvl_0 , dico_cluster, threshold)
    for KL_type in KLtype_count :
        # The previous `< 5` filter kept only KL types with fewer positives
        # than the 5 stratified folds used for training, so some test folds
        # necessarily had no positive example.
        if KLtype_count[KL_type] < min_count :
            continue
        if os.path.isfile(f'{path_models}/{threshold}_RF_{KL_type}.full_data.joblib') :
            continue
        df_kl , all_labels = make_DF_kltype(DF_info_lvl_0 ,df_binaries, KL_type , dico_cluster, collapse = False)
        fit_rf_model_random_search(df_kl , all_labels, KL_type,threshold,path_models)





In [None]:
def fit_rf_model_random_search(df_kl, all_labels, KL_type, threshold, path_models, n_splits=5, n_iters=100):
    """Cross-validate a random forest for one KL type and save every fold.

    Hyperparameters are searched with BayesSearchCV on the first outer fold
    only and reused for the remaining folds.  The result,
    `{KL_type: {fold number: fold record}}`, is written to
    `{path_models}/{threshold}_RF_{KL_type}.full_data.joblib`.

    Each fold record holds "best_parameters", "model", "test_data",
    "test_&_model_predictions" and "iteration".

    Parameters
    ----------
    df_kl : pandas.DataFrame
        Feature table, one row per phage.
    all_labels : list
        0/1 labels aligned with the rows of `df_kl`.
    KL_type, threshold : used in the output file name.
    path_models : str
        Output folder.
    n_splits : int
        Number of outer stratified folds.
    n_iters : int
        Number of BayesSearchCV iterations.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=243)
    # Loop-invariant objects, built once.
    all_labels_series = pd.Series(all_labels)
    # 'auto' is no longer an accepted max_features value in recent
    # scikit-learn releases (for classifiers it was an alias of 'sqrt'),
    # so the search uses 'sqrt' and 'log2' instead.
    param_grid = {
        'bootstrap': Categorical([True, False]),
        'max_depth': Integer(10, 100),
        'max_features': Categorical(['sqrt', 'log2']),
        'min_samples_leaf': Integer(1, 4),
        'min_samples_split': Integer(2, 10),
        'n_estimators': Integer(200, 800)
    }
    # One dictionary for all folds: it used to be re-created inside the loop,
    # so only the last fold reached the saved file.
    folds = {}
    best_params = None
    for n, (train_index, test_index) in enumerate(skf.split(df_kl, all_labels)):
        X_train, X_test = df_kl.iloc[train_index], df_kl.iloc[test_index]
        y_train = all_labels_series.iloc[train_index]
        y_test = all_labels_series.iloc[test_index]
        if best_params is None:
            # First fold: run the hyperparameter search.
            rf = RandomForestClassifier(random_state=42)
            print(rf , param_grid, n_iters)
            bayes_search = BayesSearchCV(rf, param_grid, n_iter=n_iters, cv=4, n_jobs=-1)
            bayes_search.fit(X_train, y_train)
            best_params = bayes_search.best_params_
            best_model = bayes_search.best_estimator_
        else:
            # Later folds: reuse the hyperparameters found on the first fold.
            best_model = RandomForestClassifier(random_state=42, **best_params)
            best_model.fit(X_train, y_train)
        # Make predictions on testing data
        predictions = best_model.predict(X_test)
        # NOTE(review): "test_data" holds (labels, predictions) and
        # "test_&_model_predictions" holds (features, labels); the names look
        # swapped but are kept because saved files are read back by these keys.
        fold_record = {"best_parameters" : best_params,
             "model" : best_model,
             "test_data" : (y_test, predictions),
             "test_&_model_predictions" : (X_test, y_test),
             "iteration" : n}
        folds[n] = fold_record
        print(KL_type, {n : fold_record})
    data_kltype = {KL_type : folds}
    # Save the models and test data of every fold
    joblib.dump(data_kltype, f'{path_models}/{threshold}_RF_{KL_type}.full_data.joblib')


In [None]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from itertools import product
import random
from collections import Counter
import warnings
import logging
import subprocess
from multiprocessing.pool import ThreadPool
import joblib

# Scikit-learn modules :
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import classification_report , roc_auc_score, matthews_corrcoef
from sklearn.metrics import roc_auc_score

# Scipy modules : 
from scipy.stats import fisher_exact
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from statistics import mean

# Root folder of the sequence-based model and the sub-folders used for models / test data.
path_work = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/Seqbased_model"
path_models = f"{path_work}/RF_1302_models"
path_testing = f"{path_work}/RF_1302_data"


# joblib reads a dumped object back with `joblib.load`; it has no `open` function.
# NOTE(review): `threshold` and `KL_type` are not assigned anywhere in this
# cell - define them before running it.
joblib.load(f'{path_models}/{threshold}_RF_{KL_type}.full_data.joblib')

In [None]:
# Shell command (to be run in a terminal, not Python): copy the remote
# RF_1302_models folder into the local ficheros_28032023 directory over ssh.
rsync -avzhe ssh \
conchae@garnatxa.srv.cpd:/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/Seqbased_model/RF_1302_models \
/media/concha-eloko/Linux/PPT_clean/ficheros_28032023




In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from itertools import product
import random
from collections import Counter
import warnings
import logging
import subprocess
from multiprocessing.pool import ThreadPool
import joblib

# Scikit-learn modules :
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import classification_report , roc_auc_score, matthews_corrcoef
from sklearn.metrics import roc_auc_score

# Scipy modules : 
from scipy.stats import fisher_exact
#from skopt import BayesSearchCV
#from skopt.space import Real, Categorical, Integer
from statistics import mean

# Local folder containing the joblib result files inspected below.
path_jobs = "/media/concha-eloko/Linux/PPT_clean/ficheros_28032023/RF_1302_models"

In [12]:
# Load one saved result file as an example.
eg_job = joblib.load(f"{path_jobs}/0.8_RF_KL1.TEST.full_data.joblib")

In [24]:
# Display the "test_data" entry stored under key 2 of the loaded object.
eg_job[2]["test_data"]

(18      1
 21      1
 24      1
 26      1
 28      1
        ..
 1121    0
 1123    0
 1129    0
 1133    0
 1134    0
 Length: 228, dtype: int64,
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0]))