In [None]:
rsync -avzhe ssh \
conchae@garnatxa.srv.cpd:/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/Seqbased_model/RF_1302_models \
/media/concha-eloko/Linux/PPT_clean/ficheros_28032023


***
### Load Data :

In [25]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from itertools import product
import random
from collections import Counter, defaultdict
import warnings
import logging
import subprocess
from multiprocessing.pool import ThreadPool
import joblib

# SCikitlearn modules :
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import classification_report , roc_auc_score, matthews_corrcoef
from sklearn.metrics import roc_auc_score

# Scipy modules : 
from scipy.stats import fisher_exact
#from skopt import BayesSearchCV
#from skopt.space import Real, Categorical, Integer
from statistics import mean

path_work = "/media/concha-eloko/Linux/PPT_clean"
path_jobs = "/media/concha-eloko/Linux/PPT_clean/ficheros_28032023/RF_1302_models"

> Set the count prophage dico : 

In [26]:
DF_info = pd.read_csv(f"{path_work}/TropiGATv2.final_df_v2.tsv", sep = "\t" ,  header = 0)

DF_info = DF_info.drop_duplicates(subset = ["Protein_name"])

df_prophages = DF_info.drop_duplicates(subset = ["Phage"], keep = "first")
dico_prophage_info = {row["Phage"] : {"prophage_strain" : row["prophage_id"] , "ancestor" : row["Infected_ancestor"]} for _,row in df_prophages.iterrows()}

def get_filtered_prophages(prophage) :
    combinations = []
    to_exclude = set()
    to_keep = set()
    to_keep.add(prophage)
    df_prophage_group = DF_info[(DF_info["prophage_id"] == dico_prophage_info[prophage]["prophage_strain"]) & (DF_info["Infected_ancestor"] == dico_prophage_info[prophage]["ancestor"])]
    if len(df_prophage_group) == 1 :
        pass
    else :
        depo_set = set(df_prophage_group[df_prophage_group["Phage"] == prophage]["domain_seq"].values)
        for prophage_tmp in df_prophage_group["Phage"].unique().tolist() :
            if prophage_tmp != prophage :
                tmp_depo_set = set(df_prophage_group[df_prophage_group["Phage"] == prophage_tmp]["domain_seq"].values)
                if depo_set == tmp_depo_set :
                    to_exclude.add(prophage_tmp)
                else :
                    if tmp_depo_set not in combinations :
                        to_keep.add(prophage_tmp)
                        combinations.append(tmp_depo_set)
                    else :
                        to_exclude.add(prophage_tmp)
    return df_prophage_group , to_exclude , to_keep

good_prophages = set()
excluded_prophages = set()

for prophage, info_prophage in tqdm(dico_prophage_info.items()) :
    if prophage not in excluded_prophages and prophage not in good_prophages:
        _, excluded_members , kept_members = get_filtered_prophages(prophage)
        good_prophages.update(kept_members)
        excluded_prophages.update(excluded_members)

DF_info_lvl_0_filtered = DF_info[DF_info["Phage"].isin(good_prophages)]
DF_info_lvl_0_final = DF_info_lvl_0_filtered[~DF_info_lvl_0_filtered["KL_type_LCA"].str.contains("\\|")]
DF_info_lvl_0 = DF_info_lvl_0_final.copy()

df_prophages = DF_info_lvl_0.drop_duplicates(subset = ["Phage"])
dico_prophage_count = dict(Counter(df_prophages["KL_type_LCA"]))


100%|████████████████████████████████████| 15981/15981 [00:25<00:00, 621.66it/s]


In [23]:
cluster_values = set([file.split("_")[0] for file in os.listdir(path_jobs)])
KLtypes = set([file.split("_RF_")[1].split(".full_data")[0] for file in os.listdir(path_jobs)])

KLtypes_paths = {kltype : [f"{path_jobs}/{file}" for file in os.listdir(path_jobs) if file.split("_RF_")[1].split(".full_data")[0]==kltype]
                for kltype in KLtypes}

KLtypes_paths

{'KL41': ['/media/concha-eloko/Linux/PPT_clean/ficheros_28032023/RF_1302_models/0.75_RF_KL41.full_data.joblib',
  '/media/concha-eloko/Linux/PPT_clean/ficheros_28032023/RF_1302_models/0.85_RF_KL41.full_data.joblib',
  '/media/concha-eloko/Linux/PPT_clean/ficheros_28032023/RF_1302_models/0.65_RF_KL41.full_data.joblib',
  '/media/concha-eloko/Linux/PPT_clean/ficheros_28032023/RF_1302_models/0.9_RF_KL41.full_data.joblib',
  '/media/concha-eloko/Linux/PPT_clean/ficheros_28032023/RF_1302_models/0.8_RF_KL41.full_data.joblib',
  '/media/concha-eloko/Linux/PPT_clean/ficheros_28032023/RF_1302_models/0.975_RF_KL41.full_data.joblib',
  '/media/concha-eloko/Linux/PPT_clean/ficheros_28032023/RF_1302_models/0.7_RF_KL41.full_data.joblib',
  '/media/concha-eloko/Linux/PPT_clean/ficheros_28032023/RF_1302_models/0.95_RF_KL41.full_data.joblib'],
 'KL60': ['/media/concha-eloko/Linux/PPT_clean/ficheros_28032023/RF_1302_models/0.85_RF_KL60.full_data.joblib',
  '/media/concha-eloko/Linux/PPT_clean/ficheros_2

***
### Get the best cl value :

> Functions :

In [20]:
def get_mean_MCC(path_data_object) :
    data_object = joblib.load(f"{path_data_object}")
    mcc_values = []
    for i in data_object :
        y_test = data_object[i]["test_data"][0].values
        predictions = data_object[i]["test_data"][1]
        mcc = matthews_corrcoef(y_test, predictions)
        mcc_values.append(mcc)
    del data_object
    mean_mcc = mean(mcc_values)
    return mean_mcc

    

> Get the best model : 

In [39]:
with open(f"/media/concha-eloko/Linux/PPT_clean/Seqbased_model/Seqbased_models.prophages_metrics.tsv", "w") as outfile :
    for KLtype in KLtypes_paths : 
        for path in KLtypes_paths[KLtype] :
            cl_value = path.split("/")[-1].split("_RF")[0]
            mean_mcc = get_mean_MCC(path)
            count_kltype = dico_prophage_count[KLtype]
            outfile.write(f"{KLtype}\t{count_kltype}\t{cl_value}\t{mean_mcc}\n")


In [80]:
threshold = 15

names_metric_col = ["KL_type", "Count", "c_value","mean_mcc"]
metric_df = pd.read_csv(f"/media/concha-eloko/Linux/PPT_clean/Seqbased_model/Seqbased_models.prophages_metrics.tsv", sep = "\t", names = names_metric_col)

metric_eval_df = metric_df[metric_df["Count"] > threshold]

weighted_mcc_dico = {}
for cluster in metric_eval_df["c_value"].unique().tolist() :
    mcc_sum = 0
    cl_df = metric_eval_df[metric_eval_df["c_value"] == cluster]
    for _,row in cl_df.iterrows() : 
        mcc_sum = mcc_sum + row["mean_mcc"] * row["Count"]
    weighted_mcc = mcc_sum / (sum(cl_df["Count"]))
    weighted_mcc_dico[cluster] = weighted_mcc

weighted_mcc_dico

{0.85: 0.7707948894513985,
 0.8: 0.7584184893758646,
 0.75: 0.7462727218238402,
 0.7: 0.7398668014047561,
 0.65: 0.7601878670678446,
 0.9: 0.7701360467416041,
 0.975: 0.7463814845203501,
 0.95: 0.7491082544898822}

***
### Get the full metrics : 

> Relevant objects path :

In [79]:
cl_value = "0.85"

KLtypes = set([file.split("_RF_")[1].split(".full_data")[0] for file in os.listdir(path_jobs)])

KLtypes_paths = {kltype : [f"{path_jobs}/{file}" for file in os.listdir(path_jobs) if file.split("_")[0] == cl_value if file.split("_RF_")[1].split(".full_data")[0]==kltype]
                for kltype in KLtypes if dico_prophage_count[kltype] > threshold}



In [64]:
KLtypes_paths

{'KL41': ['/media/concha-eloko/Linux/PPT_clean/ficheros_28032023/RF_1302_models/0.85_RF_KL41.full_data.joblib'],
 'KL60': ['/media/concha-eloko/Linux/PPT_clean/ficheros_28032023/RF_1302_models/0.85_RF_KL60.full_data.joblib'],
 'KL28': ['/media/concha-eloko/Linux/PPT_clean/ficheros_28032023/RF_1302_models/0.85_RF_KL28.full_data.joblib'],
 'KL6': ['/media/concha-eloko/Linux/PPT_clean/ficheros_28032023/RF_1302_models/0.85_RF_KL6.full_data.joblib'],
 'KL25': ['/media/concha-eloko/Linux/PPT_clean/ficheros_28032023/RF_1302_models/0.85_RF_KL25.full_data.joblib'],
 'KL2': ['/media/concha-eloko/Linux/PPT_clean/ficheros_28032023/RF_1302_models/0.85_RF_KL2.full_data.joblib'],
 'KL19': ['/media/concha-eloko/Linux/PPT_clean/ficheros_28032023/RF_1302_models/0.85_RF_KL19.full_data.joblib'],
 'KL1': ['/media/concha-eloko/Linux/PPT_clean/ficheros_28032023/RF_1302_models/0.85_RF_KL1.full_data.joblib'],
 'KL107': ['/media/concha-eloko/Linux/PPT_clean/ficheros_28032023/RF_1302_models/0.85_RF_KL107.full_da

In [60]:
def get_mean_MCC(path_data_object) :
    data_object = joblib.load(f"{path_data_object}")
    mcc_values = []
    for i in data_object :
        y_test = data_object[i]["test_data"][0].values
        predictions = data_object[i]["test_data"][1]
        mcc = matthews_corrcoef(y_test, predictions)
        mcc_values.append(mcc)
    del data_object
    mean_mcc = mean(mcc_values)
    return mean_mcc

def get_mean_metrics(path_data_object) :
    data_object = joblib.load(f"{path_data_object}")
    mcc_values = []
    auc_values = []
    recall_values = []
    f1_values = []
    acc_values = []
    prec_values = []
    report_list = []
    for i in data_object :
        y_test = data_object[i]["test_data"][0].values
        predictions = data_object[i]["test_data"][1]
        mcc = matthews_corrcoef(y_test, predictions)
        auc = roc_auc_score(y_test, predictions)
        auc_values.append(auc)
        mcc_values.append(mcc)
        # Report : 
        report = classification_report(y_test, predictions, output_dict=True)
        recall_values.append(report["1"]["recall"])
        f1_values.append(report["1"]["f1-score"])
        prec_values.append(report["1"]["precision"])
        acc_values.append(report["accuracy"])
        report_list.append(report)
    del data_object
    mean_mcc = mean(mcc_values)
    mean_auc = mean(auc_values)
    mean_recall = mean(recall_values)
    mean_f1 = mean(f1_values)
    mean_acc = mean(acc_values)
    mean_prec = mean(prec_values)
    return mean_mcc , mean_recall , mean_f1 , mean_acc , mean_prec , mean_auc



In [61]:
mean_mcc , mean_recall , mean_f1 , mean_acc , mean_prec , mean_auc = get_mean_metrics(KLtypes_paths["KL47"][0])

In [62]:
with open(f"/media/concha-eloko/Linux/PPT_clean/Seqbased_model/Seqbased_models.prophages_FULL_metrics.tsv", "w") as outfile :
    outfile.write(f"KL_type\tn_prophages\tF1\trecall\tprecision\taccuracy\tAUC\tMCC\n")
    for kltype in KLtypes_paths :
        count_kltype = dico_prophage_count[kltype]
        mean_mcc , mean_recall , mean_f1 , mean_acc , mean_prec , mean_auc = get_mean_metrics(KLtypes_paths[kltype][0])
        outfile.write(f"{kltype}\t{count_kltype}\t{mean_f1}\t{mean_recall}\t{mean_prec}\t{mean_acc}\t{mean_auc}\t{mean_mcc}\n")
        print(f"{kltype}\t{count_kltype}\t{mean_f1}\t{mean_recall}\t{mean_prec}\t{mean_acc}\t{mean_auc}\t{mean_mcc}")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


KL41	10	0.0	0.0	0.0	0.8333333333333334	0.5	0.0
KL60	68	0.7656521739130434	0.6472527472527473	0.9492063492063492	0.9362842517314062	0.8206851971557855	0.7511373779570631
KL28	112	0.8571776630017277	0.7692028985507247	0.9787114845938375	0.9589851163719908	0.8829065340211251	0.8446934684060952


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


KL6	12	0.0	0.0	0.0	0.8342857142857143	0.5	0.0
KL25	269	0.8380233762041729	0.8104122990915443	0.8712717446785647	0.9479491567794166	0.8929384915532071	0.8090144660938176
KL2	351	0.834817120753062	0.7534406438631791	0.9374125780431639	0.9504248628395557	0.8716211717899465	0.8131165601721763
KL19	73	0.844021164021164	0.7516666666666667	0.9666666666666667	0.9544880785413745	0.8732359307359308	0.8278668631815479
KL1	190	0.5279575510104982	0.37894736842105264	0.9261403508771929	0.8903508771929824	0.6857894736842105	0.5444479817538865
KL107	1121	0.8266579480598523	0.8591785714285715	0.7972308587597026	0.9399890351240057	0.9076641520244462	0.79153889872206
KL24	288	0.8177182385659331	0.7750151240169388	0.8672900649621551	0.9423264646599256	0.8753968353648708	0.7859092373131645
KL123	29	0.6266666666666667	0.5	0.9333333333333333	0.9055555555555556	0.7433333333333333	0.6298843133197235
KL106	487	0.7956232917703557	0.8200294550810014	0.772739221599344	0.9297887680169311	0.8858838481948975	0.753810

IndexError: list index out of range

***
### Dump the best model for each KL types :

In [71]:
path_models = "/media/concha-eloko/Linux/PPT_clean/Seqbased_model/1702_models"
cl_value = "0.85"

KLtypes = set([file.split("_RF_")[1].split(".full_data")[0] for file in os.listdir(path_jobs)])

KLtypes_paths = {kltype : [f"{path_jobs}/{file}" for file in os.listdir(path_jobs) if file.split("_")[0] == cl_value if file.split("_RF_")[1].split(".full_data")[0]==kltype]
                for kltype in KLtypes}


In [78]:
def save_best_model(path_data_object) :
    data_object = joblib.load(f"{path_data_object}")
    mcc_values = []
    kltype = path_data_object.split("/")[-1].split("RF_")[1].split(".full")[0]
    for i in range(0,5) :
        y_test = data_object[i]["test_data"][0].values
        predictions = data_object[i]["test_data"][1]
        mcc = matthews_corrcoef(y_test, predictions)
        mcc_values.append(mcc)
    #print(kltype)
    #print(mcc_values)
    #print(np.argmax(mcc_values))
    #print(data_object[np.argmax(mcc_values)])
    #print("\n")
    best_model = data_object[np.argmax(mcc_values)]["model"]
    joblib.dump(best_model, f'{path_models}/{cl_value}_RF_{kltype}.best_RF_model.joblib')
    del data_object
    print(f"Model saved for {kltype}")

for kltype in KLtypes_paths :
    save_best_model(KLtypes_paths[kltype][0])
    


Model saved for KL41
Model saved for KL60
Model saved for KL28
Model saved for KL6
Model saved for KL25
Model saved for KL2
Model saved for KL19
Model saved for KL1
Model saved for KL107
Model saved for KL24
Model saved for KL123
Model saved for KL106
Model saved for KL15
Model saved for KL30
Model saved for KL18
Model saved for KL81
Model saved for KL23
Model saved for KL16
Model saved for KL22
Model saved for KL38
Model saved for KL145
Model saved for KL47
Model saved for KL74
Model saved for KL125
Model saved for KL64


IndexError: list index out of range