In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from itertools import product
import random
from collections import Counter, defaultdict
import warnings
import logging
import subprocess
from multiprocessing.pool import ThreadPool
import joblib

# SCikitlearn modules :
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import classification_report , roc_auc_score, matthews_corrcoef
from sklearn.metrics import roc_auc_score

# Scipy modules : 
from scipy.stats import fisher_exact
from statistics import mean


PATH_WORK = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/Seqbased_model"
path_metrics = f"{PATH_WORK}/metric_files"

### Make the metrics files : 

In [2]:
def load_data():
    df_info = pd.read_csv(f"{PATH_WORK}/TropiGATv2.final_df_v2.tsv", sep="\t", header=0)
    df_prophages = df_info.drop_duplicates(subset=["Phage"], keep="first")
    dico_prophage_info = {row["Phage"]: {"prophage_strain": row["prophage_id"], "ancestor": row["Infected_ancestor"]} for _, row in df_prophages.iterrows()}
    return df_info, dico_prophage_info

def get_filtered_prophages(prophage, df_info, dico_prophage_info):
    to_exclude = set()
    to_keep = {prophage}
    df_prophage_group = df_info[(df_info["prophage_id"] == dico_prophage_info[prophage]["prophage_strain"]) & 
                                (df_info["Infected_ancestor"] == dico_prophage_info[prophage]["ancestor"])]
    
    if len(df_prophage_group) > 1:
        depo_set = set(df_prophage_group[df_prophage_group["Phage"] == prophage]["domain_seq"].values)
        combinations = [depo_set]
        
        for prophage_tmp in df_prophage_group["Phage"].unique():
            if prophage_tmp != prophage:
                tmp_depo_set = set(df_prophage_group[df_prophage_group["Phage"] == prophage_tmp]["domain_seq"].values)
                if tmp_depo_set in combinations:
                    to_exclude.add(prophage_tmp)
                else:
                    to_keep.add(prophage_tmp)
                    combinations.append(tmp_depo_set)
    
    return df_prophage_group, to_exclude, to_keep

def filter_prophages(df_info, dico_prophage_info):
    good_prophages = set()
    excluded_prophages = set()
    for prophage in tqdm(dico_prophage_info.keys()):
        if prophage not in excluded_prophages and prophage not in good_prophages:
            _, excluded_members, kept_members = get_filtered_prophages(prophage, df_info, dico_prophage_info)
            good_prophages.update(kept_members)
            excluded_prophages.update(excluded_members)
    df_info_filtered = df_info[df_info["Phage"].isin(good_prophages)]
    return df_info_filtered[~df_info_filtered["KL_type_LCA"].str.contains("\\|")]

def ultrafilter_prophages(df_info):
    duplicate_prophage = []
    for kltype in df_info["KL_type_LCA"].unique():
        df_kl = df_info[df_info["KL_type_LCA"] == kltype][["Phage", "domain_seq"]]
        set_sets_depo = []
        for _, group in df_kl.groupby("Phage"):
            set_depo = frozenset(group["domain_seq"].values)
            if set_depo in set_sets_depo:
                duplicate_prophage.extend(group["Phage"])
            else:
                set_sets_depo.append(set_depo)
    
    return df_info[~df_info["Phage"].isin(duplicate_prophage)]


def get_mean_MCC(path_data_object):
    try:
        assert os.path.isfile(path_data_object)
        data_object = joblib.load(path_data_object)
        mcc_values = []
        for i in data_object:
            y_test = data_object[i]["test_data"][0].values
            predictions = data_object[i]["test_data"][1]
            mcc = matthews_corrcoef(y_test, predictions)
            mcc_values.append(mcc)
        del data_object 
        mean_mcc = mean(mcc_values)
        return mean_mcc
    except AssertionError:
        raise FileNotFoundError(f"File not found: {path_data_object}")
        


> RF UF

In [None]:
path_jobs = f"{PATH_WORK}/RF_21122024_models_ultraF"

cluster_values = set([file.split("_")[0] for file in os.listdir(path_jobs)])
KLtypes = set([file.split("_RF_")[1].split(".full_data")[0] for file in os.listdir(path_jobs)])

KLtypes_paths = {kltype : [f"{path_jobs}/{file}" for file in os.listdir(path_jobs) if file.split("_RF_")[1].split(".full_data")[0]==kltype]
                for kltype in KLtypes}

DF_info, dico_prophage_info = load_data()
# UF or not ?
DF_info_lvl_0 = ultrafilter_prophages(filter_prophages(DF_info, dico_prophage_info))
#DF_info_lvl_0 = filter_prophages(DF_info, dico_prophage_info)

dico_prophage_count = Counter(DF_info_lvl_0["KL_type_LCA"])

with open(f"{path_metrics}/SEQ_based.RF__UF.prophages_metrics.tsv", "w") as outfile :
    for KLtype in tqdm(KLtypes_paths) : 
        for path in KLtypes_paths[KLtype] :
            cl_value = path.split("/")[-1].split("_RF")[0]
            mean_mcc = get_mean_MCC(path)
            count_kltype = dico_prophage_count[KLtype]
            outfile.write(f"{KLtype}\t{count_kltype}\t{cl_value}\t{mean_mcc}\n")


> RF :

In [None]:
path_jobs = f"{PATH_WORK}/RF_1302_models"

cluster_values = set([file.split("_")[0] for file in os.listdir(path_jobs)])
KLtypes = set([file.split("_RF_")[1].split(".full_data")[0] for file in os.listdir(path_jobs)])

KLtypes_paths = {kltype : [f"{path_jobs}/{file}" for file in os.listdir(path_jobs) if file.split("_RF_")[1].split(".full_data")[0]==kltype]
                for kltype in KLtypes}

DF_info, dico_prophage_info = load_data()
# UF or not ?
#DF_info_lvl_0 = ultrafilter_prophages(filter_prophages(DF_info, dico_prophage_info))
DF_info_lvl_0 = filter_prophages(DF_info, dico_prophage_info)

dico_prophage_count = Counter(DF_info_lvl_0["KL_type_LCA"])

with open(f"{path_metrics}/SEQ_based.RF.prophages_metrics.tsv", "w") as outfile :
    for KLtype in tqdm(KLtypes_paths) : 
        for path in KLtypes_paths[KLtype] :
            cl_value = path.split("/")[-1].split("_RF")[0]
            mean_mcc = get_mean_MCC(path)
            count_kltype = dico_prophage_count[KLtype]
            outfile.write(f"{KLtype}\t{count_kltype}\t{cl_value}\t{mean_mcc}\n")


> Lreg reg:

In [None]:
path_jobs = f"{PATH_WORK}/LR_21122024_models"

cluster_values = set([file.split("_")[0] for file in os.listdir(path_jobs)])
KLtypes = set([file.split("_LogReg_")[1].split(".full_data")[0] for file in os.listdir(path_jobs)])

KLtypes_paths = {kltype : [f"{path_jobs}/{file}" for file in os.listdir(path_jobs) if file.split("_LogReg_")[1].split(".full_data")[0]==kltype]
                for kltype in KLtypes}

DF_info, dico_prophage_info = load_data()
# UF or not ?
#DF_info_lvl_0 = ultrafilter_prophages(filter_prophages(DF_info, dico_prophage_info))
DF_info_lvl_0 = filter_prophages(DF_info, dico_prophage_info)

dico_prophage_count = Counter(DF_info_lvl_0["KL_type_LCA"])

with open(f"{path_metrics}/SEQ_based.LogReg.prophages_metrics.tsv", "w") as outfile :
    for KLtype in tqdm(KLtypes_paths) : 
        for path in KLtypes_paths[KLtype] :
            cl_value = path.split("/")[-1].split("_RF")[0]
            mean_mcc = get_mean_MCC(path)
            count_kltype = dico_prophage_count[KLtype]
            outfile.write(f"{KLtype}\t{count_kltype}\t{cl_value}\t{mean_mcc}\n")

> LogReg UF 

In [None]:
path_jobs = f"{PATH_WORK}/LR_21122024_models_ultraF"

cluster_values = set([file.split("_")[0] for file in os.listdir(path_jobs)])
KLtypes = set([file.split("_LogReg_")[1].split(".full_data")[0] for file in os.listdir(path_jobs)])

KLtypes_paths = {kltype : [f"{path_jobs}/{file}" for file in os.listdir(path_jobs) if file.split("_LogReg_")[1].split(".full_data")[0]==kltype]
                for kltype in KLtypes}

DF_info, dico_prophage_info = load_data()
# UF or not ?
DF_info_lvl_0 = ultrafilter_prophages(filter_prophages(DF_info, dico_prophage_info))
#DF_info_lvl_0 = filter_prophages(DF_info, dico_prophage_info)

dico_prophage_count = Counter(DF_info_lvl_0["KL_type_LCA"])

with open(f"{path_metrics}/SEQ_based.LogReg__UF.prophages_metrics.tsv", "w") as outfile :
    for KLtype in tqdm(KLtypes_paths) : 
        for path in KLtypes_paths[KLtype] :
            cl_value = path.split("/")[-1].split("_RF")[0]
            mean_mcc = get_mean_MCC(path)
            count_kltype = dico_prophage_count[KLtype]
            outfile.write(f"{KLtype}\t{count_kltype}\t{cl_value}\t{mean_mcc}\n")

***
### Compute the weighted MCC:

In [None]:
rsync -avzhe ssh \
conchae@garnatxa.srv.cpd:/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/Seqbased_model/metric_files \
/media/concha-eloko/Linux/PPT_clean/ficheros_28032023/review_work/SeqBased_model


In [18]:
path_metrics = "/media/concha-eloko/Linux/PPT_clean/ficheros_28032023/review_work/SeqBased_model/metric_files"

threshold = 5
names_metric_col = ["KL_type", "Count", "file_name","mean_mcc"]
final_weighted_dico = {}

for file in os.listdir(path_metrics):
    metric_df = pd.read_csv(f"{path_metrics}/{file}", sep = "\t", names = names_metric_col)
    metric_eval_df = metric_df[metric_df["Count"] > threshold]
    metric_eval_df["c_value"] = metric_eval_df["file_name"].astype(str).apply(lambda x: x.split("_")[0])
    weighted_mcc_dico = {}
    for cluster in metric_eval_df["c_value"].unique().tolist() :
        mcc_sum = 0
        cl_df = metric_eval_df[metric_eval_df["c_value"] == cluster]
        for _,row in cl_df.iterrows() : 
            mcc_sum = mcc_sum + row["mean_mcc"] * row["Count"]
        weighted_mcc = mcc_sum / (sum(cl_df["Count"]))
        weighted_mcc_dico[cluster] = weighted_mcc
    final_weighted_dico[file] = weighted_mcc_dico



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metric_eval_df["c_value"] = metric_eval_df["file_name"].astype(str).apply(lambda x: x.split("_")[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metric_eval_df["c_value"] = metric_eval_df["file_name"].astype(str).apply(lambda x: x.split("_")[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  met

In [19]:
final_weighted_dico

{'SEQ_based.LogReg__UF.prophages_metrics.tsv': {'0.85': 0.3151602636210511,
  '0.9': 0.32933361394227706,
  '0.8': 0.34091186979267774,
  '0.7': 0.36446002765094077,
  '0.95': 0.27893023121060845,
  '0.75': 0.3463067872443405,
  '0.65': 0.36193177410601585,
  '0.975': 0.25498957852463594},
 'SEQ_based.LogReg.prophages_metrics.tsv': {'0.85': 0.3722464784487348,
  '0.9': 0.33694270698415413,
  '0.8': 0.3659430012749742,
  '0.7': 0.36964008602380943,
  '0.95': 0.3319429903096709,
  '0.75': 0.374760037337778,
  '0.65': 0.39023597272237615,
  '0.975': 0.27913217655865474},
 'SEQ_based.RF.prophages_metrics.tsv': {'0.95': 0.7505887697037068,
  '0.9': 0.7665851018307361,
  '0.85': 0.7677152879518742,
  '0.75': 0.7576805724196833,
  '0.975': 0.7326524783190056,
  '0.8': 0.7651531836408012,
  '0.7': 0.7564513341443188,
  '0.65': 0.7666988914668954},
 'SEQ_based.RF__UF.prophages_metrics.tsv': {'0.95': 0.3093359009354269,
  '0.9': 0.3353644615409259,
  '0.85': 0.35113786424314536,
  '0.75': 0.3435

In [None]:
SEQ_based.LogReg__UF.prophages_metrics.tsv : '0.7': 0.36446002765094077
SEQ_based.LogReg.prophages_metrics.tsv : '0.65': 0.39023597272237615 

SEQ_based.RF.prophages_metrics.tsv : '0.85': 0.7677152879518742
SEQ_based.RF__UF.prophages_metrics.tsv : '0.65': 0.3669266262836219

In [None]:
import os 

path_work = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/Seqbased_model"

dico_models = {"LR_21122024_models" : {"c_value" : "0.65", "destination" : "LogReg"}, 
               "LR_21122024_models_ultraF" : {"c_value" : "0.7", "destination" : "LogReg_UF"} , 
               #"RF_21122024_models_ultraF" : {"c_value" : "0.65", "destination" : "TropiSEQ_UF"},
               "RF_21122024_models_ultraF" : {"c_value" : "0.85", "destination" : "TropiSEQ_UF_85"}}

for file in os.listdir(path_work) : 
    if file in dico_models : 
        if file == "RF_21122024_models_ultraF":
            for model in os.listdir(f"{path_work}/{file}") :
                if model.startswith(dico_models[file]["c_value"]) :
                    os.system(f"cp {path_work}/{file}/{model} {path_work}/reviewed_models/{dico_models[file]['destination']}")

In [None]:
rsync -avzhe ssh \
conchae@garnatxa.srv.cpd:/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/Seqbased_model/reviewed_models \
/media/concha-eloko/Linux/PPT_clean/ficheros_28032023/review_work/SeqBased_model


***
### Write the Metrics file: 

In [14]:
import os 
import pandas as pd

path_metrics = "/media/concha-eloko/Linux/PPT_clean/ficheros_28032023/review_work/SeqBased_model/metric_files"

df_rf_uf = pd.read_csv(f"{path_metrics}/SEQ_based.RF__UF.prophages_metrics.tsv", sep = "\t", names = ["KL_types", "number_prophages", "c_value", "MCC"])
df_rf_uf = df_rf_uf[df_rf_uf["c_value"] == 0.65]

In [16]:
df_rf_uf

Unnamed: 0,KL_types,number_prophages,c_value,MCC
7,KL53,48,0.65,0.525543
10,KL24,179,0.65,0.000000
17,KL42,7,0.65,0.000000
25,KL34,16,0.65,0.000000
35,KL107,223,0.65,0.353817
...,...,...,...,...
824,KL41,13,0.65,0.000000
838,KL11,7,0.65,0.000000
843,KL22,50,0.65,0.517659
851,KL23,170,0.65,0.535506
