In [None]:
rsync -avzhe ssh \
conchae@garnatxa.srv.cpd:/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/train_nn/TropiGATv2.final_df.tsv \
/media/concha-eloko/Linux/PPT_clean/  


***
# Load Data :

In [78]:
# Ground modules
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from itertools import product
import random
from collections import Counter
import warnings
import logging
import subprocess
from multiprocessing.pool import ThreadPool

# SCikitlearn modules :
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report , roc_auc_score

# Scipy modules : 
from scipy.stats import fisher_exact

In [19]:
# *****************************************************************************
# Load the Dataframes :
#path_work = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/Seqbased_model"
path_work = "/media/concha-eloko/Linux/PPT_clean"

DF_info = pd.read_csv(f"{path_work}/TropiGATv2.final_df.tsv", sep = "\t" ,  header = 0)
DF_info_lvl_0 = DF_info[~DF_info["KL_type_LCA"].str.contains("\\|")]
DF_info_lvl_0 = DF_info_lvl_0.drop_duplicates(subset = ["Infected_ancestor","index","prophage_id"] , keep = "first").reset_index(drop=True)

# useful dictionary :
dico_prophage_kltype_associated = {}
for negative_index,phage in tqdm(enumerate(DF_info_lvl_0["Phage"].unique().tolist())) :
    kltypes = set()
    dpos = DF_info_lvl_0[DF_info_lvl_0["Phage"] == phage]["index"]
    for dpo in dpos : 
        tmp_kltypes = DF_info_lvl_0[DF_info_lvl_0["index"] == dpo]["KL_type_LCA"].values
        kltypes.update(tmp_kltypes)
    dico_prophage_kltype_associated[phage] = kltypes


> Make fasta file :

In [8]:
depo_domains_seq = {index: domain_seq for index, domain_seq in zip(DF_info_lvl_0["index"], DF_info_lvl_0['domain_seq'])}

with open(f"{path_work}/Dpo_domains.1710.multi.fasta" , "w") as outfile : 
    for index,seq in depo_domains_seq.items() : 
        outfile.write(f">{index}\n{seq}\n")

In [20]:
path_multi_fasta = f"{path_work}/Dpo_domains.1710.multi.fasta"
path_tmp_cdhit = f"{path_work}/cdhit_clusters_1710"

def make_cdhit_cluster(threshold) :
    cdhit_command = f"cd-hit -i {path_multi_fasta} -o {path_tmp_cdhit}/{threshold}.out -c {threshold} -G 0 -aL 0.8"
    cdhit_process = subprocess.Popen(cdhit_command, shell =True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 
    scan_out, scan_err = cdhit_process.communicate()
    print(scan_out, scan_err)
    
list(map(make_cdhit_cluster , [0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95, 0.975]))

b'/bin/sh: 1: cd-hit: not found\n' None


[None]

> Make DF for each threshold

In [None]:
def make_cluster_dico(cdhit_out) :
    import json
    dico_cluster = {}
    threshold = cdhit_out.split("/")[-1].split(".out")[0]
    cluster_out = open(cdhit_out).read().split(">Cluster")
    for index,cluster in enumerate(cluster_out[1:]) :
        tmp_dpo = []
        id_cluster = f"Dpo_cdhit_{index}"
        for _,line in enumerate(cluster.split("\n")[1:-1]) :
            dpo = line.split(">")[1].split(".")[0]
            tmp_dpo.append(dpo)
        dico_cluster[id_cluster] = tmp_dpo
    with open(f"{path_work}/dico_cluster.cdhit__{threshold}.json", "w") as outfile:
        json.dump(dico_cluster, outfile)
    return dico_cluster , threshold
            

In [33]:
def make_DF_binaries(df_info , dico_cluster, threshold) :
    all_dpo_binaries = []
    for phage in df_info.Phage.unique() :
        dpo_binary = []
        df_phage = df_info[df_info["Phage"] == phage]["index"].values
        for cluster,dpos in dico_cluster.items() :
            shared_item = bool(set(dpos) & set(df_phage))
            if shared_item == True :
                dpo_binary.append(1)
            else :
                dpo_binary.append(0)
        all_dpo_binaries.append(dpo_binary)
    df_dpo_prophages = pd.DataFrame(all_dpo_binaries, index=df_info.Phage.unique(), columns=dico_cluster.keys())
    df_dpo_prophages.to_csv(f"{path_work}/DF_binaries_{threshold}.csv", sep = ",", index = True, header = True)

    return df_dpo_prophages

#import json
#cdhit_json = json.load(open(f"/media/concha-eloko/Linux/PPT_clean/Rafa_task/sequence_similarity/dico_cluster.cdhit__0.75.json"))
#df = make_DF_binaries(DF_info_lvl_0 ,cdhit_json , 0.65)

> Make DF for modelization :

In [72]:
def make_DF_kltype(df_info ,KL_type , dico_cluster,ratio = 5, collapse = True) : 
    # positive data :
    positive_phages = df_info[df_info["KL_type_LCA"] == KL_type]["Phage"].unique()
    df_positives = df[df.index.isin(positive_phages)]
    binaries_pos = df_positives.values
    labels_pos = [1] * len(binaries_pos)
    phages_pos = df_positives.index
    # negative data :
    n_samples = len(phages_pos)
    negative_phages = []
    for negative_index,phage in enumerate(df_info["Phage"].unique().tolist()) :
        if KL_type not in dico_prophage_kltype_associated[phage] :
            negative_phages.append(phage)
    negative_phages_selected = random.sample(negative_phages, int(n_samples*ratio))
    df_negatives = df[df.index.isin(negative_phages_selected)]
    binaries_neg = df_negatives.values
    labels_neg = [0] * len(binaries_neg)

    all_binaries = np.concatenate((binaries_pos, binaries_neg)) 
    all_labels = labels_pos + labels_neg
    all_indices = list(phages_pos) + list(negative_phages_selected)

    df_kl = pd.DataFrame(all_binaries, index=all_indices, columns=dico_cluster.keys())
    if collapse == True :
        dpo_presence = [dpo for dpo in df_kl.columns if sum(df_kl[dpo]) >0]
        df_kl = df_kl[dpo_presence]
        
    return (df_kl , all_labels)



In [76]:
test_df , test_labels = make_DF_kltype(DF_info_lvl_0 , "KL64", 2, cdhit_json)

> add the negative sequences :

In [None]:
def make_df_type(path_df) :
    df = pd.read_csv(f"{path_df}" , sep = "\t" , header = 0 , index_col = 0)
    method = index_dico[path_df.split("/")[-1].split("_")[1].split(".")[1]]
    threshold = path_df.split("/")[-1].split("_")[-1].split(".csv")[0]
    dico_KL_prophage_id = json.load(open(f"{path_task}/dico_prophage_ID.KLtypes.json"))
    labels = []
    for index in df.index:
        label = dico_KL_prophage_id.get(index, np.nan)
        if len(label) == 1 :
            labels.append(label[0])
        else :
            labels.append("Multiple")
    df["label"] = labels
    df = df[df['label'] != 'Multiple']
    return df , method , threshold

    
def fit_rf_model_random_search(df):
    le = LabelEncoder()
    df["label"] = le.fit_transform(df["label"])
    n_iter=2
    X_train, X_test, y_train, y_test = train_test_split(df, df["label"], test_size=0.2, random_state=42)
    param_grid = {
        'bootstrap': [True, False],
        'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
        'max_features': ['auto', 'sqrt'],
        'min_samples_leaf': [1, 2, 4],
        'min_samples_split': [2, 5, 10],
        'n_estimators': [200, 400, 600, 800]
    }
    rf = RandomForestClassifier()
    rf_random = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, 
                                   n_iter=n_iter, cv=4, verbose=2, random_state=42, n_jobs=-1)
    rf_random.fit(X_train, y_train)
    # Print the best parameters from the Randomized Search
    predictions = rf_random.predict(X_test)
    proba = rf_random.predict_proba(X_test)
    report = classification_report(y_test, predictions, output_dict=True)
    #auc = roc_auc_score(y_test, proba, multi_class='ovr')
    accuracy = report["accuracy"]
    weighted_precision = report["weighted avg"]["precision"]
    weighted_recall = report["weighted avg"]["recall"]
    weighted_F1 = report["weighted avg"]["f1-score"]
    return weighted_precision ,  weighted_recall , weighted_F1
    
        

def fit_RF_file(path_df) :
    df , method , threshold = make_df_type(path_df)
    weighted_precision ,  weighted_recall , weighted_F1  = fit_rf_model_random_search(df)
    with open(f"{path_task}/{method}_RF_report.tsv", "a+") as outfile :
        outfile.write(f"{method}\t{threshold}\t{weighted_precision}\t{weighted_recall}\t{weighted_F1}\n")
    #return weighted_precision ,  weighted_recall , weighted_F1  , method
    

if __name__ == '__main__':
    with ThreadPool(10) as p:
        p.map(fit_RF_file, path_dfs)
        

In [None]:
def seq_similarity_df(cdhit_out) :
    dico_cluster , threshold = make_cluster_dico(cdhit_out)
    prophage_dico = make_Dpo_prophage_dico(dico_cluster)
    df_dpo_prophages = build_dataframe(prophage_dico)
    df_dpo_prophages.sort_index(inplace = True, ascending=True)
    df_dpo_prophages.to_csv(f"{path_task}/sequence_similarity/prophage_Dpo.cdhit_{threshold}.csv", sep = "\t")
    return df_dpo_prophages
    
path_cdhit = [f"{path_tmp_cdhit}/{file}" for file in os.listdir(path_tmp_cdhit) if file[-5:] == "clstr"]   
    
if __name__ == '__main__':
    with ThreadPool(10) as p:
        p.map(seq_similarity_df, path_cdhit)

***
# Webserver version :

In [None]:
# Ground modules
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from itertools import product
import random
from collections import Counter
import warnings
import logging
import subprocess
import joblib
from multiprocessing.pool import ThreadPool

# SCikitlearn modules :
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report , roc_auc_score, matthews_corrcoef

# Scipy modules : 
from scipy.stats import fisher_exact
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer


path_work = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/Seqbased_model"
path_models = f"{path_work}/RF_models"

DF_info = pd.read_csv(f"{path_work}/TropiGATv2.final_df.tsv", sep = "\t" ,  header = 0)
DF_info_lvl_0 = DF_info[~DF_info["KL_type_LCA"].str.contains("\\|")]
DF_info_lvl_0 = DF_info_lvl_0.drop_duplicates(subset = ["Infected_ancestor","index","prophage_id"] , keep = "first").reset_index(drop=True)

# useful dictionary :
KLtype_count = Counter(DF_info_lvl_0["KL_type_LCA"])
KLtype_pred = [kltype for kltype in KLtype_count if KLtype_count[kltype] >= 10]

dico_prophage_kltype_associated = {}
for negative_index,phage in tqdm(enumerate(DF_info_lvl_0["Phage"].unique().tolist())) :
    kltypes = set()
    dpos = DF_info_lvl_0[DF_info_lvl_0["Phage"] == phage]["index"]
    for dpo in dpos : 
        tmp_kltypes = DF_info_lvl_0[DF_info_lvl_0["index"] == dpo]["KL_type_LCA"].values
        kltypes.update(tmp_kltypes)
    dico_prophage_kltype_associated[phage] = kltypes

depo_domains_seq = {index: domain_seq for index, domain_seq in zip(DF_info_lvl_0["index"], DF_info_lvl_0['domain_seq'])}
with open(f"{path_work}/Dpo_domains.1710.multi.fasta" , "w") as outfile : 
    for index,seq in depo_domains_seq.items() : 
        outfile.write(f">{index}\n{seq}\n")
        
# ******************************************************
# CD hit step :

path_multi_fasta = f"{path_work}/Dpo_domains.1710.multi.fasta"
path_tmp_cdhit = f"{path_work}/cdhit_clusters_1710"

def make_cdhit_cluster(threshold) :
    cdhit_command = f"cd-hit -i {path_multi_fasta} -o {path_tmp_cdhit}/{threshold}.out -c {threshold} -G 0 -aL 0.8"
    cdhit_process = subprocess.Popen(cdhit_command, shell =True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 
    scan_out, scan_err = cdhit_process.communicate()
    print(scan_out, scan_err)
    

def make_cluster_dico(cdhit_out) :
    import json
    dico_cluster = {}
    threshold = cdhit_out.split("/")[-1].split(".out")[0]
    cluster_out = open(cdhit_out).read().split(">Cluster")
    for index,cluster in enumerate(cluster_out[1:]) :
        tmp_dpo = []
        id_cluster = f"Dpo_cdhit_{index}"
        for _,line in enumerate(cluster.split("\n")[1:-1]) :
            dpo = line.split(">")[1].split(".")[0]
            tmp_dpo.append(dpo)
        dico_cluster[id_cluster] = tmp_dpo
    with open(f"{path_work}/dico_cluster.cdhit__{threshold}.json", "w") as outfile:
        json.dump(dico_cluster, outfile)
    return dico_cluster , threshold

def make_DF_binaries(df_info , dico_cluster, threshold) :
    all_dpo_binaries = []
    for phage in df_info.Phage.unique() :
        dpo_binary = []
        df_phage = df_info[df_info["Phage"] == phage]["index"].values
        for cluster,dpos in dico_cluster.items() :
            shared_item = bool(set(dpos) & set(df_phage))
            if shared_item == True :
                dpo_binary.append(1)
            else :
                dpo_binary.append(0)
        all_dpo_binaries.append(dpo_binary)
    df_dpo_prophages = pd.DataFrame(all_dpo_binaries, index=df_info.Phage.unique(), columns=dico_cluster.keys())
    df_dpo_prophages.to_csv(f"{path_work}/DF_binaries_{threshold}.csv", sep = ",", index = True, header = True)
    return df_dpo_prophages


def make_DF_kltype(df_info, df ,KL_type , dico_cluster,ratio = 5, collapse = True) : 
    # positive data :
    positive_phages = df_info[df_info["KL_type_LCA"] == KL_type]["Phage"].unique()
    df_positives = df[df.index.isin(positive_phages)]
    df_positives = df_positives[~df_positives.index.duplicated(keep='first')]
    binaries_pos = df_positives.values
    labels_pos = [1] * len(binaries_pos)
    phages_pos = df_positives.index
    # negative data :
    n_samples = len(phages_pos)
    negative_phages = []
    for negative_index,phage in enumerate(df_info["Phage"].unique().tolist()) :
        if KL_type not in dico_prophage_kltype_associated[phage] :
            negative_phages.append(phage)
    negative_phages_selected = random.sample(negative_phages, int(n_samples*ratio))
    df_negatives = df[df.index.isin(negative_phages_selected)]
    binaries_neg = df_negatives.values
    labels_neg = [0] * len(binaries_neg)
    all_binaries = np.concatenate((binaries_pos, binaries_neg)) 
    all_labels = labels_pos + labels_neg
    all_indices = list(phages_pos) + list(negative_phages_selected)
    df_kl = pd.DataFrame(all_binaries, index=all_indices, columns=dico_cluster.keys())
    if collapse == True :
        dpo_presence = [dpo for dpo in df_kl.columns if sum(df_kl[dpo]) >0]
        df_kl = df_kl[dpo_presence]
    return (df_kl , all_labels)


def fit_rf_model_random_search(df_kl , all_labels,KL_type, threshold):
    n_iter = 150
    X_train, X_test, y_train, y_test = train_test_split(df_kl, all_labels, test_size=0.2, random_state=243)
    param_grid = {
        'bootstrap': Categorical([True, False]),
        'max_depth': Integer(10, 100),
        'max_features': Categorical(['auto', 'sqrt']),
        'min_samples_leaf': Integer(1, 4),
        'min_samples_split': Integer(2, 10),
        'n_estimators': Integer(200, 800)
    }
    rf = RandomForestClassifier(random_state=42)
    bayes_search = BayesSearchCV(rf, param_grid, n_iter=n_iter, cv=4, n_jobs=-1)
    bayes_search.fit(X_train, y_train)
    predictions = bayes_search.predict(X_test)
    report = classification_report(y_test, predictions, output_dict=True)
    mcc = matthews_corrcoef(y_test, predictions)
    #joblib.dump(bayes_search.best_estimator_, f'{path_models}/{threshold}_RF_{KL_type}.joblib')
    with open(f'{path_models}/{threshold}_RF_{KL_type}.pkl', 'wb') as model_file:
        pickle.dump(bayes_search.best_estimator_, model_file)
    
    return mcc , report




In [None]:

list(map(make_cdhit_cluster , [0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95, 0.975]))
cdhit_files = [f"{path_tmp_cdhit}/{file}" for file in os.listdir(path_tmp_cdhit) if file[-3:]=="out"]
        
def make_prediction_file (path_file, path_binaries) :
    dico_cluster , threshold = make_cluster_dico(f"{path_file}")
    df_binaries = make_DF_binaries(DF_info_lvl_0 , dico_cluster, threshold)
    for KL_type in KLtype_count :
        with open(f"{path_work}/RF_report.{threshold}.tsv", "a+") as outfile :
            if KLtype_count[KL_type] < 10 :
                outfile.write(f"{KL_type}\t{KLtype_count[KL_type]}\t***No sufficient Data\n")
            else :
                df_kl , all_labels = make_DF_kltype(DF_info_lvl_0 ,df_binaries, KL_type , dico_cluster)
                mcc , report = fit_rf_model_random_search(df_kl , all_labels, KL_type,threshold)
                outfile.write(f"{KL_type}\t{KLtype_count[KL_type]}\t{mcc}\t{report['1']['f1-score']}\t{report['accuracy']}\n")
                    
if __name__ == '__main__':
    with ThreadPool(10) as p:
        p.map(make_prediction_file, cdhit_files)         
        

In [None]:
import joblib

# Load the model from the file
loaded_model = joblib.load('path_to_your_saved_model.joblib')

# Now you can use `loaded_model` to make predictions
predictions = loaded_model.predict(X_test)

In [None]:
# ****************************************************************************************
#!/bin/bash
#BATCH --job-name=RF_fitting__
#SBATCH --qos=short
#SBATCH --ntasks=1 
#SBATCH --cpus-per-task=16 
#SBATCH --mem=50gb 
#SBATCH --time=1-00:00:00 
#SBATCH --output=RF_fitting__%j.log 

source /storage/apps/ANACONDA/anaconda3/etc/profile.d/conda.sh
conda activate python_311

/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/Seqbased_model/script_files/RF_fitting.py

In [None]:
rsync -avzhe ssh \
conchae@garnatxa.srv.cpd:/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/Seqbased_model \
/media/concha-eloko/Linux/PPT_clean/ 

### Slight change in the code to reapeat only the 0.85 version (which is the best), but save into pickle bc joblib does not work well :

In [None]:
def make_prediction_fast (KL_type) :
    dico_cluster = json.load(open(f"{path_work}/dico_cluster.cdhit__0.85.json"))
    threshold = 0.85
    df_binaries = pd.read_csv(f"{path_work}/DF_binaries_0.85.csv", sep = ",", header = 0, index_col = 0)
    with open(f"{path_work}/RF_report.{threshold}.deep.no_collapse.pkl.tsv", "a+") as outfile :
        if KLtype_count[KL_type] < 10 :
            outfile.write(f"{KL_type}\t{KLtype_count[KL_type]}\t***No sufficient Data\n")
        else :
            df_kl , all_labels = make_DF_kltype(DF_info_lvl_0 ,df_binaries, KL_type , dico_cluster)
            mcc , report = fit_rf_model_random_search(df_kl , all_labels, KL_type,threshold)
            outfile.write(f"{KL_type}\t{KLtype_count[KL_type]}\t{mcc}\t{report['1']['f1-score']}\t{report['accuracy']}\n")


if __name__ == '__main__':
    with ThreadPool(25) as p:
        p.map(make_prediction_fast, KLtype_pred) 

***
# Debugg : 

In [2]:
file_path = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/Seqbased_model/cdhit_clusters_1710/0.8.out"

def make_cluster_dico(cdhit_out) :
    import json
    dico_cluster = {}
    threshold = cdhit_out.split("/")[-1].split(".out")[0]
    cluster_file = f"{cdhit_out}.clstr"
    cluster_out = open(cluster_file).read().split(">Cluster")
    for index,cluster in enumerate(cluster_out[1:]) :
        tmp_dpo = []
        id_cluster = f"Dpo_cdhit_{index}"
        for _,line in enumerate(cluster.split("\n")[1:-1]) :
            dpo = line.split(">")[1].split(".")[0]
            tmp_dpo.append(dpo)
        dico_cluster[id_cluster] = tmp_dpo
    #with open(f"{path_work}/dico_cluster.cdhit__{threshold}.json", "w") as outfile:
        #json.dump(dico_cluster, outfile)
    return dico_cluster , threshold

dico_cluster , threshold = make_cluster_dico(file_path)



FileNotFoundError: [Errno 2] No such file or directory: '/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/Seqbased_model/cdhit_clusters_1710/0.8.out.clstr'

In [None]:
def make_DF_binaries(df_info , dico_cluster, threshold) :
    all_dpo_binaries = []
    for phage in df_info.Phage.unique() :
        dpo_binary = []
        df_phage = df_info[df_info["Phage"] == phage]["index"].values
        for cluster,dpos in dico_cluster.items() :
            shared_item = bool(set(dpos) & set(df_phage))
            if shared_item == True :
                dpo_binary.append(1)
            else :
                dpo_binary.append(0)
        all_dpo_binaries.append(dpo_binary)
    df_dpo_prophages = pd.DataFrame(all_dpo_binaries, index=df_info.Phage.unique(), columns=dico_cluster.keys())
    #df_dpo_prophages.to_csv(f"{path_work}/DF_binaries_{threshold}.csv", sep = ",", index = True, header = True)
    return df_dpo_prophages

df_binaries = make_DF_binaries(DF_info_lvl_0 , dico_cluster, threshold)


In [None]:
def make_DF_kltype(df_info, df ,KL_type , dico_cluster,ratio = 5, collapse = True) : 
    # positive data :
    positive_phages = df_info[df_info["KL_type_LCA"] == KL_type]["Phage"].unique()
    df_positives = df[df.index.isin(positive_phages)]
    binaries_pos = df_positives.values
    labels_pos = [1] * len(binaries_pos)
    phages_pos = df_positives.index
    # negative data :
    n_samples = len(phages_pos)
    negative_phages = []
    for negative_index,phage in enumerate(df_info["Phage"].unique().tolist()) :
        if KL_type not in dico_prophage_kltype_associated[phage] :
            negative_phages.append(phage)
    negative_phages_selected = random.sample(negative_phages, int(n_samples*ratio))
    df_negatives = df[df.index.isin(negative_phages_selected)]
    binaries_neg = df_negatives.values
    labels_neg = [0] * len(binaries_neg)
    all_binaries = np.concatenate((binaries_pos, binaries_neg)) 
    all_labels = labels_pos + labels_neg
    all_indices = list(phages_pos) + list(negative_phages_selected)
    df_kl = pd.DataFrame(all_binaries, index=all_indices, columns=dico_cluster.keys())
    if collapse == True :
        dpo_presence = [dpo for dpo in df_kl.columns if sum(df_kl[dpo]) >0]
        df_kl = df_kl[dpo_presence]
    return (df_kl , all_labels)

df_kl , all_labels = make_DF_kltype(DF_info_lvl_0 ,df_binaries, "KL64" , dico_cluster)