In [2]:
# Ground modules
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from Bio import SeqIO
from itertools import product
import random
from collections import Counter
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

import logging
import subprocess
from multiprocessing.pool import ThreadPool
import joblib

# SCikitlearn modules :
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report , roc_auc_score

# Scipy modules : 
from scipy.stats import fisher_exact

In [8]:
path_work = "/media/concha-eloko/Linux/PPT_clean"

DF_info = pd.read_csv(f"{path_work}/TropiGATv2.final_df_v2.tsv", sep = "\t" ,  header = 0)
df_prophages = DF_info.drop_duplicates(subset = ["Phage"], keep = "first")
dico_prophage_info = {row["Phage"] : {"prophage_strain" : row["prophage_id"] , "ancestor" : row["Infected_ancestor"]} for _,row in df_prophages.iterrows()}


In [9]:
def get_filtered_prophages(prophage) :
    combinations = []
    to_exclude = set()
    to_keep = set()
    to_keep.add(prophage)
    df_prophage_group = DF_info[(DF_info["prophage_id"] == dico_prophage_info[prophage]["prophage_strain"]) & (DF_info["Infected_ancestor"] == dico_prophage_info[prophage]["ancestor"])]
    if len(df_prophage_group) == 1 : 
        pass
    else :
        depo_set = set(df_prophage_group[df_prophage_group["Phage"] == prophage]["domain_seq"].values)
        for prophage_tmp in df_prophage_group["Phage"].unique().tolist() :
            if prophage_tmp != prophage :
                tmp_depo_set = set(df_prophage_group[df_prophage_group["Phage"] == prophage_tmp]["domain_seq"].values)
                if depo_set == tmp_depo_set :
                    to_exclude.add(prophage_tmp)
                else :
                    if tmp_depo_set not in combinations :
                        to_keep.add(prophage_tmp)
                        combinations.append(tmp_depo_set)
                    else :
                        to_exclude.add(prophage_tmp)
    return df_prophage_group , to_exclude , to_keep

good_prophages = set()
excluded_prophages = set()

for prophage, info_prophage in tqdm(dico_prophage_info.items()) :
    if prophage not in excluded_prophages and prophage not in good_prophages:
        _, excluded_members , kept_members = get_filtered_prophages(prophage) 
        good_prophages.update(kept_members)
        excluded_prophages.update(excluded_members)

DF_info_lvl_0_filtered = DF_info[DF_info["Phage"].isin(good_prophages)]
DF_info_lvl_0_final = DF_info_lvl_0_filtered[~DF_info_lvl_0_filtered["KL_type_LCA"].str.contains("\\|")]

DF_info_lvl_0 = DF_info_lvl_0_final.copy()

# useful dictionary :
KLtype_count = Counter(DF_info_lvl_0["KL_type_LCA"])
KLtype_pred = [kltype for kltype in KLtype_count if KLtype_count[kltype] >= 5]

dico_prophage_kltype_associated = {}
for negative_index,phage in tqdm(enumerate(DF_info_lvl_0["Phage"].unique().tolist())) :
    kltypes = set()
    dpos = DF_info_lvl_0[DF_info_lvl_0["Phage"] == phage]["index"]
    for dpo in dpos : 
        tmp_kltypes = DF_info_lvl_0[DF_info_lvl_0["index"] == dpo]["KL_type_LCA"].values
        kltypes.update(tmp_kltypes)
    dico_prophage_kltype_associated[phage] = kltypes

depo_domains_seq = {index: domain_seq for index, domain_seq in zip(DF_info_lvl_0["index"], DF_info_lvl_0['domain_seq'])}

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 15981/15981 [00:28<00:00, 563.35it/s]
8892it [00:27, 327.50it/s]


In [3]:
import json

path_seqbased = "/media/concha-eloko/Linux/PPT_clean/Seqbased_model"
path_db = f"{path_seqbased}/TropiSeq/TropiSeq_0.85.db"

dico_cluster = json.load(open(f"{path_seqbased}/dico_cluster.cdhit__0.85.json"))
dico_cluster_r = {ref_dpo : key_dpo for key_dpo,list_dpo in dico_cluster.items() for ref_dpo in list_dpo}


In [4]:
dico_cluster_r

{'minibatch__2009': 'Dpo_cdhit_0',
 'ppt__2348': 'Dpo_cdhit_1',
 'minibatch__106': 'Dpo_cdhit_2',
 'minibatch__505': 'Dpo_cdhit_3',
 'ppt__4004': 'Dpo_cdhit_3',
 'ppt__4400': 'Dpo_cdhit_3',
 'anubis_return__372': 'Dpo_cdhit_4',
 'anubis_return__1536': 'Dpo_cdhit_4',
 'anubis_return__3097': 'Dpo_cdhit_4',
 'ppt__612': 'Dpo_cdhit_5',
 'ppt__6611': 'Dpo_cdhit_6',
 'ppt__5387': 'Dpo_cdhit_7',
 'ppt__2174': 'Dpo_cdhit_8',
 'minibatch__1251': 'Dpo_cdhit_9',
 'minibatch__439': 'Dpo_cdhit_9',
 'ppt__3411': 'Dpo_cdhit_9',
 'minibatch__1246': 'Dpo_cdhit_10',
 'ppt__2067': 'Dpo_cdhit_10',
 'ppt__6301': 'Dpo_cdhit_10',
 'ppt__4977': 'Dpo_cdhit_11',
 'minibatch__1084': 'Dpo_cdhit_12',
 'ppt__4840': 'Dpo_cdhit_13',
 'ppt__5234': 'Dpo_cdhit_13',
 'ppt__6878': 'Dpo_cdhit_14',
 'ppt__6587': 'Dpo_cdhit_15',
 'ppt__3793': 'Dpo_cdhit_15',
 'minibatch__102': 'Dpo_cdhit_15',
 'ppt__4600': 'Dpo_cdhit_15',
 'minibatch__2006': 'Dpo_cdhit_15',
 'ppt__4637': 'Dpo_cdhit_16',
 'minibatch__2160': 'Dpo_cdhit_17',
 '

In [17]:
import pickle
import os
from joblib import load

path_seqbased = "/media/concha-eloko/Linux/PPT_clean"

models_TropiSeq = {}

for rf_model in os.listdir(f"{path_seqbased}/Seqbased_model/1702_models") :
    kltype = rf_model.split("_RF_")[1].split(".")[0]
    with open(f"{path_seqbased}/Seqbased_model/1702_models/{rf_model}", 'rb') as file:
        models_TropiSeq[kltype] = load(file)

TropiSeq_results = {}

***
# Approach 1 : Predictions with probability > 0.5

In [18]:
num_arrays = 989
list_of_arrays = [np.zeros(num_arrays) for _ in range(num_arrays)]
for i, arr in enumerate(list_of_arrays):
    arr[i] = 1

In [20]:
for index,array in tqdm(enumerate(list_of_arrays)) :
    cluster_id = "cluster_" + str(index)
    tmp_positif = {}
    for kltype in models_TropiSeq :
        pred = models_TropiSeq[kltype].predict_proba(np.array(array).reshape(1, -1))
        if pred[0][1] >= 0.5 :
            tmp_positif[kltype] = pred[0][1]
    TropiSeq_results[cluster_id] = tmp_positif

#import json 
#with open("/media/concha-eloko/Linux/PPT_clean/Seqbased_model/prediction_based.labeling.0604.json", "w") as outfile :
#    json.dump(TropiSeq_results, outfile)

989it [1:07:36,  4.10s/it]


In [26]:
from collections import defaultdict

TropiSeq_results_frtm = defaultdict(list)
for cluster,dico_kltype in TropiSeq_results.items() :
    for kltype in dico_kltype :
        TropiSeq_results_frtm[cluster].append(kltype)

> The combinations :

In [28]:
from itertools import combinations
pairs_list_pred = []
for clu,targets in TropiSeq_results_frtm.items():
    pairs = combinations(targets, 2)
    pairs_list_pred.extend(pairs)

In [30]:
with open("/media/concha-eloko/Linux/PPT_clean/Network_file.TropiSeq.pred.0604.tsv", "w") as outfile :
    for tuple,count in dict(Counter(pairs_list_pred)).items() :
        if count > 1 :
            outfile.write(f"{tuple[0]}\t{tuple[1]}\t{count}\n")

***
# Approach 2 : Feature importance

In [5]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

def plot_feature_importances(importances, feature_names):
    """
    Plot feature importances using a bar plot.
    
    Parameters:
        importances (array-like): Feature importances.
        feature_names (list): Names of the features.
    """
    # Filter features with importances greater than 0
    nonzero_indices = importances > 0.1
    importances = importances[nonzero_indices]
    feature_names = [feature_names[i] for i, is_nonzero in enumerate(nonzero_indices) if is_nonzero]

    # Sort the features by their importances
    indices = np.argsort(importances)[::-1]

    # Plot the feature importances
    plt.figure(figsize=(10, 6))
    plt.title("Feature Importances")
    plt.bar(range(len(importances)), importances[indices], color="skyblue", align="center")
    plt.xticks(range(len(importances)), [feature_names[i] for i in indices], rotation=45, ha="right")
    plt.xlabel("Feature")
    plt.ylabel("Importance")
    plt.tight_layout()
    plt.show()

def top_features(importances, feature_names, threshold=0.1):
    """
    Get the features with importance scores above a given threshold, along with their importance scores.
    
    Parameters:
        importances (array-like): Feature importances.
        feature_names (list): Names of the features.
        threshold (float): Importance threshold.
    
    Returns:
        feature_info (list): List of tuples containing feature name and importance score.
    """
    # Create a list of tuples containing feature name and importance score
    feature_info = [(feature_names[i], importances[i]) for i in range(len(importances)) if importances[i] > threshold]
    
    # Sort the feature_info list by importance score (in descending order)
    feature_info.sort(key=lambda x: x[1], reverse=True)
    
    return feature_info

In [6]:
feature_importance_dico = {}
cluster_ids = ["cluster_" + str(index) for index in range(0,989)]

for KL_type in models_TropiSeq :
    model = models_TropiSeq[KL_type]
    feature_importances = model.feature_importances_
    top_f = top_features(feature_importances ,cluster_ids,threshold=0.05)
    feature_importance_dico[KL_type] = top_f

In [7]:

KL_clusters = {}
all_KL_clusters = {}

for cluster in cluster_ids : 
    cluster_list = []
    for KL_type in feature_importance_dico :
        for tuple in feature_importance_dico[KL_type] :
            if tuple[0] == cluster :
                cluster_list.append(KL_type)
    if len(cluster_list) > 1 :
        KL_clusters[cluster] = cluster_list
    all_KL_clusters[cluster] = cluster_list
    

In [22]:
KL_clusters

{'cluster_44': ['KL137', 'KL123'],
 'cluster_50': ['KL47', 'KL64'],
 'cluster_64': ['KL21', 'KL39', 'KL166', 'KL24', 'KL48', 'KL112'],
 'cluster_88': ['KL8', 'KL7', 'KL3'],
 'cluster_89': ['KL8', 'KL9', 'KL31', 'KL48'],
 'cluster_106': ['KL152', 'KL145'],
 'cluster_134': ['KL6', 'KL109'],
 'cluster_151': ['KL47',
  'KL107',
  'KL105',
  'KL41',
  'KL64',
  'KL102',
  'KL125',
  'KL15',
  'KL152',
  'KL58',
  'KL36',
  'KL164',
  'KL24',
  'KL147',
  'KL6',
  'KL13',
  'KL106',
  'KL103',
  'KL108'],
 'cluster_156': ['KL145', 'KL55'],
 'cluster_159': ['KL15', 'KL136'],
 'cluster_161': ['KL107', 'KL6', 'KL112'],
 'cluster_166': ['KL23', 'KL28'],
 'cluster_174': ['KL26', 'KL74'],
 'cluster_186': ['KL107', 'KL164'],
 'cluster_193': ['KL67', 'KL17', 'KL56'],
 'cluster_220': ['KL20',
  'KL51',
  'KL124',
  'KL137',
  'KL16',
  'KL55',
  'KL61',
  'KL122',
  'KL48'],
 'cluster_226': ['KL27', 'KL20'],
 'cluster_227': ['KL136', 'KL122'],
 'cluster_228': ['KL5', 'KL117'],
 'cluster_237': ['KL9',

### Check the combinations : 

In [15]:
from itertools import combinations
pairs_list = []
for clu,targets in KL_clusters.items():
    pairs = combinations(targets, 2)
    pairs_list.extend(pairs)

In [16]:
dict(Counter(pairs_list))

{('KL137', 'KL123'): 1,
 ('KL47', 'KL64'): 3,
 ('KL21', 'KL39'): 1,
 ('KL21', 'KL166'): 1,
 ('KL21', 'KL24'): 1,
 ('KL21', 'KL48'): 1,
 ('KL21', 'KL112'): 1,
 ('KL39', 'KL166'): 1,
 ('KL39', 'KL24'): 1,
 ('KL39', 'KL48'): 1,
 ('KL39', 'KL112'): 1,
 ('KL166', 'KL24'): 1,
 ('KL166', 'KL48'): 1,
 ('KL166', 'KL112'): 1,
 ('KL24', 'KL48'): 1,
 ('KL24', 'KL112'): 2,
 ('KL48', 'KL112'): 1,
 ('KL8', 'KL7'): 1,
 ('KL8', 'KL3'): 1,
 ('KL7', 'KL3'): 1,
 ('KL8', 'KL9'): 1,
 ('KL8', 'KL31'): 1,
 ('KL8', 'KL48'): 1,
 ('KL9', 'KL31'): 1,
 ('KL9', 'KL48'): 2,
 ('KL31', 'KL48'): 1,
 ('KL152', 'KL145'): 1,
 ('KL6', 'KL109'): 1,
 ('KL47', 'KL107'): 2,
 ('KL47', 'KL105'): 1,
 ('KL47', 'KL41'): 1,
 ('KL47', 'KL102'): 1,
 ('KL47', 'KL125'): 1,
 ('KL47', 'KL15'): 2,
 ('KL47', 'KL152'): 1,
 ('KL47', 'KL58'): 1,
 ('KL47', 'KL36'): 1,
 ('KL47', 'KL164'): 1,
 ('KL47', 'KL24'): 1,
 ('KL47', 'KL147'): 1,
 ('KL47', 'KL6'): 1,
 ('KL47', 'KL13'): 1,
 ('KL47', 'KL106'): 1,
 ('KL47', 'KL103'): 1,
 ('KL47', 'KL108'): 1,

In [18]:
with open("/media/concha-eloko/Linux/PPT_clean/Network_file.TropiSeq.1002.tsv", "w") as outfile :
    for tuple,count in dict(Counter(pairs_list)).items() :
        if count > 1 :
            outfile.write(f"{tuple[0]}\t{tuple[1]}\t{count}\n")

> Make the predictions : 

In [30]:
1062.024/3 , 1693.558/3

(354.008, 564.5193333333333)

In [33]:
'''for index,array in tqdm(enumerate(list_of_arrays)) :
    cluster_id = "cluster_" + str(index)
    tmp_positif = {}
    for kltype in models_TropiSeq :
        pred = models_TropiSeq[kltype].predict_proba(np.array(array).reshape(1, -1))
        if pred[0][1] >= 0.5 :
            tmp_positif[kltype] = pred[0][1]
    TropiSeq_results[cluster_id] = tmp_positif'''

#import json 
#with open("/media/concha-eloko/Linux/PPT_clean/Seqbased_model/cluster_KLtypes.json", "w") as outfile :
#    json.dump(TropiSeq_results, outfile)

883it [28:00,  1.90s/it]


> Open predictions :

In [76]:
import json 

path_pred = "/media/concha-eloko/Linux/PPT_clean/Seqbased_model/cluster_KLtypes.json"

TropiSeq_results = json.load(open(path_pred))

In [77]:
TropiSeq_results

{'cluster_0': {},
 'cluster_1': {},
 'cluster_2': {},
 'cluster_3': {'KL43': 0.5768679824908414},
 'cluster_4': {'KL64': 0.7243079795779298, 'KL54': 0.7225832357688411},
 'cluster_5': {'KL38': 0.5951490162713051},
 'cluster_6': {},
 'cluster_7': {},
 'cluster_8': {},
 'cluster_9': {},
 'cluster_10': {},
 'cluster_11': {'KL30': 0.5230767259793374},
 'cluster_12': {},
 'cluster_13': {},
 'cluster_14': {'KL14': 0.7441289066212997},
 'cluster_15': {},
 'cluster_16': {},
 'cluster_17': {},
 'cluster_18': {},
 'cluster_19': {'KL16': 0.9437004643985163},
 'cluster_20': {},
 'cluster_21': {},
 'cluster_22': {},
 'cluster_23': {},
 'cluster_24': {},
 'cluster_25': {},
 'cluster_26': {},
 'cluster_27': {},
 'cluster_28': {'KL123': 0.6101621438255168},
 'cluster_29': {},
 'cluster_30': {},
 'cluster_31': {},
 'cluster_32': {},
 'cluster_33': {},
 'cluster_34': {},
 'cluster_35': {},
 'cluster_36': {},
 'cluster_37': {'KL64': 0.8004726564524154, 'KL10': 0.5359320862005766},
 'cluster_38': {'KL151'

In [37]:
from collections import Counter
lengths = [len(TropiSeq_results[cluster]) for cluster in TropiSeq_results]


Counter({1: 382,
         0: 361,
         2: 76,
         3: 26,
         4: 11,
         5: 8,
         7: 5,
         6: 4,
         11: 3,
         8: 3,
         9: 2,
         12: 1,
         10: 1})

In [43]:
from itertools import combinations
pairs_list = []
associations_tropiseq = [set(kl for kl in TropiSeq_results[cluster]) for cluster in TropiSeq_results if len(TropiSeq_results[cluster])>0]


for s in associations_tropiseq:
    # Convert set to list for compatibility with combinations
    elements = list(s)
    pairs = combinations(elements, 2)
    pairs_list.extend(pairs)

# Convert pairs_list to a list of tuples
#pairs_list = list(pairs_list)




In [45]:
Counter(pairs_list)

Counter({('KL47', 'KL64'): 5,
         ('KL30', 'KL125'): 4,
         ('KL51', 'KL81'): 3,
         ('KL123', 'KL43'): 3,
         ('KL21', 'KL64'): 3,
         ('KL24', 'KL28'): 3,
         ('KL105', 'KL15'): 3,
         ('KL36', 'KL106'): 3,
         ('KL36', 'KL15'): 3,
         ('KL36', 'KL107'): 3,
         ('KL24', 'KL15'): 3,
         ('KL106', 'KL15'): 3,
         ('KL106', 'KL107'): 3,
         ('KL15', 'KL107'): 3,
         ('KL15', 'KL64'): 3,
         ('KL107', 'KL64'): 3,
         ('KL107', 'KL106'): 3,
         ('KL74', 'KL26'): 3,
         ('KL116', 'KL30'): 3,
         ('KL116', 'KL125'): 3,
         ('KL8', 'KL22'): 3,
         ('KL107', 'KL15'): 3,
         ('KL2', 'KL122'): 3,
         ('KL2', 'KL64'): 3,
         ('KL13', 'KL2'): 3,
         ('KL5', 'KL30'): 3,
         ('KL51', 'KL2'): 3,
         ('KL21', 'KL47'): 2,
         ('KL24', 'KL112'): 2,
         ('KL112', 'KL39'): 2,
         ('KL8', 'KL1'): 2,
         ('KL31', 'KL14'): 2,
         ('KL48', 'KL9'): 2,


***
# Approach 3 : Statistics

In [None]:
/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/Seqbased_model

rsync -avzhe ssh \
conchae@garnatxa.srv.cpd:/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/Seqbased_model/DF_binaries_0.85.csv \
/media/concha-eloko/Linux/PPT_clean




In [12]:
import json
from statsmodels.stats.multitest import multipletests
import pandas as pd

path_seqbased = "/media/concha-eloko/Linux/PPT_clean/Seqbased_model"
path_db = f"{path_seqbased}/TropiSeq/TropiSeq_0.85.db"

dico_cluster = json.load(open(f"{path_seqbased}/dico_cluster.cdhit__0.85.json"))
dico_cluster_r = {ref_dpo : key_dpo for key_dpo,list_dpo in dico_cluster.items() for ref_dpo in list_dpo}


In [13]:
path_work = "/media/concha-eloko/Linux/PPT_clean"

DF_info = pd.read_csv(f"{path_work}/TropiGATv2.final_df_v2.tsv", sep = "\t" ,  header = 0)
df_prophages = DF_info.drop_duplicates(subset = ["Phage"], keep = "first")
dico_prophage_info = {row["Phage"] : {"prophage_strain" : row["prophage_id"] , "ancestor" : row["Infected_ancestor"]} for _,row in df_prophages.iterrows()}

df_binaries = pd.read_csv(f"{path_work}/DF_binaries_0.85.csv", sep = ",", header = 0, index_col = 0)

In [39]:
df_binaries

Unnamed: 0,Dpo_cdhit_0,Dpo_cdhit_1,Dpo_cdhit_2,Dpo_cdhit_3,Dpo_cdhit_4,Dpo_cdhit_5,Dpo_cdhit_6,Dpo_cdhit_7,Dpo_cdhit_8,Dpo_cdhit_9,...,Dpo_cdhit_979,Dpo_cdhit_980,Dpo_cdhit_981,Dpo_cdhit_982,Dpo_cdhit_983,Dpo_cdhit_984,Dpo_cdhit_985,Dpo_cdhit_986,Dpo_cdhit_987,Dpo_cdhit_988
GCF_902164905.1__phage1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GCF_017310305.1__phage5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GCF_001701985.1__phage2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GCF_001611095.1__phage5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GCF_902156555.1__phage3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GCF_000349245.1__phage1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GCF_900506765.1__phage17,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GCF_002186895.1__phage9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GCF_004312845.1__phage3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
def get_filtered_prophages(prophage) :
    combinations = []
    to_exclude = set()
    to_keep = set()
    to_keep.add(prophage)
    df_prophage_group = DF_info[(DF_info["prophage_id"] == dico_prophage_info[prophage]["prophage_strain"]) & (DF_info["Infected_ancestor"] == dico_prophage_info[prophage]["ancestor"])]
    if len(df_prophage_group) == 1 : 
        pass
    else :
        depo_set = set(df_prophage_group[df_prophage_group["Phage"] == prophage]["domain_seq"].values)
        for prophage_tmp in df_prophage_group["Phage"].unique().tolist() :
            if prophage_tmp != prophage :
                tmp_depo_set = set(df_prophage_group[df_prophage_group["Phage"] == prophage_tmp]["domain_seq"].values)
                if depo_set == tmp_depo_set :
                    to_exclude.add(prophage_tmp)
                else :
                    if tmp_depo_set not in combinations :
                        to_keep.add(prophage_tmp)
                        combinations.append(tmp_depo_set)
                    else :
                        to_exclude.add(prophage_tmp)
    return df_prophage_group , to_exclude , to_keep

good_prophages = set()
excluded_prophages = set()

for prophage, info_prophage in tqdm(dico_prophage_info.items()) :
    if prophage not in excluded_prophages and prophage not in good_prophages:
        _, excluded_members , kept_members = get_filtered_prophages(prophage) 
        good_prophages.update(kept_members)
        excluded_prophages.update(excluded_members)

DF_info_lvl_0_filtered = DF_info[DF_info["Phage"].isin(good_prophages)]
DF_info_lvl_0_final = DF_info_lvl_0_filtered[~DF_info_lvl_0_filtered["KL_type_LCA"].str.contains("\\|")]

DF_info_lvl_0 = DF_info_lvl_0_final.copy()
DF_info_lvl_0 = DF_info_lvl_0.drop_duplicates(subset = ["Protein_name"])

# useful dictionary :
KLtype_count = Counter(DF_info_lvl_0["KL_type_LCA"])
KLtype_pred = [kltype for kltype in KLtype_count if KLtype_count[kltype] >= 5]

dico_prophage_kltype_associated = {}
#for negative_index,phage in tqdm(enumerate(DF_info_lvl_0["Phage"].unique().tolist())) :
#    kltypes = set()
#    dpos = DF_info_lvl_0[DF_info_lvl_0["Phage"] == phage]["index"]
#    for dpo in dpos : 
#        tmp_kltypes = DF_info_lvl_0[DF_info_lvl_0["index"] == dpo]["KL_type_LCA"].values
#        kltypes.update(tmp_kltypes)
#    dico_prophage_kltype_associated[phage] = kltypes

depo_domains_seq = {index: domain_seq for index, domain_seq in zip(DF_info_lvl_0["index"], DF_info_lvl_0['domain_seq'])}
#with open(f"{path_work}/Dpo_domains.2912.multi.fasta" , "w") as outfile : 
#    for index,seq in depo_domains_seq.items() : 
#        outfile.write(f">{index}\n{seq}\n")
       

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 15981/15981 [00:32<00:00, 487.15it/s]
8892it [00:30, 293.54it/s]


In [67]:
# ******************************************************
# CD hit step :
path_multi_fasta = f"{path_work}/Dpo_domains.2912.multi.fasta"
path_tmp_cdhit = f"{path_work}/cdhit_clusters_2912"


def make_DF_kltype(df_info, df ,KL_type , dico_cluster,ratio = 5, collapse = False) : 
    # positive data :
    positive_phages = df_info[df_info["KL_type_LCA"] == KL_type]["Phage"].unique()
    df_positives = df[df.index.isin(positive_phages)]
    #df_positives = df_positives.drop_duplicates(subset = ["Phage"] , keep = "first")
    df_positives = df_positives[~df_positives.index.duplicated(keep='first')]
    binaries_pos = df_positives.values
    labels_pos = [1] * len(binaries_pos)
    phages_pos = df_positives.index
    # negative data :
    n_samples = len(phages_pos)
    negative_phages = []
    for negative_index,phage in enumerate(df_info["Phage"].unique().tolist()) :
        if KL_type not in dico_prophage_kltype_associated[phage] :
            negative_phages.append(phage)
    negative_phages_selected = random.sample(negative_phages, int(n_samples*ratio))
    df_negatives = df[df.index.isin(negative_phages_selected)]
    binaries_neg = df_negatives.values
    labels_neg = [0] * len(binaries_neg)
    all_binaries = np.concatenate((binaries_pos, binaries_neg)) 
    all_labels = labels_pos + labels_neg
    all_indices = list(phages_pos) + list(negative_phages_selected)
    df_kl = pd.DataFrame(all_binaries, index=all_indices, columns=dico_cluster.keys())

    return df_kl , all_labels


def make_prediction_file (KL_type) :
    df_kl , all_labels = make_DF_kltype(DF_info_lvl_0 ,df_binaries, KL_type , dico_cluster, collapse = False)
    return df_kl , all_labels



In [75]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
from statsmodels.stats.contingency_tables import Table2x2
from statsmodels.sandbox.stats.multicomp import multipletests

def compute_chi_square(df, labels):
    chi_square_results = pd.DataFrame(index=df.columns, columns=['Chi-square', 'P-value'])
    for col in df.columns:
        contingency_table = pd.crosstab(df[col], labels)
        chi2, p_value, _, _ = chi2_contingency(contingency_table)
        chi_square_results.loc[col] = [chi2, p_value]
    return chi_square_results

def compute_odds_ratio(df, labels):
    odds_ratio_results = pd.DataFrame(index=df.columns, columns=['Odds ratio', 'P-value'])
    for col in df.columns:
        table = Table2x2(pd.crosstab(df[col], labels))
        odds_ratio_results.loc[col, 'Odds ratio'], odds_ratio_results.loc[col, 'P-value'] = table.oddsratio_pvalue()
    return odds_ratio_results

def benjamini_hochberg_correction(results_df):
    p_values = results_df['P-value']
    reject, corrected_p_values, _, _ = multipletests(p_values, method='fdr_bh')
    results_df['Corrected p-value'] = corrected_p_values
    results_df['Reject null hypothesis'] = reject
    return results_df


> Chi-2

In [87]:
chisquare_dico = defaultdict(list)

for KL_type in tqdm(KLtype_pred) :
    df_KL_type, labels_KL_type = make_prediction_file(KL_type)
    chi_KL_type = compute_chi_square(df_KL_type, labels_KL_type)
    corrected_KL_type = benjamini_hochberg_correction(chi_KL_type)
    corrected_KL_type_succ = corrected_KL_type[corrected_KL_type["Reject null hypothesis"] == True]
    for prot,prot_info in corrected_KL_type_succ.iterrows() :
        a = (prot, prot_info["Corrected p-value"])
        chisquare_dico[KL_type].append(a)

In [89]:
#with open("/media/concha-eloko/Linux/PPT_clean/Seqbased_model/chi2.labeling.0604.json", "w") as outfile :
#    json.dump(chisquare_dico, outfile)

In [15]:
chisquare_dico = json.load(open("/media/concha-eloko/Linux/PPT_clean/Seqbased_model/chi2.labeling.0604.json"))

In [16]:
chisquare_dico

{'KL30': [['Dpo_cdhit_12', 0.00017771665698402528],
  ['Dpo_cdhit_23', 0.00195368887847524],
  ['Dpo_cdhit_471', 2.587269267847131e-05],
  ['Dpo_cdhit_594', 0.00017771665698402528],
  ['Dpo_cdhit_739', 0.005763059770386412],
  ['Dpo_cdhit_883', 2.587269267847131e-05]],
 'KL19': [['Dpo_cdhit_48', 2.937170125714393e-11],
  ['Dpo_cdhit_172', 0.02218720163739315],
  ['Dpo_cdhit_233', 0.00013812934086225345],
  ['Dpo_cdhit_309', 2.084748948865438e-13],
  ['Dpo_cdhit_432', 0.00013812934086225345],
  ['Dpo_cdhit_532', 1.3894206723743427e-22]],
 'KL25': [['Dpo_cdhit_50', 0.01074301576844586],
  ['Dpo_cdhit_175', 4.548261277974428e-09],
  ['Dpo_cdhit_280', 0.0008645001322490626],
  ['Dpo_cdhit_282', 5.869500767749009e-06],
  ['Dpo_cdhit_334', 6.1199856700488986e-15],
  ['Dpo_cdhit_406', 5.2604741672530814e-79],
  ['Dpo_cdhit_472', 2.520404843282725e-26],
  ['Dpo_cdhit_480', 5.104870744462972e-54],
  ['Dpo_cdhit_487', 5.560223173546723e-07],
  ['Dpo_cdhit_515', 3.2681546076685253e-10],
  ['Dpo_c

In [18]:
DF_info_lvl_0 = DF_info_lvl_0.drop_duplicates(subset = ["Protein_name"])
DF_info_lvl_0

Unnamed: 0,Phage,Protein_name,KL_type_LCA,Infected_ancestor,index,Dataset,seq,domain_seq,1,2,...,1272,1273,1274,1275,1276,1277,1278,1279,1280,prophage_id
0,GCF_902164905.1__phage1,GCF_902164905.1__phage1__34,KL41,GCF_902164905.1,minibatch__460,minibatch,MPATPQDRLYGLTTSVAVKPPVFISVDYDVARFGEQTITSKTPTDE...,QDRLYGLTTSVAVKPPVFISVDYDVARFGEQTITSKTPTDERTITT...,0.025276,0.053137,...,-0.011464,0.081105,0.012011,0.042917,0.009402,0.093175,-0.080562,0.000897,0.111854,prophage_11309
4,GCF_017310305.1__phage5,GCF_017310305.1__phage5__1353,KL30,n4996,minibatch__1084,minibatch,MTVSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVT...,VSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVTLT...,0.004905,0.040896,...,-0.040657,0.087288,0.022292,0.024434,0.025246,0.083449,-0.123537,0.047648,0.061250,prophage_5
10,GCF_001701985.1__phage2,GCF_001701985.1__phage2__357,KL30,n4988,minibatch__1084,minibatch,MTVSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVT...,VSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVTLT...,0.004905,0.040896,...,-0.040657,0.087288,0.022292,0.024434,0.025246,0.083449,-0.123537,0.047648,0.061250,prophage_6465
12,GCF_001611095.1__phage5,GCF_001611095.1__phage5__1365,KL30,n49894989,minibatch__1084,minibatch,MTVSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVT...,VSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVTLT...,0.004905,0.040896,...,-0.040657,0.087288,0.022292,0.024434,0.025246,0.083449,-0.123537,0.047648,0.061250,prophage_5
15,GCF_902156555.1__phage3,GCF_902156555.1__phage3__511,KL30,GCF_902156555.1,minibatch__1084,minibatch,MTVSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVT...,VSTQVSRNEYTGNGATTQYDFTFRILDKSHLLVQTMDTSENIVTLT...,0.004905,0.040896,...,-0.040657,0.087288,0.022292,0.024434,0.025246,0.083449,-0.123537,0.047648,0.061250,prophage_1828
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21342,GCF_900506765.1__phage17,GCF_900506765.1__phage17__90,KL149,GCF_900506765.1,anubis_return__4216,anubis_return,MMTTLNEHPQWESDIYLIKRSDLVAGGRGGIANMQAQQLANRTAFL...,NRRWFRRFTGNIRAEWSGIHDLSQSSAPVDSYIYRLLLASAVGSPD...,0.053704,0.083858,...,0.032803,0.109572,0.010032,0.024949,0.094129,0.028693,-0.061396,0.006824,0.046220,prophage_15598
21344,GCF_003255785.1__phage1,GCF_003255785.1__phage1__10,KL127,GCF_003255785.1,anubis_return__4239,anubis_return,MNGLNHNALTCSAVPIPPWERSLQTVEAQPYFSVSQASLVLEGIVF...,MNGLNHNALTCSAVPIPPWERSLQTVEAQPYFSVSQASLVLEGIVF...,0.010626,-0.025389,...,0.045372,0.009262,-0.008319,-0.050856,0.034115,0.101663,-0.108278,-0.135629,0.102486,prophage_3577
21346,GCF_002186895.1__phage9,GCF_002186895.1__phage9__5,KL57,GCF_002186895.1,anubis_return__4260,anubis_return,MRYRFIALALCLLSGSKVAISAGFDCSLANLSPTEKTICSNEYLSG...,ITDSPWLVKKIFSSDSFEGGINLEGMNVSSILTYQEIKNDLYIYIS...,0.073450,0.046651,...,0.035302,0.012151,0.003563,-0.022575,0.014130,0.063376,-0.050646,-0.085156,-0.010849,prophage_6002
21347,GCF_004312845.1__phage3,GCF_004312845.1__phage3__38,KL9,GCF_004312845.1,anubis_return__4275,anubis_return,MAILITGKSMTRLPESSSWEEEIELITRSERVAGGLDGPANRPLKS...,DAVIRRDLASDKGTSGVGKLGDKPLVAISYYKSKGQSDQDAVQAAF...,0.032196,0.048856,...,-0.016331,0.084711,0.056063,0.001793,0.073958,0.090169,-0.060105,0.023726,0.086452,prophage_12656


> Odds ratio

In [92]:
df_KL_type, labels_KL_type = make_prediction_file("KL64")
#odds_KL_type = compute_odds_ratio(df_KL_type, labels_KL_type)


In [94]:
df_KL_type["labels"] = labels_KL_type

In [97]:
df_KL_type[df_KL_type["Dpo_cdhit_0"] == 1]

Unnamed: 0,Dpo_cdhit_0,Dpo_cdhit_1,Dpo_cdhit_2,Dpo_cdhit_3,Dpo_cdhit_4,Dpo_cdhit_5,Dpo_cdhit_6,Dpo_cdhit_7,Dpo_cdhit_8,Dpo_cdhit_9,...,Dpo_cdhit_980,Dpo_cdhit_981,Dpo_cdhit_982,Dpo_cdhit_983,Dpo_cdhit_984,Dpo_cdhit_985,Dpo_cdhit_986,Dpo_cdhit_987,Dpo_cdhit_988,labels
GCF_916172625.1__phage4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [101]:
for depo_cluster in df_KL_type.columns:
    dico_count = dict(Counter(df_KL_type[depo_cluster]))
    n_absent = dico_count.get(0, 0)  
    n_present = dico_count.get(1, 0)
    if n_present > 0 :
        if 1 in df_KL_type[df_KL_type[depo_cluster] == 1]["labels"].values :
            label_absent = df_KL_type[df_KL_type[depo_cluster] == 0].values
    