In [1]:
# Ground modules
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from Bio import SeqIO
from itertools import product
import random
from collections import Counter
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

import logging
import subprocess
from multiprocessing.pool import ThreadPool
import joblib

# SCikitlearn modules :
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report , roc_auc_score

# Scipy modules : 
from scipy.stats import fisher_exact

In [2]:
import json

path_seqbased = "/media/concha-eloko/Linux/PPT_clean/Seqbased_model"
path_db = f"{path_seqbased}/TropiSeq/TropiSeq_0.75.db"

dico_cluster = json.load(open(f"{path_seqbased}/dico_cluster.cdhit__0.75.json"))
dico_cluster_r = {ref_dpo : key_dpo for key_dpo,list_dpo in dico_cluster.items() for ref_dpo in list_dpo}


In [5]:
len(dico_cluster)

883

In [3]:
num_arrays = 883
list_of_arrays = [np.zeros(num_arrays) for _ in range(num_arrays)]
for i, arr in enumerate(list_of_arrays):
    arr[i] = 1

***
# Make predictions

In [5]:
import pickle
import os
from joblib import load

path_seqbased = "/media/concha-eloko/Linux/PPT_clean"

models_TropiSeq = {}

for rf_model in os.listdir(f"{path_seqbased}/selected_RF_3112") :
        kltype = rf_model.split("_RF_")[1].split(".")[0]
        with open(f"{path_seqbased}/selected_RF_3112/{rf_model}", 'rb') as file:
            models_TropiSeq[kltype] = load(file)

TropiSeq_results = {}

In [6]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

def plot_feature_importances(importances, feature_names):
    """
    Plot feature importances using a bar plot.
    
    Parameters:
        importances (array-like): Feature importances.
        feature_names (list): Names of the features.
    """
    # Filter features with importances greater than 0
    nonzero_indices = importances > 0.1
    importances = importances[nonzero_indices]
    feature_names = [feature_names[i] for i, is_nonzero in enumerate(nonzero_indices) if is_nonzero]

    # Sort the features by their importances
    indices = np.argsort(importances)[::-1]

    # Plot the feature importances
    plt.figure(figsize=(10, 6))
    plt.title("Feature Importances")
    plt.bar(range(len(importances)), importances[indices], color="skyblue", align="center")
    plt.xticks(range(len(importances)), [feature_names[i] for i in indices], rotation=45, ha="right")
    plt.xlabel("Feature")
    plt.ylabel("Importance")
    plt.tight_layout()
    plt.show()

def top_features(importances, feature_names, threshold=0.1):
    """
    Get the features with importance scores above a given threshold, along with their importance scores.
    
    Parameters:
        importances (array-like): Feature importances.
        feature_names (list): Names of the features.
        threshold (float): Importance threshold.
    
    Returns:
        feature_info (list): List of tuples containing feature name and importance score.
    """
    # Create a list of tuples containing feature name and importance score
    feature_info = [(feature_names[i], importances[i]) for i in range(len(importances)) if importances[i] > threshold]
    
    # Sort the feature_info list by importance score (in descending order)
    feature_info.sort(key=lambda x: x[1], reverse=True)
    
    return feature_info

In [7]:
plot_feature_importances(feature_importances ,cluster_ids)

NameError: name 'feature_importances' is not defined

In [73]:
top_features(feature_importances ,cluster_ids)

[('cluster_782', 0.3859021196930758),
 ('cluster_323', 0.1205076599204536),
 ('cluster_793', 0.12003602612586292),
 ('cluster_373', 0.09600447502887081),
 ('cluster_258', 0.04900119599818295),
 ('cluster_611', 0.04710584293518818),
 ('cluster_151', 0.017350842339643624),
 ('cluster_324', 0.01662482146974565),
 ('cluster_706', 0.015400660597113229)]

In [12]:
feature_importance_dico = {}
cluster_ids = ["cluster_" + str(index) for index in range(0,884)]

for KL_type in models_TropiSeq :
    model = models_TropiSeq[KL_type]
    feature_importances = model.feature_importances_
    top_f = top_features(feature_importances ,cluster_ids,threshold=0.05)
    feature_importance_dico[KL_type] = top_f

In [13]:
feature_importance_dico

KL_clusters = {}

for cluster in cluster_ids : 
    cluster_list = []
    for KL_type in feature_importance_dico :
        for tuple in feature_importance_dico[KL_type] :
            if tuple[0] == cluster :
                cluster_list.append(KL_type)
    if len(cluster_list) > 1 :
        KL_clusters[cluster] = cluster_list
    

In [14]:
len(KL_clusters)

67

In [15]:
from itertools import combinations
pairs_list = []
for clu,targets in KL_clusters.items():
    pairs = combinations(targets, 2)
    pairs_list.extend(pairs)

In [16]:
dict(Counter(pairs_list))

{('KL137', 'KL123'): 1,
 ('KL47', 'KL64'): 3,
 ('KL21', 'KL39'): 1,
 ('KL21', 'KL166'): 1,
 ('KL21', 'KL24'): 1,
 ('KL21', 'KL48'): 1,
 ('KL21', 'KL112'): 1,
 ('KL39', 'KL166'): 1,
 ('KL39', 'KL24'): 1,
 ('KL39', 'KL48'): 1,
 ('KL39', 'KL112'): 1,
 ('KL166', 'KL24'): 1,
 ('KL166', 'KL48'): 1,
 ('KL166', 'KL112'): 1,
 ('KL24', 'KL48'): 1,
 ('KL24', 'KL112'): 2,
 ('KL48', 'KL112'): 1,
 ('KL8', 'KL7'): 1,
 ('KL8', 'KL3'): 1,
 ('KL7', 'KL3'): 1,
 ('KL8', 'KL9'): 1,
 ('KL8', 'KL31'): 1,
 ('KL8', 'KL48'): 1,
 ('KL9', 'KL31'): 1,
 ('KL9', 'KL48'): 2,
 ('KL31', 'KL48'): 1,
 ('KL152', 'KL145'): 1,
 ('KL6', 'KL109'): 1,
 ('KL47', 'KL107'): 2,
 ('KL47', 'KL105'): 1,
 ('KL47', 'KL41'): 1,
 ('KL47', 'KL102'): 1,
 ('KL47', 'KL125'): 1,
 ('KL47', 'KL15'): 2,
 ('KL47', 'KL152'): 1,
 ('KL47', 'KL58'): 1,
 ('KL47', 'KL36'): 1,
 ('KL47', 'KL164'): 1,
 ('KL47', 'KL24'): 1,
 ('KL47', 'KL147'): 1,
 ('KL47', 'KL6'): 1,
 ('KL47', 'KL13'): 1,
 ('KL47', 'KL106'): 1,
 ('KL47', 'KL103'): 1,
 ('KL47', 'KL108'): 1,

In [18]:
with open("/media/concha-eloko/Linux/PPT_clean/Network_file.TropiSeq.1002.tsv", "w") as outfile :
    for tuple,count in dict(Counter(pairs_list)).items() :
        if count > 1 :
            outfile.write(f"{tuple[0]}\t{tuple[1]}\t{count}\n")

In [30]:
1062.024/3 , 1693.558/3

(354.008, 564.5193333333333)

> Make the predictions : 

In [33]:
'''for index,array in tqdm(enumerate(list_of_arrays)) :
    cluster_id = "cluster_" + str(index)
    tmp_positif = {}
    for kltype in models_TropiSeq :
        pred = models_TropiSeq[kltype].predict_proba(np.array(array).reshape(1, -1))
        if pred[0][1] >= 0.5 :
            tmp_positif[kltype] = pred[0][1]
    TropiSeq_results[cluster_id] = tmp_positif'''

#import json 
#with open("/media/concha-eloko/Linux/PPT_clean/Seqbased_model/cluster_KLtypes.json", "w") as outfile :
#    json.dump(TropiSeq_results, outfile)

883it [28:00,  1.90s/it]


> Open predictions :

In [76]:
import json 

path_pred = "/media/concha-eloko/Linux/PPT_clean/Seqbased_model/cluster_KLtypes.json"

TropiSeq_results = json.load(open(path_pred))

In [77]:
TropiSeq_results

{'cluster_0': {},
 'cluster_1': {},
 'cluster_2': {},
 'cluster_3': {'KL43': 0.5768679824908414},
 'cluster_4': {'KL64': 0.7243079795779298, 'KL54': 0.7225832357688411},
 'cluster_5': {'KL38': 0.5951490162713051},
 'cluster_6': {},
 'cluster_7': {},
 'cluster_8': {},
 'cluster_9': {},
 'cluster_10': {},
 'cluster_11': {'KL30': 0.5230767259793374},
 'cluster_12': {},
 'cluster_13': {},
 'cluster_14': {'KL14': 0.7441289066212997},
 'cluster_15': {},
 'cluster_16': {},
 'cluster_17': {},
 'cluster_18': {},
 'cluster_19': {'KL16': 0.9437004643985163},
 'cluster_20': {},
 'cluster_21': {},
 'cluster_22': {},
 'cluster_23': {},
 'cluster_24': {},
 'cluster_25': {},
 'cluster_26': {},
 'cluster_27': {},
 'cluster_28': {'KL123': 0.6101621438255168},
 'cluster_29': {},
 'cluster_30': {},
 'cluster_31': {},
 'cluster_32': {},
 'cluster_33': {},
 'cluster_34': {},
 'cluster_35': {},
 'cluster_36': {},
 'cluster_37': {'KL64': 0.8004726564524154, 'KL10': 0.5359320862005766},
 'cluster_38': {'KL151'

In [37]:
from collections import Counter
lengths = [len(TropiSeq_results[cluster]) for cluster in TropiSeq_results]


Counter({1: 382,
         0: 361,
         2: 76,
         3: 26,
         4: 11,
         5: 8,
         7: 5,
         6: 4,
         11: 3,
         8: 3,
         9: 2,
         12: 1,
         10: 1})

In [43]:
from itertools import combinations
pairs_list = []
associations_tropiseq = [set(kl for kl in TropiSeq_results[cluster]) for cluster in TropiSeq_results if len(TropiSeq_results[cluster])>0]


for s in associations_tropiseq:
    # Convert set to list for compatibility with combinations
    elements = list(s)
    pairs = combinations(elements, 2)
    pairs_list.extend(pairs)

# Convert pairs_list to a list of tuples
#pairs_list = list(pairs_list)




In [45]:
Counter(pairs_list)

Counter({('KL47', 'KL64'): 5,
         ('KL30', 'KL125'): 4,
         ('KL51', 'KL81'): 3,
         ('KL123', 'KL43'): 3,
         ('KL21', 'KL64'): 3,
         ('KL24', 'KL28'): 3,
         ('KL105', 'KL15'): 3,
         ('KL36', 'KL106'): 3,
         ('KL36', 'KL15'): 3,
         ('KL36', 'KL107'): 3,
         ('KL24', 'KL15'): 3,
         ('KL106', 'KL15'): 3,
         ('KL106', 'KL107'): 3,
         ('KL15', 'KL107'): 3,
         ('KL15', 'KL64'): 3,
         ('KL107', 'KL64'): 3,
         ('KL107', 'KL106'): 3,
         ('KL74', 'KL26'): 3,
         ('KL116', 'KL30'): 3,
         ('KL116', 'KL125'): 3,
         ('KL8', 'KL22'): 3,
         ('KL107', 'KL15'): 3,
         ('KL2', 'KL122'): 3,
         ('KL2', 'KL64'): 3,
         ('KL13', 'KL2'): 3,
         ('KL5', 'KL30'): 3,
         ('KL51', 'KL2'): 3,
         ('KL21', 'KL47'): 2,
         ('KL24', 'KL112'): 2,
         ('KL112', 'KL39'): 2,
         ('KL8', 'KL1'): 2,
         ('KL31', 'KL14'): 2,
         ('KL48', 'KL9'): 2,


In [3]:
1761.348/3 , 1418.833/3

(587.116, 472.94433333333336)