In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report , roc_auc_score, matthews_corrcoef , confusion_matrix
from scipy.stats import fisher_exact
import pandas as pd
import numpy as np
from collections import Counter
import warnings
warnings.filterwarnings("ignore")
import os 
import pandas as pd
import numpy as np
import json
from tqdm import tqdm 
from multiprocessing.pool import ThreadPool 

# Locations of the similarity matrices, one sub-directory per method.
# NOTE(review): absolute, machine-specific path; consider making it configurable.
path_work = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023"
path_task = f"{path_work}/Rafa_task"
path_embeddings = f"{path_task}/embeddings"
path_3D = f"{path_task}/3Dsequence_similarity"
path_aa_seq = f"{path_task}/sequence_similarity"

path_methods = [path_embeddings , path_3D , path_aa_seq]

# Collect every CSV found in the method directories.
# os.listdir() returns entries in arbitrary order, so the names are sorted to keep
# positional look-ups such as path_dfs[5] stable from one run to the next.
path_dfs = []
for method in path_methods :
    for file in sorted(os.listdir(method)) :
        if file.endswith(".csv") :
            path_dfs.append(f"{method}/{file}")

# Maps the tool tag parsed out of a CSV file name to the method name used in the
# report file names.
index_dico = {"cdhit" : "seq_similarity" , "foldseek" : "3D_similarity" , "embeddings" : "embeddings_similarity"}

# prophage ID -> KL types; opened with a context manager so the handle is closed.
with open(f"{path_task}/dico_prophage_ID.KLtypes.json") as kl_types_file :
    dico_KL_type = json.load(kl_types_file)

# Labels to model: the first entry recorded for each prophage (empty entries are skipped).
# NOTE(review): presumably each value is a list of KL types; only its first element
# is used here while the labelling code tests membership in the whole value - confirm.
KL_types = {values[0] for values in dico_KL_type.values() if values}


def make_df_type(path_df , label , kl_types_by_prophage = None , method_names = None) :
    """Load one similarity matrix and attach a binary target column for ``label``.

    Parameters
    ----------
    path_df : str
        Path of a tab-separated file whose first column is the row index. The file
        name carries the method tag (second ``_``-separated token, after its first
        ``.``) and the threshold (last ``_``-separated token, before ``.csv``).
    label : str
        KL type to predict.
    kl_types_by_prophage : dict, optional
        Mapping row ID -> KL types. Defaults to the module-level ``dico_KL_type``.
    method_names : dict, optional
        Mapping method tag -> method name. Defaults to the module-level ``index_dico``.

    Returns
    -------
    tuple
        ``(df, method, threshold, support)``: ``df`` is the matrix with an extra
        ``Labels`` column (1 when ``label`` is among the row's KL types, else 0);
        ``support`` is the number of mapped IDs that carry ``label`` and whose
        feature values sum to more than zero.

    Raises
    ------
    KeyError
        If the method tag parsed from the file name is not in ``method_names``.
    """
    if kl_types_by_prophage is None :
        kl_types_by_prophage = dico_KL_type
    if method_names is None :
        method_names = index_dico

    df = pd.read_csv(path_df , sep = "\t" , header = 0 , index_col = 0)
    file_name = path_df.split("/")[-1]
    method = method_names[file_name.split("_")[1].split(".")[1]]
    threshold = file_name.split("_")[-1].split(".csv")[0]

    df["Labels"] = df.index.to_series().map(lambda x : 1 if label in kl_types_by_prophage.get(x, []) else 0)

    # The support is computed on the features only, never on the target column.
    X = df.drop(["Labels"] , axis = 1)
    # Built once: membership tests against a set instead of a fresh list per entry.
    indexed_ids = set(X.index)
    support = 0
    for prophage , kltypes in kl_types_by_prophage.items() :
        if prophage in indexed_ids and label in kltypes :
            # .loc[[...]] keeps a frame even if the ID occurs several times in the index.
            if X.loc[[prophage]].sum().sum() > 0 :
                support += 1
    return df , method , threshold , support


def fit_rf_model_random_search(df, n_iter=1, random_state=42):
    """Tune a random forest by randomized search and score it on a held-out split.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus a binary ``Labels`` column used as the target.
    n_iter : int, default 1
        Number of parameter settings sampled by the randomized search.
    random_state : int, default 42
        Seed shared by the train/test split, the search and the forest so that a
        re-run gives the same numbers.

    Returns
    -------
    tuple
        ``(tpr, tnr, mcc, f1, n_Dpo_clusters)`` where ``n_Dpo_clusters`` is the
        number of feature columns. The four metrics are ``None`` when the test
        split does not contain both classes, since TPR or TNR is then undefined.
    """
    X = df.drop(["Labels"] , axis = 1)
    n_Dpo_clusters = len(X.columns)
    X_train, X_test, y_train, y_test = train_test_split(
        X, df["Labels"], test_size=0.2, random_state=random_state)
    param_grid = {
            'bootstrap': [True, False],
            'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
            # 'auto' (an alias of 'sqrt' for classifiers) was removed from
            # scikit-learn; sampling it makes the fits fail on current versions.
            'max_features': ['sqrt', 'log2'],
            'min_samples_leaf': [1, 2, 4],
            'min_samples_split': [2, 5, 10],
            'n_estimators': [200, 400, 600, 800]
        }
    rf = RandomForestClassifier(random_state=random_state)
    # NOTE(review): n_jobs=-1 here combined with a 20-thread pool in the caller
    # can oversubscribe the CPU - consider lowering one of the two.
    rf_random = RandomizedSearchCV(estimator=rf, param_distributions=param_grid,
                                   n_iter=n_iter, cv=4, verbose=2,
                                   random_state=random_state, n_jobs=-1)
    rf_random.fit(X_train, y_train)
    # predict() uses the best estimator refitted on the whole training split.
    predictions = rf_random.predict(X_test)

    # labels=[0, 1] forces a 2x2 matrix even when a class is missing, so the
    # four-way unpacking below cannot fail.
    tn, fp, fn, tp = confusion_matrix(y_test, predictions, labels=[0, 1]).ravel()
    if (tp + fn) == 0 or (tn + fp) == 0 :
        # No positives (or no negatives) in the test split: report "no prediction"
        # instead of a NaN produced by a 0/0 division.
        return None , None , None , None , n_Dpo_clusters

    report = classification_report(y_test, predictions, output_dict=True)
    tpr = tp / (tp + fn)
    tnr = tn / (tn + fp)
    mcc = matthews_corrcoef(y_test, predictions)
    return tpr , tnr , mcc , report['1']['f1-score'] , n_Dpo_clusters

    
def write_line(method , label , support , threshold , n_clusters , tpr , tnr , f1_score , mcc , out_dir = None) :
    """Append the metrics of one (method, label, threshold) run to the method report.

    The report is ``{out_dir}/{method}_RF_FullReport.corrected.tsv``. One row is
    written per metric (TPR, TNR, F1_score, MCC), each prefixed with
    ``label, support, threshold, n_clusters``. When ``f1_score`` is ``None`` a
    single "No prediction in that instance" row is written instead.

    Parameters
    ----------
    method, label, support, threshold, n_clusters
        Values identifying the run; written verbatim into the row prefix.
    tpr, tnr, f1_score, mcc
        Metric values, or ``None`` when no prediction could be scored.
    out_dir : str, optional
        Directory of the report file. Defaults to the module-level ``path_task``.
    """
    target_dir = path_task if out_dir is None else out_dir
    prefix = f"{label}\t{support}\t{threshold}\t{n_clusters}"
    if f1_score is not None :
        metrics = {"TPR" : tpr , "TNR" : tnr , "F1_score" : f1_score , "MCC" : mcc}
        rows = [f"{prefix}\t{metric}\t{value}\n" for metric , value in metrics.items()]
    else :
        rows = [f"{prefix}\tNo prediction in that instance\n"]
    # All rows of one record go out in a single write call, which keeps them
    # together when several pool threads append to the same report.
    with open(f"{target_dir}/{method}_RF_FullReport.corrected.tsv", "a") as outfile :
        outfile.write("".join(rows))
        

def final_function(path_df) :
    """Evaluate every KL type on one similarity matrix and log the outcome.

    For each entry of the module-level ``KL_types`` the matrix at ``path_df`` is
    labelled with ``make_df_type``, a model is fitted and scored with
    ``fit_rf_model_random_search``, and the metrics are appended to the report
    through ``write_line``.
    """
    for kl_label in KL_types :
        labelled = make_df_type(path_df , kl_label)
        frame , method_name , cutoff , n_support = labelled
        scores = fit_rf_model_random_search(frame)
        sensitivity , specificity , mcc_value , f1_value , cluster_count = scores
        write_line(
            method_name ,
            kl_label ,
            n_support ,
            cutoff ,
            cluster_count ,
            sensitivity ,
            specificity ,
            f1_value ,
            mcc_value ,
        )
        
        
# Full run: one worker thread per similarity matrix, each looping over all KL types.
# NOTE(review): every worker starts a randomized search with n_jobs=-1, so 20
# threads can oversubscribe the CPU - lower one of the two if the machine thrashes.
if __name__ == '__main__':
    with ThreadPool(20) as p:
        p.map(final_function, path_dfs)


# Single-matrix sanity check for "KL64".
# make_df_type returns four values (df, method, threshold, support); unpacking
# only three raised ValueError. The former argument-less write_line() call raised
# TypeError and is dropped: the pool run above already wrote the report rows.
df , method , threshold , support = make_df_type(path_dfs[5] , "KL64")
# Last expression of the cell: displays (tpr, tnr, mcc, f1, n_Dpo_clusters).
fit_rf_model_random_search(df)

In [None]:
def fit_rf_model_random_search(label):
    """Fit and score a random forest for ``label`` on every matrix in ``paths_df``.

    For each file a row ``label, precision, recall, f1-score, support`` (class
    ``1`` of the classification report) is appended to
    ``{path_project}/{dico_names[file]}.KL_report.tsv``. If anything fails for a
    file, a ``label<TAB>Failed`` row is appended instead and the error is printed.

    NOTE(review): this definition reuses the name of the DataFrame-based
    ``fit_rf_model_random_search(df)`` from the first cell; once this cell has
    run, ``final_function`` calls this version instead - consider renaming.
    NOTE(review): ``paths_df``, ``path_project`` and ``dico_names`` are not defined
    in the visible cells (``paths_df`` may be meant to be ``path_dfs``) - confirm.
    """
    for path_df in paths_df :
        file = path_df.split("/")[-1]
        # Resolved up front so a missing name mapping fails before any model is trained.
        report_path = f"{path_project}/{dico_names[file]}.KL_report.tsv"
        try :
            df = pd.read_csv(f"{path_df}", sep = "\t", index_col = 0)
            df["Labels"] = df.index.to_series().map(lambda x : 1 if label in dico_KL_type.get(x, []) else 0)

            X = df.drop(["Labels"] , axis = 1)
            n_iter=10
            X_train, X_test, y_train, y_test = train_test_split(X, df["Labels"], test_size=0.2, random_state=42)
            param_grid = {
                    'bootstrap': [True, False],
                    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
                    # 'auto' (an alias of 'sqrt' for classifiers) was removed from
                    # scikit-learn; candidates using it fail to fit on current versions.
                    'max_features': ['sqrt', 'log2'],
                    'min_samples_leaf': [1, 2, 4],
                    'min_samples_split': [2, 5, 10],
                    'n_estimators': [200, 400, 600, 800]
                }
            # Seeded so that repeated runs give the same report.
            rf = RandomForestClassifier(random_state=42)
            rf_random = RandomizedSearchCV(estimator=rf, param_distributions=param_grid,
                                           n_iter=n_iter, cv=4, verbose=2, random_state=42, n_jobs=-1)
            rf_random.fit(X_train, y_train)
            # predict() uses the best estimator refitted on the whole training split.
            predictions = rf_random.predict(X_test)
            report = classification_report(y_test, predictions, output_dict=True)
            # report['1'] is absent when class 1 occurs in neither y_test nor the
            # predictions; that KeyError is reported as "Failed" below.
            row = f"{label}\t{report['1']['precision']}\t{report['1']['recall']}\t{report['1']['f1-score']}\t{report['1']['support']}\n"
        except Exception as e :
            # Keep the best-effort loop, but surface the reason instead of hiding it.
            print(f"{file} / {label} failed: {type(e).__name__}: {e}")
            row = f"{label}\tFailed\n"
        with open(report_path, "a") as outfile :
            outfile.write(row)

In [None]:

# Support for "KL64": number of mapped prophages that carry the label and have at
# least one non-zero feature. Expects `df` to be the labelled frame returned by
# make_df_type(..., "KL64").
# The result of drop() has to be kept: DataFrame.drop returns a new frame, and
# summing the original rows would include the Labels column itself, so every
# KL64 row would count as supported.
kl64_features = df.drop(["Labels"], axis = 1)
indexed_prophages = set(kl64_features.index)

support = 0
for prophage , kltypes in dico_KL_type.items() :
    if prophage in indexed_prophages and "KL64" in kltypes :
        if kl64_features.loc[[prophage]].sum().sum() > 0 :
            support += 1

support
            

In [None]:
# Scratch cell: class-weighted cross-entropy loss for a PyTorch classifier.
# Imported here because nothing else in the notebook needs torch or class_weight.
# NOTE(review): `y_train` and `device` are not defined at notebook level in the
# visible cells - define them before running this cell.
import torch
from sklearn.utils import class_weight

# compute class weights
# `classes` and `y` must be passed by keyword: they are keyword-only in current
# scikit-learn releases and the positional form raises TypeError.
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
# convert class weights to a tensor
weights = torch.tensor(class_weights, dtype=torch.float)
# move to the target device
weights = weights.to(device)
# use in the loss function
criterion = torch.nn.CrossEntropyLoss(weight=weights)
