Resources :<br>
https://machinelearningmastery.com/scikit-optimize-for-hyperparameter-tuning-in-machine-learning/<br>
https://towardsdatascience.com/bayesian-optimization-with-python-85c66df711ec<br>
https://towardsdatascience.com/hyperparameter-optimization-with-scikit-learn-scikit-opt-and-keras-f13367f3e796<br>
***
# Run a Random Forest for each KL type, after optimizing the clustering preference
***
### Process in 4 steps :
#### I. Clustering the Dpo domains based on the preference
#### II. Build the DF with the corresponding Dpo clusters
#### III. Run the RF modeling
#### IV. Optimization of the Parameters (RF parameters and preference)
***

> The data :


In [None]:
import os 
import pandas as pd
from tqdm import tqdm 

# Remote directories holding the project inputs :
path_project = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023"
path_similarity = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_similarity/phageboost/fastANI_20102022_out"
path_db = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/15122022_session"

# Dpo embeddings : header-less CSV, the first column is used as the index.
esm2_depo = pd.read_csv(
    f"{path_project}/Dpo.0805.embeddings.ultimate.csv",
    sep=",",
    header=None,
    index_col=0,
)

# Informative tables (tab-separated) :
family_df = pd.read_csv(
    f"{path_similarity}/clusters_99_80.extra_clean.2004.v2.tsv",
    header=0,
    sep="\t",
)
df_info = pd.read_csv(
    f"{path_db}/DF_Dpo.final.1005.tsv",
    header=0,
    sep="\t",
)
prophage_ktype_df = pd.read_csv(
    f"{path_project}/DF_optimization/prophage_K_types.ultimate.csv",
    sep="\t",
)

In [None]:
from sklearn.cluster import AffinityPropagation
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report
from scipy.stats import fisher_exact
import pandas as pd
import numpy as np
from collections import Counter


def build_dataframe(dico):
    """
    Build a binary presence/absence table from a {column : members} mapping.

    Parameters
    ----------
    dico : dict
        Maps a column label to the collection of row labels it contains
        (as called in this file : Dpo cluster name -> prophages carrying it).

    Returns
    -------
    pandas.DataFrame
        One row per distinct member found in the values of `dico`, one column
        per key of `dico` (in the dict's order). A cell is 1 when the row
        label belongs to that key's collection, 0 otherwise.
    """
    prophages = {prophage for members in dico.values() for prophage in members}
    # A set has no defined order and recent pandas versions reject it as an
    # index, so the row labels are turned into a list with a fixed order.
    try:
        prophages = sorted(prophages)
    except TypeError:
        # Labels of mixed, non-comparable types : keep them unsorted.
        prophages = list(prophages)
    columns = list(dico)
    matrix = [[1 if prophage in dico[column] else 0 for column in columns] for prophage in prophages]
    return pd.DataFrame(matrix, index=prophages, columns=columns)

def cluster_AFF_prop(pref) :
    """
    Cluster the Dpo embeddings and write the prophage x Dpo-cluster table.

    In 3 steps :
    I. The clustering (AffinityPropagation fitted on `esm2_depo`)
    II. Generate the Dico {cluster label : [Dpo, ...]} and dump it as JSON
    III. Make the DF (one row per prophage family, one column per Dpo cluster)

    It outputs dico_cluster.ultimate.pref_{pref}.json and
    prophage_Dpo.pref_{pref}.ultimate.csv in the rep tmp.

    As we plan to run the optimizer in parallel, some clusters may have already been built.
    So before clustering, we check that both outputs have not already been written.

    Parameters
    ----------
    pref : value passed as the AffinityPropagation `preference`; it is also
        embedded in the names of the two output files.

    Returns
    -------
    None

    Raises
    ------
    IndexError
        If a phage listed in `df_info` has no matching "{phage}.fasta" row in `family_df`.
    """
    # Local import : `json` is not among the imports of the first notebook cells.
    import json

    tmp_dir = f"{path_project}/DF_optimization/tmp"
    dico_path = f"{tmp_dir}/dico_cluster.ultimate.pref_{pref}.json"
    df_path = f"{tmp_dir}/prophage_Dpo.pref_{pref}.ultimate.csv"
    # The JSON is written before the CSV, so the JSON alone does not prove the
    # table exists : skip only when both outputs are already there.
    if os.path.isfile(dico_path) and os.path.isfile(df_path) :
        return
    # I
    af = AffinityPropagation(damping=0.95, preference=pref, random_state=123, max_iter=1000, verbose=True).fit(esm2_depo)
    # II
    dico_cluster = {}
    for dpo, cluster_label in zip(esm2_depo.index, af.labels_) :
        # int() : numpy integers are not accepted as JSON object keys.
        dico_cluster.setdefault(int(cluster_label), []).append(dpo)
    with open(dico_path, "w") as outfile :
        json.dump(dico_cluster, outfile)
    # III.
    dico_prophage_id = {}
    for Dpo_label, Dpo_list in tqdm(dico_cluster.items()) :
        prophages_carrying = set()
        for dpo in Dpo_list :
            phages = df_info.loc[df_info["index"] == dpo, "Phage"]
            for phage in phages.to_list() :
                # First matching row of family_df gives the prophage family id.
                family = family_df[family_df["prophage"] == f"{phage}.fasta"]["prophage_id"].values[0]
                prophages_carrying.add(family)
        dico_prophage_id[f"Dpo__{Dpo_label}"] = prophages_carrying
    df = build_dataframe(dico_prophage_id)
    df.sort_index(inplace=True, ascending=True)
    df.to_csv(df_path, sep="\t", header=True, index=True)
    
def fit_rf_model_random_search(label, pref=None, out_dir=None):
    """
    Tune a Random Forest for one K-type column and append its metrics to the results file.

    The prophage x Dpo-cluster table written for `pref` is read back, a
    RandomizedSearchCV (3 candidates, 3-fold CV) is fitted on 80% of the rows,
    and one tab-separated line is appended to RF_results.1605.tsv with :
    label, number of positive rows, F1, precision and recall of class '1' on
    the held-out 20%, then the count and names of the features whose Fisher
    exact test against the label gives p < 0.05.

    Nothing is fitted or written when `label` has no row equal to 1.

    Parameters
    ----------
    label : column of `prophage_ktype_df` used as the target.
    pref : preference whose table is loaded. When omitted, the module-level
        `pref` is used (the name this function read before `pref` became a parameter).
    out_dir : directory holding RF_results.1605.tsv. When omitted, the
        module-level `path_DF` is used.

    Returns
    -------
    None

    Raises
    ------
    NameError
        If `pref` / `out_dir` is omitted and the module-level fallback is not defined.
    """
    module_globals = globals()
    if pref is None:
        if "pref" not in module_globals:
            raise NameError("fit_rf_model_random_search: pass `pref` or define a module-level `pref`")
        pref = module_globals["pref"]
    prophage_dpo_df = pd.read_csv(f"{path_project}/DF_optimization/tmp/prophage_Dpo.pref_{pref}.ultimate.csv", sep="\t", header=0, index_col=0)
    label_series = prophage_ktype_df[label]
    n_infection = Counter(label_series).get(1, 0)
    if n_infection == 0:
        return
    if out_dir is None:
        # Resolved before the costly search so a missing output directory
        # name fails early instead of after the fit.
        if "path_DF" not in module_globals:
            raise NameError("fit_rf_model_random_search: pass `out_dir` or define a module-level `path_DF`")
        out_dir = module_globals["path_DF"]
    # NOTE(review): rows of the Dpo table and of prophage_ktype_df are paired
    # by position here - confirm both files list the prophages in the same order.
    X_train, X_test, y_train, y_test = train_test_split(prophage_dpo_df, label_series, test_size=0.2, random_state=42)
    param_grid = {
        'bootstrap': [True, False],
        'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
        # 'auto' was an alias of 'sqrt' for RandomForestClassifier and is no
        # longer accepted by recent scikit-learn, so only 'sqrt' is kept.
        'max_features': ['sqrt'],
        'min_samples_leaf': [1, 2, 4],
        'min_samples_split': [2, 5, 10],
        'n_estimators': [200, 400, 600, 800]
    }
    rf_random = RandomizedSearchCV(estimator=RandomForestClassifier(), param_distributions=param_grid,
                                   n_iter=3, cv=3, verbose=2, random_state=42, n_jobs=-1)
    rf_random.fit(X_train, y_train)
    predictions = rf_random.predict(X_test)
    report = classification_report(y_test, predictions, output_dict=True)
    # Extract relevant metrics
    positive_metrics = report.get('1')
    if positive_metrics is None:
        # Class 1 is absent from both the test labels and the predictions.
        print("no '1' entry in the classification report", label, n_infection)
        return
    precision, recall, f1_score = positive_metrics['precision'], positive_metrics['recall'], positive_metrics['f1-score']
    # Calculate p-values for features
    label_values = label_series.to_numpy()
    significant_features = []
    for feature in prophage_dpo_df.columns:
        # Plain arrays : rows are compared by position, the same pairing as
        # train_test_split, instead of being re-aligned on the two indexes.
        contingency_table = pd.crosstab(prophage_dpo_df[feature].to_numpy(), label_values)
        if contingency_table.shape != (2, 2):
            # fisher_exact only accepts 2x2 tables (e.g. a constant feature gives 1x2).
            continue
        _, p_value = fisher_exact(contingency_table)
        if p_value < 0.05:
            significant_features.append(feature)
    with open(f"{out_dir}/RF_results.1605.tsv", "a+") as outfile:
        outfile.write(f"{label}\t{n_infection}\t{f1_score}\t{precision}\t{recall}\t{len(significant_features)}\t{','.join(significant_features)}\n")


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from bayes_opt import BayesianOptimization, UtilityFunction
import warnings
warnings.filterwarnings("ignore")
# Prepare the data.
cancer = load_breast_cancer()
X = cancer["data"]
y = cancer["target"]
# Stratified split so both parts keep the class proportions of y.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    stratify = y,
                                                    random_state = 42)
# The scaler is fitted on the training part only, then applied to the test part.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Define the black box function to optimize.
def black_box_function(C):
    """
    Objective for the optimizer : ROC AUC on the test split of an SVC fitted with `C`.

    C : SVC hyper parameter to optimize for.
    Reads the module-level X_train_scaled, y_train, X_test_scaled and y_test.
    """
    classifier = SVC(C = C)
    classifier.fit(X_train_scaled, y_train)
    test_scores = classifier.decision_function(X_test_scaled)
    return roc_auc_score(y_test, test_scores)
# Set range of C to optimize for.
# bayes_opt requires this to be a dictionary.
pbounds = {"C": [0.1, 10]}
# Create a BayesianOptimization optimizer,
# and optimize the given black_box_function.
optimizer = BayesianOptimization(f = black_box_function,
                                 pbounds = pbounds, verbose = 2,
                                 random_state = 4)
optimizer.maximize(init_points = 5, n_iter = 10)
print("Best result: {}; f(x) = {}.".format(optimizer.max["params"], optimizer.max["target"]))

In [None]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# Search space for BayesSearchCV.
# NOTE(review): the parameter names match the Random Forest grid used earlier
# in this notebook - presumably meant for a RandomForestClassifier; confirm.
search_space = {"bootstrap": Categorical([True, False]), # values for bootstrap can be either True or False
        "max_depth": Integer(6, 20), # values of max_depth are integers from 6 to 20
        # 'auto' removed : recent scikit-learn no longer accepts it for
        # max_features (for classifiers it was an alias of 'sqrt').
        "max_features": Categorical(['sqrt','log2']),
        "min_samples_leaf": Integer(2, 10),
        "min_samples_split": Integer(2, 10),
        "n_estimators": Integer(100, 500)
    }
