# Hyperparameter tunning 

In [None]:
from collections import Counter
import os

from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from sklearn.pipeline import Pipeline

import pandas as pd
import numpy as np
import random
from collections import Counter
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import seaborn as sea
import logging
logging.basicConfig(filename="debug_genic.log", level=logging.INFO)

SEED = 42
np.random.seed(SEED)
random.seed(SEED)

plant_dir = os.environ.get("PLANT_DIR")
if not plant_dir:
    raise EnvironmentError("PLANT_DIR environment variable is not set")

OUTPUT_DIR_NAME = os.path.join(plant_dir, \
                                "experimental_setup",\
                                "models", \
                                "models_genic_m_esculenta_PCA")

DATASET_NAME = os.path.join(plant_dir, \
                            "experimental_setup", \
                            "datasets", \
                            "dataset_m_esculenta", \
                            "final", \
                            "dataset_m_esculenta_1500_50.csv")


os.makedirs(OUTPUT_DIR_NAME, exist_ok=True)

def import_data_everything(dataset=DATASET_NAME):
    data = pd.read_csv(dataset)
    logging.info(f"Read CSV: {data.shape}")

    data = data.sample(frac=1, random_state=SEED).reset_index(drop=True)  # Shuffle
    logging.info(f"After shuffle: {data.shape}")

    data = data.drop(columns=["species"])
    data = data.dropna()
    logging.info(f"After dropna: {data.shape}")

    print("Original size:", len(data))
    logging.info(f"Original size: {len(data)}")
    
    return data
    
def import_split_scale_data(dataset):
    le = LabelEncoder()

    dataset = dataset.sample(frac=1, random_state=SEED).reset_index(drop=True)

    feature_cols = [c for c in dataset.columns if c != "label"]
    dataset = dataset.drop_duplicates(subset=feature_cols, keep="first")

    dataset["label"] = le.fit_transform(dataset["label"])

    X = dataset.drop(columns=["label"]).astype(np.float32)
    y = dataset["label"].astype(np.int32).values

    features_names = X.columns.tolist()

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    # downsample stratified
    sss = StratifiedShuffleSplit(n_splits=1, train_size=0.6, random_state=SEED)
    idx_keep, _ = next(sss.split(X_train, y_train))

    X_train, y_train = X_train.iloc[idx_keep].reset_index(drop=True), y_train[idx_keep]
    logging.info(f"After 50% stratified downsample -> X_train: {X_train.shape}, y_train: {Counter(y_train)}")

    return X_train, X_test, y_train, y_test, le, features_names


def undersample_data(X_train, y_train):
    rs = RandomUnderSampler(random_state=42)
    X_train, y_train = rs.fit_resample(X_train, y_train)

    print('Resampled dataset shape %s' % Counter(y_train))
    logging.info('Resampled dataset shape %s' % Counter(y_train))

    return X_train, y_train

def oversample_data(X_train, y_train):
    sm = SMOTE(random_state=42)
    X_train, y_train = sm.fit_resample(X_train, y_train)

    print('Resampled dataset shape %s' % Counter(y_train))
    logging.info('Resampled dataset shape %s' % Counter(y_train))

    return X_train, y_train

In [2]:
from concurrent.futures import ThreadPoolExecutor
from sklearn.model_selection import LeaveOneGroupOut, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import label_binarize
import joblib

def tune_and_evaluate(name, model, params, sampling, X_train, y_train, X_test, y_test, le, y_test_bin, feature_names):
    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("pca", PCA(n_components=0.95, random_state=SEED)),
        ("clf", model)
    ])

    grid = RandomizedSearchCV(pipe, params, n_iter=10, cv=3, n_jobs=4, random_state=SEED)
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_

    print(f"Best params: {grid.best_params_}")

    y_pred = best_model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    y_pred_bin = label_binarize(y_pred, classes=list(range(len(le.classes_))))

    auc_roc = roc_auc_score(y_test_bin, y_pred_bin, average="macro", multi_class="ovr")
    auc_pr = average_precision_score(y_test_bin, y_pred_bin, average="macro")

    result = {
        "Model": name,
        "Precision": report["weighted avg"]["precision"],
        "Recall": report["weighted avg"]["recall"],
        "F1-score": report["weighted avg"]["f1-score"],
        "AUC-ROC": auc_roc,
        "AUC-PRC": auc_pr
    }

    joblib.dump({
        "pipeline": best_model,
        "label_encoder": le,
        "results": result,
        "feature_names": feature_names
    }, f"{OUTPUT_DIR_NAME}/model_{sampling}_{name.replace(' ', '_')}_50.pkl")

    return result, best_model

    
def import_results(name, sampling):
    with open(f"{OUTPUT_DIR_NAME}/model_{sampling}_{name}_50.pkl", "rb") as f:
        RF = joblib.load(f)
    return RF

In [3]:
data = import_data_everything()
X_train, X_test, y_train, y_test, le, features_names = import_split_scale_data(data)
y_test_bin = label_binarize(y_test, classes=list(range(len(le.classes_))))

Original size: 15252300


In [None]:

tasks = [
    ("Random_Forest", RandomForestClassifier(random_state=42, n_jobs=4), {
        'clf__n_estimators': [100, 200, 300],
        'clf__max_depth': [None, 10, 20, 30],
        'clf__min_samples_split': [2, 5, 10],
        'clf__max_features': ['sqrt', 'log2']
    }),
    ("XGBoost", XGBClassifier(eval_metric='mlogloss', random_state=42, n_jobs=4), {
        'clf__n_estimators': [100, 200],
        'clf__max_depth': [3, 5, 7],
        'clf__learning_rate': [0.01, 0.1, 0.2],
        'clf__subsample': [0.7, 0.8, 1.0],
        'clf__colsample_bytree': [0.7, 0.8, 1.0]
    })
]

' with ThreadPoolExecutor(max_workers=8) as executor:\n    futures = [\n        executor.submit(\n            tune_and_evaluate, \n            name, model, params, "oversampling", \n            X_train_oversampled, y_train_oversampled, X_test, y_test, le, y_test_bin, \n            scaler, features_names\n        ) for name, model, params in tasks\n    ]\n    results_models = [future.result() for future in futures] '

## Undersampling

In [8]:
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
import joblib
from sklearn.preprocessing import label_binarize
    
X_train_undersampled, y_train_undersampled = undersample_data(X_train, y_train)


Resampled dataset shape Counter({0: 2794871, 1: 2794871})


In [9]:
with ThreadPoolExecutor(max_workers=8) as executor:
    futures = [
        executor.submit(
            tune_and_evaluate, 
            name, model, params, "undersampling", 
            X_train_undersampled, y_train_undersampled, X_test, y_test, le, y_test_bin, features_names
        ) for name, model, params in tasks
    ]
    results_models = [future.result() for future in futures]

results = []





Best params: {'clf__subsample': 0.7, 'clf__n_estimators': 200, 'clf__max_depth': 7, 'clf__learning_rate': 0.2, 'clf__colsample_bytree': 0.7}


Best params: {'clf__n_estimators': 100, 'clf__min_samples_split': 2, 'clf__max_features': 'sqrt', 'clf__max_depth': None}


In [10]:
RF = import_results("Random_Forest", "undersampling")
XGB = import_results("XGBoost", "undersampling")

results.append(RF["results"])
results.append(XGB["results"])

In [11]:
df_results = pd.DataFrame(results)

print(df_results)

           Model  Precision    Recall  F1-score   AUC-ROC   AUC-PRC
0  Random_Forest   0.889428  0.888355  0.887675  0.881855  0.855639
1        XGBoost   0.860110  0.859485  0.858633  0.852534  0.825805
