In [6]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import shap
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import KFold
import optuna
import optuna.visualization as vis
import joblib
from xgboost import XGBClassifier
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score, auc, precision_score, recall_score, precision_recall_curve, classification_report
# Seed NumPy's legacy global random number generator so that code drawing
# from np.random is repeatable across runs of this notebook.
np.random.seed(123)

In [7]:
def save_classifier(study, y_test, y_test_proba, y_test_final, thr_opt, file_path):
    """Write a plain-text summary of a tuning study and its test metrics.

    The report is written to ``file_path`` (overwriting any existing file) and
    contains, in order: the study's best parameters and best value, the
    decision threshold, the test ROC-AUC, the confusion matrix and the
    classification report.

    Parameters
    ----------
    study : object exposing ``best_params`` and ``best_value``
        The finished hyper-parameter study.
    y_test : array-like
        True labels of the test set.
    y_test_proba : array-like
        Scores for the test set, passed to ``roc_auc_score``.
    y_test_final : array-like
        Thresholded predictions for the test set, passed to
        ``confusion_matrix`` and ``classification_report``.
    thr_opt : float
        Threshold that was applied to obtain ``y_test_final``.
    file_path : str
        Destination of the text report.

    Returns
    -------
    None
    """
    heavy_rule = "=" * 40 + "\n"
    light_rule = "-" * 40 + "\n"

    with open(file_path, 'w') as report:
        emit = report.write

        # --- study section ---
        emit(heavy_rule)
        emit("Study Summary\n")
        emit(light_rule)
        emit(f"Best Parameters: {study.best_params}\n")
        emit(f"Best AUC-ROC Value: {study.best_value:.4f}\n\n")

        # --- test-set section ---
        emit("Best Model Results\n")
        emit(light_rule)
        emit("Test Metrics:\n")
        emit(f"Optimal Threshold: {thr_opt:.4f}\n")
        test_auc = roc_auc_score(y_test, y_test_proba)
        emit(f"ROC-AUC Score: {test_auc:.4f}\n")
        emit("Confusion Matrix:\n")
        cm = confusion_matrix(y_test, y_test_final)
        emit(f"{cm}\n\n")
        emit("Classification Report:\n")
        emit(classification_report(y_test, y_test_final))
        emit("\n" + heavy_rule)

In [8]:
# Numeric code selecting which dataset to process. It is interpolated into the
# input paths (data/lab_<code>.csv, data/epi_<code>.csv) and into the names of
# the files this notebook writes.
# NOTE(review): presumably a state code, per the "Loading data for the
# provided state" header — confirm.
sg_num  = 35

## Loading data for the provided state

In [12]:
# Build the input paths from the selected code and load both datasets.
# NOTE(review): "lab" / "epi" presumably distinguish laboratory-confirmed from
# clinically/epidemiologically classified cases — confirm with the data source.
name_lab = 'data/lab_'+str(sg_num)+'.csv'
name_epi = 'data/epi_'+str(sg_num)+'.csv'
df_lab = pd.read_csv(name_lab)
df_epi = pd.read_csv(name_epi)

# ID_MN_RESI is removed so it is not used as a model feature.
df_lab = df_lab.drop(columns = ['ID_MN_RESI'])
df_epi = df_epi.drop(columns = ['ID_MN_RESI'])

# Class counts per dataset (CHIK == 0 vs. CHIK == 1), taken before
# de-duplication. Counted with a vectorised sum instead of iterating the Series.
denv_lab = int((df_lab['CHIK'] == 0).sum())
chik_lab = int((df_lab['CHIK'] == 1).sum())
denv_epi = int((df_epi['CHIK'] == 0).sum())
chik_epi = int((df_epi['CHIK'] == 1).sum())

# One-hot encode fx_etaria, then force df_epi to carry exactly the columns of
# df_lab (categories missing from df_epi become all-zero columns).
df_lab = pd.get_dummies(df_lab, columns = ['fx_etaria'])
df_epi = pd.get_dummies(df_epi, columns = ['fx_etaria'])
df_epi = df_epi.reindex(columns = df_lab.columns, fill_value=0)
df_lab = df_lab.drop_duplicates()

# Features / target for each dataset.
X_lab = df_lab.drop(columns = ['CHIK'])
y_lab = df_lab['CHIK']
X_epi = df_epi.drop(columns = ['CHIK'])
y_epi = df_epi['CHIK']

# 70 % train, 30 % hold-out; the hold-out is then halved into disjoint test
# and validation sets (15 % each). The second split must start from the
# hold-out, not from X_lab: re-splitting the full data would make the test and
# validation sets overlap the training set and leak training rows into the
# reported metrics.
X_train, X_holdout, y_train, y_holdout = train_test_split(X_lab, y_lab, stratify=y_lab, test_size = 0.3, random_state = 0)
X_test, X_val, y_test, y_val = train_test_split(X_holdout, y_holdout, stratify=y_holdout, test_size = 0.5, random_state = 0)

## Optimizing a CatBoost classifier for this state

In [14]:
def objective_ct(trial):
    """Optuna objective: mean 5-fold cross-validated ROC-AUC of a CatBoost model.

    Hyper-parameters (iterations, depth, learning_rate, l2_leaf_reg) are drawn
    from ``trial``. The model is fitted on each training fold of the
    module-level ``X_train`` / ``y_train`` and scored on the matching held-out
    fold; the test and validation splits are not touched here.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Average ROC-AUC over the five held-out folds.
    """
    ct_params = dict(
        iterations = trial.suggest_int("iterations", low = 100, high = 2000),
        depth = trial.suggest_int("depth", 3, 10),
        learning_rate = trial.suggest_float('learning_rate', 1e-3, 1, log = True),
        l2_leaf_reg = trial.suggest_int("l2_leaf_reg", low = 1, high = 10),
        verbose = False,
    )
    splitter = KFold(n_splits = 5, shuffle = True, random_state = 123)
    model = CatBoostClassifier(**ct_params)

    fold_aucs = []
    # Folds are taken from the training split only.
    for fit_idx, holdout_idx in splitter.split(X_train):
        model.fit(X_train.iloc[fit_idx], y_train.iloc[fit_idx])
        holdout_scores = model.predict_proba(X_train.iloc[holdout_idx])[:,1]
        fold_aucs.append(roc_auc_score(y_train.iloc[holdout_idx], holdout_scores))
    return np.mean(np.array(fold_aucs))

# Hyper-parameter search: maximise the cross-validated ROC-AUC returned by
# objective_ct. The sampler is seeded explicitly because np.random.seed does
# not control Optuna's sampler; without a seed every run proposes different
# trials and the reported best parameters are not reproducible.
study = optuna.create_study(direction = 'maximize', sampler = optuna.samplers.TPESampler(seed = 123))
study.optimize(objective_ct, n_trials = 10)
fig = vis.plot_optimization_history(study)
fig.show()

# Refit on the whole training split with the best parameters and persist it.
grid3 = CatBoostClassifier(**study.best_params, verbose = False)
grid3.fit(X_train, y_train)
joblib.dump(grid3, str(sg_num)+'_ct.pkl')

# ROC curve on the validation split, saved to disk and shown inline.
y_val_pred = grid3.predict(X_val)
y_val_proba = grid3.predict_proba(X_val)[:,1]
fpr3, tpr3, thresholds = roc_curve(y_val, y_val_proba, pos_label = 1)
plt.plot(fpr3, tpr3, '-o', linewidth=2, label = 'CatBoost')
plt.plot([0,1], [0,1], 'k--' )
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend()
file_path = str(sg_num)+'_plot.png'
plt.savefig(file_path, dpi=300, bbox_inches='tight')
plt.show()

# Decision threshold: first ROC point whose TPR is closest to 1, i.e. the
# highest threshold that still reaches the maximum sensitivity on validation.
# NOTE(review): this ignores the false-positive rate entirely. If a balanced
# operating point was intended (e.g. argmax(tpr3 - fpr3)), change it here —
# confirm the intent.
n_arg = np.argmin(abs(tpr3-1))
thr_opt = thresholds[n_arg]

# Apply the validation-derived threshold to the test split (float 0/1 labels).
y_test_proba = grid3.predict_proba(X_test)[:,1]
y_test_final = np.where(y_test_proba >= thr_opt, 1.0, 0.0)

# Write the text report for this model.
file_path = str(sg_num)+'_ct.txt'
save_classifier(study, y_test, y_test_proba, y_test_final, thr_opt, file_path)

[I 2024-12-16 21:33:05,906] A new study created in memory with name: no-name-d4000ae3-930c-4c6b-87d4-d24ac6365a55
[I 2024-12-16 21:37:46,167] Trial 0 finished with value: 0.9461146878960184 and parameters: {'iterations': 1162, 'depth': 10, 'learning_rate': 0.001522084329172324, 'l2_leaf_reg': 3}. Best is trial 0 with value: 0.9461146878960184.
[I 2024-12-16 21:40:48,353] Trial 1 finished with value: 0.9507414545568625 and parameters: {'iterations': 1426, 'depth': 7, 'learning_rate': 0.003042268442375378, 'l2_leaf_reg': 6}. Best is trial 1 with value: 0.9507414545568625.
[I 2024-12-16 21:43:14,865] Trial 2 finished with value: 0.9576257884810445 and parameters: {'iterations': 1258, 'depth': 6, 'learning_rate': 0.01705333183111323, 'l2_leaf_reg': 10}. Best is trial 2 with value: 0.9576257884810445.
[W 2024-12-16 21:45:34,575] Trial 3 failed with parameters: {'iterations': 1699, 'depth': 10, 'learning_rate': 0.02201427949620894, 'l2_leaf_reg': 9} because of the following error: KeyboardIn

KeyboardInterrupt: 