In [None]:
import yaml
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

from src.models.classification import Classification
from src.models.model_selection import grid_search, feature_selector
from src.models.utils import get_run

In [None]:
# Load the run configuration from config.yml using PyYAML's safe loader.
with open('config.yml', 'r') as config_stream:
    config = yaml.safe_load(config_stream)
# Drop the (already closed) handle so it does not linger in the notebook namespace.
del config_stream

In [None]:
# Read the dataset from the CSV path named in the config and preview it.
df = pd.read_csv(config['data_loader']['path'])
display(df.head())

# Features: every column except the first and the last. Target: the 'Class' column.
# NOTE(review): this presumably relies on 'Class' being the last column — confirm
# against the dataset schema.
features = df.iloc[:, 1:-1]
target = df['Class']

# Shuffled train/test split; the fixed random_state keeps the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=config['model_selection']['test_set_size'],
    random_state=123,
    shuffle=True,
)

In [None]:
# Run the project's grid search on the training split. The candidate algorithms,
# cross-validator and scoring metric are all read from config['model_selection'].
model_selection_cfg = config['model_selection']
grid_search(
    X=X_train,
    y=y_train,
    hyper_params=model_selection_cfg['algorithms'],
    cv=model_selection_cfg['cross_validator'],
    scoring_metric=model_selection_cfg['scoring_metric'],
)

In [None]:
# Fetch the algorithm and hyper-parameters to carry forward from the
# 'model_evaluation' experiment, asking get_run to order by the configured
# scoring metric.
# NOTE(review): the result of get_run(num=2) is unpacked into exactly two names
# (algorithm, hyper-params) — confirm against get_run what `num` controls.
best_algorithm, best_hyper_params = get_run(
    experiment_names=['model_evaluation'],
    order_by_metric=True,
    metric_name=config['model_selection']['scoring_metric'],
    num=2,
)

# TODO: greedy feature selection is currently disabled. It is kept here as real
# comments (not as a bare string literal, which is an evaluated no-op statement)
# so it can be re-enabled by uncommenting.
# best_features = feature_selector(
#     X=X_train,
#     y=y_train,
#     algorithm=best_algorithm,
#     algorithm_params=best_hyper_params,
#     tol=config['model_selection']['tolerance'],
#     cv=config['model_selection']['cross_validator'],
#     scoring_metric=config['model_selection']['scoring_metric'],
# )

# One item per line; sep='\n' avoids the stray trailing space that the default
# sep=' ' inserted before an embedded '\n' in the second argument.
print(
    f'algorithm: {best_algorithm}',
    f'parameters: {best_hyper_params}',
    # f'features: {best_features}',
    sep='\n',
)

In [None]:
# Train the selected algorithm on a fixed subset of columns (positions 3, 6, 8)
# of the training data, then evaluate on the same columns of the test set.
feature_positions = [3, 6, 8]
X_train_subset = X_train.iloc[:, feature_positions]
X_test_subset = X_test.iloc[:, feature_positions]

clf = Classification(algorithm=best_algorithm, **best_hyper_params)
clf.fit(X=X_train_subset, y=y_train)

# Predicted target values for the test set.
y_pred = clf.predict(X_test_subset)
# Keep only the last column of what clf.score returns.
# NOTE(review): presumably per-class scores with the positive class last — confirm
# against Classification.score.
y_score = clf.score(X_test_subset)[:, -1]

In [None]:
from src.visuals.boundary import plot_boundary

# Pass the fitted classifier and the test-set columns at positions 3, 6, 8
# (the same positions used for fitting) to the project's boundary plot helper.
plot_boundary(
    X=X_test.iloc[:, [3, 6, 8]],
    y=y_test,
    clf=clf,
    azim=50,
)

In [None]:
#pd.DataFrame({'score': y_score, 'label': y_pred}).groupby(by=['label']).describe()
#print(clf.model.decision_path(X_test[best_features[:2]].iloc[:10,:]))

In [None]:
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_curve, auc, RocCurveDisplay,
    precision_recall_curve, PrecisionRecallDisplay, average_precision_score
)

# Headline test-set metrics computed from the predicted labels, rounded to 5 decimals.
print(
    f"""Accuracy : {round(accuracy_score(y_test, y_pred), 5)}
Precision: {round(precision_score(y_test, y_pred), 5)}
Recall   : {round(recall_score(y_test, y_pred), 5)}
F1-Score : {round(f1_score(y_test, y_pred), 5)}"""
)

# Side-by-side figure: ROC curve on the left, precision-recall curve on the right.
fig, ax = plt.subplots(1, 2, figsize=(14, 6))
roc_ax, pr_ax = ax

# --- ROC panel -------------------------------------------------------------
fpr, tpr, _ = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
roc_ax.plot(fpr, tpr, color="blue", lw=2, label=f"ROC Curve (AUC = {roc_auc:.3f})")
# Diagonal reference line from (0, 0) to (1, 1).
roc_ax.plot([0, 1], [0, 1], color="gray", linestyle="--", lw=2, label="Random Guessing")
roc_ax.set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="ROC Curve",
)
roc_ax.legend(loc="lower right")
roc_ax.grid(True)

# --- Precision-recall panel ------------------------------------------------
precision, recall, _ = precision_recall_curve(y_test, y_score)
ap_score = average_precision_score(y_test, y_score)
pr_ax.plot(recall, precision, color="green", lw=2, label=f"PR Curve (AP = {ap_score:.3f})")
pr_ax.set(
    xlabel="Recall",
    ylabel="Precision",
    title="Precision-Recall Curve",
)
pr_ax.legend(loc="lower left")
pr_ax.grid(True)

plt.tight_layout()
plt.show()