In [None]:
import yaml
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    accuracy_score
    , precision_score, recall_score, f1_score
    , roc_curve, auc, RocCurveDisplay
)
from sklearn.model_selection import train_test_split

from src.models.classification import Classification
from src.models.model_selection import grid_search, feature_selector
from src.models.utils import get_run
from src.visuals.boundary import plot_boundary

In [None]:
# Load the notebook settings from the YAML config file.
# yaml.safe_load(stream) is the shorthand for yaml.load(stream, Loader=yaml.SafeLoader).
with open('config.yml', mode='r') as config_file:
    config = yaml.safe_load(config_file)
# Remove the (already closed) handle name so it does not linger in the
# notebook namespace.
del config_file

In [None]:
# Read the dataset from the CSV path stored under data_loader.path in the config.
df = pd.read_csv(filepath_or_buffer=config['data_loader']['path'])

# Last expression of the cell: preview the first rows as rich output.
df.head()

In [None]:
# Hold out a test set. Features are selected positionally (every column except
# the first and the last); the target is the 'Class' column.
# NOTE(review): the 1:-1 slice presumes 'Class' is the last column and that the
# first column is not a feature — confirm against the CSV schema.
# NOTE(review): the split is shuffled with a fixed seed but not stratified; if
# 'Class' is imbalanced, consider passing stratify= — confirm.
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, 1:-1],
    df['Class'],
    test_size=config['model_selection']['test_set_size'],
    shuffle=True,
    random_state=123,
)

In [None]:
# Hyper-parameter search space, keyed by algorithm name. Each inner dict maps a
# parameter name to the list of candidate values passed on to grid_search.
hyper_params = {
    'logistic_regression': {
        'tol': [1e-4, 5e-5],
        'max_iter': [100],
    },
    # Support vector machine is currently left out of the search:
    # 'support_vector_machine': {
    #     'tol': [0.01, 0.001],
    # },
    'decision_tree': {
        'max_depth': [20],
        'min_samples_leaf': [2, 5],
        'max_leaf_nodes': [10],
    },
    'random_forest': {
        'n_estimators': [10, 20],
        'max_depth': [20],
        'min_samples_leaf': [2, 5],
        'max_leaf_nodes': [10],
    },
    'xgboost': {
        'n_estimators': [10, 20],
        'max_depth': [20],
    },
}

In [None]:
# Run the project's grid_search on the training split using the hyper_params
# search space; the cross-validator and scoring metric are read from the
# model_selection section of the config.
grid_search(
    X=X_train,
    y=y_train,
    hyper_params=hyper_params,
    cv=config['model_selection']['cross_validator'],
    scoring_metric=config['model_selection']['scoring_metric'],
)

In [None]:
# Look up the best algorithm and its hyper-parameters via get_run for the
# 'model_evaluation' experiment, ordered by the configured scoring metric.
# NOTE(review): get_run is called with num=2 and its result is unpacked into
# exactly two names — confirm this matches get_run's return contract.
best_algorithm, best_hyper_params = get_run(
    experiment_names=['model_evaluation'],
    order_by_metric=True,
    metric_name=config['model_selection']['scoring_metric'],
    num=2,
)

# Greedy feature selection is currently disabled. It is kept as real comments
# (not as a bare string literal, which is an evaluated no-op expression) so it
# is unmistakably not executed.
# best_features = feature_selector(
#     X=X_train,
#     y=y_train,
#     algorithm=best_algorithm,
#     algorithm_params=best_hyper_params,
#     tol=config['model_selection']['tolerance'],
#     cv=config['model_selection']['cross_validator'],
#     scoring_metric=config['model_selection']['scoring_metric'],
# )

# sep='\n' puts each item on its own line without the stray trailing space
# that the default ' ' separator left before an embedded '\n'.
print(
    f'algorithm: {best_algorithm}',
    f'parameters: {best_hyper_params}',
    # f'features: {best_features}',
    sep='\n',
)

In [None]:
# Fit the selected algorithm on two columns of the training data, then predict
# and score the same two columns of the test set.
# NOTE(review): the column positions are hard-coded; presumably they are the
# most important features from an earlier feature-selection run — confirm.
top_feature_idx = [5, 7]

clf = Classification(algorithm=best_algorithm, **best_hyper_params)
clf.fit(X=X_train.iloc[:, top_feature_idx], y=y_train)

# Predicted target values for the test set.
y_pred = clf.predict(X_test.iloc[:, top_feature_idx])

# Keep only the last column of the 2-D score output.
# NOTE(review): presumably the positive-class score — confirm against
# Classification.score.
y_score = clf.score(X_test.iloc[:, top_feature_idx])[:, -1]

In [None]:
# Call the project's plot_boundary helper for the fitted classifier on the same
# two test-set columns (positions 5 and 7) the classifier was fit on, with the
# individual points switched off (plot_points=False).
# plot_boundary is imported with the other dependencies in the imports cell.
plot_boundary(
    X=X_test.iloc[:, [5, 7]],
    y=y_test,
    clf=clf,
    plot_points=False,
)

In [None]:
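# Disabled scratch diagnostics, kept for reference:
#   1. summary statistics of the scores, grouped by predicted label
#   2. decision path of the fitted model for the first 10 test rows
#      (needs `best_features`, which is only assigned when feature selection is enabled)
#pd.DataFrame({'score': y_score, 'label': y_pred}).groupby(by=['label']).describe()
#print(clf.model.decision_path(X_test[best_features[:2]].iloc[:10,:]))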

In [None]:
# Evaluate the test-set predictions: headline classification metrics first,
# then the ROC curve built from the per-row scores.
# The sklearn.metrics names are imported with the other dependencies in the
# imports cell.
print(
    f"""Accuracy : {round(accuracy_score(y_test, y_pred), 5)}
Precision: {round(precision_score(y_test, y_pred), 5)}
Recall   : {round(recall_score(y_test, y_pred), 5)}
F1-Score : {round(f1_score(y_test, y_pred), 5)}"""
)

# drop_intermediate=False asks roc_curve not to drop suboptimal thresholds.
fpr, tpr, thresholds = roc_curve(
    y_true=y_test,
    y_score=y_score,
    drop_intermediate=False,
)
roc_auc = auc(x=fpr, y=tpr)

# Named roc_display (not display) so IPython's built-in display() function is
# not shadowed for the rest of the kernel session.
roc_display = RocCurveDisplay(
    fpr=fpr,
    tpr=tpr,
    roc_auc=roc_auc,
    estimator_name=best_algorithm,
)
roc_display.plot()
plt.show()