In [None]:
import yaml
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

from src.models.model_selection import GridSearch, FeatureSelection
from src.models.classification import Classification

In [None]:
# Parse the YAML configuration; later cells read its 'data_loader' and
# 'model_selection' sections.
with open('config.yml', 'r') as config_file:
    config = yaml.safe_load(config_file)
# Drop the already-closed handle so it does not linger in the kernel namespace.
del config_file

In [None]:
# Read the dataset from the path given in the config and preview the first rows.
df = pd.read_csv(config["data_loader"]["path"])
display(df.head())

# Hold out a test set. The features are every column except the first and the
# last one; the target is the "Class" column.
# NOTE(review): this presumes "Class" is the last column, otherwise the target
# would also be present among the features -- confirm against the dataset.
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, 1:-1],
    df["Class"],
    test_size=config["model_selection"]["test_set_size"],
    random_state=123,  # fixed seed so the split is reproducible across runs
    shuffle=True,
)

In [None]:
# Run the project's grid search on the training split only, configured by the
# "model_selection" section of the config. Later cells read
# `grid_search.best_algorithm` and `grid_search.best_hyperparams` from it.
grid_search = GridSearch(model_selection=config["model_selection"])
grid_search.fit(X=X_train, y=y_train)

In [None]:
# Greedy feature selection, wrapped around a classifier built from the grid
# search's best algorithm and hyperparameters.
clf = Classification(
    algorithm=grid_search.best_algorithm,
    **grid_search.best_hyperparams,
)
feature_selection = FeatureSelection(X=X_train, y=y_train)
feature_selection.wrapper(clf=clf, model_selection=config["model_selection"])

# NOTE(review): nothing in the later cells shown here reads the outcome of this
# selection -- confirm whether the chosen features should be applied downstream.
# TODO: should feature selection be done before or after model selection?
# TODO: should it be trained on a different dataset?

In [None]:
# Row position that splits the training data: rows from this position onward
# are used to fit the classifier, while the rows before it are the ones the
# threshold-tuning cell fits on (it slices X_train/y_train at the same 100000).
THRESHOLD_SPLIT_ROW = 100_000

# Fail early with a clear message instead of fitting on an empty slice.
assert len(X_train) > THRESHOLD_SPLIT_ROW, (
    f"need more than {THRESHOLD_SPLIT_ROW} training rows to fit the classifier, "
    f"got {len(X_train)}"
)

# Build a fresh classifier from the grid search's best algorithm and
# hyperparameters and fit it on the tail of the training split.
# NOTE(review): every feature column is used here; the result of the feature
# selection cell is not applied -- confirm whether that is intended.
clf = Classification(
    algorithm=grid_search.best_algorithm,
    **grid_search.best_hyperparams,
)
clf.fit(
    X=X_train.iloc[THRESHOLD_SPLIT_ROW:, :],
    y=y_train.iloc[THRESHOLD_SPLIT_ROW:],
)

# Predict the target value for the test set.
y_pred = clf.predict(X_test)
# Keep only the last column of the score output.
# NOTE(review): presumably the positive-class score -- confirm against
# Classification.score.
y_score = clf.score(X_test)[:, -1]

In [None]:
from src.models.model_selection import ClassificationThreshold

# Fit the project's ClassificationThreshold wrapper around the already-fitted
# classifier, using the first 100000 training rows -- the rows that were left
# out when `clf` itself was fitted.
# NOTE(review): presumably this tunes the decision threshold -- confirm against
# ClassificationThreshold.
tuned_clf = ClassificationThreshold(model_selection=config["model_selection"])
tuned_clf.fit(
    clf=clf,
    X=X_train.iloc[:100000, :],
    y=y_train.iloc[:100000],
)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from src.visuals.pr_roc_curve import plot_roc_pr_curves

# Draw the ROC / precision-recall curves for the plain and the tuned model.
# NOTE(review): the curves are drawn from the full training split, which
# includes the rows `clf` was fitted on -- confirm this is intended.
plot_roc_pr_curves(
    clf.model,
    tuned_clf.model,
    X_train=X_train,
    y_train=y_train,
)

# Report test-set metrics for the untuned classifier's predictions, one metric
# per line, each rounded to 5 decimals.
print(
    "\n".join(
        f"{label}{round(metric(y_test, y_pred), 5)}"
        for label, metric in (
            ("Accuracy : ", accuracy_score),
            ("Precision: ", precision_score),
            ("Recall   : ", recall_score),
            ("F1-Score : ", f1_score),
        )
    )
)

In [None]:
from src.visuals.boundary import plot_boundary

# Plot the classifier's boundary over three test-set feature columns, picked by
# position (3, 6 and 8), with the test points overlaid.
# NOTE(review): the column positions are hard-coded -- confirm they are the
# features of interest.
plot_boundary(
    X=X_test.iloc[:, [3, 6, 8]],
    y=y_test,
    clf=clf,
    azim=50,
    plot_points=True,
)

In [None]:
#pd.DataFrame({'score': y_score, 'label': y_pred}).groupby(by=['label']).describe()
#print(clf.model.decision_path(X_test[best_features[:2]].iloc[:10,:]))

In [None]:
"""
https://medium.com/towards-data-science/tune-in-decision-threshold-optimization-with-scikit-learns-tunedthresholdclassifiercv-7de558a2cf58
https://scikit-learn.org/stable/auto_examples/model_selection/plot_cost_sensitive_learning.html#tunedthresholdclassifiercv-no-cv
https://scikit-learn.org/stable/modules/classification_threshold.html

overfitting

https://scikit-learn.org/stable/modules/learning_curve.html
"""