In [None]:
import yaml
import pandas as pd
from sklearn.model_selection import train_test_split

from src.data.prepare_data import prepare_data
from src.data.utils import resample_data
from src.models.utils import train_splits
from src.models.model_selection import GridSearch, ClassificationThreshold
from src.models.classification import Classification
from src.visuals.pr_roc_curve import plot_roc_pr_curves
from src.models.evaluation import Evaluation

In [None]:
# Read the run configuration from config.yml (resolved relative to the kernel's
# working directory). The resulting `config` object is indexed by later cells
# with the keys 'data_loader', 'train_test_split' and 'optimization'.
#
# The file is opened with an explicit UTF-8 encoding: without it, open() falls
# back to the platform's locale encoding, so a config containing non-ASCII
# characters could decode differently (or fail) on another machine.
with open('config.yml', 'r', encoding='utf-8') as config_file:
    # safe_load is PyYAML's shorthand for load(..., Loader=yaml.SafeLoader)
    config = yaml.safe_load(config_file)
# remove the (already closed) file handle from the notebook namespace
del config_file

In [None]:
# Read the CSV named in the config, pass it through prepare_data, and preview it.
raw_path = config['data_loader']['path']
df = prepare_data(df=pd.read_csv(raw_path))
display(df.head())

# Resample to deal with class imbalance.
# NOTE(review): pos_share=0.01 is presumably the desired share of positive
# rows after resampling -- confirm against resample_data.
df_sampled = resample_data(df=df, pos_share=0.01)

# Compare the normalised 'target' frequencies before and after resampling.
target_share_before = df['target'].value_counts(normalize=True)
target_share_after = df_sampled['target'].value_counts(normalize=True)
print(target_share_before, target_share_after)

In [None]:
# Split the resampled data into train and test sets.
#
# The feature matrix is built by dropping the 'target' column by name rather
# than slicing off the last column by position. The labels are already selected
# by name, so this keeps X and y consistent: if 'target' is not the final
# column, positional slicing would leave the label inside X and silently drop
# a real feature. When 'target' is the last column the result is unchanged.
X_train, X_test, y_train, y_test = train_test_split(
    df_sampled.drop(columns='target'),
    df_sampled['target'],
    test_size=config['train_test_split']['test_size'],
    random_state=123,  # fixed seed so the split is reproducible
    shuffle=True,
    stratify=df_sampled['target'],  # stratify the split on the label column
)

# check class distributions of the two label sets
print(
    y_train.value_counts(normalize=True),
    y_test.value_counts(normalize=True),
)

In [None]:
# Break the training data into several subsets, driven by the
# 'train_test_split' section of the config.
train = train_splits(X_train, y_train, config['train_test_split'])

# Cell output: normalised value frequencies of the last column of every subset
# (used as the class-distribution check for each split).
[
    train[split_key].iloc[:, -1].value_counts(normalize=True)
    for split_key in train.keys()
]

In [None]:
# Search for the best algorithm and hyperparameters on training subset 1:
# every column but the last is passed as X, the last column as y.
grid_search = GridSearch(config=config['optimization'])
search_split = train[1]
grid_search.fit(
    X=search_split.iloc[:, :-1],
    y=search_split.iloc[:, -1],
)

# Print the best score stored for each key of the search results.
for algo_name in grid_search.results.keys():
    print(algo_name, '-', grid_search.results[algo_name]['best_score'])

In [None]:
# Build a classifier from the grid-search winner and its best hyperparameters,
# then fit it on training subset 1 (last column = label).
clf = Classification(
    algorithm=grid_search.best_algorithm,
    **grid_search.best_hyperparams,
)
fit_split = train[1]
clf.fit(X=fit_split.iloc[:, :-1], y=fit_split.iloc[:, -1])

# Tune the classification threshold on training subset 2, which the classifier
# above was not fitted on.
tuned_clf = ClassificationThreshold(config=config['optimization'])
tune_split = train[2]
threshold = tuned_clf.fit(
    clf=clf,
    X=tune_split.iloc[:, :-1],
    y=tune_split.iloc[:, -1],
)
print(threshold)

# Plot ROC / PR curves for the fitted model and the threshold-tuned model,
# using the same subset the threshold was tuned on.
plot_roc_pr_curves(
    clf.model,
    tuned_clf.model,
    X_train=tune_split.iloc[:, :-1],
    y_train=tune_split.iloc[:, -1],
)

In [None]:
# Rebuild the selected algorithm with the same hyperparameters and fit it on
# the complete training set (this rebinds `clf`).
clf = Classification(
    algorithm=grid_search.best_algorithm,
    **grid_search.best_hyperparams,
)
clf.fit(X=X_train, y=y_train)

# Evaluate on the held-out test set with the previously tuned threshold.
# NOTE(review): the name `eval` shadows Python's builtin eval() for the rest of
# the kernel session; it is left unchanged here in case other cells refer to it.
eval = Evaluation(clf=clf, threshold=threshold)
eval.fit(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)