In [None]:
import yaml
import pandas as pd
from sklearn.model_selection import train_test_split

from src.data.prepare_data import prepare_data
from src.data.utils import resample_data
from src.models.utils import train_splits, imbalanced_sampling
from src.models.model_selection import GridSearch
from src.models.classification import Classification
from src.models.feature_selection import FeatureSelection
from src.models.evaluation import Evaluation

In [None]:
# Load the pipeline settings from config.yml into `config`.
# safe_load parses with yaml.SafeLoader, i.e. plain YAML tags only.
with open('config.yml') as file:
    config = yaml.safe_load(file)
del file  # remove the (already closed) file handle from the notebook namespace

In [None]:
# Read the CSV located at config['data_loader']['path'] and pass it through
# the project's preparation step; preview the first rows of the result.
df = prepare_data(df=pd.read_csv(config['data_loader']['path']))
display(df.head())

# Resample for imbalanced sets.
# NOTE(review): pos_share=0.01 presumably sets the share of positive labels in
# the resampled frame -- confirm against src.data.utils.resample_data.
df_sampled = resample_data(df=df, pos_share=0.01)

# Relative class frequencies of 'label' before and after resampling.
print(*(
    frame['label'].value_counts(normalize=True)
    for frame in (df, df_sampled)
))

In [None]:
# Split the resampled data into train and test sets.
# The target column is removed from the feature frame by name rather than by
# position, so 'label' cannot leak into X (and no real feature is dropped)
# if it ever stops being the last column.
# stratify preserves the class proportions of 'label' in both partitions;
# the fixed random_state makes the shuffled split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_sampled.drop(columns='label'),
    df_sampled['label'],
    test_size=config['train_test_split']['test_size'],
    random_state=123,
    shuffle=True,
    stratify=df_sampled['label'],
)

# Relative class frequencies of the train and test targets.
print(
    y_train.value_counts(normalize=True),
    y_test.value_counts(normalize=True),
)

In [None]:
# Resample the training partition only, with method='over'
# (X_test / y_test are not passed to the sampler).
X_train_rs, y_train_rs = imbalanced_sampling(
    method='over',
    X_train=X_train,
    y_train=y_train,
)

# Relative class frequencies of the training target before vs. after resampling.
print(
    y_train.value_counts(normalize=True),
    y_train_rs.value_counts(normalize=True),
)

In [None]:
# Partition the resampled training data into several subsets, driven by
# config['train_test_split'], then show the relative value frequencies of the
# last column of every subset (used as the target column in later cells).
train = train_splits(X_train_rs, y_train_rs, config['train_test_split'])
[
    train[split_id].iloc[:, -1].value_counts(normalize=True)
    for split_id in train.keys()
]

In [None]:
# Search for the best algorithm and hyperparameters on split train[1]:
# its last column is passed as the target, all preceding columns as features.
grid_search = GridSearch(config=config['optimization'])
grid_search.fit(X=train[1].iloc[:, :-1], y=train[1].iloc[:, -1])

# Print the best score stored under each key of the search results.
for name, result in grid_search.results.items():
    print(name, '-', result['best_score'])

In [None]:
# Greedy feature selection on split train[2], driven by a classifier built
# from the algorithm and hyperparameters that the grid search reported as best.
clf = Classification(
    algorithm=grid_search.best_algorithm,
    **grid_search.best_hyperparams,
)

feature_selection = FeatureSelection(
    X=train[2].iloc[:, :-1],   # every column but the last
    y=train[2].iloc[:, -1],    # last column
)
feats = feature_selection.wrapper(clf=clf, config=config['optimization'])

# Display the selection result; it is used below to index feature columns.
feats

In [None]:
# Build a fresh classifier with the best algorithm / hyperparameters and fit it
# on the selected feature columns of the full resampled training data.
clf = Classification(
    algorithm=grid_search.best_algorithm,
    **grid_search.best_hyperparams,
)
clf.fit(X=X_train_rs[feats], y=y_train_rs)

# Test-set evaluation: the evaluator receives both the resampled training
# data and the test partition, restricted to the same feature columns.
# NOTE(review): the name `eval` shadows Python's builtin eval() for the rest of
# the kernel session; it is kept unchanged here because other cells may refer
# to it -- consider renaming it (e.g. `evaluation`) together with all its uses.
eval = Evaluation(clf=clf, threshold=0.5)
eval.fit(
    X_train=X_train_rs[feats],
    y_train=y_train_rs,
    X_test=X_test[feats],
    y_test=y_test,
)