In [2]:
import pandas as pd
import numpy as np
import copy

import src.utils as utils

# Load Config File

In [3]:
CONFIG_DATA = utils.config_load()
CONFIG_DATA

{'raw_dataset_path': 'data/raw/data.csv',
 'data_set_path': 'data/output/data.pkl',
 'input_set_path': 'data/output/input.pkl',
 'output_set_path': 'data/output/output.pkl',
 'input_columns_path': 'data/output/input_columns.pkl',
 'train_set_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'valid_set_path': ['data/output/X_valid.pkl', 'data/output/y_valid.pkl'],
 'test_set_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'output_column': 'Class',
 'seed': 42,
 'test_size': 0.2,
 'standardizer_path': 'data/output/standardizer.pkl',
 'preprocessor_path': 'data/output/preprocessor.pkl',
 'train_clean_path': ['data/output/X_train_clean.pkl',
  'data/output/y_train_clean.pkl'],
 'valid_clean_path': ['data/output/X_valid_clean.pkl',
  'data/output/y_valid_clean.pkl'],
 'test_clean_path': ['data/output/X_test_clean.pkl',
  'data/output/y_test_clean.pkl'],
 'list_of_model_path': 'log/list_of_model.pkl',
 'list_of_param_path': 'log/list_of_param.pkl',
 'list_of_tuned

# Create Model

Model of Choice
- KNN
- Logistic Regression
- Random Forest
- XGBoost

Define params

In [4]:
def create_model_param():
    """Create the model objects"""
    knn_params = {
        'n_neighbors': [50, 100, 200],
    }
    
    rf_params = {
        "n_estimators" : [i for i in range(50, 151, 30)],
        "min_samples_split" : [2, 4, 6, 8],
        "criterion" : ["gini", "entropy", "log_loss"]
    }

    lgr_params = {
        # 'penalty': ['l1', 'l2'],
        'C': [0.01, 0.1],
        'max_iter': [100, 300, 500]
    }

    xgb_params = {
        'n_estimators': [5, 10, 25, 50]
    }

    # Create model params
    list_of_param = {
        'KNeighborsClassifier': knn_params,
        'RandomForestClassifier': rf_params,
        'LogisticRegression': lgr_params,
        'XGBClassifier': xgb_params
    }

    return list_of_param


Define Models

In [5]:
# !pip install xgboost

In [6]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [7]:
def create_model_object():
    """Create the model objects"""
    print("Creating model objects")

    # Create model objects
    knn = KNeighborsClassifier()
    rf = RandomForestClassifier()
    lgr = LogisticRegression(solver='sag') # 
    xgb = XGBClassifier()

    # Create list of model
    list_of_model = [
        {'model_name': knn.__class__.__name__, 'model_object': knn},
        {'model_name': rf.__class__.__name__, 'model_object': rf},
        {'model_name': lgr.__class__.__name__, 'model_object': lgr},
        {'model_name': xgb.__class__.__name__, 'model_object': xgb}
    ]

    return list_of_model


Do the cross validation

In [8]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score

In [9]:
list_of_param = create_model_param()
list_of_model = create_model_object()

Creating model objects


In [10]:
list_of_param

{'KNeighborsClassifier': {'n_neighbors': [50, 100, 200]},
 'RandomForestClassifier': {'n_estimators': [50, 80, 110, 140],
  'min_samples_split': [2, 4, 6, 8],
  'criterion': ['gini', 'entropy', 'log_loss']},
 'LogisticRegression': {'C': [0.01, 0.1], 'max_iter': [100, 300, 500]},
 'XGBClassifier': {'n_estimators': [5, 10, 25, 50]}}

In [11]:
list_of_model

[{'model_name': 'KNeighborsClassifier',
  'model_object': KNeighborsClassifier()},
 {'model_name': 'RandomForestClassifier',
  'model_object': RandomForestClassifier()},
 {'model_name': 'LogisticRegression',
  'model_object': LogisticRegression(solver='sag')},
 {'model_name': 'XGBClassifier',
  'model_object': XGBClassifier(base_score=None, booster=None, callbacks=None,
                colsample_bylevel=None, colsample_bynode=None,
                colsample_bytree=None, early_stopping_rounds=None,
                enable_categorical=False, eval_metric=None, feature_types=None,
                gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
                interaction_constraints=None, learning_rate=None, max_bin=None,
                max_cat_threshold=None, max_cat_to_onehot=None,
                max_delta_step=None, max_depth=None, max_leaves=None,
                min_child_weight=None, missing=nan, monotone_constraints=None,
                n_estimators=100, n_jobs=No

In [12]:
def train_model(return_file=True):
    """Function to get the best model"""
    # Load dataset
    X_train = utils.pickle_load(CONFIG_DATA['train_clean_path'][0])
    y_train = utils.pickle_load(CONFIG_DATA['train_clean_path'][1])
    X_valid = utils.pickle_load(CONFIG_DATA['valid_clean_path'][0])
    y_valid = utils.pickle_load(CONFIG_DATA['valid_clean_path'][1])
    
    # Create list of params & models
    list_of_param = create_model_param()
    list_of_model = create_model_object()

    # List of trained model
    list_of_tuned_model = {}

    # Train model
    for base_model in list_of_model:
        # Current condition
        model_name = base_model['model_name']
        model_obj = copy.deepcopy(base_model['model_object'])
        model_param = list_of_param[model_name]

        # Debug message
        print('Training model :', model_name)

        # Create model object
        model = RandomizedSearchCV(estimator = model_obj,
                                   param_distributions = model_param,
                                   n_iter=5,
                                   cv = 5,
                                   random_state = 123,
                                   n_jobs=1,
                                   verbose=10,
                                   scoring = 'roc_auc')
        
        # Train model
        model.fit(X_train, y_train)

        # Predict
        y_pred_proba_train = model.predict_proba(X_train)[:, 1]
        y_pred_proba_valid = model.predict_proba(X_valid)[:, 1]
        
        # Get score
        train_score = roc_auc_score(y_train, y_pred_proba_train)
        valid_score = roc_auc_score(y_valid, y_pred_proba_valid)

        # Append
        list_of_tuned_model[model_name] = {
            'model': model,
            'train_auc': train_score,
            'valid_auc': valid_score,
            'best_params': model.best_params_
        }

        print("Done training")
        print("")

    # Dump data
    utils.pickle_dump(list_of_param, CONFIG_DATA['list_of_param_path'])
    utils.pickle_dump(list_of_model, CONFIG_DATA['list_of_model_path'])
    utils.pickle_dump(list_of_tuned_model, CONFIG_DATA['list_of_tuned_model_path'])

    if return_file:
        return list_of_param, list_of_model, list_of_tuned_model    


In [13]:
list_of_param, list_of_model, list_of_tuned_model = train_model()

Creating model objects
Training model : KNeighborsClassifier
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5; 1/3] START n_neighbors=50..............................................
[CV 1/5; 1/3] END ...............n_neighbors=50;, score=0.968 total time=   0.0s
[CV 2/5; 1/3] START n_neighbors=50..............................................
[CV 2/5; 1/3] END ...............n_neighbors=50;, score=0.989 total time=   0.0s
[CV 3/5; 1/3] START n_neighbors=50..............................................
[CV 3/5; 1/3] END ...............n_neighbors=50;, score=0.967 total time=   0.0s
[CV 4/5; 1/3] START n_neighbors=50..............................................
[CV 4/5; 1/3] END ...............n_neighbors=50;, score=0.995 total time=   0.0s
[CV 5/5; 1/3] START n_neighbors=50..............................................
[CV 5/5; 1/3] END ...............n_neighbors=50;, score=0.974 total time=   0.0s
[CV 1/5; 2/3] START n_neighbors=100..................................



[CV 5/5; 2/3] END ..............n_neighbors=100;, score=0.974 total time=   0.0s
[CV 1/5; 3/3] START n_neighbors=200.............................................
[CV 1/5; 3/3] END ..............n_neighbors=200;, score=0.967 total time=   0.0s
[CV 2/5; 3/3] START n_neighbors=200.............................................
[CV 2/5; 3/3] END ..............n_neighbors=200;, score=0.991 total time=   0.0s
[CV 3/5; 3/3] START n_neighbors=200.............................................
[CV 3/5; 3/3] END ..............n_neighbors=200;, score=0.971 total time=   0.0s
[CV 4/5; 3/3] START n_neighbors=200.............................................
[CV 4/5; 3/3] END ..............n_neighbors=200;, score=0.997 total time=   0.0s
[CV 5/5; 3/3] START n_neighbors=200.............................................
[CV 5/5; 3/3] END ..............n_neighbors=200;, score=0.967 total time=   0.0s


Done training

Training model : RandomForestClassifier
Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5; 1/5] START criterion=entropy, min_samples_split=2, n_estimators=110....
[CV 1/5; 1/5] END criterion=entropy, min_samples_split=2, n_estimators=110;, score=0.979 total time=   0.1s
[CV 2/5; 1/5] START criterion=entropy, min_samples_split=2, n_estimators=110....
[CV 2/5; 1/5] END criterion=entropy, min_samples_split=2, n_estimators=110;, score=0.979 total time=   0.2s
[CV 3/5; 1/5] START criterion=entropy, min_samples_split=2, n_estimators=110....
[CV 3/5; 1/5] END criterion=entropy, min_samples_split=2, n_estimators=110;, score=0.982 total time=   0.1s
[CV 4/5; 1/5] START criterion=entropy, min_samples_split=2, n_estimators=110....
[CV 4/5; 1/5] END criterion=entropy, min_samples_split=2, n_estimators=110;, score=0.999 total time=   0.1s
[CV 5/5; 1/5] START criterion=entropy, min_samples_split=2, n_estimators=110....
[CV 5/5; 1/5] END criterion=entropy, min_samples



[CV 1/5; 2/5] END ..........C=0.1, max_iter=100;, score=0.965 total time=   0.0s
[CV 2/5; 2/5] START C=0.1, max_iter=100.........................................
[CV 2/5; 2/5] END ..........C=0.1, max_iter=100;, score=0.990 total time=   0.0s
[CV 3/5; 2/5] START C=0.1, max_iter=100.........................................
[CV 3/5; 2/5] END ..........C=0.1, max_iter=100;, score=0.982 total time=   0.0s
[CV 4/5; 2/5] START C=0.1, max_iter=100.........................................
[CV 4/5; 2/5] END ..........C=0.1, max_iter=100;, score=0.995 total time=   0.0s
[CV 5/5; 2/5] START C=0.1, max_iter=100.........................................
[CV 5/5; 2/5] END ..........C=0.1, max_iter=100;, score=0.972 total time=   0.0s
[CV 1/5; 3/5] START C=0.1, max_iter=300.........................................
[CV 1/5; 3/5] END ..........C=0.1, max_iter=300;, score=0.971 total time=   0.0s
[CV 2/5; 3/5] START C=0.1, max_iter=300.........................................
[CV 2/5; 3/5] END ..........



[CV 4/5; 3/5] END ..........C=0.1, max_iter=300;, score=0.997 total time=   0.0s
[CV 5/5; 3/5] START C=0.1, max_iter=300.........................................
[CV 5/5; 3/5] END ..........C=0.1, max_iter=300;, score=0.972 total time=   0.0s
[CV 1/5; 4/5] START C=0.01, max_iter=100........................................
[CV 1/5; 4/5] END .........C=0.01, max_iter=100;, score=0.968 total time=   0.0s
[CV 2/5; 4/5] START C=0.01, max_iter=100........................................
[CV 2/5; 4/5] END .........C=0.01, max_iter=100;, score=0.988 total time=   0.0s
[CV 3/5; 4/5] START C=0.01, max_iter=100........................................
[CV 3/5; 4/5] END .........C=0.01, max_iter=100;, score=0.984 total time=   0.0s
[CV 4/5; 4/5] START C=0.01, max_iter=100........................................
[CV 4/5; 4/5] END .........C=0.01, max_iter=100;, score=0.998 total time=   0.0s
[CV 5/5; 4/5] START C=0.01, max_iter=100........................................
[CV 5/5; 4/5] END .........C



[CV 1/5; 2/4] END ..............n_estimators=10;, score=0.967 total time=   0.0s
[CV 2/5; 2/4] START n_estimators=10.............................................
[CV 2/5; 2/4] END ..............n_estimators=10;, score=0.981 total time=   0.0s
[CV 3/5; 2/4] START n_estimators=10.............................................
[CV 3/5; 2/4] END ..............n_estimators=10;, score=0.973 total time=   0.0s
[CV 4/5; 2/4] START n_estimators=10.............................................
[CV 4/5; 2/4] END ..............n_estimators=10;, score=0.994 total time=   0.0s
[CV 5/5; 2/4] START n_estimators=10.............................................
[CV 5/5; 2/4] END ..............n_estimators=10;, score=0.976 total time=   0.0s
[CV 1/5; 3/4] START n_estimators=25.............................................
[CV 1/5; 3/4] END ..............n_estimators=25;, score=0.974 total time=   0.0s
[CV 2/5; 3/4] START n_estimators=25.............................................
[CV 2/5; 3/4] END ..........

In [14]:
list_of_tuned_model

{'KNeighborsClassifier': {'model': RandomizedSearchCV(cv=5, estimator=KNeighborsClassifier(), n_iter=5, n_jobs=1,
                     param_distributions={'n_neighbors': [50, 100, 200]},
                     random_state=123, scoring='roc_auc', verbose=10),
  'train_auc': 0.9822038567493112,
  'valid_auc': 0.9722329620371388,
  'best_params': {'n_neighbors': 200}},
 'RandomForestClassifier': {'model': RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_iter=5, n_jobs=1,
                     param_distributions={'criterion': ['gini', 'entropy',
                                                        'log_loss'],
                                          'min_samples_split': [2, 4, 6, 8],
                                          'n_estimators': [50, 80, 110, 140]},
                     random_state=123, scoring='roc_auc', verbose=10),
  'train_auc': 0.999880624426079,
  'valid_auc': 0.981115056037798,
  'best_params': {'n_estimators': 140,
   'min_samples_split': 8,
   'crit

Get the best model

In [15]:
def get_best_model(return_file=True):
    """Function to get the best model"""
    # Load tuned model
    list_of_tuned_model = utils.pickle_load(CONFIG_DATA['list_of_tuned_model_path'])

    # Get the best model
    best_model_name = None
    best_model = None
    best_performance = -99999
    best_model_param = None

    for model_name, model in list_of_tuned_model.items():
        if model['valid_auc'] > best_performance:
            best_model_name = model_name
            best_model = model['model']
            best_performance = model['valid_auc']
            best_model_param = model['best_params']

    # Dump the best model
    utils.pickle_dump(best_model, CONFIG_DATA['best_model_path'])

    # Print
    print('=============================================')
    print('Best model        :', best_model_name)
    print('Metric score      :', best_performance)
    print('Best model params :', best_model_param)
    print('=============================================')

    if return_file:
        return best_model

In [16]:
best_model = get_best_model()

Best model        : RandomForestClassifier
Metric score      : 0.981115056037798
Best model params : {'n_estimators': 140, 'min_samples_split': 8, 'criterion': 'entropy'}


## Prediction on test data

In [17]:
X_test = utils.pickle_load(CONFIG_DATA['test_clean_path'][0])
y_test = utils.pickle_load(CONFIG_DATA['test_clean_path'][1])

In [18]:
# Predict
y_test_proba = best_model.predict_proba(X_test)[:, 1]
y_pred = best_model.predict(X_test)

# Get score
score = roc_auc_score(y_test, y_test_proba)

In [19]:
score

0.9785603746281856

In [20]:
y_pred

array([1, 0, 0, ..., 0, 0, 0], dtype=int64)