# Modelling v1

modelling use the latest exploration

# Goal

Get the model from the prepared data. Without oversampling. 

# Plan

Checklist what will be done on the notebook :

    [*] Get Data
    [ ] Modelling



In [1]:
import yaml
import pandas as pd
import matplotlib.pyplot as plt
import os
import joblib
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
import seaborn as sns
from statsmodels.graphics.mosaicplot import mosaic
import time

from scipy.stats import uniform,randint
import warnings
warnings.filterwarnings("ignore")

In [2]:
with open("../../config.yaml", "r") as f:
    config = yaml.load(f, Loader=yaml.FullLoader)

PATH_RAW = config['path']['raw']
PATH_INTERIM = config['path']['interim']
PATH_PROCESSED = config['path']['processed']

if os.name == 'posix' :
    PATH_MODEL = config['path']['artifact']+os.getcwd().split('/')[-1]+'/'
else:
    PATH_MODEL = config['path']['artifact']+os.getcwd().split('\\')[-1]+'/'

PATH_UDF = config['path']['udf']

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

## Get Data

In [4]:
raw_data = pd.read_csv(PATH_PROCESSED+'prep_iter2.csv')
col_cat_load = joblib.load(PATH_MODEL+'cat_names.pkl')
col_num_load = joblib.load(PATH_MODEL+'num_names.pkl')

In [5]:
X_tr = raw_data[raw_data.is_train == 1][col_num_load+col_cat_load]
Y_tr = raw_data[raw_data.is_train == 1]['target'].values

X_val = raw_data[raw_data.is_train == 0][col_num_load+col_cat_load]
Y_val = raw_data[raw_data.is_train == 0]['target'].values

In [6]:
X_tr.shape

(32665, 26)

In [7]:
X_val.shape

(5765, 26)

In [8]:
Y_val.shape

(5765,)

## Modelling

In [9]:
proportion_cls = pd.Series(Y_tr).value_counts(normalize = True)
scale_pos_w = np.ceil(proportion_cls[0]/proportion_cls[1])
cw = {0:1, 1:scale_pos_w}
cw
# scale_pos_weight for xgboost

{0: 1, 1: 9.0}

In [10]:
proportion_cls

0    0.884525
1    0.115475
dtype: float64

In [11]:
all_model = {'logreg' : LogisticRegression(class_weight=cw)
            ,'svm' : SVC(class_weight=cw, probability=False)
            ,'rf': RandomForestClassifier(class_weight=cw, random_state=234)
            ,'nb':GaussianNB(priors=[0.88,0.12])}

In [13]:
param_options = {'logreg' : dict(C=uniform(loc=0, scale=2)
                                 ,penalty=['l2', 'l1']
                                 ,solver=['liblinear', 'sag'])
                 ,'svm' : dict(C=uniform(loc=0, scale=2)
                              ,kernel=['linear', 'poly', 'rbf']
                              ,degree= randint(2, 5)
                              ,coef0=uniform(loc=0, scale=2))  
                 ,'rf' : dict(n_estimators=randint(300, 500)
                             ,max_depth=randint(20, 50)
                             ,min_samples_split=randint(5, 20)
                             ,min_samples_leaf=randint(5, 20)
                             ,min_weight_fraction_leaf=uniform(loc=0, scale=0.4)
                             ,min_impurity_decrease=uniform(loc=0, scale=0.3))
                 ,'nb' : dict(var_smoothing = uniform(loc=1e-8, scale=1e-2))
                }

 # min_weight_fraction_leaf must in [0, 0.5]

In [14]:
## scoring
## https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
best_clf = {}
best_scr = {}
sla_model = {}
for mdl in all_model.keys():
    L1 = time.time()
    clf = all_model[mdl]
    parm = param_options[mdl]
    if mdl == 'svm':
        RS = RandomizedSearchCV(clf, parm, n_iter = 25, cv = 5, scoring = 'roc_auc',n_jobs = 12)
    else:
        RS = RandomizedSearchCV(clf, parm, n_iter = 100, cv = 5, scoring = 'roc_auc',n_jobs = 10)
        
    RS.fit(X_tr,Y_tr)
    
    best_clf[mdl] = RS.best_estimator_
    best_scr[mdl] = RS.best_score_
    diff_time = round((time.time()-L1)/60, 2)
    
    joblib.dump(best_clf[mdl],PATH_MODEL+mdl+'.pkl')
    
    print(mdl, 'is done in ',diff_time, ' minutes')
    sla_model[mdl] = diff_time

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver sag supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.7/

logreg is done in  4.44  minutes


Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver sag supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.7/

svm is done in  20.87  minutes
rf is done in  3.18  minutes
nb is done in  0.07  minutes


In [15]:
sla_model

{'logreg': 4.44, 'svm': 20.87, 'rf': 3.18, 'nb': 0.07}

In [18]:
best_scr

{'logreg': 0.85417726397065,
 'svm': 0.8550122811738815,
 'rf': 0.8185848382454501,
 'nb': 0.7911350961212115}

In [19]:
best_clf

{'logreg': LogisticRegression(C=0.175999547315985, class_weight={0: 1, 1: 8.0},
                    penalty='l1', solver='liblinear'),
 'svm': SVC(C=0.9025510540801358, class_weight={0: 1, 1: 8.0}, coef0=0.9650458090160368,
     degree=4, kernel='linear'),
 'rf': RandomForestClassifier(class_weight={0: 1, 1: 8.0}, max_depth=32,
                        min_impurity_decrease=0.01866685087467975,
                        min_samples_leaf=8, min_samples_split=9,
                        min_weight_fraction_leaf=0.14676768084094832,
                        n_estimators=359, random_state=234),
 'nb': GaussianNB(priors=[0.885, 0.115], var_smoothing=0.009959748834230392)}

## Evaluation

The SVM is best so far, but there is no predict proba.

In [29]:
for ml in best_clf.keys():
    Y_prob_pred = best_clf[ml].predict(X_val)
    print(ml)
    print(classification_report(Y_val,Y_prob_pred))

logreg
              precision    recall  f1-score   support

           0       0.97      0.80      0.87      5079
           1       0.35      0.79      0.48       686

    accuracy                           0.80      5765
   macro avg       0.66      0.79      0.68      5765
weighted avg       0.89      0.80      0.83      5765

svm
              precision    recall  f1-score   support

           0       0.97      0.79      0.87      5079
           1       0.34      0.79      0.48       686

    accuracy                           0.79      5765
   macro avg       0.65      0.79      0.67      5765
weighted avg       0.89      0.79      0.82      5765

rf
              precision    recall  f1-score   support

           0       0.96      0.72      0.82      5079
           1       0.27      0.77      0.40       686

    accuracy                           0.73      5765
   macro avg       0.62      0.75      0.61      5765
weighted avg       0.88      0.73      0.77      5765

nb
  

In [31]:
1-Y_val.mean()

0.8810060711188205

In [None]:
## use sample_weight