# Modelling v6

modelling use the latest exploration

# Goal

Get the model from the prepared data. With undersampling + sample weight. 

# Plan

Checklist what will be done on the notebook :

    [*] Get Data
    [*] Modelling



In [1]:
import yaml
import pandas as pd
import matplotlib.pyplot as plt
import os
import joblib
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
import seaborn as sns
from statsmodels.graphics.mosaicplot import mosaic
import time

from scipy.stats import uniform,randint
import warnings
warnings.filterwarnings("ignore")

In [2]:
if 'iter' in os.getcwd().split('/')[-1]:
    ######## run manually #############
    with open("../../config.yaml", "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
        
    if os.name == 'posix' :
        PATH_MODEL = config['path']['artifact']+os.getcwd().split('/')[-1]+'/'
    else:
        PATH_MODEL = config['path']['artifact']+os.getcwd().split('\\')[-1]+'/'

else:
    ######### run with makefile #############
    with open("config-make.yaml", "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    if os.name == 'posix' :
        PATH_MODEL = config['path']['artifact']+'iteration_2/'
    else:
        PATH_MODEL = config['path']['artifact']+'iteration_2/'

PATH_RAW = config['path']['raw']
PATH_INTERIM = config['path']['interim']
PATH_PROCESSED = config['path']['processed']
PATH_UDF = config['path']['udf']

In [3]:
from sklearn.linear_model import LogisticRegression, ElasticNet, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessRegressor

## Get Data

In [4]:
raw_data = pd.read_csv(PATH_PROCESSED+'prep_iter2.csv')
col_cat_load = joblib.load(PATH_MODEL+'cat_names.pkl')
col_num_load = joblib.load(PATH_MODEL+'num_names.pkl')

In [5]:
all_tr = raw_data[raw_data.is_train == 1]
down_size = int(len(all_tr[all_tr.target == 1]) * 3.5)
all_tr0 = all_tr[all_tr.target == 0].sample(down_size,random_state = 125)

all_tr0['sweight'] = 1.5
all_tr1 = all_tr[all_tr.target == 1]
all_tr1['sweight'] = 1

all_tr_down = pd.concat([all_tr0,all_tr1])
all_tr_down.target.value_counts(normalize = 1)

samp_w = all_tr_down['sweight']

In [6]:
X_tr = all_tr_down[col_num_load+col_cat_load]
Y_tr = all_tr_down['target'].values

X_val = raw_data[raw_data.is_train == 0][col_num_load+col_cat_load]
Y_val = raw_data[raw_data.is_train == 0]['target'].values

In [7]:
X_tr.shape

(16974, 26)

In [8]:
X_val.shape

(5765, 26)

In [9]:
Y_val.shape

(5765,)

In [10]:
Y_tr.mean()

0.2222222222222222

In [11]:
samp_w

5079     1.5
17291    1.5
9206     1.5
4581     1.5
19666    1.5
        ... 
38422    1.0
38424    1.0
38425    1.0
38426    1.0
38427    1.0
Name: sweight, Length: 16974, dtype: float64

## Modelling

In [12]:
# proportion_cls = pd.Series(Y_tr).value_counts(normalize = True)
# scale_pos_w = np.ceil(proportion_cls[0]/proportion_cls[1])
scale_pos_w = 2
cw = {0:1, 1:scale_pos_w}
cw
# scale_pos_weight for xgboost

{0: 1, 1: 2}

In [13]:
# proportion_cls

In [14]:
base_logreg = LogisticRegression(class_weight=cw)
all_model = {'sgd6' : SGDClassifier(class_weight=cw,max_iter = 2500,early_stopping=True,warm_start=True,eta0=0.05)
            ,'logreg6' : LogisticRegression(class_weight=cw)
            ,'nb6': GaussianNB(priors=[0.778,0.222])
            ,'bagtree6': BaggingClassifier(random_state=234)
            ,'baglog6': BaggingClassifier(base_estimator=base_logreg, random_state=234)
            ,'adab_tree6':AdaBoostClassifier(random_state=234)
            ,'adab_log6':AdaBoostClassifier(base_estimator=base_logreg,random_state=234)
            }

In [15]:
param_options = {'logreg6' : dict(C=uniform(loc=0, scale=2)
                                 ,penalty=['l2', 'l1']
                                 ,solver=['liblinear', 'sag'])
                 ,'nb6' : dict(var_smoothing = uniform(loc=1e-8, scale=1e-2))
                 ,'sgd6' : dict(loss=['hinge', 'squared_hinge', 'perceptron']
                                ,penalty=['l2', 'l1']
                                ,alpha=uniform(loc=0.1, scale=0.95)
                                ,l1_ratio=uniform(loc=0.1, scale=0.85)
                                ,learning_rate=['constant','optimal','invscaling','adaptive'])
                 ,'bagtree6' : dict(n_estimators=randint(4, 35)
                                   ,max_samples=uniform(loc=0.3,scale=0.7)
                                   ,max_features=uniform(loc=0.3,scale=0.7)
                                   ,warm_start=[False,True])
                 ,'baglog6' : dict(n_estimators=randint(4, 35)
                                   ,max_samples=uniform(loc=0.3,scale=0.7)
                                   ,max_features=uniform(loc=0.3,scale=0.7)
                                   ,warm_start=[False,True])
                 ,'adab_tree6' : dict(n_estimators=randint(50, 120)
                                   ,learning_rate=uniform(loc=0.1,scale=0.85)
                                  )
                 ,'adab_log6' : dict(n_estimators=randint(50, 120)
                                   ,learning_rate=uniform(loc=0.1,scale=0.85)
                                  )
                }

 # min_weight_fraction_leaf must in [0, 0.5]

In [15]:
## scoring
## https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
best_clf = {}
best_scr = {}
sla_model = {}
for mdl in all_model.keys():
    L1 = time.time()
    clf = all_model[mdl]
    parm = param_options[mdl]
    RS = RandomizedSearchCV(clf, parm, n_iter = 100, cv = 5, scoring = 'roc_auc',n_jobs = 10)
        
    RS.fit(X_tr,Y_tr, sample_weight=samp_w)
    
    best_clf[mdl] = RS.best_estimator_
    best_scr[mdl] = RS.best_score_
    diff_time = round((time.time()-L1)/60, 2)
    
    joblib.dump(best_clf[mdl],PATH_MODEL+mdl+'.pkl')
    
    print(mdl, 'is done in ',diff_time, ' minutes')
    sla_model[mdl] = diff_time

In [17]:
sla_model

{'sgd6': 0.2,
 'logreg6': 0.74,
 'nb6': 0.05,
 'bagtree6': 0.53,
 'baglog6': 4.26,
 'adab_tree6': 1.55,
 'adab_log6': 4.79}

In [18]:
best_scr

{'sgd6': 0.8323752731702989,
 'logreg6': 0.8332433215818229,
 'nb6': 0.7610658079641831,
 'bagtree6': 0.8270074449433882,
 'baglog6': 0.8354422867843763,
 'adab_tree6': 0.8462773278756875,
 'adab_log6': 0.3448619313830982}

In [19]:
best_clf

{'sgd6': SGDClassifier(alpha=0.7740025389951034, class_weight={0: 1, 1: 2},
               early_stopping=True, eta0=0.05, l1_ratio=0.320449581358638,
               learning_rate='invscaling', max_iter=2500, warm_start=True),
 'logreg6': LogisticRegression(C=0.06403195312003063, class_weight={0: 1, 1: 2},
                    penalty='l1', solver='liblinear'),
 'nb6': GaussianNB(priors=[0.778, 0.222], var_smoothing=1.7399798799287706e-05),
 'bagtree6': BaggingClassifier(max_features=0.721855438722457,
                   max_samples=0.5163370052061359, n_estimators=34,
                   random_state=234, warm_start=True),
 'baglog6': BaggingClassifier(base_estimator=LogisticRegression(class_weight={0: 1, 1: 2}),
                   max_features=0.32573528487181874,
                   max_samples=0.5485298898362136, n_estimators=15,
                   random_state=234, warm_start=True),
 'adab_tree6': AdaBoostClassifier(learning_rate=0.6796433013905538, n_estimators=113,
                

## Evaluation

The SVM is best so far, but there is no predict proba.

In [20]:
for ml in best_clf.keys():
    Y_prob_pred = best_clf[ml].predict(X_val)
    print(ml)
    print(classification_report(Y_val,Y_prob_pred))

sgd6
              precision    recall  f1-score   support

           0       0.88      1.00      0.94      5079
           1       0.00      0.00      0.00       686

    accuracy                           0.88      5765
   macro avg       0.44      0.50      0.47      5765
weighted avg       0.78      0.88      0.83      5765

logreg6
              precision    recall  f1-score   support

           0       0.94      0.92      0.93      5079
           1       0.48      0.55      0.51       686

    accuracy                           0.87      5765
   macro avg       0.71      0.73      0.72      5765
weighted avg       0.88      0.87      0.88      5765

nb6
              precision    recall  f1-score   support

           0       0.94      0.77      0.85      5079
           1       0.28      0.65      0.39       686

    accuracy                           0.76      5765
   macro avg       0.61      0.71      0.62      5765
weighted avg       0.86      0.76      0.80      5765

ba

In [21]:
## Save train data for explainability
joblib.dump(X_tr,PATH_MODEL+'train_data6.pkl')
joblib.dump(Y_tr,PATH_MODEL+'target_data6.pkl')

['artifacts/iteration_2/target_data6.pkl']

In [22]:
1-Y_val.mean()

0.8810060711188205