## Use Logistic regression as a baseline model.

In [1]:
import pandas as pd
# Logistic regression model
from sklearn.linear_model import LogisticRegression as lr
# some preprocessing methods
from sklearn.preprocessing import StandardScaler
# cross_validation results.
from sklearn.model_selection import cross_val_score
# pipeline 
from sklearn.pipeline import make_pipeline

In [2]:
train_data = pd.read_csv('../2_Data_Clean/hr/train_data.csv')

In [3]:
X = train_data.drop('Class', axis =1)
y = train_data["Class"]

In [4]:
logistic = lr(C = 2)

In [5]:
cv_scores_acc = cross_val_score(logistic, X,y, cv =10)
cv_scores_acc

array([0.99899684, 0.99874605, 0.99889652, 0.99859558, 0.99864573,
       0.99909711, 0.99914727, 0.99859551, 0.99889647, 0.9990469 ])

In [6]:
cv_scores_f1 = cross_val_score(logistic, X,y, scoring="f1", cv =10)
cv_scores_f1

array([0.67741935, 0.56140351, 0.65625   , 0.44      , 0.52631579,
       0.7       , 0.70175439, 0.48148148, 0.64516129, 0.6779661 ])

In [7]:
cv_scores_precision = cross_val_score(logistic, X,y, scoring="precision", cv =10)
cv_scores_precision

array([0.77777778, 0.72727273, 0.72413793, 0.73333333, 0.68181818,
       0.80769231, 0.86956522, 0.65      , 0.71428571, 0.8       ])

In [8]:
cv_scores_recall = cross_val_score(logistic, X,y, scoring="recall", cv =10, n_jobs =2)
cv_scores_recall

array([0.6       , 0.45714286, 0.6       , 0.31428571, 0.42857143,
       0.61764706, 0.58823529, 0.38235294, 0.58823529, 0.58823529])

In [9]:
cv_score_roc_auc = cross_val_score(logistic, X,y, scoring = "roc_auc" ,cv =10, n_jobs =2)
cv_score_roc_auc

array([0.93018792, 0.93133066, 0.94273368, 0.83233846, 0.86363897,
       0.94226415, 0.89944404, 0.81369889, 0.89098199, 0.93490868])

In [10]:
logistic_pipe = make_pipeline(StandardScaler(), logistic)

In [11]:
cross_val_score(logistic_pipe, X, y , scoring = "f1", cv = 10)

array([0.6984127 , 0.6557377 , 0.6984127 , 0.67857143, 0.54545455,
       0.66666667, 0.77966102, 0.61818182, 0.73333333, 0.76363636])

In [12]:
cross_val_score(logistic_pipe, X, y , scoring = "precision", cv = 10)

array([0.78571429, 0.76923077, 0.78571429, 0.9047619 , 0.75      ,
       0.9       , 0.92      , 0.80952381, 0.84615385, 1.        ])

In [13]:
cross_val_score(logistic_pipe, X, y , scoring = "recall", cv = 10)

array([0.62857143, 0.57142857, 0.62857143, 0.54285714, 0.42857143,
       0.52941176, 0.67647059, 0.5       , 0.64705882, 0.61764706])

In [14]:
pipe = make_pipeline(StandardScaler(), lr(C=3))

In [15]:
cross_val_score(pipe, X,y, scoring ='f1', cv =10)

array([0.6984127 , 0.6557377 , 0.6984127 , 0.67857143, 0.54545455,
       0.66666667, 0.77966102, 0.61818182, 0.73333333, 0.76363636])

In [16]:
from sklearn.model_selection import cross_validate

In [17]:
all_metrics = cross_validate(pipe, X,y, cv=10, scoring = ['f1','precision','recall','roc_auc'],return_train_score = True)

In [18]:
pipe

Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logisticregression', LogisticRegression(C=3, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [19]:
from scipy.stats import randint as sp_randint
from sklearn.model_selection import RandomizedSearchCV 
from scipy.stats import expon
import numpy as np

model = lr()

param_grid = {'penalty':['l1','l2'],'C': expon(scale=1),'max_iter':sp_randint(100,1000)}


random_cv = RandomizedSearchCV(model,param_distributions=param_grid,n_iter= 20,n_jobs=2, cv =10, scoring = 'f1', error_score = 0)


In [20]:
random_cv.fit(X,y)

RandomizedSearchCV(cv=10, error_score=0,
          estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=20, n_jobs=2,
          param_distributions={'penalty': ['l1', 'l2'], 'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fb881c3a2e8>, 'max_iter': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fb881c3acc0>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='f1', verbose=0)

In [21]:
random_cv.best_params_

{'C': 0.24617418438311098, 'max_iter': 195, 'penalty': 'l1'}

In [22]:
random_cv.best_score_

0.6905235480858485

In [23]:

# train a pipeline

pipe = make_pipeline(StandardScaler(),lr())
pipe_param_grid = {
    'logisticregression__C':expon(scale=1),
    'logisticregression__penalty':['l1','l2'],
}

pipe_random_cv = RandomizedSearchCV(pipe, param_distributions = pipe_param_grid, cv = 10, n_iter = 20 ,scoring = 'f1', error_score = 0, n_jobs =2 )

pipe_random_cv.fit(X,y)

RandomizedSearchCV(cv=10, error_score=0,
          estimator=Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logisticregression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
          fit_params=None, iid=True, n_iter=20, n_jobs=2,
          param_distributions={'logisticregression__C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fb89d4667b8>, 'logisticregression__penalty': ['l1', 'l2']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='f1', verbose=0)

In [24]:
pipe_random_cv.best_params_

{'logisticregression__C': 0.3429673030508152,
 'logisticregression__penalty': 'l2'}

In [25]:
pipe_random_cv.best_score_

0.6882371309630492

## The cross-validated results for F_1 is around 0.688 with/without StandardScaler