# Machine Learning Model building:

After feature engineering, different Machine Learning models were trained and evaluated.

## Import Python Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Import data

In [2]:
X_train = pd.read_csv('X_train_processed.csv')
y_train = pd.read_csv('y_train_processed.csv')

X_test = pd.read_csv('X_test_processed.csv')
y_test = pd.read_csv('y_test_processed.csv')

In [3]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((1185, 610), (1185, 1), (509, 610), (509, 1))

In [4]:
X_train.head()

Unnamed: 0,InscClaimAmtReimbursed_x,InpatientClaimPeriod,DurationInHospital,DeductibleAmtPaid_x,NoOfMonths_PartACov,NoOfMonths_PartBCov,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt,...,County_981_y,County_982_y,County_983_y,County_984_y,County_986_y,County_988_y,County_989_y,County_990_y,County_991_y,County_992_y
0,0.03529,0.634834,0.632766,0.0,0.215258,-0.648255,-0.037962,-0.016147,-0.242751,-0.867557,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.547996,0.88734,0.885036,0.0,0.215258,0.237652,-1.1167,-0.475348,-0.9028,-0.997736,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.017145,-0.822593,-0.823304,0.0,0.215258,-1.695236,-0.372273,-0.326406,-0.438602,-0.818562,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.238384,-1.056474,-1.056968,0.0,0.215258,0.237652,-0.434826,-0.442754,-0.737917,0.944394,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.817705,-0.770619,-0.771379,0.0,0.215258,0.237652,-0.205604,0.837067,-0.546073,0.195945,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
y_train.head()

Unnamed: 0,PotentialFraud
0,1
1,0
2,1
3,0
4,0


In [6]:
y_train.shape

(1185, 1)

In [7]:
X_test.head()

Unnamed: 0,InscClaimAmtReimbursed_x,InpatientClaimPeriod,DurationInHospital,DeductibleAmtPaid_x,NoOfMonths_PartACov,NoOfMonths_PartBCov,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt,...,County_981_y,County_982_y,County_983_y,County_984_y,County_986_y,County_988_y,County_989_y,County_990_y,County_991_y,County_992_y
0,0.746119,0.015482,0.01399,0.0,0.215258,0.237652,-0.131748,-0.602731,-0.630329,-1.104246,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.07794,0.06908,0.067538,0.0,0.215258,-1.091209,-0.068184,0.941023,-0.571998,-0.559701,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.055599,0.229873,0.228182,0.0,-1.596361,0.237652,-0.162986,-0.602731,-1.229842,-1.281763,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.167804,1.135081,1.132546,0.0,0.215258,0.237652,-1.068428,-0.869361,-0.891738,-0.539711,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.347878,0.323087,0.321308,0.0,0.215258,0.237652,-0.044202,-0.554042,-0.302378,-0.617587,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
y_test.head()

Unnamed: 0,PotentialFraud
0,1
1,1
2,0
3,0
4,1


In [9]:
y_train = y_train.values.ravel()

In [10]:
y_test = y_test.values.ravel()

## Building Different ML Models and their Evaluation

## Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver = 'liblinear', random_state = 42) 
# Changed the solver from 'lbfgs' to 'liblinear' and the error message "Str object has no attribute decode" solved.
lr.fit(X_train, y_train)

# Let us apply this to test set
y_predict = lr.predict(X_test)
y_predict_prob = lr.predict_proba(X_test)

from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
print("classification_report:\n", classification_report(y_test, y_predict, labels = [1, 0]))
print("The ROC AUC score:", roc_auc_score(y_test, y_predict_prob[:, 1]))

classification_report:
               precision    recall  f1-score   support

           1       0.72      0.47      0.57       123
           0       0.85      0.94      0.89       386

    accuracy                           0.83       509
   macro avg       0.78      0.71      0.73       509
weighted avg       0.82      0.83      0.81       509

The ROC AUC score: 0.8419689119170983


## Decision Tree

In [12]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(random_state = 42) 
dtc.fit(X_train, y_train)
y_predict_dtc = dtc.predict(X_test)
y_predict_prob = dtc.predict_proba(X_test)

print("classification_report:\n", classification_report(y_test, y_predict_dtc, labels = [1, 0]))
print("The ROC AUC score:", roc_auc_score(y_test, y_predict_prob[:, 1]))

classification_report:
               precision    recall  f1-score   support

           1       0.52      0.48      0.50       123
           0       0.84      0.86      0.85       386

    accuracy                           0.77       509
   macro avg       0.68      0.67      0.67       509
weighted avg       0.76      0.77      0.76       509

The ROC AUC score: 0.6698892118454863


## Random Forest Classifier

In [13]:
# Random forest classifier
from sklearn.ensemble import RandomForestClassifier
RFC = RandomForestClassifier(random_state = 42)
RFC.fit(X_train, y_train)
y_predict_RFC = RFC.predict(X_test)
y_predict_prob = RFC.predict_proba(X_test)

print("classification_report:\n", classification_report(y_test, y_predict_RFC, labels = [1, 0]))
print("The ROC AUC score:", roc_auc_score(y_test, y_predict_prob[:, 1]))

classification_report:
               precision    recall  f1-score   support

           1       0.76      0.33      0.46       123
           0       0.82      0.97      0.89       386

    accuracy                           0.81       509
   macro avg       0.79      0.65      0.68       509
weighted avg       0.81      0.81      0.78       509

The ROC AUC score: 0.8348287627954


## Gradient Boosting Classifier

In [14]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(random_state = 42)
gbc.fit(X_train, y_train)
y_predict_gbc = gbc.predict(X_test)
y_predict_prob = gbc.predict_proba(X_test)

print("classification_report:\n", classification_report(y_test, y_predict_gbc, labels = [1, 0]))
print("The ROC AUC score:", roc_auc_score(y_test, y_predict_prob[:, 1]))

classification_report:
               precision    recall  f1-score   support

           1       0.66      0.42      0.51       123
           0       0.83      0.93      0.88       386

    accuracy                           0.81       509
   macro avg       0.75      0.68      0.70       509
weighted avg       0.79      0.81      0.79       509

The ROC AUC score: 0.8398626732381314


## Support Vector Machine

In [15]:
from sklearn.svm import SVC
svc = SVC(probability=True, random_state = 42)
svc.fit(X_train, y_train)
y_predict_svc = svc.predict(X_test)
y_predict_prob = svc.predict_proba(X_test)

print("classification_report:\n", classification_report(y_test, y_predict_svc, labels = [1, 0]))
print("The ROC AUC score:", roc_auc_score(y_test, y_predict_prob[:, 1]))

classification_report:
               precision    recall  f1-score   support

           1       0.76      0.41      0.54       123
           0       0.84      0.96      0.89       386

    accuracy                           0.83       509
   macro avg       0.80      0.69      0.72       509
weighted avg       0.82      0.83      0.81       509

The ROC AUC score: 0.8156619908167995


# Apply SMORT to handle the imbalance in the data

In [16]:
import imblearn
from collections import Counter
from imblearn.over_sampling import SMOTE

counter = Counter(y_train)
print(counter)
oversample = SMOTE()
X_train, y_train = oversample.fit_resample(X_train, y_train)
counter = Counter(y_train)
print(counter)

Counter({0: 912, 1: 273})
Counter({1: 912, 0: 912})


**Applying SMORT Transformed data, again train different ML models and evaluate:**  

In [17]:
# Logistic regression
lr2 = LogisticRegression(solver = 'liblinear', random_state = 42) 
lr2.fit(X_train, y_train)
y_predict_lr2 = lr2.predict(X_test)
y_predict_prob = lr2.predict_proba(X_test)
print("Classification_report from Logistic regression:\n", classification_report(y_test, y_predict_lr2, labels = [1, 0]))
print("The ROC AUC score from Logistic regression:", roc_auc_score(y_test, y_predict_prob[:, 1]))
print("\n")

# Decision Tree
dtc2 = DecisionTreeClassifier(random_state = 42) 
dtc2.fit(X_train, y_train)
y_predict_dtc2 = dtc2.predict(X_test)
y_predict_prob = dtc2.predict_proba(X_test)
print("Classification_report from Decision Tree Clasiifier:\n", classification_report(y_test, y_predict_dtc2, labels = [1, 0]))
print("The ROC AUC score from Decision Tree Clasiifier:", roc_auc_score(y_test, y_predict_prob[:, 1]))
print("\n")

# Random forest classifier
RFC2 = RandomForestClassifier(random_state = 42)
RFC2.fit(X_train, y_train)
y_predict_RFC2 = RFC2.predict(X_test)
y_predict_prob = RFC2.predict_proba(X_test)
print("Classification_report from Random Forest Classifier:\n", classification_report(y_test, y_predict_RFC2, labels = [1, 0]))
print("The ROC AUC score from Random Forest Classifier:", roc_auc_score(y_test, y_predict_prob[:, 1]))
print("\n")

# Gradient Boosting
gbc2 = GradientBoostingClassifier(random_state = 42)
gbc2.fit(X_train, y_train)
y_predict_gbc2 = gbc2.predict(X_test)
y_predict_prob = gbc2.predict_proba(X_test)
print("classification_report from Gradient Boosting Classifier:\n", classification_report(y_test, y_predict_gbc2, labels = [1, 0]))
print("The ROC AUC score from Gradient Boosting Classifier:", roc_auc_score(y_test, y_predict_prob[:, 1]))
print("\n")

# Support Vector Machine
svc2 = SVC(probability=True,random_state = 42)
svc2.fit(X_train, y_train)
y_predict_svc2 = svc2.predict(X_test)
y_predict_prob = svc2.predict_proba(X_test)
print("classification_report from Supportt Vector Classifier:\n", classification_report(y_test, y_predict_svc2, labels = [1, 0]))
print("The ROC AUC score from Support Vector Classifier:", roc_auc_score(y_test, y_predict_prob[:, 1]))

Classification_report from Logistic regression:
               precision    recall  f1-score   support

           1       0.61      0.67      0.64       123
           0       0.89      0.86      0.88       386

    accuracy                           0.82       509
   macro avg       0.75      0.77      0.76       509
weighted avg       0.82      0.82      0.82       509

The ROC AUC score from Logistic regression: 0.8503306794725978


Classification_report from Decision Tree Clasiifier:
               precision    recall  f1-score   support

           1       0.48      0.54      0.51       123
           0       0.85      0.81      0.83       386

    accuracy                           0.75       509
   macro avg       0.66      0.68      0.67       509
weighted avg       0.76      0.75      0.75       509

The ROC AUC score from Decision Tree Clasiifier: 0.6777981380850078


Classification_report from Random Forest Classifier:
               precision    recall  f1-score   support


**After applying the SMORT on the data set, all the ML models performance increased, and among all, the Logistic regression performance is maximum.** 

# Apply GridsearchCV and RandomSearchCV on Logistic Regression

In [18]:
from sklearn.model_selection import GridSearchCV

parameters = {'penalty': ['l1', 'l2'],
             'C': [0.001, 0.01, 0.1, 1.0, 10],
             'max_iter':[100, 200] 
             }
lr3 = LogisticRegression(solver = 'liblinear', random_state = 42)
gsc_lr3 = GridSearchCV(lr3, param_grid = parameters, scoring = {'f1_score': 'f1', 'roc_auc_score': 'roc_auc'}, refit = 'roc_auc_score', cv = 3, n_jobs = -1)
gsc_lr3.fit(X_train, y_train)

best_parameters = gsc_lr3.best_params_
best_score = gsc_lr3.best_score_
print("Best parameters from GridSearchCV on Logistic Regression:", best_parameters)
print("Best Score from Grid Search CV on: ", best_score)

gsc_lr3_1 = LogisticRegression(solver = 'liblinear', random_state = 42, C = best_parameters['C'], penalty = best_parameters['penalty'], max_iter = best_parameters['max_iter'])
gsc_lr3_1.fit(X_train, y_train)
y_predict_gsc_lr3_1 = gsc_lr3_1.predict(X_test)
y_predict_prob = gsc_lr3_1.predict_proba(X_test)

print("Classification_report from best Logistic regression model obtained from GridSeachCV:\n", classification_report(y_test, y_predict_gsc_lr3_1, labels = [1, 0]))
print("The ROC AUC score from best Logistic regression model obtained from GridSeachCV:", roc_auc_score(y_test, y_predict_prob[:, 1]))

Best parameters from GridSearchCV on Logistic Regression: {'C': 0.1, 'max_iter': 100, 'penalty': 'l1'}
Best Score from Grid Search CV on:  0.9138172322253002
Classification_report from best Logistic regression model obtained from GridSeachCV:
               precision    recall  f1-score   support

           1       0.59      0.67      0.63       123
           0       0.89      0.85      0.87       386

    accuracy                           0.81       509
   macro avg       0.74      0.76      0.75       509
weighted avg       0.82      0.81      0.81       509

The ROC AUC score from best Logistic regression model obtained from GridSeachCV: 0.8633682968954042


In [19]:
# Apply Randomized search on Logistic Regression
from sklearn.model_selection import RandomizedSearchCV
lr4 = LogisticRegression(solver = 'liblinear', random_state = 42)

parameters = {'penalty': ['l1', 'l2'],
             'C': np.linspace(0.001, 10, 10),
             'max_iter':np.linspace(50, 200, 3)}
rsc_lr4 = RandomizedSearchCV(lr4, param_distributions = parameters, scoring = 'roc_auc', random_state = 42, n_jobs = -1)
rsc_lr4.fit(X_train, y_train)


best_parameters = rsc_lr4.best_params_
best_score = rsc_lr4.best_score_

print("Best parameters from RandomizedSearchCV on Logistic Regression:", best_parameters)
print("Best Score from RandomizedSearchCV on: ", best_score)

rsc_lr4_1 = LogisticRegression(solver = 'liblinear',random_state = 42, C = best_parameters['C'], penalty = best_parameters['penalty'], max_iter = best_parameters['max_iter'])
rsc_lr4_1.fit(X_train, y_train)
y_predict_rsc_lr4_1 = rsc_lr4_1.predict(X_test)
y_predict_prob =rsc_lr4_1.predict_proba(X_test)

print("classification_report:\n", classification_report(y_test, y_predict_rsc_lr4_1, labels = [1, 0]))
print("The ROC AUC score:", roc_auc_score(y_test, y_predict_prob[:, 1]))

Best parameters from RandomizedSearchCV on Logistic Regression: {'penalty': 'l1', 'max_iter': 50.0, 'C': 2.223}
Best Score from RandomizedSearchCV on:  0.9049234768114249
classification_report:
               precision    recall  f1-score   support

           1       0.60      0.67      0.63       123
           0       0.89      0.86      0.87       386

    accuracy                           0.81       509
   macro avg       0.75      0.76      0.75       509
weighted avg       0.82      0.81      0.82       509

The ROC AUC score: 0.8500989932179114


As the dataframe is small, here we'll take the optimized hyperparameter obtained from GridSearchCV, as the results from both GridSearchCV and RandomSearchCV are almost same.

**With application of GridSearchCV and RandomSearchCV, we obtained the maximum roc_auc score:0.85.**   

# Select Best K features and train model again with diffrent number of features values

In [20]:
# Write a function to carry out the SelectKBest, Logistic regression, GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif, chi2
def Best_features_LogisticRegression(X_tr, y_tr, X_te, y_te, K):
    for k in K:
        features_selector = SelectKBest(score_func = f_classif, k = k)
        ordered = features_selector.fit_transform(X_tr, y_tr)
        a = features_selector.get_support()
        features = X_tr.columns
        b = features[a].tolist()

        lr5 = LogisticRegression(solver = 'liblinear', random_state = 42)
        parameters = {'penalty': ['l1', 'l2'],
             'C': [0.001, 0.01, 0.1, 1.0, 10],
             'max_iter':[100, 200] 
             }
        gbc_pipe = GridSearchCV(lr5, param_grid = parameters, scoring = {'f1_score': 'f1', 'roc_auc_score': 'roc_auc'}, refit = 'roc_auc_score', cv = 3, n_jobs = -1)      
        gbc_pipe.fit(X_tr[b], y_tr)
        best_parameters = gbc_pipe.best_params_
        best_score = gbc_pipe.best_score_

        print("K = ", k)
        #print("Best parameters from GridSearchCV on Logistic Regression:", best_parameters)
        print("Best Score from Grid Search CV on: ", best_score)
        print("\n")

        gbc_lr7_1 = LogisticRegression(solver = 'liblinear',random_state = 42, C = best_parameters['C'], penalty = best_parameters['penalty'], max_iter = best_parameters['max_iter'])
        gbc_lr7_1.fit(X_tr[b], y_tr)
        y_predict_gbc_lr7_1 = gbc_lr7_1.predict(X_te[b])
        y_predict_prob = gbc_lr7_1.predict_proba(X_te[b]) 

        print("classification_report:\n", classification_report(y_te, y_predict_gbc_lr7_1, labels = [1, 0]))
        print("The ROC AUC score:", roc_auc_score(y_te, y_predict_prob[:, 1]))
    return print("End")

In [21]:
print(Best_features_LogisticRegression(X_train, y_train, X_test, y_test, [100, 200, 300, 400, 500, 600, 610]))

  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81
  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99
 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117
 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153
 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171
 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189
 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207
 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225
 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243
 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261
 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279
 280 300 301 302 303 304 305 321 322 323 324 325 326 327 329 330 331 332
 333 334 335 336 337 338 339 340 341 342 343 344 34

K =  100
Best Score from Grid Search CV on:  0.9138208391043398


classification_report:
               precision    recall  f1-score   support

           1       0.59      0.67      0.63       123
           0       0.89      0.85      0.87       386

    accuracy                           0.81       509
   macro avg       0.74      0.76      0.75       509
weighted avg       0.82      0.81      0.81       509

The ROC AUC score: 0.8633472345086145


  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81
  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99
 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117
 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153
 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171
 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189
 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207
 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225
 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243
 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261
 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279
 280 300 301 302 303 304 305 321 322 323 324 325 326 327 329 330 331 332
 333 334 335 336 337 338 339 340 341 342 343 344 34

K =  200
Best Score from Grid Search CV on:  0.9138136253462604


classification_report:
               precision    recall  f1-score   support

           1       0.59      0.67      0.63       123
           0       0.89      0.85      0.87       386

    accuracy                           0.81       509
   macro avg       0.74      0.76      0.75       509
weighted avg       0.82      0.81      0.81       509

The ROC AUC score: 0.8633472345086145


  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81
  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99
 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117
 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153
 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171
 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189
 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207
 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225
 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243
 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261
 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279
 280 300 301 302 303 304 305 321 322 323 324 325 326 327 329 330 331 332
 333 334 335 336 337 338 339 340 341 342 343 344 34

K =  300
Best Score from Grid Search CV on:  0.9138172322253001


classification_report:
               precision    recall  f1-score   support

           1       0.59      0.67      0.63       123
           0       0.89      0.85      0.87       386

    accuracy                           0.81       509
   macro avg       0.74      0.76      0.75       509
weighted avg       0.82      0.81      0.81       509

The ROC AUC score: 0.8632208601878765


  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81
  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99
 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117
 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153
 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171
 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189
 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207
 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225
 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243
 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261
 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279
 280 300 301 302 303 304 305 321 322 323 324 325 326 327 329 330 331 332
 333 334 335 336 337 338 339 340 341 342 343 344 34

K =  400
Best Score from Grid Search CV on:  0.9138208391043398


classification_report:
               precision    recall  f1-score   support

           1       0.59      0.67      0.63       123
           0       0.89      0.85      0.87       386

    accuracy                           0.81       509
   macro avg       0.74      0.76      0.75       509
weighted avg       0.82      0.81      0.81       509

The ROC AUC score: 0.8632419225746661


  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81
  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99
 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117
 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153
 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171
 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189
 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207
 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225
 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243
 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261
 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279
 280 300 301 302 303 304 305 321 322 323 324 325 326 327 329 330 331 332
 333 334 335 336 337 338 339 340 341 342 343 344 34

K =  500
Best Score from Grid Search CV on:  0.9138208391043398


classification_report:
               precision    recall  f1-score   support

           1       0.59      0.67      0.63       123
           0       0.89      0.85      0.87       386

    accuracy                           0.81       509
   macro avg       0.74      0.76      0.75       509
weighted avg       0.82      0.81      0.81       509

The ROC AUC score: 0.8633682968954042


  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81
  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99
 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117
 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153
 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171
 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189
 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207
 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225
 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243
 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261
 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279
 280 300 301 302 303 304 305 321 322 323 324 325 326 327 329 330 331 332
 333 334 335 336 337 338 339 340 341 342 343 344 34

K =  600
Best Score from Grid Search CV on:  0.9138208391043398


classification_report:
               precision    recall  f1-score   support

           1       0.59      0.67      0.63       123
           0       0.89      0.85      0.87       386

    accuracy                           0.81       509
   macro avg       0.74      0.76      0.75       509
weighted avg       0.82      0.81      0.81       509

The ROC AUC score: 0.8633261721218248


  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81
  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99
 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117
 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153
 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171
 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189
 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207
 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225
 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243
 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261
 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279
 280 300 301 302 303 304 305 321 322 323 324 325 326 327 329 330 331 332
 333 334 335 336 337 338 339 340 341 342 343 344 34

K =  610
Best Score from Grid Search CV on:  0.9138172322253002


classification_report:
               precision    recall  f1-score   support

           1       0.59      0.67      0.63       123
           0       0.89      0.85      0.87       386

    accuracy                           0.81       509
   macro avg       0.74      0.76      0.75       509
weighted avg       0.82      0.81      0.81       509

The ROC AUC score: 0.8633682968954042
End
None


Here, one can see that there are many constant columns, hence we need to work on them.<br> 
Let us drop the constant and quasi-constant features.<br>
Here, we can note that <br> The constant and quasi-constant features are defined as follows:<br>
1. Constant features: Same value in all the records in that column.<br>
2. Quasi-constant features: One of the values is dominant with 99.9% (or 99% or 98% or 95% etc) in that column.

#### Drop the constant and quasi constant features

In [22]:
# Identify the constant features by using get_constant_features from fast_ml.utilities
from fast_ml.utilities import display_all
from fast_ml.feature_selection import get_constant_features

constant_features = get_constant_features(X_train)
constant_features.head(30)

Unnamed: 0,Desc,Var,Value,Perc
0,Constant,DeductibleAmtPaid_x,0.0,100.0
1,Constant,County_321,0.0,100.0
2,Constant,County_400_y,0.0,100.0
3,Constant,County_390_y,0.0,100.0
4,Constant,County_380_y,0.0,100.0
5,Constant,County_370_y,0.0,100.0
6,Constant,County_362,0.0,100.0
7,Constant,County_361,0.0,100.0
8,Constant,County_360_y,0.0,100.0
9,Constant,County_350_y,0.0,100.0


In [23]:
constant_features.shape, X_train.shape

((530, 4), (1824, 610))

Out of 610 features, 530 are constant.

In [24]:
# List the constant features
constant_features_list = constant_features.query("Desc == 'Constant'")['Var'].tolist()
constant_features_list[:30]

['DeductibleAmtPaid_x',
 'County_321',
 'County_400_y',
 'County_390_y',
 'County_380_y',
 'County_370_y',
 'County_362',
 'County_361',
 'County_360_y',
 'County_350_y',
 'County_343',
 'County_342',
 'County_341_y',
 'County_340_y',
 'County_331_y',
 'County_330_y',
 'County_320_y',
 'County_600_y',
 'County_312_y',
 'County_311',
 'County_310_y',
 'County_301',
 'County_300_y',
 'County_292_y',
 'County_291_y',
 'County_290_y',
 'County_281_y',
 'County_280_y',
 'County_270_y',
 'County_260_y']

In [25]:
# Drop the constant features
X_train.drop(columns = constant_features_list, inplace = True)
X_train.shape

(1824, 81)

In [26]:
# Locate the quasi constant features and drop them from the dataframe
quasi_constant_features1 = get_constant_features(X_train, threshold = 0.999, dropna = False)
quasi_constant_features1.shape

(0, 4)

In [27]:
quasi_constant_features2 = get_constant_features(X_train, threshold = 0.99, dropna = False)
quasi_constant_features2.shape

(1, 4)

In [28]:
quasi_constant_features3 = get_constant_features(X_train, threshold = 0.98, dropna = False)
quasi_constant_features3.shape

(2, 4)

In [29]:
quasi_constant_features4 = get_constant_features(X_train, threshold = 0.95, dropna = False)
quasi_constant_features4.shape

(2, 4)

In [30]:
quasi_constant_features4.head()

Unnamed: 0,Desc,Var,Value,Perc
0,Quasi Constant,Race_1_y,0.096797,99.177632
1,Quasi Constant,Race_1_x,0.155565,98.355263


In [31]:
quasi_constant_features_list = quasi_constant_features3.query("Desc == 'Quasi Constant'")['Var'].tolist()
X_train.drop(columns = quasi_constant_features_list, inplace = True)

In [32]:
X_train.shape

(1824, 79)

In [33]:
# Now carry out the same for  X_test
constant_features_X_test = get_constant_features(X_test)
print(constant_features_X_test.shape, X_test.shape)
constant_features_list_X_test = constant_features_X_test.query("Desc == 'Constant'")['Var'].tolist()
X_test.drop(columns = constant_features_list_X_test, inplace = True)

quasi_constant_features3_X_test = get_constant_features(X_test, threshold = 0.98, dropna = False)
print(quasi_constant_features3_X_test.shape)

quasi_constant_features_list_X_test = quasi_constant_features3_X_test.query("Desc == 'Quasi Constant'")['Var'].tolist()
X_test.drop(columns = quasi_constant_features_list_X_test, inplace = True)
X_test.shape

(530, 4) (509, 610)
(2, 4)


(509, 79)

#### Train all the models again on the new train and test data 

In [34]:
# Logistic regression
lr2 = LogisticRegression(solver = 'liblinear', random_state = 42) 
lr2.fit(X_train, y_train)
y_predict_lr2 = lr2.predict(X_test)
y_predict_prob =lr2.predict_proba(X_test)

print("Classification_report from Logistic regression:\n", classification_report(y_test, y_predict_lr2, labels = [1, 0]))
print("The ROC AUC score from Logistic regression:", roc_auc_score(y_test, y_predict_prob[:, 1]))
print("\n")

# Decision Tree
dtc2 = DecisionTreeClassifier(random_state = 42) 
dtc2.fit(X_train, y_train)
y_predict_dtc2 = dtc2.predict(X_test)
y_predict_prob =dtc2.predict_proba(X_test)
print("Classification_report from Decision Tree Clasiifier:\n", classification_report(y_test, y_predict_dtc2, labels = [1, 0]))
print("The ROC AUC score from Decision Tree Clasiifier:", roc_auc_score(y_test, y_predict_prob[:, 1]))
print("\n")

# Random forest classifier
RFC2 = RandomForestClassifier(random_state = 42)
RFC2.fit(X_train, y_train)
y_predict_RFC2 = RFC2.predict(X_test)
y_predict_prob =RFC2.predict_proba(X_test)
print("Classification_report from Random Forest Classifier:\n", classification_report(y_test, y_predict_RFC2, labels = [1, 0]))
print("The ROC AUC score from Random Forest Classifier:", roc_auc_score(y_test, y_predict_prob[:, 1]))
print("\n")

# Gradient Boosting
gbc2 = GradientBoostingClassifier(random_state = 42)
gbc2.fit(X_train, y_train)
y_predict_gbc2 = gbc2.predict(X_test)
y_predict_prob = gbc2.predict_proba(X_test)
print("classification_report from Gradient Boosting Classifier:\n", classification_report(y_test, y_predict_gbc2, labels = [1, 0]))
print("The ROC AUC score from Gradient Boosting Classifier:", roc_auc_score(y_test, y_predict_prob[:, 1]))
print("\n")

# Support Vector Machine
svc2 = SVC(probability = True, random_state = 42)
svc2.fit(X_train, y_train)
y_predict_svc2 = svc2.predict(X_test)
y_predict_prob = svc2.predict_proba(X_test)
print("classification_report from Supportt Vector Classifier:\n", classification_report(y_test, y_predict_svc2, labels = [1, 0]))
print("The ROC AUC score from Support Vector Classifier:", roc_auc_score(y_test, y_predict_prob[:, 1]))

Classification_report from Logistic regression:
               precision    recall  f1-score   support

           1       0.60      0.67      0.63       123
           0       0.89      0.86      0.87       386

    accuracy                           0.81       509
   macro avg       0.75      0.76      0.75       509
weighted avg       0.82      0.81      0.82       509

The ROC AUC score from Logistic regression: 0.8502043051518599


Classification_report from Decision Tree Clasiifier:
               precision    recall  f1-score   support

           1       0.47      0.55      0.51       123
           0       0.85      0.80      0.82       386

    accuracy                           0.74       509
   macro avg       0.66      0.68      0.66       509
weighted avg       0.76      0.74      0.75       509

The ROC AUC score from Decision Tree Clasiifier: 0.6753864947975904


Classification_report from Random Forest Classifier:
               precision    recall  f1-score   support


**Among all models, Logistic regression, Random forest and Gradient boosting has almost same performance(roc_auc_score: 0.84)**

In [35]:
# Apply GridSearCV to Logistic Regression

parameters = {'penalty': ['l1', 'l2'],
             'C': [0.001, 0.01, 0.1, 1.0, 10],
             'max_iter':[100, 200] 
             }

lr3 = LogisticRegression(solver = 'liblinear', random_state = 42)
gsc_lr3 = GridSearchCV(lr3, param_grid = parameters, scoring = {'f1_score': 'f1', 'roc_auc_score': 'roc_auc'}, refit = 'roc_auc_score', cv = 3, n_jobs = -1)
gsc_lr3.fit(X_train, y_train)

best_parameters = gsc_lr3.best_params_
best_score = gsc_lr3.best_score_
print("Best parameters from GridSearchCV on Logistic Regression:", best_parameters)
print("Best Score from Grid Search CV on: ", best_score)

gsc_lr3_1 = LogisticRegression(solver = 'liblinear', random_state = 42, C = best_parameters['C'], penalty = best_parameters['penalty'], max_iter = best_parameters['max_iter'])
gsc_lr3_1.fit(X_train, y_train)
y_predict_gsc_lr3_1 = gsc_lr3_1.predict(X_test)
y_predict_prob = gsc_lr3_1.predict_proba(X_test)

print("Classification_report from best Logistic Regression obtained from GridSeachCV:\n", classification_report(y_test, y_predict_gsc_lr3_1, labels = [1, 0]))
print("The ROC AUC score from best Logistic Regression obtained from GridSeachCV:", roc_auc_score(y_test, y_predict_prob[:, 1]))

Best parameters from GridSearchCV on Logistic Regression: {'C': 0.1, 'max_iter': 100, 'penalty': 'l1'}
Best Score from Grid Search CV on:  0.9137847703139429
Classification_report from best Logistic Regression obtained from GridSeachCV:
               precision    recall  f1-score   support

           1       0.59      0.67      0.63       123
           0       0.89      0.85      0.87       386

    accuracy                           0.81       509
   macro avg       0.74      0.76      0.75       509
weighted avg       0.82      0.81      0.81       509

The ROC AUC score from best Logistic Regression obtained from GridSeachCV: 0.8629470491596107


In [36]:
# Apply GridSearCV to Random Forest Regressor

parameters = {'n_estimators': [75, 100, 125, 150, 175, 200],
              'criterion': ['gini','entropy'], 
             'max_depth' : [3, 5, 7] 
             }
RFC3 = RandomForestClassifier(random_state = 42)
gsc_RFC3 = GridSearchCV(RFC3, param_grid = parameters, scoring = {'f1_score': 'f1', 'roc_auc_score': 'roc_auc'}, refit = 'roc_auc_score', cv = 3, n_jobs = -1)
gsc_RFC3.fit(X_train, y_train)

best_parameters = gsc_RFC3.best_params_
best_score = gsc_RFC3.best_score_
print("Best parameters from GridSearchCV on Random Forest Classifier:", best_parameters)
print("Best Score from Grid Search CV on: ", best_score)

gsc_RFC3_1 = RandomForestClassifier(random_state = 42, n_estimators = best_parameters['n_estimators'], criterion = best_parameters['criterion'], max_depth = best_parameters['max_depth'])
gsc_RFC3_1.fit(X_train, y_train)
y_predict_gsc_RFC3_1 = gsc_RFC3_1.predict(X_test)
y_predict_prob = gsc_RFC3_1.predict_proba(X_test)

print("Classification_report from best Random Forest Classifier obtained from GridSeachCV:\n", classification_report(y_test, y_predict_gsc_RFC3_1, labels = [1, 0]))
print("The ROC AUC score from best Random Forest Classifier obtained from GridSeachCV:", roc_auc_score(y_test, y_predict_prob[:, 1]))

Best parameters from GridSearchCV on Random Forest Classifier: {'criterion': 'gini', 'max_depth': 7, 'n_estimators': 200}
Best Score from Grid Search CV on:  0.9430365593259465
Classification_report from best Random Forest Classifier obtained from GridSeachCV:
               precision    recall  f1-score   support

           1       0.58      0.65      0.62       123
           0       0.88      0.85      0.87       386

    accuracy                           0.80       509
   macro avg       0.73      0.75      0.74       509
weighted avg       0.81      0.80      0.81       509

The ROC AUC score from best Random Forest Classifier obtained from GridSeachCV: 0.8527739163401996


In [37]:
# Apply GridSearCV to Gradient Boosting Classifier

parameters = {'learning_rate': [0.01, 0.1, 1, 10], 
              'n_estimators': [100, 150, 200, 250, 300],
              'max_depth' : [3, 5, 7]
             }
gbc3 = GradientBoostingClassifier(random_state = 42)
gsc_gbc3 = GridSearchCV(gbc3, param_grid = parameters, scoring = {'f1_score': 'f1', 'roc_auc_score': 'roc_auc'}, refit = 'roc_auc_score', cv = 3, n_jobs = -1)
gsc_gbc3.fit(X_train, y_train)

best_parameters = gsc_gbc3.best_params_
best_score = gsc_gbc3.best_score_
print("Best parameters from GridSearchCV on Gradient Boosting Classifier:", best_parameters)
print("Best Score from Grid Search CV on: ", best_score)

gsc_gbc3_1 = GradientBoostingClassifier(random_state = 42, learning_rate = best_parameters['learning_rate'], n_estimators = best_parameters['n_estimators'], max_depth = best_parameters['max_depth'])
gsc_gbc3_1.fit(X_train, y_train)
y_predict_gsc_gbc3_1 = gsc_gbc3_1.predict(X_test)
y_predict_prob = gsc_gbc3_1.predict_proba(X_test)

print("Classification_report from best Gradient Boosting classifier obtained from GridSeachCV:\n", classification_report(y_test, y_predict_gsc_gbc3_1, labels = [1, 0]))
print("The ROC AUC score from best Gradient Boosting classifier obtained from GridSeachCV:", roc_auc_score(y_test, y_predict_prob[:, 1]))

Best parameters from GridSearchCV on Gradient Boosting Classifier: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300}
Best Score from Grid Search CV on:  0.9617526546629732
Classification_report from best Gradient Boosting classifier obtained from GridSeachCV:
               precision    recall  f1-score   support

           1       0.63      0.54      0.58       123
           0       0.86      0.90      0.88       386

    accuracy                           0.81       509
   macro avg       0.74      0.72      0.73       509
weighted avg       0.80      0.81      0.81       509

The ROC AUC score from best Gradient Boosting classifier obtained from GridSeachCV: 0.8492986225199041


In [38]:
# Apply GridSearchCV to Support Vector Classifier

parameters = {'C': [0.01, 0.1, 1, 10], 
#              'kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']
              }
SVC3 = SVC(probability = True, random_state = 42)
gsc_SVC3 = GridSearchCV(SVC3, param_grid = parameters, scoring = {'f1_score': 'f1', 'roc_auc_score': 'roc_auc'}, refit = 'roc_auc_score', cv = 3, n_jobs = -1)
gsc_SVC3.fit(X_train, y_train)

best_parameters = gsc_SVC3.best_params_
best_score = gsc_SVC3.best_score_
print("Best parameters from GridSearchCV on Support Vector Classifier:", best_parameters)
print("Best Score from Grid Search CV on: ", best_score)

gsc_SVC3_1 = SVC(probability = True, random_state = 42, C = best_parameters['C'])
gsc_SVC3_1.fit(X_train, y_train)
y_predict_gsc_SVC3_1 = gsc_SVC3_1.predict(X_test)
y_predict_prob = gsc_SVC3_1.predict_proba(X_test)

print("Classification_report from best Gradient Support Vector Classifier obtained from GridSeachCV:\n", classification_report(y_test, y_predict_gsc_SVC3_1, labels = [1, 0]))
print("The ROC AUC score from best Support Vector Classifier obtained from GridSeachCV:", roc_auc_score(y_test, y_predict_prob[:, 1]))

Best parameters from GridSearchCV on Support Vector Classifier: {'C': 10}
Best Score from Grid Search CV on:  0.970885272391505
Classification_report from best Gradient Support Vector Classifier obtained from GridSeachCV:
               precision    recall  f1-score   support

           1       0.61      0.49      0.54       123
           0       0.85      0.90      0.87       386

    accuracy                           0.80       509
   macro avg       0.73      0.69      0.71       509
weighted avg       0.79      0.80      0.79       509

The ROC AUC score from best Support Vector Classifier obtained from GridSeachCV: 0.820232528750158


**Note: among all, the Logistic Regression performance is highest with roc_auc_score: 0.85.**

# Apply PCA

In [39]:
from sklearn.decomposition import PCA

def pca_application_logistic_regression(data1, data2, n_components_list):
    for element in n_components_list:
        pca = PCA(n_components = element, random_state = 42)
        pca1 = pca.fit(X_train)
        pca1_train = pca1.transform(X_train)
        pca1_test = pca1.transform(X_test)
        #print(pca1.explained_variance_ratio_)

        lr_pca = LogisticRegression(solver = 'liblinear', random_state = 42)
        lr_pca.fit(pca1_train, y_train)
        y_predict_pca1 = lr_pca.predict(pca1_test)
        y_predict_prob = lr_pca.predict_proba(pca1_test)
        
        print("n_components:", element)
        print("classification_report:\n", classification_report(y_test, y_predict_pca1, labels = [1, 0]))
        print("The ROC AUC score:", roc_auc_score(y_test,  y_predict_prob[:, 1]))
    return None

In [40]:
n_components_list = [None, 5, 10, 20, 30, 40, 50, 60, 70]
pca_application_logistic_regression(X_train, X_test, n_components_list)

n_components: None
classification_report:
               precision    recall  f1-score   support

           1       0.60      0.67      0.63       123
           0       0.89      0.86      0.87       386

    accuracy                           0.81       509
   macro avg       0.75      0.76      0.75       509
weighted avg       0.82      0.81      0.82       509

The ROC AUC score: 0.8502043051518597
n_components: 5
classification_report:
               precision    recall  f1-score   support

           1       0.57      0.66      0.61       123
           0       0.89      0.84      0.86       386

    accuracy                           0.80       509
   macro avg       0.73      0.75      0.74       509
weighted avg       0.81      0.80      0.80       509

The ROC AUC score: 0.8375458106912674
n_components: 10
classification_report:
               precision    recall  f1-score   support

           1       0.57      0.65      0.61       123
           0       0.88      0.84    

**Among all PCAs ,the PCA with n_components = 10 performed best with roc_auc_score: 0.85**.

In [41]:
# Apply Best PCA with best Logistic Regression
import datetime
start = datetime.datetime.now()
pca = PCA(n_components = 10, random_state = 42)
pca1 = pca.fit(X_train)
pca1_train = pca1.transform(X_train)
pca1_test = pca1.transform(X_test)
lr_final = LogisticRegression(solver = 'liblinear', random_state = 42, C = 0.1, max_iter = 100, penalty = 'l1') 


lr_final.fit(pca1_train, y_train)
y_predict_lr_final = lr_final.predict(pca1_test)
y_predict_prob = lr_final.predict_proba(pca1_test)

print("Classification_report from Logistic regression:\n", classification_report(y_test, y_predict_lr_final, labels = [1, 0]))
print("The ROC AUC score from Logistic regression:", roc_auc_score(y_test, y_predict_prob[:, 1]))
print("\n")
end = datetime.datetime.now()
print(end-start)

Classification_report from Logistic regression:
               precision    recall  f1-score   support

           1       0.59      0.66      0.62       123
           0       0.89      0.85      0.87       386

    accuracy                           0.81       509
   macro avg       0.74      0.76      0.74       509
weighted avg       0.81      0.81      0.81       509

The ROC AUC score from Logistic regression: 0.8485403765954758


0:00:00.034284


In [42]:
# Logistic regression with best parameters from GridSearchCV without appying PCA

start = datetime.datetime.now()
lr2 = LogisticRegression(solver = 'liblinear', random_state = 42, max_iter = 100, penalty = 'l1') 
lr2.fit(X_train, y_train)
y_predict_lr2 = lr2.predict(X_test)
y_predict_prob =lr2.predict_proba(X_test)

print("Classification_report from Logistic regression:\n", classification_report(y_test, y_predict_lr2, labels = [1, 0]))
print("The ROC AUC score from Logistic regression:", roc_auc_score(y_test, y_predict_prob[:, 1]))
print("\n")
end = datetime.datetime.now()
print(end-start)

Classification_report from Logistic regression:
               precision    recall  f1-score   support

           1       0.61      0.67      0.64       123
           0       0.89      0.87      0.88       386

    accuracy                           0.82       509
   macro avg       0.75      0.77      0.76       509
weighted avg       0.82      0.82      0.82       509

The ROC AUC score from Logistic regression: 0.851531235519609


0:00:00.070819


# Final Optimized model

Among all the models, Logistic Regression model lr_final model with following features was considered as the best taking into acoount the run time when applied to large dataset in future.

In [43]:
lr_final = LogisticRegression(solver = 'liblinear', random_state = 42, C = 0.1, max_iter = 100, penalty = 'l1')

# Save the Finally Optimized Model 

In [44]:
best_model = lr_final
best_model.version = '1.0'
best_model.pandas_version = pd.__version__
best_model.numpy_version = np.__version__
best_model.X_columns = [col for col in X_train.columns]

In [45]:
import pickle
pickle.dump(lr_final, open('Best_model.pkl', 'wb'))

# Summary:

The performance of the Logistic Regression without and with PCA was same with roc_auc_score of 0.85. However, the model with PCA was saved for deployment taking into considering the run time while it'll be applied to large dataset . 