# Machine Learning Model building:

After feature engineering, Machine Learning models were trained and evaluated, however, models building was carried out for two different types of processed data:<br>case-1: Total number of counts was taken for each categorical data under each provider<br>Case-2: Dummies was obtained for each categrical variables under each provider

## Import the libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Case-1: ML model building (For categorical features, where total number entries under each provider was counted)

In [2]:
X_train = pd.read_csv('X_train3.csv', index_col = 0)
y_train = pd.read_csv('y_train3.csv', index_col = 0)

X_test = pd.read_csv('X_test3.csv', index_col = 0)
y_test = pd.read_csv('y_test3.csv', index_col = 0)

In [3]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((1185, 95), (1185, 1), (509, 95), (509, 1))

In [4]:
X_train.head()

Unnamed: 0,InscClaimAmtReimbursed_x,DeductibleAmtPaid_x,InpatientClaimPeriod,DurationInHospital,NoOfMonths_PartACov_x,NoOfMonths_PartBCov_x,IPAnnualReimbursementAmt_x,IPAnnualDeductibleAmt_x,OPAnnualReimbursementAmt_x,OPAnnualDeductibleAmt_x,...,ChronicCond_Heartfailure_y,ChronicCond_KidneyDisease_y,ChronicCond_Cancer_y,ChronicCond_ObstrPulmonary_y,ChronicCond_Depression_y,ChronicCond_Diabetes_y,ChronicCond_IschemicHeart_y,ChronicCond_Osteoporasis_y,ChronicCond_rheumatoidarthritis_y,ChronicCond_stroke_y
0,0.03529,0.0,0.634834,0.632766,0.215258,-0.648255,-0.037962,-0.016147,-0.242751,-0.867557,...,-0.454031,-0.454031,-0.454031,-0.454031,-0.454031,-0.454031,-0.454031,-0.454031,-0.454031,-0.454031
1,-0.547996,0.0,0.88734,0.885036,0.215258,0.237652,-1.1167,-0.475348,-0.9028,-0.997736,...,-0.032232,-0.032232,-0.032232,-0.032232,-0.032232,-0.032232,-0.032232,-0.032232,-0.032232,-0.032232
2,-0.017145,0.0,-0.822593,-0.823304,0.215258,-1.695236,-0.372273,-0.326406,-0.438602,-0.818562,...,-0.100631,-0.100631,-0.100631,-0.100631,-0.100631,-0.100631,-0.100631,-0.100631,-0.100631,-0.100631
3,0.238384,0.0,-1.056474,-1.056968,0.215258,0.237652,-0.434826,-0.442754,-0.737917,0.944394,...,-0.237431,-0.237431,-0.237431,-0.237431,-0.237431,-0.237431,-0.237431,-0.237431,-0.237431,-0.237431
4,-0.817705,0.0,-0.770619,-0.771379,0.215258,0.237652,-0.205604,0.837067,-0.546073,0.195945,...,-0.94423,-0.94423,-0.94423,-0.94423,-0.94423,-0.94423,-0.94423,-0.94423,-0.94423,-0.94423


In [5]:
y_train.head()

Unnamed: 0,PotentialFraud
0,1
1,0
2,1
3,0
4,0


In [6]:
y_train.shape

(1185, 1)

In [7]:
X_test.head()

Unnamed: 0,InscClaimAmtReimbursed_x,DeductibleAmtPaid_x,InpatientClaimPeriod,DurationInHospital,NoOfMonths_PartACov_x,NoOfMonths_PartBCov_x,IPAnnualReimbursementAmt_x,IPAnnualDeductibleAmt_x,OPAnnualReimbursementAmt_x,OPAnnualDeductibleAmt_x,...,ChronicCond_Heartfailure_y,ChronicCond_KidneyDisease_y,ChronicCond_Cancer_y,ChronicCond_ObstrPulmonary_y,ChronicCond_Depression_y,ChronicCond_Diabetes_y,ChronicCond_IschemicHeart_y,ChronicCond_Osteoporasis_y,ChronicCond_rheumatoidarthritis_y,ChronicCond_stroke_y
0,0.746119,0.0,0.015482,0.01399,0.215258,0.237652,-0.131748,-0.602731,-0.630329,-1.104246,...,-0.499631,-0.499631,-0.499631,-0.499631,-0.499631,-0.499631,-0.499631,-0.499631,-0.499631,-0.499631
1,0.07794,0.0,0.06908,0.067538,0.215258,-1.091209,-0.068184,0.941023,-0.571998,-0.559701,...,0.081768,0.081768,0.081768,0.081768,0.081768,0.081768,0.081768,0.081768,0.081768,0.081768
2,0.055599,0.0,0.229873,0.228182,-1.596361,0.237652,-0.162986,-0.602731,-1.229842,-1.281763,...,0.628967,0.628967,0.628967,0.628967,0.628967,0.628967,0.628967,0.628967,0.628967,0.628967
3,-0.167804,0.0,1.135081,1.132546,0.215258,0.237652,-1.068428,-0.869361,-0.891738,-0.539711,...,-0.009432,-0.009432,-0.009432,-0.009432,-0.009432,-0.009432,-0.009432,-0.009432,-0.009432,-0.009432
4,0.347878,0.0,0.323087,0.321308,0.215258,0.237652,-0.044202,-0.554042,-0.302378,-0.617587,...,0.024768,0.024768,0.024768,0.024768,0.024768,0.024768,0.024768,0.024768,0.024768,0.024768


In [8]:
y_test.head()

Unnamed: 0,PotentialFraud
0,1
1,1
2,0
3,0
4,1


In [9]:
y_train = y_train.values.ravel()

## Build Different ML models and Evaluation

## Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver = 'liblinear', random_state = 42) 
# Changed the solver from 'lbfgs' to 'liblinear' and the error message "Str object has no attribute decode" solved.
lr.fit(X_train, y_train)

# Let us apply this to test set
y_predict = lr.predict(X_test)

from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
print("classification_report:\n", classification_report(y_test, y_predict, labels = [1, 0]))
print("The ROC AUC score:", roc_auc_score(y_test, y_predict))

classification_report:
               precision    recall  f1-score   support

           1       0.71      0.45      0.55       123
           0       0.84      0.94      0.89       386

    accuracy                           0.82       509
   macro avg       0.78      0.70      0.72       509
weighted avg       0.81      0.82      0.81       509

The ROC AUC score: 0.6950798264459329


**The precision, recall, f1-score, accuracy remains same if we take mean() or sum() for numerical value in groupby.**

## Decision Tree

In [11]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(random_state = 42) 
dtc.fit(X_train, y_train)
y_predict_dtc = dtc.predict(X_test)


print("classification_report:\n", classification_report(y_test, y_predict_dtc, labels = [1, 0]))
print("The ROC AUC score:", roc_auc_score(y_test, y_predict_dtc))

classification_report:
               precision    recall  f1-score   support

           1       0.51      0.51      0.51       123
           0       0.84      0.84      0.84       386

    accuracy                           0.76       509
   macro avg       0.68      0.68      0.68       509
weighted avg       0.76      0.76      0.76       509

The ROC AUC score: 0.6783773537217237


## Random Forest Classifier

In [12]:
# Random forest classifier
from sklearn.ensemble import RandomForestClassifier
RFC = RandomForestClassifier(random_state = 42)
RFC.fit(X_train, y_train)
y_predict_RFC = RFC.predict(X_test)

print("classification_report:\n", classification_report(y_test, y_predict_RFC, labels = [1, 0]))
print("The ROC AUC score:", roc_auc_score(y_test, y_predict_RFC))

classification_report:
               precision    recall  f1-score   support

           1       0.65      0.37      0.47       123
           0       0.82      0.94      0.88       386

    accuracy                           0.80       509
   macro avg       0.74      0.65      0.67       509
weighted avg       0.78      0.80      0.78       509

The ROC AUC score: 0.6518387463667382


## Gradient Boosting Classifier

In [13]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(random_state = 42)
gbc.fit(X_train, y_train)
y_predict_gbc = gbc.predict(X_test)

print("classification_report:\n", classification_report(y_test, y_predict_gbc, labels = [1, 0]))
print("The ROC AUC score:", roc_auc_score(y_test, y_predict_gbc))

classification_report:
               precision    recall  f1-score   support

           1       0.72      0.47      0.57       123
           0       0.85      0.94      0.89       386

    accuracy                           0.83       509
   macro avg       0.78      0.71      0.73       509
weighted avg       0.82      0.83      0.81       509

The ROC AUC score: 0.7059796116095877


## Support Vector Machine

In [14]:
from sklearn.svm import SVC
svc = SVC(random_state = 42)
svc.fit(X_train, y_train)
y_predict_svc = svc.predict(X_test)

print("classification_report:\n", classification_report(y_test, y_predict_svc, labels = [1, 0]))
print("The ROC AUC score:", roc_auc_score(y_test, y_predict_svc))

classification_report:
               precision    recall  f1-score   support

           1       0.71      0.41      0.52       123
           0       0.83      0.95      0.89       386

    accuracy                           0.82       509
   macro avg       0.77      0.68      0.70       509
weighted avg       0.80      0.82      0.80       509

The ROC AUC score: 0.6773452967690299


# Apply SMORT to handle the imbalance in the data

In [15]:
import imblearn
from collections import Counter
from imblearn.over_sampling import SMOTE

counter = Counter(y_train)
print(counter)
oversample = SMOTE()
X_train, y_train = oversample.fit_resample(X_train, y_train)
counter = Counter(y_train)
print(counter)

Counter({0: 912, 1: 273})
Counter({1: 912, 0: 912})


**Applying SMORT Transformed data, again train different ML models and evaluate:**  

In [16]:
# Logistic regression
lr2 = LogisticRegression(solver = 'liblinear', random_state = 42) 
lr2.fit(X_train, y_train)
y_predict_lr2 = lr2.predict(X_test)
print("Classification_report from Logistic regression:\n", classification_report(y_test, y_predict_lr2, labels = [1, 0]))
print("The ROC AUC score from Logistic regression:", roc_auc_score(y_test, y_predict_lr2))
print("\n")

# Decision Tree
dtc2 = DecisionTreeClassifier(random_state = 42) 
dtc2.fit(X_train, y_train)
y_predict_dtc2 = dtc2.predict(X_test)
print("Classification_report from Decision Tree Clasiifier:\n", classification_report(y_test, y_predict_dtc2, labels = [1, 0]))
print("The ROC AUC score from Decision Tree Clasiifier:", roc_auc_score(y_test, y_predict_dtc2))
print("\n")

# Random forest classifier
RFC2 = RandomForestClassifier(random_state = 42)
RFC2.fit(X_train, y_train)
y_predict_RFC2 = RFC2.predict(X_test)
print("Classification_report from Random Forest Classifier:\n", classification_report(y_test, y_predict_RFC2, labels = [1, 0]))
print("The ROC AUC score from Random Forest Classifier:", roc_auc_score(y_test, y_predict_RFC2))
print("\n")

# Gradient Boosting
gbc2 = GradientBoostingClassifier()
gbc2.fit(X_train, y_train)
y_predict_gbc2 = gbc.predict(X_test)
print("classification_report from Gradient Boosting Classifier:\n", classification_report(y_test, y_predict_gbc2, labels = [1, 0]))
print("The ROC AUC score from Gradient Boosting Classifier:", roc_auc_score(y_test, y_predict_gbc2))
print("\n")

# Support Vector Machine
svc2 = SVC(random_state = 42)
svc2.fit(X_train, y_train)
y_predict_svc2 = svc.predict(X_test)
print("classification_report from Supportt Vector Classifier:\n", classification_report(y_test, y_predict_svc2, labels = [1, 0]))
print("The ROC AUC score from Supportt Vector Classifier:", roc_auc_score(y_test, y_predict_svc2))

Classification_report from Logistic regression:
               precision    recall  f1-score   support

           1       0.53      0.67      0.59       123
           0       0.88      0.81      0.85       386

    accuracy                           0.78       509
   macro avg       0.71      0.74      0.72       509
weighted avg       0.80      0.78      0.79       509

The ROC AUC score from Logistic regression: 0.74006908462867


Classification_report from Decision Tree Clasiifier:
               precision    recall  f1-score   support

           1       0.43      0.44      0.43       123
           0       0.82      0.81      0.82       386

    accuracy                           0.72       509
   macro avg       0.62      0.63      0.63       509
weighted avg       0.73      0.72      0.72       509

The ROC AUC score from Decision Tree Clasiifier: 0.626247946417288


Classification_report from Random Forest Classifier:
               precision    recall  f1-score   support

  

**After applying the SMORT on the data set, all the ML models performance increased, and among all, the Logistic regression performance was maximum with roc_auc_score: 0.75.** 

# Apply GridsearchCV and RandomSearchCV on Logistic Regression

In [17]:
from sklearn.model_selection import GridSearchCV

parameters = {'penalty': ['l1', 'l2'],
             'C': [0.001, 0.01, 0.1, 1.0, 10],
             'solver': ['liblinear'],
              'max_iter':[100, 200] 
             }
lr3 = LogisticRegression(random_state = 42)
gsc_lr3 = GridSearchCV(lr3, param_grid = parameters, scoring = {'f1_score': 'f1', 'roc_auc_score': 'roc_auc'}, refit = 'roc_auc_score', cv = 3, n_jobs = -1)
# To use two evaluation scoring metrics, I need some help, I got some answer, but not satisfied
#gsc_lr3 = GridSearchCV(lr3, param_grid = parameters, scoring = {'f1_score': 'f1', 'roc_auc_score'= 'roc_auc'}, refit: 'roc_auc', cv = 3)
gsc_lr3.fit(X_train, y_train)

best_parameters = gsc_lr3.best_params_
best_score = gsc_lr3.best_score_
print("Best parameters from GridSearchCV on Logistic Regression:", best_parameters)
print("Best Score from Grid Search CV on: ", best_score)

gsc_lr3_1 = LogisticRegression(solver = best_parameters['solver'], random_state = 42, C = best_parameters['C'], penalty = best_parameters['penalty'], max_iter = best_parameters['max_iter'])
gsc_lr3_1.fit(X_train, y_train)
y_predict_gsc_lr3_1 = gsc_lr3_1.predict(X_test)

print("Classification_report from best Logistic regression model obtained from GridSeachCV:\n", classification_report(y_test, y_predict_gsc_lr3_1, labels = [1, 0]))
print("The ROC AUC score from best Logistic regression model obtained from GridSeachCV:", roc_auc_score(y_test, y_predict_gsc_lr3_1))

Best parameters from GridSearchCV on Logistic Regression: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best Score from Grid Search CV on:  0.896583564173592
Classification_report from best Logistic regression model obtained from GridSeachCV:
               precision    recall  f1-score   support

           1       0.57      0.69      0.62       123
           0       0.89      0.83      0.86       386

    accuracy                           0.80       509
   macro avg       0.73      0.76      0.74       509
weighted avg       0.82      0.80      0.81       509

The ROC AUC score from best Logistic regression model obtained from GridSeachCV: 0.7626269008804076


In [18]:
# Apply Randomized search on Logistic Regression
from sklearn.model_selection import RandomizedSearchCV
lr4 = LogisticRegression(solver = 'liblinear', random_state = 42)

parameters = {'penalty': ['l1', 'l2'],
             'C': np.linspace(0.001, 10, 10),
             'solver': ['liblinear'],
              'max_iter':np.linspace(50, 200, 3)}
rsc_lr4 = RandomizedSearchCV(lr4, param_distributions = parameters, scoring = 'roc_auc', random_state = 42, n_jobs = -1)
rsc_lr4.fit(X_train, y_train)


best_parameters = rsc_lr4.best_params_
best_score = rsc_lr4.best_score_

print("Best parameters from RandomizedSearchCV on Logistic Regression:", best_parameters)
print("Best Score from RandomizedSearchCV on: ", best_score)

rsc_lr4_1 = LogisticRegression(solver = best_parameters['solver'], random_state = 42, C = best_parameters['C'], penalty = best_parameters['penalty'], max_iter = best_parameters['max_iter'])
rsc_lr4_1.fit(X_train, y_train)
y_predict_rsc_lr4_1 = rsc_lr4_1.predict(X_test)

print("classification_report:\n", classification_report(y_test, y_predict_rsc_lr4_1, labels = [1, 0]))
print("The ROC AUC score:", roc_auc_score(y_test, y_predict_rsc_lr4_1))

Best parameters from RandomizedSearchCV on Logistic Regression: {'solver': 'liblinear', 'penalty': 'l1', 'max_iter': 50.0, 'C': 2.223}
Best Score from RandomizedSearchCV on:  0.8916569499077156
classification_report:
               precision    recall  f1-score   support

           1       0.53      0.67      0.59       123
           0       0.88      0.81      0.85       386

    accuracy                           0.78       509
   macro avg       0.71      0.74      0.72       509
weighted avg       0.80      0.78      0.79       509

The ROC AUC score: 0.74006908462867


**With application of GridSearchCV and RandomSearchCV, we obtained the maximum roc_auc score of 0.769.**   

# Select Best K features and train model again with diffrent number of features values

In [19]:
# let us try for one k, i.e k = 90
from sklearn.feature_selection import SelectKBest, f_classif, chi2
features_selector = SelectKBest(score_func = f_classif, k = 90)
ordered = features_selector.fit_transform(X_train, y_train)
print(ordered.shape)
a = features_selector.get_support()
print(a)
features = X_train.columns
print(features)
b = features[a].tolist()
print('List of 3 most influential features on the target (high to low):', b)

(1824, 90)
[ True False  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True False False False  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True False  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True]
Index(['InscClaimAmtReimbursed_x', 'DeductibleAmtPaid_x',
       'InpatientClaimPeriod', 'DurationInHospital', 'NoOfMonths_PartACov_x',
       'NoOfMonths_PartBCov_x', 'IPAnnualReimbursementAmt_x',
       'IPAnnualDeductibleAmt_x', 'OPAnnualReimbursementAmt_x',
       'OPAnnualDeductibleAmt_x', 'AttendingPhysician_x',
       'OperatingPhysician_x', 'OtherPhysician_x', 'DiagnosisGroupCode',
       'ClmDiagno

  f = msb / msw


In [20]:
# With 90 most important features, apply GridsearchCV on Logistic Regression
lr5 = LogisticRegression(solver = 'liblinear', random_state = 42)

parameters = {'penalty': ['l1', 'l2'],
             'C': [0.001, 0.01, 0.1, 1.0, 10],
             'solver': ['liblinear'],
              'max_iter':[100, 200] 
             }
gsc_lr5 = GridSearchCV(lr5, param_grid = parameters, scoring = {'f1_score': 'f1', 'roc_auc_score': 'roc_auc'}, refit = 'roc_auc_score', cv = 3, n_jobs = -1)
gsc_lr5.fit(X_train[b], y_train)


best_parameters = gsc_lr5.best_params_
best_score = gsc_lr5.best_score_

print("Best parameters from GridSearchCV on Logistic Regression:", best_parameters)
print("Best Score from Grid Search CV on: ", best_score)

gsc_lr5_1 = LogisticRegression(solver = best_parameters['solver'], random_state = 42, C = best_parameters['C'], penalty = best_parameters['penalty'], max_iter = best_parameters['max_iter'])
gsc_lr5_1.fit(X_train[b], y_train)

y_predict_gsc_lr5_1 = gsc_lr5_1.predict(X_test[b])

print("classification_report:\n", classification_report(y_test, y_predict_gsc_lr5_1, labels = [1, 0]))
print("The ROC AUC score:", roc_auc_score(y_test, y_predict_gsc_lr5_1))


Best parameters from GridSearchCV on Logistic Regression: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best Score from Grid Search CV on:  0.896583564173592
classification_report:
               precision    recall  f1-score   support

           1       0.57      0.69      0.62       123
           0       0.89      0.83      0.86       386

    accuracy                           0.80       509
   macro avg       0.73      0.76      0.74       509
weighted avg       0.82      0.80      0.81       509

The ROC AUC score: 0.7626269008804076


In [21]:
# Apply Randomized search on Logistic Regression

lr6 = LogisticRegression(solver = 'liblinear', random_state = 42)

parameters = {'penalty': ['l1', 'l2'],
             'C': np.linspace(0.001, 10, 10),
             'solver': ['liblinear'],
              'max_iter':np.linspace(50, 200, 3)}
rsc_lr6 = RandomizedSearchCV(lr6, param_distributions = parameters, scoring = 'roc_auc', random_state = 42, n_jobs = -1)
rsc_lr6.fit(X_train[b], y_train)


best_parameters = rsc_lr6.best_params_
best_score = rsc_lr6.best_score_

print("Best parameters from GridSearchCV on Logistic Regression:", best_parameters)
print("Best Score from Grid Search CV on: ", best_score)
print("\n")

rsc_lr6_1 = LogisticRegression(solver = best_parameters['solver'], random_state = 42, C = best_parameters['C'], penalty = best_parameters['penalty'], max_iter = best_parameters['max_iter'])
rsc_lr6_1.fit(X_train[b], y_train)
y_predict_rsc_lr6_1 = rsc_lr6_1.predict(X_test[b])

print("classification_report:\n", classification_report(y_test, y_predict_rsc_lr6_1, labels = [1, 0]))
print("The ROC AUC score:", roc_auc_score(y_test, y_predict_rsc_lr6_1))

Best parameters from GridSearchCV on Logistic Regression: {'solver': 'liblinear', 'penalty': 'l1', 'max_iter': 50.0, 'C': 2.223}
Best Score from Grid Search CV on:  0.8917350139202057


classification_report:
               precision    recall  f1-score   support

           1       0.53      0.67      0.59       123
           0       0.88      0.81      0.85       386

    accuracy                           0.78       509
   macro avg       0.71      0.74      0.72       509
weighted avg       0.80      0.78      0.79       509

The ROC AUC score: 0.74006908462867


In [22]:
# Write a function to carry out the SelectKBest, Logistic regression, GridSearchCV
def Best_features_LogisticRegression(X_tr, y_tr, X_te, y_te, K):
    for k in K:
        features_selector = SelectKBest(score_func = f_classif, k = k)
        ordered = features_selector.fit_transform(X_tr, y_tr)
        a = features_selector.get_support()
        features = X_tr.columns
        b = features[a].tolist()

        lr7 = LogisticRegression(solver = 'liblinear', random_state = 42)
        parameters = {'penalty': ['l1', 'l2'],
             'C': [0.001, 0.01, 0.1, 1.0, 10],
             'solver': ['liblinear'],
              'max_iter':[100, 200]}
        gbc_pipe = GridSearchCV(lr7, param_grid = parameters, scoring = {'f1_score': 'f1', 'roc_auc_score': 'roc_auc'}, refit = 'roc_auc_score', cv = 3, n_jobs = -1)      
        gbc_pipe.fit(X_tr[b], y_tr)
        best_parameters = gbc_pipe.best_params_
        best_score = gbc_pipe.best_score_

        print("K = ", k)
        print("Best parameters from GridSearchCV on Logistic Regression:", best_parameters)
        print("Best Score from Grid Search CV on: ", best_score)
        print("\n")

        rsc_lr7_1 = LogisticRegression(solver = best_parameters['solver'], random_state = 42, C = best_parameters['C'], penalty = best_parameters['penalty'], max_iter = best_parameters['max_iter'])
        rsc_lr7_1.fit(X_tr[b], y_tr)
        y_predict_rsc_lr7_1 = rsc_lr7_1.predict(X_te[b])

        print("classification_report:\n", classification_report(y_te, y_predict_rsc_lr7_1, labels = [1, 0]))
        print("The ROC AUC score:", roc_auc_score(y_te, y_predict_rsc_lr7_1))
    return print("End")

In [23]:
print(Best_features_LogisticRegression(X_train, y_train, X_test, y_test, [50, 60, 70, 75, 80, 85, 90, 95]))

  f = msb / msw


K =  50
Best parameters from GridSearchCV on Logistic Regression: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best Score from Grid Search CV on:  0.8946827389196675


classification_report:
               precision    recall  f1-score   support

           1       0.56      0.69      0.62       123
           0       0.89      0.83      0.86       386

    accuracy                           0.79       509
   macro avg       0.73      0.76      0.74       509
weighted avg       0.81      0.79      0.80       509

The ROC AUC score: 0.7587408905177134


  f = msb / msw


K =  60
Best parameters from GridSearchCV on Logistic Regression: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best Score from Grid Search CV on:  0.8937233090951061


classification_report:
               precision    recall  f1-score   support

           1       0.56      0.68      0.62       123
           0       0.89      0.83      0.86       386

    accuracy                           0.79       509
   macro avg       0.73      0.76      0.74       509
weighted avg       0.81      0.79      0.80       509

The ROC AUC score: 0.7559711866548718


  f = msb / msw


K =  70
Best parameters from GridSearchCV on Logistic Regression: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best Score from Grid Search CV on:  0.8956602031394274


classification_report:
               precision    recall  f1-score   support

           1       0.57      0.69      0.62       123
           0       0.89      0.83      0.86       386

    accuracy                           0.80       509
   macro avg       0.73      0.76      0.74       509
weighted avg       0.82      0.80      0.80       509

The ROC AUC score: 0.761331564092843


  f = msb / msw


K =  75
Best parameters from GridSearchCV on Logistic Regression: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best Score from Grid Search CV on:  0.8957864439058172


classification_report:
               precision    recall  f1-score   support

           1       0.57      0.68      0.62       123
           0       0.89      0.83      0.86       386

    accuracy                           0.80       509
   macro avg       0.73      0.76      0.74       509
weighted avg       0.81      0.80      0.80       509

The ROC AUC score: 0.7585618602300013


  f = msb / msw


K =  80
Best parameters from GridSearchCV on Logistic Regression: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best Score from Grid Search CV on:  0.895800871421976


classification_report:
               precision    recall  f1-score   support

           1       0.57      0.68      0.62       123
           0       0.89      0.84      0.86       386

    accuracy                           0.80       509
   macro avg       0.73      0.76      0.74       509
weighted avg       0.81      0.80      0.81       509

The ROC AUC score: 0.7598571970175662


  f = msb / msw


K =  85
Best parameters from GridSearchCV on Logistic Regression: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best Score from Grid Search CV on:  0.896583564173592


classification_report:
               precision    recall  f1-score   support

           1       0.57      0.69      0.62       123
           0       0.89      0.83      0.86       386

    accuracy                           0.80       509
   macro avg       0.73      0.76      0.74       509
weighted avg       0.82      0.80      0.81       509

The ROC AUC score: 0.7626269008804076


  f = msb / msw


K =  90
Best parameters from GridSearchCV on Logistic Regression: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best Score from Grid Search CV on:  0.896583564173592


classification_report:
               precision    recall  f1-score   support

           1       0.57      0.69      0.62       123
           0       0.89      0.83      0.86       386

    accuracy                           0.80       509
   macro avg       0.73      0.76      0.74       509
weighted avg       0.82      0.80      0.81       509

The ROC AUC score: 0.7626269008804076


  f = msb / msw


K =  95
Best parameters from GridSearchCV on Logistic Regression: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best Score from Grid Search CV on:  0.896583564173592


classification_report:
               precision    recall  f1-score   support

           1       0.57      0.69      0.62       123
           0       0.89      0.83      0.86       386

    accuracy                           0.80       509
   macro avg       0.73      0.76      0.74       509
weighted avg       0.82      0.80      0.81       509

The ROC AUC score: 0.7626269008804076
End
None


**Among all the features, the best roc_auc_score obtained with k = 70.**

Now let us see applying the above features to Random Forest Classifier and  Gradient Boosting Classifier. 

In [24]:
features_selector = SelectKBest(score_func = f_classif, k = 70)
ordered = features_selector.fit_transform(X_train, y_train)
#print(ordered.shape)
a = features_selector.get_support()
#print(a)
features = X_train.columns
#print(features)
b = features[a].tolist()

RFC3 = RandomForestClassifier(random_state = 42)
RFC3.fit(X_train[b], y_train)
y_predict_RFC3 = RFC3.predict(X_test[b])
print("classification_report:\n", classification_report(y_test, y_predict_RFC3, labels = [1, 0]))
print("The ROC AUC score:", roc_auc_score(y_test, y_predict_RFC3))

  f = msb / msw


classification_report:
               precision    recall  f1-score   support

           1       0.58      0.56      0.57       123
           0       0.86      0.87      0.87       386

    accuracy                           0.80       509
   macro avg       0.72      0.72      0.72       509
weighted avg       0.79      0.80      0.79       509

The ROC AUC score: 0.7157209654998105


In [25]:
features_selector = SelectKBest(score_func = f_classif, k = 70)
ordered = features_selector.fit_transform(X_train, y_train)
a = features_selector.get_support()
features = X_train.columns
b = features[a].tolist()
print(b[0], b[26:29], b[69:75])

InscClaimAmtReimbursed_x ['ChronicCond_Diabetes_x', 'ChronicCond_IschemicHeart_x', 'ChronicCond_Osteoporasis_x'] ['ChronicCond_stroke_y']


  f = msb / msw


# Apply PCA

In [26]:
from sklearn.decomposition import PCA

pca = PCA(random_state = 42)
pca1 = pca.fit(X_train)
pca1_train = pca1.transform(X_train)
pca1_test = pca1.transform(X_test)
print(pca1.explained_variance_ratio_)

lr_pca = LogisticRegression(random_state = 42)
lr_pca.fit(pca1_train, y_train)
y_predict_pca1 = lr_pca.predict(pca1_test)
print("classification_report:\n", classification_report(y_test, y_predict_pca1, labels = [1, 0]))
print("The ROC AUC score:", roc_auc_score(y_test, y_predict_pca1))

[3.99990774e-01 1.69289790e-01 4.38049300e-02 3.22223688e-02
 2.38183576e-02 2.16646214e-02 1.67300268e-02 1.59665870e-02
 1.51744680e-02 1.41792510e-02 1.31724278e-02 1.24646577e-02
 1.16322939e-02 1.11792936e-02 1.08574220e-02 1.01433881e-02
 9.64594560e-03 9.57512314e-03 9.29333408e-03 9.00985671e-03
 8.64369651e-03 8.03858907e-03 7.95500739e-03 7.79034283e-03
 7.45721086e-03 7.38644062e-03 7.19822386e-03 6.81121282e-03
 6.64349734e-03 6.45044766e-03 5.92483707e-03 5.72502687e-03
 5.71297762e-03 5.49936526e-03 4.89542822e-03 4.53058090e-03
 4.13468110e-03 3.78155148e-03 3.49409741e-03 3.44972950e-03
 3.33858360e-03 3.06524987e-03 2.64189822e-03 2.19606200e-03
 2.05193088e-03 1.91570007e-03 1.64339128e-03 1.48453573e-03
 3.16500499e-04 8.28646855e-06 7.88842710e-32 2.38520797e-33
 2.38520797e-33 2.38520797e-33 2.38520797e-33 2.38520797e-33
 2.38520797e-33 2.38520797e-33 2.38520797e-33 2.38520797e-33
 2.38520797e-33 2.38520797e-33 2.38520797e-33 2.38520797e-33
 2.38520797e-33 2.385207

In [27]:
pca = PCA(n_components = 5, random_state = 42)
pca1 = pca.fit(X_train)
pca1_train = pca1.transform(X_train)
pca1_test = pca1.transform(X_test)
print(pca1.explained_variance_ratio_)

lr_pca = LogisticRegression(random_state = 42)
lr_pca.fit(pca1_train, y_train)
y_predict_pca1 = lr_pca.predict(pca1_test)
print("classification_report:\n", classification_report(y_test, y_predict_pca1, labels = [1, 0]))
print("The ROC AUC score:", roc_auc_score(y_test, y_predict_pca1))

[0.39999077 0.16928979 0.04380493 0.03222237 0.02381827]
classification_report:
               precision    recall  f1-score   support

           1       0.56      0.67      0.61       123
           0       0.89      0.83      0.86       386

    accuracy                           0.79       509
   macro avg       0.72      0.75      0.73       509
weighted avg       0.81      0.79      0.80       509

The ROC AUC score: 0.7504317789291882


In [28]:
pca = PCA(n_components = 10, random_state = 42)
pca1 = pca.fit(X_train)
pca1_train = pca1.transform(X_train)
pca1_test = pca1.transform(X_test)
print(pca1.explained_variance_ratio_)

lr_pca = LogisticRegression(random_state = 42)
lr_pca.fit(pca1_train, y_train)
y_predict_pca1 = lr_pca.predict(pca1_test)
print("classification_report:\n", classification_report(y_test, y_predict_pca1, labels = [1, 0]))
print("The ROC AUC score:", roc_auc_score(y_test, y_predict_pca1))

[0.39999077 0.16928979 0.0438049  0.0322223  0.02381794 0.0216593
 0.01671741 0.01594594 0.01515799 0.01413419]
classification_report:
               precision    recall  f1-score   support

           1       0.57      0.68      0.62       123
           0       0.89      0.84      0.86       386

    accuracy                           0.80       509
   macro avg       0.73      0.76      0.74       509
weighted avg       0.81      0.80      0.81       509

The ROC AUC score: 0.7598571970175662


# PCA with best parameters of logistic regression

In [29]:
pca = PCA(n_components = 10, random_state = 42)
pca1 = pca.fit(X_train)
pca1_train = pca1.transform(X_train)
pca1_test = pca1.transform(X_test)
print(pca1.explained_variance_ratio_)
lr_best_pca = LogisticRegression(C = 0.01, max_iter= 100, penalty= 'l2', solver = 'liblinear', random_state = 42) # These hyperparameters was obtained earlier during opmization of logistic regression
lr_best_pca.fit(pca1_train, y_train)
y_predict_lr_best_pca = lr_best_pca.predict(pca1_test)

print("classification_report:\n", classification_report(y_test, y_predict_lr_best_pca, labels = [1, 0]))
print("The ROC AUC score:", roc_auc_score(y_test, y_predict_lr_best_pca))

[0.39999077 0.16928979 0.0438049  0.0322223  0.02381794 0.0216593
 0.01671741 0.01594594 0.01515799 0.01413419]
classification_report:
               precision    recall  f1-score   support

           1       0.59      0.66      0.62       123
           0       0.89      0.85      0.87       386

    accuracy                           0.81       509
   macro avg       0.74      0.76      0.74       509
weighted avg       0.81      0.81      0.81       509

The ROC AUC score: 0.7554340957917351


**Among all with PCA application, 0.765 is the highest roc_auc_score.**

# Case-2: ML model building (with data where dummie features have been obtained for categorical features) 

In [30]:
X_train2 = pd.read_csv('X_train5.csv', index_col = 0)
y_train2 = pd.read_csv('y_train5.csv', index_col = 0)

X_test2 = pd.read_csv('X_test5.csv', index_col = 0)
y_test2 = pd.read_csv('y_test5.csv', index_col = 0)

In [31]:
X_train2.shape, y_train2.shape, X_test2.shape, y_test2.shape

((1185, 95), (1185, 1), (509, 95), (509, 1))

In [32]:
X_train2.head()

Unnamed: 0,InscClaimAmtReimbursed_x,DeductibleAmtPaid_x,InpatientClaimPeriod,DurationInHospital,NoOfMonths_PartACov_x,NoOfMonths_PartBCov_x,IPAnnualReimbursementAmt_x,IPAnnualDeductibleAmt_x,OPAnnualReimbursementAmt_x,OPAnnualDeductibleAmt_x,...,ChronicCond_Heartfailure_y,ChronicCond_KidneyDisease_y,ChronicCond_Cancer_y,ChronicCond_ObstrPulmonary_y,ChronicCond_Depression_y,ChronicCond_Diabetes_y,ChronicCond_IschemicHeart_y,ChronicCond_Osteoporasis_y,ChronicCond_rheumatoidarthritis_y,ChronicCond_stroke_y
0,0.03529,0.0,0.634834,0.632766,0.215258,-0.648255,-0.037962,-0.016147,-0.242751,-0.867557,...,-0.454031,-0.454031,-0.454031,-0.454031,-0.454031,-0.454031,-0.454031,-0.454031,-0.454031,-0.454031
1,-0.547996,0.0,0.88734,0.885036,0.215258,0.237652,-1.1167,-0.475348,-0.9028,-0.997736,...,-0.032232,-0.032232,-0.032232,-0.032232,-0.032232,-0.032232,-0.032232,-0.032232,-0.032232,-0.032232
2,-0.017145,0.0,-0.822593,-0.823304,0.215258,-1.695236,-0.372273,-0.326406,-0.438602,-0.818562,...,-0.100631,-0.100631,-0.100631,-0.100631,-0.100631,-0.100631,-0.100631,-0.100631,-0.100631,-0.100631
3,0.238384,0.0,-1.056474,-1.056968,0.215258,0.237652,-0.434826,-0.442754,-0.737917,0.944394,...,-0.237431,-0.237431,-0.237431,-0.237431,-0.237431,-0.237431,-0.237431,-0.237431,-0.237431,-0.237431
4,-0.817705,0.0,-0.770619,-0.771379,0.215258,0.237652,-0.205604,0.837067,-0.546073,0.195945,...,-0.94423,-0.94423,-0.94423,-0.94423,-0.94423,-0.94423,-0.94423,-0.94423,-0.94423,-0.94423


In [33]:
y_train2.head()

Unnamed: 0,PotentialFraud
0,1
1,0
2,1
3,0
4,0


In [34]:
y_train2 = y_train2.values.ravel()

# Model building and evaluation

# Logistic Regression

In [35]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver = 'liblinear', random_state = 42) 

# Fit the model
lr.fit(X_train2, y_train2)
# Let us apply this to test set
y_predict = lr.predict(X_test2)

from sklearn.metrics import confusion_matrix, classification_report,roc_auc_score
print("classification_report:\n", classification_report(y_test2, y_predict, labels = [1, 0]))
print("The ROC AUC score:", roc_auc_score(y_test2, y_predict))

classification_report:
               precision    recall  f1-score   support

           1       0.71      0.45      0.55       123
           0       0.84      0.94      0.89       386

    accuracy                           0.82       509
   macro avg       0.78      0.70      0.72       509
weighted avg       0.81      0.82      0.81       509

The ROC AUC score: 0.6950798264459329


# Random Forest Classifier

In [36]:
# Random forest classifier
from sklearn.ensemble import RandomForestClassifier

RF = RandomForestClassifier(random_state = 42)
RF.fit(X_train2, y_train2)
y_predict_RF = RF.predict(X_test2)

print("classification_report:\n", classification_report(y_test2, y_predict_RF, labels = [1, 0]))
print("The ROC AUC score:", roc_auc_score(y_test2, y_predict_RF))

classification_report:
               precision    recall  f1-score   support

           1       0.65      0.37      0.47       123
           0       0.82      0.94      0.88       386

    accuracy                           0.80       509
   macro avg       0.74      0.65      0.67       509
weighted avg       0.78      0.80      0.78       509

The ROC AUC score: 0.6518387463667382


# Gradient Boosting Classifier 

In [37]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(random_state = 42)
gbc.fit(X_train2, y_train2)
y_predict_gbc = gbc.predict(X_test2)
print("classification_report:\n", classification_report(y_test2, y_predict_gbc, labels = [1, 0]))
print("The ROC AUC score:", roc_auc_score(y_test2, y_predict_gbc))

classification_report:
               precision    recall  f1-score   support

           1       0.72      0.47      0.57       123
           0       0.85      0.94      0.89       386

    accuracy                           0.83       509
   macro avg       0.78      0.71      0.73       509
weighted avg       0.82      0.83      0.81       509

The ROC AUC score: 0.7059796116095877


# Support Vector Classifier

In [38]:
from sklearn.svm import SVC
svc = SVC(random_state = 42)
svc.fit(X_train2, y_train2)
y_predict_svc = svc.predict(X_test2)
print("classification_report:\n", classification_report(y_test2, y_predict_svc, labels = [1, 0]))
print("The ROC AUC score:", roc_auc_score(y_test2, y_predict_svc))

classification_report:
               precision    recall  f1-score   support

           1       0.71      0.41      0.52       123
           0       0.83      0.95      0.89       386

    accuracy                           0.82       509
   macro avg       0.77      0.68      0.70       509
weighted avg       0.80      0.82      0.80       509

The ROC AUC score: 0.6773452967690299


# SMORT application

In [39]:
oversample = SMOTE()
counter = Counter(y_train2)
print(counter)
X_train2, y_train2 = oversample.fit_resample(X_train2, y_train2)
counter = Counter(y_train2)
print(counter)

Counter({0: 912, 1: 273})
Counter({1: 912, 0: 912})


In [40]:
# Apply Logistic regression
lr2 = LogisticRegression(solver = 'liblinear', random_state = 42) 
lr2.fit(X_train2, y_train2)
y_predict_lr2 = lr2.predict(X_test2)
print("classification_report:\n", classification_report(y_test2, y_predict_lr2, labels = [1, 0]))
print("The ROC AUC score:", roc_auc_score(y_test2, y_predict_lr2))

classification_report:
               precision    recall  f1-score   support

           1       0.53      0.68      0.60       123
           0       0.89      0.81      0.85       386

    accuracy                           0.78       509
   macro avg       0.71      0.74      0.72       509
weighted avg       0.80      0.78      0.78       509

The ROC AUC score: 0.7443131555667889


In [41]:
# Random Forest Classifier
RF2 = RandomForestClassifier(random_state = 42)
RF2.fit(X_train2, y_train2)
y_predict_RF2 = RF2.predict(X_test2)
print("classification_report:\n", classification_report(y_test2, y_predict_RF2, labels = [1, 0]))
print("The ROC AUC score:", roc_auc_score(y_test2, y_predict_RF2))

classification_report:
               precision    recall  f1-score   support

           1       0.61      0.50      0.55       123
           0       0.85      0.90      0.87       386

    accuracy                           0.80       509
   macro avg       0.73      0.70      0.71       509
weighted avg       0.79      0.80      0.79       509

The ROC AUC score: 0.7002190488226124


In [42]:
# Apply Gradient Boosting Classifier
gbc2 = GradientBoostingClassifier(random_state = 42)
gbc2.fit(X_train2, y_train2)
y_predict_gbc2 = gbc.predict(X_test2)
print("classification_report:\n", classification_report(y_test2, y_predict_gbc2, labels = [1, 0]))
print("The ROC AUC score:", roc_auc_score(y_test2, y_predict_gbc2))

classification_report:
               precision    recall  f1-score   support

           1       0.72      0.47      0.57       123
           0       0.85      0.94      0.89       386

    accuracy                           0.83       509
   macro avg       0.78      0.71      0.73       509
weighted avg       0.82      0.83      0.81       509

The ROC AUC score: 0.7059796116095877


In [43]:
# Apply Support Vector Classifier
svc2 = SVC(random_state = 42)
svc2.fit(X_train2, y_train2)
y_predict_svc2 = svc.predict(X_test2)
print("classification_report:\n", classification_report(y_test2, y_predict_svc2, labels = [1, 0]))
print("The ROC AUC score:", roc_auc_score(y_test2, y_predict_svc2))

classification_report:
               precision    recall  f1-score   support

           1       0.71      0.41      0.52       123
           0       0.83      0.95      0.89       386

    accuracy                           0.82       509
   macro avg       0.77      0.68      0.70       509
weighted avg       0.80      0.82      0.80       509

The ROC AUC score: 0.6773452967690299


# Apply GridsearchCV and RandomSearchCV on Logistic Regression

In [44]:
from sklearn.model_selection import GridSearchCV

parameters = {'penalty': ['l1', 'l2'],
             'C': [0.001, 0.01, 0.1, 1.0, 10],
             'solver': ['liblinear'],
              'max_iter':[100, 200] 
             }
lr3 = LogisticRegression(random_state = 42)
gsc_lr3 = GridSearchCV(lr3, param_grid = parameters, scoring = {'f1_score': 'f1', 'roc_auc_score': 'roc_auc'}, refit = 'roc_auc_score', cv = 3, n_jobs = -1)
# To use two evaluation scoring metrics, I need some help, I got some answer, but not satisfied
#gsc_lr3 = GridSearchCV(lr3, param_grid = parameters, scoring = {'f1_score': 'f1', 'roc_auc_score'= 'roc_auc'}, refit: 'roc_auc', cv = 3)
gsc_lr3.fit(X_train2, y_train2)

best_parameters = gsc_lr3.best_params_
best_score = gsc_lr3.best_score_
print("Best parameters from GridSearchCV on Logistic Regression:", best_parameters)
print("Best Score from Grid Search CV on: ", best_score)

gsc_lr3_1 = LogisticRegression(solver = best_parameters['solver'], random_state = 42, C = best_parameters['C'], penalty = best_parameters['penalty'], max_iter = best_parameters['max_iter'])
gsc_lr3_1.fit(X_train2, y_train2)
y_predict_gsc_lr3_1 = gsc_lr3_1.predict(X_test2)

print("Classification_report from best Logistic regression model obtained from GridSeachCV:\n", classification_report(y_test2, y_predict_gsc_lr3_1, labels = [1, 0]))
print("The ROC AUC score from best Logistic regression model obtained from GridSeachCV:", roc_auc_score(y_test2, y_predict_gsc_lr3_1))

Best parameters from GridSearchCV on Logistic Regression: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best Score from Grid Search CV on:  0.9004970279316713
Classification_report from best Logistic regression model obtained from GridSeachCV:
               precision    recall  f1-score   support

           1       0.57      0.69      0.62       123
           0       0.89      0.83      0.86       386

    accuracy                           0.80       509
   macro avg       0.73      0.76      0.74       509
weighted avg       0.82      0.80      0.80       509

The ROC AUC score from best Logistic regression model obtained from GridSeachCV: 0.761331564092843


In [45]:
# Apply Randomized search on Logistic Regression
from sklearn.model_selection import RandomizedSearchCV
lr4 = LogisticRegression(solver = 'liblinear', random_state = 42)

parameters = {'penalty': ['l1', 'l2'],
             'C': np.linspace(0.001, 10, 10),
             'solver': ['liblinear'],
              'max_iter':np.linspace(50, 200, 3)}
rsc_lr4 = RandomizedSearchCV(lr4, param_distributions = parameters, scoring = 'roc_auc', random_state = 42, n_jobs = -1)
rsc_lr4.fit(X_train2, y_train2)


best_parameters = rsc_lr4.best_params_
best_score = rsc_lr4.best_score_

print("Best parameters from RandomizedSearchCV on Logistic Regression:", best_parameters)
print("Best Score from RandomizedSearchCV on: ", best_score)

rsc_lr4_1 = LogisticRegression(solver = best_parameters['solver'], random_state = 42, C = best_parameters['C'], penalty = best_parameters['penalty'], max_iter = best_parameters['max_iter'])
rsc_lr4_1.fit(X_train2, y_train2)
y_predict_rsc_lr4_1 = rsc_lr4_1.predict(X_test2)

print("classification_report:\n", classification_report(y_test2, y_predict_rsc_lr4_1, labels = [1, 0]))
print("The ROC AUC score:", roc_auc_score(y_test2, y_predict_rsc_lr4_1))

Best parameters from RandomizedSearchCV on Logistic Regression: {'solver': 'liblinear', 'penalty': 'l1', 'max_iter': 50.0, 'C': 2.223}
Best Score from RandomizedSearchCV on:  0.8954977917056823
classification_report:
               precision    recall  f1-score   support

           1       0.53      0.68      0.60       123
           0       0.89      0.81      0.85       386

    accuracy                           0.78       509
   macro avg       0.71      0.75      0.72       509
weighted avg       0.80      0.78      0.79       509

The ROC AUC score: 0.7456084923543537


In [46]:
print(Best_features_LogisticRegression(X_train2, y_train2, X_test2, y_test2, [50, 60, 70, 75, 80, 85, 90, 95]))

  f = msb / msw


K =  50
Best parameters from GridSearchCV on Logistic Regression: {'C': 0.1, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Best Score from Grid Search CV on:  0.8995267774699908


classification_report:
               precision    recall  f1-score   support

           1       0.56      0.67      0.61       123
           0       0.89      0.83      0.86       386

    accuracy                           0.79       509
   macro avg       0.72      0.75      0.73       509
weighted avg       0.81      0.79      0.80       509

The ROC AUC score: 0.7519061460044653


  f = msb / msw


K =  60
Best parameters from GridSearchCV on Logistic Regression: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best Score from Grid Search CV on:  0.8976584141274238


classification_report:
               precision    recall  f1-score   support

           1       0.56      0.68      0.62       123
           0       0.89      0.83      0.86       386

    accuracy                           0.79       509
   macro avg       0.73      0.76      0.74       509
weighted avg       0.81      0.79      0.80       509

The ROC AUC score: 0.7559711866548718


  f = msb / msw


K =  70
Best parameters from GridSearchCV on Logistic Regression: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best Score from Grid Search CV on:  0.8989929593721145


classification_report:
               precision    recall  f1-score   support

           1       0.56      0.71      0.63       123
           0       0.90      0.83      0.86       386

    accuracy                           0.80       509
   macro avg       0.73      0.77      0.74       509
weighted avg       0.82      0.80      0.80       509

The ROC AUC score: 0.7668709718185265


  f = msb / msw


K =  75
Best parameters from GridSearchCV on Logistic Regression: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best Score from Grid Search CV on:  0.8991877308402585


classification_report:
               precision    recall  f1-score   support

           1       0.56      0.71      0.63       123
           0       0.90      0.83      0.86       386

    accuracy                           0.80       509
   macro avg       0.73      0.77      0.74       509
weighted avg       0.82      0.80      0.80       509

The ROC AUC score: 0.7668709718185265


  f = msb / msw


K =  80
Best parameters from GridSearchCV on Logistic Regression: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best Score from Grid Search CV on:  0.8993608610341645


classification_report:
               precision    recall  f1-score   support

           1       0.56      0.70      0.62       123
           0       0.90      0.83      0.86       386

    accuracy                           0.80       509
   macro avg       0.73      0.76      0.74       509
weighted avg       0.82      0.80      0.80       509

The ROC AUC score: 0.76280593116812


  f = msb / msw


K =  85
Best parameters from GridSearchCV on Logistic Regression: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best Score from Grid Search CV on:  0.9004970279316713


classification_report:
               precision    recall  f1-score   support

           1       0.57      0.69      0.62       123
           0       0.89      0.83      0.86       386

    accuracy                           0.80       509
   macro avg       0.73      0.76      0.74       509
weighted avg       0.82      0.80      0.80       509

The ROC AUC score: 0.761331564092843


  f = msb / msw


K =  90
Best parameters from GridSearchCV on Logistic Regression: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best Score from Grid Search CV on:  0.9004970279316713


classification_report:
               precision    recall  f1-score   support

           1       0.57      0.69      0.62       123
           0       0.89      0.83      0.86       386

    accuracy                           0.80       509
   macro avg       0.73      0.76      0.74       509
weighted avg       0.82      0.80      0.80       509

The ROC AUC score: 0.761331564092843


  f = msb / msw


K =  95
Best parameters from GridSearchCV on Logistic Regression: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best Score from Grid Search CV on:  0.9004970279316713


classification_report:
               precision    recall  f1-score   support

           1       0.57      0.69      0.62       123
           0       0.89      0.83      0.86       386

    accuracy                           0.80       509
   macro avg       0.73      0.76      0.74       509
weighted avg       0.82      0.80      0.80       509

The ROC AUC score: 0.761331564092843
End
None


**With application of GridSearchCV and RandomSearchCV, we obtained the maximum roc_auc score of 0.76.**  

# Apply SelectKBest to see the most influencial features on target

In [47]:
# let us try for one k, i.e k = 70
from sklearn.feature_selection import SelectKBest, f_classif, chi2
features_selector = SelectKBest(score_func = f_classif, k = 70)
ordered = features_selector.fit_transform(X_train2, y_train2)
print(ordered.shape)
a = features_selector.get_support()
#print(a)
features = X_train2.columns
#print(features)
b = features[a].tolist()
print('List of 3 most influential features on the target (high to low):', b)

(1824, 70)
List of 3 most influential features on the target (high to low): ['InscClaimAmtReimbursed_x', 'InpatientClaimPeriod', 'DurationInHospital', 'NoOfMonths_PartBCov_x', 'IPAnnualReimbursementAmt_x', 'IPAnnualDeductibleAmt_x', 'OPAnnualReimbursementAmt_x', 'OPAnnualDeductibleAmt_x', 'AttendingPhysician_x', 'OperatingPhysician_x', 'DiagnosisGroupCode', 'ClmDiagnosisCode_10_x', 'ClmProcedureCode_2_x', 'ClmProcedureCode_3_x', 'BeneID_x', 'ClaimID_x', 'Gender_x', 'Race_x', 'RenalDiseaseIndicator_x', 'State_x', 'County_x', 'ChronicCond_Alzheimer_x', 'ChronicCond_Heartfailure_x', 'ChronicCond_KidneyDisease_x', 'ChronicCond_Cancer_x', 'ChronicCond_ObstrPulmonary_x', 'ChronicCond_Depression_x', 'ChronicCond_Diabetes_x', 'ChronicCond_IschemicHeart_x', 'ChronicCond_Osteoporasis_x', 'ChronicCond_rheumatoidarthritis_x', 'ChronicCond_stroke_x', 'InscClaimAmtReimbursed_y', 'DeductibleAmtPaid_y', 'OutpatientClaimPeriod', 'IPAnnualReimbursementAmt_y', 'IPAnnualDeductibleAmt_y', 'OPAnnualReimburs

  f = msb / msw


# Apply PCA

In [48]:
from sklearn.decomposition import PCA
pca = PCA(random_state = 42)
pca1 = pca.fit(X_train2)
pca1_train = pca1.transform(X_train2)
pca1_test = pca1.transform(X_test2)
print(pca1.explained_variance_ratio_[:10])

lr_pca = LogisticRegression(random_state = 42)
lr_pca.fit(pca1_train, y_train2)
y_predict_pca1 = lr_pca.predict(pca1_test)
print("classification_report:\n", classification_report(y_test2, y_predict_pca1, labels = [1, 0]))
print("The ROC AUC score:", roc_auc_score(y_test2, y_predict_pca1))

[0.39756592 0.16744801 0.04422981 0.0322966  0.02450746 0.02189038
 0.01696319 0.01611208 0.01521016 0.0139954 ]
classification_report:
               precision    recall  f1-score   support

           1       0.53      0.68      0.60       123
           0       0.89      0.81      0.85       386

    accuracy                           0.78       509
   macro avg       0.71      0.74      0.72       509
weighted avg       0.80      0.78      0.78       509



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


The ROC AUC score: 0.7443131555667889


In [49]:
pca5 = PCA(n_components = 5, random_state = 42)
pca5 = pca5.fit(X_train2)
pca5_train = pca5.transform(X_train2)
pca5_test = pca5.transform(X_test2)
print(pca5.explained_variance_ratio_[:10])

lr_pca5 = LogisticRegression(random_state = 42)
lr_pca5.fit(pca5_train, y_train2)
y_predict_pca5 = lr_pca5.predict(pca5_test)
print("classification_report:\n", classification_report(y_test2, y_predict_pca5, labels = [1, 0]))
print("The ROC AUC score:", roc_auc_score(y_test2, y_predict_pca5))

[0.39756592 0.16744801 0.04422981 0.0322966  0.02450737]
classification_report:
               precision    recall  f1-score   support

           1       0.57      0.67      0.62       123
           0       0.89      0.84      0.86       386

    accuracy                           0.80       509
   macro avg       0.73      0.76      0.74       509
weighted avg       0.81      0.80      0.80       509

The ROC AUC score: 0.7570874931547242


In [50]:
from sklearn.decomposition import PCA
pca10 = PCA(n_components = 10, random_state = 42)
pca10 = pca10.fit(X_train2)
pca10_train = pca10.transform(X_train2)
pca10_test = pca10.transform(X_test2)
print(pca10.explained_variance_ratio_[:10])

lr_pca10 = LogisticRegression(random_state = 42)
lr_pca10.fit(pca10_train, y_train2)
y_predict_pca10 = lr_pca10.predict(pca10_test)
print("classification_report:\n", classification_report(y_test2, y_predict_pca10, labels = [1, 0]))
print("The ROC AUC score:", roc_auc_score(y_test2, y_predict_pca10))

[0.39756592 0.16744801 0.04422979 0.03229653 0.02450712 0.02188634
 0.01695093 0.01608477 0.01519478 0.01393139]
classification_report:
               precision    recall  f1-score   support

           1       0.58      0.67      0.62       123
           0       0.89      0.84      0.87       386

    accuracy                           0.80       509
   macro avg       0.73      0.76      0.74       509
weighted avg       0.81      0.80      0.81       509

The ROC AUC score: 0.758382829942289


# Summary:

In both cases, the performance of the Logistic Regression was highest and the roc_auc_score was also found to be same. <br>In Case-1 (roc_auc_score): 0.76 <br>case-2(roc_auc_score): 0.76