In [3]:
import matplotlib.pyplot as plt
from sklearn import linear_model
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_extraction import DictVectorizer
import pandas as pd
import eli5
from eli5.sklearn import PermutationImportance

%matplotlib inline

In [4]:
## Load the data set that got dummified to process feature selection.
train = pd.read_csv('./df_train_cleaned.csv')
test = pd.read_csv('./df_test_cleaned.csv')
print(train.shape)
print(test.shape)

(89407, 48)
(9933, 48)


In [5]:
train.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,...,metformin-pioglitazone,change,diabetesMed,readmitted,level1_diag1,level1_diag2,level1_diag3,med_dosage_change,number_of_medicine,random
0,7733208,3291489,1,0,65,1,1,7,2,51,...,0,0,0,0,4.0,7.0,0.0,0,0,782
1,152449578,84529188,1,1,45,1,2,7,3,86,...,0,0,0,0,0.0,1.0,0.0,0,0,609
2,440311646,121372727,1,1,45,1,3,7,13,88,...,0,1,1,0,6.0,6.0,6.0,1,1,560
3,106684962,24066279,1,0,85,5,3,17,4,18,...,0,1,1,0,2.0,0.0,0.0,0,3,-710
4,139779162,86645961,2,1,65,1,4,1,3,22,...,0,1,1,0,1.0,7.0,1.0,0,3,568


In [6]:
test.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,...,metformin-pioglitazone,change,diabetesMed,readmitted,level1_diag1,level1_diag2,level1_diag3,med_dosage_change,number_of_medicine,random
0,110939484,19274094,1,0,75,1,1,6,11,68,...,0,0,1,0,4.0,7.0,0.0,0,1,263
1,170328306,65634327,1,1,55,1,1,1,1,20,...,0,0,1,0,0.0,1.0,0.0,0,1,-712
2,245688426,100657359,1,0,65,3,6,1,4,21,...,0,0,1,0,6.0,6.0,6.0,0,1,-508
3,150826224,83144448,1,1,35,2,1,1,12,28,...,0,0,1,0,2.0,0.0,0.0,0,1,70
4,135993852,65234214,2,0,65,1,2,7,1,21,...,0,0,1,1,1.0,7.0,1.0,0,1,200


In [5]:
# machine learning
import warnings

from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import log_loss, accuracy_score, mean_squared_error, r2_score, precision_score, recall_score, roc_auc_score

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
# from boruta import BorutaPy
from xgboost import XGBClassifier, XGBRanker
from scipy.stats import skew

print(__doc__)

# Definitions
pd.set_option('display.float_format', lambda x: '%.3f' % x)
%matplotlib inline
warnings.filterwarnings('ignore')

Automatically created module for IPython interactive environment


In [7]:
# Create X and y for train and test sets 
y = train['readmitted']
X = train.drop('readmitted', axis=1)


y_test = test['readmitted']
X_test = test.drop('readmitted', axis=1)


# Check the shape
print(X.shape)
print(y.shape)
print(X_test.shape)
print(y_test.shape)

(89407, 47)
(89407,)
(9933, 47)
(9933,)


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=0)
logreg = LogisticRegression(fit_intercept=True, penalty='l1')
print("Cross Validation Score: {:.2%}".format(np.mean(cross_val_score(logreg, X_train, y_train, cv=10))))
logreg.fit(X_train, y_train)
print("Dev Set score: {:.2%}".format(logreg.score(X_valid, y_valid)))

Cross Validation Score: 88.69%
Dev Set score: 88.30%


In [12]:
y_valid_predict = logreg.predict(X_valid)
y_valid_predict

array([0, 0, 0, ..., 0, 0, 0])

In [13]:
pd.crosstab(pd.Series(y_valid, name = 'Actual'), pd.Series(y_valid_predict, name = 'Predict'), margins = True)

Predict,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,3079,9,3088
1,431,2,433
All,3510,11,3521


In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
print("Accuracy is {0:.2f}".format(accuracy_score(y_valid, y_valid_predict)))
print("Precision is {0:.2f}".format(precision_score(y_valid, y_valid_predict)))
print("Recall is {0:.2f}".format(recall_score(y_valid, y_valid_predict)))
print("AUC is {0:.2f}".format(roc_auc_score(y_valid, y_valid_predict)))

Accuracy is 0.88
Precision is 0.49
Recall is 0.02
AUC is 0.51


In [15]:
import statsmodels.api as sm
logit = sm.Logit(y_train, X_train)

result = logit.fit()
from scipy import stats

print(result.summary())

         Current function value: 0.342944
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:             readmitted   No. Observations:                71525
Model:                          Logit   Df Residuals:                    71478
Method:                           MLE   Df Model:                           46
Date:                Tue, 03 Dec 2019   Pseudo R-squ.:                 0.02823
Time:                        16:19:12   Log-Likelihood:                -24529.
converged:                      False   LL-Null:                       -25242.
Covariance Type:            nonrobust   LLR p-value:                1.963e-268
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
encounter_id             -1.317e-09    1.5e-10     -8.784      0.000   -1.61e-09   -1.02e-09
patient_nbr            



- From the result above, data donot balance. Need to apply data balancing to the calculation.

## Applying SMOTE method to balance the dataset

In [8]:
# Data balancing applied using SMOTE
from imblearn.over_sampling import SMOTE

from collections import Counter
print('Original dataset shape {}'.format(Counter(y)))
sm = SMOTE(random_state=20)
X_new, y_new = sm.fit_sample(X, y)
print('New dataset shape {}'.format(Counter(y_new)))

Original dataset shape Counter({0: 79227, 1: 10180})
New dataset shape Counter({0: 79227, 1: 79227})


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [9]:
X_new = pd.DataFrame(X_new, columns = list(X.columns))

In [10]:
X_new

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,...,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,level1_diag1,level1_diag2,level1_diag3,med_dosage_change,number_of_medicine,random
0,7.733208e+06,3.291489e+06,1.000000,0.000000,65.000000,1.000000,1.000000,7.000000,2.000000,51.000000,...,0.0,0.0,0.000000,0.000000,4.000000,7.000000,0.000000,0.000000,0.000000,782.000000
1,1.524496e+08,8.452919e+07,1.000000,1.000000,45.000000,1.000000,2.000000,7.000000,3.000000,86.000000,...,0.0,0.0,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,609.000000
2,4.403116e+08,1.213727e+08,1.000000,1.000000,45.000000,1.000000,3.000000,7.000000,13.000000,88.000000,...,0.0,0.0,1.000000,1.000000,6.000000,6.000000,6.000000,1.000000,1.000000,560.000000
3,1.066850e+08,2.406628e+07,1.000000,0.000000,85.000000,5.000000,3.000000,17.000000,4.000000,18.000000,...,0.0,0.0,1.000000,1.000000,2.000000,0.000000,0.000000,0.000000,3.000000,-710.000000
4,1.397792e+08,8.664596e+07,2.000000,1.000000,65.000000,1.000000,4.000000,1.000000,3.000000,22.000000,...,0.0,0.0,1.000000,1.000000,1.000000,7.000000,1.000000,0.000000,3.000000,568.000000
5,2.232780e+08,5.955926e+07,1.000000,1.000000,75.000000,3.000000,22.000000,1.000000,5.000000,68.000000,...,0.0,0.0,1.000000,1.000000,8.000000,4.000000,0.000000,0.000000,2.000000,-779.000000
6,1.270057e+08,7.203434e+07,1.000000,1.000000,85.000000,3.000000,1.000000,1.000000,4.000000,66.000000,...,0.0,0.0,1.000000,1.000000,3.000000,4.000000,1.000000,0.000000,2.000000,872.000000
7,3.732046e+07,2.356080e+07,1.000000,0.000000,65.000000,5.000000,3.000000,17.000000,6.000000,17.000000,...,0.0,0.0,1.000000,1.000000,5.000000,0.000000,0.000000,1.000000,1.000000,-537.000000
8,1.440333e+08,6.382269e+06,2.000000,1.000000,55.000000,1.000000,1.000000,7.000000,2.000000,43.000000,...,0.0,0.0,1.000000,1.000000,2.000000,1.000000,0.000000,0.000000,2.000000,466.000000
9,1.660080e+08,8.944564e+07,1.000000,1.000000,75.000000,2.000000,1.000000,1.000000,2.000000,52.000000,...,0.0,0.0,0.000000,1.000000,4.000000,1.000000,0.000000,0.000000,1.000000,952.000000


In [16]:
X_new_train, X_new_valid, y_new_train, y_new_valid = train_test_split(X_new, y_new, test_size=0.20, random_state=0)


In [14]:
logreg = LogisticRegression(fit_intercept=True, penalty='l1')


In [None]:
print("Cross Validation Score: {:.2%}".format(np.mean(cross_val_score(logreg, X_new_train, y_new_train, cv=10))))


In [None]:
logreg.fit(X_new_train, y_new_train)




In [None]:
print("Dev Set score: {:.2%}".format(logreg.score(X_valid_new, y_valid_new)))


In [None]:
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import cross_val_score
# logreg = LogisticRegression(fit_intercept=True, penalty='l1')
# print("Cross Validation Score: {:.2%}".format(np.mean(cross_val_score(logreg, X_train_new, y_train_new, cv=10))))
# logreg.fit(X_train_new, y_train_new)
# print("Dev Set score: {:.2%}".format(logreg.score(X_valid_new, y_valid_new)))


In [44]:
Y_dev_predict = logreg.predict(X_dev)

In [45]:
pd.crosstab(pd.Series(Y_dev, name = 'Actual'), pd.Series(Y_dev_predict, name = 'Predict'), margins = True)

Predict,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,6372,4164,10536
1,3905,6562,10467
All,10277,10726,21003


In [46]:
print("Accuracy is {0:.2f}".format(accuracy_score(Y_dev, Y_dev_predict)))
print("Precision is {0:.2f}".format(precision_score(Y_dev, Y_dev_predict)))
print("Recall is {0:.2f}".format(recall_score(Y_dev, Y_dev_predict)))
print("AUC is {0:.2f}".format(roc_auc_score(Y_dev, Y_dev_predict)))

accuracy_logreg = accuracy_score(Y_dev, Y_dev_predict)
precision_logreg = precision_score(Y_dev, Y_dev_predict)
recall_logreg = recall_score(Y_dev, Y_dev_predict)
auc_logreg = roc_auc_score(Y_dev, Y_dev_predict)

Accuracy is 0.62
Precision is 0.61
Recall is 0.63
AUC is 0.62
