In [1]:
# pip install eli5

In [35]:
import matplotlib.pyplot as plt
from sklearn import linear_model
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_extraction import DictVectorizer
import pandas as pd
import eli5
from eli5.sklearn import PermutationImportance

%matplotlib inline

In [23]:
## Load the data set that got dummified to process feature selection.
train = pd.read_csv('df_train_preprocessed.csv')
test = pd.read_csv('df_test_preprocessed.csv')
print(train.shape)
print(test.shape)

(56442, 175)
(4938, 175)


In [24]:
train.head()

Unnamed: 0,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_2,diag_3,...,medical_specialty_Surgery-Colon&Rectal,medical_specialty_Surgery-General,medical_specialty_Surgery-Maxillofacial,medical_specialty_Surgery-Neuro,medical_specialty_Surgery-Pediatric,medical_specialty_Surgery-Plastic,medical_specialty_Surgery-Thoracic,medical_specialty_Surgery-Vascular,medical_specialty_SurgicalSpecialty,medical_specialty_Urology
0,65,-0.779,0.404,0.914,-0.576,-0.272,-0.191,-0.403,3.0,4.0,...,0,0,0,0,0,0,0,0,0,0
1,45,-0.442,2.171,-0.234,-0.091,0.609,-0.191,0.803,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,45,2.934,2.272,2.063,2.213,-0.272,-0.191,-0.403,1.0,2.0,...,0,0,0,0,0,0,0,0,0,0
3,65,-0.442,-1.06,-0.808,-0.576,0.609,-0.191,2.008,2.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,75,0.233,1.262,-0.234,1.122,-0.272,1.327,-0.403,7.0,1.0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
test.head()

Unnamed: 0,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_2,diag_3,...,medical_specialty_Surgery-Colon&Rectal,medical_specialty_Surgery-General,medical_specialty_Surgery-Maxillofacial,medical_specialty_Surgery-Neuro,medical_specialty_Surgery-Pediatric,medical_specialty_Surgery-Plastic,medical_specialty_Surgery-Thoracic,medical_specialty_Surgery-Vascular,medical_specialty_SurgicalSpecialty,medical_specialty_Urology
0,75,2.259,1.262,-0.808,0.516,-0.272,-0.191,-0.403,7.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,55,-1.117,-1.161,-0.808,-1.06,-0.272,-0.191,-0.403,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,85,0.908,-0.051,0.34,1.364,-0.272,-0.191,-0.403,4.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,75,0.571,-0.0,0.34,0.152,-0.272,-0.191,-0.403,4.0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,55,-0.442,1.01,1.489,0.152,-0.272,-0.191,-0.403,4.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
# machine learning
import warnings

from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import log_loss, accuracy_score, mean_squared_error, r2_score, precision_score, recall_score, roc_auc_score

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
# from boruta import BorutaPy
from xgboost import XGBClassifier, XGBRanker
from scipy.stats import skew

print(__doc__)

# Definitions
pd.set_option('display.float_format', lambda x: '%.3f' % x)
%matplotlib inline
warnings.filterwarnings('ignore')

Automatically created module for IPython interactive environment


In [34]:
# Create X and y for train and test sets 
X = train.drop('readmitted', axis=1)
y = train['readmitted']

X_test = test.drop('readmitted', axis=1)
y_test = test['readmitted']


# Check the shape
print(X.shape)
print(y.shape)
print(X_test.shape)
print(y_test.shape)

(56442, 174)
(56442,)
(4938, 174)
(4938,)


In [36]:
# split train data set in train and test sets for model training
X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.20, random_state=42)

In [37]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
X_train, X_dev, Y_train, Y_dev = train_test_split(X, y, test_size=0.20, random_state=0)
logreg = LogisticRegression(fit_intercept=True, penalty='l1')
print("Cross Validation Score: {:.2%}".format(np.mean(cross_val_score(logreg, X_train, Y_train, cv=10))))
logreg.fit(X_train, Y_train)
print("Dev Set score: {:.2%}".format(logreg.score(X_dev, Y_dev)))

Cross Validation Score: 93.02%
Dev Set score: 93.06%


In [38]:
Y_dev_predict = logreg.predict(X_dev)

In [39]:
pd.crosstab(pd.Series(Y_dev, name = 'Actual'), pd.Series(Y_dev_predict, name = 'Predict'), margins = True)

Predict,0,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2037,2037
1,196,196
All,2233,2233


In [40]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
print("Accuracy is {0:.2f}".format(accuracy_score(Y_dev, Y_dev_predict)))
print("Precision is {0:.2f}".format(precision_score(Y_dev, Y_dev_predict)))
print("Recall is {0:.2f}".format(recall_score(Y_dev, Y_dev_predict)))
print("AUC is {0:.2f}".format(roc_auc_score(Y_dev, Y_dev_predict)))

Accuracy is 0.93
Precision is 0.00
Recall is 0.00
AUC is 0.50


In [None]:
# import statsmodels.api as sm
# logit = sm.Logit(Y_train, X_train)

# result = logit.fit()

# # from scipy import stats

# print(result.summary())

- From the result above, data donot balance. Need to apply data balancing to the calculation.

## Applying SMOTE method to balance the dataset

In [41]:
# Data balancing applied using SMOTE
from imblearn.over_sampling import SMOTE

from collections import Counter
print('Original dataset shape {}'.format(Counter(y)))
sm = SMOTE(random_state=20)
X_new, y_new = sm.fit_sample(X, y)
print('New dataset shape {}'.format(Counter(y_new)))

Original dataset shape Counter({0: 52507, 1: 3935})
New dataset shape Counter({0: 52507, 1: 52507})


In [42]:
X_new = pd.DataFrame(X_new, columns = list(X.columns))

In [43]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
X_train, X_dev, Y_train, Y_dev = train_test_split(X_new, y_new, test_size=0.20, random_state=0)
logreg = LogisticRegression(fit_intercept=True, penalty='l1')
print("Cross Validation Score: {:.2%}".format(np.mean(cross_val_score(logreg, X_train, Y_train, cv=10))))
logreg.fit(X_train, Y_train)
print("Dev Set score: {:.2%}".format(logreg.score(X_dev, Y_dev)))

Cross Validation Score: 61.61%
Dev Set score: 61.58%


In [44]:
Y_dev_predict = logreg.predict(X_dev)

In [45]:
pd.crosstab(pd.Series(Y_dev, name = 'Actual'), pd.Series(Y_dev_predict, name = 'Predict'), margins = True)

Predict,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,6372,4164,10536
1,3905,6562,10467
All,10277,10726,21003


In [46]:
print("Accuracy is {0:.2f}".format(accuracy_score(Y_dev, Y_dev_predict)))
print("Precision is {0:.2f}".format(precision_score(Y_dev, Y_dev_predict)))
print("Recall is {0:.2f}".format(recall_score(Y_dev, Y_dev_predict)))
print("AUC is {0:.2f}".format(roc_auc_score(Y_dev, Y_dev_predict)))

accuracy_logreg = accuracy_score(Y_dev, Y_dev_predict)
precision_logreg = precision_score(Y_dev, Y_dev_predict)
recall_logreg = recall_score(Y_dev, Y_dev_predict)
auc_logreg = roc_auc_score(Y_dev, Y_dev_predict)

Accuracy is 0.62
Precision is 0.61
Recall is 0.63
AUC is 0.62
