In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from scipy import stats
import statistics
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_squared_error, mean_squared_error, make_scorer
import seaborn as sns
import os
from scipy.stats import skew, probplot, norm
import sklearn.model_selection as ms
from sklearn.ensemble import RandomForestRegressor
from pprint import pprint
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNetCV, LassoCV, Lasso, ElasticNet 
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold 
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from xgboost import XGBRegressor
# from lightgbm import LGBMRegressor

### importing data

In [3]:
train_df = pd.read_csv('./whole_train_cleaned.csv')
test_df = pd.read_csv('./whole_test_cleaned.csv')
merged_df = pd.concat([train_df, test_df], axis=0)

print(f'whole_train_cleaned dataset has {train_df.shape}')
print(f'whole_test_cleaned dataset has {test_df.shape}')
print(f'merged_df dataset has {merged_df.shape}')




whole_train_cleaned dataset has (89390, 94)
whole_test_cleaned dataset has (9930, 94)
merged_df dataset has (99320, 94)


In [60]:
msk = np.random.rand(len(merged_df)) < 0.8

msk

array([ True,  True,  True, ..., False,  True,  True])

In [61]:
new_train_df = merged_df[msk]
new_test_df = merged_df[~msk]

In [58]:
new_train_df, new_test_df = train_test_split(merged_df, test_size=0.10, random_state=123)


In [63]:
print(new_train_df.shape)
print(new_test_df.shape)
19692/79628

(79628, 94)
(19692, 94)


0.2472999447430552

In [65]:
unique_new_train_df = new_train_df.drop_duplicates(subset= ['patient_nbr'], keep = 'first')
unique_new_test_df = new_test_df.drop_duplicates(subset= ['patient_nbr'], keep = 'first')
print(unique_new_train_df.shape)
print(unique_new_test_df.shape)
17783/58867


(58867, 94)
(17783, 94)


0.302087757147468

In [66]:
#function to summarize feature
def summarize_feature(dataframe,feature):
    feature_summary = {'values':dataframe[feature].value_counts().index.tolist(),
                     'counts':dataframe[feature].value_counts().values.tolist(),
                     'frequency': dataframe[feature].value_counts(normalize = True).values.tolist()}
    feature_summary = pd.DataFrame(feature_summary)
    return feature_summary


In [67]:
summarize_feature(new_train_df, 'readmitted')

Unnamed: 0,values,counts,frequency
0,0,70506,0.885442
1,1,9122,0.114558


In [68]:
summarize_feature(new_test_df, 'readmitted')

Unnamed: 0,values,counts,frequency
0,0,17505,0.88894
1,1,2187,0.11106


In [69]:
summarize_feature(unique_new_train_df, 'readmitted')

Unnamed: 0,values,counts,frequency
0,0,54184,0.920448
1,1,4683,0.079552


In [70]:
summarize_feature(unique_new_test_df, 'readmitted')

Unnamed: 0,values,counts,frequency
0,0,16019,0.900804
1,1,1764,0.099196


In [69]:
#dropping the columns encounter_id and patient_nbr
training_df = training_df.drop(['encounter_id','patient_nbr'], axis = 1)
test_df = test_df.drop(['encounter_id','patient_nbr'], axis = 1)


In [4]:
print(training_df.shape)
training_df.head(5)

(89390, 92)


Unnamed: 0,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,readmitted,...,insulin_Up,glyburide-metformin_No,glyburide-metformin_Steady,glyburide-metformin_Up,glipizide-metformin_Steady,glimepiride-pioglitazone_Steady,metformin-rosiglitazone_Steady,metformin-pioglitazone_Steady,change_No,diabetesMed_Yes
0,65,2,51,3,11,0,0,0,4,0,...,0,1,0,0,0,0,0,0,1,0
1,45,3,86,1,15,1,0,1,9,0,...,0,1,0,0,0,0,0,0,1,0
2,45,13,88,5,34,0,0,0,9,0,...,0,1,0,0,0,0,0,0,0,1
3,85,4,18,2,17,3,2,0,9,0,...,0,1,0,0,0,0,0,0,0,1
4,65,3,22,0,11,1,0,2,6,0,...,0,1,0,0,0,0,0,0,0,1


## Using the whole data set

In [5]:
#separate the input feature and outpout feature
X = training_df.loc[:, training_df.columns !='readmitted']
y = training_df[['readmitted']]


### Scaling the data

In [6]:
#scalar fit the data to training inputs (X)
scaler  = StandardScaler()
scaler.fit(X)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [7]:
#save it using a package called pickle
import pickle
scalerfile = 'scaler.sav'
pickle.dump(scaler, open(scalerfile, 'wb'))


In [8]:
# load it back
scaler = pickle.load(open(scalerfile, 'rb'))

In [9]:
#scale the training inputs (X)
X_tf = scaler.transform(X)

## Applying SMOTE method to balance the training dataset

In [10]:
# Data balancing applied using SMOTE
from imblearn.over_sampling import SMOTE

from collections import Counter
print('Original dataset shape {}'.format(Counter(y)))
sm = SMOTE(random_state=20)
X_new, y_new = sm.fit_sample(X_tf, y)
print('New dataset shape {}'.format(Counter(y_new)))

Using TensorFlow backend.


Original dataset shape Counter({'readmitted': 1})
New dataset shape Counter({0: 79213, 1: 79213})


## Split train data into 80% train and 20% validate

In [11]:
# split train data set in train and test sets for model training
X_train, X_valid, y_train, y_valid = train_test_split(X_new, y_new, test_size=0.20, random_state=42)


### Model Selection

In [12]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score
def calc_specificity(y_actual, y_pred, thresh):
    # calculates specificity
    return sum((y_pred < thresh) & (y_actual == 0)) /sum(y_actual ==0)

def print_report(y_actual, y_pred, thresh):
    
    auc = roc_auc_score(y_actual, y_pred)
    accuracy = accuracy_score(y_actual, (y_pred > thresh))
    recall = recall_score(y_actual, (y_pred > thresh))
    precision = precision_score(y_actual, (y_pred > thresh))
    specificity = calc_specificity(y_actual, y_pred, thresh)
    print('AUC:%.3f'%auc)
    print('accuracy:%.3f'%accuracy)
    print('recall:%.3f'%recall)
    print('precision:%.3f'%precision)
    print('specificity:%.3f'%specificity)
    print('prevalence:%.3f'%calc_prevalence(y_actual))
    print(' ')
    return auc, accuracy, recall, precision, specificity

def calc_prevalence(y_actual):
    return (sum(y_actual)/len(y_actual))

In [13]:
# logistic regression
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression(random_state = 42)
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [14]:
thresh=0.5
y_train_preds = lr.predict_proba(X_train)[:,1]
y_valid_preds = lr.predict_proba(X_valid)[:,1]

print('Logistic Regression')
print('Training:')
lr_train_auc, lr_train_accuracy, lr_train_recall, \
    lr_train_precision, lr_train_specificity = print_report(y_train,y_train_preds, thresh)
print('Validation:')
lr_valid_auc, lr_valid_accuracy, lr_valid_recall, \
    lr_valid_precision, lr_valid_specificity = print_report(y_valid,y_valid_preds, thresh)

Logistic Regression
Training:
AUC:0.667
accuracy:0.615
recall:0.566
precision:0.627
specificity:0.665
prevalence:0.499
 
Validation:
AUC:0.669
accuracy:0.618
recall:0.568
precision:0.636
specificity:0.669
prevalence:0.504
 


In [23]:
y_valid_pred = lr.predict(X_valid)

In [29]:
X_train.shape

(126740, 91)

In [24]:
pd.crosstab(pd.Series(y_valid, name = 'Actual'), pd.Series(y_valid_preds, name = 'Predict'), margins = False)


Predict,0.0011363077251614526,0.0011387222594705126,0.06255786410167605,0.07217158987152576,0.08175806376937252,0.0886962125973866,0.09356419296327499,0.09426304771550355,0.09491517005156938,0.10022234619098311,...,0.9868644376044794,0.9868827468339587,0.9870321619402724,0.9871987718813255,0.9873594541686095,0.9886073161852338,0.9901044467451398,0.9910795168014983,0.9920557672681076,0.9947511277563434
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1


## Using the unique data set

In [31]:
#separate the input feature and outpout feature
X = unique_train_df.loc[:, unique_train_df.columns !='readmitted']
y = unique_train_df[['readmitted']]


### Scaling the data

In [32]:
#scalar fit the data to training inputs (X)
scaler  = StandardScaler()
scaler.fit(X)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [33]:
#save it using a package called pickle
import pickle
scalerfile = 'scaler.sav'
pickle.dump(scaler, open(scalerfile, 'wb'))


In [34]:
# load it back
scaler = pickle.load(open(scalerfile, 'rb'))

In [35]:
#scale the training inputs (X)
X_tf = scaler.transform(X)

## Applying SMOTE method to balance the training dataset

In [36]:
# Data balancing applied using SMOTE
from imblearn.over_sampling import SMOTE

from collections import Counter
print('Original dataset shape {}'.format(Counter(y)))
sm = SMOTE(random_state=20)
X_new, y_new = sm.fit_sample(X_tf, y)
print('New dataset shape {}'.format(Counter(y_new)))

Original dataset shape Counter({'readmitted': 1})
New dataset shape Counter({0: 59528, 1: 59528})


## Split train data into 80% train and 20% validate

In [37]:
# split train data set in train and test sets for model training
X_train, X_valid, y_train, y_valid = train_test_split(X_new, y_new, test_size=0.20, random_state=42)


### Model Selection

In [38]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score
def calc_specificity(y_actual, y_pred, thresh):
    # calculates specificity
    return sum((y_pred < thresh) & (y_actual == 0)) /sum(y_actual ==0)

def print_report(y_actual, y_pred, thresh):
    
    auc = roc_auc_score(y_actual, y_pred)
    accuracy = accuracy_score(y_actual, (y_pred > thresh))
    recall = recall_score(y_actual, (y_pred > thresh))
    precision = precision_score(y_actual, (y_pred > thresh))
    specificity = calc_specificity(y_actual, y_pred, thresh)
    print('AUC:%.3f'%auc)
    print('accuracy:%.3f'%accuracy)
    print('recall:%.3f'%recall)
    print('precision:%.3f'%precision)
    print('specificity:%.3f'%specificity)
    print('prevalence:%.3f'%calc_prevalence(y_actual))
    print(' ')
    return auc, accuracy, recall, precision, specificity

def calc_prevalence(y_actual):
    return (sum(y_actual)/len(y_actual))

In [39]:
# logistic regression
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression(random_state = 42)
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [62]:
thresh=0.5
y_train_preds = lr.predict_proba(X_train)[:,1]
y_valid_preds = lr.predict_proba(X_valid)[:,1]
y_test_preds = lr.predict_proba(X_test)[:,1]

print('Logistic Regression')
print('Training:')
lr_train_auc, lr_train_accuracy, lr_train_recall, \
    lr_train_precision, lr_train_specificity = print_report(y_train,y_train_preds, thresh)
print('Validation:')
lr_valid_auc, lr_valid_accuracy, lr_valid_recall, \
    lr_valid_precision, lr_valid_specificity = print_report(y_valid,y_valid_preds, thresh)
print('Test:')
lr_test_auc, lr_test_accuracy, lr_test_recall, \
    lr_test_precision, lr_test_specificity = print_report(y_test,y_test_preds, thresh)

Logistic Regression
Training:
AUC:0.667
accuracy:0.615
recall:0.566
precision:0.627
specificity:0.665
prevalence:0.499
 
Validation:
AUC:0.669
accuracy:0.618
recall:0.568
precision:0.636
specificity:0.669
prevalence:0.504
 
Test:


ValueError: Shape of passed values is (9930, 9930), indices imply (1, 9930)

In [61]:
test_df

X_test = test_df.loc[:, test_df.columns !='readmitted']
y_test = test_df[['readmitted']]

In [63]:
roc_auc_score(y_test, y_test_preds)

0.5912373073867574

In [65]:
accuracy_score(y_test, (y_test_preds > thresh))

0.11399798590130916