In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

diabetic = pd.read_csv('diabetic_data.csv')

In [2]:
diabetic_df = diabetic.replace('?', np.nan)
diabetic_df = diabetic_df.sort_values('encounter_id')
diabetic_df.drop_duplicates(subset = ['patient_nbr'], keep = 'first', inplace = True)
print(diabetic_df.shape)
diabetic_df = diabetic_df[~diabetic_df['discharge_disposition_id'].isin([11,13,14,19,20,21])]
print(diabetic_df.shape)
diabetic_df.drop(diabetic_df.loc[diabetic_df.gender=='Unknown/Invalid'].index,inplace=True)
print(diabetic_df.shape)

(71518, 50)
(69973, 50)
(69970, 50)


In [3]:
diabetic_df['readmit_30d'] = diabetic_df.readmitted.apply(lambda x: 1 if x=='<30' else 0)
diabetic_df['discharge_1'] = diabetic_df.discharge_disposition_id.apply(lambda x: 1 if x==1 else 0)
diabetic_df['discharge_5'] = diabetic_df.discharge_disposition_id.apply(lambda x: 1 if x==5 else 0)
diabetic_df['discharge_22'] = diabetic_df.discharge_disposition_id.apply(lambda x: 1 if x==22 else 0)

In [4]:
diabetic_df['age_num'] = diabetic_df['age'].replace(
    ['[0-10)','[10-20)','[20-30)','[30-40)','[40-50)','[50-60)','[60-70)','[70-80)','[80-90)','[90-100)'],
    [5,15,25,35,45,55,65,75,85,95])

In [5]:
diabetic_df = diabetic_df[['age_num','discharge_1','discharge_5','discharge_22','number_emergency','number_inpatient','readmit_30d']]

In [158]:
diabetic_df.reset_index(drop=True,inplace=True)
df = diabetic_df
df.tail()

Unnamed: 0,age_num,discharge_1,discharge_5,discharge_22,number_emergency,number_inpatient,readmit_30d
69965,75,1,0,0,0,0,0
69966,45,1,0,0,1,0,0
69967,65,1,0,0,1,1,0
69968,85,1,0,0,1,0,0
69969,75,1,0,0,0,0,0


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = df.drop(columns='readmit_30d')
y = df.readmit_30d
scaler_X = StandardScaler().fit(X)
scaled_X = scaler_X.transform(X)
scaled_X = pd.DataFrame(scaled_X,index=X.index)

X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.2, random_state=42, stratify=y)

In [162]:
y_test

62518    0
19093    0
33160    0
69543    0
47767    0
        ..
45576    0
64936    0
69129    0
1314     0
61439    0
Name: readmit_30d, Length: 13994, dtype: int64

In [8]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score

vals = np.concatenate([np.linspace(100,10,10)[:-1],\
                       np.linspace(10,1,10)[:-1],\
                       np.linspace(1,.1,10)[:-1],\
                       np.linspace(.1,.01,10)[:-1],\
                       np.linspace(.01,.001,10)[:-1],\
                       np.linspace(.001,.0001,10)[:-1],\
                       np.linspace(.0001,.00001,10)[:-1],
                       np.linspace(.00001,.000001,10)])
roc_scorer = make_scorer(roc_auc_score)

selector = LogisticRegressionCV(Cs=vals,scoring=roc_scorer,cv=10,penalty='l2',solver='liblinear',\
                                class_weight='balanced',random_state=5,n_jobs=-2)                 
selector.fit(X_train,y_train)


LogisticRegressionCV(Cs=array([1.e+02, 9.e+01, 8.e+01, 7.e+01, 6.e+01, 5.e+01, 4.e+01, 3.e+01,
       2.e+01, 1.e+01, 9.e+00, 8.e+00, 7.e+00, 6.e+00, 5.e+00, 4.e+00,
       3.e+00, 2.e+00, 1.e+00, 9.e-01, 8.e-01, 7.e-01, 6.e-01, 5.e-01,
       4.e-01, 3.e-01, 2.e-01, 1.e-01, 9.e-02, 8.e-02, 7.e-02, 6.e-02,
       5.e-02, 4.e-02, 3.e-02, 2.e-02, 1.e-02, 9.e-03, 8.e-03, 7.e-03,
       6.e-03, 5.e-03, 4.e-03, 3.e-03, 2.e-03, 1.e-...
       8.e-05, 7.e-05, 6.e-05, 5.e-05, 4.e-05, 3.e-05, 2.e-05, 1.e-05,
       9.e-06, 8.e-06, 7.e-06, 6.e-06, 5.e-06, 4.e-06, 3.e-06, 2.e-06,
       1.e-06]),
                     class_weight='balanced', cv=10, dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=100, multi_class='warn', n_jobs=-2, penalty='l2',
                     random_state=5, refit=True,
                     scoring=make_scorer(roc_auc_score), solver='liblinear',
                     tol=0.0001, verbose=0)

In [9]:
pd.set_option('display.max_rows', None)
features = pd.DataFrame(selector.scores_[1])
features.columns = vals
pd.concat([features.apply(lambda x: x.mean(),axis=0),features.apply(lambda x: x.std(),axis=0)],axis=1)

Unnamed: 0,0,1
100.0,0.594925,0.012048
90.0,0.594925,0.012048
80.0,0.594925,0.012048
70.0,0.594925,0.012048
60.0,0.594925,0.012048
50.0,0.594925,0.012048
40.0,0.594925,0.012048
30.0,0.594925,0.012048
20.0,0.594925,0.012048
10.0,0.594925,0.012048


In [181]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

estimator = LogisticRegression(C=.000003,penalty='l2',solver='liblinear',class_weight='balanced')
estimator.fit(X_train,y_train)
train_pred= estimator.predict(X_train)
test_pred = estimator.predict(X_test)
print(confusion_matrix(y_train,train_pred))
print(confusion_matrix(y_test,test_pred))
print(roc_auc_score(y_train,train_pred))
print(roc_auc_score(y_test,test_pred))

[[30179 20775]
 [ 1991  3031]]
[[7535 5204]
 [ 509  746]]
0.5979118576852733
0.5929565043069734


In [183]:
estimator.coef_

array([[ 0.00645   , -0.013172  ,  0.0064088 ,  0.01101548,  0.00416523,
         0.01351076]])

In [184]:
estimator.intercept_

array([-0.00024449])

In [186]:
X.columns

Index(['age_num', 'discharge_1', 'discharge_5', 'discharge_22',
       'number_emergency', 'number_inpatient'],
      dtype='object')

In [187]:
predicted_probs = pd.Series(estimator.predict_proba(X_test)[:,1],index=X_test.index)
bins = pd.qcut(predicted_probs.sort_values(),q=10,labels=range(1,11))
results = []
for i in range(1,11):
    results.append(y_test[bins[bins==i].index])

In [194]:
overall = y_test.sum()/len(y_test)
#print([(i.sum())/(len(i)) for i in results])
#print([(i.sum())/(len(i))/overall for i in results])
final = pd.DataFrame([[(i.sum())/(len(i)) for i in results],[(i.sum())/(len(i))/overall for i in results]]).T
final.columns = ['% Rehospitalized','Lift']
final.index = ['Decile 1','Decile 2','Decile 3','Decile 4','Decile 5','Decile 6','Decile 7','Decile 8','Decile 9','Decile 10']
final

Unnamed: 0,% Rehospitalized,Lift
Decile 1,0.048748,0.543573
Decile 2,0.055412,0.617881
Decile 3,0.071467,0.796902
Decile 4,0.069857,0.778947
Decile 5,0.062207,0.69364
Decile 6,0.078528,0.87563
Decile 7,0.100396,1.119479
Decile 8,0.106214,1.184352
Decile 9,0.103837,1.15785
Decile 10,0.204978,2.285628


In [169]:
test = pd.qcut(predicted_probs.sort_values(),q=10,labels=range(1,11))
#test
test[test==8].index
y_test[test[test==1].index].value_counts()

0    1444
1      74
Name: readmit_30d, dtype: int64

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor 
X_vif = pd.DataFrame(X_train, dtype=float)
vif_data = pd.DataFrame() 
vif_data["feature"] = X_vif.columns
vif_data["VIF"] = [variance_inflation_factor(X_vif.values, i) 
                          for i in range(len(X_vif.columns))] 