In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)

diabetic = pd.read_csv('diabetic_data.csv')

In [None]:
pd.set_option('display.max_rows', None)

In [46]:
diabetic_df = diabetic.replace('?', np.nan)
diabetic_df = diabetic_df.sort_values('encounter_id')
diabetic_df.drop_duplicates(subset = ['patient_nbr'], keep = 'first', inplace = True)
print(diabetic_df.shape)
diabetic_df = diabetic_df[~diabetic_df['discharge_disposition_id'].isin([11,13,14,19,20,21])]
print(diabetic_df.shape)
diabetic_df.drop(diabetic_df.loc[diabetic_df.gender=='Unknown/Invalid'].index,inplace=True)
print(diabetic_df.shape)

(71518, 50)
(69973, 50)
(69970, 50)


In [8]:
for i in diabetic.columns:
    print(diabetic[i].value_counts())

96210942     1
89943846     1
384306986    1
94650156     1
83156784     1
            ..
74454612     1
208073976    1
166229592    1
38340702     1
77856768     1
Name: encounter_id, Length: 101766, dtype: int64
88785891     40
43140906     28
23199021     23
1660293      23
88227540     23
             ..
71081460      1
30060018      1
67443444      1
141344240     1
93251151      1
Name: patient_nbr, Length: 71518, dtype: int64
Caucasian          76099
AfricanAmerican    19210
?                   2273
Hispanic            2037
Other               1506
Asian                641
Name: race, dtype: int64
Female             54708
Male               47055
Unknown/Invalid        3
Name: gender, dtype: int64
[70-80)     26068
[60-70)     22483
[50-60)     17256
[80-90)     17197
[40-50)      9685
[30-40)      3775
[90-100)     2793
[20-30)      1657
[10-20)       691
[0-10)        161
Name: age, dtype: int64
?            98569
[75-100)      1336
[50-75)        897
[100-125)      625
[125-1

In [47]:
diabetic_df['readmit_30d'] = diabetic_df.readmitted.apply(lambda x: 1 if x=='<30' else 0)
diabetic_df['discharge_1'] = diabetic_df.discharge_disposition_id.apply(lambda x: 1 if x==1 else 0)
diabetic_df['discharge_5'] = diabetic_df.discharge_disposition_id.apply(lambda x: 1 if x==5 else 0)
diabetic_df['discharge_22'] = diabetic_df.discharge_disposition_id.apply(lambda x: 1 if x==22 else 0)

In [48]:
diabetic_df['age_num'] = diabetic_df['age'].replace(
    ['[0-10)','[10-20)','[20-30)','[30-40)','[40-50)','[50-60)','[60-70)','[70-80)','[80-90)','[90-100)'],
    [5,15,25,35,45,55,65,75,85,95])

### Model with 6 features + num_meds and +num_lab_procedures

Purpose of adding meds and labs is to try to make the probabilities predicted more continuous and therefore get more even numbers of patients into the bins for lift analysis

In [12]:
diabetic_df = diabetic_df[['age_num','discharge_1','discharge_5','discharge_22',
                           'number_emergency','number_inpatient','num_medications',
                           'num_lab_procedures','readmit_30d']]

In [13]:
diabetic_df.reset_index(drop=True,inplace=True)
df = diabetic_df

Unnamed: 0,age_num,discharge_1,discharge_5,discharge_22,number_emergency,number_inpatient,num_medications,num_lab_procedures,readmit_30d
69965,75,1,0,0,0,0,33,50,0
69966,45,1,0,0,1,0,26,73,0
69967,65,1,0,0,1,1,17,46,0
69968,85,1,0,0,1,0,22,76,0
69969,75,1,0,0,0,0,3,13,0


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = df.drop(columns='readmit_30d')
y = df.readmit_30d
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score


estimator = LogisticRegression(C=1e5,penalty='l2',solver='liblinear',\
                                class_weight='balanced',random_state=5)                 
estimator.fit(X_train,y_train)

train_pred= estimator.predict(X_train)
test_pred = estimator.predict(X_test)

print(roc_auc_score(y_train,train_pred))
print(roc_auc_score(y_test,test_pred))

0.6000901368202932
0.5959874764229055


In [20]:
predicted_probs = pd.Series(estimator.predict_proba(X_test)[:,1],index=X_test.index)
bins = pd.qcut(predicted_probs.sort_values(),q=10,labels=range(1,11))
results = []
for i in range(1,11):
    results.append(y_test[bins[bins==i].index])
overall = y_test.sum()/len(y_test)
print([(i.sum())/(len(i)) for i in results])
print([(i.sum())/(len(i))/overall for i in results])

[0.04642857142857143, 0.056428571428571425, 0.045064377682403435, 0.08, 0.07862759113652609, 0.07934238741958542, 0.09571428571428571, 0.10078627591136526, 0.11222301644031452, 0.20214285714285715]
[0.5177063175867958, 0.6292122936824132, 0.5024947420617958, 0.8920478087649403, 0.8767446297725467, 0.8847150354977518, 1.067271485486625, 1.1238272072539008, 1.2513536988571803, 2.254013659647126]


In [31]:
bins.value_counts()

10    1400
7     1400
4     1400
2     1400
1     1400
9     1399
8     1399
6     1399
5     1399
3     1398
dtype: int64

### Simpler old model, now without standard scaler

In [54]:
diabetic_df = diabetic_df[['age_num','discharge_1','discharge_5','discharge_22',
                           'number_emergency','number_inpatient','readmit_30d']]
diabetic_df.reset_index(drop=True,inplace=True)
df = diabetic_df

In [55]:
from sklearn.model_selection import train_test_split

X = df.drop(columns='readmit_30d')
y = df.readmit_30d
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [56]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score


estimator = LogisticRegression(C=1e5,penalty='l2',solver='liblinear',\
                                class_weight='balanced',random_state=5)                 
estimator.fit(X_train,y_train)

train_pred= estimator.predict(X_train)
test_pred = estimator.predict(X_test)

print(roc_auc_score(y_train,train_pred))
print(roc_auc_score(y_test,test_pred))

0.5964200857280679
0.5947201382084505


In [57]:
predicted_probs = pd.Series(estimator.predict_proba(X_test)[:,1],index=X_test.index)
bins = pd.qcut(predicted_probs.sort_values(),q=10,labels=range(1,11))
results = []
for i in range(1,11):
    results.append(y_test[bins[bins==i].index])
overall = y_test.sum()/len(y_test)
print([(i.sum())/(len(i)) for i in results])
print([(i.sum())/(len(i))/overall for i in results])

[0.048748353096179184, 0.055412371134020616, 0.07146724417975095, 0.06972883231876037, 0.06382978723404255, 0.08359133126934984, 0.09811046511627906, 0.1055187637969095, 0.10851063829787234, 0.20610119047619047]
[0.5435732695043279, 0.617881053107159, 0.7969024821126971, 0.7775181509711017, 0.7117402729507502, 0.932093298632097, 1.093990317798573, 1.1765972753577303, 1.2099584640162755, 2.298151441851641]


In [58]:
bins.value_counts()

8     2265
3     1847
4     1807
2     1552
1     1518
7     1376
10    1344
6      969
5      846
9      470
dtype: int64

### Simple model (six features from above) with just num_medications added

In [49]:
diabetic_df = diabetic_df[['age_num','discharge_1','discharge_5','discharge_22',
                           'number_emergency','number_inpatient', 'num_medications',
                           'readmit_30d']]
diabetic_df.reset_index(drop=True,inplace=True)
df = diabetic_df

In [50]:
from sklearn.model_selection import train_test_split

X = df.drop(columns='readmit_30d')
y = df.readmit_30d
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [51]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score


estimator = LogisticRegression(C=1e5,penalty='l2',solver='liblinear',\
                                class_weight='balanced',random_state=5)                 
estimator.fit(X_train,y_train)

train_pred= estimator.predict(X_train)
test_pred = estimator.predict(X_test)

print(roc_auc_score(y_train,train_pred))
print(roc_auc_score(y_test,test_pred))

0.5990748177501273
0.5956266620463746


In [52]:
predicted_probs = pd.Series(estimator.predict_proba(X_test)[:,1],index=X_test.index)
bins = pd.qcut(predicted_probs.sort_values(),q=10,labels=range(1,11))
results = []
for i in range(1,11):
    results.append(y_test[bins[bins==i].index])
overall = y_test.sum()/len(y_test)
print([(i.sum())/(len(i)) for i in results])
print([(i.sum())/(len(i))/overall for i in results])

[0.04722222222222222, 0.05856515373352855, 0.052105638829407566, 0.07645051194539249, 0.07396449704142012, 0.07361516034985423, 0.09843081312410841, 0.11056682995101469, 0.10607168983174835, 0.2]
[0.526555998229305, 0.6530364632247001, 0.58100901177588, 0.8524688957480657, 0.8247483439024966, 0.8208530310245897, 1.0975623895289028, 1.2328862297486052, 1.1827627310800688, 2.230119521912351]


In [53]:
bins.value_counts()

4     1465
1     1440
8     1429
7     1402
3     1401
10    1400
6     1372
9     1367
2     1366
5     1352
dtype: int64