In [1]:
# import modules
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel, f_classif, mutual_info_classif
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer

### 1. Cleaned Data

#### 1.1. Load data and display setting

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
data = pd.read_csv('data_top.csv')

In [4]:
data.shape

(69970, 44)

## 3. Modeling

### 3.1 Train, Test set split

In [5]:
X = data.drop (['readmit_30d'], axis=1)
y = data['readmit_30d']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)

In [7]:
y_train.value_counts() / len(y_train) * 100

0    91.028298
1     8.971702
Name: readmit_30d, dtype: float64

In [8]:
# ## standard scaler:: 
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# scaler.fit(X_train)
# X_train_scaled = scaler.transform(X_train)
# X_test_scaled = scaler.transform(X_test)

### 3.2 Logistic regression modeling

### 3.2.1 Logistic Regression Model Fitting

In [9]:
logistic = LogisticRegression(solver='liblinear',
                              penalty='l2',
                              C=0.01,
                              random_state=1,
                             class_weight = 'balanced')

logistic.fit(X_train, y_train)

LogisticRegression(C=0.01, class_weight='balanced', random_state=1,
                   solver='liblinear')

In [10]:
from sklearn.linear_model import LogisticRegression
y_pred = logistic.predict(X_test)

In [11]:
logistic.predict_proba(X_test)[:,1]

array([0.70005511, 0.49009429, 0.42729962, ..., 0.45128439, 0.59226766,
       0.45795198])

In [12]:
predicted_probs = pd.Series(logistic.predict_proba(X_test)[:,1],index=X_test.index)
bins = pd.qcut(predicted_probs.sort_values(),q=10,labels=range(1,11))
results = []
for i in range(1,11):
    results.append(y_test[bins[bins==i].index])
overall = y_test.sum()/len(y_test)
print([(i.sum())/(len(i)) for i in results])
print([(i.sum())/(len(i))/overall for i in results])

[0.03857142857142857, 0.06647605432451752, 0.05003573981415296, 0.06285714285714286, 0.07076483202287348, 0.08220157255182273, 0.08357142857142857, 0.1093638313080772, 0.13867047891350964, 0.19428571428571428]
[0.43009447922595334, 0.7412477324440623, 0.5579284007643479, 0.7008947068867388, 0.7890701667952921, 0.9165966583985715, 0.9318713716562322, 1.2194720759563604, 1.546258710689764, 2.1664018212862834]


In [13]:
train_pred= logistic.predict(X_train)
test_pred = logistic.predict(X_test)

print(roc_auc_score(y_train,train_pred))
print(roc_auc_score(y_test,test_pred))

0.6022650942283282
0.6087038610609762


In [14]:
y_proba = logistic.predict_proba(X_test)

In [15]:
y_proba

array([[0.29994489, 0.70005511],
       [0.50990571, 0.49009429],
       [0.57270038, 0.42729962],
       ...,
       [0.54871561, 0.45128439],
       [0.40773234, 0.59226766],
       [0.54204802, 0.45795198]])

In [16]:
y_proba_ = y_proba[:,1]

In [17]:
test = pd.DataFrame({ 'actual' : y_test, 'pred':y_proba_ })

In [18]:
test = test.sort_values('pred')

In [19]:
test['quantile'] = pd.qcut(predicted_probs.sort_values(),q=10,labels=range(1,11))

In [20]:
test

Unnamed: 0,actual,pred,quantile
18640,0,0.222927,1
39396,0,0.231428,1
35384,0,0.235238,1
30987,0,0.243864,1
12638,0,0.244129,1
...,...,...,...
8355,0,0.964788,10
44611,0,0.966060,10
26370,0,0.968043,10
28087,1,0.973587,10


In [21]:
temp = test.groupby('quantile').count()['actual']

In [22]:
temp2 = test.groupby('quantile').sum()['actual'] 

In [23]:
temp3 = test.groupby('quantile').min()['pred']
temp4 = test.groupby('quantile').max()['pred']
temp5 = test.groupby('quantile').mean()['pred']

In [24]:
final = pd.concat([temp,temp2,temp3,temp4,temp5],axis = 1)

In [25]:
final.columns=['unique_patients','actual','min_pred','max_pred','average_pred']

In [26]:
final['pct_disenrolled']=final['actual']/final['unique_patients']

In [27]:
y_test.mean()

0.08968129198227812

In [28]:
final

Unnamed: 0_level_0,unique_patients,actual,min_pred,max_pred,average_pred,pct_disenrolled
quantile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1400,54,0.222927,0.358939,0.327169,0.038571
2,1399,93,0.358957,0.390386,0.375692,0.066476
3,1399,70,0.390435,0.413276,0.401828,0.050036
4,1400,88,0.413276,0.434385,0.424081,0.062857
5,1399,99,0.434395,0.456811,0.445685,0.070765
6,1399,115,0.456836,0.482724,0.469037,0.082202
7,1400,117,0.482736,0.516179,0.498733,0.083571
8,1399,153,0.51625,0.559305,0.536701,0.109364
9,1399,194,0.559336,0.622208,0.587467,0.13867
10,1400,272,0.622219,0.981256,0.698689,0.194286


In [29]:
predicted_probs = pd.Series(logistic.predict_proba(X_test)[:,1],index=X_test.index)
bins = pd.qcut(predicted_probs.sort_values(),q=10,labels=range(1,11))
results = []
for i in range(1,11):
    results.append(y_test[bins[bins==i].index])

In [30]:
overall = y_test.sum()/len(y_test)
overall

0.08968129198227812

In [31]:
print([(i.sum())/(len(i)) for i in results])
print([(i.sum())/(len(i))/overall for i in results])

[0.03857142857142857, 0.06647605432451752, 0.05003573981415296, 0.06285714285714286, 0.07076483202287348, 0.08220157255182273, 0.08357142857142857, 0.1093638313080772, 0.13867047891350964, 0.19428571428571428]
[0.43009447922595334, 0.7412477324440623, 0.5579284007643479, 0.7008947068867388, 0.7890701667952921, 0.9165966583985715, 0.9318713716562322, 1.2194720759563604, 1.546258710689764, 2.1664018212862834]


In [32]:
lift_logistic  =[(i.sum())/(len(i))/overall for i in results]

In [33]:
final['lift'] = lift_logistic 

In [34]:
final

Unnamed: 0_level_0,unique_patients,actual,min_pred,max_pred,average_pred,pct_disenrolled,lift
quantile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1400,54,0.222927,0.358939,0.327169,0.038571,0.430094
2,1399,93,0.358957,0.390386,0.375692,0.066476,0.741248
3,1399,70,0.390435,0.413276,0.401828,0.050036,0.557928
4,1400,88,0.413276,0.434385,0.424081,0.062857,0.700895
5,1399,99,0.434395,0.456811,0.445685,0.070765,0.78907
6,1399,115,0.456836,0.482724,0.469037,0.082202,0.916597
7,1400,117,0.482736,0.516179,0.498733,0.083571,0.931871
8,1399,153,0.51625,0.559305,0.536701,0.109364,1.219472
9,1399,194,0.559336,0.622208,0.587467,0.13867,1.546259
10,1400,272,0.622219,0.981256,0.698689,0.194286,2.166402


### 3.2.2 Dicision Tree decile analysis

In [35]:
X = data.drop (['readmit_30d'], axis=1)
y = data['readmit_30d']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [36]:
tree = DecisionTreeClassifier(criterion='gini',
                              max_depth= 7,
                              min_samples_split = 3,
                              random_state=42,
                              class_weight='balanced')
tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight='balanced', max_depth=7,
                       min_samples_split=3, random_state=42)

In [37]:
from sklearn.linear_model import LogisticRegression
y_pred = tree.predict(X_test)

In [38]:
y_proba = tree.predict_proba(X_test)

In [None]:
predicted_probs = pd.Series(tree.predict_proba(X_test)[:,1],
                            index=X_test.index)
bins = pd.qcut(predicted_probs.sort_values(),q=10,labels=range(1,11))
results = []
for i in range(1,11):
    results.append(y_test[bins[bins==i].index])
overall = y_test.sum()/len(y_test)
print([(i.sum())/(len(i)) for i in results])
print([(i.sum())/(len(i))/overall for i in results])

overall = y_test.sum()/len(y_test)
#print([(i.sum())/(len(i)) for i in results])
#print([(i.sum())/(len(i))/overall for i in results])
final = pd.DataFrame([[(i.sum())/(len(i)) for i in results],[(i.sum())/(len(i))/overall for i in results]]).T
final.columns = ['% Rehospitalized','Lift']
final.index = ['Decile 1','Decile 2','Decile 3','Decile 4','Decile 5','Decile 6','Decile 7','Decile 8','Decile 9','Decile 10']
final