preparing final dataset

In [22]:
import pandas as pd
import numpy as np

In [23]:
diabetic = pd.read_csv('../../data/diabetic_data_initial.csv')

In [24]:
diabetic.shape

(101766, 50)

In [25]:
diabetic_df = diabetic.replace('?', np.nan)

In [26]:
diabetic_df = diabetic_df.sort_values('encounter_id')
diabetic_df.drop_duplicates(subset = ['patient_nbr'], keep = 'first', inplace = True)

In [27]:
diabetic_df.shape

(71518, 50)

In [28]:
diabetic_df = diabetic_df[~diabetic_df['discharge_disposition_id'].isin([11,13,14,19,20,21])]

In [29]:
diabetic_df.drop(diabetic_df.loc[diabetic_df.gender=='Unknown/Invalid'].index,inplace=True)

In [30]:
diabetic_df.shape

(69970, 50)

In [31]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

dropping encounter_id, patient_nbr, and columns with high missingness

In [32]:
diabetic_df.drop(['encounter_id','patient_nbr','weight','medical_specialty','payer_code'],\
                 axis = 1, inplace = True)

binarizing target variable

In [33]:
diabetic_df['readmit_30d'] = (diabetic_df['readmitted'] == '<30')

dicretizing id columns

In [34]:
diabetic_df['admission_type_id'] = diabetic_df['admission_type_id'].astype('str')
diabetic_df['admission_source_id'] = diabetic_df['admission_source_id'].astype('str')
diabetic_df['discharge_disposition_id'] = diabetic_df['discharge_disposition_id'].astype('str')

admission_type_id

simplifying age column

In [35]:
# diabetic_df['age_group'] = diabetic_df['age'].replace(
#     ['[0-10)','[10-20)','[20-30)','[30-40)','[40-50)','[50-60)','[60-70)','[70-80)','[80-90)','[90-100)'],
#     ['inf to adole','inf to adole','adult','adult','mid-age','mid-age','senior','senior','senior','senior'])

diabetic_df['age_num'] = diabetic_df['age'].replace(
    ['[0-10)','[10-20)','[20-30)','[30-40)','[40-50)','[50-60)','[60-70)','[70-80)','[80-90)','[90-100)'],
    [5,15,25,35,45,55,65,75,85,95])

removing sub-ICDs and adding diag-based features

In [36]:
diabetic_df['diag_1'] = diabetic_df['diag_1'].str.split('.', expand = True).drop(1, axis = 1)
diabetic_df['diag_2'] = diabetic_df['diag_2'].str.split('.', expand = True).drop(1, axis = 1)
diabetic_df['diag_3'] = diabetic_df['diag_3'].str.split('.', expand = True).drop(1, axis = 1)

using diagnosis in diag_1 with > 500 samples to create new diagnosis variables

In [73]:
count_1 = diabetic_df.diag_1.value_counts()
index_1 = count_1[count_1>500].index.tolist()

In [74]:
count_2 = diabetic_df.diag_2.value_counts()
index_2 = count_2[count_2>500].index.tolist()

In [75]:
count_3 = diabetic_df.diag_3.value_counts()
index_3 = count_3[count_3>500].index.tolist()

In [78]:
len(set(index_1+index_2+index_3))

45

In [38]:
diag_1_count.index

Index(['250', '414', '428', '786', '410', '486', '427', '715', '434', '682',
       ...
       '963', '955', '299', '143', '318', 'V51', '640', '976', '669', '911'],
      dtype='object', length=661)

In [39]:
for d in top_diag_1:
    diabetic_df[d+'_in_diag'] = ((diabetic_df['diag_1']==d)|
                                (diabetic_df['diag_2']==d)|
                                (diabetic_df['diag_3']==d))

In [40]:
diabetic_df.columns

Index(['race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted',
       'readmit_30d', 'age_num', '250_in_diag', '414_in_diag', '428_in_diag',
       '786_in_diag', '410_in_diag', '486_in_diag', '427_in_diag',
       '715_in_diag', '434_in_

In [52]:
temp = diabetic_df[['250_in_diag', '414_in_diag', '428_in_diag',
       '786_in_diag', '410_in_diag', '486_in_diag', '427_in_diag',
       '715_in_diag', '434_in_diag', '682_in_diag', '780_in_diag',
       '491_in_diag', '276_in_diag', '996_in_diag', '38_in_diag',
       '599_in_diag', '584_in_diag', '820_in_diag', '574_in_diag',
       '435_in_diag', '562_in_diag', '577_in_diag', '493_in_diag',
       '722_in_diag', 'V57_in_diag', '296_in_diag', '433_in_diag',
       '440_in_diag', '518_in_diag', '560_in_diag', 'readmit_30d']]

In [53]:
temp['col']=temp.apply(sum, axis = 1)
temp['col'].value_counts()
temp.groupby('col')['readmit_30d'].mean()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


col
0    0.000000
1    0.025582
2    0.076345
3    0.256572
4    1.000000
Name: readmit_30d, dtype: float64

binarizing drug variables with minority classes > 500 samples

In [None]:
for col in ['metformin','repaglinide','glimepiride','glipizide', 'glyburide','pioglitazone',
            'rosiglitazone','insulin']:
    diabetic_df[col+'_used'] = np.where(diabetic_df[col]=='No', 0, 1)

# 'nateglinide','glyburide-metformin' (close to 500)

(pending) binarizing max_glu_serum and A1Cresult

In [None]:
#diabetic_df['glu_test'] = np.where(diabetic_df['max_glu_serum']=='None', 0, 1)

In [None]:
#diabetic_df['A1C_test'] = np.where(diabetic_df['A1Cresult']=='None', 0, 1)

remove unused columns

In [None]:
diabetic_final = diabetic_df.drop(['age',
                                   'diag_1','diag_2','diag_3',
                                   'metformin','repaglinide','nateglinide','chlorpropamide','glimepiride',
                                   'acetohexamide','glipizide','glyburide','tolbutamide','pioglitazone',
                                   'rosiglitazone','acarbose','miglitol','troglitazone','tolazamide','examide',
                                   'citoglipton','insulin','glyburide-metformin','glipizide-metformin',
                                   'glimepiride-pioglitazone','metformin-rosiglitazone','metformin-pioglitazone',
                                   'readmitted'],
                                 axis = 1)
#'discharge_disposition_id','max_glu_serum','A1Cresult'

In [None]:
diabetic_final.columns

dummification

In [None]:
diabetic_final = pd.get_dummies(diabetic_final, drop_first = True)
len(diabetic_final.columns)

train test split

In [None]:
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV, train_test_split

In [None]:
features = diabetic_final.drop(['readmit_30d'], axis = 1)
target = diabetic_final['readmit_30d']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(features, 
                                                    target, 
                                                    test_size=0.2,
                                                    stratify = target,
                                                    random_state = 42)

logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, roc_auc_score

In [None]:
logit = LogisticRegression(C=1e7, solver='liblinear', class_weight = 'balanced')
logit.fit(X_train, y_train)

In [None]:
# accuracy
print(logit.score(X_train, y_train))
print(logit.score(X_test, y_test))

In [None]:
y_train_pred = logit.predict(X_train)
y_test_pred = logit.predict(X_test)

In [None]:
confusion_matrix(y_train, y_train_pred)

In [None]:
# AUC score
print(roc_auc_score(y_train, y_train_pred))
print(roc_auc_score(y_test, y_test_pred))

In [None]:
# 

checking VIF

In [None]:
X_vif = pd.DataFrame(X_train, dtype=float)

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor 
vif_data = pd.DataFrame() 
vif_data["feature"] = X_vif.columns
vif_data["VIF"] = [variance_inflation_factor(X_vif.values, i) 
                          for i in range(len(X_vif.columns))] 
print(vif_data)

logistic with lasso

In [None]:
logit_l1 = LogisticRegression(solver='liblinear', class_weight = 'balanced', penalty = 'l1')
logit_l1

In [None]:
logit_cv = LogisticRegressionCV(Cs = np.linspace(1e-4, , 20), cv = 3, penalty = 'l1',
                                scoring = 'roc_auc', solver='liblinear')
logit_cv.fit(X_train, y_train)

In [None]:
logit_cv.scores_

In [None]:
logit_cv.coef_[0]

In [None]:
pd.DataFrame({'features': X_train.columns, 'coef': logit_cv.coef_[0]}).sort_values('coef',ascending = False)

In [None]:
from sklearn import metrics
sorted(metrics.SCORERS.keys())

RFE CV

In [None]:
from sklearn.feature_selection import RFECV

In [None]:
logit_rfecv = RFECV(C = estimator=logit, step=2, cv=3, 
                    verbose = 1, scoring='roc_auc')

In [None]:
logit_rfecv.fit(X_train, y_train)

In [None]:
logit_rfecv.grid_scores_

In [None]:
X_train.columns

In [None]:
logit_rfecv.ranking_

model with best params

random forest

In [None]:
from sklearn import ensemble
RFC = ensemble.RandomForestClassifier(oob_score=True, class_weight = 'balanced', min_samples_leaf = 2)

In [None]:
RFC.fit(X_train, y_train)
print(RFC.oob_score_)

In [None]:
y_train_pred = RFC.predict(X_train)
y_test_pred = RFC.predict(X_test)

In [None]:
confusion_matrix(y_train, y_train_pred)

In [None]:
confusion_matrix(y_test, y_test_pred)

In [None]:
# AUC score
print(roc_auc_score(y_train, y_train_pred))
print(roc_auc_score(y_test, y_test_pred))

RF CV

In [None]:
RF_grid_params = [{
#     "n_estimators": [25, 50, 75],
    'max_depth' : [4,5,6,7,8],
    "criterion": ['gini','entropy'],
    "min_samples_leaf": range(1, 5),
    "min_samples_split": np.linspace(start=2, stop=20, num=10, dtype=int),
    "random_state": [108]}]
grid_search_RF = GridSearchCV(RFC, RF_grid_params, scoring='roc_auc', cv=3, n_jobs=-1)
%time grid_search_RF.fit(X_train, y_train)

In [None]:
grid_search_RF.best_params_

In [None]:
RFC_best = ensemble.RandomForestClassifier(oob_score=True, class_weight = 'balanced',
                                           min_samples_leaf = 3,
                                          min_samples_split = 14,
                                          max_depth = 8,
                                          criterion = 'gini')

In [None]:
RFC_best.fit(X_train, y_train)

In [None]:
y_train_pred = RFC_best.predict(X_train)
y_test_pred = RFC_best.predict(X_test)

In [None]:
confusion_matrix(y_train, y_train_pred)

In [None]:
# AUC score
print(roc_auc_score(y_train, y_train_pred))
print(roc_auc_score(y_test, y_test_pred))

In [None]:
pd.DataFrame({'feature':X_train.columns, 'importance':RFC_best.feature_importances_}).\
sort_values('importance',ascending = False)