In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('diabetic_data.csv', na_values = '?')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


# Data Cleaning

### There are rows that are the same patient

In [4]:
df.shape

(101766, 50)

In [5]:
df['patient_nbr'].nunique()

71518

### Missing variables

In [6]:
print("Column-wise missing value percentage")
a = df.isnull().sum()/len(df)*100
a = a[a!=0]
a.sort_values(ascending=False)

Column-wise missing value percentage


weight               96.858479
medical_specialty    49.082208
payer_code           39.557416
race                  2.233555
diag_3                1.398306
diag_2                0.351787
diag_1                0.020636
dtype: float64

### Can we fill in anything? Race?

In [7]:
race_missing = df[df['race'].isna()]
race_not_missing = df[df['race'].notna()]

In [8]:
#Not worth it to filling missing race
sum(race_missing['patient_nbr'].apply(lambda x: True if (x in race_not_missing['patient_nbr']) else False))

2

### Drop weight (too many missing), medical_specialty & payer code (no way we can impute this)

In [9]:
df_clean = df.copy()
df_clean.drop(['weight','medical_specialty','payer_code'],axis=1,inplace = True)

### Drop NAs in other columns

In [10]:
df_clean.dropna(inplace = True)

### No missing values now

In [11]:
print("Column-wise missing value percentage")
a = df_clean.isnull().sum()/len(df_clean)*100
a = a[a!=0]
a.sort_values(ascending=False)

Column-wise missing value percentage


Series([], dtype: float64)

### Still a good number of rows left

In [12]:
df.shape

(101766, 50)

In [13]:
df_clean.shape

(98053, 47)

# Pipeline!

In [14]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score

In [15]:
X = df_clean.drop(['readmitted','encounter_id','patient_nbr'], axis=1)
#We just want a binary prediction of whether or not the patient gets readmitted
y = np.where(df_clean['readmitted']=='NO',0,1)

### Pretty balanced dataset

In [16]:
pd.Series(y).value_counts()

0    52338
1    45715
dtype: int64

In [17]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=20)

In [18]:
numeric_features_raw = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features_raw = X.select_dtypes(include=['object']).columns

In [19]:
numeric_features_raw

Index(['admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'number_diagnoses'],
      dtype='object')

In [20]:
#hand pick some numeric_features, we know the _id variables are not numeric
numeric_features = [
       'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'number_diagnoses']

In [21]:
categorical_features_raw

Index(['race', 'gender', 'age', 'diag_1', 'diag_2', 'diag_3', 'max_glu_serum',
       'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton',
       'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed'],
      dtype='object')

In [22]:
#Add the _id variables to categorical list
categorical_features = ['admission_type_id', 'discharge_disposition_id', 'admission_source_id',
        'race', 'gender', 'age', 'diag_1', 'diag_2', 'diag_3', 'max_glu_serum',
       'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton',
       'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed']

In [23]:
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

In [24]:
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [25]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [26]:
from sklearn.ensemble import RandomForestClassifier
rf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier())])

In [27]:
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


In [28]:
accuracy_score(y_test,y_pred)

0.6047626332160522

### categrocial feature names

In [29]:
hot_coded_cat_names = rf.named_steps['preprocessor'].transformers_[1][1]\
   .named_steps['onehot'].get_feature_names(categorical_features)

In [30]:
feature_names = list(hot_coded_cat_names)+numeric_features

In [31]:
len(feature_names)

2321

In [32]:
len(rf.steps[1][1].feature_importances_)

2321

### Feature importances

In [33]:
feature_importances = pd.DataFrame(rf.steps[1][1].feature_importances_,
                                   index = feature_names,
                                    columns=['importance']).sort_values('importance', ascending=False)
feature_importances.head(20)

Unnamed: 0,importance
admission_type_id_2,0.049886
admission_type_id_4,0.047109
admission_type_id_7,0.039269
admission_type_id_1,0.03623
admission_type_id_8,0.028374
admission_type_id_3,0.025574
admission_type_id_5,0.015085
admission_type_id_6,0.012422
age_[60-70),0.010798
age_[50-60),0.01066
