In [169]:
%matplotlib inline
# Required Python Packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

In [212]:
# Reading in data from the 50-50 readmit vs. nonreadmit dataset

df = pd.read_csv('dfd.csv')
df.head();

In [213]:
# Dropping certain columns

# Dropping ID numbers and dates
df = df.drop(columns=['subject_id', 'hadm_id', 'admittime', 'dischtime'])

# Dropping labevents and chartevents values of less importance based on feature selection (2/3 values for each measurement)
df = df.drop(columns=['rdw_min', 'rdw_max', 'hemoglobin_min', 'hemoglobin_max', 'creatinine_median', 'creatinine_min', 
                      'hematocrit_median', 'hematocrit_min', 'tempc_median', 'tempc_max', 'resprate_median', 
                      'resprate_min', 'wbc_median', 'wbc_max', 'inr_min', 'inr_median', 'ptt_median', 'ptt_max', 
                      'lactate_median', 'lactate_max', 'sysbp_median', 'sysbp_min', 'spo2_median', 'spo2_max', 
                      'bilirubin_median', 'bilirubin_max', 'platelet_median', 'platelet_max', 'heartrate_min',
                      'heartrate_median'])
df.head();

In [214]:
# Converting categorical features into dummy variables

df_converted = pd.get_dummies(df)
df_converted.head();

In [173]:
# Splitting dataframe into data (predictors) vs. label (attributed to be predicted)

label_df = df_converted.pop('followed_by_readmit')
data_df = df_converted
names = data_df.columns.values
#print('label_df:\n', label_df.head(), 2*'\n', 'data_df:\n', data_df.head())

In [215]:
# Converting dataframes to NumPy arrays

label = label_df.values
data = data_df.values

In [216]:
# 80/20 train-test split

from sklearn.model_selection import train_test_split

train_data, test_data, train_label, test_label = train_test_split(data, label, train_size=0.8, test_size=0.2, random_state=10)
print('Training data:', train_data.shape, '\tTest data:', test_data.shape)
print('Training labels:', train_label.shape, '\tTest labels:', test_label.shape)

Training data: (4873, 110) 	Test data: (1219, 110)
Training labels: (4873,) 	Test labels: (1219,)


In [217]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
model = RandomForestClassifier(n_estimators = 700,max_depth=6, oob_score = True, n_jobs = -1,random_state =50,  max_features = "auto" , min_samples_leaf = 20)

In [218]:
model.fit(train_data,train_label)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=6, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=20, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=700, n_jobs=-1,
            oob_score=True, random_state=50, verbose=0, warm_start=False)

In [219]:
print(model.oob_score_)

0.629591627334291


In [220]:
# Use the forest's predict method on the test data
predictions = model.predict(test_data)

In [221]:
accuracy_score(test_label, predictions)

0.6570959803117309

In [223]:
model.feature_importances_;

In [144]:
param_grid = { 
    'n_estimators': [200, 300,400,500,700],
    'max_features': ['auto', 'sqrt', 'log2']
}

In [159]:
from sklearn.grid_search import GridSearchCV
CV_rfc = GridSearchCV(estimator=model, param_grid=param_grid, cv= 10)

In [160]:
CV_rfc.fit(train_data, train_label)

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=6, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=20, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=700, n_jobs=-1,
            oob_score=True, random_state=50, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [200, 700], 'max_features': ['auto', 'sqrt', 'log2']},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [161]:
print (CV_rfc.best_params_)

{'max_features': 'auto', 'n_estimators': 200}


In [224]:
print (sorted(zip(map(lambda x: round(x, 4), model.feature_importances_), names), 
             reverse=True))

[(0.1269, 'rdw_median'), (0.0787, 'readmit_time'), (0.0741, 'is_readmit'), (0.068, 'No_of_Depart'), (0.0491, 'length_of_stay'), (0.0454, 'hemoglobin_median'), (0.0451, 'creatinine_max'), (0.0375, 'wbc_min'), (0.029, 'hematocrit_max'), (0.027, 'ptt_min'), (0.0261, 'tempc_min'), (0.0247, 'resprate_max'), (0.0236, 'platelet_min'), (0.0228, 'spo2_min'), (0.0224, 'sysbp_max'), (0.0217, 'diseases of the genitourinary system'), (0.021, 'heartrate_max'), (0.021, 'age_on_admiss'), (0.0202, 'ethnicity_UNKNOWN/NOT SPECIFIED'), (0.0189, 'lactate_min'), (0.0178, 'inr_max'), (0.0121, 'ethnicity_BLACK/AFRICAN AMERICAN'), (0.0111, 'diseases of the blood and blood-forming organs'), (0.011, 'bilirubin_min'), (0.0099, 'single_int_mammCAB'), (0.0096, 'infectious and parasitic diseases'), (0.0075, 'external causes of injury and supplemental classification'), (0.0074, 'glycopyrrolate'), (0.0072, 'diseases of the respiratory system'), (0.0067, 'aorto_CBT'), (0.0065, 'diseases of the digestive system'), (0.00

## Top features

[(0.1269, 'rdw_median'), (0.0787, 'readmit_time'), (0.0741, 'is_readmit'), (0.068, 'No_of_Depart'), (0.0491, 'length_of_stay'), (0.0454, 'hemoglobin_median'), (0.0451, 'creatinine_max'), (0.0375, 'wbc_min'), (0.029, 'hematocrit_max'), (0.027, 'ptt_min'), (0.0261, 'tempc_min'), (0.0247, 'resprate_max'), (0.0236, 'platelet_min'), (0.0228, 'spo2_min'), (0.0224, 'sysbp_max'), (0.0217, 'diseases of the genitourinary system'), (0.021, 'heartrate_max'), (0.021, 'age_on_admiss'), (0.0202, 'ethnicity_UNKNOWN/NOT SPECIFIED'), (0.0189, 'lactate_min'), (0.0178, 'inr_max'), (0.0121, 'ethnicity_BLACK/AFRICAN AMERICAN'), (0.0111, 'diseases of the blood and blood-forming organs'), (0.011, 'bilirubin_min'),

# Adaboost

In [198]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier

In [209]:
clf = AdaBoostClassifier(n_estimators=100)

In [210]:
scores = cross_val_score(clf, train_data, train_label)

In [211]:
scores.mean() 

0.6047619047619047

In [None]:
seed = 7
num_trees = 30
kfold = model_selection.KFold(n_splits=10, random_state=seed)
model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
results = model_selection.cross_val_score(model, , Y, cv=kfold)
print(results.mean())