In [338]:
import pandas as pd
import numpy as np
import math

## Import dataset

In [339]:
df=pd.read_csv('dataset_output/raw_training_data_cleaned_up.csv')

## Shuffle dataset and reset index
reset_index tells Pandas to replace the existing index column instead of creating a new one.

frac is the fraction of rows to return; in this case 100% of them, in random order

In [340]:
df = df.sample(frac=1).reset_index(drop=True)

In [341]:
df

Unnamed: 0.1,Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide.metformin,glipizide.metformin,glimepiride.pioglitazone,metformin.rosiglitazone,metformin.pioglitazone,change,diabetesMed,readmitted
0,3079,19583754,4890204,3,1,9,2,1,1,2,...,1,1,1,1,1,1,1,0,0,0
1,17351,64029438,70002297,3,1,8,6,1,1,1,...,1,1,1,1,1,1,1,0,1,0
2,85815,272892954,54097524,3,2,8,3,1,1,7,...,1,1,1,1,1,1,1,0,0,0
3,49437,149775444,39942765,3,1,9,5,23,1,9,...,1,4,1,1,1,1,1,1,1,2
4,56211,161806470,40937607,3,1,8,3,23,1,14,...,1,1,1,1,1,1,1,0,1,2
5,93552,339053252,80353494,3,1,7,1,1,7,2,...,1,3,1,1,1,1,1,1,1,0
6,42130,130223976,40605183,3,1,8,1,1,7,1,...,1,1,1,1,1,1,1,0,1,2
7,6957,33727470,11290896,1,1,6,1,1,7,2,...,1,1,1,1,1,1,1,0,1,2
8,2485,16206390,3054483,1,1,3,3,1,1,2,...,1,2,1,1,1,1,1,1,1,0
9,67301,189387594,95467419,3,1,5,2,1,7,3,...,1,2,1,1,1,1,1,1,1,2


## Clean up the dataframe a bit

In [342]:
df.rename(columns={'glyburide.metformin':'glyburide_metformin', 
                   'glipizide.metformin':'glipizide_metformin',
                   'glimepiride.pioglitazone':'glimepiride_pioglitazone', 
                   'metformin.rosiglitazone':'metformin_rosiglitazone',
                   'metformin.pioglitazone':'metformin_pioglitazone'}, 
          inplace=True)

In [343]:
X = df.drop(['Unnamed: 0', 'encounter_id','patient_nbr'], axis=1, inplace=True)

In [344]:
df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100702 entries, 0 to 100701
Data columns (total 45 columns):
race                        100702 non-null int64
gender                      100702 non-null int64
age                         100702 non-null int64
admission_type_id           100702 non-null int64
discharge_disposition_id    100702 non-null int64
admission_source_id         100702 non-null int64
time_in_hospital            100702 non-null int64
num_lab_procedures          100702 non-null int64
num_procedures              100702 non-null int64
num_medications             100702 non-null int64
number_outpatient           100702 non-null int64
number_emergency            100702 non-null int64
number_inpatient            100702 non-null int64
diag_1                      100702 non-null int64
diag_2                      100702 non-null int64
diag_3                      100702 non-null int64
number_diagnoses            100702 non-null int64
max_glu_serum               100702 non-

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,citoglipton,insulin,glyburide_metformin,glipizide_metformin,glimepiride_pioglitazone,metformin_rosiglitazone,metformin_pioglitazone,change,diabetesMed,readmitted
0,3,1,9,2,1,1,2,31,1,6,...,1,1,1,1,1,1,1,0,0,0
1,3,1,8,6,1,1,1,30,3,10,...,1,1,1,1,1,1,1,0,1,0
2,3,2,8,3,1,1,7,21,0,4,...,1,1,1,1,1,1,1,0,0,0
3,3,1,9,5,23,1,9,50,0,18,...,1,4,1,1,1,1,1,1,1,2
4,3,1,8,3,23,1,14,61,0,18,...,1,1,1,1,1,1,1,0,1,2
5,3,1,7,1,1,7,2,68,4,30,...,1,3,1,1,1,1,1,1,1,0
6,3,1,8,1,1,7,1,41,0,5,...,1,1,1,1,1,1,1,0,1,2
7,1,1,6,1,1,7,2,35,0,20,...,1,1,1,1,1,1,1,0,1,2
8,1,1,3,3,1,1,2,32,1,15,...,1,2,1,1,1,1,1,1,1,0
9,3,1,5,2,1,7,3,75,0,16,...,1,2,1,1,1,1,1,1,1,2


#### All columns are using encoded int's. Great! 

# Random forest

#### How to pick parameters: http://scikit-learn.org/stable/modules/ensemble.html#parameters

scikit-learn uses randomly selected features to determine how to split a node when constructing a tree (http://scikit-learn.org/stable/modules/ensemble.html#random-forests)

In [345]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

### 1) Construct training, testing and validation sets

In [346]:
TRAIN_SPLIT = 0.8
# TODO: validation set
#VALIDATION_PCT = 0.2 # 20% of the training set will be the validation set
num_rows = len(df)
training_set = df[: math.floor(TRAIN_SPLIT * num_rows)]
#validation_set = df[: math.floor(VALIDATION_PCT * num_rows)]
test_set = df[math.floor((TRAIN_SPLIT) * num_rows) :]

# Sanity check
print(num_rows, len(training_set), len(test_set))

100702 80561 20141


### 2) Extract input and output features

In [347]:
# We want every column other than the admission and disposition id's
all_features = df.axes[1]
in_features = all_features[:3].union(all_features[5:])
out_features = all_features.difference(in_features)

### 3) Run RandomForestClassifier

In [348]:
classifier = RandomForestClassifier(n_estimators=33, max_features=6, 
                                    verbose=2, bootstrap=False,
                                   n_jobs=-1, )
classifier.fit(training_set[in_features], training_set[out_features])

building tree 4 of 33building tree 3 of 33building tree 1 of 33building tree 2 of 33



building tree 5 of 33
building tree 6 of 33
building tree 7 of 33
building tree 8 of 33
building tree 9 of 33
building tree 10 of 33
building tree 12 of 33building tree 11 of 33

building tree 13 of 33
building tree 14 of 33
building tree 15 of 33
building tree 16 of 33
building tree 17 of 33
building tree 18 of 33
building tree 19 of 33
building tree 20 of 33
building tree 21 of 33
building tree 22 of 33
building tree 23 of 33
building tree 24 of 33
building tree 25 of 33
building tree 26 of 33
building tree 27 of 33
building tree 28 of 33
building tree 29 of 33
building tree 30 of 33
building tree 31 of 33
building tree 32 of 33
building tree 33 of 33


[Parallel(n_jobs=-1)]: Done  33 out of  33 | elapsed:    7.9s finished


RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features=6, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=33, n_jobs=-1, oob_score=False, random_state=None,
            verbose=2, warm_start=False)

In [349]:
classifier.get_params()

{'bootstrap': False,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 6,
 'max_leaf_nodes': None,
 'min_impurity_split': 1e-07,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 33,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': None,
 'verbose': 2,
 'warm_start': False}

### 4) Train our model

In [350]:
predict_classes = classifier.predict(test_set[in_features]) # predict classes for data in test_set
predict_classes

[Parallel(n_jobs=4)]: Done  33 out of  33 | elapsed:    2.2s finished


array([[ 3.,  6.],
       [ 1.,  1.],
       [ 1.,  1.],
       ..., 
       [ 5.,  1.],
       [ 1.,  1.],
       [ 1.,  1.]])

### 5) Evaluate our model

Since scikit doesn't support multiclass-multilabel, we have to do this manually in a simple, basic way for now.

In [351]:
np.sum(np.equal(test_set[out_features], predict_classes))/len(test_set)

admission_type_id           0.775582
discharge_disposition_id    0.621419
dtype: float64

## Models to try:


Adaboost

Gradient Tree Boosting 

Try VotingClassifier from scikit at the end


## Things to try:
predict time in hospital