In [78]:
import pandas as pd
import numpy as np
import math

## Import dataset

In [79]:
df=pd.read_csv('dataset_output/raw_training_data_cleaned_up.csv')

## Shuffle dataset and reset index
reset_index tells Pandas to replace the existing index column instead of creating a new one.

frac is the fraction of rows to return; in this case 100% of them, in random order

In [80]:
df = df.sample(frac=1).reset_index(drop=True)

In [81]:
df

Unnamed: 0.1,Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide.metformin,glipizide.metformin,glimepiride.pioglitazone,metformin.rosiglitazone,metformin.pioglitazone,change,diabetesMed,readmitted
0,61780,172635228,85974372,3,1,8,1,11,7,14,...,1,1,1,1,1,1,1,0,1,0
1,39087,121936098,94086738,3,2,6,1,1,7,2,...,1,2,1,1,1,1,1,1,1,0
2,48899,148848666,25198893,1,2,6,3,1,4,1,...,1,3,1,1,1,1,1,1,1,0
3,87818,281842560,44044362,3,2,6,1,7,7,1,...,1,4,1,1,1,1,1,1,1,2
4,4163,24149970,15988968,3,2,6,1,1,7,4,...,1,1,1,1,1,1,1,0,1,0
5,43773,135342408,75976047,1,1,6,1,1,7,1,...,1,1,1,1,1,1,1,0,0,0
6,90487,298350728,64564542,3,2,7,2,3,5,3,...,1,3,1,1,1,1,1,0,1,1
7,55571,160688760,54272547,3,1,7,5,6,1,6,...,1,3,1,1,1,1,1,0,1,2
8,53257,156627498,65012598,3,1,7,2,1,1,4,...,1,1,1,1,1,1,1,0,0,0
9,76773,232658508,44317233,3,1,8,3,1,1,1,...,1,1,1,1,1,1,1,1,1,0


## Clean up the dataframe a bit

In [82]:
df.rename(columns={'glyburide.metformin':'glyburide_metformin', 
                   'glipizide.metformin':'glipizide_metformin',
                   'glimepiride.pioglitazone':'glimepiride_pioglitazone', 
                   'metformin.rosiglitazone':'metformin_rosiglitazone',
                   'metformin.pioglitazone':'metformin_pioglitazone'}, 
          inplace=True)

In [83]:
X = df.drop(['Unnamed: 0', 'encounter_id','patient_nbr'], axis=1, inplace=True)

In [84]:
df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100702 entries, 0 to 100701
Data columns (total 45 columns):
race                        100702 non-null int64
gender                      100702 non-null int64
age                         100702 non-null int64
admission_type_id           100702 non-null int64
discharge_disposition_id    100702 non-null int64
admission_source_id         100702 non-null int64
time_in_hospital            100702 non-null int64
num_lab_procedures          100702 non-null int64
num_procedures              100702 non-null int64
num_medications             100702 non-null int64
number_outpatient           100702 non-null int64
number_emergency            100702 non-null int64
number_inpatient            100702 non-null int64
diag_1                      100702 non-null int64
diag_2                      100702 non-null int64
diag_3                      100702 non-null int64
number_diagnoses            100702 non-null int64
max_glu_serum               100702 non-

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,citoglipton,insulin,glyburide_metformin,glipizide_metformin,glimepiride_pioglitazone,metformin_rosiglitazone,metformin_pioglitazone,change,diabetesMed,readmitted
0,3,1,8,1,11,7,14,52,0,26,...,1,1,1,1,1,1,1,0,1,0
1,3,2,6,1,1,7,2,28,0,12,...,1,2,1,1,1,1,1,1,1,0
2,1,2,6,3,1,4,1,17,4,12,...,1,3,1,1,1,1,1,1,1,0
3,3,2,6,1,7,7,1,34,0,13,...,1,4,1,1,1,1,1,1,1,2
4,3,2,6,1,1,7,4,55,5,20,...,1,1,1,1,1,1,1,0,1,0
5,1,1,6,1,1,7,1,57,0,6,...,1,1,1,1,1,1,1,0,0,0
6,3,2,7,2,3,5,3,44,0,19,...,1,3,1,1,1,1,1,0,1,1
7,3,1,7,5,6,1,6,65,1,26,...,1,3,1,1,1,1,1,0,1,2
8,3,1,7,2,1,1,4,39,0,10,...,1,1,1,1,1,1,1,0,0,0
9,3,1,8,3,1,1,1,33,6,7,...,1,1,1,1,1,1,1,1,1,0


#### All columns are using encoded int's. Great! 

# Random forest

#### How to pick parameters: http://scikit-learn.org/stable/modules/ensemble.html#parameters

It's interesting that scikit-learn uses randomly selected features to determine how to split a node when constructing a tree (http://scikit-learn.org/stable/modules/ensemble.html#random-forests)

In [138]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss

### 1) Construct training, testing and validation sets

In [93]:
TRAIN_SPLIT = 0.8
# TODO: validation set
#VALIDATION_PCT = 0.2 # 20% of the training set will be the validation set
num_rows = len(df)
training_set = df[: math.floor(TRAIN_SPLIT * num_rows)]
#validation_set = df[: math.floor(VALIDATION_PCT * num_rows)]
test_set = df[math.floor((TRAIN_SPLIT) * num_rows) :]

# Sanity check
print(num_rows, len(training_set), len(test_set))

100702 80561 20141


### 2) Extract input and output features

In [130]:
# We want every column other than the admission and disposition id's
all_features = df.axes[1]
in_features = all_features[:3].union(all_features[5:])
out_features = all_features.difference(in_features)

### 3) Run RandomForestClassifier

In [136]:
classifier = RandomForestClassifier(n_estimators=25, verbose = 2)
classifier.fit(training_set[in_features], training_set[out_features])

building tree 1 of 25


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


building tree 2 of 25
building tree 3 of 25
building tree 4 of 25
building tree 5 of 25
building tree 6 of 25
building tree 7 of 25
building tree 8 of 25
building tree 9 of 25
building tree 10 of 25
building tree 11 of 25
building tree 12 of 25
building tree 13 of 25
building tree 14 of 25
building tree 15 of 25
building tree 16 of 25
building tree 17 of 25
building tree 18 of 25
building tree 19 of 25
building tree 20 of 25
building tree 21 of 25
building tree 22 of 25
building tree 23 of 25
building tree 24 of 25
building tree 25 of 25


[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    7.6s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=25, n_jobs=1, oob_score=False, random_state=None,
            verbose=2, warm_start=False)

### 4) Evaluate our classification model

## Models to try:
Random forest

Adaboost

Gradient Tree Boosting 

Try VotingClassifier from scikit at the end


## Things to try:
predict time in hospital