In [569]:
import pandas as pd
import numpy as np
import math

## Import dataset

In [570]:
df=pd.read_csv('dataset_output/raw_training_data_cleaned_up.csv')

## Shuffle dataset and reset index
reset_index tells Pandas to replace the existing index column instead of creating a new one.

frac is the fraction of rows to return; in this case 100% of them, in random order

In [571]:
df = df.sample(frac=1).reset_index(drop=True)

## Clean up the dataframe a bit

In [572]:
df.rename(columns={'glyburide.metformin':'glyburide_metformin', 
                   'glipizide.metformin':'glipizide_metformin',
                   'glimepiride.pioglitazone':'glimepiride_pioglitazone', 
                   'metformin.rosiglitazone':'metformin_rosiglitazone',
                   'metformin.pioglitazone':'metformin_pioglitazone'}, 
          inplace=True)

In [573]:
X = df.drop(['Unnamed: 0', 'encounter_id','patient_nbr'], axis=1, inplace=True)

In [574]:
df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100702 entries, 0 to 100701
Data columns (total 45 columns):
race                        100702 non-null int64
gender                      100702 non-null int64
age                         100702 non-null int64
admission_type_id           100702 non-null int64
discharge_disposition_id    100702 non-null int64
admission_source_id         100702 non-null int64
time_in_hospital            100702 non-null int64
num_lab_procedures          100702 non-null int64
num_procedures              100702 non-null int64
num_medications             100702 non-null int64
number_outpatient           100702 non-null int64
number_emergency            100702 non-null int64
number_inpatient            100702 non-null int64
diag_1                      100702 non-null int64
diag_2                      100702 non-null int64
diag_3                      100702 non-null int64
number_diagnoses            100702 non-null int64
max_glu_serum               100702 non-

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,citoglipton,insulin,glyburide_metformin,glipizide_metformin,glimepiride_pioglitazone,metformin_rosiglitazone,metformin_pioglitazone,change,diabetesMed,readmitted
0,3,1,8,3,18,4,8,50,0,13,...,1,1,1,1,1,1,1,1,1,1
1,3,1,4,1,1,7,5,47,0,14,...,1,3,1,1,1,1,1,0,1,1
2,1,1,5,3,1,1,7,4,0,10,...,1,4,1,1,1,1,1,1,1,2
3,3,2,7,1,1,7,6,96,2,13,...,1,3,1,1,1,1,1,0,1,0
4,5,2,8,1,1,7,3,47,0,11,...,1,1,1,1,1,1,1,0,1,2
5,3,1,8,1,1,7,4,67,1,15,...,1,2,1,1,1,1,1,1,1,0
6,3,1,8,5,6,17,8,24,0,22,...,1,4,1,1,1,1,1,1,1,2
7,3,2,8,1,1,7,4,59,0,8,...,1,3,1,1,1,1,1,0,1,0
8,3,1,8,1,1,7,2,46,1,10,...,1,1,1,1,1,1,1,0,1,2
9,3,1,9,6,6,1,13,37,0,3,...,1,1,1,1,1,1,1,0,0,0


#### All columns are using encoded int's. Great! 

# Random forest

#### How to pick parameters: http://scikit-learn.org/stable/modules/ensemble.html#parameters

scikit-learn uses randomly selected features to determine how to split a node when constructing a tree (http://scikit-learn.org/stable/modules/ensemble.html#random-forests)

In [575]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

### 1) Construct training and testing 
TODO: validation sets

In [576]:
# Separate input and output features 
# Original df is not mutated
X = df.drop(['discharge_disposition_id','admission_type_id'], axis=1)

# columns of the values we want to predict which we use to train our model
y_disposition = df['discharge_disposition_id']
y_admission = df['admission_type_id']

# For disposition statuses
X_disposition_train, X_disposition_test, y_disposition_train, y_disposition_test = train_test_split(X, y_disposition, test_size=0.3)

# For admission statuses
X_admission_train, X_admission_test, y_admission_train, y_admission_test = train_test_split(X, y_admission, test_size=0.3)

### 2) Run RandomForestClassifier to train our model

In [582]:
admission_classifier = RandomForestClassifier(n_estimators=70, n_jobs=-1 )
disposition_classifier = RandomForestClassifier(n_estimators=70, n_jobs=-1 )

admission_classifier.fit(X_admission_train, y_admission_train)
disposition_classifier.fit(X_disposition_train, y_disposition_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=70, n_jobs=-1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

### 3) Predict!

In [583]:
# predict classes for data in test_set
predict_admission = admission_classifier.predict(X_admission_test) 
predict_disposition = disposition_classifier.predict(X_disposition_test)

### 4) Evaluate our model

~~Since scikit doesn't support multiclass-multilabel, we have to do this manually in a simple, basic way for now.~~ (I was using multiclass multilabel classifiers. But I'm experimenting multiclass single-label now.)

In [584]:
from sklearn.metrics import classification_report,confusion_matrix

In [585]:
print(classification_report(predict_admission, y_admission_test))

             precision    recall  f1-score   support

          1       0.94      0.85      0.90     17645
          2       0.36      0.56      0.44      3537
          3       0.77      0.66      0.71      6570
          4       0.00      0.00      0.00         0
          5       0.64      0.69      0.67      1292
          6       0.69      0.95      0.80      1167
          7       0.00      0.00      0.00         0
          8       0.00      0.00      0.00         0

avg / total       0.82      0.77      0.79     30211



  'recall', 'true', average, warn_for)


In [586]:
print(classification_report(predict_disposition, y_disposition_test))

             precision    recall  f1-score   support

          1       0.96      0.66      0.78     26109
          2       0.00      0.00      0.00         1
          3       0.37      0.46      0.41      3268
          4       0.00      0.00      0.00         0
          5       0.00      0.00      0.00         3
          6       0.07      0.37      0.12       761
          7       0.00      0.00      0.00         0
          8       0.00      0.00      0.00         0
          9       0.00      0.00      0.00         0
         10       0.00      0.00      0.00         0
         11       0.01      0.67      0.02         6
         12       0.00      0.00      0.00         0
         13       0.00      0.00      0.00         1
         14       0.00      0.00      0.00         0
         15       0.00      0.00      0.00         0
         16       0.00      0.00      0.00         0
         17       0.00      0.00      0.00         0
         18       0.02      0.35      0.03   

  'recall', 'true', average, warn_for)


## Models to try:


Adaboost

Gradient Tree Boosting 

Try VotingClassifier from scikit at the end


## Things to try:
predict time in hospital