In [1]:
%matplotlib inline
from redcap import Project
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pylab as plt
sns.set_context('notebook')

In [39]:
api_url = 'https://redcap.vanderbilt.edu/api/'

hospitalized_key = open("token.txt").read()
hospitalized_proj = Project(api_url, hospitalized_key)
hospitalized_raw = hospitalized_proj.export_records(format='df', 
                            df_kwargs={'index_col': hospitalized_proj.field_names[0]})

In [49]:
black_subset = hospitalized_raw[hospitalized_raw.race==3].copy()

In [69]:
black_subset['response'] = None
black_subset.loc[black_subset.therapy_response___0==1, 'response'] = 1
black_subset.loc[black_subset.therapy_response___1==1, 'response'] = 0
black_subset.response = black_subset.response.astype(float)

In [70]:
black_subset.response.value_counts()

1    63
0    24
Name: response, dtype: int64

In [92]:
black_subset.columns[black_subset.columns.str.startswith('therapy_response')]

Index([u'therapy_response___0', u'therapy_response___1',
       u'therapy_response___3', u'therapy_response___4',
       u'therapy_response_notes'],
      dtype='object')

In [103]:
response_cols = black_subset.columns[black_subset.columns.str.startswith('therapy_response')]
meaningless_cols = ['mrn', 'mrn_and_treatment_date']
drop_cols = np.append(response_cols.values, meaningless_cols)

black_subset = black_subset.dropna(subset=['response']).drop(drop_cols, axis=1)

In [104]:
black_subset_low_missing = black_subset.loc[:, (black_subset.isnull().sum(0) < 5) & (black_subset.dtypes != object)]
black_subset_complete = black_subset_low_missing.apply(lambda x: x.fillna(x.mean()))

In [105]:
black_subset_complete.shape

(87, 161)

In [106]:
from sklearn import preprocessing

X = black_subset_complete.copy()
y = X.pop('response')

X_scaled = preprocessing.scale(X)

In [125]:
from sklearn.preprocessing._weights import _balance_weights

w = _balance_weights(y)

In [130]:
black_subset.response.value_counts()

1    63
0    24
Name: response, dtype: int64

In [141]:
from sklearn import cross_validation

X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X_scaled, y, test_size=0.4, random_state=0)

In [142]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_jobs=4,
                            oob_score=True,
                            class_weight={0:w.min(), 1:w.max()})
rf.fit(X_train, y_train)

preds = rf.predict(X_test)
pd.crosstab(y_test, preds, rownames=['actual'], 
            colnames=['prediction'])

prediction,0.0,1.0
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,7,2
1,0,26


In [143]:
importance = pd.Series(rf.feature_importances_, index=X.columns)
importance.sort(ascending=False)
importance[:25]

  from ipykernel import kernelapp as app


kd_therapy___1                    0.402378
hd_num_days                       0.145434
illness_day_at_rx                 0.069734
num_echo_post_eval                0.051949
echo_2_specify___7                0.032730
echo_5_specify___1                0.028004
echo_7_specify___2                0.026014
type_of_subsequent_diagnos___2    0.020925
echo_3_specify___5                0.019978
lab_criteria___4                  0.017576
classification                    0.017312
abnormality___9                   0.016846
lab_criteria___1                  0.013117
kd_therapy___9                    0.013073
results_echo_1                    0.010863
lab_criteria___3                  0.010730
echo_2_result                     0.010576
abnormality___7                   0.010114
echo_9_specify___2                0.008757
ethnicity                         0.008511
echo_2_specify___3                0.007915
kd_therapy___2                    0.007193
type_of_subsequent_diagnos___1    0.006885
clinical_cr