In [1]:
import pandas as pd
import numpy as np


from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.externals import joblib
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline

In [3]:
df = pd.read_csv('../data/simplifiedSNFData.csv')

In [4]:
def response(x):
    if x == 'no':
        return 0
    else:
        return 1

In [5]:
def icu_flag(x):
    if x == 'n':
        return 0
    else: 
        return 1

In [6]:
df['response'] = df['response'].apply(lambda x: response(x))

In [7]:
df['icu_flag'] = df['icu_flag'].apply(lambda x: icu_flag(x))

In [8]:
compute_class_weight('balanced', np.unique(df['response']), df['response'])

array([ 0.61226525,  2.72686899])

In [9]:
X = df.drop('response', axis=1)

In [10]:
y = df['response']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [12]:
n_estimators = 5000

In [13]:
gbm = GradientBoostingClassifier(n_estimators = n_estimators)

In [14]:
gbm.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=5000, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)

In [15]:
p = gbm.predict(X_test)

In [16]:
df_test = pd.DataFrame(X_test)

In [17]:
df_test['class'] = p
df_test['flag'] = y_test

In [18]:
tp = len(df_test[(df_test['flag'] == 1) & (df_test['class']==1)])
tn = len(df_test[(df_test['flag'] == 0) & (df_test['class']==0)])
fp = len(df_test[(df_test['flag'] == 0) & (df_test['class']==1)])
fn = len(df_test[(df_test['flag'] == 1) & (df_test['class']==0)])

In [19]:
acc = (float(tp)+float(tn)) / float(len(df_test))
prec = float(tp) / float((tp+fp))
rec = float(tp) / float((fn+tp))

In [20]:
f1 = f1_score(y_test, p)

In [21]:
print "Model Scores"
print "True Positives: %d" % tp
print "True Negatives: %d" % tn
print "False Positives: %d" % fp
print "False Negatives: %d" % fn
print "\n"
print "Accuracy: %f" % acc
print "Precision: %f" % prec
print "Recall: %f" % rec
print "\n"
print "F1 Score: %f" % f1

Model Scores
True Positives: 51
True Negatives: 1424
False Positives: 83
False Negatives: 284


Accuracy: 0.800760
Precision: 0.380597
Recall: 0.152239


F1 Score: 0.217484


In [22]:
joblib.dump(gbm, 'simplifiedSNF_gbm.pkl')

['simplifiedSNF_gbm.pkl']

In [23]:
df.columns

Index([u'response', u'prior_util_6m_ip_count', u'index_los', u'ch_index',
       u'icu_flag', u'lab_crea_resultn'],
      dtype='object')

In [24]:
X_t = 'n'

In [25]:
pipeline = make_pipeline(FunctionTransformer(icu_flag))

In [26]:
pipeline.transform(X_t)

0

In [27]:
df.head()

Unnamed: 0,response,prior_util_6m_ip_count,index_los,ch_index,icu_flag,lab_crea_resultn
0,0,0,12,2,1,0.8
1,0,0,3,3,0,0.8
2,0,1,11,5,0,0.8
3,0,1,11,6,0,1.0
4,0,1,5,6,0,1.0
