Model inputs: [label: mortality, action, rl_features, note_string]
Model output: mortality 

1. Bag of Words model

In [1]:
from collections import defaultdict
import numpy as np 
import math 
import re 
import pickle
import pandas as pd

Bag of words model 

In [2]:
### Bag of Words model for sentiment analysis

# => compile all the words that appear in the training set of notes into a dictionary, to produce a list of 'd' unique words
# => transform each of the reviews into a feature vector 
# by setting the ith coordinate to 1 if the ith word in the dictionary appears in the note
# => print # of unique words observed 
# => remove stop words 
# => print top 10 words
# TEST two version: (1) binary, (2) based on word counts, (3) unigram vs. bigrams 

# => learn a LogisticRegression classifier and evaluate 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import load_files 
from sklearn import linear_model 
import matplotlib.pyplot as plt 
import random 

# load the dataset 
train_dataset = pd.read_csv("train_dataset.csv")
test_dataset = pd.read_csv("test_dataset.csv")
train_notes = train_dataset['note'].as_matrix()
test_notes = test_dataset['note'].as_matrix()
print "# of train notes", len(train_notes)
print "# of test notes", len(test_notes)

# converting train and test to clean format 
print "Converting training to no numbers"
for i in range(len(train_notes)):
    note = train_notes[i].lower()
    string = re.sub("\d+", "", note)
    train_notes[i] = " ".join(re.findall(r'\w+', string))

print "Converting testing to no numbers"
for i in range(len(test_notes)):
    note = test_notes[i].lower()
    string = re.sub("\d+", "", note)
    test_notes[i] = " ".join(re.findall(r'\w+', string))
    
print "Finished converting."
print train_notes[0]

# of train notes 80185
# of test notes 26627
Created count vectorizer
Converting to no numbers
nursing other report npn p a this is a yo female with no pmh who presented to ew with rigors temp of pt has seen her pcp and was given azithromycin for pneumonia cxr in ew showed lll pneumonia given mg levaquin gm tylenol and l ivf s as she was hypotensive to high s systolic transferred here to msicu for closer monitoring before going to floor at this time all vital signs are stable a ox non productive cough noted crackles in lll otherwise clear pt satting on r a abd soft bs g piv intact awaiting md assessment sepsis vs pneumonia w dehydration nursing other report patient has been hemodynamically stable see careview sr st without ectopy denies any chest pain no sob lung sounds clear crackles at lll occasional productive cough whitish yellow in small amount no sob sats at room air voiding adequate amount of clear yellow urine on regular diet with fair appetite continues on levofloxacin mgs po 

In [7]:
# create a dictionary
vocab = defaultdict()
for i, name in enumerate(cv.get_feature_names()):
    vocab[name] = 1
print "Size of Vocabulary: ", len(vocab)

Size of Vocabulary:  845521
[0.74732414466518948, 0.7247155143275622, 0.71810568220227589, 0.71761745596574911, 0.71735456491531158]


In [None]:
# Cross-validation (5 parts) --> sklearn (score: training) ===> unigram, bigram, unigram/bigram  

# Test validation score 

# Top features (noisy) => confidence intervals, if sig. (choose significant ones)

# unigram, bigram, trigram

# Random Forest 


In [27]:
num_top = 30
coefs=best_lr.coef_[0]
top = np.argpartition(coefs, -num_top)[-num_top:]
top_sorted=top[np.argsort(coefs[top])]
names = feature_names[top_sorted]

print "Top Features"
for i in range(num_top):
    index = num_top - 1 - i
    print names[index]

Top Features
dnr
intubated current
meeting
pt tx
pronounced
mets
duoderm
report correction
family bedside
psv cpap
cmo
receiving cc
expired
fi
noted nursing
admission note
current abg
wean fio
morphine gtt
pcv
discussion
psv weaned
oliguria
remained psv
secretions abgs
wife son
cvvhd
support changes
tx alb
met acidosis


In [30]:
# Test Unigrams, Bigrams, and Unigram + Bigrams 

# extract the labels 
train_labels = train_dataset['label'].as_matrix()
test_labels = test_dataset['label'].as_matrix()

# initialize CVs
cv_unigram = CountVectorizer(ngram_range=(1, 1), stop_words="english")
cv_bigram = CountVectorizer(ngram_range=(2, 2), stop_words="english")
cv = CountVectorizer(ngram_range=(1,2), stop_words="english")

# fit to training data
cv_unigram_fit = cv_unigram.fit_transform(train_notes)
cv_bigram_fit = cv_bigram.fit_transform(train_notes)
cv_fit = cv.fit_transform(train_notes)

# testing data 
cv_unigram_test = cv_unigram.transform(test_notes)
cv_bigram_test = cv_bigram.transform(test_notes)
cv_test = cv.transform(test_notes)

# create the LR models 
constants = [1e-1, 1e1, 1e3, 1e5, 1e7] 

# testing scores 
scores_unigram = []
scores_bigram = []
scores_unibigram = []

# models 
models_unigram = []
models_bigram = []
models_unibigram = []

# training scores 
train_scores_u = []
train_scores_b = []
train_scores_ub = []

for c in constants:
    # unigram
    lr_unigram = linear_model.LogisticRegression(C=c)
    lr_unigram.fit(cv_unigram_fit, train_labels)
    train_scores_u.append(lr_unigram.score(cv_unigram_fit, train_labels))
    scores_unigram.append(lr_unigram.score(cv_unigram_test, test_labels))
    models_unigram.append(lr_unigram)
    
    # bigram
    lr_b = linear_model.LogisticRegression(C=c)
    lr_b.fit(cv_bigram_fit, train_labels)
    train_scores_b.append(lr_b.score(cv_bigram_fit, train_labels))
    scores_bigram.append(lr_b.score(cv_bigram_test, test_labels))
    models_bigram.append(lr_b)
    
    # unibigram
    lr_ub = linear_model.LogisticRegression(C=c)
    lr_ub.fit(cv_fit, train_labels)
    train_scores_ub.append(lr_ub.score(cv_fit, train_labels))
    scores_unibigram.append(lr_ub.score(cv_test, test_labels))
    models_unibigram.append(lr_ub)

print "Testing Scores"
print "Scores for Unigram", scores_unigram
print "Scores for Bigram", scores_bigram 
print "Scores for Unigram and Bigram", scores_unibigram

print "Training Scores"
print "Scores for Unigram", train_scores_u
print "Scores for Bigram", train_scores_b
print "Scores for Unigram and Bigram", train_scores_ub

Testing Scores
Scores for Unigram [0.73958763660945659, 0.70327111578472978, 0.69636083674465765, 0.6972246216246667, 0.69313103241071095]
Scores for Bigram [0.7548728733991813, 0.72648063995192846, 0.71686633867878469, 0.71367409020918615, 0.71510121305441843]
Scores for Unigram and Bigram [0.74732414466518948, 0.7247155143275622, 0.71810568220227589, 0.71761745596574911, 0.71735456491531158]
Training Scores
Scores for Unigram [0.93944004489617761, 0.98457317453389037, 0.99035979297873666, 0.99068404315021508, 0.99412608343206332]
Scores for Bigram [0.99462492984972251, 0.99972563447028751, 0.99975057679117041, 0.99975057679117041, 0.99975057679117041]
Scores for Unigram and Bigram [0.99600922865872665, 0.99972563447028751, 0.99975057679117041, 0.99975057679117041, 0.99975057679117041]


In [34]:
### LR ANALYSIS 

best_unigram_model = models_unigram[0]
best_bigram_model = models_bigram[0]
best_unibigram_model = models_unigram[0]

feature_names_unigram = np.array(cv_unigram.get_feature_names())
feature_names_bigram = np.array(cv_bigram.get_feature_names())
feature_names_unibigram = np.array(cv.get_feature_names())

num_top = 30
def find_top_features(model, feature_names):
    coefs=model.coef_[0]
    top = np.argpartition(coefs, -num_top)[-num_top:]
    top_sorted=top[np.argsort(coefs[top])]
    names = feature_names[top_sorted]

    print "Top Features"
    for i in range(num_top):
        index = num_top - 1 - i
        print names[index]

print "Top Features for Unigram"
unigram_features = np.array(cv_unigram.get_feature_names())
find_top_features(best_unigram_model, unigram_features)

print "Top Features for Bigram"
bigram_features = np.array(cv_bigram.get_feature_names())
find_top_features(best_bigram_model, bigram_features)

print "Top Features for Unigram and Bigram"
unibigram_features = np.array(cv.get_feature_names())
find_top_features(best_unibigram_model, unibigram_features)


Top Features for Unigram
Top Features
oliguria
pronounced
nonreactive
mets
pitressin
flows
bagged
excoriation
space
metastatic
nectar
resolve
cyanotic
tobramycin
olanzapine
induce
beneprotein
fenoldopam
commenced
adaptic
steroid
wanting
dnr
generaly
flowby
attack
defib
meeting
intermit
duoderm
Top Features for Bigram
Top Features
family meeting
family bedside
dnr dni
morphine gtt
intubated current
met acidosis
report correction
pt tx
pt dnr
palliative care
receiving cc
comfort measures
maintained range
noted nursing
wife son
skin duoderm
secretions abgs
psv cpap
current abg
pt expired
wean fio
rr peep
support changes
pt pronounced
remained psv
family present
essentially unchanged
lung ca
code status
cpap fio
Top Features for Unigram and Bigram
Top Features
aggitaed
airway seizures
afternoon oozing
afebrile rehab
agreeing turned
added response
able diet
actually item
ammonia action
afebrile overnoc
afternoon appreciated
allergies fever
ache chest
andwered appropriatly
aggatroban
admissi

In [None]:
# ### SVM Classifier 
# from sklearn import svm

# # testing scores 
# test_svm_u = []
# test_svm_b = []
# test_svm_ub = []

# # models 
# models_svm_u = []
# models_svm_b = []
# models_svm_ub = [] 

# # training scores 
# train_svm_u = []
# train_svm_b = []
# train_svm_ub = [] 

# for c in constants:
#     # unigram
#     svm_u = svm.SVC()
#     svm_u.fit(cv_unigram_fit, train_labels)
#     train_svm_u.append(svm_u.score(cv_unigram_fit, train_labels))
#     test_svm_u.append(svm_u.score(cv_unigram_test, test_labels))
#     models_svm_u.append(svm_u)
    
#     # bigram
#     svm_b = svm.SVC()
#     svm_b.fit(cv_bigram_fit, train_labels)
#     train_svm_b.append(svm_b.score(cv_bigram_fit, train_labels))
#     test_svm_b.append(svm_b.score(cv_bigram_test, test_labels))
#     models_svm_b.append(svm_b)
    
#     # unibigram
#     svm_ub = svm.SVC()
#     svm_ub.fit(cv_fit, train_labels)
#     train_svm_ub.append(svm_ub.score(cv_fit, train_labels))
#     test_svm_ub.append(svm_ub.score(cv_test, test_labels))
#     models_svm_ub.append(svm_ub)

# print "Testing Scores"
# print "Scores for Unigram", test_svm_u
# print "Scores for Bigram", test_svm_b 
# print "Scores for Unigram and Bigram", test_svm_ub

# print "Training Scores"
# print "Scores for Unigram", train_svm_u
# print "Scores for Bigram", train_svm_b
# print "Scores for Unigram and Bigram", train_svm_ub

In [None]:
# ### SVM ANALYSIS 

# best_unigram_svm = models_svm_u[]
# best_bigram_svm = models_svm_b[]
# best_unibigram_svm = models_svm_ub[]

# print "Top Features for SVM Unigram"
# find_top_features(best_unigram_svm, unigram_features)

# print "Top Features for SVM Bigram"
# find_top_features(best_bigram_svm, bigram_features)

# print "Top Features for SVM Unigram and Bigram"
# find_top_features(best_unibigram_svm, unibigram_features)

In [38]:
### RANDOM FORESTS 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier 
from sklearn.tree import DecisionTreeClassifier

# testing scores 
test_t_u = []
test_t_b = []
test_t_ub = []

# models 
models_t_u = []
models_t_b = []
models_t_ub = [] 

# training scores 
train_t_u = []
train_t_b = []
train_t_ub = [] 

for c in constants:
    
    # DT
    dt = DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=0)
    dt.fit(cv_bigram_fit, train_labels)
    train_t_b.append(dt.score(cv_bigram_fit, train_labels))
    test_t_b.append(dt.score(cv_bigram_test, test_labels))
    models_t_b.append(dt)
    
    # ET
    et = ExtraTreesClassifier(max_depth=None, min_samples_split=2, random_state=0) 
    et.fit(cv_bigram_fit, train_labels)
    train_t_b.append(et.score(cv_bigram_fit, train_labels))
    test_t_b.append(et.score(cv_bigram_test, test_labels))
    models_t_b.append(et)
    
    # RF
    rf = RandomForestClassifier(max_depth=None, min_samples_split=2, random_state=0) 
    rf.fit(cv_bigram_fit, train_labels)
    train_t_b.append(rf.score(cv_bigram_fit, train_labels))
    test_t_b.append(rf.score(cv_bigram_test, test_labels))
    models_t_b.append(rf)

print "Testing Scores for Tree Classifiers"
print "Scores for DT", test_t_u
print "Scores for ET", test_t_b 
print "Scores for RF", test_t_ub

print "Training Scores"
print "Scores for DT", train_t_u
print "Scores for ET", train_t_b
print "Scores for RF", train_t_ub

Testing Scores for Tree Classifiers
Scores for DT []
Scores for ET [0.69974086453599726, 0.74649791564952872, 0.75051639313478802, 0.69974086453599726, 0.74649791564952872, 0.75051639313478802, 0.69974086453599726, 0.74649791564952872, 0.75051639313478802, 0.69974086453599726, 0.74649791564952872, 0.75051639313478802, 0.69974086453599726, 0.74649791564952872, 0.75051639313478802]
Scores for RF []
Training Scores
Scores for DT []
Scores for ET [0.99975057679117041, 0.99975057679117041, 0.99927667269439424, 0.99975057679117041, 0.99975057679117041, 0.99927667269439424, 0.99975057679117041, 0.99975057679117041, 0.99927667269439424, 0.99975057679117041, 0.99975057679117041, 0.99927667269439424, 0.99975057679117041, 0.99975057679117041, 0.99927667269439424]
Scores for RF []


In [None]:
### Random Forests ANALYSIS 
best_unigram_rf = models_t_b[0]
best_bigram_rf = models_t_b[1]
best_unibigram_rf = models_t_b[2]

print "Top Features for RF Unigram"
find_top_features(best_unigram_rf, unigram_features)

print "Top Features for RF Bigram"
find_top_features(best_bigram_rf, bigram_features)

print "Top Features for RF Unigram and Bigram"
find_top_features(best_unibigram_rf, unibigram_features)