In [1]:
import numpy as np
from sklearn import metrics
import torch
from collections import defaultdict
import math 
import re 
import pickle
import pandas as pd
import gzip
from sklearn import preprocessing
from __future__ import print_function

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.datasets import imdb

Using TensorFlow backend.


In [7]:
# load the dataset
train_dataset = pd.read_csv("train_dataset.csv")
test_dataset = pd.read_csv("test_dataset.csv")

# extract the notes and the mortality labels
# (`.values` is used instead of `Series.as_matrix()`, which is deprecated and
# no longer exists in pandas >= 1.0; both return the underlying ndarray)
train_notes = train_dataset['note'].values
test_notes = test_dataset['note'].values
train_labels = train_dataset['label'].values.astype(float)
test_labels = test_dataset['label'].values.astype(float)

# extract the sofa scores
sofa_train_labels = train_dataset['sofa'].values
sofa_test_labels = test_dataset['sofa'].values

# combine into one set of notes (train rows first, then test rows)
all_notes = np.concatenate((train_notes, test_notes), axis=0)
all_labels = np.concatenate((train_labels, test_labels), axis=0)

# combine all the sofa scores, in the same train-then-test order
sofa_labels = np.concatenate((sofa_train_labels, sofa_test_labels), axis=0)

# label smoothing: every positive label becomes 0.7, zeros are left alone.
# Works on a copy so `all_labels` keeps the original values.
# NOTE(review): `smooth_labels` is not referenced by the later cells visible
# in this notebook - confirm whether it is still needed.
smooth_labels = np.copy(all_labels)
smooth_labels[smooth_labels > 0.0] = 0.7
print(smooth_labels[:50])
print(all_labels[:50])

# convert into clean format: lower-case each note, delete every run of digits,
# then keep only the word-character runs, joined by single spaces.
# The patterns are raw strings ("\d+" in a plain string is an invalid escape
# sequence and triggers a warning on recent Pythons) and are compiled once
# instead of being re-parsed for every note.
print("Converting training to no numbers and no punctuation")
digit_run = re.compile(r"\d+")
word_run = re.compile(r"\w+")
for i in range(len(all_notes)):
    note = all_notes[i].lower()
    string = digit_run.sub("", note)
    words = " ".join(word_run.findall(string))
    # overwrite in place; re-running the cell gives the same result
    all_notes[i] = words

[ 0.   0.   0.   0.7  0.7  0.7  0.7  0.7  0.7  0.7  0.7  0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.7  0.7
  0.7  0.7  0.7  0.7  0.7]
[ 0.  0.  0.  1.  1.  1.  1.  1.  1.  1.  1.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  1.  1.  1.  1.  1.  1.  1.]
Converting training to no numbers and no punctuation


In [30]:
from keras.preprocessing.text import Tokenizer 

# extract the actions
# (`.values` is used instead of `Series.as_matrix()`, which is deprecated and
# no longer exists in pandas >= 1.0)
train_actions = train_dataset['action'].values
test_actions = test_dataset['action'].values
# train rows first, then test rows - the same order used for the notes/labels
all_actions = np.concatenate((train_actions, test_actions), axis=0)

106812
106812
106812
[  0.   0.   0.   3.   2.   0.   0.   2.   2.   2.   2.   0.  14.  13.  13.
   9.   7.   7.   2.   2.]


In [10]:
# Fit the tokenizer on the full (train + test) set of cleaned notes.
t = Tokenizer()
t.fit_on_texts(all_notes)
# One more than the number of distinct words seen.
# NOTE(review): presumably word_index starts at 1, leaving 0 free - confirm.
vocab_size = 1 + len(t.word_index)

# Replace every note by its list of integer word ids.
encoded_notes = t.texts_to_sequences(all_notes)

print("notes length", len(encoded_notes))

# Length of the longest encoded note (0 when there are no notes at all).
max_len = max([len(ids) for ids in encoded_notes] + [0])
print("max length of sequence", max_len)

notes length 106812
max length of sequence 5047


In [74]:
from keras.preprocessing.sequence import pad_sequences

# Recomputed here so this cell does not rely on the tokenizer cell's value.
vocab_size = 1 + len(t.word_index)
print("vocab size", vocab_size)

# Force every encoded note to exactly `max_length` ids; shorter notes get
# zeros appended at the end (padding='post').
# NOTE(review): the longest note reported earlier in the notebook exceeds 3000
# ids, and truncation is left at pad_sequences' default - confirm which end
# of an over-long note gets dropped.
max_length = 3000
padded_notes = pad_sequences(
    encoded_notes,
    maxlen=max_length,
    padding='post',
)
print(padded_notes[:2])

vocab size 45725
[[12 24 28 ...,  0  0  0]
 [12 24 28 ...,  0  0  0]]


In [13]:
# load embeddings
# Each non-empty line is "<word> <v1> <v2> ...". The file is streamed inside a
# `with` block so the gzip handle is always closed, and the whole file is no
# longer held in memory twice.
word_to_vec = {}

with gzip.open('word_vectors.txt.gz', 'rb') as f:
    for raw_line in f:
        # Binary mode yields bytes on Python 3; decode so the keys are text
        # and can match the tokenizer's words when the embedding matrix is
        # filled in. (On Python 2 this yields unicode, which compares and
        # hashes equal to str for ASCII words.)
        parts = raw_line.decode('utf-8').split()
        if not parts:
            # blank line: nothing to parse (previously an IndexError)
            continue
        word = parts[0]
        vector = np.array([float(v) for v in parts[1:]])
        word_to_vec[word] = vector

print('Loaded %s word vectors.' % len(word_to_vec))

Loaded 12986 word vectors.


In [67]:
# Build the embedding matrix: row `row` receives the pretrained vector of the
# word whose tokenizer index is `row`. Words with no pretrained vector keep
# their all-zero row. Vectors are expected to have 300 components.
embedding_matrix = np.zeros((vocab_size, 300))
for token, row in t.word_index.items():
    pretrained = word_to_vec.get(token)
    if pretrained is None:
        continue
    embedding_matrix[row] = pretrained

In [68]:
# Sanity check: first ten components of the row for word index 1.
print(embedding_matrix[1, :10])

[ 0.06728151  0.0133113  -0.0587122   0.05504927  0.01670193 -0.04523229
 -0.02211876  0.03265653 -0.00706662  0.52338961]


In [78]:
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

# Sanity check: both feature sources must describe the same number of samples.
print(len(all_actions))
print(len(padded_notes))

# Put the action in column 0 and the padded note ids in columns 1.. so that a
# single split keeps each sample's action, note and label aligned.
# NOTE(review): stacking forces both parts into one common dtype and makes an
# extra full copy of the padded notes - confirm this is acceptable memory-wise.
all_features = np.column_stack((all_actions, padded_notes))

# 90% / 10% split with a fixed seed so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    all_features,
    all_labels,
    test_size=0.1,
    random_state=42,
)

# Undo the stacking: column 0 is the action, the rest is the note.
X_train_actions, X_train_embeddings = X_train[:, 0], X_train[:, 1:]
X_test_actions, X_test_embeddings = X_test[:, 0], X_test[:, 1:]

print(X_test_embeddings.shape)

# kf = KFold(n_splits=5, random_state=None, shuffle=False) 

# for k, (train, test) in enumerate(k_fold.split(padded_notes)):
#     lasso_cv.fit(padded_notes[train], all_labels[train])
#     print("[fold {0}] alpha: {1:.5f}, score: {2:.5f}".
#           format(k, lasso_cv.alpha_, lasso_cv.score(padded_notes[test], all_labels[test])))

106812
106812
(10682, 3000)


In [None]:
### CODE FOR MODEL WITHOUT ACTION

batch_size = 32

print('Loading data...')

print('Build model...')
# Embedding layer initialised from the pretrained matrix and kept frozen.
e = Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=max_length, trainable=False)

# Text-only classifier: embeddings -> LSTM -> single sigmoid unit.
model = Sequential([
    e,
    # higher dropout
    LSTM(128, dropout=0.5, recurrent_dropout=0.5),
    Dense(1, activation='sigmoid'),
])

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print('Train...')
# NOTE(review): the same held-out split is used both for per-epoch validation
# and for the final evaluation below.
held_out = (X_test_embeddings, y_test)
model.fit(
    X_train_embeddings,
    y_train,
    batch_size=batch_size,
    epochs=15,
    validation_data=held_out,
)
score, acc = model.evaluate(held_out[0], held_out[1], batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

In [None]:
### CODE FOR MODEL WITH ACTION

# `Merge` was never imported in this notebook (NameError) and the layer is not
# available in current Keras 2.x releases; the two branches are joined with
# the functional API and `concatenate` instead.
from keras.models import Model
from keras.layers import concatenate
from keras.layers import Flatten, Bidirectional

batch_size = 32

print('Loading data...')
print(len(X_train_embeddings), 'train sequences')
print(len(X_test_embeddings), 'test sequences')

# The action branch takes one feature per sample. The split leaves the actions
# as 1-D arrays (where `.T` is a no-op), so make the (n_samples, 1) shape
# explicit for both train and test.
train_actions_col = np.asarray(X_train_actions).reshape(-1, 1)
test_actions_col = np.asarray(X_test_actions).reshape(-1, 1)

print('Build model...')
# Text branch: frozen pretrained embeddings -> LSTM.
text_model = Sequential()
e = Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=max_length, trainable=False)
text_model.add(e)
text_model.add(LSTM(128, dropout=0.5, recurrent_dropout=0.5))

# Action branch: a single sigmoid unit over the scalar action.
action_model = Sequential()
action_model.add(Dense(1, input_shape=(1,), activation='sigmoid'))

# Concatenate the two branch outputs and classify with one sigmoid unit.
merged = concatenate([text_model.output, action_model.output])
prediction = Dense(1, activation='sigmoid')(merged)
model = Model(inputs=[text_model.input, action_model.input], outputs=prediction)


# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
# NOTE(review): the same held-out split is used both for per-epoch validation
# and for the final evaluation below.
model.fit([X_train_embeddings, train_actions_col], y_train,
          batch_size=batch_size,
          epochs=10,
          validation_data=([X_test_embeddings, test_actions_col], y_test))
score, acc = model.evaluate([X_test_embeddings, test_actions_col], y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)