# MIMIC-III clinical note classification using doc2vec (gensim) and a gated recurrent unit neural network (tensorflow)

This notebook demonstrates an approach to classifying medical notes by type (physician vs. social worker) using deep learning. The clinical notes were obtained from MIMIC-III, a publicly available ICU database.

# Setting up environment

In [1]:
import numpy as np
import pandas as pd
import MySQLdb
import tensorflow as tf
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
from random import shuffle
import re
from nltk import word_tokenize
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split
import functools
import tensorflow as tf

## Import tables from SQL server and convert to pandas dataframe

See README file for details on obtaining the MIMIC csv files, and see the SQL build file for details on converting the tables to SQL. This project is limited to the entire corpus of the social worker notes (2670 total documents) and an equal number of physician notes.

In [10]:
# Connection settings for the MySQL copy of MIMIC-III.  The values below are
# placeholders: substitute your own host (usually localhost), username,
# password and database name before running.
_connect_args = {
    "host": "0.0.0.0",
    "user": "user",
    "passwd": "passwd",
    "db": "mimic_sql",
}
db = MySQLdb.connect(**_connect_args)

In [11]:
# Read the note id, category and free text for both note types into pandas
# dataframes.  Physician notes are capped at 2670 rows so that both classes
# contribute the same number of documents.
physician_df = pd.read_sql(
    "SELECT ROW_ID, CATEGORY, TEXT FROM NOTEEVENTS where category='Physician' limit 2670",
    con=db)
sw_df = pd.read_sql(
    "SELECT ROW_ID, CATEGORY, TEXT FROM NOTEEVENTS where category='Social Work'",
    con=db)

# Combine physician and social worker notes into a single dataframe.
# ignore_index=True renumbers the rows 0..N-1; without it each frame keeps its
# own 0-based index and every label appears twice in the combined frame.
notes = pd.concat([physician_df, sw_df], ignore_index=True)


In [2]:
# Load the pre-trained doc2vec sentence-embedding model from disk.
# See sentence_embeddings_mod.py for details on how it was produced.
# NOTE(review): the file name suggests 200-dimensional vectors -- confirm.
fname = 'd2v-200'
model = Doc2Vec.load(fname)

## Convert text from notes into 3D numpy array with zero padding

The gated recurrent unit neural network outlined below requires the training/testing data to be a 3D NumPy array with dimensions [total # of examples, max # of sentence vectors per note, sentence vector length]. Notes with fewer sentences than the maximum are zero-padded.

In [13]:

# Maximum number of sentence vectors kept per note.  Shorter notes are
# zero-padded up to this length; longer notes are truncated to it so that
# every note yields a matrix of the same shape.
max_length = 200

# Width of one sentence vector, read from the loaded doc2vec model rather
# than assumed to equal max_length.
vec_size = model.vector_size

# Everything except letters and whitespace is replaced by a space.
# Compiled once because it is applied to every sentence.
non_letters = re.compile(r'[^A-Za-z\s]+')

# Pre-allocate the outputs instead of growing them with vstack/dstack, which
# copies the whole array on every iteration.
#   X: [number of notes, max_length, vec_size], zero rows are padding
#   y: [number of notes, 2], one-hot target ([1, 0] physician, [0, 1] social work)
X = np.zeros((len(notes), max_length, vec_size), dtype=np.float32)
y = np.zeros((len(notes), 2), dtype=int)

for i, (_, row) in enumerate(notes.iterrows()):
    kept = 0
    # for each clinical note, tokenize into sentences
    for sentence in sent_tokenize(row['TEXT']):
        if kept == max_length:
            # note is longer than max_length sentences: drop the remainder
            break
        # lower-case, strip non-letters and split on white space
        tokens = non_letters.sub(' ', sentence).lower().split()
        if not tokens:
            # sentence contained no letters at all; nothing to embed
            continue
        X[i, kept] = model.infer_vector(tokens)
        kept += 1
    # A note with no usable sentence is left as an all-zero matrix.

    # convert target variable to one-hot row
    y[i, 1 if row['CATEGORY'] == 'Social Work' else 0] = 1

# Older cells refer to the arrays by these names; keep them as aliases.
total_X, total_y = X, y



KeyboardInterrupt: 

In [7]:
# Split into training and testing sets: 10% of the notes are held out for
# testing, and the fixed random_state makes the split reproducible.
# Uses X / y, the arrays produced by the data-preparation cell above.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

## Function for creating, training, and testing GRU/RNN

The following code was adapted from a tutorial on variable-length sequences written by Danijar Hafner, described [here](https://danijar.com/variable-sequence-lengths-in-tensorflow/) and shown in full [here](https://gist.github.com/danijar/d11c77c5565482e965d1919291044470).

In [8]:
# Start from a fresh default graph so that re-running this cell does not
# add a second copy of the network to the graph built by a previous run.
tf.reset_default_graph()


def lazy_property(function):
    """Turn a zero-argument method into a property that is computed once.

    The first access calls ``function(self)`` and stores the result on the
    instance under ``'_' + function.__name__``; later accesses return the
    stored value without calling ``function`` again.
    """
    cache_name = '_' + function.__name__

    def wrapper(self):
        if hasattr(self, cache_name):
            return getattr(self, cache_name)
        setattr(self, cache_name, function(self))
        return getattr(self, cache_name)

    # Preserve the wrapped method's name/docstring, then expose as a property.
    return property(functools.wraps(function)(wrapper))


class VariableSequenceClassification:
    """GRU classifier for zero-padded, variable-length sequences.

    The graph is assembled from lazily evaluated properties: ``length``,
    ``prediction``, ``cost``, ``optimize`` and ``error``.  Each is built on
    first access and cached on the instance.

    Args:
        data: tensor indexed as [example, time step, feature]; time steps
            whose features are all zero are treated as padding.
        target: tensor indexed as [example, class] holding one-hot targets.
        num_hidden: number of units in the GRU cell.
        num_layers: stored on the instance but not used when building the
            network, which is a single GRU layer.
    """

    def __init__(self, data, target, num_hidden=200, num_layers=2):
        self.data = data
        self.target = target
        self._num_hidden = num_hidden
        self._num_layers = num_layers
        # Access the lazy properties once so that the complete graph
        # (including the softmax variables) exists before the caller runs
        # the variable initializer.
        self.prediction
        self.error
        self.optimize

    @lazy_property
    def length(self):
        """Number of non-padding time steps in each sequence, as int32."""
        # A time step counts as used (1) when any of its features is
        # non-zero and as padding (0) otherwise.
        used = tf.sign(tf.reduce_max(tf.abs(self.data), reduction_indices=2))
        length = tf.reduce_sum(used, reduction_indices=1)
        length = tf.cast(length, tf.int32)
        return length

    @lazy_property
    def prediction(self):
        """Softmax class probabilities computed from the last relevant GRU output."""
        # Build the recurrent network on the tensor handed to the
        # constructor (not on a module-level name), unrolled only up to
        # each sequence's true length.
        output, _ = tf.nn.dynamic_rnn(
            tf.contrib.rnn.GRUCell(self._num_hidden),
            self.data,
            dtype=tf.float32,
            sequence_length=self.length,
        )
        last = self._last_relevant(output, self.length)
        # Softmax layer
        weight, bias = self._weight_and_bias(
            self._num_hidden, int(self.target.get_shape()[1]))
        prediction = tf.nn.softmax(tf.matmul(last, weight) + bias)
        return prediction

    @lazy_property
    def cost(self):
        """Cross-entropy between the targets and the predictions, summed over the batch."""
        # Clip before the log: a softmax output that underflows to exactly 0
        # would otherwise give log(0) = -inf and a NaN cost.
        clipped = tf.clip_by_value(self.prediction, 1e-10, 1.0)
        cross_entropy = -tf.reduce_sum(self.target * tf.log(clipped))
        return cross_entropy

    @lazy_property
    def optimize(self):
        """Training op: one RMSProp step minimising ``cost``."""
        learning_rate = 0.003
        optimizer = tf.train.RMSPropOptimizer(learning_rate)
        return optimizer.minimize(self.cost)

    @lazy_property
    def error(self):
        """Fraction of examples whose predicted class differs from the target class."""
        mistakes = tf.not_equal(
            tf.argmax(self.target, 1), tf.argmax(self.prediction, 1))
        return tf.reduce_mean(tf.cast(mistakes, tf.float32))

    @staticmethod
    def _weight_and_bias(in_size, out_size):
        """Create the weight matrix and bias vector variables for a dense layer."""
        weight = tf.truncated_normal([in_size, out_size], stddev=0.01)
        bias = tf.constant(0.1, shape=[out_size])
        return tf.Variable(weight), tf.Variable(bias)

    @staticmethod
    def _last_relevant(output, length):
        """Select, for every example, the RNN output at its last non-padding step."""
        batch_size = tf.shape(output)[0]
        max_length = int(output.get_shape()[1])
        output_size = int(output.get_shape()[2])
        # Clamp at 0 so an all-padding sequence (length 0) reads its own
        # first step instead of producing index -1 / the previous example.
        last_step = tf.maximum(length - 1, 0)
        # Flatten [batch, time, feature] to [batch * time, feature] and
        # gather row (example * max_length + last_step) for each example.
        index = tf.range(0, batch_size) * max_length + last_step
        flat = tf.reshape(output, [-1, output_size])
        relevant = tf.gather(flat, index)
        return relevant


if __name__ == '__main__':
    # Each note is a sequence of sentence vectors: the training array is
    # indexed as [example, sentence position, vector component].
    examples, max_length, vec_size = X_train.shape
    num_classes = y_train.shape[1]
    batch_size = 10
    data = tf.placeholder(tf.float32, [None, max_length, vec_size])
    target = tf.placeholder(tf.float32, [None, num_classes])
    # NOTE: this rebinds `model`, which earlier in the notebook refers to the
    # loaded Doc2Vec model.
    model = VariableSequenceClassification(data, target)
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    for epoch in range(10):
        # One pass over the training set in consecutive mini-batches; the
        # final batch may be smaller than batch_size.
        for start in range(0, examples, batch_size):
            stop = start + batch_size
            batch_X = X_train[start:stop]
            batch_y = y_train[start:stop]
            feed = {data: batch_X, target: batch_y}
            sess.run(model.optimize, feed)
        # Report the error on the held-out test set after every epoch.
        error = sess.run(model.error, {data: X_test, target: y_test})
        print('Epoch {:2d} error {:3.1f}%'.format(epoch + 1, 100 * error))
        


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch  1 error 31.0%
Epoch  2 error 8.0%
Epoch  3 error 8.0%
Epoch  4 error 7.0%
Epoch  5 error 6.0%
Epoch  6 error 4.0%
Epoch  7 error 4.0%
Epoch  8 error 5.0%
Epoch  9 error 5.0%
Epoch 10 error 12.0%
