In [2]:
#imports
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter

from sklearn import model_selection,preprocessing,metrics
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers


Using TensorFlow backend.


In [3]:
df = pd.read_csv("sample_data/critical-findings-sample-data.csv") #putting it into a dataframe
df

Unnamed: 0,Modality,Critical_Finding,Category,Data
0,CT,Complete Critical Finding,Significant Vascular Pathology,STUDY: CT CHEST WITH CONTRAST<br /><br />REA...
1,CT,Complete Critical Finding,Significant Vascular Pathology,STUDY: CT CHEST WITH CONTRAST<br /><br />REA...
2,CT,Complete Critical Finding,Significant Vascular Pathology,STUDY: CT CHEST WITH CONTRAST<br /><br />REA...
3,CT,Complete Critical Finding,Acute Vascular Event,HISTORY: left sided weakness<br /><br />TECHNI...
4,CT,Complete Critical Finding,Acute Vascular Event,HISTORY: left sided weakness<br /><br />TECHNI...
5,CT,Complete Critical Finding,Immediate Surgical Medical,STUDY: CT ABDOMEN AND PELVIS WITHOUT CONTRAST...
6,CT,Complete Critical Finding,Immediate Surgical Medical,STUDY: CT ABDOMEN AND PELVIS WITHOUT CONTRAST...
7,CT,Complete Critical Finding,Immediate Surgical Medical,STUDY: CT ABDOMEN AND PELVIS WITHOUT CONTRAST...
8,CR,Complete Critical Finding,Unsuspected Injury,STUDY: X-RAY CHEST<br /><br />REASON FOR EXA...
9,CR,Complete Critical Finding,Unsuspected Injury,STUDY: X-RAY CHEST<br /><br />REASON FOR EXA...


# Preprocessing

In [4]:
df['Data'] = df['Data'].astype(str).str.lower() #to lower case

df['Data'] = df['Data'].str.replace('x-ray' , 'xray')

tag_replace = '<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6})' #html tags replace
df['Data'] = df['Data'].str.replace(tag_replace, ' ') 

pattern = '[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]'
df['Data'] = df['Data'].str.replace(pattern, ' ')  #replacing punctuations with whitespace          

df['Data'] = df['Data'].replace('\d+', '', regex=True) #removes digits

df['Data'] = df['Data'].replace('\s+', ' ', regex=True)
#replaces many spaces with a single space

df['Data']


0      study ct chest with contrast reason for exam m...
1      study ct chest with contrast reason for exam m...
2      study ct chest with contrast reason for exam m...
3      history left sided weakness technique ct head ...
4      history left sided weakness technique ct head ...
5      study ct abdomen and pelvis without contrast r...
6      study ct abdomen and pelvis without contrast r...
7      study ct abdomen and pelvis without contrast r...
8      study xray chest reason for exam male years ol...
9      study xray chest reason for exam male years ol...
10     study xray chest reason for exam male years ol...
11     study cta chest reason for exam male years old...
12     study cta chest reason for exam male years old...
13     study cta chest reason for exam male years old...
14     study ct abdomen and pelvis with contrast reas...
15     study ct abdomen and pelvis with contrast reas...
16     study ct abdomen and pelvis with contrast reas...
17     study ct brain without c

In [5]:
# load the pre-trained word-embedding vectors 
embeddings_index = {}
for i, line in enumerate(open('glove.txt','r',encoding = 'utf-8')):
    values = line.split()
    embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')



In [6]:
# create a tokenizer 
token = text.Tokenizer()
token.fit_on_texts(df['Data'])
word_index = token.word_index

# split the dataset into training and validation datasets 
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(df['Data'], df['Category'])

# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)


In [7]:


# convert text to sequence of tokens and pad them to ensure equal length vectors 
train_seq_x = sequence.pad_sequences(token.texts_to_sequences(train_x), maxlen=737)
valid_seq_x = sequence.pad_sequences(token.texts_to_sequences(valid_x), maxlen=737)



In [8]:
# create token-embedding mapping
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [9]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return metrics.accuracy_score(predictions, valid_y)

In [10]:
def create_cnn():
    # Add an Input Layer
    input_layer = layers.Input((737, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add the convolutional Layer
    conv_layer = layers.Convolution1D(100, 3, activation="relu")(embedding_layer)

    # Add the pooling Layer
    pooling_layer = layers.GlobalMaxPool1D()(conv_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(pooling_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    
    return model

classifier = create_cnn()
accuracy = train_model(classifier, train_seq_x, train_y, valid_seq_x, is_neural_net=True)
print ("CNN, Word Embeddings",  accuracy)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/1
CNN, Word Embeddings 0.224


In [11]:
def create_rnn_lstm():
    # Add an Input Layer
    input_layer = layers.Input((737, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add the LSTM Layer
    lstm_layer = layers.LSTM(100)(embedding_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(lstm_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    
    return model

classifier = create_rnn_lstm()
accuracy = train_model(classifier, train_seq_x, train_y, valid_seq_x, is_neural_net=True)
print ("RNN-LSTM, Word Embeddings",  accuracy)

Epoch 1/1
RNN-LSTM, Word Embeddings 0.224
