In [1]:
#imports
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter

from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import models
from keras import layers
from keras import regularizers

from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

Using TensorFlow backend.


In [2]:
#reading critical findings data
findings = pd.read_csv("sample_data/critical-findings-sample-data.csv",encoding = 'utf-8')

In [4]:
#converts the data column to lower case
findings['Data'] = findings['Data'].astype(str).str.lower()

#since xray is found in glove replacing x-ray with xray
findings['Data'] = findings['Data'].str.replace('x-ray','xray') 

#html tags replace
tag_replace = '<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6})' 
findings['Data'] = findings['Data'].str.replace(tag_replace, ' ') 

#replacing punctuations with whitespace
pattern = '[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]'
findings['Data'] = findings['Data'].str.replace(pattern, ' ')            

#numbers removal
findings['Data'] = findings['Data'].replace('\d+', '', regex=True) 

#replaces many spaces with a single space
findings['Data'] = findings['Data'].replace('\s+', ' ', regex=True)

findings['Data'][0]

'study ct chest with contrast reason for exam male years old chest pain and esophageal dilation radiation dosage if supplied by facility ctdivol mgy dlp mgycm technique transaxial imaging was performed following intravenous administration of ml ml of isovue contrast material individualized dose optimization techniques were used for this ct comparison none findings the lungs are normal there is no demonstrated pleural abnormality normal heart and pericardium normal mediastinum normal hilar regions pulmonary emboli are present in right lower lobe arterial branches normal aorta arch and descending thoracic aorta normal osseous structures there is no demonstrated abnormality of the visualized upper abdomen impression pulmonary emboli are present in right lower lobe arterial branches no evidence of acute pulmonary or mediastinal pathology '

In [5]:
#Taking out the feature and label columns
final_df = findings[['Data', 'Category']]

In [6]:
#splitting the dataset into train dataset and test dataset
X_train, X_test, y_train, y_test = train_test_split(final_df.Data, final_df.Category, test_size=0.3, random_state=37)
print('# Train data samples:', X_train.shape[0])
print('# Test data samples:', X_test.shape[0])
assert X_train.shape[0] == y_train.shape[0]
assert X_test.shape[0] == y_test.shape[0]

# Train data samples: 350
# Test data samples: 150


In [7]:
#max words to be put into the dictionary
NB_WORDS = 3200

#keras tokenizer for tokenizing then data
tk = Tokenizer(num_words=NB_WORDS,
               filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
               lower=True,
               split=" ")

tk.fit_on_texts(X_train)


#converting the texts to sequences
X_train_seq = tk.texts_to_sequences(X_train)
X_test_seq = tk.texts_to_sequences(X_test)

X_train_seq

[[277,
  1027,
  2100,
  2101,
  156,
  2102,
  401,
  24,
  15,
  254,
  21,
  66,
  550,
  831,
  16,
  9,
  42,
  43,
  24,
  39,
  44,
  1096,
  1,
  124,
  2,
  38,
  497,
  420,
  28,
  57,
  19,
  10,
  50,
  149,
  85,
  21,
  213,
  361,
  932,
  181,
  217,
  10,
  49,
  21,
  49,
  269,
  3,
  60,
  350,
  144,
  145,
  933,
  3,
  551,
  2103,
  430,
  289,
  2104,
  238,
  431,
  383,
  5,
  2105,
  300,
  165,
  324,
  237,
  190,
  5,
  393,
  131,
  460,
  11,
  70,
  25,
  10,
  50,
  149,
  19,
  73,
  934,
  2106,
  21,
  141,
  498,
  701,
  1223,
  366,
  185,
  301,
  113,
  78,
  7,
  182,
  66,
  51,
  42,
  43,
  52,
  20,
  44,
  12,
  26,
  15],
 [27,
  310,
  53,
  36,
  12,
  34,
  101,
  41,
  37,
  602,
  2,
  603,
  65,
  96,
  73,
  97,
  30,
  98,
  116,
  117,
  102,
  105,
  24,
  1,
  247,
  39,
  67,
  7,
  1,
  118,
  134,
  2,
  129,
  2,
  362,
  16,
  132,
  155,
  432,
  2,
  1,
  433,
  38,
  39,
  67,
  7,
  284,
  434,
  5,
  363,
  435,
  

In [8]:
#Finding out the maximum sequence length 
seq_lengths = X_train.apply(lambda x: len(x.split(' ')))
seq_lengths.describe()

count    350.000000
mean     205.834286
std      106.223643
min        6.000000
25%      127.000000
50%      181.000000
75%      260.750000
max      655.000000
Name: Data, dtype: float64

In [9]:
#since the maximum size of a sequence is 737, padding all other sequnces to change them to equal length
MAX_LEN = 655
X_train_seq_trunc = pad_sequences(X_train_seq, maxlen=MAX_LEN)
X_test_seq_trunc = pad_sequences(X_test_seq, maxlen=MAX_LEN)

In [10]:
X_train_seq_trunc[10]  

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [11]:
#encoding the label into numbers
le = LabelEncoder()
y_train_le = le.fit_transform(y_train)
y_test_le = le.transform(y_test)
y_train_oh = to_categorical(y_train_le)
y_test_oh = to_categorical(y_test_le)


In [12]:
#splitting the modified features and labels into train and test 
X_train_emb, X_valid_emb, y_train_emb, y_valid_emb = train_test_split(X_train_seq_trunc, y_train_oh, test_size=0.1, random_state=37)

assert X_valid_emb.shape[0] == y_valid_emb.shape[0]
assert X_train_emb.shape[0] == y_train_emb.shape[0]

print('Shape of validation set:',X_valid_emb.shape)

Shape of validation set: (35, 655)


In [13]:
#reading glove file and putting the words and their embeddings into  dictionary
glove_file = 'glove.txt'
emb_dict = {}
glove = open(glove_file,'r',encoding="utf8")
for line in glove:
    values = line.split()
    word = values[0]
    vector = np.asarray(values[1:], dtype='float32')
    emb_dict[word] = vector
glove.close()

In [14]:
GLOVE_DIM = 300
emb_matrix = np.zeros((NB_WORDS, GLOVE_DIM))

for w, i in tk.word_index.items():
    # The word_index contains a token for all words of the training data so we need to limit that
    if i < NB_WORDS:
        vect = emb_dict.get(w)
        # Check if the word from the training data occurs in the GloVe word embeddings
        # Otherwise the vector is kept with only zeros
        if vect is not None:
            emb_matrix[i] = vect
    else:
        break

In [15]:
#creating sequential model
glove_model = models.Sequential()
#adding embedding layer
glove_model.add(layers.Embedding(NB_WORDS, GLOVE_DIM, input_length=MAX_LEN))
#flattening the embedding layer
glove_model.add(layers.Flatten())
#Adding dense layer
glove_model.add(layers.Dense(7, activation='softmax'))
glove_model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 655, 300)          960000    
_________________________________________________________________
flatten_1 (Flatten)          (None, 196500)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 7)                 1375507   
Total params: 2,335,507
Trainable params: 2,335,507
Non-trainable params: 0
_________________________________________________________________


In [16]:
#adding the embeddign matrix calculated from the corpus
glove_model.layers[0].set_weights([emb_matrix])
glove_model.layers[0].trainable = False

In [17]:
#function for compiling and fitting the model provided
def deep_model(model, X_train, y_train, X_valid, y_valid):
    '''
    Function to train a multi-class model.
    
    Parameters:
        model : model with the chosen architecture
        X_train : training features
        y_train : training target
        X_valid : validation features
        Y_valid : validation target
    Output:
        model training history
    '''
    model.compile(optimizer='rmsprop'
                  , loss='categorical_crossentropy'
                  , metrics=['accuracy'])
    
    history = model.fit(X_train
                       , y_train
                       , epochs=NB_START_EPOCHS
                       , batch_size=BATCH_SIZE
                       , validation_data=(X_valid, y_valid)
                       , verbose=0)
    return history,model

In [18]:
# VAL_SIZE = 1000  # Size of the validation set
NB_START_EPOCHS = 50 # Number of epochs we usually start to train with
BATCH_SIZE = 512
glove_history, model_glove = deep_model(glove_model, X_train_emb, y_train_emb, X_valid_emb, y_valid_emb)
glove_history.history['acc'][-1]
#printing the validation accuracy

Instructions for updating:
Use tf.cast instead.


1.0

In [19]:
print(model_glove.evaluate(X_valid_emb,y_valid_emb)[1])

0.9142857142857143
