# LSTM UPDATES


In [0]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional

## Glove setup

In [6]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip

--2020-05-11 18:43:54--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2020-05-11 18:43:54--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2020-05-11 18:43:54--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip.1’


2020

In [4]:
!wget https://ndownloader.figshare.com/files/10959626?private_link=00d69861786cd0156d81 -O cui2vec.zip
###need to rename zip
!unzip cui*.zip

Archive:  cui2vec.zip
  inflating: cui2vec_pretrained.csv  
   creating: __MACOSX/
  inflating: __MACOSX/._cui2vec_pretrained.csv  


In [0]:
import numpy as np

# Parse the GloVe text file into {token: float32 vector}.
# Each line is "<token> <v1> <v2> ...". The file is read once, line by line, with an
# explicit UTF-8 encoding, and the `with` block closes it even if parsing fails.
word_embeddings = {}
first_row = None  # raw text of the 9th line (index 8), kept as a quick sample row
with open('glove.6B.300d.txt', encoding='utf-8') as f:
    for line_number, line in enumerate(f):
        if line_number == 8:
            first_row = line.rstrip('\n')
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs

In [0]:
import glob
import pandas as pd

# Load every annotated document (*_cui.csv in the working directory) into `li`.
# glob returns paths in arbitrary order; sorting makes the order reproducible,
# which matters because later cells hold out the LAST 100 frames of `li` as test data.
all_files = sorted(glob.glob("*_cui.csv"))
li = []
for filename in all_files:
    df = pd.read_csv(filename)
    li.append(df)


In [0]:
# turn a label to a vector
def label_vec(string):
  """Return the one-hot vector (length 10) for a label name.

  A value that is not one of the ten known label names yields an all-zero vector.
  """
  categories = ["Other","Movement","Meds_Treatments","Procedures_Results",
                "Vitals_Labs","Symptoms_Signs", "ProcedureHistory","MedicationHistory",
                "DiagnosisHistory","Demographics"]
  one_hot = np.zeros(len(categories))

  # Mark the slot of the first (and only) matching category, if there is one.
  for position, category in enumerate(categories):
    if category == string:
      one_hot[position] = 1
      break
  return one_hot

In [0]:
# turn a vector to a label
def vec_label(vec):
  """Return the label name at the arg-max position of a score vector.

  Positions beyond the ten known labels map to "Other".
  """
  names = ["Other","Movement","Meds_Treatments","Procedures_Results",
           "Vitals_Labs","Symptoms_Signs", "ProcedureHistory","MedicationHistory",
           "DiagnosisHistory","Demographics"]
  best = np.argmax(vec)
  if best >= len(names):
    return "Other"
  return names[best]

In [0]:
# Row count of the longest document; shorter documents are padded up to this length.
max_length = max(map(len, li))

# MODEL USING MASKING AND JUST CUI2VEC
Accuracy on new data: ~0.64.

The code cell below can be run directly.

In [0]:
print(len(word_embeddings["is"]))
print(len(word_embeddings["that"]))
import numpy as np

# Parse cui2vec_pretrained.csv into {CUI: float32 vector of length 500}.
# Rows are split on commas; the first field is the CUI wrapped in one extra character
# on each side (stripped below). Rows whose first field is too short, or whose
# identifier does not start with "C", are skipped.
cui_embeddings = {}
with open('cui2vec_pretrained.csv', encoding='utf-8') as f:
    for line_number, line in enumerate(f, start=1):
        values = line.split(',')
        cui = values[0]
        if len(cui) <= 2:
            continue
        cui = cui[1:-1]
        if cui[0] != "C":
            continue
        vector = values[1:]
        if len(vector) != 500:
            # Fail loudly and say where: a short or long row means the file is not
            # the expected 500-dimensional cui2vec table.
            raise ValueError(
                "cui2vec_pretrained.csv line %d (%s): expected 500 values, found %d"
                % (line_number, cui, len(vector)))
        cui_embeddings[cui] = np.asarray(vector, dtype='float32')

In [0]:
from keras.utils import Sequence
import math
class medicalData(Sequence):
    """Keras ``Sequence`` that serves ``(x, y)`` batches cut from two parallel collections.

    Args:
        x_set: Indexable, sliceable collection of inputs.
        y_set: Targets aligned element-for-element with ``x_set``.
        batch_size: Number of samples per batch (default 1).
    """

    def __init__(self, x_set, y_set, batch_size=1):
        self.x, self.y = x_set, y_set
        self.batch_size = batch_size

    def __len__(self):
        # Number of batches; the last one may hold fewer than batch_size samples.
        return math.ceil(len(self.x) / self.batch_size)

    def __getitem__(self, idx):
        # idx counts batches, not samples: return the idx-th slice of batch_size
        # samples so the result keeps its leading batch dimension and no sample
        # beyond index len(self) - 1 is silently skipped when batch_size > 1.
        start = idx * self.batch_size
        stop = start + self.batch_size
        return np.array(self.x[start:stop]), np.array(self.y[start:stop])

def get_words_labels_from_df(df, max_length=None):
  """Turn one annotated document into padded cui2vec features and one-hot labels.

  Args:
    df: DataFrame with 'labels' and 'cui' columns, one row per token.
    max_length: Number of time steps to produce. Longer documents are truncated;
      shorter ones are padded with the CUI ' ' and the label 'Other'.
      Defaults to the length of ``df`` itself (no padding).

  Returns:
    (features, labels) with shapes (1, max_length, 500) and (1, max_length, 10).
    Tokens whose CUI is not in ``cui_embeddings`` (padding included) get an
    all-zero feature vector.
  """
  # Resolve the default per call. A default of len(df) in the signature would be
  # evaluated once, at definition time, against whatever global `df` happened to exist.
  if max_length is None:
    max_length = len(df)
  pad = max(max_length - len(df), 0)

  labels = np.concatenate([df['labels'][0:max_length], ['Other'] * pad], axis=0)
  cuis = np.concatenate([df['cui'][0:max_length], [' '] * pad], axis=0)

  words_vec = [cui_embeddings.get(cui, np.zeros(500)) for cui in cuis]
  labels_vec = [label_vec(l) for l in labels]

  # Add the leading batch axis expected by the model.
  words_vec2 = np.array(words_vec).reshape(1, len(words_vec), len(words_vec[0]))
  labels_vec2 = np.array(labels_vec).reshape(1, len(labels_vec), len(labels_vec[0]))

  return words_vec2, labels_vec2


def eval():
  """Score the global ``model`` on the 100 held-out documents, then save it.

  Reads the globals ``model``, ``w_batches``, ``l_batches``, ``li`` and
  ``cui_embeddings``. Prints two numbers:
    * the mean per-document fraction of time steps whose predicted label matches
      the reference label (padded steps included);
    * the fraction of correct predictions among tokens whose CUI has a cui2vec vector.

  Note: this function shadows Python's built-in ``eval`` for the rest of the notebook.
  """
  print("NOW EVALUATING......")
  num_test = 100
  final_avg = 0
  total = 0
  correct = 0
  for i in range(num_test):
    # The held-out documents are the last num_test entries (training uses [:-100]),
    # so walk indices -1 .. -num_test. Indexing with -1*i would start at 0, which is
    # a training document, and would never reach -num_test.
    doc = -(i + 1)
    sample = w_batches[doc]
    preds = model.predict(np.reshape(sample, [1] + list(sample.shape)),
                          batch_size=1)

    predicted_labels = np.array([vec_label(p) for p in preds[0]])
    actual_labels = np.array([vec_label(t) for t in l_batches[doc]])

    final_avg += np.sum(actual_labels == predicted_labels) / len(actual_labels)

    doc_cuis = li[doc]['cui']
    for j in range(len(doc_cuis)):
      if doc_cuis[j] in cui_embeddings:
        total += 1
        if actual_labels[j] == predicted_labels[j]:
          correct += 1

  # Guard against a test set in which no token has a known CUI.
  filtered_avg = correct / total if total else 0.0
  print("Average of Everything: ", final_avg/num_test)
  print("Average predicted CUI Terms correct", filtered_avg)
  model.save('masking_justcui_25ep')

# Build the padded dataset: one (1, max_length, 500) feature block and one
# (1, max_length, 10) label block per document. The blocks are collected in lists
# and stacked once; concatenating inside the loop copies the growing array on
# every iteration.
feature_blocks = []
label_blocks = []
for df in li:
  wv, lv = get_words_labels_from_df(df, max_length)
  feature_blocks.append(wv)
  label_blocks.append(lv)
w_batches = np.concatenate(feature_blocks, axis=0)
l_batches = np.concatenate(label_blocks, axis=0)
print(w_batches.shape, l_batches.shape)



from keras.layers import Masking

# All-zero time steps (padding and tokens without a cui2vec vector) are masked out.
special_value = np.zeros(500)
model = Sequential()
model.add(Masking(mask_value=special_value, input_shape=(None, 500)))
model.add(Bidirectional(LSTM(50, return_sequences=True)))
model.add(Dense(10, activation='softmax'))

model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

ds = medicalData(w_batches, l_batches)
print(np.array(w_batches).shape)
print(np.array(l_batches).shape)
model.summary(90)
# Train one epoch at a time on everything except the last 100 documents and
# evaluate on those 100 after every epoch.
for i in range(50):
  print("epoch ", i)
  model.fit(w_batches[:-100], l_batches[:-100], batch_size=4, epochs=1)
  eval()

# from keras import models
# model = models.load_model('masking_justcui_25ep')






In [18]:
# NOTE(review): in the code visible in this notebook, final_avg is only ever assigned
# as a local variable inside eval(), so this cell depends on a global of that name
# left over from an earlier kernel session; on a fresh kernel it raises NameError.
print(final_avg/100)

0.13329140461215938


# Model with Masking and cui2vec + GloVe
used glove 300d and padded the extra 200 values with 0s

Acc of ~.68 

In [0]:
from keras.utils import Sequence
import math
class medicalData(Sequence):
    """Keras ``Sequence`` that serves ``(x, y)`` batches cut from two parallel collections.

    Args:
        x_set: Indexable, sliceable collection of inputs.
        y_set: Targets aligned element-for-element with ``x_set``.
        batch_size: Number of samples per batch (default 1).
    """

    def __init__(self, x_set, y_set, batch_size=1):
        self.x, self.y = x_set, y_set
        self.batch_size = batch_size

    def __len__(self):
        # Number of batches; the last one may hold fewer than batch_size samples.
        return math.ceil(len(self.x) / self.batch_size)

    def __getitem__(self, idx):
        # idx counts batches, not samples: return the idx-th slice of batch_size
        # samples so the result keeps its leading batch dimension and no sample
        # beyond index len(self) - 1 is silently skipped when batch_size > 1.
        start = idx * self.batch_size
        stop = start + self.batch_size
        return np.array(self.x[start:stop]), np.array(self.y[start:stop])

def get_words_labels_from_df(df, max_length=None):
  """Turn one annotated document into padded cui2vec+GloVe features and one-hot labels.

  Args:
    df: DataFrame with 'labels', 'cui' and 'tokens' columns, one row per token.
    max_length: Number of time steps to produce. Longer documents are truncated;
      shorter ones are padded with the CUI ' ', the token ' ' and the label 'Other'.
      Defaults to the length of ``df`` itself (no padding).

  Returns:
    (features, labels) with shapes (1, max_length, 800) and (1, max_length, 10).
    Each feature vector is the 500-d cui2vec vector followed by the 300-d GloVe
    vector; either part is all zeros when its lookup misses.
  """
  # Resolve the default per call. A default of len(df) in the signature would be
  # evaluated once, at definition time, against whatever global `df` happened to exist.
  if max_length is None:
    max_length = len(df)
  pad = max(max_length - len(df), 0)

  labels = np.concatenate([df['labels'][0:max_length], ['Other'] * pad], axis=0)
  cuis = np.concatenate([df['cui'][0:max_length], [' '] * pad], axis=0)
  words = np.concatenate([df['tokens'][0:max_length], [' '] * pad], axis=0)

  words_vec = [
      np.concatenate([cui_embeddings.get(cui, np.zeros(500)),
                      word_embeddings.get(word, np.zeros(300))])
      for cui, word in zip(cuis, words)
  ]
  labels_vec = [label_vec(l) for l in labels]

  # Add the leading batch axis expected by the model.
  words_vec2 = np.array(words_vec).reshape(1, len(words_vec), len(words_vec[0]))
  labels_vec2 = np.array(labels_vec).reshape(1, len(labels_vec), len(labels_vec[0]))

  return words_vec2, labels_vec2

def eval():
  """Score the global ``model`` on the 100 held-out documents, then save it.

  Reads the globals ``model``, ``w_batches``, ``l_batches``, ``li`` and
  ``cui_embeddings``. Prints two numbers:
    * the mean per-document fraction of time steps whose predicted label matches
      the reference label (padded steps included);
    * the fraction of correct predictions among tokens whose CUI has a cui2vec vector.

  Note: this function shadows Python's built-in ``eval`` for the rest of the notebook.
  """
  print("NOW EVALUATING......")
  num_test = 100
  final_avg = 0
  total = 0
  correct = 0
  for i in range(num_test):
    # The held-out documents are the last num_test entries (training uses [:-100]),
    # so walk indices -1 .. -num_test. Indexing with -1*i would start at 0, which is
    # a training document, and would never reach -num_test.
    doc = -(i + 1)
    sample = w_batches[doc]
    preds = model.predict(np.reshape(sample, [1] + list(sample.shape)),
                          batch_size=1)

    predicted_labels = np.array([vec_label(p) for p in preds[0]])
    actual_labels = np.array([vec_label(t) for t in l_batches[doc]])

    final_avg += np.sum(actual_labels == predicted_labels) / len(actual_labels)

    doc_cuis = li[doc]['cui']
    for j in range(len(doc_cuis)):
      if doc_cuis[j] in cui_embeddings:
        total += 1
        if actual_labels[j] == predicted_labels[j]:
          correct += 1

  # Guard against a test set in which no token has a known CUI.
  filtered_avg = correct / total if total else 0.0
  print("Average of Everything: ", final_avg/num_test)
  print("Average predicted CUI Terms correct", filtered_avg)
  model.save('masking_glovecui_25ep')


# Build the padded dataset: one (1, max_length, 800) feature block and one
# (1, max_length, 10) label block per document. The blocks are collected in lists
# and stacked once; concatenating inside the loop copies the growing array on
# every iteration.
feature_blocks = []
label_blocks = []
for df in li:
  wv, lv = get_words_labels_from_df(df, max_length)
  feature_blocks.append(wv)
  label_blocks.append(lv)
w_batches = np.concatenate(feature_blocks, axis=0)
l_batches = np.concatenate(label_blocks, axis=0)
print(w_batches.shape, l_batches.shape)



from keras.layers import Masking

# All-zero time steps (padding and tokens with neither a cui2vec nor a GloVe
# vector) are masked out.
special_value = np.zeros(800)
model = Sequential()
model.add(Masking(mask_value=special_value, input_shape=(None, 800)))
model.add(Bidirectional(LSTM(50, return_sequences=True)))
model.add(Dense(10, activation='softmax'))

model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

ds = medicalData(w_batches, l_batches)
print(np.array(w_batches).shape)
print(np.array(l_batches).shape)
model.summary(90)
# Train one epoch at a time on everything except the last 100 documents and
# evaluate on those 100 after every epoch.
for i in range(50):
  print("epoch ", i)
  model.fit(w_batches[:-100], l_batches[:-100], batch_size=4, epochs=1)
  eval()




# Model with Variable-length inputs
can just directly run the code block below



In [0]:
def get_words_labels_from_df(df, max_length=None):
  """Turn one annotated document into unpadded cui2vec+GloVe features and one-hot labels.

  Args:
    df: DataFrame with 'labels', 'cui' and 'tokens' columns, one row per token.
    max_length: Accepted so the call sites match the padded variants of this
      function; it is not used here because documents keep their own length.

  Returns:
    (features, labels) with shapes (1, len(df), 800) and (1, len(df), 10).
    Each feature vector is the 500-d cui2vec vector followed by the 300-d GloVe
    vector; either part is all zeros when its lookup misses.
  """
  # max_length defaults to None rather than len(df): a len(df) default would be
  # evaluated at definition time against whatever global `df` happened to exist.
  labels = df['labels']
  cuis = df['cui']
  words = df['tokens']

  # Iterate over the column values directly so the pairing does not depend on the
  # frame having a 0..n-1 index.
  words_vec = [
      np.concatenate([cui_embeddings.get(cui, np.zeros(500)),
                      word_embeddings.get(word, np.zeros(300))])
      for cui, word in zip(cuis, words)
  ]
  labels_vec = [label_vec(l) for l in labels]

  # Add the leading batch axis expected by the model.
  words_vec2 = np.array(words_vec).reshape(1, len(words_vec), len(words_vec[0]))
  labels_vec2 = np.array(labels_vec).reshape(1, len(labels_vec), len(labels_vec[0]))

  return words_vec2, labels_vec2

def eval():
  """Score the global ``model`` on the 100 held-out variable-length documents.

  Reads the globals ``model``, ``variable_w``, ``variable_l``, ``li`` and
  ``cui_embeddings``. Prints two numbers:
    * the mean per-document fraction of tokens whose predicted label matches the
      reference label;
    * the fraction of correct predictions among tokens whose CUI has a cui2vec vector.

  Note: this function shadows Python's built-in ``eval`` for the rest of the notebook.
  """
  print("NOW EVALUATING......")
  num_test = 100
  final_avg = 0
  total = 0
  correct = 0
  for i in range(num_test):
    # The held-out documents are the last num_test entries (training uses [:-100]),
    # so walk indices -1 .. -num_test. Indexing with -1*i would start at 0, which is
    # a training document, and would never reach -num_test.
    doc = -(i + 1)
    # Score the unpadded per-document arrays this cell trains on. Each entry of
    # variable_w already has shape (1, tokens, 800); the padded w_batches/l_batches
    # belong to the masking cells and may not exist when this cell is run alone.
    preds = model.predict(variable_w[doc], batch_size=1)

    predicted_labels = np.array([vec_label(p) for p in preds[0]])
    actual_labels = np.array([vec_label(t) for t in variable_l[doc][0]])

    final_avg += np.sum(actual_labels == predicted_labels) / len(actual_labels)

    doc_cuis = li[doc]['cui']
    for j in range(len(doc_cuis)):
      if doc_cuis[j] in cui_embeddings:
        total += 1
        if actual_labels[j] == predicted_labels[j]:
          correct += 1

  # Guard against a test set in which no token has a known CUI.
  filtered_avg = correct / total if total else 0.0
  print("Average of Everything: ", final_avg/num_test)
  print("Average predicted CUI Terms correct", filtered_avg)
  # model.save('varinp_glovecui_25ep')

# One (1, tokens, 800) feature array and one (1, tokens, 10) label array per
# document; lengths differ between documents, so they stay in Python lists.
variable_w = []
variable_l = []
for df in li:
  wv, lv = get_words_labels_from_df(df, max_length)
  variable_w.append(wv)
  variable_l.append(lv)



from keras.models import Sequential
from keras.layers import LSTM, Dense, TimeDistributed
from keras.utils import to_categorical
import numpy as np

model = Sequential()

model.add(LSTM(50, return_sequences=True, input_shape=(None, 800)))
model.add(Dense(10, activation='softmax'))

# summary() prints the table itself; wrapping it in print() only adds a "None" line.
model.summary(90)

model.compile(loss='categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])

def train_generator(variable_w, variable_l):
    """Yield (features, labels) for one document at a time, cycling forever."""
    i=0
    length = len(variable_w)
    while True:
        x_train, y_train = np.array(variable_w[i % length]), np.array(variable_l[i % length])
        i+= 1
        yield x_train, y_train

# Hold out the last 100 documents for eval().
train_w, train_l = variable_w[:-100], variable_l[:-100]
for epoch in range(50):
  print("epoch ", epoch)
  # One step per training document. Deriving steps_per_epoch from the full list
  # would make the generator wrap around and repeat the first documents each epoch.
  model.fit_generator(train_generator(train_w, train_l), steps_per_epoch=len(train_w), epochs=1, verbose=1)
  eval()




# Model with Just Glove

In [0]:
from keras.utils import Sequence
import math
class medicalData(Sequence):
    """Keras ``Sequence`` that serves ``(x, y)`` batches cut from two parallel collections.

    Args:
        x_set: Indexable, sliceable collection of inputs.
        y_set: Targets aligned element-for-element with ``x_set``.
        batch_size: Number of samples per batch (default 1).
    """

    def __init__(self, x_set, y_set, batch_size=1):
        self.x, self.y = x_set, y_set
        self.batch_size = batch_size

    def __len__(self):
        # Number of batches; the last one may hold fewer than batch_size samples.
        return math.ceil(len(self.x) / self.batch_size)

    def __getitem__(self, idx):
        # idx counts batches, not samples: return the idx-th slice of batch_size
        # samples so the result keeps its leading batch dimension and no sample
        # beyond index len(self) - 1 is silently skipped when batch_size > 1.
        start = idx * self.batch_size
        stop = start + self.batch_size
        return np.array(self.x[start:stop]), np.array(self.y[start:stop])

def get_words_labels_from_df(df, max_length=None, embedding_dim=500):
  """Turn one annotated document into padded GloVe features and one-hot labels.

  Args:
    df: DataFrame with 'labels' and 'tokens' columns, one row per token.
    max_length: Number of time steps to produce. Longer documents are truncated;
      shorter ones are padded with the token ' ' and the label 'Other'.
      Defaults to the length of ``df`` itself (no padding).
    embedding_dim: Width of every feature vector (default 500, the input width of
      the model built below). GloVe vectors shorter than this are right-padded
      with zeros; longer ones are truncated.

  Returns:
    (features, labels) with shapes (1, max_length, embedding_dim) and
    (1, max_length, 10). Tokens missing from ``word_embeddings`` (padding
    included) get an all-zero feature vector.
  """
  # Resolve the default per call. A default of len(df) in the signature would be
  # evaluated once, at definition time, against whatever global `df` happened to exist.
  if max_length is None:
    max_length = len(df)
  pad = max(max_length - len(df), 0)

  labels = np.concatenate([df['labels'][0:max_length], ['Other'] * pad], axis=0)
  words = np.concatenate([df['tokens'][0:max_length], [' '] * pad], axis=0)

  # word_embeddings is loaded from glove.6B.300d.txt, so known tokens have shorter
  # vectors than the 500-wide zero fallback. Every vector is written into a zero
  # buffer of one common width so the list stacks into a rectangular array.
  # NOTE(review): zero-padding GloVe up to 500 is assumed from the 500-wide model
  # input below — confirm this is the intended feature layout.
  words_vec = []
  for token in words:
    padded = np.zeros(embedding_dim)
    vector = word_embeddings.get(token)
    if vector is not None:
      width = min(len(vector), embedding_dim)
      padded[:width] = vector[:width]
    words_vec.append(padded)

  labels_vec = [label_vec(l) for l in labels]

  # Add the leading batch axis expected by the model.
  words_vec2 = np.array(words_vec).reshape(1, len(words_vec), len(words_vec[0]))
  labels_vec2 = np.array(labels_vec).reshape(1, len(labels_vec), len(labels_vec[0]))

  return words_vec2, labels_vec2


def eval():
  """Score the global ``model`` on the 100 held-out documents, then save it.

  Reads the globals ``model``, ``w_batches``, ``l_batches``, ``li`` and
  ``cui_embeddings``. Prints two numbers:
    * the mean per-document fraction of time steps whose predicted label matches
      the reference label (padded steps included);
    * the fraction of correct predictions among tokens whose CUI has a cui2vec vector.

  Note: this function shadows Python's built-in ``eval`` for the rest of the notebook.
  """
  print("NOW EVALUATING......")
  num_test = 100
  final_avg = 0
  total = 0
  correct = 0
  for i in range(num_test):
    # The held-out documents are the last num_test entries (training uses [:-100]),
    # so walk indices -1 .. -num_test. Indexing with -1*i would start at 0, which is
    # a training document, and would never reach -num_test.
    doc = -(i + 1)
    sample = w_batches[doc]
    preds = model.predict(np.reshape(sample, [1] + list(sample.shape)),
                          batch_size=1)

    predicted_labels = np.array([vec_label(p) for p in preds[0]])
    actual_labels = np.array([vec_label(t) for t in l_batches[doc]])

    final_avg += np.sum(actual_labels == predicted_labels) / len(actual_labels)

    doc_cuis = li[doc]['cui']
    for j in range(len(doc_cuis)):
      if doc_cuis[j] in cui_embeddings:
        total += 1
        if actual_labels[j] == predicted_labels[j]:
          correct += 1

  # Guard against a test set in which no token has a known CUI.
  filtered_avg = correct / total if total else 0.0
  print("Average of Everything: ", final_avg/num_test)
  print("Average predicted CUI Terms correct", filtered_avg)
  model.save('masking_justglove_25ep')

# Rebuild the dataset with THIS cell's GloVe-only features. Without this step the
# model below would be fitted on whatever w_batches an earlier cell left behind
# (cui2vec-only or 800-wide cui2vec+GloVe features).
feature_blocks = []
label_blocks = []
for df in li:
  wv, lv = get_words_labels_from_df(df, max_length)
  feature_blocks.append(wv)
  label_blocks.append(lv)
w_batches = np.concatenate(feature_blocks, axis=0)
l_batches = np.concatenate(label_blocks, axis=0)
print(w_batches.shape, l_batches.shape)



from keras.layers import Masking

# All-zero time steps (padding and tokens without a GloVe vector) are masked out.
special_value = np.zeros(500)
model = Sequential()
model.add(Masking(mask_value=special_value, input_shape=(None, 500)))
model.add(Bidirectional(LSTM(50, return_sequences=True)))
model.add(Dense(10, activation='softmax'))

model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

ds = medicalData(w_batches, l_batches)
print(np.array(w_batches).shape)
print(np.array(l_batches).shape)
model.summary(90)
# Train one epoch at a time on everything except the last 100 documents and
# evaluate on those 100 after every epoch.
for i in range(50):
  print("epoch ", i)
  model.fit(w_batches[:-100], l_batches[:-100], batch_size=4, epochs=1)
  eval()

# from keras import models
# model = models.load_model('masking_justcui_25ep')