In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.layers import Input, Dense, LSTM, Dropout
from tensorflow.keras import Model
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from sklearn.utils import shuffle
import os
import json
from tqdm import tqdm
import tensorflow_datasets as tfds


In [2]:
##data
label_folder = 'drive/My Drive/Colab Notebooks/data/challenge2008/training/labels'
data_folder = 'drive/My Drive/Colab Notebooks/data/challenge2008/training/notes_cuis'

# dataset = []
# for file in os.listdir(data_folder):
#     fr = open(os.path.join(data_folder, file),'r')
#     data = fr.read()
#     dataset.append(data)
#     # print(dataset)
#     fr.close()

In [3]:
##label_read
label = np.empty([0,16], dtype = int)

for file in os.listdir(label_folder):
    with open(os.path.join(label_folder, file),'r') as file_read:
        y = np.zeros(16, dtype = int)
        for line in file_read.readlines():
            line = line.split()
            # print(line)
            if line[0] == 'intuitive':
              if line[2] == 'Asthma':
                y[0] = 0 if line[4] == 'N' else 1
              elif line[2] == 'CAD':
                y[1] = 0 if line[4] == 'N' else 1
              elif line[2] == 'CHF':
                y[2] = 0 if line[4] == 'N' else 1
              elif line[2] == 'Depression':
                y[3] = 0 if line[4] == 'N' else 1
              elif line[2] == 'Diabetes':
                y[4] = 0 if line[4] == 'N' else 1
              elif line[2] == 'GERD':
                y[5] = 0 if line[4] == 'N' else 1
              elif line[2] == 'Gallstones':
                y[6] = 0 if line[4] == 'N' else 1
              elif line[2] == 'Gout':
                y[7] = 0 if line[4] == 'N' else 1
              elif line[2] == 'Hypercholesterolemia':
                y[8] = 0 if line[4] == 'N' else 1
              elif line[2] == 'Hypertension':
                y[9] = 0 if line[4] == 'N' else 1
              elif line[2] == 'Hypertriglyceridemia':
                y[10] = 0 if line[4] == 'N' else 1
              elif line[2] == 'OA':
                y[11] = 0 if line[4] == 'N' else 1
              elif line[2] == 'OSA':
                y[12] = 0 if line[4] == 'N' else 1
              elif line[2] == 'Obesity':
                y[13] = 0 if line[4] == 'N' else 1
              elif line[2] == 'PVD':
                y[14] = 0 if line[4] == 'N' else 1
              elif line[2] == 'Venous Insufficiency':
                y[15] = 0 if line[4] == 'N' else 1
        y = np.reshape(y, (1, 16))
        # print(y)
        label = np.concatenate((label, y), axis = 0)
        # print(label.shape)


In [4]:
##map function
def labeler(example, index):
  return example, label[index]

In [5]:
##read data and combine with label
data_folder = 'drive/My Drive/Colab Notebooks/data/challenge2008/training/notes_cuis'
file_paths = []
for file in os.listdir(data_folder):
    file_paths.append(os.path.join(data_folder, file))

labeled_data_sets = []

for i, file_path in enumerate(file_paths):
  lines_dataset = tf.data.TextLineDataset(file_path)
  labeled_dataset = lines_dataset.map(lambda ex: labeler(ex, i))
  labeled_data_sets.append(labeled_dataset)

In [6]:
##create dataset
BUFFER_SIZE = 50000


all_labeled_data = labeled_data_sets[0]
for labeled_dataset in labeled_data_sets[1:]:
  all_labeled_data = all_labeled_data.concatenate(labeled_dataset)
  
all_labeled_data = all_labeled_data.shuffle(
    BUFFER_SIZE, reshuffle_each_iteration=False)

In [7]:
##test
for ex in all_labeled_data.take(5):
  print(ex)

(<tf.Tensor: shape=(), dtype=string, numpy=b'C0231589 C0004238 C0085615 C1283838 C1283838 C1283838 C1283838 C1283838 C1283838 C0034063 C0336969 C0336969 C0455458 C0455458 C0040802 C0036983 C0036983 C0036983 C0036983 C0039985 C0022660 C0022660 C0022660 C0022660 C0003864 C0003864 C0449201 C0262525 C0232202 C0232202 C0020538 C0016169 C0016169 C0024109 C0024109 C0036974 C0036974 C0036974 C0036974 C0036974 C0442874 C1305153 C0229665 C0004048 C0475371 C0232201 C0232201 C0262926 C0262926 C0262926 C0262926 C0262926 C0262926 C0262926 C0262926 C1271104 C1271104 C0373486 C0085631 C1306645 C0423772 C0423772 C0231176 C0231176 C0231176 C0231176 C1279572 C1279571 C1279571 C1279571 C1253959 C0524470 C0087111 C0035309 C0337443 C1692886 C1692886 C0003842 C0003842 C0430400 C0430400 C0024236 C0202115 C0341697 C0341697 C0341697 C0341697 C0004610 C0004610 C0202304 C0024554 C0199176 C0008679 C0040405 C0201975 C0201975 C0419008 C0043251 C0221423 C0027051 C0043250 C0234425 C0015385 C0439775 C0018792 C0011581 C

In [8]:
##get vocab for encoding
tokenizer = tfds.features.text.Tokenizer()

vocabulary_set = set()
for text_tensor, _ in all_labeled_data:
  some_tokens = tokenizer.tokenize(text_tensor.numpy())
  vocabulary_set.update(some_tokens)

vocab_size = len(vocabulary_set)
vocab_size

4973

In [9]:
##create encoder
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)


In [10]:
##test encoding
example_text = next(iter(all_labeled_data))[0].numpy()
print(example_text)
encoded_example = encoder.encode(example_text)
print(encoded_example)

b'C0231589 C0004238 C0085615 C1283838 C1283838 C1283838 C1283838 C1283838 C1283838 C0034063 C0336969 C0336969 C0455458 C0455458 C0040802 C0036983 C0036983 C0036983 C0036983 C0039985 C0022660 C0022660 C0022660 C0022660 C0003864 C0003864 C0449201 C0262525 C0232202 C0232202 C0020538 C0016169 C0016169 C0024109 C0024109 C0036974 C0036974 C0036974 C0036974 C0036974 C0442874 C1305153 C0229665 C0004048 C0475371 C0232201 C0232201 C0262926 C0262926 C0262926 C0262926 C0262926 C0262926 C0262926 C0262926 C1271104 C1271104 C0373486 C0085631 C1306645 C0423772 C0423772 C0231176 C0231176 C0231176 C0231176 C1279572 C1279571 C1279571 C1279571 C1253959 C0524470 C0087111 C0035309 C0337443 C1692886 C1692886 C0003842 C0003842 C0430400 C0430400 C0024236 C0202115 C0341697 C0341697 C0341697 C0341697 C0004610 C0004610 C0202304 C0024554 C0199176 C0008679 C0040405 C0201975 C0201975 C0419008 C0043251 C0221423 C0027051 C0043250 C0234425 C0015385 C0439775 C0018792 C0011581 C0277785 C0201989 C0424945 C1273870 C0332461

In [11]:
##encode function
def encode(text_tensor, label):
  encoded_text = encoder.encode(text_tensor.numpy())
  return encoded_text, label

In [12]:
##encode text to vector
def encode_map_fn(text, label):
  # py_func doesn't set the shape of the returned tensors.
  encoded_text, label = tf.py_function(encode, 
                                       inp=[text, label], 
                                       Tout=(tf.int64, tf.int64))

  # `tf.data.Datasets` work best if all components have a shape set
  #  so set the shapes manually: 
  encoded_text.set_shape([None])
  label.set_shape([])

  return encoded_text, label


all_encoded_data = all_labeled_data.map(encode_map_fn)

In [13]:
##test
example_text = next(iter(all_labeled_data))[0].numpy()
print(example_text)

b'C0231589 C0004238 C0085615 C1283838 C1283838 C1283838 C1283838 C1283838 C1283838 C0034063 C0336969 C0336969 C0455458 C0455458 C0040802 C0036983 C0036983 C0036983 C0036983 C0039985 C0022660 C0022660 C0022660 C0022660 C0003864 C0003864 C0449201 C0262525 C0232202 C0232202 C0020538 C0016169 C0016169 C0024109 C0024109 C0036974 C0036974 C0036974 C0036974 C0036974 C0442874 C1305153 C0229665 C0004048 C0475371 C0232201 C0232201 C0262926 C0262926 C0262926 C0262926 C0262926 C0262926 C0262926 C0262926 C1271104 C1271104 C0373486 C0085631 C1306645 C0423772 C0423772 C0231176 C0231176 C0231176 C0231176 C1279572 C1279571 C1279571 C1279571 C1253959 C0524470 C0087111 C0035309 C0337443 C1692886 C1692886 C0003842 C0003842 C0430400 C0430400 C0024236 C0202115 C0341697 C0341697 C0341697 C0341697 C0004610 C0004610 C0202304 C0024554 C0199176 C0008679 C0040405 C0201975 C0201975 C0419008 C0043251 C0221423 C0027051 C0043250 C0234425 C0015385 C0439775 C0018792 C0011581 C0277785 C0201989 C0424945 C1273870 C0332461

In [14]:
i = 0
for data in all_labeled_data:
  i = i+1
print(i)

730


In [20]:
TAKE_SIZE = 130
BATCH_SIZE = 600

train_data = all_encoded_data.skip(TAKE_SIZE).shuffle(BUFFER_SIZE)
train_data = train_data.padded_batch(BATCH_SIZE)

test_data = all_encoded_data.take(TAKE_SIZE)
test_data = test_data.padded_batch(BATCH_SIZE)

In [22]:
# print(train_data.shape)

In [23]:
sample_text, sample_labels = next(iter(test_data))

sample_text[0], sample_labels[0]


InvalidArgumentError: ignored

In [21]:
vocab_size += 1

In [22]:
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, 64))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128)))
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(16, activation='softmax'))
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 64)          318272    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               197632    
_________________________________________________________________
dense_2 (Dense)              (None, 64)                16448     
_________________________________________________________________
dense_3 (Dense)              (None, 16)                1040      
Total params: 533,392
Trainable params: 533,392
Non-trainable params: 0
_________________________________________________________________


In [23]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [None]:
model.fit(train_data, epochs=10, validation_data=test_data, verbose=1)


In [None]:
#----------------------------------

In [79]:
data_folder = 'drive/My Drive/Colab Notebooks/data/challenge2008/training/notes_cuis'
file_paths = []
for file in os.listdir(data_folder):
    file_paths.append(os.path.join(data_folder, file))

In [80]:
dataset = tf.data.TextLineDataset(file_paths)

for line in dataset.take(5):
  print(line.numpy())

b'C0004238 C0150009 C0150009 C0150009 C0150009 C0150009 C0150009 C0150009 C0150009 C0150009 C0150009 C0150009 C0150009 C0150009 C0150009 C0150009 C0150009 C0150009 C0150009 C0030193 C0038351 C0150082 C0150082 C0150082 C0150082 C0150082 C0150082 C0150082 C0150082 C0150082 C0150082 C0150082 C0150082 C0150082 C0150082 C0150082 C0150082 C0150082 C0150082 C0085619 C0085619 C0226896 C0231418 C0231418 C0231418 C0231418 C0231418 C0231418 C0231418 C0231418 C0231418 C0231418 C0231418 C0231418 C0231418 C0231418 C0231418 C0231418 C0231418 C0231418 C0034063 C0034063 C0231416 C0231416 C0231416 C0231416 C0231416 C0231416 C0231416 C0231416 C0231416 C0231416 C0231416 C0231416 C0231416 C0231416 C0231416 C0231416 C0231416 C0231416 C0019564 C0019564 C0019564 C0019564 C2004489 C0034642 C0202230 C0005767 C0005767 C0007097 C0030252 C1140621 C0449201 C0035639 C0035639 C0035639 C0035639 C0035639 C1278951 C1278951 C0037313 C0009450 C0586177 C0020538 C0032305 C0023928 C1269647 C1269647 C0582147 C0582147 C0582147

In [81]:
tokenizer = tfds.features.text.Tokenizer()

vocabulary_set = set()
for text_tensor in dataset:
  some_tokens = tokenizer.tokenize(text_tensor.numpy())
  vocabulary_set.update(some_tokens)

vocab_size = len(vocabulary_set)
print(vocab_size)

4973


In [82]:
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

In [83]:
# example_text = next(iter(dataset)).numpy()
# print(example_text)

# encoded_example = encoder.encode(example_text)
# print(encoded_example)

In [84]:
# def encode(text_tensor):
#   encoded_text = encoder.encode(text_tensor.numpy())
#   return encoded_text

In [85]:
encoded_data = []
for data in dataset:
  text = data.numpy()
  # print(text)
  encoded_text = encoder.encode(text)
  encoded_data.append(encoded_text)

In [98]:
print(encoded_data[0])
print(encoded_data[1])

[406, 4587, 4587, 4587, 4587, 4587, 4587, 4587, 4587, 4587, 4587, 4587, 4587, 4587, 4587, 4587, 4587, 4587, 4587, 2315, 133, 1818, 1818, 1818, 1818, 1818, 1818, 1818, 1818, 1818, 1818, 1818, 1818, 1818, 1818, 1818, 1818, 1818, 1818, 2465, 2465, 4637, 274, 274, 274, 274, 274, 274, 274, 274, 274, 274, 274, 274, 274, 274, 274, 274, 274, 274, 505, 505, 4217, 4217, 4217, 4217, 4217, 4217, 4217, 4217, 4217, 4217, 4217, 4217, 4217, 4217, 4217, 4217, 4217, 4217, 819, 819, 819, 819, 2629, 343, 4633, 4253, 4253, 4777, 4531, 4669, 3458, 1981, 1981, 1981, 1981, 1981, 4816, 4816, 2572, 3181, 4683, 2655, 2182, 4879, 4730, 4730, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 4650, 4650, 1385, 1385, 1385, 1385, 1385, 1385, 1385, 1385, 1385, 1385, 1385, 1385, 1385, 1385, 1385, 1385, 1385, 1385, 1385, 1385, 2908, 2908, 2908, 2908, 2908, 2302, 3107, 4544, 4544, 4501, 4501, 4950, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 9, 9, 1004, 1871, 1072, 673, 6

In [99]:
# for element in encoded_data.as_numpy_iterator():
#   print(element)

AttributeError: ignored

In [None]:
def labeler(example, index):
  return example, tf.cast(index, tf.int64) 

labeled_data_sets = []

for i, data in enumerate(encoded_data):
  labeled_dataset = lines_dataset.map(lambda ex: labeler(ex, i))
  labeled_data_sets.append(labeled_dataset)

In [96]:
BUFFER_SIZE = 10000
BATCH_SIZE = 64
train_dataset = train_dataset.shuffle(BUFFER_SIZE)
train_dataset = train_dataset.padded_batch(BATCH_SIZE)

test_dataset = test_dataset.padded_batch(BATCH_SIZE)

In [89]:
print(label.shape)

(730, 16)


In [92]:
print(len(encoded_data))

730


In [None]:
# xtr = dataset[:600]
# xte = dataset[600:]

In [93]:
##model(create, fit & plot)
#embedding+aveg pooling+dense+output

# input = Input(shape=(80,))
# z= tf.keras.layers.Embedding(total_words, 64)(input)
# z = LSTM(256, stateful=False, unroll=True)(z)
# z = Dense(64, activation='relu')(z)
# training_pred = Dense(1, activation = 'sigmoid')(z)
# model = Model(inputs=input, outputs=training_pred)
# model.compile(optimizer='adam', loss=tf.keras.losses.BinaryCrossentropy(), metrics=['accuracy'])
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 64),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(16, activation='softmax')
])

model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 64)          318272    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               66048     
_________________________________________________________________
dense_4 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_5 (Dense)              (None, 16)                1040      
Total params: 393,616
Trainable params: 393,616
Non-trainable params: 0
_________________________________________________________________


In [95]:
# model.fit(dataset, label, batch_size=64, epochs=10, validation_split=0.3)
history = model.fit(encoded_data, label, epochs=10)

ValueError: ignored