In [None]:
import numpy as np
import re
import random

import tensorflow as tf

import Functions as fn
from DS import DS
from Set import pool
from Iterator import Iterator

In [2]:
#Dataset = fn.firstTimeLoad()
#Dataset.writeTexts()
#Dataset.writeLabels()

In [3]:
Dataset = pool()
Dataset.loadTexts()
Dataset.loadLabels()

In [4]:
print('Number of Texts: ', Dataset.size)
print('Number of 2007 Smoking Challenge texts: ', Dataset.numberOf(challenge='2007 Smoking Challenge'))
print('Number of 2008 ObesityChallenge texts: ', Dataset.numberOf(challenge='2008 Obesity Challenge'))
print('Number of 2009 Medication Challenge texts: ', Dataset.numberOf(challenge='2009 Medication Challenge'))
print('Number of 2010 Relations Challenge texts: ', Dataset.numberOf(challenge='2010 Relations Challenge'))
print('Number of 2011 Coreference Challenge texts: ', Dataset.numberOf(challenge='2011 Coreference Challenge'))
print('Number of 2012 Temporal Relations Challenge texts: ', Dataset.numberOf(challenge='2012 Temporal Relations Challenge'))
print('Number of Train Texts: ', Dataset.numberOf(stage='train'))
print('Number of Test Texts: ', Dataset.numberOf(stage='test'))
print('Number of Labeled Texts: ', Dataset.numberOf(labelled='yes'))
print('Number of Initially Labeled Texts: ', Dataset.numberOf(labelled='yes', label_type='train'))
print('Number of Competitor Labeled Texts Texts: ', Dataset.numberOf(labelled='yes', label_type='test'))

Number of Texts:  4605
Number of 2007 Smoking Challenge texts:  926
Number of 2008 ObesityChallenge texts:  1237
Number of 2009 Medication Challenge texts:  1249
Number of 2010 Relations Challenge texts:  694
Number of 2011 Coreference Challenge texts:  188
Number of 2012 Temporal Relations Challenge texts:  311
Number of Train Texts:  2859
Number of Test Texts:  1746
Number of Labeled Texts:  258
Number of Initially Labeled Texts:  10
Number of Competitor Labeled Texts Texts:  248


In [5]:
Dataset.processForEmbedding()
sentences = Dataset.getSentences()

In [53]:
doc = open("stopwords.txt", "r")
stopwords = set(doc.read().split('\n'))
stopwords.update(set(['nm', 'ngl', 'slides', 'descreases']))

medications = set()
dosages = set()
modes = set()
frequencies = set()
durations = set()
reasons = set()

labelled = Dataset.getDS(labelled='yes')

for case in labelled.data:
    for term in re.finditer(r'm="[a-z0-9 ]+"', case.raw_labels):
        temp = term.group()[3:-1].split()
        [medications.add(re.sub(r'\d+', '<num>', word)) for word in temp if word not in stopwords]
    for term in re.finditer(r'do="[a-z0-9 ]+"', case.raw_labels):
        temp = term.group()[4:-1].split()
        [dosages.add(re.sub(r'\d+', '<num>', word)) for word in temp if word not in stopwords]
    for term in re.finditer(r'mo="[a-z0-9 ]+"', case.raw_labels):
        temp = term.group()[4:-1].split()
        [modes.add(re.sub(r'\d+', '<num>', word)) for word in temp if word not in stopwords]
    for term in re.finditer(r'f="[a-z0-9 ]+"', case.raw_labels):
        temp = term.group()[3:-1].split()
        [frequencies.add(re.sub(r'\d+', '<num>', word)) for word in temp if word not in stopwords]
    for term in re.finditer(r'du="[a-z0-9 ]+"', case.raw_labels):
        temp = term.group()[4:-1].split()
        [durations.add(re.sub(r'\d+', '<num>', word)) for word in temp if word not in stopwords]
    for term in re.finditer(r'r="[a-z0-9 ]+"', case.raw_labels):
        temp = term.group()[3:-1].split()
        [reasons.add(re.sub(r'\d+', '<num>', word)) for word in temp if word not in stopwords]

print(len(medications))
print(len(dosages))
print(len(modes))
print(len(frequencies))
print(len(durations))
print(len(reasons))

928
119
76
79
80
482


In [7]:
from gensim.models import Word2Vec



In [8]:
model_I2B2 = Word2Vec(sentences, min_count=1, size=100)

In [9]:
model_I2B2.most_similar("doctor")

[('physician', 0.7557651996612549),
 ('office', 0.7325965166091919),
 ('private', 0.7110618948936462),
 ('primary', 0.6850539445877075),
 ('cardiologist', 0.6723614931106567),
 ('pcp', 0.6335077285766602),
 ('local', 0.6258509159088135),
 ('oncologist', 0.6254763007164001),
 ('neurologist', 0.610754132270813),
 ('vandeyacht', 0.6047073602676392)]

In [10]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()

In [11]:
from collections import Counter

words = []
for sent in sentences:
    for word in sent:
        words.append(word)

cnt = Counter(words).most_common(1000)
cnt = np.array(cnt)
topwords = np.ndarray.tolist(cnt[:,0])

In [12]:
visualisation = []
colormap = []

#[(visualisation.append(word), colormap.append('red')) for word in medications]
#[(visualisation.append(word), colormap.append('green')) for word in dosages]
#[(visualisation.append(word), colormap.append('purple')) for word in modes]
#[(visualisation.append(word), colormap.append('yellow')) for word in frequencies]
#[(visualisation.append(word), colormap.append('orange')) for word in durations]
[(visualisation.append(word), colormap.append('cyan')) for word in reasons]

for word in topwords:
    if not word in visualisation:
        visualisation.append(word)
        colormap.append('blue')


# This assumes words_top_ted is a list of strings, the top 1000 words
words_vec = model_I2B2[visualisation]

print(len(visualisation))
print(len(words_vec))
print(len(colormap))

1285
1285
1285


In [13]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
words_tsne = tsne.fit_transform(words_vec)

In [79]:
train_set = []
train_labels = []
train_size = 2000
vocab = set(model_I2B2.wv.vocab.keys())
target = set(reasons)

for i in range(train_size * 9 // 10):
    word = random.sample(vocab, 1)
    train_set.append(model_I2B2[word[0]])
    vocab.discard(word[0])
    if word[0] in target:
        train_labels.append([1,0])
        target.discard(word[0])
    else:
        train_labels.append([0,1])
    if i % 10 == 0:
        word = random.sample(target, 1)
        train_set.append(model_I2B2[word[0]])
        train_labels.append([1,0])
        target.discard(word[0])

In [80]:
print(len(target))
print(len(medications))

293
928


In [81]:
np.array(train_labels).sum(0)/len(train_labels)

array([ 0.09545455,  0.90454545])

In [82]:
test_set = []
test_labels = []
test_size = 500

for i in range(int(test_size * 0.5)):
    word = random.sample(vocab, 1)
    test_set.append(model_I2B2[word[0]])
    vocab.discard(word[0])
    if word[0] in medications:
        test_labels.append([1,0])
        target.discard(word[0])
    else:
        test_labels.append([0,1])
        
for i in range(int(test_size * 0.5)):
    word = random.sample(target, 1)
    test_set.append(model_I2B2[word[0]])
    test_labels.append([1,0])
    target.discard(word[0])

In [83]:
np.array(test_labels).sum(0)/len(test_labels)

array([ 0.512,  0.488])

In [77]:
def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.05)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)

In [78]:
node_count_1 = 50

x = tf.placeholder(tf.float32, shape=[None, 100])
y_ = tf.placeholder(tf.float32, shape=[None, 2])

# Define the first layer here
W = weight_variable([100, node_count_1])
b = bias_variable([node_count_1])
h = tf.nn.sigmoid(tf.matmul(x, W) + b)

# Use dropout for this layer (should you wish)
#keep_prob = tf.placeholder(tf.float32)
#h_drop = tf.nn.dropout(h1, keep_prob)

# Define the output layer here
V = weight_variable([node_count_1, 2])
c = bias_variable([2])
y = tf.nn.softmax(tf.matmul(h, V) + c)

# We'll use the cross entropy loss function 
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_))

# And classification accuracy
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# And the Adam optimiser
train_step = tf.train.AdamOptimizer(learning_rate=1e-2).minimize(cross_entropy)

# Start a tf session and run the optimisation algorithm
sess = tf.Session()
sess.run(tf.initialize_all_variables())

training = Iterator(train_set, train_labels)
train_accuracy = 0
test_accuracy = 0
N = 0

while training.epochs < 1000:
    trd, trl = training.next_batch(50)
    if N % 4000 == 0:
        train_accuracy = sess.run(accuracy, feed_dict={x: trd, y_: trl})
        test_accuracy = sess.run(accuracy, feed_dict={x: test_set, y_: test_labels})
        print("Epoch: %d, Training Accuracy: %f, Test Accuracy: %f" % (training.epochs, train_accuracy, test_accuracy))
    sess.run(train_step, feed_dict={x: trd, y_: trl})
    N += 1

Instructions for updating:
Use `tf.global_variables_initializer` instead.
0
Epoch: 0, Training Accuracy: 0.900000, Test Accuracy: 0.498000
Epoch: 102, Training Accuracy: 0.980000, Test Accuracy: 0.826000
Epoch: 205, Training Accuracy: 0.940000, Test Accuracy: 0.826000
Epoch: 307, Training Accuracy: 0.980000, Test Accuracy: 0.818000
Epoch: 410, Training Accuracy: 1.000000, Test Accuracy: 0.822000
Epoch: 512, Training Accuracy: 0.960000, Test Accuracy: 0.822000
Epoch: 615, Training Accuracy: 1.000000, Test Accuracy: 0.822000
Epoch: 717, Training Accuracy: 0.980000, Test Accuracy: 0.814000
Epoch: 820, Training Accuracy: 0.960000, Test Accuracy: 0.828000
Epoch: 923, Training Accuracy: 1.000000, Test Accuracy: 0.834000
