In [1]:
import numpy as np
import re
import random

import tensorflow as tf

import Functions as fn
from DS import DS
from Set import Set
from Iterator import Iterator

In [None]:
Dataset = fn.firstTimeLoad()

In [None]:
Dataset.writeTexts()

In [None]:
Dataset.writeLabels()

In [2]:
Dataset = Set()
Dataset.loadTexts()
Dataset.loadLabels()

In [3]:
print('Number of Texts: ', Dataset.size)
print('Number of 2007 Smoking Challenge texts: ', Dataset.numberOf(challenge='2007 Smoking Challenge'))
print('Number of 2008 ObesityChallenge texts: ', Dataset.numberOf(challenge='2008 Obesity Challenge'))
print('Number of 2009 Medication Challenge texts: ', Dataset.numberOf(challenge='2009 Medication Challenge'))
print('Number of 2010 Relations Challenge texts: ', Dataset.numberOf(challenge='2010 Relations Challenge'))
print('Number of 2011 Coreference Challenge texts: ', Dataset.numberOf(challenge='2011 Coreference Challenge'))
print('Number of 2012 Temporal Relations Challenge texts: ', Dataset.numberOf(challenge='2012 Temporal Relations Challenge'))
print('Number of Train Texts: ', Dataset.numberOf(stage='train'))
print('Number of Test Texts: ', Dataset.numberOf(stage='test'))
print('Number of Labeled Texts: ', Dataset.numberOf(labelled='yes'))
print('Number of Initially Labeled Texts: ', Dataset.numberOf(labelled='yes', label_type='train'))
print('Number of Competitor Labeled Texts Texts: ', Dataset.numberOf(labelled='yes', label_type='test'))

Number of Texts:  4605
Number of 2007 Smoking Challenge texts:  926
Number of 2008 ObesityChallenge texts:  1237
Number of 2009 Medication Challenge texts:  1249
Number of 2010 Relations Challenge texts:  694
Number of 2011 Coreference Challenge texts:  188
Number of 2012 Temporal Relations Challenge texts:  311
Number of Train Texts:  2859
Number of Test Texts:  1746
Number of Labeled Texts:  258
Number of Initially Labeled Texts:  10
Number of Competitor Labeled Texts Texts:  248


In [4]:
Dataset.processForEmbedding()
sentences = Dataset.getSentences()

In [5]:
medications = []
labelled = Dataset.getDS(labelled='yes')

for case in labelled.data:
    for term in re.finditer(r'm="[a-z0-9 ]+"', case.raw_labels):
        temp = term.group()[3:-1]
        temp = temp.split()
        for word in temp:
            word = re.sub(r'\d+', '<num>', word)
            if word not in medications:
                medications.append(word)
                
print(len(medications))
medications.remove('ngl')
print(len(medications))

941
940


In [6]:
from gensim.models import Word2Vec



In [7]:
model_I2B2 = Word2Vec(sentences, min_count=1, size=100)

In [8]:
model_I2B2.most_similar("doctor")

[('physician', 0.7858730554580688),
 ('office', 0.7632736563682556),
 ('private', 0.7402583956718445),
 ('cardiologist', 0.7051651477813721),
 ('oncologist', 0.6889317035675049),
 ('primary', 0.6885737180709839),
 ('pcp', 0.6705446243286133),
 ('doctor.', 0.6602672338485718),
 ('neurologist', 0.6348996758460999),
 ("physician's", 0.615487277507782)]

In [9]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()

In [10]:
from collections import Counter

words = []
for sent in sentences:
    for word in sent:
        words.append(word)

cnt = Counter(words).most_common(1000)
cnt = np.array(cnt)
topwords = np.ndarray.tolist(cnt[:,0])

In [11]:
visualisation = []

for word in medications:
    visualisation.append(word)

colormap = np.full(len(visualisation), "red")

for word in topwords:
    if not word in visualisation:
        visualisation.append(word)
        colormap = np.append(colormap, ["blue"])


# This assumes words_top_ted is a list of strings, the top 1000 words
words_vec = model_I2B2[visualisation]

print(len(visualisation))
print(len(words_vec))
print(len(colormap))

1810
1810
1810


In [12]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
words_tsne = tsne.fit_transform(words_vec)

In [13]:
p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="word2vec T-SNE for most common words")

source = ColumnDataSource(data=dict(x1=words_tsne[:,0],
                                    x2=words_tsne[:,1],
                                    names=visualisation,
                                    coloring=colormap))

p.scatter(x="x1", y="x2", color="coloring", size=8, source=source)

labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(labels)

show(p)

In [20]:
train_set = []
train_labels = []
i = 0
vocab = list(model_I2B2.wv.vocab.keys())

for i in range(1500):
    word = random.choice(vocab)
    train_set.append(model_I2B2[word])
    if word in medications:
        train_labels.append([1,0])
    else:
        train_labels.append([0,1])
    if i % 3 == 0:
        word = random.choice(medications)
        train_set.append(model_I2B2[word])
        train_labels.append([1,0])

In [21]:
np.array(train_labels).sum(0)/len(train_labels)

array([ 0.262,  0.738])

In [22]:
test_set = []
test_labels = []

for i in range(300):
    word = random.choice(vocab)
    test_set.append(model_I2B2[word])
    if word in medications:
        test_labels.append([1,0])
    else:
        test_labels.append([0,1])
        
for i in range(100):
    word = random.choice(medications)
    test_set.append(model_I2B2[word])
    test_labels.append([1,0])

In [23]:
np.array(test_labels).sum(0)/len(test_labels)

array([ 0.2725,  0.7275])

In [24]:
def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.05)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)

In [25]:
node_count_1 = 50

x = tf.placeholder(tf.float32, shape=[None, 100])
y_ = tf.placeholder(tf.float32, shape=[None, 2])

# Define the first layer here
W = weight_variable([100, node_count_1])
b = bias_variable([node_count_1])
h = tf.nn.sigmoid(tf.matmul(x, W) + b)

# Use dropout for this layer (should you wish)
#keep_prob = tf.placeholder(tf.float32)
#h_drop = tf.nn.dropout(h1, keep_prob)

# Define the output layer here
V = weight_variable([node_count_1, 2])
c = bias_variable([2])
y = tf.nn.softmax(tf.matmul(h, V) + c)

# We'll use the cross entropy loss function 
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_))

# And classification accuracy
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# And the Adam optimiser
train_step = tf.train.AdamOptimizer(learning_rate=1e-2).minimize(cross_entropy)

# Start a tf session and run the optimisation algorithm
sess = tf.Session()
sess.run(tf.initialize_all_variables())

training = Iterator(train_set, train_labels)
train_accuracy = 0
test_accuracy = 0
N = 0

print(training.epochs)

while training.epochs < 1000:
    trd, trl = training.next_batch(50)
    if N % 500 == 0:
        train_accuracy = sess.run(accuracy, feed_dict={x: trd, y_: trl})
        test_accuracy = sess.run(accuracy, feed_dict={x: test_set, y_: test_labels})
        print("Epoch: %d, Training Accuracy: %f, Test Accuracy: %f" % (training.epochs, train_accuracy, test_accuracy))
    sess.run(train_step, feed_dict={x: trd, y_: trl})
    N += 1

Instructions for updating:
Use `tf.global_variables_initializer` instead.
0
Epoch: 0, Training Accuracy: 0.220000, Test Accuracy: 0.272500
Epoch: 12, Training Accuracy: 0.980000, Test Accuracy: 0.922500
Epoch: 25, Training Accuracy: 0.920000, Test Accuracy: 0.915000
Epoch: 37, Training Accuracy: 0.920000, Test Accuracy: 0.910000
Epoch: 50, Training Accuracy: 0.960000, Test Accuracy: 0.910000
Epoch: 62, Training Accuracy: 0.960000, Test Accuracy: 0.912500
Epoch: 75, Training Accuracy: 0.940000, Test Accuracy: 0.917500
Epoch: 87, Training Accuracy: 0.940000, Test Accuracy: 0.912500
Epoch: 100, Training Accuracy: 0.980000, Test Accuracy: 0.917500
Epoch: 112, Training Accuracy: 0.960000, Test Accuracy: 0.907500
Epoch: 125, Training Accuracy: 0.900000, Test Accuracy: 0.915000
Epoch: 137, Training Accuracy: 1.000000, Test Accuracy: 0.915000
Epoch: 150, Training Accuracy: 0.900000, Test Accuracy: 0.915000
Epoch: 162, Training Accuracy: 0.980000, Test Accuracy: 0.915000
Epoch: 175, Training Ac