In [4]:
import numpy as np
import itertools
import datetime

import tensorflow as tf
import sklearn as sk

from gensim.models import Word2Vec

import Functions as fn
import Iterator as it
from DS import DS
from Set import pool
from FFModel import FF_Model
from RNNModel import RNN_Model
from S2SModel import S2S_Model



# Data Loading

In [2]:
# Build an empty pool, then populate it through its two loaders.
# NOTE(review): 'raw_texts' / 'raw_labels' are passed as relative names; whether
# they denote files or directories is decided inside Set.pool — confirm.
Dataset = pool()
Dataset.load_texts('raw_texts')
Dataset.load_labels('raw_labels')

Raw Text Load Complete
Raw Labels Load Complete


In [3]:
# Load the label data stored under 'final_meta/labels' via the Functions helper.
# NOTE(review): target_dict is not referenced again in the visible cells.
target_dict = fn.load_labels('final_meta/labels')

Label Load Complete


# Embedding Generation

In [4]:
# Split the labelled cases and assemble the corpus used for the embeddings.
# NOTE(review): train/validation are drawn from stage='test' while test_set is
# drawn from stage='train'. The sizes printed below (238 / 10 / 10) suggest this
# is deliberate because the labelled stage='test' selection is the larger one —
# confirm before "fixing" the names.

# Every labelled stage='test' case except the last 10 goes to training ...
train_set = pool(data=(Dataset.get_DS(stage='test', labelled='yes')).data[:-10])
# ... and the last 10 of that same selection are held out for validation.
validation_set = pool(data=(Dataset.get_DS(stage='test', labelled='yes')).data[-10:])
test_set = Dataset.get_DS(stage='train', labelled='yes')
# Embedding corpus: both unlabelled selections plus the training cases.
set_1 = Dataset.get_DS(stage='train', labelled='no')
set_2 = Dataset.get_DS(stage='test', labelled='no')
set_1.append(set_2.data)
set_1.append(train_set.data)
# emb_set is another name for set_1 (no copy is made here).
emb_set = set_1
print(emb_set.size, train_set.size, validation_set.size, test_set.size)

4585 238 10 10


In [5]:
# Sentences and the Word2Vec model are read back from 'final_meta/'.
# The commented-out lines are the one-off generation path that produced those
# files; uncomment them to rebuild the cache from emb_set.
#emb_set.process_for_embedding()
#sentences = emb_set.get_sentences()
#fn.write_sentences(sentences, 'final_meta/sentences')
sentences = fn.load_sentences('final_meta/sentences')

# Generation path for the embedding model (100-dimensional vectors, min_count=1).
#model = Word2Vec(sentences, min_count=1, size=100)
#model.save('final_meta/W2V')
model = Word2Vec.load('final_meta/W2V')

# NOTE(review): `wv.vocab` (and the `size=` keyword above) are gensim < 4.0 API;
# gensim 4 replaced them with `wv.key_to_index` and `vector_size=`.
# vocab is not referenced again in the visible cells.
vocab = model.wv.vocab.keys()

Sentence Load Complete


# Layer and Index Loading

In [6]:
# The word->index mapping and the embedding layer are read back from 's2s/'.
# The commented-out lines are the one-off path that derived both from the
# Word2Vec model and wrote them to those files.
#word_indices, emb_layer = fn.get_index_and_emb_layer(model)
#fn.write_word_indices(word_indices, 's2s/word_indices')
#fn.write_emb_layer(emb_layer, 's2s/emb_layer')

word_indices = fn.load_word_indices('s2s/word_indices')
emb_layer = fn.load_emb_layer('s2s/emb_layer')

Word Indices Load Complete
Embedding Layer Load Complete


# EDS2S Testing

In [7]:
# Prepare each split for the seq2seq experiment, in train/validation/test order.
# NOTE(review): whether process_for_s2s_testing is safe to call twice on the
# same object is not visible here — avoid re-running this cell blindly.
for split in (train_set, validation_set, test_set):
    split.process_for_s2s_testing()

In [9]:
# Longest encoder / decoder input across all three splits; these lengths are
# later handed to get_s2s_sets. Both fall back to 0 when there is nothing to
# measure.
all_splits = (train_set, validation_set, test_set)
max_enc_inp = max(
    (len(seq)
     for split in all_splits
     for case in split.data
     for seq in case.enc_inputs),
    default=0,
)
max_dec_inp = max(
    (len(seq)
     for split in all_splits
     for case in split.data
     for seq in case.dec_inputs),
    default=0,
)
print(max_enc_inp, max_dec_inp)

97 34


In [10]:
# Build the model-ready data for each split, keyed by split name. Every split
# uses the same word index and the same maximum encoder/decoder lengths.
sets = {
    split_name: split.get_s2s_sets(word_indices, max_enc_inp, max_dec_inp)
    for split_name, split in (
        ('train', train_set),
        ('validation', validation_set),
        ('test', test_set),
    )
}

In [16]:
# Configure, build and train the seq2seq model.
# Encoder and decoder share one vocabulary (both sizes are len(word_indices)),
# and the encoder is given the pre-built embedding layer.
# NOTE(review): the training log shows the learn rate shrinking each epoch,
# presumably driven by `decay` — confirm in S2SModel.
# NOTE(review): dropout=1.0 is presumably a keep-probability (i.e. no dropout)
# — confirm in S2SModel.
S2S = S2S_Model(decay = 0.00001,
                batch=50,
                enc_vocab_size=len(word_indices), 
                dec_vocab_size=len(word_indices), 
                enc_emb_size=100, 
                dec_emb_size=100, 
                state_size=128, 
                dropout=1.0,
                learn_rate=0.001,
                max_gradient_norm=5,
                enc_emb_layer=emb_layer)
S2S.build_graph()
# Trains for 100 epochs on the dict built by get_s2s_sets, with progress
# reporting and plotting switched on.
S2S.train(sets=sets, epochs=100, report_percentage=1, show_progress=True, show_plot=True)

Epoch: 0, Learn Rate: 0.0010000, Perplexity: 1811.32
Epoch: 1, Learn Rate: 0.0010000, Perplexity: 22.05
Epoch: 2, Learn Rate: 0.0009984, Perplexity: 10.23
Epoch: 3, Learn Rate: 0.0009952, Perplexity: 6.85
Epoch: 4, Learn Rate: 0.0009905, Perplexity: 4.18
Epoch: 5, Learn Rate: 0.0009842, Perplexity: 4.72
Epoch: 6, Learn Rate: 0.0009764, Perplexity: 3.56
Epoch: 7, Learn Rate: 0.0009672, Perplexity: 2.99
Epoch: 8, Learn Rate: 0.0009565, Perplexity: 2.27
Epoch: 9, Learn Rate: 0.0009444, Perplexity: 2.14
Epoch: 10, Learn Rate: 0.0009310, Perplexity: 2.44
Epoch: 11, Learn Rate: 0.0009163, Perplexity: 2.12
Epoch: 12, Learn Rate: 0.0009004, Perplexity: 1.62
Epoch: 13, Learn Rate: 0.0008834, Perplexity: 1.56
Epoch: 14, Learn Rate: 0.0008653, Perplexity: 1.49
Epoch: 15, Learn Rate: 0.0008462, Perplexity: 1.34
Epoch: 16, Learn Rate: 0.0008263, Perplexity: 1.38
Epoch: 17, Learn Rate: 0.0008056, Perplexity: 1.34
Epoch: 18, Learn Rate: 0.0007841, Perplexity: 1.21
Epoch: 19, Learn Rate: 0.0007620, Pe

In [None]:
# Reference outputs for the test split.
# Field 7 is used as the reference token sequences; field 5 entries are cut to
# the per-example length stored in field 4.
test_fields = sets['test']
tru_words = test_fields[7]
tru_vocab = [
    test_fields[5][i][:test_fields[4][i]]
    for i in range(len(test_fields[5]))
]

In [None]:
# Predict every test example individually. S2S.predict is handed seven parallel
# lists, each holding the same example repeated 50 times, and only the first
# entry of what it returns is kept.
COPIES_PER_CALL = 50  # presumably must equal batch= given to S2S_Model — confirm
FIELDS_PER_CALL = 7   # fields 0..6 of sets['test'] are passed to predict
res = []
for i in range(len(sets['test'][0])):
    replicated = [
        [sets['test'][k][i]] * COPIES_PER_CALL
        for k in range(FIELDS_PER_CALL)
    ]
    res.append(S2S.predict(replicated)[0])

In [32]:
# Translate each predicted index sequence back into its tokens.
res_vocab = res

# Invert word_indices once instead of scanning the whole vocabulary for every
# predicted index. Values are kept as lists so that, exactly as before, every
# word sharing an index is emitted (in word_indices order).
index_to_words = {}
for word, word_index in word_indices.items():
    index_to_words.setdefault(word_index, []).append(word)

# An index with no matching word contributes nothing, as in the original scan.
res_words = [
    [word for index in case for word in index_to_words.get(index, ())]
    for case in res_vocab
]

In [35]:
# Show the first five predictions, each followed by its reference on the next line.
for i in range(5):
    print(res_words[i], tru_words[i], sep='\n')

['ofloxacin', '<num>', 'mg', 'po', 'q', '<num>', '<eos>']
['ofloxacin', '<num>', 'mg', 'po', 'q', '<num>', '<eos>']
['insulin', 'nph', '<num>', 'units', 'subcu', 'bid', '<eos>']
['insulin', 'nph', '<num>', 'units', 'subcu', 'bid', '<eos>']
['colace', '<num>', 'mg', 'po', 'bid', '<eos>']
['colace', '<num>', 'mg', 'po', 'bid', '<eos>']
['percocet', '<num>-<num>', 'tablets', 'po', 'q', '<num>', 'prn', '<eos>']
['percocet', '<num>-<num>', 'tablets', 'po', 'q', '<num>', 'prn', '<eos>']
['coumadin', '<eos>']
['coumadin', '<num>', 'mg', 'in', 'evening', 'x', '<num>', '<eos>']


In [39]:
# Token-level precision / recall / F-measure of predictions vs. references.
#
# Changes from the earlier version of this cell:
#   * the false-negative counter is `fn_count`; naming it `fn` rebound the
#     `Functions as fn` module alias and broke any re-run of the loading cells;
#   * the loop follows the actual number of predictions instead of a
#     hard-coded 360;
#   * the ratios report 0.00 instead of raising ZeroDivisionError when there is
#     nothing to count.
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


tp = 0
fp = 0
fn_count = 0
for predicted, expected in zip(res_words, tru_words):
    # Drop the final token of each sequence (the '<eos>' marker in the samples
    # printed above) so it does not inflate the counts.
    predicted_tokens = predicted[:-1]
    expected_tokens = expected[:-1]
    for word in predicted_tokens:
        if word in expected_tokens:
            tp += 1
        else:
            fp += 1
    for word in expected_tokens:
        if word not in predicted_tokens:
            fn_count += 1

print('TP\tFP\tFN\t')
print('{}\t{}\t{}'.format(tp, fp, fn_count))
print('Precision: {:.2f}'.format(_safe_ratio(tp, tp + fp)))
print('Recall: {:.2f}'.format(_safe_ratio(tp, tp + fn_count)))
print('Token-Level Horizontal Metric: {:.2f}'.format(_safe_ratio(2 * tp, 2 * tp + fn_count + fp)))

TP	FP	FN	
1183	246	301
Precision: 0.83
Recall: 0.80
Token-Level Horizontal Metric: 0.81


In [42]:
# Checkpoint the trained model's session to 's2s/model/model.ckpt'.
# tf.train.Saver is the TensorFlow 1.x checkpoint API; created without
# arguments it covers all saveable variables of the current default graph.
saver = tf.train.Saver()
saver.save(S2S.sess, "s2s/model/model.ckpt")

's2s/model/model.ckpt'

In [59]:
# Rebuild a model with the same hyper-parameters as the training cell and
# restore its weights from the checkpoint written to 's2s/model/model.ckpt'.
# NOTE(review): the argument list is a copy of the one used for `S2S`; the two
# presumably have to stay in sync for the restore to match — confirm.
# NOTE(review): if `S2S` was built in the same kernel and build_graph does not
# reset the default graph, this Saver would also cover the first model's
# variables — confirm how S2SModel handles the graph.
loaded = S2S_Model(decay = 0.00001,
                batch=50,
                enc_vocab_size=len(word_indices), 
                dec_vocab_size=len(word_indices), 
                enc_emb_size=100, 
                dec_emb_size=100, 
                state_size=128, 
                dropout=1.0,
                learn_rate=0.001,
                max_gradient_norm=5,
                enc_emb_layer=emb_layer)
loaded.build_graph()
# A fresh session is attached by hand before the checkpoint is restored into it.
loaded.sess = tf.Session()
loader = tf.train.Saver()
loader.restore(loaded.sess, "s2s/model/model.ckpt")

  if d.decorator_argspec is not None), _inspect.getargspec(target))


INFO:tensorflow:Restoring parameters from s2s/model/model.ckpt
