In [1]:
import numpy as np
import re
import random

import tensorflow as tf
import sklearn as sk

import Functions as fn
from DS import DS
from Set import pool
from Iterator import Iterator
from FFModel import FF_Model

# Data Loading

In [2]:
# Create the project-specific text pool and populate it from disk.
# NOTE(review): 'raw_texts' / 'raw_labels' are handed straight to pool's loaders;
# whether they name files or directories (and relative to what) is defined in Set.py — confirm there.
Dataset = pool()
Dataset.load_texts('raw_texts')
# Labels are loaded after the texts; presumably they attach to the already-loaded texts — TODO confirm.
Dataset.load_labels('raw_labels')

Raw Text Load Complete
Raw Labels Load Complete


In [3]:
#print('Number of Texts: ', Dataset.size)
#print('Number of 2007 Smoking Challenge texts: ', Dataset.number_of(challenge='2007 Smoking Challenge'))
#print('Number of 2008 Obesity Challenge texts: ', Dataset.number_of(challenge='2008 Obesity Challenge'))
#print('Number of 2009 Medication Challenge texts: ', Dataset.number_of(challenge='2009 Medication Challenge'))
#print('Number of 2010 Relations Challenge texts: ', Dataset.number_of(challenge='2010 Relations Challenge'))
#print('Number of 2011 Coreference Challenge texts: ', Dataset.number_of(challenge='2011 Coreference Challenge'))
#print('Number of 2012 Temporal Relations Challenge texts: ', Dataset.number_of(challenge='2012 Temporal Relations Challenge'))
#print('Number of Train Texts: ', Dataset.number_of(stage='train'))
#print('Number of Test Texts: ', Dataset.number_of(stage='test'))
#print('Number of Labeled Texts: ', Dataset.number_of(labelled='yes'))
#print('Number of Initially Labeled Texts: ', Dataset.number_of(labelled='yes', label_type='train'))
#print('Number of Competitor Labeled Texts: ', Dataset.number_of(labelled='yes', label_type='test'))

In [4]:
# Load the label lookup used by the experiments below; later cells index it by
# target name (e.g. target_dict['medications']).
# NOTE(review): the exact container/value types come from Functions.load_labels — confirm there.
target_dict = fn.load_labels('labels')

Label Load Complete


# Embedding Generation

In [5]:
from gensim.models import Word2Vec

# Sentences are kept in memory because the hyper-parameter scan further down
# trains fresh Word2Vec models from them.
sentences = fn.load_sentences('sent')
# Disabled training call; presumably how the saved 'W2V' model was produced — TODO confirm.
#model = Word2Vec(sentences, min_count=1, size=100)

# Load the previously saved Word2Vec model instead of retraining it.
model = Word2Vec.load('W2V')
# Words known to the loaded model; used below as the sampling pool for train/test words.
# NOTE(review): `wv.vocab` is the gensim 3.x attribute — confirm the pinned gensim version.
vocab = model.wv.vocab.keys()



Sentence Load Complete


In [6]:
#model.most_similar("prozac", topn=10)

# Visualisation

In [7]:
#fn.visualise(model, sentences, [target_dict['medications']], 1000, 'Medications in top 1000 words')

In [8]:
#print_set = []
#for case in Dataset.get_DS(labelled='yes').data:
#    for term in re.finditer(r'm="[^|]+\|', case.raw_labels):
#        print_set.append(term.group()[:-1])
#print(*print_set, sep='\n')

# Naive Test 

In [10]:
# Baseline run: for every target, train a default FF_Model on the pre-trained
# embeddings, once with and once without repetitions.
for target in ['medications']:
    # The label count does not depend on the repetition flag, so compute it once per target.
    target_size = len(target_dict[target])
    for reps in (True, False):
        # Train set is five times the label count, test set equals the label count;
        # the same flag is passed for both repetition arguments.
        word_sets = fn.get_naive_traintest(
            vocab,
            target_dict[target],
            target_size * 5,
            target_size,
            10,
            50,
            reps,
            reps,
        )
        emb_sets = fn.embed_words(word_sets, model)

        # No separate validation split exists here: the test split doubles as validation.
        emb_sets['validation_set'] = emb_sets['test_set']
        emb_sets['validation_labels'] = emb_sets['test_labels']

        print("Target: %s \tRepetitions: %s" % (target, reps))

        NN = FF_Model()
        NN.build_graph()
        NN.train(emb_sets, epochs=50, report_percentage=10)

Target: medications 	Repetitions: True
FInal Values: TrAcc: 0.968646, ValAcc: 0.788839, ValF1: 0.736347
Test F1-Score: 0.737535

Target: medications 	Repetitions: False
FInal Values: TrAcc: 0.963069, ValAcc: 0.777526, ValF1: 0.720379
Test F1-Score: 0.720379



# HyperParameter Scan

In [11]:
# --- Scan dimensions ---
emb_sizes = [5, 10]  # embedding sizes to compare
emb_num = 1          # Word2Vec models trained per embedding size
NN_num = 3           # networks trained per Word2Vec model
model_num = len(emb_sizes) * emb_num * NN_num  # total number of training runs

# Labels of the target class under study.
target = target_dict['medications']

# --- Network settings (handed to FF_Model) ---
layers = [50]
dropout = 1.0
learn_rate = 0.01

# --- Training settings (for NN.train) ---
epochs = 50
batch = 50
report_percentage = 10

In [12]:
# Draw one fixed train/test word sample; the scan below re-embeds this same
# sample for every embedding size.
word_sets = fn.get_naive_traintest(
    vocab=vocab,
    labels=target,
    train_size=10000,
    test_size=1000,
    train_label_percentage=10,
    test_label_percentage=10,
    word_repetition=True,
    label_repetition=True,
)

In [18]:
# Hyper-parameter scan over embedding size.
# For each size: train `emb_num` Word2Vec models, embed the fixed word sample,
# train `NN_num` networks per embedding, and record the mean of the scores
# returned by NN.train in `results` (one entry per embedding size, in order).
n = 0          # running count of trained networks, used only for the progress line
results = []
runs_per_size = emb_num * NN_num
for emb_size in emb_sizes:
    score_sum = 0
    for i in range(emb_num):
        # NOTE: this rebinds the notebook-level `model` (originally the loaded 'W2V'
        # model); re-running earlier cells afterwards will see this embedding instead.
        model = Word2Vec(sentences, min_count=1, size=emb_size)
        emb_sets = fn.embed_words(word_sets, model)
        # No separate validation split exists here: the test split doubles as validation.
        emb_sets['validation_set'] = emb_sets['test_set']
        emb_sets['validation_labels'] = emb_sets['test_labels']
        for j in range(NN_num):
            n += 1
            print('Model Number: %d/%d (EmbSize: %d, EmbMod: %d/%d, NN: %d/%d)' % (n, model_num, emb_size, i+1, emb_num, j+1, NN_num))
            NN = FF_Model(input_size=emb_size, layers=layers, dropout=dropout, learn_rate=learn_rate)
            NN.build_graph()
            # Use the configured reporting setting rather than a hard-coded literal,
            # so that changing `report_percentage` in the config cell takes effect.
            _, _, _, score = NN.train(emb_sets, epochs=epochs, batch=batch, report_percentage=report_percentage)
            score_sum += score
    results.append(score_sum / runs_per_size)

Model Number: 1/6 (EmbSize: 5, EmbMod: 0/1, NN: 0/3)
FInal Values: TrAcc: 0.9419, ValAcc: 0.934, ValF1: 0.616279
Test F1-Score: 0.619883

Model Number: 2/6 (EmbSize: 5, EmbMod: 0/1, NN: 1/3)
FInal Values: TrAcc: 0.9422, ValAcc: 0.936, ValF1: 0.619048
Test F1-Score: 0.619048

Model Number: 3/6 (EmbSize: 5, EmbMod: 0/1, NN: 2/3)
FInal Values: TrAcc: 0.9406, ValAcc: 0.935, ValF1: 0.615385
Test F1-Score: 0.615385

Model Number: 4/6 (EmbSize: 10, EmbMod: 0/1, NN: 0/3)
FInal Values: TrAcc: 0.9518, ValAcc: 0.935, ValF1: 0.615385
Test F1-Score: 0.615385

Model Number: 5/6 (EmbSize: 10, EmbMod: 0/1, NN: 1/3)
FInal Values: TrAcc: 0.9523, ValAcc: 0.936, ValF1: 0.627907
Test F1-Score: 0.627907

Model Number: 6/6 (EmbSize: 10, EmbMod: 0/1, NN: 2/3)
FInal Values: TrAcc: 0.9513, ValAcc: 0.94, ValF1: 0.647059
Test F1-Score: 0.647059



In [19]:
# Tabulate the mean score per embedding size.
# A plain loop is used instead of a list comprehension: print() returns None,
# so a comprehension would build a throwaway list that the notebook then
# echoes as the cell's output ([None, None, ...]).
print('Size:\tPerf:')
for size, perf in zip(emb_sizes, results):
    print('%g\t%g' % (size, perf))

Size:	Perf:
5	0.618105
10	0.630117


[None, None]