In [1]:
import numpy as np
import re
import random
import itertools
import datetime

import tensorflow as tf
import sklearn as sk

import Functions as fn
from DS import DS
from Set import pool
from Iterator import Iterator
from FFModel import FF_Model

# Data Loading

In [2]:
# Create the text pool, then fill it from the sources named 'raw_texts' and
# 'raw_labels'. Texts are loaded first, labels second.
# NOTE(review): whether these names are files or directories is decided inside
# Set.pool — confirm there.
Dataset = pool()
Dataset.load_texts('raw_texts')
Dataset.load_labels('raw_labels')

Raw Text Load Complete
Raw Labels Load Complete


In [3]:
#print('Number of Texts: ', Dataset.size)
#print('Number of 2007 Smoking Challenge texts: ', Dataset.number_of(challenge='2007 Smoking Challenge'))
#print('Number of 2008 Obesity Challenge texts: ', Dataset.number_of(challenge='2008 Obesity Challenge'))
#print('Number of 2009 Medication Challenge texts: ', Dataset.number_of(challenge='2009 Medication Challenge'))
#print('Number of 2010 Relations Challenge texts: ', Dataset.number_of(challenge='2010 Relations Challenge'))
#print('Number of 2011 Coreference Challenge texts: ', Dataset.number_of(challenge='2011 Coreference Challenge'))
#print('Number of 2012 Temporal Relations Challenge texts: ', Dataset.number_of(challenge='2012 Temporal Relations Challenge'))
#print('Number of Train Texts: ', Dataset.number_of(stage='train'))
#print('Number of Test Texts: ', Dataset.number_of(stage='test'))
#print('Number of Labeled Texts: ', Dataset.number_of(labelled='yes'))
#print('Number of Initially Labeled Texts: ', Dataset.number_of(labelled='yes', label_type='train'))
#print('Number of Competitor Labeled Texts: ', Dataset.number_of(labelled='yes', label_type='test'))

In [4]:
# Label lookup used by the experiments below, which index it by target name
# (only 'medications' is used in this notebook) and take len() of the entry.
target_dict = fn.load_labels('labels')

Label Load Complete


# Embedding Generation

In [5]:
from gensim.models import Word2Vec

# Corpus passed to Word2Vec, both in the commented-out training call below and
# in the hyper-parameter scan further down.
sentences = fn.load_sentences('sentences')

# Training step kept for reference: it would build a 100-dimensional model and
# save it as 'W2V', the same name that is loaded just below.
#model = Word2Vec(sentences, min_count=1, size=100)
#model.save('W2V')

# Load the saved embedding model and keep a view of its vocabulary keys.
# NOTE(review): `model.wv.vocab` and the `size=` keyword look like the
# pre-4.0 gensim API — confirm the pinned gensim version before upgrading.
model = Word2Vec.load('W2V')
vocab = model.wv.vocab.keys()



Sentence Load Complete


In [6]:
#model.most_similar("prozac", topn=10)

# Visualisation

In [7]:
#fn.visualise(model, sentences, [target_dict['medications']], 1000, 'Medications in top 1000 words')

In [8]:
#print_set = []
#for case in Dataset.get_DS(labelled='yes').data:
#    for term in re.finditer(r'm="[^|]+\|', case.raw_labels):
#        print_set.append(term.group()[:-1])
#print(*print_set, sep='\n')

# Naive Test 

In [9]:
# Baseline experiment: for each target, train one feed-forward network with
# repetitions enabled and one with them disabled, and report both runs.
for target, reps in itertools.product(['medications'], [True, False]):
    target_size = len(target_dict[target])

    # Train split is five times the number of target labels, test split is the
    # same size as the label list.
    # NOTE(review): the positional order presumably matches the keyword names
    # used by the hyper-parameter scan call (train_size, test_size,
    # train_label_percentage, test_label_percentage, word_repetition,
    # label_repetition) — confirm against Functions.generate_naive_traintest.
    word_sets = fn.generate_naive_traintest(
        vocab,
        target_dict[target],
        target_size * 5,
        target_size,
        10,
        50,
        reps,
        reps,
    )
    emb_sets = fn.embed_words(word_sets, model)

    # There is no separate validation split: the test split is reused for it.
    emb_sets['validation_set'] = emb_sets['test_set']
    emb_sets['validation_labels'] = emb_sets['test_labels']

    print("Target: {}\tRepetitions: {}".format(target, reps))

    # Network built with FF_Model's default constructor arguments.
    NN = FF_Model()
    NN.build_graph()
    NN.train(emb_sets, epochs=50, report_percentage=10, show_progress=True)

Target: medications	Repetitions: True
FInal Values: TrAcc: 0.968, ValAcc: 0.817, ValF1: 0.783
Test F1-Score: 0.783

Target: medications	Repetitions: False
FInal Values: TrAcc: 0.965, ValAcc: 0.767, ValF1: 0.703
Test F1-Score: 0.703



# Hyperparameter Scan

In [None]:
# Repeats per grid point: how many Word2Vec models are trained, and how many
# networks are trained on each of those embeddings.
emb_num = 5
NN_num = 1

# Candidate values for each hyper-parameter of the scan.
emb_sizes = [30, 50, 100]
layers = [[30], [50], [100]]
dropouts = [0.3, 0.5, 0.8, 1.0]
learn_rates = [0.001, 0.01, 0.1]
epoch_nums = [10, 50, 100]
batch_sizes = [10, 50, 100]

# Label list the scan is run against, and the reporting setting passed to
# NN.train.
target = target_dict['medications']
report_percentage = 10

# Total networks to train = repeats per grid point x number of grid points.
grid_points = itertools.product(
    emb_sizes, layers, dropouts, learn_rates, epoch_nums, batch_sizes
)
model_num = emb_num * NN_num * len(list(grid_points))

print(model_num)

In [None]:
# Fixed train/test word split shared by every configuration in the scan, so
# that all grid points are scored on the same words.
# NOTE(review): no random seed is set in the visible cells; if the sampler is
# stochastic this split differs between kernel runs — confirm.
word_sets = fn.generate_naive_traintest(
    vocab=vocab,
    labels=target,
    train_size=10000,
    test_size=1000,
    train_label_percentage=10,
    test_label_percentage=10,
    word_repetition=True,
    label_repetition=True,
)

In [None]:
# Grids in the order the scan loop unpacks them positionally:
# par[0]=embedding size, par[1]=layers, par[2]=dropout, par[3]=learning rate,
# par[4]=epochs, par[5]=batch size. Reordering this list silently changes the
# meaning of every par[i] in the scan cell.
iterations = [emb_sizes, layers, dropouts, learn_rates, epoch_nums, batch_sizes]

In [None]:
# Exhaustive grid search over every combination in `iterations`.
#
# For each grid point, `emb_num` Word2Vec models are trained at the requested
# embedding size and `NN_num` networks are trained on each; the best score
# seen for that grid point is kept. One tab-separated row per grid point is
# printed and also written to tests/Model1/<timestamp>, followed by the
# overall maximum.
#
# NOTE(review): the loop rebinds the notebook-global `model`, replacing the
# pretrained model loaded from 'W2V' earlier, while `vocab` still refers to
# that earlier model. Re-running earlier cells after this one will therefore
# use whichever embedding was trained last.
n = 0            # networks trained so far, shown in the progress line
parameters = []  # hyper-parameter tuple of every grid point, in scan order
results = []     # best score per grid point, aligned with `parameters`

# Same text as str(datetime.now()) without microseconds and with ':' replaced
# by '-', so the name is valid on filesystems that reject colons.
timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H-%M-%S')

header = 'EmbSz:\tLay:\tDrop:\tLeRa:\tEpochs:\tBatSz:\tF1:'
row_format = '{}\t{}\t{}\t{}\t{}\t{}\t{:.4f}'

# The context manager guarantees the results file is closed (and its buffer
# written) even if the scan is interrupted or a model raises part-way through.
with open('tests/Model1/' + timestamp, 'w+') as f:
    print(header)
    f.write(header + '\n')

    for par in itertools.product(*iterations):
        best = 0
        parameters.append(par)
        for i in range(emb_num):
            # Fresh embedding at the size requested by this grid point.
            model = Word2Vec(sentences, min_count=1, size=par[0])
            emb_sets = fn.embed_words(word_sets, model)
            # No separate validation split: the test split is reused for it.
            emb_sets['validation_set'] = emb_sets['test_set']
            emb_sets['validation_labels'] = emb_sets['test_labels']
            for j in range(NN_num):
                n += 1
                print('Model Number: {}/{}'.format(n, model_num), end='\r')
                NN = FF_Model(input_size=par[0], layers=par[1], dropout=par[2], learn_rate=par[3])
                NN.build_graph()
                # Only the fourth value returned by train() is used; it is the
                # score reported under the 'F1:' column.
                _, _, _, score = NN.train(emb_sets, epochs=par[4], batch=par[5], report_percentage=report_percentage)
                best = max(best, score)
        results.append(best)

        row = row_format.format(*par, best)
        print(row)
        f.write(row + '\n')
        # Push each finished row to disk so a crash late in a long scan does
        # not lose the rows already computed.
        f.flush()

    summary = 'Max Performance: {:.4f}'.format(max(results))
    print(summary)
    f.write(summary + '\n')