In [1]:
import numpy as np
import re
import random

import tensorflow as tf
import sklearn as sk

import Functions as fn
from DS import DS
from Set import pool
from Iterator import Iterator
from FFModel import FF_Model

# Data Loading

In [2]:
Dataset = pool()
Dataset.load_texts('raw_texts')
Dataset.load_labels('raw_labels')

Raw Text Load Complete
Raw Labels Load Complete


In [3]:
target_dict = fn.load_labels('labels')

Label Load Complete


# Embedding Generation

In [4]:
from gensim.models import Word2Vec

sentences = fn.load_sentences('sentences')

#model = Word2Vec(sentences, min_count=1, size=50)
#model.save('W2V')

model = Word2Vec.load('W2V')
vocab = model.wv.vocab.keys()



Sentence Load Complete


# Window Testing

In [11]:
train_cases = pool((Dataset.get_DS(stage='test', labelled='yes')).data[:-25])
validation_cases = pool(Dataset.get_DS(stage='test', labelled='yes').data[-25:])
test_cases = Dataset.get_DS(stage='train', labelled='yes')

train_cases.process_for_testing()
validation_cases.process_for_testing()
test_cases.process_for_testing()

In [12]:
sets = {}
sets['train_set'], sets['train_labels'], _ = train_cases.get_ff_sets(model, left_words=3, right_words=3)
sets['validation_set'], sets['validation_labels'], _ = validation_cases.get_ff_sets(model, left_words=3, right_words=3)
sets['test_set'], sets['test_labels'], sets['test_words'] = test_cases.get_ff_sets(model, left_words=3, right_words=3)

In [13]:
print('Ratio: med: {:.2f}% non-med: {:.2f}%'.format(*(np.array(sets['train_labels']).sum(0)/ len(sets['train_labels']))))
fn.saturate_training_set_training(sets['train_set'], sets['train_labels'], 0.1)
print('Ratio: med: {:.2f}% non-med: {:.2f}%'.format(*(np.array(sets['train_labels']).sum(0)/ len(sets['train_labels']))))

Ratio: med: 0.03% non-med: 0.97%
Ratio: med: 0.11% non-med: 0.89%


In [14]:
NN = FF_Model(input_size=700, layers=[200])
NN.build_graph()
_,_,_,_ = NN.train(sets, epochs=5, batch=50, show_progress=True, show_plot=True)

Progress: 31%

KeyboardInterrupt: 

In [9]:
res = NN.predict(sets['test_set'])
tru = np.argmax(sets['test_labels'], 1)

In [10]:
TP = len([a for a in range(len(tru)) if (res[a] == 0) and (tru[a] == 0)])
TN = len([a for a in range(len(tru)) if (res[a] == 1) and (tru[a] == 1)])
FP = len([a for a in range(len(tru)) if (res[a] == 0) and (tru[a] == 1)])
FN = len([a for a in range(len(tru)) if (res[a] == 1) and (tru[a] == 0)])
TFPN = [TP, TN, FP, FN]
print('TP\tTN\tFP\tFN\n{}\t{}\t{}\t{}'.format(*TFPN))

TP	TN	FP	FN
0	10738	0	418


In [None]:
print(', '.join([sets['test_words'][a] for a in range(len(sets['test_words'])) if res[a] == 0 and tru[a] == 1]))

In [None]:
import colorama as col

out = []
for a in range(len(sets['test_words'][:500])):
    if res[a] == 0 and tru[a] == 1:
        out.append(col.Back.RED + sets['test_words'][a].upper() + col.Back.RESET)
    else:
        out.append(sets['test_words'][a])
        
print(' '.join(out))

In [11]:
pred = [0 if word in target_dict['medications'] else 1 for word in sets['test_words']]

In [12]:
sk.metrics.f1_score(tru, pred, pos_label=0, average='binary')

0.27873234058801072

In [13]:
trainer_cases = Dataset.get_DS(labelled = 'yes', stage='train')
trainer_cases.process_for_testing()

temp_sets = {}
temp_sets['set'], temp_sets['labels'], temp_sets['words'] = trainer_cases.get_ff_sets(model, left_words=3, right_words=3)

In [14]:
res2 = NN.predict(temp_sets['set'])
tru2 = np.argmax(temp_sets['labels'], 1)

In [15]:
sk.metrics.f1_score(tru2, res2, pos_label=0, average='binary')

0.74538745387453875

In [16]:
TP2 = len([a for a in range(len(tru2)) if (res2[a] == 0) and (tru2[a] == 0)])
TN2 = len([a for a in range(len(tru2)) if (res2[a] == 1) and (tru2[a] == 1)])
FP2 = len([a for a in range(len(tru2)) if (res2[a] == 0) and (tru2[a] == 1)])
FN2 = len([a for a in range(len(tru2)) if (res2[a] == 1) and (tru2[a] == 0)])
TFPN2 = [TP2, TN2, FP2, FN2]
print('TP\tTN\tFP\tFN\n{}\t{}\t{}\t{}'.format(*TFPN2))

TP	TN	FP	FN
303	10646	92	115


# HyperParameter Scan

In [None]:
emb_sizes = [100]
emb_models = 1
target_saturations = [0.05, 0.1, 0.2, 0.5, 0.7]
layer_sizes = [50]
dropouts = [1.0]
learn_rates = [0.01]
epochs = [100]
NN_num = 5

case_num = len(emb_sizes)*emb_models*len(layer_sizes)*len(target_saturations)*len(epochs)*len(dropouts)*len(learn_rates)*NN_num
print(case_num)

In [None]:
max_performance = 0
n = 1

for emb_size in emb_sizes:
    print('Model Number: %d/%d' %(n, case_num))
    for i in range(emb_models):
        model = Word2Vec(sentences, min_count=1, size=emb_size)
        for saturation in target_saturations:
            sets = fn.get_traintest2 (labelled_cases, model)
            fn.saturate_training_set(sets, model, target_dict['medications'], saturation)
            for layer_size in layer_sizes:
                for drop in dropouts:
                    for rate in learn_rates:
                        for epoch in epochs:
                            for j in range(NN_num):
                                print('Model Number: %d/%d' %(n, case_num))
                                print('ES: %d EM: %d sat: %f, LS: %d, drop: %f, LR: %f, epochs: %d, NN: %d' \
                                       % (emb_size, i, saturation, layer_size, drop, rate, epoch, j))
                                NN = FF_Model(input_size=emb_size, layers=[layer_size], dropout=drop, learn_rate=rate)
                                NN.build_graph()
                                NN.train(sets, epochs=epoch)
                                res = NN.predict(sets['test_set'])
                                tru = np.argmax(sets['test_labels'], 1)
                                perf = sk.metrics.f1_score(tru, res, pos_label=0)
                                if perf > max_performance:
                                    max_performance = perf
                                    NN.save_model('gold')
                                    model.save('gold/GOLDEMB')
                                NN.close()
                                n += 1