In [None]:
import utils
import time
from collections import Counter
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
import ProposedModel as proposedModel


# Settings

In [None]:
# ---------------------------------------------------------------------------
# Data: fill these three lists before running the rest of the notebook.
# ---------------------------------------------------------------------------
inputs = list()      # questions / sentences to be classified
articleIds = list()  # outputs (labels / kbaID)
outputs = list()     # text answers from the QA system; only needed for Word2Vec training

# ---------------------------------------------------------------------------
# Model switches
# ---------------------------------------------------------------------------
useAttention, useBiRNN = True, True   # both are forwarded to the model constructor
usePretrainedEmbeddings = True        # True: load GloVe vectors, False: train Word2Vec on the corpus
useDynamicEmbeddings = True           # True: the embedding matrix is wrapped in a tf.Variable

# ---------------------------------------------------------------------------
# Training hyperparameters
# ---------------------------------------------------------------------------
epochs = 30
batchSize = 5000
dropout = 0.4            # training runs with keep_prob = 1 - dropout
n_hidden_nodes = 200
rnn_layers = 2

# Preprocessing of corpus

In [None]:
def _preprocess_corpus(announcement, texts, all_words):
    """Tokenise every text of one corpus and report how long it took.

    Each text is passed through ``utils.preprocess`` and split on single
    spaces. The resulting tokens are appended to ``all_words`` (mutated in
    place) and also returned as one token list per text.
    """
    print(announcement)
    began = time.time()
    tokens_per_text = []
    for text in texts:
        tokens = utils.preprocess(text).split(" ")
        all_words.extend(tokens)
        tokens_per_text.append(tokens)
    print("Done! Time passed {:.4f} sec".format(time.time() - began))
    return tokens_per_text


# Flat token stream over both corpora: answers first, then questions.
words = []
wordsPerInteractionOutput = _preprocess_corpus("Preprocessing output corpus..", outputs, words)
wordsPerInteractionInput = _preprocess_corpus("Preprocessing input corpus..", inputs, words)

word_counts = Counter(words)

print("Total words: {}".format(len(words)))
# The Counter holds one key per distinct token, so its size is the unique-word count.
print("Unique words: {}".format(len(word_counts)))

# Load relevant GloVe embeddings 
Discard the ones that do not appear in our vocabulary.

In [None]:
if usePretrainedEmbeddings:

    setWords = set(words)

    # Load GloVe vectors, keeping only the words that occur in our corpus.
    filepath_glove = 'glove.6B.300d.txt'
    glove_vocab = []      # words, in the same order as their rows in glove_embd
    glove_embd = []       # one vector per entry of glove_vocab
    embedding_dict = {}   # word -> vector, also used for O(1) membership tests

    # Iterate over the file object instead of calling readlines(), so the
    # whole file is never held in memory at once. The context manager closes
    # the handle even if a line fails to parse.
    with open(filepath_glove, 'r', encoding='UTF-8') as glove_file:
        for line in glove_file:
            row = line.strip().split(' ')
            vocab_word = row[0]
            if vocab_word in setWords:
                embed_vector = [float(value) for value in row[1:]]
                glove_vocab.append(vocab_word)
                glove_embd.append(embed_vector)
                embedding_dict[vocab_word] = embed_vector

    print('Loaded GLOVE')

    # Take the vector width from what was actually loaded; fall back to 300
    # (the width named in the default file) when no corpus word was found.
    embedding_size = len(glove_embd[0]) if glove_embd else 300

    # Generate random vectors in [-1, 1) for words that were not found in GloVe.
    for word in setWords:
        # Dictionary lookup instead of scanning the glove_vocab list.
        if word not in embedding_dict:
            embedding = 2 * np.random.random_sample(embedding_size) - 1
            glove_vocab.append(word)
            glove_embd.append(embedding)
            embedding_dict[word] = embedding

    # The first vector is a 0 vector for padding, so the row of
    # glove_vocab[i] is embeddings[i + 1].
    glove_embd.insert(0, np.zeros(embedding_size))

    # Cast into array
    embeddings = np.asarray(glove_embd, dtype=np.float32)
    vocabulary = glove_vocab

# Or train embeddings on Dataset

In [None]:
def get_target(words, idx, window_size=5):
    """Return the distinct context words surrounding position ``idx``.

    A radius is drawn uniformly from 1..window_size (inclusive). The words up
    to that many positions before and after ``idx`` are collected, the word at
    ``idx`` itself is left out, and duplicates are removed.
    """
    radius = np.random.randint(1, window_size + 1)
    left_edge = max(idx - radius, 0)   # clamp at the start of the sentence
    right_edge = idx + radius + 1      # exclusive; slicing clamps at the end
    before = words[left_edge:idx]
    after = words[idx + 1:right_edge]
    return list(set(before + after))


def get_batches(n_sentences_in_batch, sentences, window_size=5):
    """Yield skip-gram training pairs, one batch per group of sentences.

    ``sentences`` is consumed in consecutive slices of ``n_sentences_in_batch``.
    For every word of every sentence in a slice, ``get_target`` supplies its
    context words. Each yielded ``(x, y)`` is a pair of equally long lists in
    which ``x[k]`` is a centre word and ``y[k]`` is one of its context words.
    """
    for offset in range(0, len(sentences), n_sentences_in_batch):
        centres, contexts = [], []
        for sentence in sentences[offset:offset + n_sentences_in_batch]:
            for position, centre_word in enumerate(sentence):
                neighbours = get_target(sentence, position, window_size)
                contexts += neighbours
                # Repeat the centre word once per context word so both lists stay aligned.
                centres += [centre_word] * len(neighbours)
        yield centres, contexts
        

In [None]:
if not usePretrainedEmbeddings:
    # Word2Vec settings; only defined when the embeddings are trained on the
    # corpus instead of being loaded from GloVe.
    n_embedding = 300            # width of the learned embedding vectors
    window_size = 5              # maximum context radius handed to get_batches
    n_sampled = 10               # number of classes sampled by the sampled-softmax loss
    n_sentences_in_batch = 200   # sentences per training batch
    word2vecEpochs = 10
    learning_rate = 0.001        # step size for the Adam optimizer

In [None]:
if not usePretrainedEmbeddings:

    # Strip empty tokens from both corpora, updating the existing lists in place.
    wordsPerInteractionInput[:] = [
        [token for token in tokens if token != ""]
        for tokens in wordsPerInteractionInput
    ]
    wordsPerInteractionOutput[:] = [
        [token for token in tokens if token != ""]
        for tokens in wordsPerInteractionOutput
    ]

    # Training corpus for Word2Vec: answers first, then questions.
    wordsPerInteraction = wordsPerInteractionOutput + wordsPerInteractionInput

    vocab_to_int, int_to_vocab = utils.create_lookup_tables(words)

    # Same sentences, with every word replaced by its integer id.
    int_wordsPerInteraction = [
        [vocab_to_int[token] for token in tokens]
        for tokens in wordsPerInteraction
    ]

In [None]:
if not usePretrainedEmbeddings:

    tf.reset_default_graph()

    # One extra row: row 0 is zeroed after training so it can serve as padding.
    # NOTE(review): this presumes the ids from utils.create_lookup_tables start at 1 - confirm.
    n_vocab = len(int_to_vocab) + 1

    # Separate Python names for the placeholders, so the notebook-level
    # `inputs` corpus is not overwritten by a tensor (the graph names are unchanged).
    w2v_inputs = tf.placeholder(tf.int32, [None], name='inputs')
    w2v_labels = tf.placeholder(tf.int32, [None, None], name='labels')
    embedding = tf.Variable(tf.random_uniform((n_vocab, n_embedding), -1, 1), name='embedding')
    embed = tf.nn.embedding_lookup(embedding, w2v_inputs)
    softmax_w = tf.Variable(tf.truncated_normal((n_vocab, n_embedding), stddev=0.1), name='softmax_w')
    softmax_b = tf.Variable(tf.zeros(n_vocab), name='softmax_b')
    # Calculate the loss using negative sampling
    sampled_loss = tf.nn.sampled_softmax_loss(softmax_w, softmax_b,
                                              w2v_labels, embed,
                                              n_sampled, n_vocab)

    # Scale every embedding row to unit L2 norm.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keep_dims=True))
    normalized_embedding = embedding / norm

    cost = tf.reduce_mean(sampled_loss)
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

    saver = tf.train.Saver()

    # Saver.save does not create missing parent directories.
    tf.gfile.MakeDirs("Word2VecCheckpoints")

    iteration = 1
    running_loss = 0  # loss summed since the last progress report

    # The context manager releases the session once the embeddings are fetched.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for e in range(1, word2vecEpochs + 1):
            batches = get_batches(n_sentences_in_batch, int_wordsPerInteraction, window_size)
            start = time.time()
            for x, y in batches:
                if not x:
                    # A group of sentences without any (word, context) pair has nothing to train on.
                    continue

                feed = {w2v_inputs: x,
                        w2v_labels: np.array(y)[:, None]}
                train_loss, _ = sess.run([cost, optimizer], feed_dict=feed)

                running_loss += train_loss

                if iteration % 100 == 0:
                    end = time.time()
                    print("Epoch {}/{}".format(e, word2vecEpochs),
                          "Iteration: {}".format(iteration),
                          "Avg. Training loss: {:.4f}".format(running_loss / 100),
                          "{:.4f} sec/batch".format((end - start) / 100))
                    running_loss = 0
                    start = time.time()
                iteration += 1

            # One checkpoint per epoch. The stray `break` statements that
            # previously ended training after the first batch (and made this
            # line unreachable) have been removed.
            save_path = saver.save(sess, "Word2VecCheckpoints/Epoch{}.ckpt".format(e))

        embeddings = sess.run(normalized_embedding)

    # Row 0 is the padding vector; use the configured width rather than a literal 300.
    embeddings[0] = np.zeros(n_embedding)

# Create Lookup tables

In [None]:
# With pretrained embeddings the word ids come from the GloVe-derived
# vocabulary; otherwise vocab_to_int already exists from the Word2Vec cells.
if usePretrainedEmbeddings:
    vocab_to_int, int_to_vocab = utils.create_lookup_tables(vocabulary)

article_to_int, int_to_article = utils.create_lookup_tables(articleIds)

# Integer label for every example, in the same order as articleIds.
int_article = [article_to_int[article] for article in articleIds]

# Every input sentence as a list of word ids; empty tokens are dropped.
int_wordsPerInteractionInput = [
    [vocab_to_int[token] for token in tokens if token != ""]
    for tokens in wordsPerInteractionInput
]

# Build Model

In [None]:
tf.reset_default_graph()

# x: batch of word-id sequences, y: one integer label per sequence.
x = tf.placeholder(tf.int32, [None, None])
y = tf.placeholder(tf.int32, [None])


# Wrapping the matrix in a tf.Variable lets the optimizer update it.
# Caution: this rebinds `embeddings` to a graph variable, so re-running this
# cell requires re-running the cell that produced the numpy embeddings first.
if useDynamicEmbeddings:
    embeddings = tf.Variable(embeddings)

embed = tf.nn.embedding_lookup(embeddings, x)

out_dim = len(set(articleIds))
# NOTE(review): onehot_encoding_articles has one row per training example (it
# is built from int_article), yet it is indexed below with the label ids fed
# through `y`. That produces one_hot(int_article[y]) rather than one_hot(y).
# Confirm this is intended; `tf.one_hot(y, out_dim)` may be what was meant,
# depending on whether utils.create_lookup_tables ids start at 0 or 1.
onehot_encoding_articles = tf.one_hot(int_article, out_dim)
labels = tf.nn.embedding_lookup(onehot_encoding_articles, y)

maxLengthBatch = tf.placeholder(tf.int32)
keep_prob = tf.placeholder(tf.float32)

# num_hidden now receives n_hidden_nodes. It was previously given rnn_layers,
# which left the configured n_hidden_nodes setting unused.
model = proposedModel.VariableSequenceClassification(
    embed, labels, maxLengthBatch, keep_prob,
    useAttention, useBiRNN,
    num_hidden=n_hidden_nodes, num_layers=rnn_layers)


# Split dataset into training and test sets

In [None]:
# Hold out 5% of the examples for evaluation; the fixed random_state keeps
# the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    int_wordsPerInteractionInput,
    int_article,
    test_size=0.05,
    random_state=24403,
)


# Training

In [None]:
sess = tf.Session()
saver = tf.train.Saver()

sess.run(tf.global_variables_initializer())
#saver.restore(sess, tf.train.latest_checkpoint('Checkpoints'))

# Saver.save does not create missing parent directories.
tf.gfile.MakeDirs("Checkpoints")

# Evaluation after every epoch uses only the first batch of the test set.
batches = utils.get_batches_rnn(batchSize, X_test, y_test)
firstTestBatch = next(batches, None)
if firstTestBatch is None:
    # Fail with an explanation instead of a bare StopIteration.
    raise ValueError(
        "utils.get_batches_rnn produced no batch for the test set "
        "(batchSize={}, test examples={})".format(batchSize, len(X_test)))
x_batchTest, y_batchTest, maxLengthTest = firstTestBatch

trainErr = []  # error of the last training batch of each epoch
evalErr = []   # error on the held-out batch after each epoch


for epoch in range(1, epochs+1):

    batches = utils.get_batches_rnn(batchSize, X_train, y_train)
    batchesCount = 0
    lastErr = None
    start = time.time()

    for x_batch, y_batch, maxLength in batches:
        # Dropout is active during training only.
        _ , err = sess.run([model.optimize, model.error], {x: x_batch, y: y_batch, maxLengthBatch: maxLength, keep_prob: (1-dropout)})
        if batchesCount % 10 == 0:
            print('Epoch {:2d}, Step {:4d} error {:3.1f}%'.format(epoch,batchesCount, 100 * err))
        batchesCount += 1
        lastErr = err

    if lastErr is None:
        # Without this guard the append below would fail with a NameError.
        raise ValueError(
            "utils.get_batches_rnn produced no training batch "
            "(batchSize={}, training examples={})".format(batchSize, len(X_train)))
    trainErr.append(lastErr)

    saver.save(sess, "Checkpoints/epoch{}.ckpt".format(epoch))

    # keep_prob = 1 disables dropout for evaluation.
    error = sess.run(model.error, {x: x_batchTest, y: y_batchTest, maxLengthBatch: maxLengthTest, keep_prob: 1})
    evalErr.append(error)

    end = time.time()
    print("---Eval---")
    print('Eval Error {:3.1f}%'.format(100 * error))
    print("Epoch Time passed {:.4f} sec".format((end-start)))
    print("----------")