In [136]:
import re
import os
import sys
import collections
import pandas as pd
import numpy as np

from keras.optimizers import *
from keras.callbacks import *
from keras.models import *
from keras.layers import *
from keras.initializers import *
from keras.activations import *
from keras_layer_normalization import LayerNormalization

import tensorflow as tf
from sklearn.model_selection import train_test_split

import nltk
nltk.download('punkt')

# --- Notebook-wide configuration -------------------------------------------
# Passed as the second argument of the Embedding layer when the model is built
# (presumably the embedding width -- TODO confirm against the model cell).
d_model = 512
# Length that bAbIUtils.vectorize_sentence pads every token-id list up to.
# NOTE(review): the same value is also handed to Embedding as its first
# argument -- confirm that is intended.
bAbI_max_len = 12

# Relative directories of the bAbI dataset variants. Only data_en_path is used
# by the cells visible here; the names suggest 10k-example and validation-split
# variants -- TODO confirm against the dataset layout.
data_en_path = "data/en/"
data_en_10k_path = "data/en-10k/"
data_en_valid_path = "data/en-valid/"
data_en_valid_10k_path = "data/en-valid-10k/"

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ICPS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [111]:
class Sentence:
    """A numbered line of a bAbI story.

    Attributes:
        num: The line number exactly as it was read (kept as given, not
            converted).
        sentence: The line's content -- the raw text, or a list of token ids
            when the instance stores a vectorized sentence.
    """

    def __init__(self, num, sentence):
        # Store both fields untouched; no validation or conversion happens here.
        self.num, self.sentence = num, sentence

class bAbISet:
    """One bAbI example: context sentences, a question, its answer and supporting facts.

    Attributes:
        task: Name of the task the example belongs to.
        sentences: Iterable of objects exposing ``num`` and ``sentence``.
        question: Question text.
        answer: Answer text.
        supporting_num: Supporting-fact reference(s) as read from the file.
        vec_sentences, vec_question, vec_answer: Vectorized counterparts of the
            fields above; ``None`` until :meth:`add_vec_data` is called.
    """

    _RULE = "============================================================="

    def __init__(self, task=None, sentences=None, question=None, answer=None, supporting_num=None):
        self.task = task
        self.sentences = sentences
        self.question = question
        self.answer = answer
        self.supporting_num = supporting_num
        # Create the vectorized slots up front so that Print_vec's ``is None``
        # guard works before add_vec_data has run (it used to raise
        # AttributeError because the attribute did not exist yet).
        self.vec_sentences = None
        self.vec_question = None
        self.vec_answer = None

    def add_vec_data(self, vec_sentences, vec_question, vec_answer):
        """Attach the vectorized sentences, question and answer to this example."""
        self.vec_sentences = vec_sentences
        self.vec_question = vec_question
        self.vec_answer = vec_answer

    def _print_report(self, sentences, question, answer):
        """Print one framed report; shared by Print and Print_vec."""
        print(self._RULE)
        print(">> Task: ", self.task)
        print(">> Sentences: ", len(sentences))
        for sentence in sentences:
            print(sentence.num, ": ", sentence.sentence)
        print(">> Question: ", question)
        print(">> Answer: ", answer)
        print(">> Supporting Fact: ", self.supporting_num)
        print(self._RULE)

    def Print(self):
        """Print the textual form of the example."""
        # ``sentences`` defaults to None; treat that as "no sentences" instead
        # of failing on len(None).
        self._print_report(self.sentences or [], self.question, self.answer)

    def Print_vec(self):
        """Print the vectorized form of the example; no-op until vectors are attached."""
        if self.vec_sentences is None:
            return

        # The count is taken from the vectorized list that is actually printed.
        self._print_report(self.vec_sentences, self.vec_question, self.vec_answer)
        

In [137]:
class bAbIUtils:
    """Load bAbI task files from a directory and turn their text into token ids."""

    def __init__(self, path):
        """Collect the files found directly inside ``path``.

        Args:
            path: Directory that holds the task files.
        """
        # os.listdir returns entries in arbitrary order, so sort them to keep
        # the order of the loaded examples reproducible. Sub-directories are
        # skipped because data_processing opens every entry as a text file.
        candidates = (os.path.join(path, name) for name in sorted(os.listdir(path)))
        self.files = [p for p in candidates if os.path.isfile(p)]

    @staticmethod
    def _task_from_path(path):
        """Return the text between the first and second underscore of the file name.

        Only the base name is inspected, so underscores in directory names do
        not disturb the result. If the name has fewer than two underscores the
        name without its extension is returned.
        """
        name = os.path.basename(path)
        parts = name.split('_')
        if len(parts) >= 3:
            return parts[1]
        return os.path.splitext(name)[0]

    def make_question_set(self, sentences):
        """Group raw lines into chunks that each end with a question line.

        Every line is lower-cased. A line containing a tab closes the current
        chunk. Lines after the last tab-containing line are dropped.

        NOTE(review): the buffer is reset after every question, so a later
        question only sees the lines since the previous question -- confirm
        this is intended for stories that contain several questions.

        Args:
            sentences: Iterable of raw text lines.

        Returns:
            A list of chunks, each a list of lower-cased lines.
        """
        all_set = []
        temp_set = []
        for sentence in sentences:
            temp_set.append(sentence.lower())
            if '\t' in sentence:
                all_set.append(temp_set)
                temp_set = []

        return all_set

    def data_processing(self, max_sets_per_file=1):
        """Parse the files in ``self.files`` into ``bAbISet`` objects.

        Args:
            max_sets_per_file: Number of leading question sets to keep from
                each file. Defaults to 1 (the previously hard-coded limit);
                pass ``None`` to keep every set.

        Returns:
            A list of ``bAbISet`` objects in file order.
        """
        print("※ Data Processing...")
        all_data = []
        for file in self.files:
            task = self._task_from_path(file)

            # Explicit encoding so the result does not depend on the platform
            # default (e.g. cp1252 on Windows).
            with open(file, 'r', encoding='utf-8') as f:
                raw_set = self.make_question_set(f.readlines())

            for one_set in raw_set[:max_sets_per_file]:
                sentence_set = []
                question = answer = supporting_num = None
                for line in one_set:
                    line = line.replace('\n', '')

                    # Question lines look like "<n> <question>\t<answer>\t<facts>".
                    fields = line.split('\t', 2)
                    if len(fields) > 1:
                        question = fields[0].partition(' ')[2]
                        answer = fields[1]
                        # A missing supporting-fact column no longer swallows
                        # the answer; it just yields an empty string.
                        supporting_num = fields[2] if len(fields) > 2 else ''
                    else:
                        number, _, text = line.partition(' ')
                        sentence_set.append(Sentence(number, text))

                all_data.append(bAbISet(task, sentence_set, question, answer, supporting_num))

        return all_data

    def make_dictionary(self, all_set, max_vocab=20000):
        """Build a word -> id mapping from the sentences, questions and answers.

        Ids 0-3 are reserved for ``<PAD>``, ``<UNK>``, ``<EOS>`` and ``<S>``;
        the remaining ids are assigned in descending word frequency.

        Args:
            all_set: Iterable of objects exposing ``sentences``, ``question``
                and ``answer``.
            max_vocab: Assignment stops once the next free id reaches this
                value. Defaults to 20000, the previously hard-coded limit.

        Returns:
            dict mapping each token to its integer id.
        """
        print(">> Make Dictionary...")

        counts = collections.Counter()
        for one_set in all_set:
            texts = [item.sentence for item in one_set.sentences]
            texts.append(one_set.question)
            texts.append(one_set.answer)
            for text in texts:
                counts.update(nltk.word_tokenize(text))

        dictionary = {'<PAD>': 0, '<UNK>': 1, '<EOS>': 2, '<S>': 3}
        idx = 4
        for word, _count in counts.most_common():
            if len(word) > 0:
                dictionary[word] = idx
                idx += 1

            if idx >= max_vocab:
                break

        return dictionary

    def vectorize_sentence(self, sentence, dictionary, max_len=None):
        """Convert ``sentence`` into a list of token ids padded with ``<PAD>``.

        Tokens missing from ``dictionary`` map to ``<UNK>``. No ``<S>`` or
        ``<EOS>`` marker is added. Inputs with more than ``max_len`` tokens
        are returned as-is: they are neither padded nor truncated.

        Args:
            sentence: Text to tokenize with ``nltk.word_tokenize``.
            dictionary: Word -> id mapping containing ``<UNK>`` and ``<PAD>``.
            max_len: Length to pad up to; defaults to the module-level
                ``bAbI_max_len``.

        Returns:
            list of int token ids.
        """
        if max_len is None:
            max_len = bAbI_max_len

        vec_sentence = [
            dictionary[word] if word in dictionary else dictionary['<UNK>']
            for word in nltk.word_tokenize(sentence)
            if len(word) > 0
        ]

        missing = max_len - len(vec_sentence)
        if missing > 0:
            vec_sentence.extend([dictionary['<PAD>']] * missing)

        return vec_sentence

In [138]:
# Load the examples found under data_en_path and build the word -> id
# dictionary from their sentences, questions and answers.
# Note: data_processing keeps only the first question set of each file.
bAbI_util = bAbIUtils(data_en_path)
bAbI = bAbI_util.data_processing()
dictionary = bAbI_util.make_dictionary(bAbI)

print("Done")

※ Data Processing...
>> Make Dictionary...
Done


In [139]:
# Attach a vectorized copy (lists of token ids) to every loaded example.
# NOTE(review): max_len is initialised here but never updated in this cell --
# confirm whether a later cell relies on it.
max_len = 0
for one_set in bAbI:
    # Vectorize each context sentence, keeping its original line number.
    vec_sentences = []
    for sentence in one_set.sentences:
        vec_sentence = bAbI_util.vectorize_sentence(sentence.sentence, dictionary)
        vec_sentences.append(Sentence(sentence.num, vec_sentence))
        
    # The question and the answer go through the same tokenizer and padding.
    vec_question = bAbI_util.vectorize_sentence(one_set.question, dictionary)
    vec_answer = bAbI_util.vectorize_sentence(one_set.answer, dictionary)
    one_set.add_vec_data(vec_sentences, vec_question, vec_answer)

print("Done")

Done


In [141]:
class E2EMN:
    def __init__(self):
        A = Embedding(bAbI_max_len, d_model)