In [1]:
import os, re, sys, time, json, codecs
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
from inspect import getargspec

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation

from sklearn import metrics

from gensim.models import KeyedVectors
from gensim.models.word2vec import Word2Vec

from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)
os.chdir("/Users/rita/Google Drive/630/project/model")

from Code.Input_functions import *

% matplotlib inline 

Using TensorFlow backend.


ModuleNotFoundError: No module named 'Code'

In [102]:
# Initialization
EMBEDDING_WORD = 'Embedding/GoogleNews-vectors-negative300.bin'
EMBEDDING_CHAR = 'Embedding/char2vecLearned25'
TRAIN_DATA_FILE = 'Dataset/questions/training'
TRAIN_DATA_FILE = 'Dataset/questions/training'
VAL_DATA_FILE = 'Dataset/cnn/questions/validation'
TEST_DATA_FILE = 'Dataset/cnn/questions/test'

N_TRAININGPOINTS = 10000
if N_TRAININGPOINTS == 10000:
    MAX_NUM_WORDS, MAX_NUM_CHARS = 16319, 66 # total 49465, 112
elif N_TRAININGPOINTS == 50000:
    MAX_NUM_WORDS, MAX_NUM_CHARS = 26751, 67 # total 71640, 181
    
MAX_SEQUENCE_LENGTH_NEWS = 700 # median ~ 700
MAX_SEQUENCE_LENGTH_QUES = 37 # max ~ 37
EMBEDDING_DIM_WORD = 300
EMBEDDING_DIM_CHAR = 25

UNK_WORD = "<UNK_WORD>"
UNK_CHAR = "^"
UNK_ENTITY = "<UNK_ENTITY>"

# 0. Helper Functions

In [3]:
# load_dataset(data_file, name, first=100000000, remove_stopwords=False, stem_words=False, remove_punc=False, keep_period=True)


# 1. Load Dataset

In [99]:
# Load Dataset
start_time = time.time()

print("Loading datasets")
'''
datasets = {"news":[], "questions":[], "answers":[]}
entities = [(news, questions, answer, entities)]
'''
entities = defaultdict(list)
trainsets, entities["train"] = load_dataset(TRAIN_DATA_FILE, "train", N_TRAININGPOINTS, remove_stopwords=False, stem_words=False, remove_punc=False)
valsets, entities["val"] = load_dataset(VAL_DATA_FILE, "val", remove_stopwords=False, stem_words=False, remove_punc=False)
testsets, entities["test"] = load_dataset(TEST_DATA_FILE, "test", remove_stopwords=False, stem_words=False, remove_punc=False)
print("Found {} questions in trainset".format(len(trainsets["answers"]))) # 380298
print("Found {} questions in valset".format(len(valsets["answers"]))) # 3924
print("Found {} questions in testset".format(len(testsets["answers"]))) # 3198

print("{} sec".format(time.time() - start_time)) # 545sec for all, 68sec for 50000

Loading datasets
Finished 1000 questions in train
Finished 2000 questions in train
Finished 3000 questions in train
Finished 4000 questions in train
Finished 5000 questions in train
Finished 6000 questions in train
Finished 7000 questions in train
Finished 8000 questions in train
Finished 9000 questions in train
Finished 10000 questions in train
Finished 1000 questions in val
Finished 2000 questions in val
Finished 3000 questions in val
Finished 1000 questions in test
Finished 2000 questions in test
Finished 3000 questions in test
Found 10001 questions in trainset
Found 3924 questions in valset
Found 3198 questions in testset
26.795522928237915 sec


In [31]:
count = 0
for i in range(len(trainsets['news'])):
    if trainsets["answers"][i] in trainsets["news"][i].split()[:MAX_SEQUENCE_LENGTH_NEWS]:
        count += 1
print("{0:.2f}% of trainsets have answers".format(count/len(trainsets['news'])*100))
print("Total unique tokens in the trainset: {}".format(len(Counter([j for i in trainsets["news"] for j in i.split()]))))
print("Total unique chars in the trainset: {}".format(len(Counter([k for i in trainsets["news"] for j in i.split() for k in j]))))
print("Median news length: {}".format(np.median([len(trainsets["news"][i].split()) for i in range(len(trainsets['news']))])))

95.94% of trainsets have answers
Total unique tokens in the trainset: 71640
Total unique chars in the trainset: 181
Median news length: 699.0


In [21]:
# test
print(testsets["questions"][0])
print(testsets["news"][0])

property experts say @placeholder investment in @entity5 is set to grow
( @entity0 ) sophisticated , glamorous and spacious - - when the super - rich go house - hunting they are searching for something special . real estate in @entity5 swankier suburbs can catch a buyers eye . @entity8 , @entity9 and @entity10 have long been the stomping ground of the elite - - and are now welcoming a new wave of @entity15 investors . " the @entity19 who are coming into @entity5 now are @entity19 who themselves have worked for their money , " explains @entity16 , a @entity18 - @entity17 wealth manager based in @entity5 . " they have grown in industry and are actually part of the exciting story of the @entity15 renaissance , " she continues . " it bringing to @entity5 the best of the continent . " these investors are having a considerable impact on @entity5 property market and they mainly come from just six countries : @entity17 , @entity32 , @entity33 , @entity34 , @entity35 and @entity36 . of these , 

# 2. Word-level Embedding

## 2.0 Handle OOV

In [100]:
handeling_oov = False

if handeling_oov == True:
    
    start_time = time.time()
    
    print('Replacing OOV')

    texts = trainsets["news"] + trainsets["questions"]
    word_counts_all = Counter([j for i in texts for j in i.split()])
    notOOV = word_counts_all.most_common(MAX_NUM_WORDS)
    OOV = word_counts_all - Counter(dict(notOOV))
    if "|" in OOV:
        OOV = OOV - Counter({"|":OOV["|"]})
    texts = [re.sub(r"( {} )".format(" | ".join(list(OOV.keys()))), " {} ".format(UNK_WORD), text) for text in texts]
    np.save("Dataset/GRU/{0}/traintext{0}_OOV.npy".format(N_TRAININGPOINTS), texts)
    
    print("{} sec".format(time.time() - start_time)) # 1600sec x 3

else:
    texts = np.load("Dataset/GRU/{0}/traintext{0}_OOV.npy".format(N_TRAININGPOINTS)).tolist()
    print(texts[0][:200])

( @entity0 ) - - each summer , @entity1 <UNK_WORD> a <UNK_WORD> of <UNK_WORD> and shoppers with eager cash <UNK_WORD> . before jumping into the peak - season pig pile , consider the advantages of an o


## 2.1 Texts to Sequences

In [101]:
start_time = time.time()

print('Transforming texts to sequences - Word Level')

tokenizer_word = Tokenizer(filters='', lower=False, split=" ", char_level=False)
tokenizer_word.fit_on_texts(texts)

trainSeqNews_word = tokenizer_word.texts_to_sequences(trainsets["news"])
trainSeqQues_word = tokenizer_word.texts_to_sequences(trainsets["questions"])

valSeqNews_word = tokenizer_word.texts_to_sequences(valsets["news"])
valSeqQues_word = tokenizer_word.texts_to_sequences(valsets["questions"])

testSeqNews_word = tokenizer_word.texts_to_sequences(testsets["news"])
testSeqQues_word = tokenizer_word.texts_to_sequences(testsets["questions"])

word_counts = tokenizer_word.word_counts
word_index = tokenizer_word.word_index
print('Found {} unique tokens'.format(len(word_index)))
print("Median News Length: {}".format(np.median(np.array([len(i.split()) for i in trainsets["news"]] + [len(i.split()) for i in valsets["news"]]))))
print("Max Question Length: {}".format(np.max(np.array([len(i.split()) for i in trainsets["questions"]] + [len(i.split()) for i in valsets["questions"]]))))

print("{} sec".format(time.time() - start_time)) # 60sec

Transforming texts to sequences - Word Level
Found 16319 unique tokens
Median News Length: 696.0
Max Question Length: 37
14.099263906478882 sec


## 2.2 Load Word Embedding

In [62]:
start_time = time.time()

print('Indexing word vectors')

word2vecGooNews = KeyedVectors.load_word2vec_format(EMBEDDING_WORD, binary=True) # a word:vec dictionary
# word2vec.save_word2vec_format('googlenews.txt')
print('Found {} word vectors of word2vec'.format(len(word2vecGooNews.vocab)))

print("{} sec".format(time.time() - start_time)) # 60sec

Indexing word vectors
Found 3000000 word vectors of word2vec
60.838651180267334 sec


## 2.3 Prepare Embeddings

In [103]:
print('Preparing embedding matrix - word')

nb_words = min(MAX_NUM_WORDS, len(word_index)) + 1

embedding_word_matrix = np.zeros((nb_words, EMBEDDING_DIM_WORD))
for word, i in word_index.items():
    if i >= MAX_NUM_WORDS:
        break
    if word in word2vecGooNews.vocab:
        embedding_word_matrix[i] = word2vecGooNews.word_vec(word)
print('Null word embeddings: {}'.format(np.sum(np.sum(embedding_word_matrix, axis=1) == 0)))
print('Embedding shape: {}'.format(embedding_word_matrix.shape))

Preparing embedding matrix - word
Null word embeddings: 1059
Embedding shape: (16320, 300)


# 3. Char-level Embedding

## 3.1 Texts to Sequences

In [104]:
start_time = time.time()

print('Transforming texts to sequences - Character Level')

texts = trainsets["news"] + trainsets["questions"]
texts = [re.sub(r"[^A-z0-9 \"\'\.\?\{\}\(\)\[\]:;!~@#$%&*<>,/+\-=_]", "^", text) for text in texts]

tokenizer_char = Tokenizer(filters='', lower=False, split=" ", char_level=True)
tokenizer_char.fit_on_texts(texts)

trainSeqNews_char = tokenizer_char.texts_to_sequences(trainsets["news"])
trainSeqQues_char = tokenizer_char.texts_to_sequences(trainsets["questions"])

valSeqNews_char = tokenizer_char.texts_to_sequences(valsets["news"])
valSeqQues_char = tokenizer_char.texts_to_sequences(valsets["questions"])

testSeqNews_char = tokenizer_char.texts_to_sequences(testsets["news"])
testSeqQues_char = tokenizer_char.texts_to_sequences(testsets["questions"])

char_index = tokenizer_char.word_index
char_counts = tokenizer_char.word_counts
print('Found {} unique tokens'.format(len(char_index)))
print("{} sec".format(time.time() - start_time)) # 109sec

Transforming texts to sequences - Character Level
Found 66 unique tokens
55.71695685386658 sec


## 3.2 Generate / Load Char Embedding

In [105]:
start_time = time.time()

chars = [list(re.sub(r"[^A-z0-9 \"\'\.\?\{\}\(\)\[\]:;!~@#$%&*<>,/+\-=_]", "^", text)) for text in texts]
# char2vecLearned = Word2Vec(chars, size=EMBEDDING_DIM_CHAR, min_count=1)
# char2vecLearned.save(EMBEDDING_CHAR)
char2vecLearned = Word2Vec.load(EMBEDDING_CHAR).wv
print('Found {} word vectors of word2vec'.format(len(char2vecLearned.vocab)))

print("{} sec".format(time.time() - start_time)) # 201sec

Found 67 word vectors of word2vec
3.7756259441375732 sec


## 3.3 Prepare Embeddings

In [106]:
print('Preparing embedding matrix - char')

nb_chars = min(MAX_NUM_CHARS, len(char_index)) + 1

embedding_char_matrix = np.zeros((nb_chars, EMBEDDING_DIM_CHAR))
for char, i in char_index.items():
    if i >= MAX_NUM_CHARS:
        break
    if char in char2vecLearned.vocab:
        embedding_char_matrix[i] = char2vecLearned.word_vec(char)
print('Null char embeddings: {}'.format(np.sum(np.sum(embedding_char_matrix, axis=1) == 0)))
print('Embedding shape: {}'.format(embedding_char_matrix.shape))       

Preparing embedding matrix - char
Null char embeddings: 2
Embedding shape: (67, 25)


# 4. Input and Labels

## 4.1 Pad Sequences as Input

In [107]:
print('Padding sequences')

News_train_word = pad_sequences(trainSeqNews_word, maxlen=MAX_SEQUENCE_LENGTH_NEWS, truncating="post")
Ques_train_word = pad_sequences(trainSeqQues_word, maxlen=MAX_SEQUENCE_LENGTH_QUES, truncating="post")
print('Shape of news tensor:', News_train_word.shape)
print('Shape of questions tensor:', Ques_train_word.shape)

News_val_word = pad_sequences(valSeqNews_word, maxlen=MAX_SEQUENCE_LENGTH_NEWS, truncating="post")
Ques_val_word = pad_sequences(valSeqQues_word, maxlen=MAX_SEQUENCE_LENGTH_QUES, truncating="post")

News_test_word = pad_sequences(testSeqNews_word, maxlen=MAX_SEQUENCE_LENGTH_NEWS, truncating="post")
Ques_test_word = pad_sequences(testSeqQues_word, maxlen=MAX_SEQUENCE_LENGTH_QUES, truncating="post")

np.save("Dataset/GRU/{0}/N_train{0}.npy".format(N_TRAININGPOINTS), News_train_word)
np.save("Dataset/GRU/{0}/Q_train{0}.npy".format(N_TRAININGPOINTS), Ques_train_word)
np.save("Dataset/GRU/{0}/N_val{0}.npy".format(N_TRAININGPOINTS), News_val_word)
np.save("Dataset/GRU/{0}/Q_val{0}.npy".format(N_TRAININGPOINTS), Ques_val_word)
np.save("Dataset/GRU/{0}/N_test{0}.npy".format(N_TRAININGPOINTS), News_test_word)
np.save("Dataset/GRU/{0}/Q_test{0}.npy".format(N_TRAININGPOINTS), Ques_test_word)

Padding sequences
Shape of news tensor: (10001, 700)
Shape of questions tensor: (10001, 37)


## 4.2 Input Word Embedding

In [108]:
print('Combine input embedding')

num_words = len(word_index) + 1
sorted_word_index = sorted(word_index.items(), key=lambda x:x[1])
embedding_input_matrix = np.zeros((num_words, EMBEDDING_DIM_WORD + EMBEDDING_DIM_CHAR))
for wordcode in range(1, num_words):
    
    if wordcode == word_index[UNK_WORD]:
        continue
    
    wordorig = sorted_word_index[wordcode-1][0]
    
    # get word embedding
    try:
        word_level_embedding = embedding_word_matrix[wordcode]
    except:
        word_level_embedding = embedding_word_matrix[word_index[UNK_WORD]] # (300,)
    
    # get char embedding
    char_level_embedding =[]
    for charorig in wordorig:
        try:
            charcode = char_index[charorig]
        except:
            charcode = char_index[UNK_CHAR]
        char_level_embedding.append(embedding_char_matrix[charcode])
    char_level_embedding = np.mean(np.array(char_level_embedding), axis=0) # (25,)
    
    # combine word and char embedding
    embedding_input_matrix[wordcode] = np.concatenate((word_level_embedding, char_level_embedding)) # (325,)

print('Null word embeddings: {}'.format(np.sum(np.sum(embedding_input_matrix, axis=1) == 0)))
print('Embedding shape: {}'.format(embedding_input_matrix.shape))
np.save("Dataset/GRU/{0}/embedding_input_matrix{0}.npy".format(N_TRAININGPOINTS), embedding_input_matrix)

Combine input embedding
Null word embeddings: 2
Embedding shape: (16320, 325)


## 4.3 Output Index

In [109]:
entity_index = sorted(list(set([w for w in word_index.keys() if w.startswith('@entity')] + trainsets["answers"])), key=lambda x:int(x[7:])) + [UNK_ENTITY]
entity_index = {w: index for (index, w) in enumerate(entity_index)}
print('Found unique entity: {}'.format(len(entity_index)))
np.save("Dataset/GRU/{0}/entity_index{0}.npy".format(N_TRAININGPOINTS), entity_index)

y_train = np.array([entity_index[trainsets["answers"][i]] for i in range(len(trainsets["answers"]))])
y_val = np.array([entity_index[valsets["answers"][i]] for i in range(len(valsets["answers"]))])
y_test = np.array([entity_index[testsets["answers"][i]] for i in range(len(testsets["answers"]))])
print('y_train labels: {}'.format(len(y_train)))
print('y_val labels: {}'.format(len(y_val)))
print('y_test labels: {}'.format(len(y_test)))
np.save("Dataset/GRU/{0}/y_train{0}.npy".format(N_TRAININGPOINTS), y_train)
np.save("Dataset/GRU/{0}/y_val{0}.npy".format(N_TRAININGPOINTS), y_val)
np.save("Dataset/GRU/{0}/y_test{0}.npy".format(N_TRAININGPOINTS), y_test)

Found unique entity: 386
y_train labels: 10001
y_val labels: 3924
y_test labels: 3198
