In [13]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, GRU, Dropout
from keras.optimizers import RMSprop, Adam
from keras.utils.data_utils import get_file
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.initializers import Constant
import numpy as np
import random
import sys
import io
import os

In [2]:
# From https://gist.github.com/jovianlin/0a6b7c58cde7a502a68914ba001c77bf
def load_glove_embeddings(fp, embedding_dim, include_empty_char=False):
    """
    Loads pre-trained word embeddings (GloVe embeddings).

    Every non-blank line is split on whitespace: the first field is the word
    (lower-cased), the next `embedding_dim` fields are parsed as float32
    coefficients. Lines whose coefficients cannot be parsed are reported on
    stdout and skipped. When a word occurs more than once, the last vector
    wins but the index assigned at its first occurrence is kept.

        Inputs: - fp: filepath of pre-trained glove embeddings (read as UTF-8)
                - embedding_dim: dimension of each vector embedding
                - include_empty_char: whether to include empty char in vocab
        Outputs:
                - word2coefs: Dictionary. Word to embedding vector
                - word2index: Dictionary. Word to word-index
                - embedding_matrix: Embedding matrix for Keras Embedding layer
    """
    # First, build the "word2coefs" and "word2index"
    word2coefs = {} # word to its corresponding coefficients
    word2index = {} # word to word-index
    # Decode explicitly as UTF-8 instead of relying on the platform default,
    # which is not UTF-8 everywhere and would mangle or reject non-ASCII tokens.
    with open(fp, 'r', encoding='utf-8') as f:
        for line in f:
            data = line.split()
            if not data:
                # Blank line: there is no word to register.
                continue
            word = data[0].lower()
            try:
                coefs = np.asarray(data[1:embedding_dim+1], dtype='float32')
            except ValueError as e:
                # A coefficient field was not a number; skip just this line.
                print('Exception occurred in `load_glove_embeddings`:', e)
                continue
            word2coefs[word] = coefs
            if word not in word2index:
                word2index[word] = len(word2index)
    if include_empty_char:
        word2index[''] = len(word2index)
    # Second, build the "embedding_matrix"
    # One row per index. The optional empty-char row, and the row of any word
    # whose vector has fewer than `embedding_dim` values, stay all-zeros.
    vocab_size = len(word2index)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, idx in word2index.items():
        embedding_vec = word2coefs.get(word)
        if embedding_vec is not None and embedding_vec.shape[0] == embedding_dim:
            embedding_matrix[idx] = embedding_vec
    return word2coefs, word2index, embedding_matrix

In [3]:
# Input locations. Each can be overridden with an environment variable of the
# same name so the notebook is not tied to one machine's directory layout; the
# defaults are the original hard-coded paths.
GLOVE_FILE = os.environ.get('GLOVE_FILE', '/mnt/glove.6B/glove.6B.100d.txt')
TEXT_DATA_DIR = os.environ.get('TEXT_DATA_DIR', './dataflow/sents.prev')
MAX_SEQUENCE_LENGTH = 70  # every sequence is padded/truncated to this many tokens
MAX_NUM_WORDS = 400000  # vocabulary cap passed to the Tokenizer
EMBEDDING_DIM = 100  # number of coefficients read per word from GLOVE_FILE
TEST_SPLIT = 0.2  # fraction of samples held out as the test set
VALIDATION_SPLIT = 0.2  # fraction of samples held out as the validation set

In [4]:
# Step one: parse the GloVe file into word -> vector and word -> row-index
# lookups, together with the matrix that stacks the vectors by row index.

print('Indexing word vectors and preparing embedding matrix.')

glove_artifacts = load_glove_embeddings(GLOVE_FILE, EMBEDDING_DIM)
glove_word2coefs, glove_word2index, glove_embedding_matrix = glove_artifacts

glove_vector_count = len(glove_word2index)
print('Found %s word vectors.' % glove_vector_count)

Indexing word vectors and preparing embedding matrix.
Found 400000 word vectors.


In [5]:
import glob

# second, prepare text samples
print('Processing text dataset')

texts = []  # sentences read from lines that start with '<'
trans_texts = []  # transformed sentences read from lines that start with '>'
# Python 3 decodes the files as latin-1; Python 2's open() takes no encoding argument.
open_kwargs = {} if sys.version_info < (3,) else {'encoding': 'latin-1'}
for fname in sorted(glob.glob(os.path.join(TEXT_DATA_DIR, 'dataset-*'))):
    if not os.path.isfile(fname):
        continue
    with open(fname, **open_kwargs) as f:
        for line in f:
            # The first two characters (the marker and the character after it) are dropped.
            if line.startswith('<'):
                texts.append(line[2:].strip())
            elif line.startswith('>'):
                trans_texts.append(line[2:].strip())
    # The two lists are later indexed in parallel as (input, target) pairs, so
    # a count mismatch would silently misalign every pair that follows.
    if len(texts) != len(trans_texts):
        raise ValueError('Unpaired sentences after reading %s: %d source vs %d transformed'
                         % (fname, len(texts), len(trans_texts)))
print('Found %s sentences.' % len(texts))

Processing text dataset
Found 269538 sentences.


In [6]:
# Index 0 is never assigned to a word by the Tokenizer, so it is free to serve
# as the padding value below.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, lower=True, oov_token='UNK', filters='')
tokenizer.fit_on_texts(texts + trans_texts)
text_word2index = tokenizer.word_index  # word -> integer index

# Inputs and targets are padded/truncated identically: at the end, to a fixed
# length, using 0 as the fill value.
pad_options = dict(maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post', value=0)

in_sequences = tokenizer.texts_to_sequences(texts)
_in_data = pad_sequences(in_sequences, **pad_options)

out_sequences = tokenizer.texts_to_sequences(trans_texts)
_out_data = pad_sequences(out_sequences, **pad_options)

vocab_count = len(text_word2index)
# One extra row/class for the unused index 0.
num_words = min(MAX_NUM_WORDS, vocab_count) + 1

print('Found %s unique tokens.' % vocab_count)

Found 64039 unique tokens.


In [7]:
# Build the embedding table used by the model: row `row` holds the vector of
# the word the tokenizer mapped to index `row`. Rows that are never written
# stay all-zero.
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, row in text_word2index.items():
    if row > MAX_NUM_WORDS:
        continue
    glove_row = glove_word2index.get(word)
    if glove_row is None:
        # No GloVe vector for this word: draw a standard-normal vector and
        # rescale it so that its squared L2 norm equals 32.
        embedding_vector = np.random.normal(loc=0, scale=1, size=(1, EMBEDDING_DIM))
        embedding_vector *= np.sqrt(32 / (np.sum(np.square(embedding_vector))))
    else:
        embedding_vector = glove_embedding_matrix[glove_row]
    embedding_matrix[row] = embedding_vector

In [8]:
# shuffle the paired sequences, then split them into training, test and
# validation sets (in that order along the shuffled array)
num_samples = _in_data.shape[0]
indices = np.arange(num_samples)
np.random.shuffle(indices)
in_data = _in_data[indices]
out_data = _out_data[indices]
num_test_samples = int(TEST_SPLIT * num_samples)
num_validation_samples = int(VALIDATION_SPLIT * num_samples)

# Use explicit boundaries instead of negative offsets: `a[-0:]` is the whole
# array and `a[:-0]` is empty, so negative slicing breaks when a split is 0.
train_end = num_samples - num_validation_samples - num_test_samples
test_end = train_end + num_test_samples

# Targets get a trailing axis of length 1 (`[..., None]`-style indexing).
x_train = in_data[:train_end]
y_train = out_data[:train_end, :, None]
x_test = in_data[train_end:test_end]
y_test = out_data[train_end:test_end, :, None]

x_val = in_data[test_end:]
y_val = out_data[test_end:, :, None]

In [14]:
# build the model: embedding -> single GRU layer -> dropout -> dense ->
# per-timestep softmax over the vocabulary
print('Build model...')

BATCH_SIZE = 16

print(x_train.shape, x_val.shape)

# The inputs are integer word indices, so the input tensor is declared int32.
input_layer = Input(shape=(MAX_SEQUENCE_LENGTH,), name='word1', dtype='int32')
print(input_layer.shape)

# load pre-trained word embeddings into an Embedding layer
# (trainable=True, so the vectors are updated during training)
A = Embedding(input_dim=num_words,
              output_dim=EMBEDDING_DIM,
              embeddings_initializer=Constant(embedding_matrix),
              name='embedding_layer',
              input_length=MAX_SEQUENCE_LENGTH,
              trainable=True)(input_layer)
print(A.shape)

# return_sequences=True keeps one hidden state per timestep.
A = GRU(128, return_sequences=True, name='hidden_gru')(A)
A = Dropout(0.2, name='dropout')(A)
print(A.shape)

A = Dense(EMBEDDING_DIM, name='output', activation='sigmoid')(A)
print(A.shape)

# Bias-free projection to one softmax score per vocabulary index.
A = Dense(num_words, name='projection', use_bias=False, activation='softmax')(A)
print(A.shape)

print(y_train.shape, y_val.shape)

# NOTE(review): lr=0.1 is far above Adam's usual default (0.001 in Keras) —
# confirm this is intentional.
optimizer = Adam(lr=0.1)
model = Model(inputs=[input_layer], outputs=[A])
# NOTE(review): padding positions (index 0) are not masked, so they presumably
# count towards the loss and the accuracy metric — verify.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=optimizer,
              metrics=['sparse_categorical_accuracy'])

Build model...
(161724, 70) (53907, 70)
(?, 70)
(?, 70, 100)
(?, ?, 128)
(?, 70, 100)
(?, 70, 64040)
(161724, 70, 1) (53907, 70, 1)


In [15]:
# Train the model, passing (x_val, y_val) as validation data; the returned
# object is kept in `history`.
NUM_EPOCHS = 10
history = model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_val, y_val),
    epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
)

Train on 161724 samples, validate on 53907 samples
Epoch 1/10
  2384/161724 [..............................] - ETA: 19:37:52 - loss: 1.9836 - sparse_categorical_accuracy: 0.7466

KeyboardInterrupt: 

In [None]:
# Reverse lookup: list position = word index, value = word.
# Index 0 is the padding value and has no word, so it gets a placeholder
# string; every entry must be a str because the list feeds ' '.join().
PAD_TOKEN = '<PAD>'
text_index2word = [PAD_TOKEN] * (len(text_word2index) + 1)
for word, index in text_word2index.items():
    text_index2word[index] = word
# 'UNK' is the tokenizer's oov_token and is expected to be in the word index
# already. Writing it into the last slot would clobber the entry of the word
# holding the highest index, so no extra assignment is made.

In [None]:
def print_sent(sent):
    """Print the words for a sequence of word indices, skipping padding.

    `sent` may be any array-like of integer indices. It is flattened first so
    that targets with a trailing axis of length 1 are accepted, and index 0
    (the padding value) is left out of the printed sentence.
    """
    word_ids = [int(w) for w in np.ravel(sent)]
    print(' '.join(str(text_index2word[w]) for w in word_ids if w != 0))

# Predict for the first BATCH_SIZE test samples. The model has one output, so
# `predict` returns a single (samples, timesteps, vocabulary) array; all of it
# is kept so that every sample has its own predicted sequence.
p = model.predict(np.array(x_test[:BATCH_SIZE]))
p = np.argmax(p, axis=-1)  # most probable word index at each timestep
# zip() stops at the shortest input, i.e. after the predicted samples.
for in_sent, pred_sent, true_sent in zip(x_test, p, y_test):
    print('--------------------------------------------------------')
    print_sent(in_sent)
    print_sent(pred_sent)
    print_sent(true_sent)