In [None]:
import io
import os
import random
import sys

import numpy as np
import tensorflow as tf
from keras.initializers import Constant
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Sequential
from keras.models import Model
from keras.optimizers import RMSprop
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.utils.data_utils import get_file

In [None]:
# From https://gist.github.com/jovianlin/0a6b7c58cde7a502a68914ba001c77bf
def load_glove_embeddings(fp, embedding_dim, include_empty_char=False):
    """
    Loads pre-trained word embeddings (GloVe embeddings)

    The file is read as UTF-8 text, one entry per line: a token followed by
    whitespace-separated coefficients. Every field is lower-cased. If several
    lines collapse to the same token, the token keeps the index of its first
    occurrence and the coefficients of its last occurrence. Blank lines are
    skipped; lines whose coefficients cannot be parsed as floats are reported
    on stdout and skipped.

        Inputs: - fp: filepath of pre-trained glove embeddings
                - embedding_dim: dimension of each vector embedding
                - include_empty_char: whether to include empty char in vocab
        Outputs:
                - word2coefs: Dictionary. Word to embedding vector
                - word2index: Dictionary. Word to word-index
                - embedding_matrix: Embedding matrix for Keras Embedding layer,
                  shape (len(word2index), embedding_dim). Rows for '' and for
                  words with fewer than `embedding_dim` coefficients stay zero.
    """
    # First, build the "word2coefs" and "word2index"
    word2coefs = {}  # word to its corresponding coefficients
    word2index = {}  # word to word-index
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(fp, encoding='utf-8') as f:
        for line in f:
            data = [x.lower() for x in line.split()]
            if not data:
                continue  # blank line: nothing to parse
            word = data[0]
            try:
                coefs = np.asarray(data[1:embedding_dim + 1], dtype='float32')
            except ValueError as e:
                # Best effort: report the malformed line and keep loading.
                print('Exception occurred in `load_glove_embeddings`:', e)
                continue
            word2coefs[word] = coefs
            if word not in word2index:
                word2index[word] = len(word2index)

    if include_empty_char:
        word2index[''] = len(word2index)

    # Second, build the "embedding_matrix"
    # One row per index handed out above, so every index is a valid row.
    # Rows that never receive a vector remain all-zeros.
    embedding_matrix = np.zeros((len(word2index), embedding_dim))
    for word, idx in word2index.items():
        embedding_vec = word2coefs.get(word)
        if embedding_vec is not None and embedding_vec.shape[0] == embedding_dim:
            embedding_matrix[idx] = embedding_vec
    return word2coefs, word2index, embedding_matrix

In [11]:
# Scratch check: sum of squares (i.e. the squared L2 norm) of a list of hard-coded magnitudes.
# NOTE(review): presumably the absolute components of one pre-trained embedding vector,
# used to gauge a typical vector norm -- confirm where these numbers came from.
0.14375**2 + 0.29442**2 + 0.078571**2 + 0.30209**2 + 0.47561**2 + 0.43339**2 + 0.11853**2 + 0.24294**2 + 0.15266**2 + 0.88948**2 + 0.63836**2 + 0.98421**2 + 0.2926**2 + 0.25954**2 + 0.28813**2 + 1.101**2 + 0.47256**2 + 0.11681**2 + 0.22856**2 + 0.43835**2 + 0.34791**2 + 0.82372**2 + 0.2971**2 + 0.75179**2 + 0.005743**2 + 0.36858**2 + 0.47466**2 + 0.19271**2 + 0.56166**2 + 0.35944**2 + 0.39476**2 + 0.26048**2 + 0.32528**2 + 1.1848**2 + 0.3129**2 + 0.39972**2 + 0.51549**2 + 0.17256**2 + 0.376**2 + 0.11885**2 + 0.31198**2 + 0.20403**2 + 0.38001**2  + 0.28433**2 + 0.41899**2 + 0.0038793**2 + 3.819**2 + 0.16336**2 + 0.05984**2 + 0.34421**2 + 0.13537**2 + 0.43697**2 + 1.2651**2 + 1.7427**2 + 0.22468**2 + 1.8752**2 + 0.4882**2 + 0.29383**2 + 0.92739**2 + 0.31286**2 + 0.93202**2 + 1.2429**2 + 0.46347**2 + 0.78895**2 + 1.0129**2  + 0.95664**2 + 0.40652**2 + 0.33332**2 + 0.30782**2 + 0.30168**2 + 0.55523**2 + 0.87218**2 + 0.38666**2 + 0.23546**2 + 0.35067**2 + 0.27966**2 + 0.83493**2 + 0.27571**2 + 0.88204**2 + 1.1066**2 + 0.24833**2 + 0.55462**2 + 0.31548**2 + 0.013784**2 + 1.2792**2 + 0.10665**2 + 0.18128**2 + 0.42517**2 + 0.18244**2 + 0.14501**2 + 0.38981**2 + 0.22133**2 + 0.048625**2 + 0.4338**2 + 0.56485**2 + 0.36333**2 + 0.079428**2 + 0.93321**2 + 0.31841**2 + 0.24426**2

48.68970071862351

In [None]:
# --- Sizes and ratios used by the cells below ---
EMBEDDING_DIM = 100          # width of each word vector (the GloVe file below is the 100d one)
MAX_NUM_WORDS = 20000        # passed to the Tokenizer as `num_words`
MAX_SEQUENCE_LENGTH = 1000   # `maxlen` used when padding index sequences
VALIDATION_SPLIT = 0.2       # fraction of samples held out for validation

# --- Input locations, resolved relative to BASE_DIR ---
BASE_DIR = ''
TEXT_DATA_DIR = os.path.join(BASE_DIR, 'sents')
GLOVE_FILE = os.path.join(BASE_DIR, 'glove.6B/glove.6B.100d.txt')

In [None]:
# first, build index mapping words in the embeddings set
# to their embedding vector (plus word -> row index and the stacked matrix)

print('Indexing word vectors and preparing embedding matrix.')

glove_word2coefs, glove_word2index, glove_embedding_matrix = load_glove_embeddings(GLOVE_FILE, EMBEDDING_DIM)

# glove_word2index has one entry per distinct token that was loaded.
print('Found %s word vectors.' % len(glove_word2index))

In [None]:
def _read_sentence_pairs(fpath):
    """Return the (sentence, transformed sentence) pairs found in one file.

    A pair is a line whose stripped text starts with '<', together with the
    line that immediately follows it. The first two characters of each
    stripped line are dropped (assumed to be a marker plus a separator --
    TODO confirm against the data files). A '<' line that is the last line
    of the file has no partner and is ignored.
    """
    pairs = []
    with open(fpath, 'r') as f:
        # Iterating ends cleanly at EOF; readline() returns '' there, never
        # None, so a `while line is not None` loop would spin forever.
        lines = iter(f)
        for line in lines:
            sent = line.strip()
            if not sent.startswith('<'):
                continue
            partner = next(lines, None)
            if partner is None:
                break
            pairs.append((sent[2:], partner.strip()[2:]))
    return pairs


# second, prepare text samples
print('Processing text dataset')

texts = []  # list of sentences
trans_texts = []  # list of transformed sentences
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if not os.path.isdir(path):
        continue
    for fname in sorted(os.listdir(path)):
        # Only files whose names consist solely of digits are read.
        if not fname.isdigit():
            continue
        for sent, sent_trans in _read_sentence_pairs(os.path.join(path, fname)):
            texts.append(sent)
            trans_texts.append(sent_trans)
print('Found %s sentences.' % len(texts))

In [None]:
# Fit a single vocabulary over the original and the transformed sentences,
# then turn every sentence into a fixed-length row of word indices.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, lower=True, oov_token='UNK')
tokenizer.fit_on_texts(texts + trans_texts)

text_word2index = tokenizer.word_index  # word -> integer index
# Number of rows the embedding matrix will need.
num_words = 1 + min(MAX_NUM_WORDS, len(text_word2index))

in_sequences, out_sequences = (
    tokenizer.texts_to_sequences(corpus) for corpus in (texts, trans_texts)
)
in_data, out_data = (
    pad_sequences(seqs, maxlen=MAX_SEQUENCE_LENGTH)
    for seqs in (in_sequences, out_sequences)
)

print('Found %s unique tokens.' % len(text_word2index))

In [None]:
# store embeddings of words in the text based on their indices:
# row i of embedding_matrix holds the vector for the word with tokenizer index i.
# Rows that are never assigned stay all-zero.
OOV_SQUARED_NORM = 32  # squared L2 norm given to randomly initialised vectors

embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in text_word2index.items():
    if i > MAX_NUM_WORDS:
        continue
    glove_idx = glove_word2index.get(word)
    if glove_idx is not None:
        embedding_vector = glove_embedding_matrix[glove_idx]
    else:
        # Word has no GloVe vector: draw a random direction and rescale it
        # so that its squared L2 norm equals OOV_SQUARED_NORM.
        embedding_vector = np.random.normal(loc=0, scale=1, size=EMBEDDING_DIM)
        embedding_vector *= np.sqrt(OOV_SQUARED_NORM / np.sum(np.square(embedding_vector)))
    embedding_matrix[i] = embedding_vector

In [None]:
# split the data into a training set and a validation set
# Inputs are the padded original sentences, targets the padded transformed ones.
indices = np.arange(in_data.shape[0])
np.random.shuffle(indices)
# One permutation for both arrays keeps each input aligned with its target.
# Indexing in_data/out_data (rather than re-assigning in place) also makes
# this cell safe to re-run.
data = in_data[indices]
labels = out_data[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
# Slice from the front by a positive count: `data[:-0]` would be empty when
# the validation count rounds down to 0.
num_train_samples = data.shape[0] - num_validation_samples

x_train = data[:num_train_samples]
y_train = labels[:num_train_samples]
x_val = data[num_train_samples:]
y_val = labels[num_train_samples:]

In [None]:
# load pre-trained word embeddings into an Embedding layer
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)
maxlen = 50  # not used by the model below; kept in case later cells read it

# build the model: embedding -> single LSTM -> per-time-step softmax over the vocabulary
# NOTE(review): assumes x_train/y_train are integer word-index arrays of shape
# (samples, MAX_SEQUENCE_LENGTH), like the padded in_data/out_data -- confirm.
print('Build model...')
model = Sequential()
model.add(embedding_layer)
# The LSTM takes its input shape from the Embedding layer; return_sequences=True
# keeps one output per time step so predictions line up with the target sequence.
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))
# One class per embedding row (num_words), not per character.
model.add(Dense(num_words, activation='softmax'))

optimizer = RMSprop(lr=0.01)
# Sparse loss: targets are integer indices, not one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['acc'])

model.fit(x_train, y_train,
          batch_size=128,
          epochs=10,
          validation_data=(x_val, y_val))