In [1]:
import numpy as np

from keras.models import Sequential
from keras import backend as K
from keras.layers import Dense, Lambda
from keras.layers import Embedding
from keras.layers import LSTM
from keras.utils import to_categorical
from keras import regularizers

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Load Dataset

In [2]:
import glob
from nltk.tokenize import word_tokenize

NEWLINE_TOKEN = ' __newline__ '
UNK_TOKEN = '__unk__'

# Read and collect text
train_text = ""
dev_text = ""
test_text = ""
texts = [train_text, dev_text, test_text]

# Try the TED data set?
for text_idx, file in enumerate(['./data/shakespeare/train.txt', './data/shakespeare/val.txt', './data/ted/test.txt']):
    with open(file, 'r') as fp:
        texts[text_idx] += NEWLINE_TOKEN.join([l.strip() for l in fp.readlines()]) + NEWLINE_TOKEN

train_text, dev_text, test_text = texts

print("Total characters:")
print("Train: %d"%(len(train_text)))
print("Dev: %d"%(len(dev_text)))
print("Test: %d"%(len(test_text)))
print(train_text[:100])

Total characters:
Train: 431408
Dev: 124528
Test: 128419
﻿ __newline__ Project Gutenberg’s The Complete Works of William Shakespeare, by William __newline__ 


# Preprocess text
We usually preprocess the text to remove casing information, separate out punctuations etc to make our data cleaner

In [3]:
tokens = [None, None, None]
for text_idx in range(len(texts)):
    tokens[text_idx] = word_tokenize(texts[text_idx].lower())

train_tokens, dev_tokens, test_tokens = tokens

print("Total tokens:")
print("Train: %d"%(len(train_tokens)))
print("Dev: %d"%(len(dev_tokens)))
print("Test: %d"%(len(test_tokens)))

Total tokens:
Train: 81368
Dev: 23839
Test: 26162


# Build vocabulary

In [4]:
VOCAB_SIZE = 5000
full_vocab = dict()
for token in train_tokens:
    full_vocab[token] = full_vocab.get(token, 0) + 1

# Sort vocabulary by occurence
sorted_vocab = sorted(full_vocab.keys(), key=lambda word: -full_vocab[word])

# Print some samples
print("Vocabulary size: %d"%(len(sorted_vocab)))
print("Most frequent tokens")
for i in range(10):
    print("\t%s: %d"%(sorted_vocab[i], full_vocab[sorted_vocab[i]]))
print("Least frequent tokens")
for i in range(1,11):
    print("\t%s: %d"%(sorted_vocab[-i], full_vocab[sorted_vocab[-i]]))

# Create final vocab
word2idx = {w: idx for idx, w in enumerate(sorted_vocab[:VOCAB_SIZE])}
idx2word = {idx: w for idx, w in enumerate(sorted_vocab[:VOCAB_SIZE])}

word2idx[UNK_TOKEN] = VOCAB_SIZE # The last element is the UNK token
idx2word[VOCAB_SIZE] = UNK_TOKEN
VOCAB_SIZE = VOCAB_SIZE + 1

Vocabulary size: 6647
Most frequent tokens
	__newline__: 10000
	,: 5218
	.: 4361
	the: 1658
	and: 1456
	i: 1414
	to: 1254
	’: 1186
	of: 1111
	my: 906
Least frequent tokens
	impossible-: 1
	descried: 1
	approaching: 1
	full-mann: 1
	sixty: 1
	security: 1
	assurance: 1
	forgo: 1
	renowned: 1
	unexecuted: 1


# Filter text based on vocabulary
We will now have to replace words we do not have in the vocabulary with a special token, `__unk__` in this case

In [5]:
for tokens_idx in range(len(tokens)):
    tokens[tokens_idx] = [t if t in word2idx else UNK_TOKEN for t in tokens[tokens_idx]]

train_tokens, dev_tokens, test_tokens = tokens
print("Number of tokens filtered out as unknown:")
print("Train: %d/%d"%(len([1 for t in train_tokens if t == UNK_TOKEN]), len(train_tokens)))
print("Dev: %d/%d"%(len([1 for t in dev_tokens if t == UNK_TOKEN]), len(dev_tokens)))
print("Test: %d/%d"%(len([1 for t in test_tokens if t == UNK_TOKEN]), len(test_tokens)))

Number of tokens filtered out as unknown:
Train: 1647/81368
Dev: 1938/23839
Test: 5162/26162


# Prepare data in tensor form
Our keras models finally take tensors as input and labels, so we need to modify our data to fit this form

In [6]:
X_train = np.array([word2idx[t] for t in train_tokens]) # Make lists of indexes corresponding to words in each sentence and create an array of all sentences
X_dev = np.array([word2idx[t] for t in dev_tokens])
X_test = np.array([word2idx[t] for t in test_tokens])

Our labels in this exercise are just the next words. Hence, for

>   `X_train = ['hello', 'how', 'are', 'you', '?']`

we will have:

>    `y_train = ['how, 'are', you', '?']`

Which is just `X_train[1:]`
We will also remove the last element of `X_train`, since we do not have any label for it

# Helper functions

In [7]:
def build_bag_of_words(X, context_size=1, vocab_size=VOCAB_SIZE):
    num_examples = X.shape[0]-context_size  # There's no next word for the last word!
    X_bow = np.zeros((num_examples, vocab_size)) # Initialize the vector
    
    y_bow = np.zeros((num_examples, vocab_size))
    
    for idx in range(num_examples):
        for context_idx in range(context_size):
            X_bow[idx, X[idx+context_idx]] = 1
        y_bow[idx, X[idx + context_size]] = 1
    
    return X_bow, y_bow
            
def get_next_predicted_word(model, input_words, context_size=1):
    if not isinstance(input_words, list):
        input_words = [input_words]
    input_words = input_words + ["__unk__"]
    input_array = np.array([word2idx[w] for w in input_words])
    input_bow, _ = build_bag_of_words(input_array, context_size=context_size)
    scores = model.predict(input_bow)
    output_word = idx2word[np.argmax(scores)]
    
    return output_word

def get_sentence(model, start_words, context_size=1):
    if not isinstance(start_words, list):
        start_words = [start_words]

    output = [] + start_words
    while output[-1] != '__newline__' and len(output) < 100:
        prev_word = get_next_predicted_word(model, output[-context_size:], context_size=context_size)
        output.append(prev_word)
    return " ".join(output)

# Define model

In [8]:
X_train_unigram, y_train_unigram = build_bag_of_words(X_train, context_size=1)
X_dev_unigram, y_dev_unigram = build_bag_of_words(X_dev, context_size=1)
X_test_unigram, y_test_unigram = build_bag_of_words(X_test, context_size=1)

In [9]:
print(X_train_unigram.shape)
print(X_dev_unigram.shape)
print(X_test_unigram.shape)

(81367, 5001)
(23838, 5001)
(26161, 5001)


In [10]:
model = Sequential()
model.add(Dense(100, input_shape=(VOCAB_SIZE,)))
model.add(Dense(100, activation='relu'))
model.add(Dense(100, activation='relu'))
model.add(Dense(VOCAB_SIZE, activation='softmax')) # Equivalent to adding a softmax layer

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 100)               500200    
_________________________________________________________________
dense_2 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_3 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_4 (Dense)              (None, 5001)              505101    
Total params: 1,025,501
Trainable params: 1,025,501
Non-trainable params: 0
_________________________________________________________________


In [11]:
NUM_EPOCHS = 50
for epoch in range(NUM_EPOCHS):
    model.fit(X_train_unigram, y_train_unigram, batch_size=128, epochs=epoch+1, initial_epoch=epoch, validation_data=(X_dev_unigram, y_dev_unigram))
    print(get_sentence(model, ['i']))


Train on 81367 samples, validate on 23838 samples
Epoch 1/1
i , __newline__
Train on 81367 samples, validate on 23838 samples
Epoch 2/2
13312/81367 [===>..........................] - ETA: 34s - loss: 5.1738 - acc: 0.1828

KeyboardInterrupt: 

In [12]:
print(get_sentence(model, ['think']))
print(get_sentence(model, ['well']))
print(get_sentence(model, ['i']))
print(get_sentence(model, ['who']))

think __newline__
well . __newline__
i __unk__ , __newline__
who , __newline__


#### Add context
The above data uses only _one_ previous word as context, but we can change our data to include more words

In [13]:
X_train_bigram, y_train_bigram = build_bag_of_words(X_train, context_size=2)
X_dev_bigram, y_dev_bigram = build_bag_of_words(X_dev, context_size=2)
X_test_bigram, y_test_bigram = build_bag_of_words(X_test, context_size=2)

In [14]:
print(X_train_bigram.shape)
print(y_train_bigram.shape)

(81366, 5001)
(81366, 5001)


In [15]:
model_bigram = Sequential()
model_bigram.add(Dense(100, input_shape=(VOCAB_SIZE,)))
model_bigram.add(Dense(100, activation='relu'))
model_bigram.add(Dense(100, activation='relu'))
model_bigram.add(Dense(VOCAB_SIZE, activation='softmax'))

model_bigram.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
model_bigram.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 100)               500200    
_________________________________________________________________
dense_6 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_7 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_8 (Dense)              (None, 5001)              505101    
Total params: 1,025,501
Trainable params: 1,025,501
Non-trainable params: 0
_________________________________________________________________


In [16]:
for epoch in range(NUM_EPOCHS):
    model_bigram.fit(X_train_bigram, y_train_bigram, batch_size=128, epochs=epoch+1, initial_epoch=epoch, validation_data=(X_dev_bigram, y_dev_bigram))
    print(get_sentence(model_bigram, ['i', 'have'], context_size=2))

Train on 81366 samples, validate on 23837 samples
Epoch 1/1

KeyboardInterrupt: 

In [27]:
print(get_sentence(model_bigram, ['think', 'of'], context_size=2))
print(get_sentence(model_bigram, ['well', 'we'], context_size=2))
print(get_sentence(model_bigram, ['i', 'have'], context_size=2))
print(get_sentence(model_bigram, ['who', 'will'], context_size=2))

think of doting 5 mourn mourn officer officer kind-hearted officer officer kind-hearted officer officer kind-hearted officer officer kind-hearted officer officer kind-hearted officer officer kind-hearted officer officer kind-hearted officer officer kind-hearted officer officer kind-hearted officer officer kind-hearted officer officer kind-hearted officer officer kind-hearted officer officer kind-hearted officer officer kind-hearted officer officer kind-hearted officer officer kind-hearted officer officer kind-hearted officer officer kind-hearted officer officer kind-hearted officer officer kind-hearted officer officer kind-hearted officer officer kind-hearted officer officer kind-hearted officer officer kind-hearted officer officer kind-hearted officer officer kind-hearted officer officer kind-hearted officer officer kind-hearted officer officer kind-hearted officer officer kind-hearted officer officer kind-hearted officer
well we perish require officer perish officer officer kind-hear

# Trigram model

In [35]:
X_train_trigram, y_train_trigram = build_bag_of_words(X_train, context_size=3)
X_dev_trigram, y_dev_trigram = build_bag_of_words(X_dev, context_size=3)
X_test_trigram, y_test_trigram = build_bag_of_words(X_test, context_size=3)

In [36]:
print(X_train_trigram.shape)
print(y_train_trigram.shape)

(234077, 5001)
(234077, 5001)


In [37]:
model_trigram = Sequential()
model_trigram.add(Dense(100, input_shape=(VOCAB_SIZE,)))
model_trigram.add(Dense(100, activation='relu'))
model_trigram.add(Dense(100, activation='relu'))
model_trigram.add(Dense(VOCAB_SIZE, activation='softmax'))

model_trigram.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
model_trigram.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_29 (Dense)             (None, 100)               500200    
_________________________________________________________________
dense_30 (Dense)             (None, 100)               10100     
_________________________________________________________________
dense_31 (Dense)             (None, 100)               10100     
_________________________________________________________________
dense_32 (Dense)             (None, 5001)              505101    
Total params: 1,025,501
Trainable params: 1,025,501
Non-trainable params: 0
_________________________________________________________________


In [38]:
for epoch in range(NUM_EPOCHS):
    model_trigram.fit(X_train_trigram, y_train_trigram, batch_size=128, epochs=epoch+1, initial_epoch=epoch, validation_data=(X_dev_trigram, y_dev_trigram))
    print(get_sentence(model_trigram, ['i', 'have','to'], context_size=3))

Train on 234077 samples, validate on 60283 samples
Epoch 1/1
i have to be the __unk__ . __newline__
Train on 234077 samples, validate on 60283 samples
Epoch 2/2
i have to be a __unk__ __unk__ . __newline__
Train on 234077 samples, validate on 60283 samples
Epoch 3/3
i have to __unk__ the __unk__ of __unk__ , and the __unk__ __unk__ of , and course i & apos ; s a __unk__ , __unk__ and the __unk__ __unk__ of , and course i & apos ; s a __unk__ , __unk__ and the __unk__ __unk__ of , and course i & apos ; s a __unk__ , __unk__ and the __unk__ __unk__ of , and course i & apos ; s a __unk__ , __unk__ and the __unk__ __unk__ of , and course i & apos ; s a __unk__ , __unk__ and the __unk__ __unk__ of ,
Train on 234077 samples, validate on 60283 samples
Epoch 4/4
i have to be a __unk__ __unk__ . __newline__
Train on 234077 samples, validate on 60283 samples
Epoch 5/5
i have to __unk__ __unk__ the , __unk__ __unk__ and __unk__ __unk__ __unk__ . __newline__
Train on 234077 samples, validate on 60

i have to admit that we exist __unk__ from , __unk__ __unk__ __unk__ , __unk__ __unk__ __unk__ , __unk__ __unk__ __unk__ , __unk__ __unk__ __unk__ , __unk__ __unk__ __unk__ , __unk__ __unk__ __unk__ , __unk__ __unk__ __unk__ , __unk__ __unk__ __unk__ , __unk__ __unk__ __unk__ , __unk__ __unk__ __unk__ , __unk__ __unk__ __unk__ , __unk__ __unk__ __unk__ , __unk__ __unk__ __unk__ , __unk__ __unk__ __unk__ , __unk__ __unk__ __unk__ , __unk__ __unk__ __unk__ , __unk__ __unk__ __unk__ , __unk__ __unk__ __unk__ , __unk__ __unk__ __unk__ , __unk__ __unk__ __unk__ , __unk__ __unk__ __unk__ , __unk__ __unk__ __unk__ , __unk__ __unk__
