In [1]:
import re
import pandas as pd
import gzip
import numpy as np
from collections import Counter

In [2]:
with open('data/shakespeare.txt', 'r') as f:
    text = f.read()
print(type(text), len(text))

<class 'str'> 5447744


In [3]:
# Parse docs

In [4]:
docs = re.split(r'\n\n', text)
print(len(docs))
docs = [d for d in docs if d.count('\n') > 1]
print(len(docs))

7469
3599


In [5]:
eod_token = 'eod'

def clean_docs(text, eod_token='eod'):
    text = re.sub(r'[\d`{}|&_<>%àûïêîëèä$*/æé#@]', '', text)
    text = '\n'.join([l.strip() for l in text.split('\n') if l.strip() != '']) + ' ' + eod_token
    return text.lower()

In [6]:
docs = [clean_docs(d) for d in docs]

In [7]:
token_boundary_pattern = r'([\s,!?:;.-])'

def tokenize_string(s):
    return [t for t in re.split(token_boundary_pattern, s) if t not in ['', ' ']]

def tokenize(docs):
    sequences = [tokenize_string(d) for d in docs]
    word_count = Counter([t for s in sequences for t in s])
    return sequences, word_count

In [8]:
sequences, word_count = tokenize(docs)
print(len(word_count))
print(word_count.most_common(100))

27971
[('\n', 107574), (',', 80860), ('.', 76515), ('the', 26808), ('and', 25621), ('i', 20144), ('to', 19287), ('of', 17809), (';', 17076), ('a', 14117), ('you', 13589), ('my', 12453), ('that', 11112), ('in', 10859), ('?', 10459), ('is', 9548), ('!', 8800), ('not', 8707), ('for', 8190), ('-', 7797), ('me', 7761), ('it', 7661), ('with', 7612), ('be', 7068), ('your', 6863), ('this', 6794), ('his', 6726), ('but', 6262), ('he', 6200), ('as', 5882), ('have', 5873), ('thou', 5474), ('so', 5256), ('him', 5132), ('will', 4963), ('what', 4452), ('by', 4354), ('thy', 4028), ('all', 3880), ('are', 3832), ('her', 3796), ('no', 3768), ('do', 3747), ('eod', 3599), ('shall', 3579), ('if', 3481), ('we', 3284), ('thee', 3180), ('or', 3054), ('our', 3054), ('on', 3028), ('lord', 3000), ('good', 2808), ('now', 2778), ('king', 2739), ('sir', 2685), ('from', 2617), ('come', 2498), ('at', 2451), ('they', 2386), ('which', 2315), ('would', 2288), ('more', 2286), ('well', 2228), ('was', 2227), ('o', 2227), ('

In [9]:
def create_vocab(word_count, vocab_size):
    padding_token = '#'
    word_inverted_index = [padding_token] + sorted([w[0] for w in word_count.most_common(vocab_size)])
    word_index = {w: i for w, i in zip(word_inverted_index, range(len(word_inverted_index)))}    
    return word_index, word_inverted_index

In [10]:
word_index, word_inverted_index = create_vocab(word_count, len(word_count))
vocab_size = len(word_index)
print(vocab_size)

27972


In [11]:
eod_index = word_index[eod_token]
print(eod_index)

9149


In [12]:
def index_encode_sequence(sequence, word_index):
    return [word_index[w] for w in sequence if w in word_index]

def index_encode(sequences, word_index):
    return [index_encode_sequence(s, word_index) for s in sequences]

In [13]:
def prep_sequence(sequence, word_index, window_size, padding_index=0):
    # Encode
    sequence = index_encode_sequence(sequence, word_index)
    # Pad
    padding = (window_size - len(sequence))
    if padding >= 0:
        return [padding_index] * padding + sequence
    else:
        return sequence[-window_size:]

In [14]:
print(sequences[0])
print(len(sequences))
sequences = index_encode(sequences, word_index)
print(sequences[0])
print(len(sequences))

['from', 'fairest', 'creatures', 'we', 'desire', 'increase', ',', '\n', 'that', 'thereby', "beauty's", 'rose', 'might', 'never', 'die', ',', '\n', 'but', 'as', 'the', 'riper', 'should', 'by', 'time', 'decease', ',', '\n', 'his', 'tender', 'heir', 'might', 'bear', 'his', 'memory', ':', '\n', 'but', 'thou', 'contracted', 'to', 'thine', 'own', 'bright', 'eyes', ',', '\n', "feed'st", 'thy', "light's", 'flame', 'with', 'self', '-', 'substantial', 'fuel', ',', '\n', 'making', 'a', 'famine', 'where', 'abundance', 'lies', ',', '\n', 'thy', 'self', 'thy', 'foe', ',', 'to', 'thy', 'sweet', 'self', 'too', 'cruel', ':', '\n', 'thou', 'that', 'art', 'now', 'the', "world's", 'fresh', 'ornament', ',', '\n', 'and', 'only', 'herald', 'to', 'the', 'gaudy', 'spring', ',', '\n', 'within', 'thine', 'own', 'bud', 'buriest', 'thy', 'content', ',', '\n', 'and', 'tender', 'churl', "mak'st", 'waste', 'in', 'niggarding', ':', '\n', 'pity', 'the', 'world', ',', 'or', 'else', 'this', 'glutton', 'be', ',', '\n', 't

In [15]:
# Windows

In [16]:
window_size = 20
stride = 1
padding_size = window_size

In [17]:
def window(sequence, window_size, stride, padding_size, padding_index=0):
    padded = [padding_index] * padding_size + sequence
    windows = []
    start_pos = 0
    end_pos = window_size + 1
    while end_pos < len(padded):
        windows.append(padded[start_pos:end_pos])
        start_pos += stride
        end_pos += stride
    return windows

In [18]:
X = np.array([w for s in sequences for w in window(s, window_size, stride, padding_size)])
print(X.shape)

(1200133, 21)


In [19]:
y = X[:, -1]
print(y.shape)
X = X[:, :-1]
print(X.shape)

(1200133,)
(1200133, 20)


In [20]:
from keras.utils import to_categorical
y_cat = to_categorical(y)
print(y_cat.shape)

Using TensorFlow backend.


(1200133, 27972)


In [21]:
max_features = max([max(seq) for seq in sequences if len(seq) > 0]) + 1
max_features

27972

In [96]:
# Load embeddings

In [15]:
#glove_path = 'data/embeddings/glove.6B.50d.txt.gz'

In [18]:
#with gzip.open(glove_path, 'r') as fin:
#    line = fin.readline().decode('utf-8')

In [20]:
#def parse_line(line):
#    values = line.decode('utf-8').strip().split()
#    word = values[0]
#    vector = np.asarray(values[1:], dtype='float32')
#    return word, vector

In [23]:
#embeddings = {}
#word_index = {}
#word_inverted_index = []

#with gzip.open(glove_path, 'r') as fin:
#    for idx, line in enumerate(fin):
#        word, vector = parse_line(line) # parse a line
        
#        embeddings[word] = vector  # add word vector
#        word_index[word] = idx  # add idx
#        word_inverted_index.append(word)  # append word

In [38]:
#vocab_size = len(embeddings)
#emb_size = len(embeddings['good'])
#print(vocab_size, emb_size)

In [23]:
from keras import backend as K
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Dense, Dropout
from keras.layers import CuDNNLSTM, Embedding
from keras.optimizers import Adam, RMSprop
from keras.callbacks import ModelCheckpoint, EarlyStopping

In [31]:
#embedding_weights = np.zeros((vocab_size, emb_size))
#for word, index in word_index.items():
#    embedding_weights[index, :] = embeddings[word]

In [32]:
#emb_layer = Embedding(
#    input_dim=vocab_size,
#    output_dim=emb_size,
#    weights=[embedding_weights],
#    mask_zero=False,
#    trainable=False,
#    input_length=window_size)

In [24]:
early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
save_best = ModelCheckpoint('../models/sp_word_weights.{epoch:02d}-{val_loss:.2f}.hdf5', monitor='val_loss', verbose=1, save_best_only=True)

In [25]:
emb_size = 32

In [26]:
K.clear_session()
model = Sequential()
#model.add(emb_layer)
model.add(Embedding(input_dim=vocab_size, output_dim=emb_size, input_length=window_size))
model.add(CuDNNLSTM(128, return_sequences=False))
model.add(Dense(vocab_size, activation='softmax'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 32)            895104    
_________________________________________________________________
cu_dnnlstm_1 (CuDNNLSTM)     (None, 128)               82944     
_________________________________________________________________
dense_1 (Dense)              (None, 27972)             3608388   
Total params: 4,586,436
Trainable params: 4,586,436
Non-trainable params: 0
_________________________________________________________________


In [27]:
model.compile(
    loss='categorical_crossentropy', 
    optimizer=Adam(), 
    #optimizer=RMSprop(lr=0.01), 
    metrics=['accuracy'])

In [28]:
model.fit(X, y_cat, epochs=10, batch_size=1024, verbose=1, validation_split=0.1, callbacks=[early_stop, save_best])

Train on 1080119 samples, validate on 120014 samples
Epoch 1/10

Epoch 00001: val_loss improved from inf to 5.89440, saving model to ../models/sp_word_weights.01-5.89.hdf5
Epoch 2/10

Epoch 00002: val_loss improved from 5.89440 to 5.52980, saving model to ../models/sp_word_weights.02-5.53.hdf5
Epoch 3/10

Epoch 00003: val_loss improved from 5.52980 to 5.36157, saving model to ../models/sp_word_weights.03-5.36.hdf5
Epoch 4/10

Epoch 00004: val_loss improved from 5.36157 to 5.26443, saving model to ../models/sp_word_weights.04-5.26.hdf5
Epoch 5/10

Epoch 00005: val_loss improved from 5.26443 to 5.20474, saving model to ../models/sp_word_weights.05-5.20.hdf5
Epoch 6/10

Epoch 00006: val_loss improved from 5.20474 to 5.15170, saving model to ../models/sp_word_weights.06-5.15.hdf5
Epoch 7/10

Epoch 00007: val_loss improved from 5.15170 to 5.13066, saving model to ../models/sp_word_weights.07-5.13.hdf5
Epoch 8/10

Epoch 00008: val_loss improved from 5.13066 to 5.10151, saving model to ../mod

<keras.callbacks.History at 0x7f658804e240>

In [22]:
from keras.models import load_model
model = load_model('../models/sp_word_weights.09-5.09.hdf5')

In [35]:
def sample(p, diversity=1.0):
    p1 = np.asarray(p).astype('float64')
    p1 = np.log(p1) / diversity
    e_p1 = np.exp(p1)
    s = np.sum(e_p1)
    p1 = e_p1 / s
    return np.argmax(np.random.multinomial(1, p1, 1))

def sequence_to_string(sequence):
    seq = [s if s in ['.', ',', '!', '?', '-', ':', ';', '\n'] else ' ' + s for s in sequence]
    return ''.join(seq)

def gen_doc(seed='', max_len=400, diversity=1.0):
    # convert seed to padded sequence
    doc = tokenize_string(seed)
    pred = -1
    while pred != eod_index and len(doc) < max_len:
        pred = sample(model.predict(np.array([prep_sequence(doc, word_index, window_size)]))[0], diversity)
        doc.append(word_inverted_index[pred])
    return sequence_to_string(doc)

In [36]:
print(docs[0])

from fairest creatures we desire increase,
that thereby beauty's rose might never die,
but as the riper should by time decease,
his tender heir might bear his memory:
but thou contracted to thine own bright eyes,
feed'st thy light's flame with self-substantial fuel,
making a famine where abundance lies,
thy self thy foe, to thy sweet self too cruel:
thou that art now the world's fresh ornament,
and only herald to the gaudy spring,
within thine own bud buriest thy content,
and tender churl mak'st waste in niggarding:
pity the world, or else this glutton be,
to eat the world's due, by the grave and thee. eod


In [37]:
print(gen_doc('from fairest creatures', diversity=0.5))

 from fairest creatures, that i beseech you,
 which i am sure of your honour,
 and that you have been so long to die,
 and so i have not a man; and i am,
 that is a man of a man, and art you.
 exit. ]
 dromio of syracuse. i am full of the world;
 for i will go with me, and you shall be
 in a house, and so i do.
 lear. then, good madam,
 that i would have been, and that i have seen
 the body of your own enemies.
 othello. i am a man;
 that is as such a man as your highness.
 antipholus of ephesus. nay, i will tell you.
 othello. i have been such a man as you to do.
 buckingham. ay, sir, i will not speak to the best.
 petruchio. i will not think you to the man of his name.
 but the gods, my lord, would you have it
 to make a great behalf of my lord;
 and i have pass'd to th' law of the
 to the king of the duke of the king,
 for i am a man, and that well not.
 othello. i have not a man to be a man.
 ham. the king, and the king, my lord, you are a man;
 and that that he will not, as thou s