In [1]:
# Read dataset
# Create vocab of chars
# Read windows and encode

In [1]:
import re
import numpy as np
from collections import Counter

In [2]:
with open('data/shakespeare.txt', 'r') as f:
    text = f.read()
print(type(text), len(text))

<class 'str'> 5447744


In [3]:
# Parse sonnets

In [4]:
sonnets = re.split(r'\n\n', text)
print(len(sonnets))
sonnets = [s for s in sonnets if s.count('\n') > 1]
print(len(sonnets))

7469
3599


In [5]:
def clean_sonnet(text):
    text = re.sub(r'[\d`{}|&_<>]', '', text)
    text = '\n'.join([l.strip() for l in text.split('\n') if l.strip() != ''])
    return text.lower()

In [6]:
sonnets = [clean_sonnet(s) for s in sonnets]

In [7]:
# Create vocab

In [8]:
text = ''.join(sonnets)
print(len(text), text[:10])

4861240 from faire


In [9]:
c = Counter(text)
print(c.most_common(100), np.sum([t[1] for t in c.most_common(100)]))

[(' ', 809184), ('e', 434393), ('t', 322113), ('o', 309204), ('a', 281910), ('i', 248756), ('s', 243455), ('n', 235267), ('h', 233540), ('r', 230202), ('l', 166585), ('d', 145975), ('u', 126512), ('m', 109591), ('\n', 107574), ('y', 93433), ('w', 88127), ('c', 85483), (',', 80860), ('f', 79113), ('.', 76515), ('g', 66802), ('b', 60715), ('p', 57060), ('v', 36990), ('k', 34777), ("'", 30890), (';', 17076), ('?', 10459), ('!', 8800), ('-', 7797), ('x', 5099), ('j', 4627), ('q', 3480), ('[', 1953), (']', 1948), (':', 1774), ('z', 1562), ('(', 595), (')', 594), ('"', 450)] 4861240


In [10]:
809184/4861240

0.1664562951016613

In [11]:
chars = sorted(list(set(text)))
print(chars)

['\n', ' ', '!', '"', "'", '(', ')', ',', '-', '.', ':', ';', '?', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [12]:
char_to_int = {k: v for k, v in zip(chars, range(len(chars)))}
print(char_to_int)

{'\n': 0, ' ': 1, '!': 2, '"': 3, "'": 4, '(': 5, ')': 6, ',': 7, '-': 8, '.': 9, ':': 10, ';': 11, '?': 12, '[': 13, ']': 14, 'a': 15, 'b': 16, 'c': 17, 'd': 18, 'e': 19, 'f': 20, 'g': 21, 'h': 22, 'i': 23, 'j': 24, 'k': 25, 'l': 26, 'm': 27, 'n': 28, 'o': 29, 'p': 30, 'q': 31, 'r': 32, 's': 33, 't': 34, 'u': 35, 'v': 36, 'w': 37, 'x': 38, 'y': 39, 'z': 40}


In [13]:
int_to_char = {k: v for v, k in char_to_int.items()}
print(int_to_char)

{0: '\n', 1: ' ', 2: '!', 3: '"', 4: "'", 5: '(', 6: ')', 7: ',', 8: '-', 9: '.', 10: ':', 11: ';', 12: '?', 13: '[', 14: ']', 15: 'a', 16: 'b', 17: 'c', 18: 'd', 19: 'e', 20: 'f', 21: 'g', 22: 'h', 23: 'i', 24: 'j', 25: 'k', 26: 'l', 27: 'm', 28: 'n', 29: 'o', 30: 'p', 31: 'q', 32: 'r', 33: 's', 34: 't', 35: 'u', 36: 'v', 37: 'w', 38: 'x', 39: 'y', 40: 'z'}


In [14]:
# Add padding and end
padding_code = len(int_to_char)
int_to_char[padding_code] = ''
end_code = len(int_to_char)
int_to_char[end_code] = '\n\n'
print(padding_code, end_code)

41 42


In [15]:
# Encode

In [16]:
def encode(text, char_to_code):
    return [char_to_code[c] for c in text]

def decode(text_encoded, code_to_char):
    return ''.join([code_to_char[c] for c in text_encoded])

In [17]:
sonnets_encoded = [encode(s, char_to_int) for s in sonnets]
print(sonnets_encoded[0])

[20, 32, 29, 27, 1, 20, 15, 23, 32, 19, 33, 34, 1, 17, 32, 19, 15, 34, 35, 32, 19, 33, 1, 37, 19, 1, 18, 19, 33, 23, 32, 19, 1, 23, 28, 17, 32, 19, 15, 33, 19, 7, 0, 34, 22, 15, 34, 1, 34, 22, 19, 32, 19, 16, 39, 1, 16, 19, 15, 35, 34, 39, 4, 33, 1, 32, 29, 33, 19, 1, 27, 23, 21, 22, 34, 1, 28, 19, 36, 19, 32, 1, 18, 23, 19, 7, 0, 16, 35, 34, 1, 15, 33, 1, 34, 22, 19, 1, 32, 23, 30, 19, 32, 1, 33, 22, 29, 35, 26, 18, 1, 16, 39, 1, 34, 23, 27, 19, 1, 18, 19, 17, 19, 15, 33, 19, 7, 0, 22, 23, 33, 1, 34, 19, 28, 18, 19, 32, 1, 22, 19, 23, 32, 1, 27, 23, 21, 22, 34, 1, 16, 19, 15, 32, 1, 22, 23, 33, 1, 27, 19, 27, 29, 32, 39, 10, 0, 16, 35, 34, 1, 34, 22, 29, 35, 1, 17, 29, 28, 34, 32, 15, 17, 34, 19, 18, 1, 34, 29, 1, 34, 22, 23, 28, 19, 1, 29, 37, 28, 1, 16, 32, 23, 21, 22, 34, 1, 19, 39, 19, 33, 7, 0, 20, 19, 19, 18, 4, 33, 34, 1, 34, 22, 39, 1, 26, 23, 21, 22, 34, 4, 33, 1, 20, 26, 15, 27, 19, 1, 37, 23, 34, 22, 1, 33, 19, 26, 20, 8, 33, 35, 16, 33, 34, 15, 28, 34, 23, 15, 26, 1, 20, 3

In [18]:
# Extract windows with padding

In [19]:
stride = 3
window_size = 40
padding_size = window_size

In [20]:
def extract_windows(text_encoded, stride, window_size, padding_size, padding_code, end_code):
    windows = []
    # Pad
    text_padded = [padding_code] * padding_size + text_encoded
    # Extract windows
    start_pos = 0
    end_pos = window_size + 1
    while end_pos < len(text_padded):
        windows.append(text_padded[start_pos:end_pos])
        start_pos += stride
        end_pos += stride
    # Add ending
    windows.append(text_padded[-window_size:] + [end_code])
    return windows

In [21]:
X = np.array([w for s in sonnets_encoded for w in extract_windows(s, stride, window_size, padding_size, padding_code, end_code)])
print(X.shape)

(1624100, 41)


In [22]:
y = X[:, -1]
print(y.shape)

(1624100,)


In [23]:
X = X[:, :-1]
print(X.shape)

(1624100, 40)


In [24]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import LSTM, CuDNNLSTM, Embedding
from keras.utils import to_categorical
from keras.optimizers import Adam, RMSprop
from keras.callbacks import ModelCheckpoint, EarlyStopping

Using TensorFlow backend.


In [25]:
n_samples = 500000
num_classes = len(int_to_char)
print(num_classes)

43


In [26]:
y_cat = to_categorical(y[:n_samples])
print(y_cat.shape)

(500000, 43)


In [27]:
X = X[:n_samples, :]
print(X.shape)

(500000, 40)


In [28]:
early_stop = EarlyStopping(monitor='val_loss', patience=1, verbose=1)
save_best = ModelCheckpoint('weights.{epoch:02d}-{val_loss:.2f}.hdf5', monitor='val_loss', verbose=1, save_best_only=True)

In [29]:
emb_size = 10
model = Sequential()
model.add(Embedding(input_dim=num_classes, output_dim=emb_size, input_length=window_size))
model.add(CuDNNLSTM(128, return_sequences=False))
#model.add(LSTM(128, return_sequences=False))
#model.add(Dense(64, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 40, 10)            430       
_________________________________________________________________
cu_dnnlstm_1 (CuDNNLSTM)     (None, 128)               71680     
_________________________________________________________________
dense_1 (Dense)              (None, 43)                5547      
Total params: 77,657
Trainable params: 77,657
Non-trainable params: 0
_________________________________________________________________


In [30]:
model.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=0.01), metrics=['accuracy'])

In [31]:
model.fit(X, y_cat, epochs=10, batch_size=512, verbose=1, validation_split=0.1, callbacks=[early_stop, save_best])

Train on 450000 samples, validate on 50000 samples
Epoch 1/10

Epoch 00001: val_loss improved from inf to 1.84427, saving model to weights.01-1.84.hdf5
Epoch 2/10

Epoch 00002: val_loss improved from 1.84427 to 1.75335, saving model to weights.02-1.75.hdf5
Epoch 3/10

Epoch 00003: val_loss improved from 1.75335 to 1.71728, saving model to weights.03-1.72.hdf5
Epoch 4/10

Epoch 00004: val_loss improved from 1.71728 to 1.70685, saving model to weights.04-1.71.hdf5
Epoch 5/10

Epoch 00005: val_loss improved from 1.70685 to 1.69987, saving model to weights.05-1.70.hdf5
Epoch 6/10

Epoch 00006: val_loss improved from 1.69987 to 1.69431, saving model to weights.06-1.69.hdf5
Epoch 7/10

Epoch 00007: val_loss improved from 1.69431 to 1.68921, saving model to weights.07-1.69.hdf5
Epoch 8/10

Epoch 00008: val_loss did not improve from 1.68921
Epoch 00008: early stopping


<keras.callbacks.History at 0x7f71ac6ae208>

In [33]:
def gen_sonnet(seed='', max_len=200):
    sonnet = seed
    if not seed:
        seed = np.array([[padding_code] * window_size])
    elif len(seed) < window_size:
        seed = np.array([[padding_code] * (window_size - len(seed)) + encode(seed, char_to_int)])
    pred = -1
    while pred != end_code and len(sonnet) < max_len:
        pred = np.argmax(model.predict(seed))
        sonnet += int_to_char[pred]
    return sonnet

In [36]:
print(gen_sonnet('fair y'))

fair yoooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo
