In [1]:
import numpy as np
import pandas as pd
import random
import string
import sys
import io
import os

from keras.models import Sequential
from keras.layers import Dense, LSTM, Activation, Dropout
from keras.optimizers import RMSprop
from keras.callbacks import EarlyStopping, ModelCheckpoint

Using TensorFlow backend.


In [2]:
# Run configuration shared by the cells below.
N_GPU = 1  # passed to build_model and written into the checkpoint file name
LAYER_COUNT = 4  # number of stacked LSTM layers built by build_model
DROPOUT = 0.2  # rate of the Dropout layer added after each LSTM layer
HIDDEN_LAYERS_DIM = 512  # units in each LSTM layer
BATCH_SIZE = 512  # samples per batch produced by batch_generator
MAXLEN = 60  # number of characters in each input window
EPOCHS = 1  # number of epochs passed to fit_generator

In [4]:
# Harry Potter text files from: http://www.glozman.com/textpages.html
work_dir = 'data/'
text = ''
# Read harry_potter_1.txt through harry_potter_6.txt in order and append each
# one to `text`, preceded by a single space (so `text` starts with a space).
for index in range(1, 7):
    path = os.path.join(work_dir, "harry_potter_%d.txt" % index)
    with io.open(path, mode="r", encoding="latin-1") as fd:
        text = text + ' ' + fd.read()

In [5]:
# Vocabulary: every character of string.printable except vertical tab
# ('\x0b') and form feed ('\x0c'), in string.printable order.
chars = [ch for ch in string.printable if ch not in ('\x0b', '\x0c')]
# Character -> position in `chars`; this position is the one-hot index.
chars_to_ix = dict(zip(chars, range(len(chars))))

In [6]:
# Corpus length in characters and number of distinct symbols in the vocabulary.
text_size = len(text)
vocab_size = len(chars)
print('There are %d total characters in your data.' % text_size)

There are 5119079 total characters in your data.


In [7]:
# Normalise characters that are not part of the model vocabulary.
#
# The files were decoded as latin-1, so Windows-1252 punctuation bytes show up
# as C1 control characters (\x91-\x96). They are mapped to their closest ASCII
# equivalent or dropped. The right double quote (\x94) is handled alongside
# the left one (\x93): if it were left in the text, the chars_to_ix lookup in
# batch_generator would raise KeyError.
#
# Replacements are applied in order; no replacement text is itself a later
# target, so the order does not affect the result.
for old, new in (
    ('\x96', '-'),   # en dash
    ('\x93', '"'),   # left double quote
    ('\x94', '"'),   # right double quote
    ('\x91', '\''),  # left single quote
    ('\x92', '\''),  # right single quote / apostrophe
    ('\x90', ''),
    ('\x95', ''),    # bullet
    ('\xad', '-'),   # soft hyphen
    ('«', ''),
    ('»', ''),
    ('¦', ''),
    ('ü', ''),
    ('\\', ''),
):
    text = text.replace(old, new)

In [8]:
# Hold out the final 10% of the text for validation.
# The split point is taken from the current length of `text` rather than from
# `text_size`, which was measured before the cleaning cell removed characters
# and therefore no longer matches the string being sliced.
split_ix = int(len(text) * .9)
train = text[:split_ix]
test = text[split_ix:]

In [9]:
# Character counts of the two splits; used to derive the batch counts.
train_size, test_size = len(train), len(test)

#### Model

In [10]:
def batch_generator(text, count):
    """Yield one-hot encoded (X, y) batches for training, forever.

    Sample ``k`` of the stream is the MAXLEN-character window
    ``text[k:k + MAXLEN]`` and its target is the next character,
    ``text[k + MAXLEN]``. Batch ``b`` holds samples ``b * BATCH_SIZE`` through
    ``(b + 1) * BATCH_SIZE - 1``. After ``count`` batches the generator starts
    over from the beginning of ``text``.

    Args:
        text: string whose characters are keys of ``chars_to_ix``. Only the
            first ``count * BATCH_SIZE + MAXLEN`` characters are read.
        count: number of distinct batches per pass; must be positive.

    Yields:
        (X, y) where X has shape (BATCH_SIZE, MAXLEN, vocab_size) and y has
        shape (BATCH_SIZE, vocab_size); both are one-hot float arrays.

    Raises (on the first ``next()`` unless noted):
        ValueError: if ``count`` is not positive. Without this check the
            outer loop would spin forever without yielding anything.
        KeyError: if a character in the part of ``text`` that is read is
            missing from ``chars_to_ix``.
        IndexError: when a batch reaches past the end of ``text``.
    """
    if count <= 0:
        raise ValueError("count must be a positive number of batches, got %r" % (count,))

    # Encode the characters once, instead of doing a dict lookup per character
    # for every batch. Only the prefix that the batches can reach is encoded.
    used_chars = count * BATCH_SIZE + MAXLEN
    encoded = np.fromiter(
        (chars_to_ix[ch] for ch in text[:used_chars]), dtype=np.intp
    )

    rows = np.arange(BATCH_SIZE)   # sample index within a batch
    steps = np.arange(MAXLEN)      # position within a window

    while True:  # Keras expects the generator to loop indefinitely
        for batch_ix in range(count):
            starts = BATCH_SIZE * batch_ix + rows

            # window_ix[i, s] is the vocabulary index of text[starts[i] + s]
            window_ix = encoded[starts[:, None] + steps]
            # target_ix[i] is the vocabulary index of the character that
            # follows window i
            target_ix = encoded[starts + MAXLEN]

            X = np.zeros((BATCH_SIZE, MAXLEN, vocab_size))
            y = np.zeros((BATCH_SIZE, vocab_size))
            X[rows[:, None], steps, window_ix] = 1
            y[rows, target_ix] = 1

            yield X, y

In [11]:
def build_model(gpu_count=N_GPU):
    """Assemble and compile the stacked-LSTM next-character classifier.

    The network consists of LAYER_COUNT LSTM layers with HIDDEN_LAYERS_DIM
    units, each followed by Dropout(DROPOUT), then a Dense(vocab_size) layer
    with a softmax activation. It is compiled with categorical cross-entropy
    loss and the 'adam' optimizer.

    Args:
        gpu_count: accepted for the caller's convenience; the body does not
            use it.

    Returns:
        The compiled Sequential model.
    """
    model = Sequential()

    last_layer_ix = LAYER_COUNT - 1
    for layer_ix in range(LAYER_COUNT):
        # Every LSTM except the last one emits its full output sequence so
        # that the following LSTM receives a sequence as input.
        keep_sequences = layer_ix != last_layer_ix
        recurrent = LSTM(
            HIDDEN_LAYERS_DIM,
            return_sequences=keep_sequences,
            input_shape=(MAXLEN, vocab_size),
        )
        model.add(recurrent)
        model.add(Dropout(DROPOUT))

    # Project onto the vocabulary and normalise to a probability distribution.
    model.add(Dense(vocab_size))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

#### Training

In [14]:
# Build the network and initialise it from a weights file on disk. The file
# name follows the same pattern as the checkpoint file names produced by the
# ModelCheckpoint callback in this notebook.
weights_path = "1-gpu_BS-512_4-512_dp0.20_100S_epoch01-loss1.6339-val-loss1.2746_weights"
training_model = build_model(gpu_count=N_GPU)
training_model.load_weights(weights_path)

In [15]:
# Number of whole batches available in each split. MAXLEN characters are
# reserved so that the last window and its target character stay in range.
train_batch_count, val_batch_count = [
    (n_chars - MAXLEN) // BATCH_SIZE for n_chars in (train_size, test_size)
]


In [16]:
# Checkpoint callback: saves the model weights (weights only) during training.
# ModelCheckpoint and EarlyStopping are already imported in the first cell, so
# they are not imported again here.
#
# The file name is built in two stages: the %-fields are filled in now with
# the run configuration, while the {epoch}, {loss} and {val_loss} fields stay
# in the string for ModelCheckpoint to fill in when it writes a file.
filepath = "./%d-gpu_BS-%d_%d-%s_dp%.2f_%dS_epoch{epoch:02d}-loss{loss:.4f}-val-loss{val_loss:.4f}_weights" % (
    N_GPU,
    BATCH_SIZE,
    LAYER_COUNT,
    HIDDEN_LAYERS_DIM,
    DROPOUT,
    MAXLEN
)
checkpoint = ModelCheckpoint(
    filepath,
    save_weights_only=True
)

# Early stopping: monitors validation loss with a patience of 5 epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=5)

callbacks_list = [checkpoint, early_stopping]

In [39]:
# Train from the generator streams: one epoch consumes train_batch_count
# batches, and validation consumes val_batch_count batches.
train_batches = batch_generator(train, count=train_batch_count)
val_batches = batch_generator(test, count=val_batch_count)

history = training_model.fit_generator(
    train_batches,
    steps_per_epoch=train_batch_count,
    epochs=EPOCHS,
    callbacks=callbacks_list,
    validation_data=val_batches,
    validation_steps=val_batch_count,
    max_queue_size=1,  # no more than one queued batch in RAM
    initial_epoch=0,
)

Epoch 1/1


In [None]:
# Print the summary of the model built by build_model.
training_model.summary()