### 1. Environment setup

In [1]:
from os import listdir
import pandas as pd
from os.path import isfile, join
from keras.layers import LSTM
import numpy as np 
from keras.models import Sequential
from keras.layers import LSTM, Dropout, Dense, Activation
from keras import optimizers
from keras import callbacks
import random 
import sys
from tensorflow.python.client import device_lib

Using TensorFlow backend.


### 2. Global training settings

In [3]:
N_GPU = 1 # you can experiment with more GPUs, it gets interesting with a high SEQUENCE_LEN
SEQUENCE_LEN = 60  # number of characters in each input sequence fed to the network
BATCH_SIZE = 512  # number of sequences per generated batch
EPOCHS = 20  # passed as `epochs` to fit_generator
HIDDEN_LAYERS_DIM = 512  # units in each LSTM layer
LAYER_COUNT = 4  # number of stacked LSTM layers
DROPOUT = 0.2  # rate of the Dropout layer added after each LSTM layer

### 3. Data Cleanup

In [4]:
# Collect the path of every regular file directly under data/, skipping the
# macOS ".DS_Store" metadata file.
# NOTE: listdir() returns entries in arbitrary order, and the LAST entry of
# this list is later held out as the validation file, so the train/validation
# split can differ between machines.
files = []
for entry in listdir('data/'):
    candidate = join('data/', entry)
    if entry != ".DS_Store" and isfile(candidate):
        files.append(candidate)

In [5]:
# Display the collected file paths (all but the last are used for training).
files

['data/Home \xe2\x80\x93 NXIVM.html',
 'data/Joy of Satan.htm',
 "data/Heaven's Gate - How and When It May Be Entered.htm",
 'data/The Twelve Tribes _ We are the commonwealth of Israel... we have a wonderful story to tell.htm',
 'data/Intelligent Design for Atheists - The Raelian Movement.htm',
 'data/OSHO | Meditation - Mindfulness and the Science of the Inner.html',
 'data/The Family International.htm',
 'data/EN - TALKS WITH TEACHINGS FROM MY COSMIC FRIENDS - space-people.org.html',
 'data/The Apostles of Infinite Love _ Monastery of the Apostles.htm',
 'data/Sacred Kashi.htm',
 'data/HAPPY SCIENCE Official Website.htm',
 'data/EN - TALKS WITH TEACHINGS FROM MY COSMIC FRIENDS - dont-get-chipped.org.html',
 'data/Jesus People USA Covenant Church & Community.htm',
 'data/EN - TALKS WITH TEACHINGS FROM MY COSMIC FRIENDS - universe-people.org.html',
 'data/Would you like to join our concubine_ \xe2\x80\x93 Free Love_ Inquire Within.htm',
 'data/Desteni Homepage.htm',
 'data/Official Chu

In [6]:
# Read every file except the last one (which is reserved for validation) and
# glue the contents together, in list order, into one training string.
chunks = []
for path in files[:-1]:
    with open(path) as handle:
        chunks.append(handle.read())

code_content = ''.join(chunks)

In [7]:
# The last collected path: the file held out of the training text.
files[-1]

'data/Official Church of Scientology_ What is Scientology_.htm'

In [8]:
# Peek at the first 100 characters of the concatenated training text.
code_content[:100]

'<!DOCTYPE html>\n<html class="js_active  vc_desktop  vc_transform  vc_transform  vc_transform" lang="'

In [9]:
# Find all the unique characters of the training text, in sorted order. The
# position of a character in this list is the index it is one-hot encoded with.
# NOTE: the vocabulary is built from the training text only; a character that
# appears only in the validation file has no entry and makes the
# characters_to_ix lookup in batch_generator raise KeyError.
characters = sorted(set(code_content))
print('total chars:', len(characters))
print(characters)

('total chars:', 186)
['\t', '\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87', '\x88', '\x89', '\x8a', '\x8b', '\x8d', '\x8f', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97', '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f', '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7', '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf', '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7', '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf', '\xc2'

In [10]:
# Size of the one-hot dimension, and the reverse lookup from a character to
# its position in `characters`.
VOCABULARY_SIZE = len(characters)
characters_to_ix = dict(zip(characters, range(VOCABULARY_SIZE)))

In [11]:
# Training text is the concatenation built earlier; the validation text is the
# full content of the last collected file, which was left out of it.
train_text = code_content
val_text = ''
with open(files[-1]) as val_file:
    val_text = val_file.read()

In [12]:
# First 100 characters and total length of the training text.
print(train_text[:100])
print(len(train_text))

<!DOCTYPE html>
<html class="js_active  vc_desktop  vc_transform  vc_transform  vc_transform" lang="
838005


In [13]:
# First 100 characters and total length of the validation text.
print(val_text[:100])
print(len(val_text))

<!DOCTYPE html>
<!-- saved from url=(0028)https://www.scientology.org/ -->
<html lang="en-US" itemsc
255442


In [14]:
def describe_batch(X, y, samples=3):
    """Print a readable summary of the first `samples` entries of a batch.

    For each sample, every one of the SEQUENCE_LEN time steps of `X` is
    decoded by taking the argmax over its last axis and looking that index up
    in `characters`; the target row of `y` is decoded the same way. Only the
    last 20 decoded input characters are shown, followed by the target.

    Args:
        X: array indexed as X[sample, step, :].
        y: array indexed as y[sample, :].
        samples: how many samples, starting from index 0, to print.
    """
    for sample_ix in range(samples):
        decoded = "".join(
            characters[X[sample_ix, step, :].argmax()]
            for step in range(SEQUENCE_LEN)
        )
        target = characters[y[sample_ix, :].argmax()]

        print("sample #%d: ...%s -> '%s'" % (sample_ix, decoded[-20:], target))

def batch_generator(text, count):
    """Yield one-hot encoded (X, y) batches built from `text`, forever.

    Samples are sliding windows over `text`: the sample at offset `k` uses the
    SEQUENCE_LEN characters starting at `k` as input and the character right
    after them as the target. Batch `b` holds the samples at offsets
    ``b * BATCH_SIZE`` .. ``(b + 1) * BATCH_SIZE - 1``. After `count` batches
    the generator starts over from the first batch.

    Args:
        text: string to sample from. Every character in the part that is used
            must be a key of `characters_to_ix`.
        count: number of distinct batches per pass over the text; must be >= 1.

    Yields:
        Tuples ``(X, y)`` where X has shape
        (BATCH_SIZE, SEQUENCE_LEN, VOCABULARY_SIZE) and y has shape
        (BATCH_SIZE, VOCABULARY_SIZE); both are freshly allocated arrays
        holding 1 at the encoded character index and 0 elsewhere.

    Raises:
        ValueError: if `count` is less than 1 (the generator would otherwise
            spin forever without producing a batch).
        KeyError: if a used character is missing from `characters_to_ix`.
        IndexError: if `text` is shorter than
            ``BATCH_SIZE * count + SEQUENCE_LEN`` characters.
    """
    if count < 1:
        raise ValueError(
            "count must be >= 1, got %r (is the text shorter than "
            "BATCH_SIZE + SEQUENCE_LEN?)" % (count,)
        )

    # Encode the part of the text that is actually consumed exactly once,
    # instead of doing a dict lookup per character for every batch and epoch.
    used_length = BATCH_SIZE * count + SEQUENCE_LEN
    encoded = np.array(
        [characters_to_ix[ch] for ch in text[:used_length]], dtype=np.intp
    )

    sample_rows = np.arange(BATCH_SIZE)
    steps = np.arange(SEQUENCE_LEN)

    while True:  # Keras' fit_generator expects the generator to loop indefinitely
        for batch_ix in range(count):
            sample_starts = BATCH_SIZE * batch_ix + sample_rows
            # positions[i, s] is the text offset of step `s` of sample `i`.
            positions = sample_starts[:, None] + steps[None, :]

            X = np.zeros((BATCH_SIZE, SEQUENCE_LEN, VOCABULARY_SIZE))
            X[sample_rows[:, None], steps[None, :], encoded[positions]] = 1

            # The target is the character immediately after each input window.
            y = np.zeros((BATCH_SIZE, VOCABULARY_SIZE))
            y[sample_rows, encoded[sample_starts + SEQUENCE_LEN]] = 1

            yield X, y


In [15]:
def build_model(gpu_count=1):
    """Build and compile the Keras sequential model for the char-rnn.

    The network is LAYER_COUNT stacked LSTM layers of HIDDEN_LAYERS_DIM units,
    each followed by a Dropout(DROPOUT) layer, then a Dense layer of
    VOCABULARY_SIZE units with a softmax activation. It is compiled with
    categorical cross-entropy loss and the "adam" optimizer.

    Args:
        gpu_count: when greater than 1, the model is wrapped with
            ``multi_gpu.make_parallel(model, gpu_count)`` before compiling.

    Returns:
        The compiled model.

    Raises:
        ImportError: if `gpu_count` > 1 and the `multi_gpu` module cannot be
            imported.
    """
    model = Sequential()
    last_layer_ix = LAYER_COUNT - 1
    for i in range(LAYER_COUNT):
        # Every LSTM but the last must emit the full sequence so that the next
        # LSTM receives one vector per time step.
        lstm_kwargs = {'return_sequences': i != last_layer_ix}
        if i == 0:
            # Only the first layer of a Sequential model needs the input shape.
            lstm_kwargs['input_shape'] = (SEQUENCE_LEN, VOCABULARY_SIZE)
        model.add(LSTM(HIDDEN_LAYERS_DIM, **lstm_kwargs))
        model.add(Dropout(DROPOUT))

    model.add(Dense(VOCABULARY_SIZE))
    model.add(Activation('softmax'))

    if gpu_count > 1:
        # None of the notebook's visible import lines bring `multi_gpu` into
        # scope, so import it here, only when it is needed.
        # NOTE(review): presumably a project-local helper module that exposes
        # make_parallel(model, gpu_count) -- confirm it is on the path.
        import multi_gpu
        model = multi_gpu.make_parallel(model, gpu_count)

    model.compile(loss='categorical_crossentropy', optimizer="adam")
    return model

In [16]:
# Report the validation and training text lengths, then their total.
text_train_len, text_val_len = len(train_text), len(val_text)
for text_len in (text_val_len, text_train_len):
    print(text_len)
print("Total of %d characters" % (text_train_len + text_val_len))

# Pull only the first training batch and describe some samples from it.
X, y = next(batch_generator(train_text, count=1))
describe_batch(X, y, samples=5)

255442
838005
Total of 1093447 characters
sample #0: ...vc_desktop  vc_trans -> 'f'
sample #1: ...c_desktop  vc_transf -> 'o'
sample #2: ..._desktop  vc_transfo -> 'r'
sample #3: ...desktop  vc_transfor -> 'm'
sample #4: ...esktop  vc_transform -> ' '


In [17]:
from keras.callbacks import ModelCheckpoint, EarlyStopping

training_model = build_model(gpu_count=N_GPU)

# Number of whole batches available in each text, leaving room for the
# SEQUENCE_LEN characters needed by the last sliding window.
train_batch_count, val_batch_count = [
    (text_len - SEQUENCE_LEN) // BATCH_SIZE
    for text_len in (text_train_len, text_val_len)
]
print("training batch count: %d" % train_batch_count)
print("validation batch count: %d" % val_batch_count)

# Checkpoint: the %-placeholders are filled with the run settings right here,
# while the {epoch}/{loss}/{val_loss} fields are left in the path for
# ModelCheckpoint to fill in when it saves.
run_settings = (
    N_GPU,
    BATCH_SIZE,
    LAYER_COUNT,
    HIDDEN_LAYERS_DIM,
    DROPOUT,
    SEQUENCE_LEN,
)
filepath = "./%d-gpu_BS-%d_%d-%s_dp%.2f_%dS_epoch{epoch:02d}-loss{loss:.4f}-val-loss{val_loss:.4f}_weights" % run_settings
checkpoint = ModelCheckpoint(filepath, save_weights_only=True)

# Early stopping: watches the validation loss with a patience of 5.
early_stopping = EarlyStopping(monitor='val_loss', patience=5)

callbacks_list = [checkpoint, early_stopping]

training batch count: 1636
validation batch count: 498


In [18]:
# Separate endless batch streams for training and validation; each one cycles
# through its own number of batches.
train_batches = batch_generator(train_text, count=train_batch_count)
val_batches = batch_generator(val_text, count=val_batch_count)

history = training_model.fit_generator(
    train_batches,
    steps_per_epoch=train_batch_count,
    epochs=EPOCHS,
    initial_epoch=0,
    callbacks=callbacks_list,
    validation_data=val_batches,
    validation_steps=val_batch_count,
    max_queue_size=1  # no more than one queued batch in RAM
)

Epoch 1/20
   7/1636 [..............................] - ETA: 12:01:56 - loss: 4.7853

KeyboardInterrupt: 