In [1]:
import os
import re
import sys
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [39]:
import unidecode
from nltk.tokenize import sent_tokenize
# import nltk
# nltk.download('punkt')

In [3]:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dropout, LSTM, GRU, Dense
from tensorflow.keras.optimizers import RMSprop
from keras.callbacks import LambdaCallback

Using TensorFlow backend.


Data Source: Project Gutenberg  
http://www.gutenberg.org/ebooks/2600

https://github.com/keras-team/keras/blob/master/examples/lstm_text_generation.py

# Loading and Preprocessing Data

Loading War and Peace:

In [40]:
filepath = './war_peace.txt'
with open(filepath, encoding='UTF-8') as f:
    war = unidecode.unidecode(f.read())

Tokenizing:

In [41]:
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(war)

Encoding the whole text:

In [42]:
SIZE_VOCAB = len(tokenizer.word_index)
TEXT_SIZE = tokenizer.document_count
[encoded] = np.array(tokenizer.texts_to_sequences([war]))-1

In [44]:
word2idx = {word:idx for (idx, word) in enumerate(tokenizer.word_index.keys())}
idx2word = {idx:word for (word, idx) in word2idx.items()}

Creating the train and validation datasets:

In [45]:
def create_dataset(data, maxlen, batch_size):
    
    # +1 to account for the target
    window_length = maxlen + 1
    # Create windows of size window_length
    dataset = tf.data.Dataset.from_tensor_slices(data)
    dataset = dataset.window(window_length, shift=1, drop_remainder=True)
    dataset = dataset.flat_map(lambda window: window.batch(window_length))
    # Get batch and separate features and target
    dataset = dataset.shuffle(10000).batch(batch_size)
    dataset = dataset.map(lambda windows: (tf.one_hot(windows[:, :-1], depth=SIZE_VOCAB), 
                                           windows[:, -1]))
    # Prefetch for efficiency
    dataset = dataset.prefetch(1)

    return dataset

In [46]:
MAX_LEN = 40
BATCH_SIZE = 128

In [47]:
dataset = create_dataset(encoded, 
                         maxlen=MAX_LEN, 
                         batch_size=BATCH_SIZE)

# Model

In [48]:
# DO NOT ADD RECURRENT DROPOUT -> GPU DOES NOT WORK
model = keras.models.Sequential([
    GRU(128, return_sequences=True, input_shape=[None, SIZE_VOCAB],
        dropout=0.2),
    GRU(128, input_shape=[None, SIZE_VOCAB], 
        dropout=0.2),
    Dense(SIZE_VOCAB, activation='softmax')
])

In [49]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

In [50]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_4 (GRU)                  (None, None, 128)         69888     
_________________________________________________________________
gru_5 (GRU)                  (None, 128)               99072     
_________________________________________________________________
dense_2 (Dense)              (None, 52)                6708      
Total params: 175,668
Trainable params: 175,668
Non-trainable params: 0
_________________________________________________________________


# Training

Useful functions to visualize some examples after every epoch:  
https://keras.io/examples/lstm_text_generation/

In [51]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds + keras.backend.epsilon()) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [52]:
def preprocess(texts):
    X = np.array(tokenizer.texts_to_sequences(texts)) - 1
    return tf.one_hot(X, depth=SIZE_VOCAB)

In [53]:
def on_epoch_end(epoch, _):
    
    print()
    print('----- Generating text after Epoch: %d' % epoch)
    
    start_index = np.random.randint(0, TEXT_SIZE - MAX_LEN - 1)
    
    for temperature in [0.2, 0.5, 1.0, 1.2]:
        print('----- temperature:', temperature)
        
        generated = ''
        sentence = war[start_index:start_index + MAX_LEN]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)
        
        for i in range(20):
            x_pred = preprocess(sentence)
            preds = model.predict(x_pred, verbose=0)[0]
            next_char = sample(preds, temperature)
            next_char = idx2word[next_char]
            # Walk one step
            sentence = sentence[1:] + next_char
            
            sys.stdout.write(next_char)
            sys.stdout.flush()
        
        print()

In [54]:
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

Saving epoch every 5 epochs

In [55]:
mc = keras.callbacks.ModelCheckpoint('./weights/weights{epoch:08d}.h5', 
                                     save_weights_only=True, save_freq='epoch')

In [56]:
history = model.fit(dataset, epochs=60,
                    callbacks=[print_callback, mc])

Epoch 1/60
  25029/Unknown - 305s 12ms/step - loss: 1.9326
----- Generating text after Epoch: 0
----- temperature: 0.2
----- Generating with seed: "ll known wars.

The period of the campai"
ll known wars.

The period of the campaiiytioneaaenot
tahasa
----- temperature: 0.5
----- Generating with seed: "ll known wars.

The period of the campai"
ll known wars.

The period of the campaiyafi'paothuit atheft
----- temperature: 1.0
----- Generating with seed: "ll known wars.

The period of the campai"
ll known wars.

The period of the campaiewwn,r etip.a sihsda
----- temperature: 1.2
----- Generating with seed: "ll known wars.

The period of the campai"
ll known wars.

The period of the campaiisapev?ltano.
dkaixg
Epoch 2/60
----- Generating text after Epoch: 1
----- temperature: 0.2
----- Generating with seed: "e army to a
disgrace such as that of Ulm"
e army to a
disgrace such as that of Ulm tnae ahnact ntionov
----- temperature: 0.5
----- Generating with seed: "e army to a
disgrace such as 

KeyboardInterrupt: 