<a href="https://colab.research.google.com/github/davidrkearney/colab-notebooks/blob/main/Text_Generation_LSTM_Dostoevsky.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Generation - LSTM


Credit: Code from https://github.com/jeffheaton/t81_558_deep_learning

In [4]:
try:
    %tensorflow_version 2.x
    COLAB = True
    print("Note: using Google CoLab")
except:
    print("Note: not using Google CoLab")
    COLAB = False

Note: using Google CoLab


In [5]:
from tensorflow.keras.callbacks import LambdaCallback
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.utils import get_file
import numpy as np
import random
import sys
import io
import requests
import re

In [6]:
r = requests.get("https://www.gutenberg.org/cache/epub/600/pg600.txt")
raw_text = r.text
print(raw_text[0:1000])

﻿Project Gutenberg's Notes from the Underground, by Feodor Dostoevsky

This eBook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever.  You may copy it, give it away or
re-use it under the terms of the Project Gutenberg License included
with this eBook or online at www.gutenberg.net


Title: Notes from the Underground

Author: Feodor Dostoevsky

Posting Date: September 13, 2008 [EBook #600]
Release Date: July, 1996

Language: English


*** START OF THIS PROJECT GUTENBERG EBOOK NOTES FROM THE UNDERGROUND ***




Produced by Judith Boss.  HTML version by Al Haines.








Notes from the Underground

FYODOR DOSTOYEVSKY





PART I

Underground*

     *The author of the diary and the diary itself
     are, of course, imaginary.  Nevertheless it is clear
     that such persons as the writer of these notes
     not only may, but positively must, exist in our
     society, when we consider the circumstances i

In [7]:
processed_text = raw_text.lower()
processed_text = re.sub(r'[^\x00-\x7f]',r'', processed_text) 

In [8]:
print('corpus length:', len(processed_text))

chars = sorted(list(set(processed_text)))
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

corpus length: 265582
total chars: 58


In [9]:
# cut the text in semi-redundant sequences of maxlen characters
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(processed_text) - maxlen, step):
    sentences.append(processed_text[i: i + maxlen])
    next_chars.append(processed_text[i + maxlen])
print('nb sequences:', len(sentences))

nb sequences: 88514


In [10]:
sentences

["project gutenberg's notes from the under",
 "ject gutenberg's notes from the undergro",
 "t gutenberg's notes from the underground",
 "utenberg's notes from the underground, b",
 "nberg's notes from the underground, by f",
 "rg's notes from the underground, by feod",
 's notes from the underground, by feodor ',
 'otes from the underground, by feodor dos',
 's from the underground, by feodor dostoe',
 'rom the underground, by feodor dostoevsk',
 ' the underground, by feodor dostoevsky\r\n',
 'e underground, by feodor dostoevsky\r\n\r\nt',
 'nderground, by feodor dostoevsky\r\n\r\nthis',
 'rground, by feodor dostoevsky\r\n\r\nthis eb',
 'ound, by feodor dostoevsky\r\n\r\nthis ebook',
 'd, by feodor dostoevsky\r\n\r\nthis ebook is',
 'by feodor dostoevsky\r\n\r\nthis ebook is fo',
 'feodor dostoevsky\r\n\r\nthis ebook is for t',
 'dor dostoevsky\r\n\r\nthis ebook is for the ',
 ' dostoevsky\r\n\r\nthis ebook is for the use',
 'stoevsky\r\n\r\nthis ebook is for the use of',
 'evsky\r\n\r

In [11]:
print('Vectorization...')
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

Vectorization...


In [12]:
x.shape

(88514, 40, 58)

In [13]:
y.shape

(88514, 58)

In [14]:
y[0:10]

array([[False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False,  True, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False],
       [False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False,  True, False,
        False, False, False, False],
       [False, False, False, False, False, False, False, False, False,
   

In [15]:
# build the model: a single LSTM
print('Build model...')
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars), activation='softmax'))

optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

Build model...


In [16]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 128)               95744     
_________________________________________________________________
dense (Dense)                (None, 58)                7482      
Total params: 103,226
Trainable params: 103,226
Non-trainable params: 0
_________________________________________________________________


In [17]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [18]:
def on_epoch_end(epoch, _):
    # Function invoked at end of each epoch. Prints generated text.
    print("******************************************************")
    print('----- Generating text after Epoch: %d' % epoch)

    start_index = random.randint(0, len(processed_text) - maxlen - 1)
    for temperature in [0.2, 0.5, 1.0, 1.2]:
        print('----- temperature:', temperature)

        generated = ''
        sentence = processed_text[start_index: start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)

        for i in range(400):
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, temperature)
            next_char = indices_char[next_index]

            generated += next_char
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()


In [None]:
# Ignore useless W0819 warnings generated by TensorFlow 2.0.  Hopefully can remove this ignore in the future.
# See https://github.com/tensorflow/tensorflow/issues/31308
import logging, os
logging.disable(logging.WARNING)
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

# Fit the model
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

model.fit(x, y,
          batch_size=128,
          epochs=60,
          callbacks=[print_callback])

Epoch 1/60
----- Generating text after Epoch: 0
----- temperature: 0.2
----- Generating with seed: "cried.  "but i
will make up for it or p"
cried.  "but i
will make up for it or pored the stine the derent of the dook and the stand to the store and the doon the sare the dong the dened to the dook the sare the stare the street of the seres at all the sall the deared the derent of the street in i am in the derent to the stan to the street the stine the sered to the dran that is all the stan the stare the dook the forest of the stan stand of the dook all the stand the streed t
----- temperature: 0.5
----- Generating with seed: "cried.  "but i
will make up for it or p"
cried.  "but i



he know conse the lake the s
----- temperature: 1.0
----- Generating with seed: "cried.  "but i
will make up for it or p"
cried.  "but i







"yes kle the lafine as the prosies netc
----- temperature: 1.2
----- Generating with seed: "cried.  "but i
will make up for it or p"
cried.  "but i












anp,