In [7]:
#
# This code is adapted from https://github.com/jeffheaton/t81_558_deep_learning/blob/master/t81_558_class_10_3_text_generation.ipynb
#

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
input_tree = os.walk('/kaggle/input')
for root, _subdirs, names in input_tree:
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/alice-wonderland-dataset/alice_in_wonderland.txt


In [8]:
from tensorflow.keras.callbacks import LambdaCallback
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.utils import get_file
import numpy as np
import random
import sys
import io
import requests
import re

In [9]:
# Read the whole book into memory. The context manager guarantees the file
# handle is closed once the text has been read, and the explicit encoding keeps
# decoding independent of the platform's default locale.
with open("../input/alice-wonderland-dataset/alice_in_wonderland.txt", 'r', encoding='utf-8') as file:
    # readlines() keeps each trailing '\n'; joining with ' ' therefore places a
    # single space at the start of every line after the first.
    raw_text = ' '.join(file.readlines())

In [10]:
# Normalise the corpus: lower-case everything, then drop every character
# that falls outside the 7-bit ASCII range.
non_ascii = re.compile(r'[^\x00-\x7f]')
processed_text = non_ascii.sub(r'', raw_text.lower())

In [11]:
# Report the corpus size, then build the character vocabulary together with
# lookup tables in both directions (character -> index and index -> character).
print('corpus length:', len(processed_text))

chars = sorted(set(processed_text))
print('total chars:', len(chars))
char_indices = {ch: idx for idx, ch in enumerate(chars)}
indices_char = dict(enumerate(chars))

corpus length: 57
total chars: 19


In [12]:
# Cut the corpus into overlapping training windows.
maxlen = 40  # length of each input window, in characters
step = 3     # offset between the starts of consecutive windows

# Every start position leaves room for a full window plus the character that
# follows it, which becomes the prediction target.
window_starts = range(0, len(processed_text) - maxlen, step)
sentences = [processed_text[start: start + maxlen] for start in window_starts]
next_chars = [processed_text[start + maxlen] for start in window_starts]
print('nb sequences:', len(sentences))

nb sequences: 6


In [13]:
# Vectorization: one-hot encode every training window and its target character.
# The builtin `bool` is used as the dtype because the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)  # (sequence, timestep, char)
y = np.zeros((len(sentences), len(chars)), dtype=bool)          # (sequence, char)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        # Mark which character occupies position t of window i.
        x[i, t, char_indices[char]] = True
    # Mark the character that follows window i.
    y[i, char_indices[next_chars[i]]] = True

In [None]:
# Input tensor dimensions: (number of sequences, maxlen, number of distinct characters).
x.shape

In [14]:
# Target tensor dimensions: (number of sequences, number of distinct characters).
y.shape

(6, 19)

In [15]:
# Build the model: a single LSTM layer followed by a softmax over the vocabulary.
model = Sequential()
# Each sample is a window of `maxlen` one-hot character vectors.
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
# One output unit per character; softmax turns them into a next-character distribution.
model.add(Dense(len(chars), activation='softmax'))

# `learning_rate` is the supported keyword; the `lr` alias is deprecated in
# TF 2.x Keras and is rejected by newer Keras releases.
optimizer = RMSprop(learning_rate=0.01)  # could train with Adam?
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [16]:
# Print the layer-by-layer architecture and parameter counts of the model.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 128)               75776     
_________________________________________________________________
dense (Dense)                (None, 19)                2451      
Total params: 78,227
Trainable params: 78,227
Non-trainable params: 0
_________________________________________________________________


In [17]:
# generate text
def sample(preds, temperature=1.0):
    """Randomly draw one index from a temperature-adjusted distribution.

    Args:
        preds: Array-like of probabilities, e.g. the softmax output of the
            model for a single input.
        temperature: Divisor applied to the log-probabilities before they
            are re-normalised. Values below 1 sharpen the distribution
            towards the most likely entries; values above 1 flatten it.

    Returns:
        The index of the entry selected by a single multinomial draw
        (it is sampled, not simply the arg-max of `preds`).
    """
    probabilities = np.asarray(preds).astype('float64')
    # Rescale in log space, then exponentiate and normalise back to a distribution.
    rescaled = np.exp(np.log(probabilities) / temperature)
    distribution = rescaled / np.sum(rescaled)
    # One trial yields a one-hot row; its arg-max is the index that was drawn.
    one_hot_draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(one_hot_draw)

In [19]:
def on_epoch_end(epoch, _):
    """Print text generated by the current model at the end of an epoch.

    A random window of `maxlen` characters from the corpus is used as the
    seed. For each temperature in turn, 400 characters are generated one at a
    time: the current window is one-hot encoded, fed to the model, the next
    character is drawn with `sample`, and the window slides forward by one.

    Args:
        epoch: Zero-based epoch index supplied by Keras.
        _: Logs dictionary supplied by Keras; unused.
    """
    print('Generating text after Epoch: %d' % epoch)

    vocab_size = len(chars)
    # The same seed is reused for every temperature so the outputs are comparable.
    seed_start = random.randint(0, len(processed_text) - maxlen - 1)
    seed = processed_text[seed_start: seed_start + maxlen]

    for temperature in [0.2, 0.5, 1.0, 1.2]:
        print('temperature:', temperature)
        print('Generating with seed: "' + seed + '"')
        sys.stdout.write(seed)

        window = seed
        for _step in range(400):
            # One-hot encode the current window as a batch of size one.
            x_pred = np.zeros((1, maxlen, vocab_size))
            for pos, ch in enumerate(window):
                x_pred[0, pos, char_indices[ch]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_char = indices_char[sample(preds, temperature)]

            # Slide the window: drop the oldest character, append the new one.
            window = window[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

In [20]:
# TRAINING!
import logging
import os

# Silence Python-side warnings and ask TensorFlow's native layer to log errors only.
# NOTE(review): TensorFlow has already been imported by the time this cell runs,
# so the environment variable may be set too late to take effect — confirm.
logging.disable(logging.WARNING)
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

# Fit the model; after every epoch the callback prints freshly generated text.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
fit_options = dict(
    batch_size=128,
    epochs=60,
    callbacks=[print_callback],
)

model.fit(x, y, **fit_options)

Epoch 1/60
Generating text after Epoch: 0
temperature: 0.2
Generating with seed: "e-wonderland-dataset/alice_in_wonderland"
e-wonderland-dataset/alice_in_wonderlanddldllldldwsdidlxdxddldloddldllddxxdddllllxdddxlddidllldxaldxdlldldldldliddldddlldldldlddllldxixdltddxxdwxld.rsxdld-didrlllidldidddldlxllddddwldddxllddddlxlidaiwxddldlxldwdddlxdldldxddxlidllldxduxldlxd_idww/dlwddldldlllddddlddlxdlldwddxddldxldxdldxxlidldutxxxillxtddiiwddlilddldoddlllxdlldxddxldddlwldxldldddddlll-xdsilliddxiwdldlilidxdddllldxuddtidlldllxddddddlllallddxdxwdlxlddllxxdxxlxedddldlxdddlw
temperature: 0.5
Generating with seed: "e-wonderland-dataset/alice_in_wonderland"
e-wonderland-dataset/alice_in_wonderlandlilwleadwllpd-_riiewip-dw_duldn.li.d_.iaxi_u/podd/lse-wxwwdxoddasrdcr.dp_uaparitiill-ddenalildldpdodalliswl/piealddd.lx.ndxrixnastds/io

KeyboardInterrupt: 