# LSTM Study Text Generation
---
Glenn Abastillas

In [1]:
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Dense, Dropout
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import TimeseriesGenerator, pad_sequences

from nltk.corpus import gutenberg

import pandas as pd
import numpy as np
import altair as alt
import matplotlib.pyplot as plt

words = gutenberg.words('austen-emma.txt')

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


---
## Preprocess Text

In [2]:
tk = Tokenizer()
tk.fit_on_texts(words)
data = [_ for _ in tk.texts_to_sequences(words) if _]

Clean, modify, and inspect.

In [3]:
data = np.array(data)

Define shape variables

In [4]:
size = np.unique(data).size
span = 50

Pad sequences so ${sequence} = 1$.

In [5]:
padded = pad_sequences(data, maxlen=1, padding='post')

Generate `context` and `target` pairs

In [36]:
ts = TimeseriesGenerator(padded, padded, span)

In [56]:
context, target = ts[0]
context.shape, target.shape


((128, 50, 1), (128, 1))

Process all context and target data.

In [10]:
X, y = [], []

for c, t in ts:
    X.extend(c.tolist())
    y.extend(t.tolist())

In [11]:
X = np.array(X, dtype='int')
X.shape

(163046, 50, 1)

In [12]:
y = np.array(y, dtype='int')
y.shape

(163046, 1)

In [13]:
y = np.array([to_categorical(_, size + 1) for _ in y])

In [14]:
y.shape

(163046, 1, 7105)

Subsample data

In [15]:
x_i = np.random.choice(np.arange(X.shape[0]), 500)

In [16]:
x_ = X[x_i]
y_ = y[x_i]

In [17]:
x_.shape, y_.shape

((500, 50, 1), (500, 1, 7105))

---
## Build Model

In [58]:
size, span

(7104, 50)

In [61]:
E.shape

TensorShape([Dimension(None), Dimension(50), Dimension(150)])

In [79]:
I = Input((span,1))
# E = Embedding(size, 128)(I)
L = LSTM(128)(I)
D1 = Dense(50)(L)
DR = Dropout(0.2)(D1)
D2 = Dense(100, activation='relu')(DR)
D3 = Dense(1, activation='softmax')(L)

Compile model

In [80]:
model = Model(inputs=I, outputs=D3, name='LSTM')
model.compile('rmsprop', 'sparse_categorical_crossentropy', metrics=['acc'])
model.summary()

Model: "LSTM"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_14 (InputLayer)        (None, 50, 1)             0         
_________________________________________________________________
lstm_10 (LSTM)               (None, 128)               66560     
_________________________________________________________________
dense_21 (Dense)             (None, 1)                 129       
Total params: 66,689
Trainable params: 66,689
Non-trainable params: 0
_________________________________________________________________


---
## Train Model

In [81]:
model.fit_generator(ts, steps_per_epoch=len(ts), epochs=200)

Epoch 1/200


InvalidArgumentError: Received a label value of 6081 which is outside the valid range of [0, 1).  Label values: 29 2094 5 109 190 15 192 34 1164 11 209 657 3 29 7 103 13 115 5 21 742 39 73 11 2 327 4 38 253 15 59 321 64 180 6077 63 1108 52 64 124 1 6078 2 317 24 297 25 65 294 297 7 22 140 53 5 388 1 90 1 6 642 518 11 527 11 2 391 4 2957 78 369 321 84 12 169 96 223 84 12 1190 23 32 2 3151 49 1564 26 203 16 9 1767 1 14 11 2111 3 2020 15 16 9 124 28 54 861 7 34 14 114 104 6079 27 30 6080 6081 34 14 312 40 20 150 130 2 1333 5 21 36 28 1
	 [[{{node loss_7/dense_21_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits}}]]

In [82]:
model.fit(x_.reshape(500, 50), y_.reshape(500, 7105), batch_size=1000, epochs=10)


ValueError: Error when checking input: expected input_14 to have 3 dimensions, but got array with shape (500, 50)