# LSTM Study Text Generation
---
Glenn Abastillas

In [21]:
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Dense, Dropout
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import TimeseriesGenerator, pad_sequences

from nltk.corpus import gutenberg

import pandas as pd
import numpy as np
import altair as alt
import matplotlib.pyplot as plt

words = gutenberg.words('austen-emma.txt')

---
## Preprocess Text

In [2]:
tk = Tokenizer()
tk.fit_on_texts(words)
data = [_ for _ in tk.texts_to_sequences(words) if _]

Clean, modify, and inspect.

In [3]:
data = np.array(data)

Define shape variables

In [13]:
size = np.unique(data).size
span = 50

Pad sequences so ${sequence} = 1$.

In [7]:
padded = pad_sequences(data, maxlen=1, padding='post')

Generate `context` and `target` pairs

In [16]:
ts = TimeseriesGenerator(padded, padded, span)

In [20]:
context, target = ts[0]
context.shape, target.shape


((128, 50, 1), (128, 1))

Process all context and target data.

(128, 50, 1)

In [34]:
X, y = [], []

for c, t in ts:
    X.extend(c.tolist())
    y.extend(t.tolist())

In [39]:
X = np.array(X, dtype='int')
X.shape

(163046, 50, 1)

In [42]:
y = np.array(y, dtype='int')
y.shape

(163046, 1)

In [47]:
y = np.array([to_categorical(_, size + 1) for _ in y])

In [48]:
y.shape

(163046, 1, 7105)

Subsample data

In [59]:
x_i = np.random.choice(np.arange(X.shape[0]), 500)

In [60]:
x_ = X[x_i]
y_ = y[x_i]

In [61]:
x_.shape, y_.shape

((500, 50, 1), (500, 1, 7105))

---
## Build Model

In [66]:
I = Input((span,))
E = Embedding(size, 150)(I)
L = LSTM(128)(E)
D1 = Dense(50)(L)
DR = Dropout(0.2)(D1)
D2 = Dense(100, activation='relu')(DR)
D3 = Dense(size + 1, activation='softmax')(D2)

Compile model

In [67]:
model = Model(inputs=I, outputs=D3, name='LSTM')
model.compile('rmsprop', 'categorical_crossentropy', metrics=['acc'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         (None, 50)                0         
_________________________________________________________________
embedding_6 (Embedding)      (None, 50, 150)           1065600   
_________________________________________________________________
lstm_6 (LSTM)                (None, 128)               142848    
_________________________________________________________________
dense_15 (Dense)             (None, 50)                6450      
_________________________________________________________________
dropout_5 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_16 (Dense)             (None, 100)               5100      
_________________________________________________________________
dense_17 (Dense)             (None, 7105)              717605    
Total para

---
## Train Model

In [None]:
model.fit(x_.reshape(500, 50), y_.reshape(500, 7105), batch_size=1000, epochs=10)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
