# LSTM Study Text Generation
---
Glenn Abastillas

In [9]:
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Dense, Dropout
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import TimeseriesGenerator, pad_sequences

from nltk.corpus import gutenberg

import pandas as pd
import numpy as np
import altair as alt
import matplotlib.pyplot as plt

words = gutenberg.words('austen-emma.txt')

---
## Preprocess Text

In [2]:
tk = Tokenizer()
tk.fit_on_texts(words)
data = [_ for _ in tk.texts_to_sequences(words) if _]

Clean, modify, and inspect.

In [3]:
data = np.array(data)

Define shape variables

In [13]:
size = np.unique(data).size
span = 50

Pad sequences so ${sequence} = 1$.

In [7]:
padded = pad_sequences(data, maxlen=1, padding='post')

Generate `context` and `target` pairs

In [16]:
ts = TimeseriesGenerator(padded, padded, span)

In [20]:
context, target = ts[0]
context.shape, target.shape

((128, 50, 1), (128, 1))

---
## Build Model

In [14]:
I = Input((span,))
E = Embedding(size, 150)(I)
L = LSTM(128)(E)
D1 = Dense(50)(L)
DR = Dropout(0.2)(D1)
D2 = Dense(100, activation='relu')(DR)
D3 = Dense(size, activation='softmax')(D2)

Compile model

In [15]:
model = Model(inputs=I, outputs=D3)
model.compile('rmsprop', 'categorical_crossentropy', metrics=['acc'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 50)                0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 50, 150)           1065600   
_________________________________________________________________
lstm_3 (LSTM)                (None, 128)               142848    
_________________________________________________________________
dense_6 (Dense)              (None, 50)                6450      
_________________________________________________________________
dropout_2 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 100)               5100      
_________________________________________________________________
dense_8 (Dense)              (None, 7104)              717504    
Total para

---
## Train Model