# LSTM Train Test Study
---
Glenn Abastillas

In [428]:
from keras.layers import Input, Dense, Embedding, LSTM, Flatten, Reshape
from keras.models import Model, Sequential
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer, one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.datasets import imdb

from nltk.corpus import brown

from lxml import etree
import requests
import numpy as np
from io import StringIO, BytesIO

# Lower-cased, purely alphanumeric tokens for every sentence returned by
# brown.sents('cp05'); tokens containing punctuation are dropped.
corpus = [
    [word.lower() for word in sentence if word.isalnum()]
    for sentence in brown.sents('cp05')
]
# Page that the Cebuano text is downloaded from further down.
url = "http://www.binisaya.com/node/3219"

#### Preprocess Text

In [19]:
# Fit a tokenizer on the Brown sentences, then encode those same sentences as
# sequences of integer word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=corpus)
data = tokenizer.texts_to_sequences(texts=corpus)

Pad data.

In [20]:
# Number of tokens in each encoded sentence.
sent_length = list(map(len, data))

In [21]:
# Pad every sequence at the end ('post') up to the length of the longest one.
padded = pad_sequences(
    data,
    maxlen=max(sent_length),
    padding='post',
)

---
##### Cebuano Data
Fetch the page containing the Cebuano text from the web.

In [31]:
# Download the page. The timeout keeps the cell from hanging indefinitely on
# an unresponsive host, and raise_for_status() stops here on an HTTP error
# instead of letting an error page be parsed as if it were the article.
results = requests.get(url, timeout=30)
results.raise_for_status()

In [75]:
# Parse the downloaded markup into an lxml tree using lxml's HTML parser.
site = etree.parse(
    StringIO(results.text),
    parser=etree.HTMLParser(),
)

In [94]:
# Run both XPath queries against the parsed page. Each selects the text nodes
# of elements whose first text node does not start with '<' or a newline:
# the first query over every element, the second over <p> elements only.
all_text, paras = (
    site.xpath(query)
    for query in (
        "//*[text() and (not(starts-with(text(), '<')) and not(starts-with(text(), '\n')))]/text()",
        "//p[text() and (not(starts-with(text(), '<')) and not(starts-with(text(), '\n')))]/text()",
    )
)

Process paragraphs

In [101]:
# Split each paragraph into sentences on ".", then keep the lower-cased
# whitespace-separated tokens that are purely alphanumeric (a token that
# contains any other character is dropped whole, not stripped).
paras_ = [
    [token.lower() for token in sentence.split() if token.isalnum()]
    for paragraph in paras
    for sentence in paragraph.split(".")
]
# Discard sentences that ended up with no tokens at all.
cebuano = list(filter(None, paras_))

In [144]:
# Show each tokenised sentence re-joined into a single space-separated string.
print(list(map(' '.join, cebuano)))

['mga aduna koy maayong isugid kaninyo mahitungod ani bago nako nasaksihan nga maayong kapangwartahan', 'kini mao ang buzzbreak app', 'unsay paagi aron', '1', 'i download ra nimo cya sa google app store o kaha sa apple app store', '2', 'dayun mag himo ka ug account', 'duha ang paagi', 'pwede ka mag gamit sa imong facebook o kaha sa imong gmail account', '3', 'daghang paagi makakuha ug puntos o points', 'kini puntos sama ra sa kwarta nga imong nakulekta', 'usa sa paagi ang pag check in kada adlaw', 'pagbasa ug mga balita', 'paglantaw ug mga bidyo', 'ug daghan pa', '4', 'maka withdraw ka pina agi sa gcash o kaha sa paypal', 'ang pinaka ubos nimo nga mawithdraw kay usa ka piso', 'buot ipasabot nga di na ka kinahanglang magtigom pa ug gatos para lang maka withdraw', '5', 'duna sad koy gasa nimo nga puntos karon kung buot nimong kuhaon', 'sayon lang ang paagi', 'ig human nimo ug download', 'comment lang aron mahatagan tka sa puntos nga akong igasa nimo', 'para sa detalyado nga pahina mahitu

In [325]:
# Hand-entered binary labels, 26 in total.
# NOTE(review): presumably one label per sentence in `cebuano`, in order; the
# meaning of 0 vs 1 is not stated in this notebook -- confirm with the author.
cebuano_y = np.array([
    0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
    0, 0, 0, 1, 1, 0, 0, 1, 1, 1,
    0, 0, 0, 1, 1, 1,
])

---
Tokenize

In [397]:
# Fit a fresh tokenizer on the Cebuano sentences and encode them as sequences
# of integer word indices. This rebinds `tokenizer`, so from here on the name
# refers to the Cebuano vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=cebuano)
cebuano_ = tokenizer.texts_to_sequences(texts=cebuano)


In [398]:
# Bring every encoded sentence to a fixed length of 15, padding at the end
# ('post'); pad_sequences truncates any sequence longer than maxlen.
cebuano_ = pad_sequences(
    cebuano_,
    maxlen=15,
    padding='post',
)

Build an index-to-word lookup for inspecting the vocabulary

In [399]:
# Invert the tokenizer's word -> index table so an integer index can be
# mapped back to its word.
to_word = {index: word for word, index in tokenizer.word_index.items()}

#### Develop Model

Define X, y

In [400]:
# Model inputs are the padded Cebuano sequences; y holds the hand-entered labels.
X, y = cebuano_, cebuano_y
# One slot more than the number of known words, leaving room for index 0,
# which the padded sequences use as filler.
vocab_size = 1 + len(to_word)

Create series + targets

In [466]:
# Build (target, context) training pairs from the token stream.
#   target  -- one-hot vector (length vocab_size) for the token at position i
#   context -- up to `window` tokens immediately preceding position i
# The padded sentences are concatenated into one flat stream first, so a
# context can span a sentence boundary and padding zeros appear both inside
# contexts and as targets.
window = 5
series = []
flattened = X.flatten()
for i, token in enumerate(flattened):
    # Clamp at 0 so the first few positions get a shorter (possibly empty) context.
    p = max(0, i - window)
    # Slice the already-flattened array; re-flattening X on every iteration
    # would copy the whole array once per token.
    prev = flattened[p:i]
    series.append((to_categorical(token, vocab_size), prev))

In [467]:
# Transpose the list of (target, context) pairs into two parallel tuples.
targets, context = map(tuple, zip(*series))

In [468]:
# Pad the shorter contexts from the start of the stream (at the end, 'post')
# so every context has exactly `window` entries. The width comes from `window`
# rather than a repeated literal so it cannot drift from the value used to
# build the contexts.
context = pad_sequences(context, maxlen=window, padding='post')
# Stack the one-hot target vectors into a single 2-D array.
targets = np.array(targets)

Define layers

In [469]:
# Each sample is a single timestep whose features are the `window` raw token
# ids of the context.
I = Input((1, window), name='Input')

# Disabled experiment, kept for reference:
# E = Embedding(vocab_size, 100, name='embedding', input_shape=(15,))(I)
L = LSTM(128)(I)
K = Dense(100, activation='relu')(L)
# The targets are one-hot vectors trained with categorical cross-entropy, so
# the output layer must produce a probability distribution over the
# vocabulary: softmax, not independent per-class sigmoids.
D = Dense(vocab_size, activation='softmax', name='output')(K)

---

In [470]:
# Wire the input and output tensors into a model, configure training
# (RMSprop optimiser, categorical cross-entropy loss, accuracy metric), and
# print the layer summary.
model = Model(inputs=I, outputs=D, name='model')
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Input (InputLayer)           (None, 1, 5)              0         
_________________________________________________________________
lstm_35 (LSTM)               (None, 128)               68608     
_________________________________________________________________
dense_34 (Dense)             (None, 100)               12900     
_________________________________________________________________
output (Dense)               (None, 109)               11009     
Total params: 92,517
Trainable params: 92,517
Non-trainable params: 0
_________________________________________________________________


In [471]:
# Reshape the contexts to (samples, 1 timestep, window features) as the input
# layer expects. -1 lets NumPy infer the sample count, so the cell keeps
# working when the corpus (and therefore the number of samples) changes.
model.fit(context.reshape(-1, 1, window), targets, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x13700fc88>

Save model

In [472]:
# Persist the trained model to disk, relative to the working directory.
model.save(filepath='lstm-1.model')

---
#### Generate Phrases

In [482]:
# Draw one token id at random from the non-padding entries of X. Boolean
# indexing already yields a 1-D array. No random seed is set in the visible
# cells, so the drawn token differs from run to run.
seed = np.random.choice(X[X != 0])

In [497]:
# Model input for generation: the seed token in the first slot and zeros
# (the padding value) in the remaining four.
arr = np.zeros(shape=(5,))
arr[:1] = seed
arr

array([54.,  0.,  0.,  0.,  0.])

In [498]:
# Index of the highest-scoring vocabulary entry for the seeded context,
# reshaped to (1 sample, 1 timestep, 5 features) for the model.
np.argmax(model.predict(arr.reshape(1, 1, 5)))

0