# LSTM Train Test Study
---
Glenn Abastillas

In [146]:
from keras.layers import Input, Dense, Embedding, LSTM
from keras.models import Model, Sequential
from keras.preprocessing.text import Tokenizer, one_hot
from keras.preprocessing.sequence import pad_sequences

from nltk.corpus import brown

from lxml import etree
import requests
import numpy as np
from io import StringIO, BytesIO

corpus = [[_.lower() for _ in sent if _.isalnum()] for sent in brown.sents('cp05')]
url = "http://www.binisaya.com/node/3219"

#### Preprocess Text

In [3]:
help(Tokenizer)

Help on class Tokenizer in module keras_preprocessing.text:

class Tokenizer(builtins.object)
 |  Text tokenization utility class.
 |  
 |  This class allows to vectorize a text corpus, by turning each
 |  text into either a sequence of integers (each integer being the index
 |  of a token in a dictionary) or into a vector where the coefficient
 |  for each token could be binary, based on word count, based on tf-idf...
 |  
 |  # Arguments
 |      num_words: the maximum number of words to keep, based
 |          on word frequency. Only the most common `num_words` words will
 |          be kept.
 |      filters: a string where each element is a character that will be
 |          filtered from the texts. The default is all punctuation, plus
 |          tabs and line breaks, minus the `'` character.
 |      lower: boolean. Whether to convert the texts to lowercase.
 |      split: str. Separator for word splitting.
 |      char_level: if True, every character will be treated as a token.
 |

In [19]:
tokenizer = Tokenizer()

tokenizer.fit_on_texts(corpus)

data = tokenizer.texts_to_sequences(corpus)

Pad data.

In [20]:
sent_length = [len(_) for _ in data]

In [21]:
padded = pad_sequences(data, maxlen=max(sent_length), padding='post')

---
##### Cebuano Data
Get data from online

In [31]:
results = requests.get(url)

In [75]:
site = etree.parse(StringIO(results.text), parser=etree.HTMLParser())

In [94]:
all_text = site.xpath("//*[text() and (not(starts-with(text(), '<')) and not(starts-with(text(), '\n')))]/text()")
paras = site.xpath("//p[text() and (not(starts-with(text(), '<')) and not(starts-with(text(), '\n')))]/text()")

Process paragraphs

In [101]:
paras_ = [[w.lower() for w in sent.split() if w.isalnum()] for line in paras for sent in line.split(".")]
cebuano = [_ for _ in paras_ if _]

In [144]:
print([' '.join(_) for _ in cebuano])

['mga aduna koy maayong isugid kaninyo mahitungod ani bago nako nasaksihan nga maayong kapangwartahan', 'kini mao ang buzzbreak app', 'unsay paagi aron', '1', 'i download ra nimo cya sa google app store o kaha sa apple app store', '2', 'dayun mag himo ka ug account', 'duha ang paagi', 'pwede ka mag gamit sa imong facebook o kaha sa imong gmail account', '3', 'daghang paagi makakuha ug puntos o points', 'kini puntos sama ra sa kwarta nga imong nakulekta', 'usa sa paagi ang pag check in kada adlaw', 'pagbasa ug mga balita', 'paglantaw ug mga bidyo', 'ug daghan pa', '4', 'maka withdraw ka pina agi sa gcash o kaha sa paypal', 'ang pinaka ubos nimo nga mawithdraw kay usa ka piso', 'buot ipasabot nga di na ka kinahanglang magtigom pa ug gatos para lang maka withdraw', '5', 'duna sad koy gasa nimo nga puntos karon kung buot nimong kuhaon', 'sayon lang ang paagi', 'ig human nimo ug download', 'comment lang aron mahatagan tka sa puntos nga akong igasa nimo', 'para sa detalyado nga pahina mahitu

In [147]:
cebuano_y = np.array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1])

---
Tokenize

In [104]:
tokenizer.fit_on_texts(cebuano)
cebuano_ = tokenizer.texts_to_sequences(cebuano)


In [108]:
cebuano_ = pad_sequences(cebuano_, maxlen=15)

Inspect vocabulary

In [111]:
to_word = {word: idx for idx, word in tokenizer.word_index.items()}

#### Develop Model

Define LSTM layer (single)

In [120]:
data = cebuano_

In [178]:
I = Input(data.shape)

In [185]:
E = Embedding(len(to_word) + 1, 100, input_length=data.shape[1])

In [186]:
E.get_config()

{'name': 'embedding_16',
 'trainable': True,
 'batch_input_shape': (None, 15),
 'dtype': 'float32',
 'input_dim': 854,
 'output_dim': 100,
 'embeddings_initializer': {'class_name': 'RandomUniform',
  'config': {'minval': -0.05, 'maxval': 0.05, 'seed': None}},
 'embeddings_regularizer': None,
 'activity_regularizer': None,
 'embeddings_constraint': None,
 'mask_zero': False,
 'input_length': 15}

In [180]:
I.shape

TensorShape([Dimension(None), Dimension(26), Dimension(15)])

In [177]:
E = Embedding(len(to_word), 100, input_length=data.shape[1])
# L = LSTM(128)(I)
D = Dense(len(to_word), activation='softmax')(E)

ValueError: Layer dense_12 was called with an input that isn't a symbolic tensor. Received type: <class 'keras.layers.embeddings.Embedding'>. Full input: [<keras.layers.embeddings.Embedding object at 0x1245533c8>]. All inputs to the layer should be tensors.

---

In [176]:
model = Model(inputs=E, outputs=D)
model.compile('rmsprop', 'categorical_crossentropy', metrics=['accuracy'])

model.fit(cebuano_, cebuano_y, epochs=10, batch_size=1)

ValueError: Error when checking input: expected input_17 to have 3 dimensions, but got array with shape (26, 15)

In [140]:
cebuano_.shape

(26, 15)