# Install Keras

In [1]:
# Upgrade Keras to the newest release available to pip. No version is pinned,
# so the installed version depends on when this cell is run.
!pip3 install keras --upgrade

Requirement already up-to-date: keras in /usr/local/lib/python3.6/dist-packages
Requirement already up-to-date: numpy>=1.9.1 in /usr/local/lib/python3.6/dist-packages (from keras)
Requirement already up-to-date: pyyaml in /usr/local/lib/python3.6/dist-packages (from keras)
Requirement already up-to-date: scipy>=0.14 in /usr/local/lib/python3.6/dist-packages (from keras)
Requirement already up-to-date: six>=1.9.0 in /usr/local/lib/python3.6/dist-packages (from keras)


# Clone the GitHub repository

In [2]:
import os
# Switch the kernel's working directory to the `content` directory that sits
# next to the current working directory, then list it to confirm the location.
#print(os.getcwd())
os.chdir('../content')
#os.chdir('../')
!ls

datalab  syllable-aware


In [3]:
!rm -R syllable-aware/
!ls

datalab


In [4]:
# Clone the syllable-aware repository into the current directory and list the
# directory to confirm that the checkout exists.
!git clone https://github.com/nlpchile/syllable-aware.git
!ls

Cloning into 'syllable-aware'...
remote: Counting objects: 502, done.[K
remote: Compressing objects: 100% (116/116), done.[K
remote: Total 502 (delta 68), reused 40 (delta 11), pack-reused 375[K
Receiving objects: 100% (502/502), 4.65 MiB | 24.17 MiB/s, done.
Resolving deltas: 100% (303/303), done.
datalab  syllable-aware


In [5]:
import os
# Work from inside the cloned repository: the %%writefile cells below create
# their modules relative to this directory and import the repository's modules.
os.chdir('syllable-aware')
!ls

calculate_Lmax.py     LSTMclass.ipynb		   README.md
callback_losswise.py  LSTM.py			   separadorSilabas.py
data		      on_epoch_end AND sample.txt  test_perplexity.py
generate_percent.py   perplexity.py		   test_tokens_selectors.py
generators.py	      process_corpus.py		   token_selectors.py
kmp.py		      process_text.py


# lstmClass

In [6]:
%%writefile lstmClass.py

import keras

class Model:
  """Two-layer LSTM classifier assembled from Keras layers.

  The layers are created in ``__init__``; the underlying ``keras`` Sequential
  model only exists after ``build()`` has been called. The stack is:
  Embedding -> LSTM (returns sequences) -> Dropout -> LSTM -> Dense(softmax).

  Attributes:
    model: the compiled ``keras.models.Sequential`` instance, or ``None``
      until ``build()`` has run.
  """

  def __init__(self,
               vocab_size,
               embedding_dim,
               hidden_dim,
               input_length,
               recurrent_dropout,
               dropout,
               seed):
    """Store the hyperparameters and instantiate the (unconnected) layers.

    Args:
      vocab_size: number of output classes of the final softmax layer. The
        embedding layer is given ``vocab_size + 1`` rows because it is
        created with ``mask_zero = True``.
      embedding_dim: output dimension of the embedding layer.
      hidden_dim: number of units of each of the two LSTM layers.
      input_length: ``input_length`` passed to the embedding layer.
      recurrent_dropout: ``recurrent_dropout`` of both LSTM layers.
      dropout: rate of the Dropout layer placed between the two LSTMs.
      seed: seed of the Dropout layer.
    """

    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.hidden_dim = hidden_dim
    self.input_length = input_length
    self.recurrent_dropout = recurrent_dropout
    self.dropout = dropout
    self.seed = seed

    # Populated by build(); kept as None so that fit() and summary() can
    # detect an unbuilt model instead of failing with an AttributeError.
    self.model = None

    self.word_embeddings = keras.layers.Embedding(input_dim = self.vocab_size+1,
                                                  output_dim = self.embedding_dim,
                                                  input_length = self.input_length,
                                                  mask_zero = True)

    self.lstm_1 = keras.layers.LSTM(units = self.hidden_dim,
                                    recurrent_dropout = self.recurrent_dropout,
                                    return_sequences = True,
                                    unroll = False,
                                    implementation = 2)

    self.dropout_1 = keras.layers.Dropout(rate = self.dropout,
                                          seed = self.seed)

    self.lstm_2 = keras.layers.LSTM(units = self.hidden_dim,
                                    recurrent_dropout = self.recurrent_dropout,
                                    return_sequences = False,
                                    unroll = False,
                                    implementation = 2)

    self.dense = keras.layers.Dense(units = self.vocab_size,
                                    activation = 'softmax')


  def build(self, optimizer, metrics):
    """Stack the layers into a Sequential model, print it and compile it.

    Args:
      optimizer: optimizer (name or instance) forwarded to ``compile``.
      metrics: list of metrics forwarded to ``compile``.
    """

    self.optimizer = optimizer
    self.metrics = metrics

    self.model = keras.models.Sequential([self.word_embeddings,
                                          self.lstm_1,
                                          self.dropout_1,
                                          self.lstm_2,
                                          self.dense])

    # Print the architecture. The return value is deliberately NOT stored in
    # an attribute named `summary`: doing so would replace the summary()
    # method below on this instance and make it uncallable.
    self.model.summary()

    self.model.compile(loss = 'categorical_crossentropy',
                       optimizer = self.optimizer,
                       metrics = self.metrics)


  def fit(self, generator, epochs, workers, callbacks):
    """Train the built model from a batch generator.

    Args:
      generator: object exposing a ``generator()`` method (the batch
        iterator) and a ``steps_per_epoch`` attribute.
      epochs: number of training epochs.
      workers: ``workers`` argument forwarded to ``fit_generator``.
      callbacks: list of Keras callbacks.

    Returns:
      Whatever ``fit_generator`` returns (the Keras training history).

    Raises:
      RuntimeError: if ``build()`` has not been called yet.
    """

    if getattr(self, 'model', None) is None:
      raise RuntimeError('Model has not been built yet; call build() first.')

    self.g = generator

    self.epochs = epochs
    self.workers = workers
    self.callbacks = callbacks

    return self.model.fit_generator(generator = self.g.generator(),
                                    steps_per_epoch = self.g.steps_per_epoch,
                                    epochs = self.epochs,
                                    workers = self.workers,
                                    callbacks = self.callbacks,
                                    shuffle = False)


  def summary(self):
    """Return the layer summary of the built model as a single string.

    Returns:
      The lines emitted by the underlying model's ``summary`` joined with
      newlines, or an explanatory message if ``build()` has not run yet.
    """

    model = getattr(self, 'model', None)

    if model is None:
      return 'Model has not been built yet; call build() first.'

    lines = []

    def collect(line, **_ignored):
      # Extra keyword arguments are tolerated in case the backend passes
      # formatting hints along with each line.
      lines.append(line)

    model.summary(print_fn = collect)

    return '\n'.join(lines)

Writing lstmClass.py


# preprocessClass

In [7]:
%%writefile preprocessClass.py

from process_text import *


class Preprocess:
  """Load a text corpus, tokenize it and encode it as integer indexes.

  The corpus is read from disk, lower-cased, tokenized with the helpers
  imported from ``process_text`` (``get_selectors`` and
  ``get_processed_text``) and mapped to 1-based integer indexes. The indexed
  corpus is then split, in order, into a train part and a test part.
  """

  def __init__(self,
               path_to_file,
               quantity_word,
               quantity_syllable,
               train_size,
               encoding = None):
    """Run the whole preprocessing pipeline.

    Args:
      path_to_file: path of the text corpus.
      quantity_word: forwarded to ``get_selectors`` as ``quantity_word``.
      quantity_syllable: forwarded to ``get_selectors`` as
        ``quantity_syllable``.
      train_size: fraction of the indexed corpus (counted from the start)
        assigned to ``train_set``; the remainder becomes ``test_set``.
      encoding: text encoding used to read the corpus. ``None`` (default)
        keeps the platform default.
    """

    self.path_to_file = path_to_file

    self.quantity_word = quantity_word
    self.quantity_syllable = quantity_syllable

    # Read through a context manager so the file handle is always closed.
    with open(self.path_to_file, 'r', encoding = encoding) as corpus_file:
      self.corpus = corpus_file.read().lower()

    self.selectors = get_selectors(corpus = self.corpus,
                                   quantity_word = self.quantity_word,
                                   quantity_syllable = self.quantity_syllable)

    self.tokens = get_processed_text(corpus = self.corpus,
                                     selectors = self.selectors)

    self.vocabulary = set(self.tokens)

    # Enumerate the vocabulary in sorted order so every run assigns the same
    # index to the same token; iterating the bare set would depend on hash
    # ordering. Index 0 is left unused (enumeration starts at 1).
    # NOTE(review): assumes tokens are mutually orderable (e.g. all strings).
    self.token_to_index = {token: index
                           for index, token in enumerate(sorted(self.vocabulary), 1)}

    self.index_to_token = {index: token
                           for token, index in self.token_to_index.items()}

    # Corpus and vocabulary expressed as indexes.
    self.ind_corpus = [self.token_to_index[token] for token in self.tokens]

    self.vocabulary_as_index = set(self.ind_corpus)

    # Ordered (unshuffled) train/test split of the indexed corpus.
    self.train_size = train_size

    len_train = int(len(self.ind_corpus)*self.train_size)

    self.train_set = self.ind_corpus[0:len_train]
    self.test_set = self.ind_corpus[len_train:]

    self.vocabulary_train = set(self.train_set)
    self.vocabulary_test = set(self.test_set)

Writing preprocessClass.py


# Test LSTM Class

In [8]:
%%writefile testingClasses.py

from lstmClass import*
from preprocessClass import *


import keras
from generators import GeneralGenerator

import time


# End-to-end smoke test: preprocess a small corpus, build the LSTM model and
# train it on the whole corpus (train_size = 1).

path_to_file = 'data/horoscopo_test_overfitting.txt'

train_size = 1 #0.8


## Hyperparameters

k = 1000
T = 6*k

quantity_word = 50

quantity_syllable = T - quantity_word


## Preprocessing


print('\n Processing \n')


t_start = time.time()

obj = Preprocess(path_to_file,
                 quantity_word = quantity_word,
                 quantity_syllable = quantity_syllable,
                 train_size = train_size)

t_end = time.time()
dt = (t_end - t_start) / 60.0  # elapsed time in minutes
print('\n Elapsed Time {} \n'.format(dt))

##

L = 100

Lprima = L # must be computed: Lprima = f(L)

D = 512

recurrent_dropout = 0.3

dropout = 0.3

seed = 0 # for the Dropout layer


## Train Generator

batch_size = 128

ind_corpus_train = obj.train_set #

#ind_val_tokens = obj.test_set

vocabulary = obj.vocabulary_as_index


## Fit Model

epochs = 100 #300

workers = 2

# https://keras.io/callbacks/
callbacks = []


## Model


model = Model(vocab_size = len(vocabulary),
              embedding_dim = D,
              hidden_dim = D,
              input_length = Lprima,
              recurrent_dropout = recurrent_dropout,
              dropout = dropout,
              seed = seed)


# No summary is printed here: the underlying Keras model does not exist until
# build() is called, and build() itself prints the layer summary.


optimizer = 'rmsprop' #'adam'

metrics = ['top_k_categorical_accuracy', 'categorical_accuracy']


model.build(optimizer = optimizer,
            metrics = metrics)


train_generator = GeneralGenerator(batch_size = batch_size,
                                   ind_tokens = ind_corpus_train,
                                   vocabulary = vocabulary,
                                   max_len = Lprima)

#val_gen = GeneralGenerator(batch_size = batch_size,
#                           ind_tokens = ind_val_tokens, #
#                           voc = vocabulary,
#                           max_len = Lprima)


print('\n Training \n')

t_start = time.time()

model.fit(generator = train_generator,
          epochs = epochs,
          workers = workers,
          callbacks = callbacks)

t_end = time.time()
dt = (t_end - t_start) / 60.0  # elapsed time in minutes
print('\n Elapsed Time {} \n'.format(dt))


Writing testingClasses.py


In [9]:
!python3 testingClasses.py

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.

 Processing 


 Elapsed Time 0.07199687957763672 

<bound method Model.summary of <lstmClass.Model object at 0x7fbde0dadef0>>
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 512)          86528     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100, 512)          2099200   
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 512)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 512)               2099200   
_________________________________________________________________
dense_1 (Dense)              (None, 168)               86184     
Total params: 4,371,112
Trainable params: 4,371,112
Non-

Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100

Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100

Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100


Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100

Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100


Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100

Epoch 99/100
Epoch 100/100

 Elapsed Time 4.718350195884705 

