# Text Generation using a Bidirectional LSTM Network

<br/>

In [6]:
from __future__ import print_function
#import Keras library
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM, Input, Bidirectional
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.metrics import categorical_accuracy

import tensorflow as tf
from keras import backend as K
# Print the GPUs reported by the Keras TensorFlow backend (private helper).
print(K.tensorflow_backend._get_available_gpus())
# Session configuration: 4 intra-op and 4 inter-op threads, soft device
# placement enabled, and at most one CPU and one GPU device.
config = tf.ConfigProto(intra_op_parallelism_threads=4, \
                        inter_op_parallelism_threads=4, \
                        allow_soft_placement=True,\
                        device_count = {'CPU' : 1, 'GPU' : 1})
# NOTE(review): the value returned by tf.device(...) is discarded here; it is
# normally used as `with tf.device(...):`, so this call presumably has no
# effect on placement -- confirm before relying on it.
tf.device('/gpu:1')
# Create the session from the configuration and register it with Keras.
sess = tf.Session(config=config)
K.set_session(sess)

#import spacy and its Spanish model
# spacy is used to tokenize the text
import spacy
#!python -m spacy download es
nlp = spacy.load('es_core_news_sm')

#import other libraries
import numpy as np
import random
import sys
import os
import time
import codecs
import collections
from six.moves import cPickle
import pandas as pd

[]


In [8]:
# Switch for the Google Drive code path: when True, the cells below fetch the
# data files through PyDrive instead of reading them from `data_dir`.
gdrive = False
if gdrive:
    # IPython shell escape: installs/upgrades PyDrive quietly.
    !pip install -U -q PyDrive
    from pydrive.auth import GoogleAuth
    from pydrive.drive import GoogleDrive
    # `google.colab` is imported here, so this branch presumably only works
    # inside Google Colab -- TODO confirm.
    from google.colab import auth
    from oauth2client.client import GoogleCredentials

    # Authenticate and create the PyDrive client.
    auth.authenticate_user()
    gauth = GoogleAuth()
    gauth.credentials = GoogleCredentials.get_application_default()
    # `drive` is the client used by later cells to download files.
    drive = GoogleDrive(gauth)

In [22]:
#define parameters used in the tutorial
data_dir = 'data/'  # data directory containing raw texts
save_dir = 'vocab'  # directory to store trained NN models
file_list = ["spanish_emojis"]


def _download_from_drive(link, filename):
    """Download the Google Drive file referenced by `link` to `filename`.

    `link` is a Drive URL whose file id follows the first '='. The download
    goes through the module-level PyDrive client `drive`, so this helper may
    only be called when `gdrive` is True. Returns `filename`.
    """
    file_id = link.split('=', 1)[1]
    drive.CreateFile({'id': file_id}).GetContentFile(filename)
    return filename


if gdrive:
    # Fetch the pre-built vocabulary, word list and raw data into the
    # current working directory.
    vocab_file = _download_from_drive(
        "https://drive.google.com/open?id=1kdV9lTy7RNp4xYX6_IMA-6wfMRyXNSgv",
        "words_vocab.pkl")
    words_list_file = _download_from_drive(
        "https://drive.google.com/open?id=1xr0ZW6lBDTALEqaAdZefu-DwgEPIgM83",
        "words_list.pkl")
    data_file = _download_from_drive(
        "https://drive.google.com/open?id=17o5LV6AizHqPw9QeIjKwFXmTASpNs673",
        "spanish_emojis.csv")
    # There is no Drive copy of these two files; they are written locally by
    # the preprocessing cell. They must still be defined in this branch,
    # otherwise that cell fails with a NameError.
    sequences_file = "sequences_list.pkl"
    next_words_file = "next_words_list.pkl"
else:
    vocab_file = os.path.join(data_dir, "words_vocab.pkl")
    words_list_file = os.path.join(data_dir, "words_list.pkl")
    data_file = os.path.join(data_dir, file_list[0] + ".csv")
    sequences_file = os.path.join(data_dir, "sequences_list.pkl")
    next_words_file = os.path.join(data_dir, "next_words_list.pkl")

sequences_step = 1  # step to create sequences
seq_length = 10  # sequence length
preprocess = True  # True: rebuild and pickle the vocabulary; False: reload it

In [11]:
def read_file(filename):
    """Load a pickled object from `filename`.

    Returns the last object pickled into the file, or an empty list when the
    file does not exist or holds no pickled object.

    NOTE: unpickling can execute arbitrary code; only call this on files
    written by this notebook or obtained from a trusted source.
    """
    items = []
    if not os.path.exists(filename):
        return items

    with open(filename, 'rb') as fname:
        while True:
            try:
                items = cPickle.load(fname)
            except EOFError:
                # Hitting the end of the stream is the normal way out of the
                # loop, not a failure, so nothing is printed.
                break
    return items

In [12]:
def create_wordlist(doc):
    """Return the lower-cased text of each token in `doc`.

    Tokens whose text is a newline, a double newline, a thin space (U+2009)
    or a non-breaking space (U+00A0) are left out.
    """
    skipped_texts = ("\n", "\n\n", '\u2009', '\xa0')
    return [token.text.lower() for token in doc if token.text not in skipped_texts]

In [18]:
# Build the training data: for every observation in the CSV, a list of
# fixed-length token windows (`sequences`) and the token that follows each
# window (`next_words`). `wordlist` collects every token of the corpus.
sequences = []
next_words = []
wordlist = []

input_file = pd.read_csv(data_file)
lines = input_file['observations']
for data in lines:
    # Tokenise one observation and keep its lower-cased tokens.
    doc = nlp(data)
    wl = create_wordlist(doc)
    # extend() appends in place; rebuilding the list with `+` on every row
    # copies the whole corpus each time.
    wordlist.extend(wl)
    if len(wl) < 2:
        # Fewer than two tokens: no (context, next word) pair is built.
        continue
    # Left-pad with ' ' so that even a two-token observation yields one
    # window of `seq_length` tokens plus a target; longer observations get
    # no padding (a negative repeat count gives an empty list).
    seq = [' '] * (seq_length - len(wl) + 1) + wl

    for i in range(0, len(seq) - seq_length, sequences_step):
        sequences.append(seq[i: i + seq_length])
        next_words.append(seq[i + seq_length])

In [24]:
if preprocess:
    # Count how often each token occurs in the corpus.
    word_counts = collections.Counter(wordlist)

    # The sequences are left-padded with ' ' and later cells look up
    # vocab[' ']. Register the padding token explicitly so the vocabulary
    # contains it even when the corpus itself never produces a ' ' token.
    pad_token = ' '
    if pad_token not in word_counts:
        word_counts[pad_token] = 0

    # Distinct words, most frequent first.
    words = [word for word, _ in word_counts.most_common()]

    # Mapping from index to word (alphabetical order): that's the vocabulary.
    vocabulary_inv = sorted(words)

    # Mapping from word to index.
    vocab = {word: index for index, word in enumerate(vocabulary_inv)}

    # Size of the vocabulary.
    vocab_size = len(words)
    print("vocab size: ", vocab_size)

    # Persist the vocabulary, the word list and the training pairs so a later
    # run can reload them instead of re-tokenising the corpus.
    with open(vocab_file, 'wb') as f:
        cPickle.dump((words, vocab, vocabulary_inv), f)
    with open(words_list_file, 'wb') as f:
        cPickle.dump(wordlist, f)
    with open(sequences_file, 'wb') as f:
        cPickle.dump(sequences, f)
    with open(next_words_file, 'wb') as f:
        cPickle.dump(next_words, f)

vocab size:  66846


In [25]:
# Number of training windows held in `sequences`.
len(sequences)

421621

In [26]:
# First training window.
sequences[0]

[' ', ' ', ' ', ' ', '?', 'en', 'serio', 'han', 'cancelado', 'tambien']

In [27]:
# Reload the sequences from `sequences_file` through read_file.
tmp = read_file(sequences_file)

<class 'EOFError'>


In [28]:
# Number of items in the reloaded object.
len(tmp)

421621

In [29]:
# First item of the reloaded object.
tmp[0]

[' ', ' ', ' ', ' ', '?', 'en', 'serio', 'han', 'cancelado', 'tambien']

In [8]:
if not preprocess:
    # Reload, in this order, the word list, the vocabulary bundle and the
    # training pairs from their pickle files.
    wordlist, (words, vocab, vocabulary_inv), sequences, next_words = [
        read_file(pickle_path)
        for pickle_path in (words_list_file, vocab_file, sequences_file, next_words_file)
    ]

    # Token frequencies, recomputed from the reloaded word list.
    word_counts = collections.Counter(wordlist)

    # The vocabulary size is the number of distinct words.
    vocab_size = len(words)
    print("vocab size: ", vocab_size)

<class 'EOFError'>
<class 'EOFError'>
vocab size:  66846


In [30]:
#create sequences
# The commented-out lines below slide one window over the flat word list.
# They are disabled: `sequences` and `next_words` are already populated by
# the per-observation loop in an earlier cell.
#sequences = []
#next_words = []
#for i in range(0, len(wordlist) - seq_length, sequences_step):
#    sequences.append(wordlist[i: i + seq_length])
#    next_words.append(wordlist[i + seq_length])

# Sanity checks: number of training windows, the index of the ' ' token
# (raises KeyError if ' ' is not in `vocab`), and the word stored at index 0.
print('nb sequences: ', len(sequences))
print('vocav( ): ', vocab[' '])
print('words(0): ', vocabulary_inv[0])

nb sequences:  421621
vocav( ):  0
words(0):   


In [31]:
def bidirectional_lstm_model(seq_length, vocab_size):
    """Build and compile the bidirectional LSTM next-word classifier.

    The network takes one-hot input of shape (seq_length, vocab_size) and
    stacks a bidirectional LSTM (ReLU activation), dropout of 0.6 and a dense
    softmax layer over `vocab_size` classes.

    The LSTM width and the Adam learning rate are read from the module-level
    names `rnn_size` and `learning_rate`, which must be defined before this
    function is called.

    Returns the compiled Keras model (categorical cross-entropy loss,
    categorical accuracy metric).
    """
    print('Build LSTM model.')
    model = Sequential()
    model.add(Bidirectional(LSTM(rnn_size, activation="relu"),
                            input_shape=(seq_length, vocab_size)))
    model.add(Dropout(0.6))
    model.add(Dense(vocab_size))
    model.add(Activation('softmax'))

    # Training callbacks are passed to fit() by the caller; the callbacks
    # list previously built here was never used and has been removed.
    optimizer = Adam(lr=learning_rate)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer,
                  metrics=[categorical_accuracy])
    return model

In [32]:
# Model and training hyper-parameters. `rnn_size` and `learning_rate` are read
# as globals by bidirectional_lstm_model.
rnn_size = 256 # size of RNN
batch_size = 64 # minibatch size
# NOTE: `seq_length` and `sequences_step` are also assigned (same values) in
# the parameter cell that precedes sequence building; the sequences were cut
# with those earlier values, so the two cells must be kept in sync.
seq_length = 10 # sequence length
num_epochs = 10 # number of epochs
learning_rate = 0.001 #learning rate
sequences_step = 1 #step to create sequences

In [33]:
# Build the network for the configured sequence length and vocabulary size,
# then print its layer-by-layer summary.
md = bidirectional_lstm_model(seq_length, vocab_size)
md.summary()

Build LSTM model.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection (None, 512)               137426944 
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 66846)             34291998  
_________________________________________________________________
activation_1 (Activation)    (None, 66846)             0         
Total params: 171,718,942
Trainable params: 171,718,942
Non-trainable params: 0
_________________________________________________________________


In [34]:
# One-hot encode a single chunk of the training pairs. Only `chunk_size`
# windows are materialised at a time: each window becomes a
# (seq_length, vocab_size) one-hot matrix.
chunk_size = 1000
n = 0  # index of the chunk to train on
chunk = slice(chunk_size * n, chunk_size * (n + 1))

# Plain `bool` replaces the `np.bool` alias, which recent NumPy releases no
# longer provide.
X = np.zeros((chunk_size, seq_length, vocab_size), dtype=bool)
y = np.zeros((chunk_size, vocab_size), dtype=bool)
# Slice the windows and their targets with the same chunk so they stay
# aligned for every value of `n` (indexing next_words with the position
# inside the chunk is only correct for n == 0).
for i, (sentence, next_word) in enumerate(zip(sequences[chunk], next_words[chunk])):
    for t, word in enumerate(sentence):
        X[i, t, vocab[word]] = 1
    y[i, vocab[next_word]] = 1

# ModelCheckpoint writes into `save_dir`; create it if it is missing.
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)

#fit the model
# Stop after 4 epochs without val_loss improvement; checkpoint every 2 epochs.
callbacks=[EarlyStopping(patience=4, monitor='val_loss'),
           ModelCheckpoint(filepath=save_dir + "/" + 'my_model_gen_sentences_lstm.{epoch:02d}-{val_loss:.2f}.hdf5',\
                           monitor='val_loss', verbose=0, mode='auto', period=2)]
history = md.fit(X, y,
                 batch_size=batch_size,
                 shuffle=True,
                 epochs=num_epochs,
                 callbacks=callbacks,
                 validation_split=0.01)

Train on 990 samples, validate on 10 samples
Epoch 1/10


ResourceExhaustedError: OOM when allocating tensor with shape[66846,1024] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
	 [[{{node training/Adam/gradients/AddN_12/tmp_var}} = TemporaryVariable[dtype=DT_FLOAT, shape=[66846,1024], var_name="training/Adam/gradients/AddN_12/tmp_var", _device="/job:localhost/replica:0/task:0/device:CPU:0"](^training/Adam/gradients/bidirectional_1/forward_lstm_1/strided_slice_3_grad/StridedSliceGrad)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


In [None]:
#save the model
# Writes the trained network `md` to an HDF5 file under `save_dir`.
md.save(save_dir + "/" + 'my_model_gen_sentences_lstm.final.hdf5')

In [None]:
#load vocabulary
print("loading vocabulary...")

# Read the vocabulary from `vocab_file`, the path the preprocessing cell
# pickles it to. No cell writes words_vocab.pkl under `save_dir`, so that
# location is no longer used here and `vocab_file` is left unchanged.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open(vocab_file, 'rb') as f:
    words, vocab, vocabulary_inv = cPickle.load(f)

vocab_size = len(words)

In [None]:
from keras.models import load_model
# load the model
# Reads the HDF5 file written by the `md.save(...)` cell (same path) and
# binds the result to `model`, which the generation loop uses.
print("loading model...")
model = load_model(save_dir + "/" + 'my_model_gen_sentences_lstm.final.hdf5')

In [None]:
def sample(preds, temperature=1.0):
    """Sample an index from a probability array, reshaped by `temperature`.

    Parameters
    ----------
    preds : array-like of float
        Probabilities (e.g. a softmax output).
    temperature : float, default 1.0
        Values below 1 sharpen the distribution, values above 1 flatten it.
        A temperature of exactly 0 returns the most probable index.

    Returns
    -------
    The sampled index (NumPy integer).
    """
    preds = np.asarray(preds).astype('float64')
    if temperature == 0:
        # Limit of the tempered distribution as temperature -> 0: greedy pick
        # (avoids a division by zero).
        return np.argmax(preds)

    # log(0) = -inf is expected for zero-probability entries; silence the
    # warning, those entries simply end up with probability 0 again.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift by the maximum before exponentiating: the normalised result is
    # mathematically unchanged, but a small temperature can no longer make
    # every exponential underflow to 0 (which produced 0/0 = NaN).
    logits -= np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [None]:
#initiate sentences
seed_sentences = "estoy muy cansado para estudiar ."

# Training windows are left-padded with ' ', so the generation context uses
# the same padding token.
pad_token = ' '

# The vocabulary is built from lower-cased tokens, so lower-case the seed too.
seed = seed_sentences.lower().split()

# Right-align the seed in a window of exactly `seq_length` tokens: short seeds
# are left-padded, long seeds keep only their last `seq_length` words.
sentence = ([pad_token] * seq_length + seed)[-seq_length:]

# Start the generated text from the seed words (padding is not shown).
generated = ' '.join(seed)
print('Generating text with the following seed: "' + ' '.join(seed) + '"')

print()

In [None]:
words_number = 10
# Generate `words_number` words, feeding each prediction back into the context.
for _ in range(words_number):
    # One-hot encode the current context window.
    x = np.zeros((1, seq_length, vocab_size))
    token_ids = [vocab[token] for token in sentence]
    x[0, np.arange(len(sentence)), token_ids] = 1.

    # Predict the next-word distribution and draw one word from it.
    preds = model.predict(x, verbose=0)[0]
    next_index = sample(preds, 0.34)
    next_word = vocabulary_inv[next_index]

    # Append the word to the output, then slide the window one token forward.
    generated = generated + " " + next_word
    sentence = sentence[1:] + [next_word]

print(generated)