In [1]:
import numpy as np

from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf

In [2]:
##### Data in list format #####

# Four sequences of identical, fixed length (seven items each): six
# data-structure names followed by one numeric string.
_sequence_rows = [
    ['Lista', 'Pilha', 'Fila', 'Lista Simplesmente Encadeada', 'Lista Duplamente Encadeada', 'Árvore', '9'],
    ['Pilha', 'Fila', 'Lista', 'Lista Simplesmente Encadeada', 'Lista Duplamente Encadeada', 'Árvore', '6'],
    ['Árvore', 'Pilha', 'Fila', 'Lista', 'Lista Simplesmente Encadeada', 'Lista Duplamente Encadeada', '5'],
    ['Pilha', 'Lista', 'Lista Simplesmente Encadeada', 'Lista Duplamente Encadeada', 'Fila', 'Árvore', '6'],
]

# Flatten row by row into the single flat list used by the following cells.
data_set = [item for row in _sequence_rows for item in row]

In [3]:
##### Vocabulary of the data set #####

# Distinct items of data_set, in sorted order.
vocab_l = list(set(data_set))
vocab_l.sort()
print(vocab_l)

# Report how many distinct items were found.
print('{} itens distintos'.format(len(vocab_l)))

['5', '6', '9', 'Fila', 'Lista', 'Lista Duplamente Encadeada', 'Lista Simplesmente Encadeada', 'Pilha', 'Árvore']
9 itens distintos


In [4]:
# Build lookup tables between vocabulary items and integer indices.
# Note: each entry is a whole item of data_set (e.g. 'Lista'), not a single
# character, despite the "char" in the variable names.

# item -> index, following the order of vocab_l
char2idx = {u: i for i, u in enumerate(vocab_l)}
# index -> item
idx2char = np.array(vocab_l)

# data_set re-expressed as an array of indices
text_as_int = np.array([char2idx[c] for c in data_set])

# Preview at most this many entries of the mapping.
MAX_PREVIEW = 20

print('{')
for char in list(char2idx)[:MAX_PREVIEW]:
    print('  {:4s}: {:3d},'.format(repr(char), char2idx[char]))
# Show the ellipsis only when entries were actually left out; printing it
# unconditionally suggested a truncation that did not happen.
if len(char2idx) > MAX_PREVIEW:
    print('  ...')
print('}')

{
  '5' :   0,
  '6' :   1,
  '9' :   2,
  'Fila':   3,
  'Lista':   4,
  'Lista Duplamente Encadeada':   5,
  'Lista Simplesmente Encadeada':   6,
  'Pilha':   7,
  'Árvore':   8,
  ...
}


In [5]:
# Integer-encode the sequence items (item -> class index)

values = np.asarray(data_set)
print(values)

# Learn the set of classes first, then map every item to its class index.
label_encoder = LabelEncoder().fit(values)
integer_encoded = label_encoder.transform(values)
print(integer_encoded)

['Lista' 'Pilha' 'Fila' 'Lista Simplesmente Encadeada'
 'Lista Duplamente Encadeada' 'Árvore' '9' 'Pilha' 'Fila' 'Lista'
 'Lista Simplesmente Encadeada' 'Lista Duplamente Encadeada' 'Árvore' '6'
 'Árvore' 'Pilha' 'Fila' 'Lista' 'Lista Simplesmente Encadeada'
 'Lista Duplamente Encadeada' '5' 'Pilha' 'Lista'
 'Lista Simplesmente Encadeada' 'Lista Duplamente Encadeada' 'Fila'
 'Árvore' '6']
[4 7 3 6 5 8 2 7 3 4 6 5 8 1 8 7 3 4 6 5 0 7 4 6 5 3 8 1]


In [6]:
# One-hot encode the integer-encoded sequence

# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old name. Try the current keyword first and fall back to the
# legacy one so the cell runs on both old and new releases.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)

# The encoder expects a 2-D column of shape (n_samples, 1).
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print(onehot_encoded)

[[0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0.]]


In [7]:
# Undo the one-hot encoding: recover the original item of every row

# The position of the 1 in each row is the class index of that row.
class_indices = np.argmax(onehot_encoded, axis=1)
for class_index in class_indices:
    inverted = label_encoder.inverse_transform([class_index])
    print(inverted)

['Lista']
['Pilha']
['Fila']
['Lista Simplesmente Encadeada']
['Lista Duplamente Encadeada']
['Árvore']
['9']
['Pilha']
['Fila']
['Lista']
['Lista Simplesmente Encadeada']
['Lista Duplamente Encadeada']
['Árvore']
['6']
['Árvore']
['Pilha']
['Fila']
['Lista']
['Lista Simplesmente Encadeada']
['Lista Duplamente Encadeada']
['5']
['Pilha']
['Lista']
['Lista Simplesmente Encadeada']
['Lista Duplamente Encadeada']
['Fila']
['Árvore']
['6']


In [8]:
##### Create training examples and targets #####

# Number of items in a single model input. (Each element is a whole item of
# data_set, not a character.)
seq_length = 6
# Number of complete (seq_length + 1)-item chunks in the data. The line used
# to read `len(data_set)#(seq_length+1)`: the `#` turned the divisor into a
# comment, so the value was the raw item count instead of the chunk count.
examples_per_epoch = len(data_set) // (seq_length + 1)

# One dataset element per one-hot encoded row.
char_dataset = tf.data.Dataset.from_tensor_slices(onehot_encoded)

# Group consecutive rows into chunks of seq_length + 1 rows; an incomplete
# trailing chunk is dropped.
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)

# Print the chunks in their encoded form
for chunk in sequences:
    print(chunk)

# Print the chunks decoded back to text, one item per line
for chunk in sequences:
    for row in chunk:
        print(''.join(label_encoder.inverse_transform([np.argmax(row)])))

tf.Tensor(
[[0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0.]], shape=(7, 9), dtype=float64)
tf.Tensor(
[[0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0.]], shape=(7, 9), dtype=float64)
tf.Tensor(
[[0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0.]], shape=(7, 9), dtype=float64)
tf.Tensor(
[[0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0.]], shape=(7, 9), dtype=f

In [37]:
##### Create training examples and targets #####

def split_input_target(chunk):
    """Split a chunk into an (input, target) pair shifted by one position.

    The input is every element except the last and the target is every
    element except the first, so target[i] is the element that follows
    input[i] in the chunk.
    """
    return chunk[:-1], chunk[1:]

# Apply split_input_target to every chunk in sequences
dataset = sequences.map(split_input_target)


def _print_items(one_hot_rows):
    """Print the decoded item of each one-hot row, one per line."""
    for row in one_hot_rows:
        print(''.join(label_encoder.inverse_transform([np.argmax(row)])))


# Show every input/target pair as text (visualization only, not required)
for input_example, target_example in dataset:
    print('-----Input-----')
    _print_items(input_example)
    print('-----Target-----')
    _print_items(target_example)

-----Input-----
Lista
Pilha
Fila
Lista Simplesmente Encadeada
Lista Duplamente Encadeada
Árvore
-----Target-----
Pilha
Fila
Lista Simplesmente Encadeada
Lista Duplamente Encadeada
Árvore
9
-----Input-----
Pilha
Fila
Lista
Lista Simplesmente Encadeada
Lista Duplamente Encadeada
Árvore
-----Target-----
Fila
Lista
Lista Simplesmente Encadeada
Lista Duplamente Encadeada
Árvore
6
-----Input-----
Árvore
Pilha
Fila
Lista
Lista Simplesmente Encadeada
Lista Duplamente Encadeada
-----Target-----
Pilha
Fila
Lista
Lista Simplesmente Encadeada
Lista Duplamente Encadeada
5
-----Input-----
Pilha
Lista
Lista Simplesmente Encadeada
Lista Duplamente Encadeada
Fila
Árvore
-----Target-----
Lista
Lista Simplesmente Encadeada
Lista Duplamente Encadeada
Fila
Árvore
6


In [44]:
# Visualization only (not required): walk through each example step by step,
# showing the input row next to the row the model is expected to produce.
for input_example, target_example in dataset:
    step = 0
    for input_idx, target_idx in zip(input_example, target_example):
        # Decode both one-hot rows back to their item names.
        input_label = str(label_encoder.inverse_transform([np.argmax(input_idx)]))
        target_label = str(label_encoder.inverse_transform([np.argmax(target_idx)]))
        print("Step {:4d}".format(step))
        print("  input: {} ({:s})".format(input_idx, input_label))
        print("  expected output: {} ({:s})".format(target_idx, target_label))
        step += 1
    print('------------------------------------------------------------------')

Step    0
  input: [0. 0. 0. 0. 1. 0. 0. 0. 0.] (['Lista'])
  expected output: [0. 0. 0. 0. 0. 0. 0. 1. 0.] (['Pilha'])
Step    1
  input: [0. 0. 0. 0. 0. 0. 0. 1. 0.] (['Pilha'])
  expected output: [0. 0. 0. 1. 0. 0. 0. 0. 0.] (['Fila'])
Step    2
  input: [0. 0. 0. 1. 0. 0. 0. 0. 0.] (['Fila'])
  expected output: [0. 0. 0. 0. 0. 0. 1. 0. 0.] (['Lista Simplesmente Encadeada'])
Step    3
  input: [0. 0. 0. 0. 0. 0. 1. 0. 0.] (['Lista Simplesmente Encadeada'])
  expected output: [0. 0. 0. 0. 0. 1. 0. 0. 0.] (['Lista Duplamente Encadeada'])
Step    4
  input: [0. 0. 0. 0. 0. 1. 0. 0. 0.] (['Lista Duplamente Encadeada'])
  expected output: [0. 0. 0. 0. 0. 0. 0. 0. 1.] (['Árvore'])
Step    5
  input: [0. 0. 0. 0. 0. 0. 0. 0. 1.] (['Árvore'])
  expected output: [0. 0. 1. 0. 0. 0. 0. 0. 0.] (['9'])
------------------------------------------------------------------
Step    0
  input: [0. 0. 0. 0. 0. 0. 0. 1. 0.] (['Pilha'])
  expected output: [0. 0. 0. 1. 0. 0. 0. 0. 0.] (['Fila'])
Step    1


In [58]:
##### Create training batches #####

# Number of examples per batch
BATCH_SIZE = 1

# Size of the shuffle buffer. tf.data is designed to work with possibly
# infinite sequences, so it does not shuffle the whole dataset in memory;
# it shuffles elements inside a buffer of this size instead.
BUFFER_SIZE = 10

# Shuffle first, then group into batches. drop_remainder=True discards a
# final smaller batch so that every batch has the same outer dimension.
shuffled = dataset.shuffle(BUFFER_SIZE)
datasetBatch = shuffled.batch(BATCH_SIZE, drop_remainder=True)

# Show the dataset description followed by every batch.
print(datasetBatch)
for batch in datasetBatch:
    print(batch)

<BatchDataset shapes: ((1, 6, 9), (1, 6, 9)), types: (tf.float64, tf.float64)>
(<tf.Tensor: shape=(1, 6, 9), dtype=float64, numpy=
array([[[0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1.]]])>, <tf.Tensor: shape=(1, 6, 9), dtype=float64, numpy=
array([[[0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0.]]])>)
(<tf.Tensor: shape=(1, 6, 9), dtype=float64, numpy=
array([[[0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0

In [81]:
##### Building the model #####

# Number of distinct items (width of each one-hot vector)
vocab_size = len(vocab_l)

# Number of RNN units
rnn_units = 1024


def build_model(vocab_size, rnn_units, batch_size=1, seq_length=6,
                return_sequences=False):
    """Build an LSTM followed by a Dense layer that emits vocab_size logits.

    Args:
        vocab_size: Width of each one-hot input vector and number of output
            logits.
        rnn_units: Number of LSTM units.
        batch_size: Fixed batch dimension of the input. Defaults to 1.
        seq_length: Number of time steps per input. Defaults to 6.
        return_sequences: Forwarded to the LSTM layer. With the default
            (False) the LSTM returns only its last output, so the model
            produces one logit vector per sequence; True produces one logit
            vector per time step.

    Returns:
        An uncompiled tf.keras.Sequential model.
    """
    model = tf.keras.Sequential([
        tf.keras.layers.LSTM(
            rnn_units,
            return_sequences=return_sequences,
            # Formerly hard-coded as (1, 6, 9); derived from the arguments so
            # the model follows the vocabulary and sequence length in use.
            batch_input_shape=(batch_size, seq_length, vocab_size)),
        tf.keras.layers.Dense(vocab_size)
    ])
    return model


# NOTE(review): the batched dataset carries one target row per time step,
# while the default return_sequences=False yields a single output per
# sequence — confirm which is intended before training.
model = build_model(
    vocab_size=vocab_size,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE,
    seq_length=seq_length)

In [82]:
 # Print the model's layers with their output shapes and parameter counts.
 model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_6 (LSTM)                (1, 1024)                 4235264   
_________________________________________________________________
dense_7 (Dense)              (1, 9)                    9225      
Total params: 4,244,489
Trainable params: 4,244,489
Non-trainable params: 0
_________________________________________________________________


In [90]:
##### Testing the model #####

# Run the model on every batch and sample one class index from the logits it
# returns.
for input_example_batch, target_example_batch in datasetBatch:
    print('Input:', input_example_batch)
    print()

    print('Target:', target_example_batch)
    print()

    example_batch_predictions = model(input_example_batch)
    # Print the real shape of the predictions. The previous fixed label said
    # "(batch_size, sequence_length, vocab_size)", which did not match the
    # 2-D output this model produces.
    print('Batche Predictions:', example_batch_predictions,
          '# shape: {}'.format(example_batch_predictions.shape))
    print()

    # tf.random.categorical requires 2-D logits. Collapsing every leading
    # dimension leaves a 2-D prediction unchanged and also accepts a
    # per-time-step (3-D) prediction.
    logits_2d = tf.reshape(
        example_batch_predictions,
        [-1, example_batch_predictions.shape[-1]])
    sampled_indices = tf.random.categorical(logits_2d, num_samples=1)
    sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()
    print('Sample Indices:', sampled_indices)
    print('--------------------------------------------------------')
 

Input: tf.Tensor(
[[[0. 0. 0. 0. 0. 0. 0. 1. 0.]
  [0. 0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 1.]]], shape=(1, 6, 9), dtype=float64)

Target: tf.Tensor(
[[[0. 0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 1. 0. 0. 0. 0. 0. 0. 0.]]], shape=(1, 6, 9), dtype=float64)

Batche Predictions: tf.Tensor(
[[ 0.01813579 -0.0004243  -0.00978522  0.0186614   0.01369935 -0.00464518
   0.01937051 -0.00422775 -0.00045989]], shape=(1, 9), dtype=float32) # (batch_size, sequence_length, vocab_size)

Sample Indices: [8]
--------------------------------------------------------
Input: tf.Tensor(
[[[0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 0. 1. 0.]
  [0. 0. 0. 1. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0. 0.]]], shape=(1, 6, 9)

In [84]:
##### Try prediction for the first example in the batch #####

#print(example_batch_predictions)
#sampled_indices = tf.random.categorical(example_batch_predictions, num_samples=1)
#sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()
#print(sampled_indices)

tf.Tensor(
[[ 0.01447803 -0.00242894 -0.01396178  0.02150395  0.01190736 -0.00065227
   0.01840041 -0.00205957  0.00655539]], shape=(1, 9), dtype=float32)
[8]
