In [3]:
import numpy as np

from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf

In [4]:
##### Data in list format #####


#Todas as sequências tem o mesmo comprimento
#Vetores de tamanho fixo
data_set = ['Lista', 'Pilha', 'Fila', 'Lista Simplesmente Encadeada', 'Lista Duplamente Encadeada', 'Árvore', '9',
             'Pilha', 'Fila', 'Lista', 'Lista Simplesmente Encadeada', 'Lista Duplamente Encadeada', 'Árvore', '6',
             'Árvore', 'Pilha', 'Fila', 'Lista', 'Lista Simplesmente Encadeada', 'Lista Duplamente Encadeada', '5',
             'Pilha', 'Lista', 'Lista Simplesmente Encadeada', 'Lista Duplamente Encadeada', 'Fila', 'Árvore', '6']

In [5]:
##### Data in list format #####

#itens distintos 
vocab_l = sorted(set(data_set))
print(vocab_l)
print ('{} itens distintos'.format(len(vocab_l)))

['5', '6', '9', 'Fila', 'Lista', 'Lista Duplamente Encadeada', 'Lista Simplesmente Encadeada', 'Pilha', 'Árvore']
9 itens distintos


In [6]:
# Creating a mapping from unique characters to indices

char2idx = {u:i for i, u in enumerate(vocab_l)}
idx2char = np.array(vocab_l)

text_as_int = np.array([char2idx[c] for c in data_set])

print('{')
for char,_ in zip(char2idx, range(20)):
    print('  {:4s}: {:3d},'.format(repr(char), char2idx[char]))
print('  ...\n}')

{
  '5' :   0,
  '6' :   1,
  '9' :   2,
  'Fila':   3,
  'Lista':   4,
  'Lista Duplamente Encadeada':   5,
  'Lista Simplesmente Encadeada':   6,
  'Pilha':   7,
  'Árvore':   8,
  ...
}


In [7]:
##### Create training examples and targets #####

# The maximum length sentence we want for a single input in characters
seq_length = 6
examples_per_epoch = len(data_set)#(seq_length+1)

char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
# for i in char_dataset:
#     print(i)

sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

#print batches encoded
for i in sequences:
    print(i)
    
#print batches as text
for item in sequences.take(5):
    print(repr(''.join(idx2char[item.numpy()])))

tf.Tensor([4 7 3 6 5 8 2], shape=(7,), dtype=int32)
tf.Tensor([7 3 4 6 5 8 1], shape=(7,), dtype=int32)
tf.Tensor([8 7 3 4 6 5 0], shape=(7,), dtype=int32)
tf.Tensor([7 4 6 5 3 8 1], shape=(7,), dtype=int32)
'ListaPilhaFilaLista Simplesmente EncadeadaLista Duplamente EncadeadaÁrvore9'
'PilhaFilaListaLista Simplesmente EncadeadaLista Duplamente EncadeadaÁrvore6'
'ÁrvorePilhaFilaListaLista Simplesmente EncadeadaLista Duplamente Encadeada5'
'PilhaListaLista Simplesmente EncadeadaLista Duplamente EncadeadaFilaÁrvore6'


In [10]:
##### Create training examples and targets #####

def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

#foreach item in sequences apply split_input_target
dataset = sequences.map(split_input_target)

#print input_text and target_text as text
#não necessário, para visualização
for input_example, target_example in  dataset:
    print ('Input data: ', repr(''.join(idx2char[input_example.numpy()])),'\n')
    print ('Target data:', repr(''.join(idx2char[target_example.numpy()])),'\n')

Input data:  'ListaPilhaFilaLista Simplesmente EncadeadaLista Duplamente EncadeadaÁrvore' 

Target data: 'PilhaFilaLista Simplesmente EncadeadaLista Duplamente EncadeadaÁrvore9' 

Input data:  'PilhaFilaListaLista Simplesmente EncadeadaLista Duplamente EncadeadaÁrvore' 

Target data: 'FilaListaLista Simplesmente EncadeadaLista Duplamente EncadeadaÁrvore6' 

Input data:  'ÁrvorePilhaFilaListaLista Simplesmente EncadeadaLista Duplamente Encadeada' 

Target data: 'PilhaFilaListaLista Simplesmente EncadeadaLista Duplamente Encadeada5' 

Input data:  'PilhaListaLista Simplesmente EncadeadaLista Duplamente EncadeadaFilaÁrvore' 

Target data: 'ListaLista Simplesmente EncadeadaLista Duplamente EncadeadaFilaÁrvore6' 



In [11]:
#não necessário, para visualização
for input_example, target_example in  dataset:
    for i, (input_idx, target_idx) in enumerate(zip(input_example, target_example)):
        print("Step {:4d}".format(i))
        print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
        print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))
    print('------------------------------------------------------------------')

Step    0
  input: 4 ('Lista')
  expected output: 7 ('Pilha')
Step    1
  input: 7 ('Pilha')
  expected output: 3 ('Fila')
Step    2
  input: 3 ('Fila')
  expected output: 6 ('Lista Simplesmente Encadeada')
Step    3
  input: 6 ('Lista Simplesmente Encadeada')
  expected output: 5 ('Lista Duplamente Encadeada')
Step    4
  input: 5 ('Lista Duplamente Encadeada')
  expected output: 8 ('Árvore')
Step    5
  input: 8 ('Árvore')
  expected output: 2 ('9')
------------------------------------------------------------------
Step    0
  input: 7 ('Pilha')
  expected output: 3 ('Fila')
Step    1
  input: 3 ('Fila')
  expected output: 4 ('Lista')
Step    2
  input: 4 ('Lista')
  expected output: 6 ('Lista Simplesmente Encadeada')
Step    3
  input: 6 ('Lista Simplesmente Encadeada')
  expected output: 5 ('Lista Duplamente Encadeada')
Step    4
  input: 5 ('Lista Duplamente Encadeada')
  expected output: 8 ('Árvore')
Step    5
  input: 8 ('Árvore')
  expected output: 1 ('6')
---------------------

In [12]:
##### Create training batches #####

# Batch size
BATCH_SIZE = 1

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10

# print(dataset)
# for i in dataset:
#     print(i)

datasetBatch = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
# If your program depends on the batches having the same outer dimension,
# you should set the drop_remainder argument to True to prevent the smaller
# batch from being produced.

print(datasetBatch)
for i in datasetBatch:
    print(i)


#for i in datasetBatch.take(1):
#    for j in i:
#        for a in j:
#            for e in a:
#                print(''.join(label_encoder.inverse_transform([np.argmax(e)])))

<BatchDataset shapes: ((1, 6), (1, 6)), types: (tf.int32, tf.int32)>
(<tf.Tensor: shape=(1, 6), dtype=int32, numpy=array([[8, 7, 3, 4, 6, 5]])>, <tf.Tensor: shape=(1, 6), dtype=int32, numpy=array([[7, 3, 4, 6, 5, 0]])>)
(<tf.Tensor: shape=(1, 6), dtype=int32, numpy=array([[7, 4, 6, 5, 3, 8]])>, <tf.Tensor: shape=(1, 6), dtype=int32, numpy=array([[4, 6, 5, 3, 8, 1]])>)
(<tf.Tensor: shape=(1, 6), dtype=int32, numpy=array([[7, 3, 4, 6, 5, 8]])>, <tf.Tensor: shape=(1, 6), dtype=int32, numpy=array([[3, 4, 6, 5, 8, 1]])>)
(<tf.Tensor: shape=(1, 6), dtype=int32, numpy=array([[4, 7, 3, 6, 5, 8]])>, <tf.Tensor: shape=(1, 6), dtype=int32, numpy=array([[7, 3, 6, 5, 8, 2]])>)


In [20]:
##### Building the model #####

# Length of the vocabulary in chars
vocab_size = len(vocab_l)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim,
                          batch_input_shape=[batch_size, None]),
        tf.keras.layers.LSTM(rnn_units,
                        return_sequences=True,
                        stateful=True,
                        recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(vocab_size)
    ])
    return model

model = build_model(
    vocab_size = len(vocab_l),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE)

In [21]:
 model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            2304      
_________________________________________________________________
lstm_2 (LSTM)                (1, None, 1024)           5246976   
_________________________________________________________________
dense_2 (Dense)              (1, None, 9)              9225      
Total params: 5,258,505
Trainable params: 5,258,505
Non-trainable params: 0
_________________________________________________________________


In [22]:
##### Testing the model #####

for input_example_batch, target_example_batch in datasetBatch:
    print('Input:',input_example_batch)
    print()
    
    print('Target:',target_example_batch)
    print()
    
    example_batch_predictions = model(input_example_batch)
    print('Batche Predictions:',example_batch_predictions, "# (batch_size, sequence_length, vocab_size)")
    print()
    
    sampled_indices = tf.random.categorical(example_batch_predictions, num_samples=1)
    sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()
    print('Sample Indices:',sampled_indices)
    print('--------------------------------------------------------')
 

Input: tf.Tensor([[8 7 3 4 6 5]], shape=(1, 6), dtype=int32)

Target: tf.Tensor([[7 3 4 6 5 0]], shape=(1, 6), dtype=int32)

Batche Predictions: tf.Tensor(
[[[-0.00162221 -0.00026628 -0.00346668 -0.00449064 -0.00406137
    0.00018725 -0.00035133 -0.00317207  0.00271029]
  [ 0.00336596 -0.00823973 -0.00210959  0.00280748 -0.00663296
    0.00014146 -0.00516854 -0.00624472 -0.0050321 ]
  [ 0.00165804 -0.00247024 -0.00479421  0.00583282 -0.00332201
    0.0028725  -0.00632217 -0.00103183  0.00545756]
  [ 0.00139558 -0.00020239 -0.00824285  0.00476362  0.00167502
   -0.00103394 -0.00478953 -0.00083874  0.00717098]
  [ 0.00136414  0.00578063  0.00127151  0.01175594  0.00074629
   -0.00318136 -0.00796716 -0.00792722  0.00146125]
  [ 0.00253247  0.00446041  0.00039789  0.01307701  0.00324432
   -0.00328533 -0.00587091 -0.00195501  0.00698574]]], shape=(1, 6, 9), dtype=float32) # (batch_size, sequence_length, vocab_size)



InvalidArgumentError: logits should be a matrix, got shape [1,6,9] [Op:Multinomial]

In [84]:
##### Try prediction for the first example in the batch #####

#print(example_batch_predictions)
#sampled_indices = tf.random.categorical(example_batch_predictions, num_samples=1)
#sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()
#print(sampled_indices)

tf.Tensor(
[[ 0.01447803 -0.00242894 -0.01396178  0.02150395  0.01190736 -0.00065227
   0.01840041 -0.00205957  0.00655539]], shape=(1, 9), dtype=float32)
[8]
