# Lab: Using Recurrent Neural Network (RNN) to process the text


In [1]:
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

import tensorflow as tf
import numpy as np
# import unidecode

In [2]:
# Install the package for running tensorboard on google colaboration
try:
    import google.colab
    IN_COLAB = True
except:
    IN_COLAB = False

print ("Running in Google COLAB : ", IN_COLAB)


# if IN_COLAB == True:
#     from tensorboardcolab import *
#     !pip install -U tensorboardcolab
# # Load the TensorBoard notebook extension
#     %load_ext tensorboard

Running in Google COLAB :  False


## Step 1: Load Data

In [3]:
# You can used your own dataset with english text
data_location = "/data/text/state-of-the-unions/2009-Obama.txt"
# data_location = 'https://elephantscale-public.s3.amazonaws.com/data/text/state-of-the-unions/2009-Obama.txt'


with open( data_location, "r") as f:
    text = f.read()

In [4]:
## If the above fails, try this

# !wget 'https://elephantscale-public.s3.amazonaws.com/data/text/state-of-the-unions/2009-Obama.txt'

# with open('2009-Obama.txt' , "r") as f:
#     text = f.read()

## Step 2 : Explore Data

In [5]:
print(len(text))

print(text[:1000])

33794
Madame Speaker, Mr. Vice President, Members of Congress, and the First Lady of
the United States:

I've come here tonight not only to address the distinguished men and women in
this great chamber, but to speak frankly and directly to the men and women who
sent us here.

I know that for many Americans watching right now, the state of our economy is
a concern that rises above all others.  And rightly so.  If you haven't been
personally affected by this recession, you probably know someone who has -- a
friend; a neighbor; a member of your family.  You don't need to hear another
list of statistics to know that our economy is in crisis, because you live it
every day.  It's the worry you wake up with and the source of sleepless
nights.  It's the job you thought you'd retire from but now have lost; the
business you built your dreams upon that's now hanging by a thread; the
college acceptance letter your child had to put back in the envelope.  The
impact of this recession is real, and it

## Step 3 : Shape Data

### 3.1 - Remove character and create vocabulary

<img src="../assets/images/rnn_vocab.png">

In [6]:
import unidecode 

text = unidecode.unidecode(text)
text = text.lower()

text = text.replace("2", "")
text = text.replace("1", "")
text = text.replace("8", "")
text = text.replace("5", "")
text = text.replace(">", "")
text = text.replace("<", "")
text = text.replace("!", "")
text = text.replace("?", "")
text = text.replace("-", "")
text = text.replace("$", "")

text = text.strip()

vocab = set(text)
print(len(vocab), vocab)

print(text[:1000])

41 {'a', 'p', 'i', ':', 'y', '0', ';', '3', 'n', 'k', 'd', 'e', '\n', 'h', '.', '9', 'o', 't', ' ', "'", 'x', 'q', '%', 'w', 'b', 'r', 'l', ',', 'g', 'c', 'u', 'v', '7', '/', 'm', '6', 'j', 's', '"', 'z', 'f'}
madame speaker, mr. vice president, members of congress, and the first lady of
the united states:

i've come here tonight not only to address the distinguished men and women in
this great chamber, but to speak frankly and directly to the men and women who
sent us here.

i know that for many americans watching right now, the state of our economy is
a concern that rises above all others.  and rightly so.  if you haven't been
personally affected by this recession, you probably know someone who has  a
friend; a neighbor; a member of your family.  you don't need to hear another
list of statistics to know that our economy is in crisis, because you live it
every day.  it's the worry you wake up with and the source of sleepless
nights.  it's the job you thought you'd retire from but now 

### 3.2 - Map each letter to int

In [7]:
vocab_size = len(vocab)

vocab_to_int = {l:i for i,l in enumerate(vocab)}
int_to_vocab = {i:l for i,l in enumerate(vocab)}

print("vocab_to_int", vocab_to_int)
print()
print("int_to_vocab", int_to_vocab)

print("\nint for e:", vocab_to_int["e"])
int_for_e = vocab_to_int["e"]
print("letter for %s: %s" % (vocab_to_int["e"], int_to_vocab[int_for_e]))

vocab_to_int {'a': 0, 'p': 1, 'i': 2, ':': 3, 'y': 4, '0': 5, ';': 6, '3': 7, 'n': 8, 'k': 9, 'd': 10, 'e': 11, '\n': 12, 'h': 13, '.': 14, '9': 15, 'o': 16, 't': 17, ' ': 18, "'": 19, 'x': 20, 'q': 21, '%': 22, 'w': 23, 'b': 24, 'r': 25, 'l': 26, ',': 27, 'g': 28, 'c': 29, 'u': 30, 'v': 31, '7': 32, '/': 33, 'm': 34, '6': 35, 'j': 36, 's': 37, '"': 38, 'z': 39, 'f': 40}

int_to_vocab {0: 'a', 1: 'p', 2: 'i', 3: ':', 4: 'y', 5: '0', 6: ';', 7: '3', 8: 'n', 9: 'k', 10: 'd', 11: 'e', 12: '\n', 13: 'h', 14: '.', 15: '9', 16: 'o', 17: 't', 18: ' ', 19: "'", 20: 'x', 21: 'q', 22: '%', 23: 'w', 24: 'b', 25: 'r', 26: 'l', 27: ',', 28: 'g', 29: 'c', 30: 'u', 31: 'v', 32: '7', 33: '/', 34: 'm', 35: '6', 36: 'j', 37: 's', 38: '"', 39: 'z', 40: 'f'}

int for e: 11
letter for 11: e


In [8]:
encoded = [vocab_to_int[l] for l in text]
encoded_sentence = encoded[:100]

print(encoded_sentence)

[34, 0, 10, 0, 34, 11, 18, 37, 1, 11, 0, 9, 11, 25, 27, 18, 34, 25, 14, 18, 31, 2, 29, 11, 18, 1, 25, 11, 37, 2, 10, 11, 8, 17, 27, 18, 34, 11, 34, 24, 11, 25, 37, 18, 16, 40, 18, 29, 16, 8, 28, 25, 11, 37, 37, 27, 18, 0, 8, 10, 18, 17, 13, 11, 18, 40, 2, 25, 37, 17, 18, 26, 0, 10, 4, 18, 16, 40, 12, 17, 13, 11, 18, 30, 8, 2, 17, 11, 10, 18, 37, 17, 0, 17, 11, 37, 3, 12, 12, 2]


In [9]:
decoded_sentence = [int_to_vocab[i] for i in encoded_sentence]
print(decoded_sentence)

['m', 'a', 'd', 'a', 'm', 'e', ' ', 's', 'p', 'e', 'a', 'k', 'e', 'r', ',', ' ', 'm', 'r', '.', ' ', 'v', 'i', 'c', 'e', ' ', 'p', 'r', 'e', 's', 'i', 'd', 'e', 'n', 't', ',', ' ', 'm', 'e', 'm', 'b', 'e', 'r', 's', ' ', 'o', 'f', ' ', 'c', 'o', 'n', 'g', 'r', 'e', 's', 's', ',', ' ', 'a', 'n', 'd', ' ', 't', 'h', 'e', ' ', 'f', 'i', 'r', 's', 't', ' ', 'l', 'a', 'd', 'y', ' ', 'o', 'f', '\n', 't', 'h', 'e', ' ', 'u', 'n', 'i', 't', 'e', 'd', ' ', 's', 't', 'a', 't', 'e', 's', ':', '\n', '\n', 'i']


In [10]:
decoded_sentence = "".join(decoded_sentence)
print(decoded_sentence)

madame speaker, mr. vice president, members of congress, and the first lady of
the united states:

i


## 3.3 - Genrate batch

### Sample of one batch

<img src="../assets/images/rnn_letter.png">

In [11]:
inputs, targets = encoded, encoded[1:]

print("Inputs", inputs[:10])
print("Targets", targets[:10])

Inputs [34, 0, 10, 0, 34, 11, 18, 37, 1, 11]
Targets [0, 10, 0, 34, 11, 18, 37, 1, 11, 0]


### Method used to generate batch in sequence order

In [12]:
def gen_batch(inputs, targets, seq_len, batch_size, noise=0):
    # Size of each chunk
    chuck_size = (len(inputs) -1)  // batch_size
    # Numbef of sequence per chunk
    sequences_per_chunk = chuck_size // seq_len

    for s in range(0, sequences_per_chunk):
        batch_inputs = np.zeros((batch_size, seq_len))
        batch_targets = np.zeros((batch_size, seq_len))
        for b in range(0, batch_size):
            fr = (b*chuck_size)+(s*seq_len)
            to = fr+seq_len
            batch_inputs[b] = inputs[fr:to]
            batch_targets[b] = inputs[fr+1:to+1]
            
            if noise > 0:
                noise_indices = np.random.choice(seq_len, noise)
                batch_inputs[b][noise_indices] = np.random.randint(0, vocab_size)
            
        yield batch_inputs, batch_targets

for batch_inputs, batch_targets in gen_batch(inputs, targets, 5, 32, noise=0):
    print(batch_inputs[0], batch_targets[0])
    break

for batch_inputs, batch_targets in gen_batch(inputs, targets, 5, 32, noise=3):
    print(batch_inputs[0], batch_targets[0])
    break

[34.  0. 10.  0. 34.] [ 0. 10.  0. 34. 11.]
[34. 16. 10.  0. 16.] [ 0. 10.  0. 34. 11.]


## Step 4: Create model
### 4.1 - Create your own layer

In [13]:
class OneHot(tf.keras.layers.Layer):
    def __init__(self, depth, **kwargs):
        super(OneHot, self).__init__(**kwargs)
        self.depth = depth

    def call(self, x, mask=None):
        return tf.one_hot(tf.cast(x, tf.int32), self.depth)

Test if the layer works well

In [14]:
class RnnModel(tf.keras.Model):

    def __init__(self, vocab_size):
        super(RnnModel, self).__init__()
        # Convolutions
        self.one_hot = OneHot(len(vocab))

    def call(self, inputs):
        output = self.one_hot(inputs)
        return output

batch_inputs, batch_targets = next(gen_batch(inputs, targets, 50, 32))

print(batch_inputs.shape)

model = RnnModel(len(vocab))
output = model.predict(batch_inputs)

print(output.shape)

#print(output)

print("Input letter is:", batch_inputs[0][0])
print("One hot representation of the letter", output[0][0])

#assert(output[int(batch_inputs[0][0])]==1)

(32, 50)
(32, 50, 41)
Input letter is: 34.0
One hot representation of the letter [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]


### 4.2 - Setup the model

<img src="../assets/images/architecture_rnn.png">

In [15]:
vocab_size = len(vocab)

### Creat the layers

# Set the input of the model
tf_inputs = tf.keras.Input(shape=(None,), batch_size=64)
# Convert each value of the  input into a one encoding vector
one_hot = OneHot(len(vocab))(tf_inputs)
# Stack LSTM cells
rnn_layer1 = tf.keras.layers.LSTM(128, return_sequences=True, stateful=True)(one_hot)
rnn_layer2 = tf.keras.layers.LSTM(128, return_sequences=True, stateful=True)(rnn_layer1)
# Create the outputs of the model
hidden_layer = tf.keras.layers.Dense(128, activation="relu")(rnn_layer2)
outputs = tf.keras.layers.Dense(vocab_size, activation="softmax")(hidden_layer)

### Setup the model
model = tf.keras.Model(inputs=tf_inputs, outputs=outputs)



### 4.3 - Check if we can reset the RNN cells

In [16]:
# Star by resetting the cells of the RNN
model.reset_states()

# Get one batch
batch_inputs, batch_targets = next(gen_batch(inputs, targets, 50, 64))

# Make a first prediction
outputs = model.predict(batch_inputs)
first_prediction = outputs[0][0]

# Reset the states of the RNN states
model.reset_states()

# Make an other prediction to check the difference
outputs = model.predict(batch_inputs)
second_prediction = outputs[0][0]

# Check if both prediction are equal
assert(set(first_prediction)==set(second_prediction))

### 4.4 - Set the loss and objectives

In [17]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam(lr=0.001)

### 4.5 - Set some metrics to track the progress of the training

In [18]:
# Loss
train_loss = tf.keras.metrics.Mean(name='train_loss')
# Accuracy
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

### 4.6 - Set the train method and the predict method in graph mode

In [19]:
@tf.function
def train_step(inputs, targets):
    with tf.GradientTape() as tape:
        # Make a prediction on all the batch
        predictions = model(inputs)
        # Get the error/loss on these predictions
        loss = loss_object(targets, predictions)
    # Compute the gradient which respect to the loss
    gradients = tape.gradient(loss, model.trainable_variables)
    # Change the weights of the model
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    # The metrics are accumulate over time. You don't need to average it yourself.
    train_loss(loss)
    train_accuracy(targets, predictions)

@tf.function
def predict(inputs):
    # Make a prediction on all the batch
    predictions = model(inputs)
    return predictions

### Step 6: Train the model

In [20]:
model.reset_states()

for epoch in range(4000):
    for batch_inputs, batch_targets in gen_batch(inputs, targets, 100, 64, noise=13):
        train_step(batch_inputs, batch_targets)
    template = '\r Epoch {}, Train Loss: {}, Train Accuracy: {}'
    print(template.format(epoch, train_loss.result(), train_accuracy.result()*100), end="")
    model.reset_states()



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
 Epoch 3999, Train Loss: 0.7701197266578674, Train Accuracy: 77.640396118164065

### Source:
https://github.com/thibo73800/tensorflow2.0-examples/blob/master/RNN%20-%20Text%20Generator.ipynb