# Lab: Using Recurrent Neural Network (RNN) to process the text


In [111]:
## Determine if we are running on google colab

try:
    import google.colab
    RUNNING_IN_COLAB = True
except:
    RUNNING_IN_COLAB = False

print ("Running in Google COLAB : ", RUNNING_IN_COLAB)


Running in Google COLAB :  False


In [112]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

import tensorflow as tf
from tensorflow import keras

## Step 1: Load Data

In [113]:
# You can used your own dataset with english text
data_location = "../data/text/state-of-the-unions/2009-Obama.txt"

if RUNNING_IN_COLAB:
    data_location = 'https://elephantscale-public.s3.amazonaws.com/data/text/state-of-the-unions/2009-Obama.txt'
    !wget $data_location  -O '2009-Obama.txt'
    data_location = '2009-Obama.txt'

    
with open( data_location, "r") as f:
    text = f.read()

## Step 2 : Explore Data

In [114]:
print ('type(text)', type(text))
print("len(text) : ", len(text))
print('---- text[:20]---')
print(text[:20])
print('-------')
print('----text[:1000]---')
print(text[:1000])
print('-------')

type(text) <class 'str'>
len(text) :  33794
---- text[:20]---
Madame Speaker, Mr. 
-------
----text[:1000]---
Madame Speaker, Mr. Vice President, Members of Congress, and the First Lady of
the United States:

I've come here tonight not only to address the distinguished men and women in
this great chamber, but to speak frankly and directly to the men and women who
sent us here.

I know that for many Americans watching right now, the state of our economy is
a concern that rises above all others.  And rightly so.  If you haven't been
personally affected by this recession, you probably know someone who has -- a
friend; a neighbor; a member of your family.  You don't need to hear another
list of statistics to know that our economy is in crisis, because you live it
every day.  It's the worry you wake up with and the source of sleepless
nights.  It's the job you thought you'd retire from but now have lost; the
business you built your dreams upon that's now hanging by a thread; the
college acc

## Step 3 : Shape Data

### 3.1 - Remove character and create vocabulary

<img src="../assets/images/rnn_vocab.png">

In [115]:
import unidecode 
import re

text = unidecode.unidecode(text)
text = text.lower()

text = re.sub(r'\d', '', text)  # replace numbers
text = re.sub(r'[^0-9a-zA-Z\ \-]+', '', text)
text = re.sub(r'[,:\n]+', '', text)
text = text.strip()

vocab = set(text)
print("vocab len : ", len(vocab))
print ("vocab : ", vocab)

print()
print("text: ", text[:100])

vocab len :  28
vocab :  {'p', 'l', 't', 'w', 'b', 'i', 'm', 'o', 'k', 'f', 'y', 'v', '-', 'j', ' ', 's', 'x', 'a', 'c', 'd', 'h', 'g', 'q', 'n', 'r', 'z', 'e', 'u'}

text:  madame speaker mr vice president members of congress and the first lady ofthe united statesive come 


In [116]:
# import unidecode 
# import re

# text = unidecode.unidecode(text)
# text = text.lower()

# text = re.sub('\d', '', text)  # replace numbers
# text = text.replace(">", "")
# text = text.replace("<", "")
# text = text.replace("!", "")
# text = text.replace("?", "")
# text = text.replace("-", "")
# text = text.replace("$", "")
# text = text.replace("%", "")

# # text = text.replace("'", "")
# text = text.replace(";", "")
# text = text.replace('"', "")
# # text = text.replace(',', "")
# text = text.replace('\n', "")
# text = text.replace('/', "")

# text = text.strip()

# vocab = set(text)
# print("vocab len : ", len(vocab), ", vocab : ", vocab)
# print()
# print(text[:1000])

### 3.2 - Map each letter to int

In [117]:
import pprint

vocab_size = len(vocab)

vocab_to_int = {l:i for i,l in enumerate(vocab)}
int_to_vocab = {i:l for i,l in enumerate(vocab)}

print("vocab_to_int: \n", pprint.pformat(vocab_to_int))
print()
print("int_to_vocab: \n", pprint.pformat(int_to_vocab))

print("\nint for e:", vocab_to_int["e"])
int_for_e = vocab_to_int["e"]
print("letter for %s: %s" % (vocab_to_int["e"], int_to_vocab[int_for_e]))

vocab_to_int: 
 {' ': 14,
 '-': 12,
 'a': 17,
 'b': 4,
 'c': 18,
 'd': 19,
 'e': 26,
 'f': 9,
 'g': 21,
 'h': 20,
 'i': 5,
 'j': 13,
 'k': 8,
 'l': 1,
 'm': 6,
 'n': 23,
 'o': 7,
 'p': 0,
 'q': 22,
 'r': 24,
 's': 15,
 't': 2,
 'u': 27,
 'v': 11,
 'w': 3,
 'x': 16,
 'y': 10,
 'z': 25}

int_to_vocab: 
 {0: 'p',
 1: 'l',
 2: 't',
 3: 'w',
 4: 'b',
 5: 'i',
 6: 'm',
 7: 'o',
 8: 'k',
 9: 'f',
 10: 'y',
 11: 'v',
 12: '-',
 13: 'j',
 14: ' ',
 15: 's',
 16: 'x',
 17: 'a',
 18: 'c',
 19: 'd',
 20: 'h',
 21: 'g',
 22: 'q',
 23: 'n',
 24: 'r',
 25: 'z',
 26: 'e',
 27: 'u'}

int for e: 26
letter for 26: e


In [118]:
def encode_text(t):
    return [vocab_to_int[l] for l in t]
    
def decode_text(encoded):
    return [int_to_vocab[i] for i in encoded_sentence]

In [119]:
# encoded = [vocab_to_int[l] for l in text]
encoded = encode_text(text)
encoded_sentence = encoded[:100]

print ("original text : ", text[:100])
print()
print("encoded sentence : ", encoded_sentence)

original text :  madame speaker mr vice president members of congress and the first lady ofthe united statesive come 

encoded sentence :  [6, 17, 19, 17, 6, 26, 14, 15, 0, 26, 17, 8, 26, 24, 14, 6, 24, 14, 11, 5, 18, 26, 14, 0, 24, 26, 15, 5, 19, 26, 23, 2, 14, 6, 26, 6, 4, 26, 24, 15, 14, 7, 9, 14, 18, 7, 23, 21, 24, 26, 15, 15, 14, 17, 23, 19, 14, 2, 20, 26, 14, 9, 5, 24, 15, 2, 14, 1, 17, 19, 10, 14, 7, 9, 2, 20, 26, 14, 27, 23, 5, 2, 26, 19, 14, 15, 2, 17, 2, 26, 15, 5, 11, 26, 14, 18, 7, 6, 26, 14]


In [120]:
# decoded_sentence = [int_to_vocab[i] for i in encoded_sentence]
decoded_sentence = decode_text(encoded_sentence)
print("decoded sentence: " , decoded_sentence)

decoded sentence:  ['m', 'a', 'd', 'a', 'm', 'e', ' ', 's', 'p', 'e', 'a', 'k', 'e', 'r', ' ', 'm', 'r', ' ', 'v', 'i', 'c', 'e', ' ', 'p', 'r', 'e', 's', 'i', 'd', 'e', 'n', 't', ' ', 'm', 'e', 'm', 'b', 'e', 'r', 's', ' ', 'o', 'f', ' ', 'c', 'o', 'n', 'g', 'r', 'e', 's', 's', ' ', 'a', 'n', 'd', ' ', 't', 'h', 'e', ' ', 'f', 'i', 'r', 's', 't', ' ', 'l', 'a', 'd', 'y', ' ', 'o', 'f', 't', 'h', 'e', ' ', 'u', 'n', 'i', 't', 'e', 'd', ' ', 's', 't', 'a', 't', 'e', 's', 'i', 'v', 'e', ' ', 'c', 'o', 'm', 'e', ' ']


In [121]:
decoded_sentence2 = "".join(decoded_sentence)
print(decoded_sentence2)

madame speaker mr vice president members of congress and the first lady ofthe united statesive come 


## 3.3 - Generate batch

### Sample of one batch

<img src="../assets/images/rnn_letter.png">

In [122]:
inputs, targets = encoded, encoded[1:]

# predict the next ones
print("Inputs", inputs[:10])
print("Targets", targets[:10])

Inputs [6, 17, 19, 17, 6, 26, 14, 15, 0, 26]
Targets [17, 19, 17, 6, 26, 14, 15, 0, 26, 17]


### Method used to generate batch in sequence order

In [123]:
def gen_batch(inputs, targets, seq_len, batch_size, noise=0):
    # Size of each chunk
    chuck_size = (len(inputs) -1)  // batch_size
    # Numbef of sequence per chunk
    sequences_per_chunk = chuck_size // seq_len

    for s in range(0, sequences_per_chunk):
        batch_inputs = np.zeros((batch_size, seq_len))
        batch_targets = np.zeros((batch_size, seq_len))
        for b in range(0, batch_size):
            fr = (b*chuck_size)+(s*seq_len)
            to = fr+seq_len
            batch_inputs[b] = inputs[fr:to]
            batch_targets[b] = inputs[fr+1:to+1]
            
            if noise > 0:
                noise_indices = np.random.choice(seq_len, noise)
                batch_inputs[b][noise_indices] = np.random.randint(0, vocab_size)
            
        yield batch_inputs, batch_targets


In [124]:
print ('no noise')
for batch_inputs, batch_targets in gen_batch(inputs, targets, 5, 32, noise=0):
    print(batch_inputs[0], batch_targets[0])
    break


print ('with some noise')
for batch_inputs, batch_targets in gen_batch(inputs, targets, 5, 32, noise=3):
    print(batch_inputs[0], batch_targets[0])
    break

no noise
[ 6. 17. 19. 17.  6.] [17. 19. 17.  6. 26.]
with some noise
[ 1.  1. 19. 17.  6.] [17. 19. 17.  6. 26.]


## Step 4: Create model
### 4.1 - Create your own layer

In [125]:
class OneHot(tf.keras.layers.Layer):
    def __init__(self, depth, **kwargs):
        super(OneHot, self).__init__(**kwargs)
        self.depth = depth

    def call(self, x, mask=None):
        return tf.one_hot(tf.cast(x, tf.int32), self.depth)

Test if the layer works well

In [126]:
class RnnModel(tf.keras.Model):

    def __init__(self, vocab_size):
        super(RnnModel, self).__init__()
        # Convolutions
        self.one_hot = OneHot(len(vocab))

    def call(self, inputs):
        output = self.one_hot(inputs)
        return output

batch_inputs, batch_targets = next(gen_batch(inputs, targets, 50, 32))

print(batch_inputs.shape)

model = RnnModel(len(vocab))
output = model.predict(batch_inputs)

print(output.shape)

#print(output)

print("Input letter is:", batch_inputs[0][0])
print("One hot representation of the letter", output[0][0])

#assert(output[int(batch_inputs[0][0])]==1)

(32, 50)
(32, 50, 28)
Input letter is: 6.0
One hot representation of the letter [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]


### 4.2 - Setup the model

<img src="../assets/images/architecture_rnn.png">

In [127]:
vocab_size = len(vocab)

### Creat the layers

# Set the input of the model
tf_inputs = tf.keras.Input(shape=(None,), batch_size=64)
# Convert each value of the  input into a one encoding vector
one_hot = OneHot(len(vocab))(tf_inputs)
# Stack LSTM cells
rnn_layer1 = tf.keras.layers.LSTM(128, return_sequences=True, stateful=True)(one_hot)
rnn_layer2 = tf.keras.layers.LSTM(128, return_sequences=True, stateful=True)(rnn_layer1)
# Create the outputs of the model
hidden_layer = tf.keras.layers.Dense(128, activation="relu")(rnn_layer2)
outputs = tf.keras.layers.Dense(vocab_size, activation="softmax")(hidden_layer)

### Setup the model
model = tf.keras.Model(inputs=tf_inputs, outputs=outputs)

### 4.3 - Check if we can reset the RNN cells

In [128]:
# Star by resetting the cells of the RNN
model.reset_states()

# Get one batch
batch_inputs, batch_targets = next(gen_batch(inputs, targets, 50, 64))

# Make a first prediction
outputs = model.predict(batch_inputs)
first_prediction = outputs[0][0]

# Reset the states of the RNN states
model.reset_states()

# Make an other prediction to check the difference
outputs = model.predict(batch_inputs)
second_prediction = outputs[0][0]

# Check if both prediction are equal
assert(set(first_prediction)==set(second_prediction))

### 4.4 - Set the loss and objectives

In [129]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam(lr=0.001)

### 4.5 - Set some metrics to track the progress of the training

In [130]:
# Loss
train_loss = tf.keras.metrics.Mean(name='train_loss')
# Accuracy
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

### 4.6 - Set the train method and the predict method in graph mode

In [131]:
@tf.function
def train_step(inputs, targets):
    with tf.GradientTape() as tape:
        # Make a prediction on all the batch
        predictions = model(inputs)
        # Get the error/loss on these predictions
        loss = loss_object(targets, predictions)
    # Compute the gradient which respect to the loss
    gradients = tape.gradient(loss, model.trainable_variables)
    # Change the weights of the model
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    # The metrics are accumulate over time. You don't need to average it yourself.
    train_loss(loss)
    train_accuracy(targets, predictions)

@tf.function
def predict(inputs):
    # Make a prediction on all the batch
    predictions = model(inputs)
    return predictions

### Step 6: Train the model

In [None]:
model.reset_states()

for epoch in range(4000):
    for batch_inputs, batch_targets in gen_batch(inputs, targets, 100, 64, noise=13):
        train_step(batch_inputs, batch_targets)
    template = '\r Epoch {}, Train Loss: {}, Train Accuracy: {}'
    print(template.format(epoch, train_loss.result(), train_accuracy.result()*100), end="")
    model.reset_states()

### Source:
https://github.com/thibo73800/tensorflow2.0-examples/blob/master/RNN%20-%20Text%20Generator.ipynb