# Recurrent Neural Network for Language processing

## Import and load the libraries

In [None]:
import tensorflow as tf

In [None]:
# Show which TensorFlow release this notebook is running against
installed_tf_version = tf.__version__
print(installed_tf_version)

In [None]:
from tensorflow.keras.datasets import imdb

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline


from sklearn.metrics import confusion_matrix

## Step 1: Data preprocessing

We use the IMDB dataset bundled with the Keras library. The official documentation describes it as follows:
*This is a dataset of 25,000 movies reviews from IMDB, labeled by sentiment (positive/negative). Reviews have been preprocessed, and each review is encoded as a list of word indexes (integers). For convenience, words are indexed by overall frequency in the dataset, so that for instance the integer "3" encodes the 3rd most frequent word in the data. This allows for quick filtering operations such as: "only consider the top 10,000 most common words, but eliminate the top 20 most common words".*

In this simple demo we set the `num_words` parameter to 20,000, which means we only work with the 20,000 most frequent words. All other words are mapped to the out-of-vocabulary (OOV) token.

We set the parameters and load the dataset:

In [None]:
# Vocabulary cap and fixed review length shared by the rest of the notebook
number_of_words = 20_000
max_len = 100

# Fetch the IMDB reviews, keeping only the `number_of_words` most frequent words
(X_train, y_train), (X_test, y_test) = imdb.load_data(
    num_words=number_of_words,
)

In [None]:
# Report the shape of each feature and label array
shape_report = (
    ("Train dataset size: ", X_train),
    ("Train label dataset size: ", y_train),
    ("Test dataset size: ", X_test),
    ("Test label dataset size: ", y_test),
)
for caption, split in shape_report:
    print(caption, split.shape)

From the shapes we infer that each split contains 25K items, each holding a list of values of varying length. Let's take a closer look:

In [None]:
# Walk the first five reviews of both splits side by side and show,
# for each one, its length and its first five integer tokens
for train_review, test_review in zip(X_train[:5], X_test[:5]):
    print("Length of the list: ", len(train_review))
    print(" Print first five elements in the list: ", train_review[:5])

    print("Length of the list: ", len(test_review))
    print(" Print first five elements in the list: ", test_review[:5])

Every row starts with 1, the start-of-sequence token, followed by a list of integer tokens, each representing a word in the vocabulary. To translate these tokens back into words, we retrieve the vocabulary dict and build an inverse dict mapping indices to words.

In [None]:
# Reserved token ids, matching the defaults of keras.datasets.imdb.load_data
start_char = 1
oov_char = 2
index_from = 3

# The word index file maps each word to an integer index
word_index = tf.keras.datasets.imdb.get_word_index()

# Flip it into index -> word, shifting every index by `index_from`
# so the keys line up with the tokens stored in `x_train`
inverted_word_index = {
    rank + index_from: token for token, rank in word_index.items()
}
# Register the two special tokens so they can be decoded as well
inverted_word_index.update({start_char: "[START]", oov_char: "[OOV]"})

In [None]:
# Decode the beginning of the first five training reviews back into words
for i in range(5):
    print("Length of the list: ", len(X_train[i]))
    # Ten tokens are decoded here, so the label says "ten" (it used to say "five")
    print(" Print first ten words in the list: ", " ".join(inverted_word_index[j] for j in X_train[i][:10]))

Next we must truncate or pad the sequences, because we need to work with sequences of equal length.

In [None]:
# Pad or truncate every review so that each sequence holds exactly `max_len` tokens
# NOTE(review): with the Keras defaults, padding and truncation both happen at the
# start of the sequence — confirm against the installed version's documentation
X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, maxlen=max_len)
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, maxlen=max_len)

# Check the length of the first five sequences in both splits
for i in range(5):
    # Print length in X_train
    print("Length of the X_train list: ", len(X_train[i]))
    # Print length in X_test (the label previously said X_train by mistake)
    print("Length of the X_test list: ", len(X_test[i]))


## Step 2: Build the RNN

We build a simple RNN containing three layers:
- An Embedding layer: with a vocabulary size of 20,000 (the same number we used before) and an embedding size of 128.
- An LSTM layer: with 128 units and a tanh activation function.
- A final Dense layer with just 1 output using a sigmoid activation.


In [None]:
# Hyperparameters of the network
vocab_size = number_of_words  # reuse the vocabulary cap applied when loading the data
embed_size = 128
lstm_units = 128

In [None]:
# Assemble the sequential network in one go: embedding -> LSTM -> sigmoid output
model = tf.keras.Sequential([
    # Turns each integer token into a dense vector of size `embed_size`
    tf.keras.layers.Embedding(vocab_size, embed_size, input_shape=(X_train.shape[1],)),
    # Recurrent layer that consumes the embedded sequence
    tf.keras.layers.LSTM(units=lstm_units, activation='tanh'),
    # Single sigmoid unit producing the predicted probability
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])

In [None]:
# Configure training: RMSprop optimizer, binary cross-entropy loss, accuracy metric
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# Print the layer-by-layer overview of the network
model.summary()

## Step 3: Train the model

This time we do not use a validation dataset during training; later we check the performance of the model on the test dataset.

In [None]:
# Fit on the whole training split; the returned object keeps the per-epoch metrics
history = model.fit(
    X_train,
    y_train,
    epochs=3,
    batch_size=128,
)

## Step 4: Evaluate the model

In [None]:
# Training accuracy recorded at the end of every epoch
train_accuracy = history.history['accuracy']
plt.plot(train_accuracy, label='accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.ylim([0.5, 1])
plt.legend(loc='lower right')

# Measure loss and accuracy on the test dataset
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=2)
# Report the final accuracy
print('Test Accuracy: ', test_acc)

## Step 5: Make predictions

In [None]:
# Run the trained network over the test dataset
predictions = model.predict(X_test)
# Report the shape of the returned array
prediction_shape = predictions.shape
print(' Prediction shape', prediction_shape)

Let's check the output values:

In [None]:
# Peek at the first five predicted values
first_predictions = predictions[:5]
print(first_predictions)

Convert the probability to a label

In [None]:
# A prediction counts as positive when it exceeds the decision threshold
decision_threshold = 0.5
y_preds = predictions > decision_threshold
# Show the first five resulting labels
print(y_preds[:5])

### Plot the Confusion matrix

In [None]:
# Compare the true test labels with the thresholded predictions
cm = confusion_matrix(y_true=y_test, y_pred=y_preds)
print('Confusion Matrix\n')
print(cm)

Build a function to plot the confusion matrix

In [None]:
# Function to plot the confusion matrix
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    """
    Draw a confusion matrix as an annotated heat map and show the figure.

    Accuracy and misclassification rate are computed from the raw counts
    and written under the x-axis label. When ``normalize`` is True the
    heat map colours, the colorbar and the cell annotations all show
    row-wise proportions, so they stay consistent with each other.

    Arguments
    ---------
    cm:           confusion matrix from sklearn.metrics.confusion_matrix
                  (numpy array, true labels on the rows)

    target_names: given classification classes such as [0, 1, 2]
                  the class names, for example: ['high', 'medium', 'low']
                  Pass None to keep the default numeric ticks.

    title:        the text to display at the top of the matrix

    cmap:         the gradient of the values displayed from matplotlib.pyplot.cm
                  see http://matplotlib.org/examples/color/colormaps_reference.html
                  plt.get_cmap('jet') or plt.cm.Blues
                  Defaults to the 'Blues' colormap when None.

    normalize:    If False, plot the raw numbers
                  If True, plot the proportions (each row divided by its sum)

    Returns
    -------
    None. The figure is displayed with plt.show().

    Usage
    -----
    plot_confusion_matrix(cm           = cm,                  # confusion matrix created by
                                                              # sklearn.metrics.confusion_matrix
                          normalize    = True,                # show proportions
                          target_names = y_labels_vals,       # list of names of the classes
                          title        = best_estimator_name) # title of graph

    Citation
    --------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    """
    import matplotlib.pyplot as plt
    import numpy as np
    import itertools

    # Summary metrics always come from the raw counts; guard the empty matrix
    total = float(np.sum(cm))
    accuracy = np.trace(cm) / total if total else 0.0
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    # Normalise BEFORE drawing so that the colours, the colorbar and the
    # text-contrast threshold all refer to the same values as the annotations
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A class without any true sample would divide by zero and yield NaN;
        # dividing by 1 instead leaves that row at 0.0
        row_totals[row_totals == 0] = 1
        cm = cm.astype('float') / row_totals

    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    # Cells darker than the threshold get white text, the others black
    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    cell_format = "{:0.2f}" if normalize else "{:,}"
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cell_format.format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.2f}; misclass={:0.2f}'.format(accuracy, misclass))
    # Lay out after the labels exist so they are taken into account
    plt.tight_layout()
    plt.show()

In [None]:
# Human-readable names for the two classes, listed as class 0 then class 1
class_names = ['Negative', 'Positive']
# Draw the confusion matrix with the default title, colormap and normalisation
plot_confusion_matrix(cm=cm, target_names=class_names)

## Step 6: Save the model

To save our model we make a directory and call the `save` method.

In [None]:
# Save the entire model as a SavedModel.
# `!mkdir -p` is a notebook shell escape: it creates the target directory
# and succeeds silently when the directory already exists
!mkdir -p saved_model
# Save the model
# NOTE(review): newer Keras releases (Keras 3 / TF >= 2.16) reportedly require a
# ".keras" or ".h5" suffix on this path — confirm against the installed version
model.save('saved_model/my_model')

## Restore the model saved

When the model is saved using the `save` method, we can restore it into a new model just by calling `load_model`. This procedure is shown below.

In [None]:
# Rebuild the network from the directory written by `model.save`
new_model = tf.keras.models.load_model(
    'saved_model/my_model'
)

# Print its layers to confirm the architecture matches the original model
new_model.summary()

In [None]:
# Score the restored model on the test dataset
loss, acc = new_model.evaluate(X_test, y_test, verbose=2)
print(f'Restored model, accuracy: {100 * acc:5.2f}%')

In [None]:
# Predicted probabilities from the restored model
predictions = new_model.predict(X_test)
# Turn the probabilities into labels with a 0.5 decision threshold
decision_threshold = 0.5
y_preds = predictions > decision_threshold
# Confusion matrix of the true test labels against the predicted labels
cm = confusion_matrix(y_true=y_test, y_pred=y_preds)
# Draw it with the same class names as before
plot_confusion_matrix(cm=cm, target_names=class_names)