# Recurrent Neural Networks

In [1]:
import os
import time
import numpy as np
import tensorflow as tf
tf.enable_eager_execution()
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

## German to English Translation

Based on tensorflow's Seq2Seq tutorial.

### Preparing the dataset

Data from http://www.manythings.org/anki/ and contains an english sentence followed by the German Translation.

Processing the data includes:
1. Adding a start and end token to each sentence
2. Removing special characters
3. Creating a word index and reverse word index (word -> id and id -> word)
4. Pad each sentence to a maximum length
5. Putting this into a format that tensorflow can read

In [2]:
path_to_file = os.getcwd() + "/data/deu-eng/deu.txt"

In [3]:
import unicodedata

def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

In [4]:
import re

def preprocess_sentence(w):
    w = unicode_to_ascii(w.lower().strip())
    
    # Adding space between word and punctuation
    w = re.sub(r"([?.!,])", r" \1", w)
    w = re.sub(r'[" "]+', " ", w)
    
    # replacing everything with space except(a-z, A-Z, ".", "!", ",")
    w = re.sub(r"[^a-zA-Z?.!,]+", " ", w)
    
    # Removing all special characters
    w = w.rstrip().strip()
    
    #Adding start/end tokens
    w = '<start> ' + w + ' <end>'
    
    return w

In [5]:
preprocess_sentence("TĘŠt!ng Sent&änče ^^!")

'<start> test !ng sent ance ! <end>'

In [6]:
def create_dataset(path, num_examples):
    lines = open(path, encoding='UTF-8').read().strip().split('\n')
    
    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')] for l in lines[:num_examples]]
    
    return word_pairs

In [7]:
preprocess_test = create_dataset(path_to_file, 20)
preprocess_test

[['<start> hi . <end>', '<start> hallo ! <end>'],
 ['<start> hi . <end>', '<start> gru gott ! <end>'],
 ['<start> run ! <end>', '<start> lauf ! <end>'],
 ['<start> wow ! <end>', '<start> potzdonner ! <end>'],
 ['<start> wow ! <end>', '<start> donnerwetter ! <end>'],
 ['<start> fire ! <end>', '<start> feuer ! <end>'],
 ['<start> help ! <end>', '<start> hilfe ! <end>'],
 ['<start> help ! <end>', '<start> zu hulf ! <end>'],
 ['<start> stop ! <end>', '<start> stopp ! <end>'],
 ['<start> wait ! <end>', '<start> warte ! <end>'],
 ['<start> go on . <end>', '<start> mach weiter . <end>'],
 ['<start> hello ! <end>', '<start> hallo ! <end>'],
 ['<start> i ran . <end>', '<start> ich rannte . <end>'],
 ['<start> i see . <end>', '<start> ich verstehe . <end>'],
 ['<start> i see . <end>', '<start> aha . <end>'],
 ['<start> i try . <end>', '<start> ich probiere es . <end>'],
 ['<start> i won ! <end>', '<start> ich hab gewonnen ! <end>'],
 ['<start> i won ! <end>', '<start> ich habe gewonnen ! <end>']

In [8]:
# Class that creates a word -> index mapping
class LanguageIndex():
    def __init__(self, lang):
        self.lang = lang
        self.word2idx = {}
        self.idx2word = {}
        self.vocab = set()
        
        self.create_index()
    
    def create_index(self):
        for phrase in self.lang:
            self.vocab.update(phrase.split(' '))
            
        self.vocab = sorted(self.vocab)
        
        self.word2idx['<pad>'] = 0
        for index, word in enumerate(self.vocab):
            self.word2idx[word] = index + 1
        
        for word, index in self.word2idx.items():
            self.idx2word[index] = word

In [9]:
def max_length(tensor):
    return max(len(t) for t in tensor)

def load_dataset(path, num_examples):
    pairs = create_dataset(path, num_examples)
    
    input_lang = LanguageIndex(deu for en, deu in pairs)
    target_lang = LanguageIndex(en for en, deu in pairs)
    
    input_tensor = [[input_lang.word2idx[s] for s in deu.split(' ')] for en, deu in pairs]
    target_tensor = [[target_lang.word2idx[s] for s in en.split(' ')] for en, deu in pairs]
    
    max_length_input, max_length_target = max_length(input_tensor), max_length(target_tensor)
    
    # Padding input and output tensor to max length
    input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_tensor,
                                                                maxlen=max_length_input,
                                                                padding='post')
    target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor,
                                                                 maxlen=max_length_target,
                                                                 padding='post')
    
    return input_tensor, target_tensor, input_lang, target_lang, max_length_input, max_length_target

Using only 10,000 sentences out of the roughly 200,000 available sentences in the downloaded data. Splitting these into a train and test set. 

In [10]:
num_examples = 10000
input_tensor, target_tensor, input_lang, target_lang, max_length_input, max_length_target = load_dataset(path_to_file, num_examples)

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(input_tensor, target_tensor, test_size = 0.2)

In [12]:
# Creating tf.data Dataset

buffer_size = len(X_train)
batch_size = 64
n_batch = buffer_size // batch_size
embedding_dim = 128
units = 512
vocab_input_size = len(input_lang.word2idx)
vocab_target_size = len(target_lang.word2idx)

dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(buffer_size)
dataset = dataset.batch(batch_size, drop_remainder=True)

In [13]:
vocab_input_size

3501

In [14]:
vocab_target_size

2176

### Constructing Encoder and Decoder

In [15]:
def gru(units):
    return tf.keras.layers.GRU(units, 
                               return_sequences=True,
                               return_state=True,
                               recurrent_activation="sigmoid",
                               recurrent_initializer="glorot_uniform")

In [16]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = gru(self.enc_units)
        
    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        return output, state
    
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

In [32]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = gru(self.dec_units)
        self.fc = tf.keras.layers.Dense(vocab_size)
        
        # used for attention
        self.W1 = tf.keras.layers.Dense(self.dec_units)
        self.W2 = tf.keras.layers.Dense(self.dec_units)
        self.V = tf.keras.layers.Dense(1)
        
    def call(self, x, hidden, enc_output):
        hidden_with_time_axis = tf.expand_dims(hidden, 1)
        score = self.V(tf.nn.tanh(self.W1(enc_output) + self.W2(hidden_with_time_axis)))
        attention_weights = tf.nn.softmax(score, axis=1)
        
        context_vector = attention_weights * enc_output
        context_vector = tf.reduce_sum(context_vector, axis=1)
        
        x = self.embedding(x)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        
        output, state = self.gru(x)
        output = tf.reshape(output, (-1, output.shape[2]))
        x = self.fc(output)
        
        return x, state, attention_weights
    
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.dec_units))

In [33]:
encoder = Encoder(vocab_input_size, embedding_dim, units, batch_size)
decoder = Decoder(vocab_target_size, embedding_dim, units, batch_size)

In [37]:
optimizer = tf.train.AdamOptimizer()

def loss_function(real, pred):
    mask = 1 - np.equal(real, 0)
    loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred) * mask
    return tf.reduce_mean(loss_)

In [38]:
# Creating trianing checkpoints
checkpoint_dir = os.getcwd() + "/tf_logs/german_english_trans/training_checkpoints"
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                encoder=encoder,
                                decoder=decoder)

### Training Steps

1. pass input through encoder to produce encoder output and encoder hidden state
2. encoder output and hidden state is passed to decoder along with decoder input, i.e. the start token
3. decoder returns predictions and the decoder hidden state
4. decoder hidden state passed back into the model and predictions are used to calculate loss
5. use teacher forcing to decide the next input to decoder.  (Teacher forcing is the technique where the target word is passed as the next input to the decoder)
6. calculate gradients and apply it to optimizer and backprogagate

In [40]:
n_epochs = 10

for epoch in range(n_epochs):
    start = time.time()
    hidden = encoder.initialize_hidden_state()
    total_loss = 0
    
    for (batch, (inp, targ)) in enumerate(dataset):
        loss = 0
        
        with tf.GradientTape() as tape:
            enc_output, enc_hidden = encoder(inp, hidden)
            
            dec_hidden = enc_hidden
            dec_input = tf.expand_dims([target_lang.word2idx['<start>']] * batch_size, 1)
            
            # feeding target as next input
            for t in range(1, targ.shape[1]):
                predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
                loss += loss_function(targ[:,t], predictions)
                
                # using teacher forcing
                dec_input = tf.expand_dims(targ[:, t], 1)
                
            batch_loss = (loss / int(targ.shape[1]))
            total_loss += batch_loss
            
            variables = encoder.variables + decoder.variables
            
            gradients = tape.gradient(loss, variables)
            optimizer.apply_gradients(zip(gradients, variables))
            
            if batch % 50 == 0:
                print('Epoch {} \tBatch {} \tLoss{:.5f}'.format(epoch + 1, batch, batch_loss.numpy()))
                
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)
            
    print('Epoch {} \tLoss {:.4f}'.format(epoch + 1, total_loss / n_batch))
    print('Time taken for 1 epoch {:.1} sec\n'.format(time.time() - start))

Epoch 1 	Batch 0 	Loss2.35349
Epoch 1 	Batch 100 	Loss1.93987
Epoch 1 	Loss 2.0931
Time taken for 1 epoch 266.66598987579346 sec

Epoch 2 	Batch 0 	Loss1.76712
Epoch 2 	Batch 100 	Loss1.77102
Epoch 2 	Loss 1.7657
Time taken for 1 epoch 232.2601079940796 sec

Epoch 3 	Batch 0 	Loss1.58285
Epoch 3 	Batch 100 	Loss1.55807
Epoch 3 	Loss 1.5617
Time taken for 1 epoch 190.01617288589478 sec

Epoch 4 	Batch 0 	Loss1.40374
Epoch 4 	Batch 100 	Loss1.56057
Epoch 4 	Loss 1.4000
Time taken for 1 epoch 224.32567405700684 sec

Epoch 5 	Batch 0 	Loss1.24784
Epoch 5 	Batch 100 	Loss1.28348
Epoch 5 	Loss 1.2803
Time taken for 1 epoch 190.1360628604889 sec

Epoch 6 	Batch 0 	Loss1.17420
Epoch 6 	Batch 100 	Loss1.26946
Epoch 6 	Loss 1.1864
Time taken for 1 epoch 224.3213050365448 sec

Epoch 7 	Batch 0 	Loss1.07787
Epoch 7 	Batch 100 	Loss1.03178
Epoch 7 	Loss 1.1029
Time taken for 1 epoch 186.15093088150024 sec

Epoch 8 	Batch 0 	Loss1.02844
Epoch 8 	Batch 100 	Loss1.06482
Epoch 8 	Loss 1.0238
Time taken

### Translations

Let's use the Neural Net to actually make translations!  There is no teacher forcing here.  The input to the decoder at each time step is its previous predictions along with the hidden state and encoder output.  Of course, prediction will stop when the model his the end token.

In [63]:
def evaluate(sentence, encoder, decoder, input_lang, target_lang, max_length_input, max_length_target):
    attention_plot = np.zeros((max_length_target, max_length_input))
    
    sentence = preprocess_sentence(sentence)
    inputs = [input_lang.word2idx[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs], maxlen=max_length_input, padding='post')
    inputs = tf.convert_to_tensor(inputs)
    
    result = ''
    
    hidden = [tf.zeros((1, units))]
    
    enc_out, enc_hidden = encoder(inputs, hidden)
    
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([target_lang.word2idx['<start>']], 0)
    
    for t in range(max_length_target):
        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)
        
        # storing attention weights
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()
        
        predicted_id = tf.argmax(predictions[0]).numpy()
        
        result += target_lang.idx2word[predicted_id] + ' '
        
        if target_lang.idx2word[predicted_id] == '<end>':
            return result, sentence , attention_plot
        
        # feed predicted ID back into model
        dec_input = tf.expand_dims([predicted_id], 0)
        
    return result, sentence, attention_plot

In [64]:
def plot_attention(attention, sentence, predicted_sentence):
    fig = plt.figure(figsize=(8,8))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention, cmap='viridis')
    fontdict = {'fontsize': 14}
    ax.set_xticklabels([''] + sentence, fontdict=fontdict, rotation=90)
    ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)
    plt.show()

In [74]:
def translate(sentence, encoder, decoder, input_lang, target_lang, max_length_input, max_length_target):
    result, sentence, attention_plot = evaluate(sentence, encoder, decoder, input_lang, 
                                                target_lang, max_length_input, max_length_target)
    
    print('Input: {}'.format(sentence))
    print('Predicted translation: {}'.format(result))
    
    attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]
#     plot_attention(attention_plot, sentence.split(' '), result.split(' '))

In [75]:
# Restoring latest checkpoint
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.checkpointable.util.CheckpointLoadStatus at 0xb2e3f7828>

In [76]:
# How are you?
translate(u'wie gehts?', encoder, decoder, input_lang, target_lang, max_length_input, max_length_target)

Input: <start> wie gehts ? <end>
Predicted translation: do you recycle ? <end> 


In [77]:
# I like you
translate(u'Ich mag dich', encoder, decoder, input_lang, target_lang, max_length_input, max_length_target)

Input: <start> ich mag dich <end>
Predicted translation: i want to you . <end> 


In [78]:
# I am here
translate(u'Ich bin hier', encoder, decoder, input_lang, target_lang, max_length_input, max_length_target)

Input: <start> ich bin hier <end>
Predicted translation: i m in home . <end> 


In [79]:
# where are the dogs?
translate(u'wo sind die hunde?', encoder, decoder, input_lang, target_lang, max_length_input, max_length_target)

Input: <start> wo sind die hunde ? <end>
Predicted translation: what are you ? <end> 


In [82]:
# do cats like dogs?
translate(u'mogen Katzen hunde', encoder, decoder, input_lang, target_lang, max_length_input, max_length_target)

Input: <start> mogen katzen hunde <end>
Predicted translation: mary likes him . <end> 


In [83]:
# Hello friend!
translate(u'Hallo Freund', encoder, decoder, input_lang, target_lang, max_length_input, max_length_target)

Input: <start> hallo freund <end>
Predicted translation: hello ! <end> 


Not too bad considering we only trained on 8000 sentences for 10 epochs!

## What did we accomplish?

1. Loaded and processed a dataset containing 10000 sentences of English next to their German translation
2. Built an RNN model which included an embedding, and GRU units to compose an encoder and decoder with an additional attention vector input to the decoder.
3. Trained the model and used it to make predictions on simple German sentences.  The results are quite entertaining!