In [1]:
# Load IPython's autoreload extension; mode 2 re-imports edited modules before each cell runs
%load_ext autoreload
%autoreload 2 

In [None]:
# Reference: https://github.com/tensorflow/tensorflow/blob/r1.13/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb

In [1]:
# Import TensorFlow (>= 1.10) and enable eager execution right after the import
import tensorflow as tf 
tf.enable_eager_execution()

import matplotlib.pyplot as plt 
from sklearn.model_selection  import train_test_split

import unicodedata
import re
import numpy as np 
import os 
import time 

# Show which TensorFlow version this kernel is running
print(tf.__version__)

1.12.0


In [8]:
# The corpus is looked up at <working directory>/data/spa.txt
current_path = os.getcwd()
path_to_file = os.path.join(current_path, 'data', 'spa.txt')

In [33]:
def unicode_to_ascii(s):
    """Remove combining accent marks from the string ``s``.

    The text is decomposed with NFD normalisation and every character whose
    Unicode category is 'Mn' (non-spacing mark) is dropped. Characters that
    have no such decomposition (e.g. '¿') are returned unchanged.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            continue
        kept.append(ch)
    return ''.join(kept)

def preprocess_sentence(w):
    """Clean one sentence and wrap it in ``<start>`` / ``<end>`` marker tokens.

    Steps: strip accents, put spaces around ``? . ! , ¿``, replace every run
    of characters other than letters and that punctuation with a single
    space, trim, then add the markers.

    Example: ``"He is a boy."`` -> ``"<start> He is a boy . <end>"``

    Args:
        w: raw sentence string.

    Returns:
        A string of space-separated tokens beginning with ``<start>`` and
        ending with ``<end>``.
    """
    w = unicode_to_ascii(w)

    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    w = re.sub(r'([?.!,¿])', r' \1 ', w)
    w = re.sub(r'[" "]+', " ", w)
    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",", "¿")
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
    w = w.strip()

    # The markers must be separate, space-delimited tokens: downstream code
    # tokenises with split(' '), so gluing them onto the neighbouring words
    # would yield tokens such as "<start>He" and ".<end>".
    # Joining the word list also avoids an empty token when `w` is empty.
    return ' '.join(['<start>'] + w.split() + ['<end>'])

In [41]:
def create_dataset(path, num_examples):
    """Read a tab-separated corpus and return cleaned sentence groups.

    Each of the first ``num_examples`` lines is split on tabs and every field
    is passed through ``preprocess_sentence`` (accent removal, cleaning,
    start/end markers). Field order is preserved, so for the corpus used here
    each entry is expected to be ``[ENGLISH, SPANISH]``.

    Args:
        path: path of a UTF-8 text file with one tab-separated entry per line.
        num_examples: number of leading lines to keep (used as a slice bound).

    Returns:
        A list with one list of preprocessed strings per kept line.
    """
    with open(path, encoding='utf-8') as handle:
        raw_text = handle.read()

    selected_lines = raw_text.strip().split('\n')[:num_examples]

    word_pairs = []
    for line in selected_lines:
        word_pairs.append([preprocess_sentence(field) for field in line.split('\t')])
    return word_pairs

In [42]:
class LanguageIndex():
    """Vocabulary for one language with word <-> integer id lookups.

    ``lang`` is an iterable of sentences (strings); it is iterated once while
    the index is built, so a generator is acceptable. Tokens are obtained with
    ``split(' ')``. Id 0 is assigned to ``'<pad>'`` and the sorted vocabulary
    is numbered from 1.

    Attributes:
        lang: the iterable passed to the constructor.
        vocab: sorted list of distinct tokens (after ``create_index``).
        word2idx: dict mapping token -> id.
        idx2word: dict mapping id -> token.
    """

    def __init__(self, lang):
        self.lang = lang
        self.word2idx = {}
        self.idx2word = {}
        self.vocab = set()

        self.create_index()

    def create_index(self):
        """Collect the vocabulary from ``self.lang`` and fill both lookup dicts."""
        # gather every distinct token
        for sentence in self.lang:
            for token in sentence.split(' '):
                self.vocab.add(token)

        self.vocab = sorted(self.vocab)

        # id 0 is the padding entry; vocabulary ids start at 1
        self.word2idx['<pad>'] = 0
        self.word2idx.update(
            (token, position) for position, token in enumerate(self.vocab, start=1)
        )

        # reverse mapping
        self.idx2word.update(
            (position, token) for token, position in self.word2idx.items()
        )

In [43]:
def max_length(tensor):
    """Return the length of the longest sequence contained in ``tensor``."""
    return max(map(len, tensor))

def load_dataset(path, num_examples):
    """Load the corpus and turn it into padded integer sequences.

    Spanish sentences are the model input and English sentences the target.

    Args:
        path: corpus file, forwarded to ``create_dataset``.
        num_examples: number of sentence pairs to load.

    Returns:
        A tuple ``(input_tensor, target_tensor, inp_lang, targ_lang,
        max_length_inp, max_length_tar)``: the two post-padded id arrays, the
        two ``LanguageIndex`` vocabularies, and the longest sequence length
        on each side.
    """
    # cleaned (english, spanish) sentence pairs
    sentence_pairs = create_dataset(path, num_examples)

    # one vocabulary per language
    inp_lang = LanguageIndex(sp for en, sp in sentence_pairs)
    targ_lang = LanguageIndex(en for en, sp in sentence_pairs)

    # map every token to its id: Spanish -> input, English -> target
    input_ids, target_ids = [], []
    for en, sp in sentence_pairs:
        input_ids.append([inp_lang.word2idx[token] for token in sp.split(' ')])
        target_ids.append([targ_lang.word2idx[token] for token in en.split(' ')])

    # each side is padded up to its own longest sentence
    max_length_inp = max_length(input_ids)
    max_length_tar = max_length(target_ids)

    pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
    input_tensor = pad_sequences(input_ids, maxlen=max_length_inp, padding='post')
    target_tensor = pad_sequences(target_ids, maxlen=max_length_tar, padding='post')

    return input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_tar

In [44]:
# Number of sentence pairs to load; try experimenting with this value
num_examples = 30000
(input_tensor, target_tensor,
 inp_lang, targ_lang,
 max_length_inp, max_length_targ) = load_dataset(path_to_file, num_examples)

In [47]:

# Creating training and validation sets using an 80-20 split.
# random_state is fixed so the same split is produced on every kernel restart.
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
    input_tensor, target_tensor, test_size=0.2, random_state=42)

# Show length
len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val)

(24000, 24000, 6000, 6000)

In [48]:
# Training configuration
BUFFER_SIZE = len(input_tensor_train)    # shuffle buffer spans the whole training set
BATCH_SIZE = 64
N_BATCH = BUFFER_SIZE // BATCH_SIZE      # number of full batches in the training set
embedding_dim = 256
units = 1024
vocab_inp_size = len(inp_lang.word2idx)
vocab_tar_size = len(targ_lang.word2idx)

# Input pipeline: shuffle, then batch, discarding the final partial batch
dataset = (
    tf.data.Dataset
    .from_tensor_slices((input_tensor_train, target_tensor_train))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [69]:
def gru(units):
    """Build the GRU layer used by ``Encoder`` and ``Decoder``.

    Args:
        units: dimensionality of the GRU state/output.

    Returns:
        A ``tf.keras.layers.GRU`` that returns both the full output sequence
        and the final state.
    """
    # The layer size comes from the `units` argument, not from the
    # notebook-level `units` constant, so each caller controls its own size.
    return tf.keras.layers.GRU(units,
                               return_sequences=True,
                               return_state=True,
                               recurrent_activation='sigmoid',
                               recurrent_initializer='glorot_uniform')

In [70]:
class Encoder(tf.keras.Model):
    """Sequence encoder: an embedding layer followed by a GRU.

    The GRU comes from the ``gru`` helper, which configures it to return both
    the per-step outputs and the final state.
    """

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        """Store the sizes and create the embedding and GRU layers."""
        super().__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = gru(self.enc_units)

    def call(self, x, hidden):
        """Embed the token ids ``x`` and run the GRU starting from ``hidden``.

        Returns:
            ``(output, state)`` as produced by the GRU layer.
        """
        embedded = self.embedding(x)
        output, state = self.gru(embedded, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self):
        """Return an all-zero tensor of shape ``(batch_sz, enc_units)``."""
        state_shape = (self.batch_sz, self.enc_units)
        return tf.zeros(state_shape)

In [78]:
class Decoder(tf.keras.Model):
    """Single-step GRU decoder with additive attention over the encoder outputs.

    The attention score is ``V(tanh(W1(enc_output) + W2(hidden)))``. The
    softmax of the scores along the time axis weights the encoder outputs into
    a context vector, which is concatenated with the embedded input token and
    fed through the GRU and a dense projection onto the vocabulary.
    """

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        """Store the sizes and create the embedding, GRU, output and attention layers."""
        super().__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        # NOTE: spelled with a trailing underscore (Encoder uses `embedding`);
        # the attribute name is kept unchanged.
        self.embedding_ = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = gru(self.dec_units)
        self.fc = tf.keras.layers.Dense(vocab_size)

        # used for attention
        self.W1 = tf.keras.layers.Dense(self.dec_units)
        self.W2 = tf.keras.layers.Dense(self.dec_units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, x, hidden, enc_output):
        """Run one decoding step.

        Args:
            x: ids of the current input token; assumed shape (batch_size, 1)
                -- TODO confirm against the training loop.
            hidden: state used as the attention query, (batch_size, hidden_size).
            enc_output: encoder outputs, (batch_size, max_length, hidden_size).

        Returns:
            ``(logits, state, attention_weights)``: vocabulary logits of shape
            (batch_size, vocab_size), the new GRU state, and the attention
            weights of shape (batch_size, max_length, 1).
        """
        # hidden_with_time_axis shape == (batch_size, 1, hidden size)
        # the extra axis lets the sum below broadcast over max_length
        hidden_with_time_axis = tf.expand_dims(hidden, 1)

        # enc_output shape == (batch_size, max_length, hidden_size)
        # score shape == (batch_size, max_length, 1)
        score = self.V(tf.nn.tanh(self.W1(enc_output) + self.W2(hidden_with_time_axis)))

        # normalise over the time axis (axis 1), not the trailing size-1 axis
        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after the sum == (batch_size, hidden_size)
        context_vector = tf.reduce_sum(attention_weights * enc_output, axis=1)

        # x shape after embedding == (batch_size, 1, embedding_dim)
        x = self.embedding_(x)

        # x shape after concat == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # output shape == (batch_size, 1, dec_units)
        output, state = self.gru(x)

        # drop the length-1 time axis: (batch_size, dec_units)
        output = tf.reshape(output, (-1, output.shape[2]))

        # logits shape == (batch_size, vocab_size)
        logits = self.fc(output)

        return logits, state, attention_weights

    def initialize_hidden_state(self):
        """Return an all-zero tensor of shape ``(batch_sz, dec_units)``."""
        return tf.zeros((self.batch_sz, self.dec_units))

In [75]:
# Instantiate the encoder and fetch its all-zero initial hidden state
encoder = Encoder(
    vocab_size=vocab_inp_size,
    embedding_dim=embedding_dim,
    enc_units=units,
    batch_sz=BATCH_SIZE,
)
h = encoder.initialize_hidden_state()

In [76]:
# Sanity check: shape of the initial hidden state, i.e. (BATCH_SIZE, units)
h.shape

TensorShape([Dimension(64), Dimension(1024)])

In [77]:
# Preview of the time-axis expansion done in Decoder.call: inserts a length-1 axis at position 1
tf.expand_dims(h, axis=1)

<tf.Tensor: id=1495, shape=(64, 1, 1024), dtype=float32, numpy=
array([[[0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.]]], dtype=float32)>