In [1]:
!pip install chart-studio

Collecting chart-studio
  Downloading chart_studio-1.1.0-py3-none-any.whl.metadata (1.3 kB)
Collecting retrying>=1.3.3 (from chart-studio)
  Downloading retrying-1.3.4-py3-none-any.whl.metadata (6.9 kB)
Downloading chart_studio-1.1.0-py3-none-any.whl (64 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.4/64.4 kB[0m [31m812.7 kB/s[0m eta [36m0:00:00[0m
[?25hDownloading retrying-1.3.4-py3-none-any.whl (11 kB)
Installing collected packages: retrying, chart-studio
Successfully installed chart-studio-1.1.0 retrying-1.3.4


In [2]:
!kaggle datasets download -d nageshsingh/englishportuguese-translation
!unzip "/content/englishportuguese-translation.zip"

Dataset URL: https://www.kaggle.com/datasets/nageshsingh/englishportuguese-translation
License(s): unknown
Downloading englishportuguese-translation.zip to /content
  0% 0.00/5.41M [00:00<?, ?B/s]
100% 5.41M/5.41M [00:00<00:00, 71.9MB/s]
Archive:  /content/englishportuguese-translation.zip
  inflating: por.txt                 


In [3]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import time
import string
import pandas as pd

import chart_studio.plotly as py
from plotly.offline import init_notebook_mode, iplot
#%plotly.offline.init_notebook_mode(connected=True)
import plotly.graph_objs as go

In [4]:
file_path = "/content/por.txt"

In [5]:
lines = open(file_path, encoding='UTF-8').read().strip().split('\n')
lines[5000:5010]

['Will it rain?\tSerá que chove?\tCC-BY 2.0 (France) Attribution: tatoeba.org #8918600 (CK) & #8930552 (JGEN)',
 'Wish me luck.\tDeseje-me sorte.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2254917 (CK) & #872788 (alexmarcelo)',
 "Won't you go?\tVocê não vai?\tCC-BY 2.0 (France) Attribution: tatoeba.org #241051 (CK) & #6212788 (bill)",
 'Write in ink.\tEscreva à tinta.\tCC-BY 2.0 (France) Attribution: tatoeba.org #3258764 (CM) & #7351595 (alexmarcelo)',
 'Write in ink.\tEscreva a tinta.\tCC-BY 2.0 (France) Attribution: tatoeba.org #3258764 (CM) & #7351606 (alexmarcelo)',
 'Write to Tom.\tEscreva para o Tom.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2240357 (CK) & #5985551 (Ricardo14)',
 'Years passed.\tPassaram os anos.\tCC-BY 2.0 (France) Attribution: tatoeba.org #282197 (CK) & #977841 (alexmarcelo)',
 'Years passed.\tAnos se passaram.\tCC-BY 2.0 (France) Attribution: tatoeba.org #282197 (CK) & #2324530 (Matheus)',
 'You amuse me.\tVocê me diverte.\tCC-BY 2.0 (France) Attributio

In [6]:
print("total number of records: ",len(lines))

total number of records:  168903


In [7]:
exclude = set(string.punctuation)
remove_digits = str.maketrans('' , '' ,string.digits)

In [8]:
def preprocess_eng_sentence(sent):
  sent = sent.lower()
  sent = re.sub('' ,'' ,sent)
  sent = ''.join(ch for ch in sent if ch not in exclude)
  sent = sent.translate(remove_digits)
  sent = sent.strip()
  sent  = re.sub(" +" ," " ,sent)
  sent = '<start>'+ sent + '<end>'
  return sent

In [9]:
def preprocess_port_sentence(sent):
  sent = re.sub("'" ,'' ,sent)
  sent = ''.join(ch for ch in sent if ch not in exclude)
  sent = sent.strip()
  sent = re.sub(" +" ," " ,sent)
  sent = '<start> ' + sent + " <end>"
  return sent

In [10]:
sent_pairs = []
for line in lines:
    sent_pair = []
    eng = line.rstrip().split('\t')[0]
    port = line.rstrip().split('\t')[1]
    eng = preprocess_eng_sentence(eng)
    sent_pair.append(eng)
    port = preprocess_port_sentence(port)
    sent_pair.append(port)
    sent_pairs.append(sent_pair)
sent_pairs[5000:5010]

[['<start>will it rain<end>', '<start> Será que chove <end>'],
 ['<start>wish me luck<end>', '<start> Desejeme sorte <end>'],
 ['<start>wont you go<end>', '<start> Você não vai <end>'],
 ['<start>write in ink<end>', '<start> Escreva à tinta <end>'],
 ['<start>write in ink<end>', '<start> Escreva a tinta <end>'],
 ['<start>write to tom<end>', '<start> Escreva para o Tom <end>'],
 ['<start>years passed<end>', '<start> Passaram os anos <end>'],
 ['<start>years passed<end>', '<start> Anos se passaram <end>'],
 ['<start>you amuse me<end>', '<start> Você me diverte <end>'],
 ['<start>you are late<end>', '<start> Você está atrasado <end>']]

In [11]:
class LanguageIndex():
  def __init__(self , lang):
    self.lang = lang
    self.word2idx = {}
    self.idx2word ={}
    self.vocab = set()

    self.create_index()

  def create_index(self):
    for phrase in self.lang:
      self.vocab.update(phrase.split(' '))
    self.vocab =sorted(self.vocab)
    self.word2idx['<pad>']=0
    for index ,word in enumerate(self.vocab):
      self.word2idx[word] = index +1

    for word,index in self.word2idx.items():
      self.idx2word[index]=word

In [12]:
def max_length(tensor):
  return max(len(t)for t in tensor)

In [13]:
import tensorflow as tf


def load_dataset(pairs, num_examples):
    inp_lang = LanguageIndex(en for en, ma in pairs)
    targ_lang = LanguageIndex(ma for en, ma in pairs)

    input_tensor = [[inp_lang.word2idx[s] for s in en.split(' ')] for en, ma in pairs]

    target_tensor = [[targ_lang.word2idx[s] for s in ma.split(' ')] for en, ma in pairs]

    max_length_inp, max_length_tar = max_length(input_tensor), max_length(target_tensor)

    input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_tensor,
                                                                 maxlen=max_length_inp,
                                                                 padding='post')

    target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor,
                                                                  maxlen=max_length_tar,
                                                                  padding='post')

    return input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_tar

In [14]:
input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_targ = load_dataset(sent_pairs, len(lines))


In [15]:
# Creating training and validation sets using an 80-20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.1, random_state = 101)

# Show length
len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val)

(152012, 152012, 16891, 16891)

In [16]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE= 64
N_BATCH =BUFFER_SIZE // BATCH_SIZE
embedding_dim = 258
units = 1024
vocab_inp_size = len(inp_lang.word2idx)
vocab_tar_size =len(targ_lang.word2idx)

In [17]:
dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [18]:
def gru(units):
  return tf.keras.layers.GRU(units,
                             return_sequences=True,
                             return_state=True,
                             recurrent_activation="sigmoid" ,
                             recurrent_initializer="glorot_uniform")

In [19]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = gru(self.enc_units)

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

In [20]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = gru(self.dec_units)
        self.fc = tf.keras.layers.Dense(vocab_size)

        self.W1 = tf.keras.layers.Dense(self.dec_units)
        self.W2 = tf.keras.layers.Dense(self.dec_units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, x, hidden, enc_output):

        hidden_with_time_axis = tf.expand_dims(hidden, 1)

        score = self.V(tf.nn.tanh(self.W1(enc_output) + self.W2(hidden_with_time_axis)))

        attention_weights = tf.nn.softmax(score, axis=1)

        context_vector = attention_weights * enc_output
        context_vector = tf.reduce_sum(context_vector, axis=1)

        x = self.embedding(x)

        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)


        output, state = self.gru(x)


        output = tf.reshape(output, (-1, output.shape[2]))


        x = self.fc(output)

        return x, state, attention_weights

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.dec_units))

In [21]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

In [22]:
optimizer = tf.optimizers.Adam()

def loss_function(real, pred):
    mask = 1 - np.equal(real, 0)
    loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred) * mask
    return tf.reduce_mean(loss_)

In [23]:
import os
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [None]:
EPOCHS = 10

for epoch in range(EPOCHS):
    start = time.time()

    hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset):
        loss = 0

        with tf.GradientTape() as tape:
            enc_output, enc_hidden = encoder(inp, hidden)

            dec_hidden = enc_hidden

            dec_input = tf.expand_dims([targ_lang.word2idx['<start>']] * BATCH_SIZE, 1)

            # Teacher forcing - feeding the target as the next input
            for t in range(1, targ.shape[1]):
                # passing enc_output to the decoder
                predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

                loss += loss_function(targ[:, t], predictions)

                # using teacher forcing
                dec_input = tf.expand_dims(targ[:, t], 1)

        batch_loss = (loss / int(targ.shape[1]))

        total_loss += batch_loss

        variables = encoder.variables + decoder.variables

        gradients = tape.gradient(loss, variables)

        optimizer.apply_gradients(zip(gradients, variables))

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                         batch,
                                                         batch_loss.numpy()))
    # saving (checkpoint) the model every epoch
    checkpoint.save(file_prefix = checkpoint_prefix)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                        total_loss / N_BATCH))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))