<a href="https://www.kaggle.com/code/franciscomesquita/tp2-ad-translator-english-to-cherokee?scriptVersionId=102879790" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Trabalho Prático 2 - 

Machine Translation - English to Cherokee

# Tarefa 1

In [None]:
#Imports importantes

# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
except Exception:
    pass

# TensorFlow ≥2.0 is required
import tensorflow as tf
assert tf.__version__ >= "2.0"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)


# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [None]:
#Conectar ao drive

from google.colab import drive
drive.mount('/content/drive')

In [None]:
#Importar dataset

import pandas as pd
train_dataset = pd.read_csv('/content/drive/MyDrive/Ch_En_Train_complete.csv',delimiter=';')

In [None]:
#Visualizar dataset de treino

train_dataset

In [None]:
eng = train_dataset['English']
che = train_dataset['Cherokee']

eng_len = [len(s.split()) for s in eng]
che_len = [len(s.split()) for s in che]

#Frases com o maior número de palavras para cada linguagem
max(eng_len), max(che_len)

In [None]:
import statistics

#Este valor vai ser utilizado como referência para escolher o sequence_length nas camadas de vetorização

statistics.median(eng_len), statistics.median(che_len)

In [None]:
#Número de palavras normal para cada lingua

#Entre 0 e 40 para inglês e 0 e 25 para cherokee

%matplotlib inline

length_df = pd.DataFrame({'eng':eng_len, 'che':che_len})
length_df.hist(bins=30)

In [None]:
# Build the (English, Cherokee) sentence pairs used by the rest of the notebook.
# Every Cherokee sentence is wrapped in "[start]" / "[end]" markers.
# Columns are read by name so the pairing does not depend on the column order
# of the CSV file.
text_pairs = [
    (english, "[start] " + ch + " [end]")
    for english, ch in zip(train_dataset['English'], train_dataset['Cherokee'])
]

# Show only a short preview instead of dumping the whole list into the output.
text_pairs[:5]

In [None]:
# Split the dataset into 3 subsets: training, validation and test

import random

# Shuffle with a dedicated, seeded generator: the notebook only seeds NumPy
# (np.random.seed), which does not affect the `random` module, so an unseeded
# shuffle would produce a different split on every fresh run.
random.Random(42).shuffle(text_pairs)

# 10% validation, 10% test, remaining ~80% training.
num_val_samples = int(0.10 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples:num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples:]

In [None]:
# Modulo de pré-processamento e vetorização do texto

import tensorflow as tf
import string
import re

from tensorflow.keras import layers

# Characters removed from the Cherokee text: all ASCII punctuation except the
# square brackets, which must survive because they delimit "[start]" / "[end]".
strip_chars = string.punctuation
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

# A custom standardization is needed for the Cherokee sentences so that the [ ] symbols are not removed
def custom_standardization(input_string):
    """Lower-case `input_string` and delete every character listed in `strip_chars`."""
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(
        lowercase, f"[{re.escape(strip_chars)}]", "")

vocab_size = 8000 #Only the X most frequent words of the vocabulary are kept.
sequence_length = 20

# English (source) vectorizer: integer token ids, padded/truncated to `sequence_length`.
source_vectorization = layers.TextVectorization(
    #comment out the line below to see the number of tokens created for each vocabulary
    max_tokens=vocab_size,
    standardize='lower_and_strip_punctuation', #used by default
    output_mode="int",
    output_sequence_length=sequence_length,
)

# Cherokee (target) vectorizer: one token longer than the source, because
# format_dataset slices the target into decoder input (all but the last token)
# and labels (all but the first token).
target_vectorization = layers.TextVectorization(
    #comment out the line below to see the number of tokens created for each vocabulary
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length = sequence_length + 1,
    standardize=custom_standardization,
)

# Both vocabularies are learned from the training pairs only.
train_english_texts = [pair[0] for pair in train_pairs]
train_ch_texts = [pair[1] for pair in train_pairs]
source_vectorization.adapt(train_english_texts)
target_vectorization.adapt(train_ch_texts)

In [None]:
# Visualizar o vocabulário em inglês

source_vocab = source_vectorization.get_vocabulary()
source_vocab

In [None]:
# Visualizar o vocabulário em cherokee
target_vocab = target_vectorization.get_vocabulary()
target_vocab

In [None]:
#Vocabulário se não utilizarmos MaxTokens

plt.bar(['Vocab ENG','Vocab CHE'], [len(source_vocab),len(target_vocab)], color ='maroon',
        width = 0.4)

In [None]:
#Vocabulário se utilizarmos MaxTokens

plt.bar(['Vocab ENG','Vocab CHE'], [len(source_vocab),len(target_vocab)], color ='maroon',
        width = 0.4)

In [None]:
# Creating a data pipeline to feed into the translator model
# Utiliza o objeto tf.data

batch_size = 64

def format_dataset(eng, ch):
    """Vectorize a batch of sentence pairs into model inputs and labels.

    Args:
        eng: batch of English sentences.
        ch: batch of Cherokee sentences.

    Returns:
        A ``(inputs, labels)`` tuple. ``inputs`` is a dict with the vectorized
        English batch under "english" and the Cherokee ids without their last
        position under "cherokee"; ``labels`` holds the Cherokee ids without
        their first position (the decoder input shifted by one).
    """
    source_ids = source_vectorization(eng)
    target_ids = target_vectorization(ch)
    model_inputs = {
        "english": source_ids,
        "cherokee": target_ids[:, :-1],
    }
    labels = target_ids[:, 1:]
    return (model_inputs, labels)

def make_dataset(pairs):
    """Build the batched, vectorized tf.data pipeline for a list of sentence pairs.

    Args:
        pairs: sequence of ``(english_sentence, cherokee_sentence)`` tuples.

    Returns:
        A dataset yielding the ``(inputs, labels)`` batches produced by
        `format_dataset`, with `batch_size` pairs per batch.
    """
    eng_texts, ch_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    ch_texts = list(ch_texts)
    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, ch_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset, num_parallel_calls=4)
    # Cache BEFORE shuffling: caching after `shuffle` stores the order of the
    # first epoch, so every later epoch would replay the same batch order.
    # With the cache first, the vectorized batches are computed once and are
    # reshuffled on every epoch.
    return dataset.cache().shuffle(2048).prefetch(16)

train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

In [None]:
# Visualizar o formato e alguns exemplos

for inputs, targets in train_ds.take(1):
    print(f"inputs['english'].shape: {inputs['english'].shape}")
    print(f"inputs['cherokee'].shape: {inputs['cherokee'].shape}")
    print(f"targets.shape: {targets.shape}")
    print(inputs['english'])


In [None]:
# Criação da classe que modela o Encoder 

# Na criação do objeto recebe 
# embed_dim: Dimensão da sequência de input 
# dense_dim: Número de nós da camada Dense
# num_heads: Número de attention heads

class TransformerEncoder(layers.Layer):
    """Transformer encoder block: self-attention followed by a two-layer dense projection.

    Both sub-layers are wrapped in a residual connection followed by layer
    normalization.

    Args:
        embed_dim: used as `key_dim` of the attention layer and as the width of
            the final dense layer.
        dense_dim: number of units of the hidden (ReLU) dense layer.
        num_heads: number of attention heads.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        # Hyper-parameters are kept so get_config() can report them.
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        hidden_layer = layers.Dense(dense_dim, activation="relu")
        output_layer = layers.Dense(embed_dim)
        self.dense_proj = keras.Sequential([hidden_layer, output_layer])
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        """Run the block on `inputs`; `mask`, when given, gains a new axis 1 and is used as attention mask."""
        attention_mask = None if mask is None else mask[:, tf.newaxis, :]
        attended = self.attention(
            inputs, inputs, attention_mask=attention_mask)
        normalized = self.layernorm_1(inputs + attended)
        projected = self.dense_proj(normalized)
        return self.layernorm_2(normalized + projected)

    def get_config(self):
        """Return the base layer config extended with this block's hyper-parameters."""
        layer_config = super().get_config()
        layer_config.update(
            embed_dim=self.embed_dim,
            num_heads=self.num_heads,
            dense_dim=self.dense_dim,
        )
        return layer_config

In [None]:
# Criação da classe que modela o Decoder 

# Na criação do objeto recebe 
# embed_dim: Dimensão da sequência de input 
# dense_dim: Número de nós da camada Dense
# num_heads: Número de attention heads

class TransformerDecoder(layers.Layer):
    """Transformer decoder block.

    Applies causal self-attention over the target sequence, then attention over
    the encoder outputs, then a two-layer dense projection. Each of the three
    sub-layers is followed by a residual connection and layer normalization.

    Args:
        embed_dim: used as `key_dim` of both attention layers and as the width
            of the final dense layer.
        dense_dim: number of units of the hidden (ReLU) dense layer.
        num_heads: number of attention heads.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"),
             layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True

    def get_config(self):
        """Return the base layer config extended with this block's hyper-parameters."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        })
        return config

    def get_causal_attention_mask(self, inputs):
        """Return an int32 lower-triangular mask of shape (batch, seq_len, seq_len).

        Entry ``[b, i, j]`` is 1 when ``i >= j`` and 0 otherwise, so position
        ``i`` may only attend to positions up to and including itself.
        """
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        # Repeat the single (1, seq_len, seq_len) mask once per batch element.
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1),
             tf.constant([1, 1], dtype=tf.int32)], axis=0)
        return tf.tile(mask, mult)

    def call(self, inputs, encoder_outputs, mask=None):
        """Run the decoder block.

        Args:
            inputs: embedded target sequence.
            encoder_outputs: output of the encoder, used as key and value of
                the second attention layer.
            mask: optional mask for `inputs`. When given it is combined with
                the causal mask and passed to the second attention layer; when
                omitted that layer runs unmasked.

        Returns:
            The normalized output of the dense projection sub-layer.
        """
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(
                mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)
        else:
            # Without this branch `padding_mask` is never assigned and the
            # second attention call below raises UnboundLocalError.
            padding_mask = None
        # NOTE(review): `padding_mask` is derived from the target sequence but
        # is applied to attention over `encoder_outputs`; this presumably only
        # lines up because source and decoder-input lengths are equal in this
        # notebook. Confirm before changing either sequence length.
        attention_output_1 = self.attention_1(
            query=inputs,
            value=inputs,
            key=inputs,
            attention_mask=causal_mask)
        attention_output_1 = self.layernorm_1(inputs + attention_output_1)
        attention_output_2 = self.attention_2(
            query=attention_output_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        attention_output_2 = self.layernorm_2(
            attention_output_1 + attention_output_2)
        proj_output = self.dense_proj(attention_output_2)
        return self.layernorm_3(attention_output_2 + proj_output)

In [None]:
class PositionalEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        sequence_length: number of distinct positions the position embedding
            table holds.
        input_dim: size of the token vocabulary.
        output_dim: dimension of the embedding vectors.
    """

    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.token_embeddings = layers.Embedding(
            input_dim=input_dim, output_dim=output_dim)
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim)

    def call(self, inputs):
        """Embed the token ids in `inputs` and add the embedding of each position."""
        current_length = tf.shape(inputs)[-1]
        position_ids = tf.range(start=0, limit=current_length, delta=1)
        token_vectors = self.token_embeddings(inputs)
        position_vectors = self.position_embeddings(position_ids)
        return token_vectors + position_vectors

    def compute_mask(self, inputs, mask=None):
        """Mark every position whose token id is not 0 as valid."""
        return tf.math.not_equal(inputs, 0)

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        layer_config = super().get_config()
        layer_config.update(
            output_dim=self.output_dim,
            sequence_length=self.sequence_length,
            input_dim=self.input_dim,
        )
        return layer_config

In [None]:
# The complete Transformer

import keras

# Settings 

# Hyper-parameters shared by the encoder and decoder blocks.
embed_dim = 256
dense_dim = 2048
num_heads = 8

# Encoder branch: English token ids -> positional embedding -> encoder block.
encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="english")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)

# Decoder branch: Cherokee token ids -> positional embedding -> decoder block,
# which also receives the encoder outputs.
decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="cherokee")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, dense_dim, num_heads)(x, encoder_outputs)

# Softmax over the vocabulary for every target position.
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)

# The input names ("english", "cherokee") match the dict keys returned by format_dataset.
transformer = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [None]:
#Define the callback that stops training when overfitting is detected
# (the monitored quantity is left at the Keras default; the best weights are restored)
early_stopping_cb = keras.callbacks.EarlyStopping(
    patience=8, restore_best_weights=True)

#Compile the transformer
transformer.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"])

#Train the transformer
transformer.fit(train_ds, epochs=40, validation_data=val_ds, callbacks=[early_stopping_cb])

In [None]:
#Import Bleu e Sacrebleu metrics (Vai ser usado Bleu Score e CHRF)
#Ref Bleu: https://towardsdatascience.com/nlp-metrics-made-simple-the-bleu-score-b06b14fbdbc1
#Ref Sacrebleu - CHRF: https://huggingface.co/spaces/evaluate-metric/chrf

from nltk.translate.bleu_score import sentence_bleu
!pip install sacrebleu
from sacrebleu.metrics import BLEU, CHRF, TER


In [None]:
bleu = BLEU()
chrf = CHRF()

In [None]:
# Testar o desempenho do Transformer em frases do conjunto de teste
# Métricas Bleu e CHRF são utilizadas

import numpy as np
ch_vocab = target_vectorization.get_vocabulary()
ch_index_lookup = dict(zip(range(len(ch_vocab)), ch_vocab))
max_decoded_sentence_length = 20
medium_Bleu_Score = 0
medium_chrf_Score = 0

def decode_sequence(input_sentence):
    """Greedily translate one English sentence with the trained transformer.

    Starting from "[start]", the partial translation is re-vectorized at every
    step, the model is run, and the most probable token at the current position
    is appended. Decoding stops when "[end]" is produced or after
    `max_decoded_sentence_length` steps.

    Args:
        input_sentence: the English sentence to translate.

    Returns:
        The decoded sentence, including the leading "[start]" marker and the
        "[end]" marker when it was produced.
    """
    source_tokens = source_vectorization([input_sentence])
    decoded_sentence = "[start]"
    for step in range(max_decoded_sentence_length):
        target_tokens = target_vectorization([decoded_sentence])[:, :-1]
        step_scores = transformer([source_tokens, target_tokens])[0, step, :]
        next_token = ch_index_lookup[np.argmax(step_scores)]
        decoded_sentence = decoded_sentence + " " + next_token
        if next_token == "[end]":
            return decoded_sentence
    return decoded_sentence

#Model evaluation on randomly chosen test sentences
num_eval_sentences = 50

for _ in range(num_eval_sentences):
    test = random.choice(test_pairs)
    input_sentence = test[0]
    # Decode once and reuse the result for both the metrics and the printout.
    translation = decode_sequence(input_sentence)
    ref = [test[1]]
    # sentence_bleu expects token lists: passing raw strings makes it iterate
    # over characters and report a character-level score instead of BLEU.
    # NOTE(review): the reference keeps its original casing/punctuation while
    # the hypothesis is built from the standardized vocabulary - consider
    # normalizing both sides the same way before scoring.
    score_bleu = sentence_bleu([r.split() for r in ref], translation.split())
    # Read the numeric value directly instead of parsing the printed score.
    score_chrf = chrf.corpus_score([translation], [ref]).score
    medium_Bleu_Score += score_bleu
    medium_chrf_Score += score_chrf
    print(f"Frase a traduzir: {input_sentence}")
    print("previsão: " + translation)
    print("Real: " + ref[0])
    print("Bleu Score: " + str(score_bleu))
    print("CHRF Score: " + str(score_chrf) + '\n')

print("Média final de Bleu Score: " + str(medium_Bleu_Score / num_eval_sentences))
print("Média final de CHRF Score: " + str(medium_chrf_Score / num_eval_sentences))

In [None]:
# Testar o desempenho do transformer em frases introduzidas pelo utilizador
# Especificar 5 frases em inglês e verificar a qualidade da tradução

for _ in range(5):
    input_sentence = input()
    translation = decode_sequence(input_sentence)
    print("-")
    print(input_sentence)
    print(f"Frase traduzida: {translation}")

# Tarefa 2

Utilizar modelos pré-treinados para traduzir

## 1ª Abordagem - Bert pre-trained model



Baseado no código presente em: https://github.com/vivekgohel56/Neural-machine-translation-english-to-polish

Foi uma tentativa de implementar uma arquitetura de referência na área de NLP e de a adaptar ao nosso problema; porém, não tivemos em consideração o elevado número de parâmetros (algo em que só iríamos reparar mais tarde, durante o treino).

In [None]:
#A versão 2.2.0 é necessária devido a certas especificações das bibliotecas e respetivas funções usadas abaixo

!pip uninstall tensorflow --yes
!pip install tensorflow==2.2.0

In [None]:
#Instalar pacotes e importar libraries

!pip install bert-for-tf2
!pip install numpy==1.19.5

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from bert import BertModelLayer
from bert.loader import StockBertConfig, load_stock_weights

In [None]:
#Conectar ao drive

from google.colab import drive
drive.mount('/content/drive')

In [None]:
#Download do modelo se não existir
#https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip

if not os.path.exists('/content/drive/My Drive/machine translation/uncased_L-12_H-768_A-12'):
  !wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip 
  !unzip uncased_L-12_H-768_A-12 

In [None]:
# Load the raw ';'-separated corpus (two columns: English sentence, Cherokee sentence).
uncleaned_data_list = pd.read_csv('/content/drive/MyDrive/Ch_En_Train_complete.csv',delimiter=';')

# One (english, cherokee) tuple per CSV row.
text_pairs = [
    (english, cherokee)
    for english, cherokee in uncleaned_data_list.itertuples(index=False, name=None)
]


# Parallel lists with the sentences of each language.
english_word = [english for english, _ in text_pairs]
cherokee_word = [cherokee for _, cherokee in text_pairs]

In [None]:
data = pd.DataFrame(columns=['English','Cherokee'])
data['English'] = english_word
data['Cherokee'] = cherokee_word

In [None]:
data

In [None]:
# Vamos utilizar 80% para treino e o restante para teste
train = int(len(data)*0.8)
test = len(data) - train
train_examples, val_examples = data.iloc[0:train,:], data.iloc[train:len(data),:]

In [None]:
#criar listas necessárias para treino e teste

english_text = train_examples['English'].values
cherokee_text = train_examples['Cherokee'].values
english_val_text = val_examples['English'].values
cherokee_val_text = val_examples['Cherokee'].values

In [None]:
#transformar para datasets do tensorflow

train_examples = tf.data.Dataset.from_tensor_slices((english_text, cherokee_text))
val_examples = tf.data.Dataset.from_tensor_slices((english_val_text, cherokee_val_text))

In [None]:
type(train_examples)

In [None]:
print(train_examples)

In [None]:
#Exemplo

for en, ch in train_examples.take(1):
  print(tf.compat.as_text(en.numpy()))
  print(tf.compat.as_text(ch.numpy()))

In [None]:
import collections
import unicodedata

#Diferentes funções utilizadas no pré-processamento

def convert_to_unicode(text):
    """Return `text` as `str`, decoding `bytes` as UTF-8 (undecodable bytes are dropped).

    Raises:
        ValueError: if `text` is neither `str` nor `bytes`.
    """
    if isinstance(text, bytes):
        return text.decode("utf-8", "ignore")
    if not isinstance(text, str):
        raise ValueError("Unsupported string type: %s" % (type(text)))
    return text


def load_vocab(vocab_file):
    """Read a vocabulary file (one token per line) into an ordered token -> index dict.

    The index of a token is its zero-based line number; surrounding whitespace
    is stripped from every token.
    """
    vocab = collections.OrderedDict()
    with tf.io.gfile.GFile(vocab_file, "r") as reader:
        line_number = 0
        line = convert_to_unicode(reader.readline())
        # readline() returns an empty string only at end of file.
        while line:
            vocab[line.strip()] = line_number
            line_number += 1
            line = convert_to_unicode(reader.readline())
    return vocab


def whitespace_tokenize(text):
    """Strip `text` and split it on whitespace; blank input yields an empty list."""
    stripped = text.strip()
    return stripped.split() if stripped else []

def convert_by_vocab(vocab, items):
    """Map every element of `items` through `vocab` (tokens -> ids or ids -> tokens)."""
    return [vocab[item] for item in items]

class FullTokenizer(object):
    """End-to-end tokenizer: basic tokenization followed by WordPiece splitting."""

    def __init__(self, vocab_file, do_lower_case=True):
        """Load the vocabulary from `vocab_file` and build both tokenization stages."""
        self.vocab = load_vocab(vocab_file)
        # Reverse mapping (id -> token) used by convert_ids_to_tokens.
        self.inv_vocab = {index: token for token, index in self.vocab.items()}
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

    def tokenize(self, text):
        """Return the word pieces of `text`, in order."""
        return [
            piece
            for word in self.basic_tokenizer.tokenize(text)
            for piece in self.wordpiece_tokenizer.tokenize(word)
        ]

    def convert_tokens_to_ids(self, tokens):
        """Look up the vocabulary id of every token."""
        return convert_by_vocab(self.vocab, tokens)

    def convert_ids_to_tokens(self, ids):
        """Look up the token of every vocabulary id."""
        return convert_by_vocab(self.inv_vocab, ids)


class BasicTokenizer(object):
    """First tokenization pass: text cleanup, optional lower-casing and punctuation splitting."""

    # Inclusive code point ranges that `_is_chinese_char` treats as CJK characters.
    _CJK_RANGES = (
        (0x4E00, 0x9FFF),
        (0x3400, 0x4DBF),
        (0x20000, 0x2A6DF),
        (0x2A700, 0x2B73F),
        (0x2B740, 0x2B81F),
        (0x2B820, 0x2CEAF),
        (0xF900, 0xFAFF),
        (0x2F800, 0x2FA1F),
    )

    def __init__(self, do_lower_case=True):
        """Create the tokenizer.

        Args:
          do_lower_case: when true, tokens are lower-cased and stripped of accents.
        """
        self.do_lower_case = do_lower_case

    def tokenize(self, text):
        """Split `text` into basic tokens (words and individual punctuation marks)."""
        cleaned = self._clean_text(convert_to_unicode(text))
        # CJK characters are surrounded with spaces so each one becomes its own token.
        spaced = self._tokenize_chinese_chars(cleaned)

        pieces = []
        for word in whitespace_tokenize(spaced):
            if self.do_lower_case:
                word = self._run_strip_accents(word.lower())
            pieces.extend(self._run_split_on_punc(word))

        return whitespace_tokenize(" ".join(pieces))

    def _run_strip_accents(self, text):
        """Remove combining marks (Unicode category "Mn") after NFD normalization."""
        decomposed = unicodedata.normalize("NFD", text)
        return "".join(
            char for char in decomposed if unicodedata.category(char) != "Mn")

    def _run_split_on_punc(self, text):
        """Split `text` so that every punctuation character is a piece of its own."""
        pieces = []
        begin_piece = True
        for char in text:
            if _is_punctuation(char):
                pieces.append([char])
                begin_piece = True
                continue
            if begin_piece:
                pieces.append([])
                begin_piece = False
            pieces[-1].append(char)

        return ["".join(piece) for piece in pieces]

    def _tokenize_chinese_chars(self, text):
        """Surround every CJK character with single spaces."""
        return "".join(
            (" " + char + " ") if self._is_chinese_char(ord(char)) else char
            for char in text
        )

    def _is_chinese_char(self, cp):
        """Tell whether code point `cp` falls inside one of the CJK ranges.

        Only the CJK Unicode blocks are covered. Hangul, Hiragana and Katakana
        live in other blocks and are written with spaces between words, so they
        are handled like any other script.
        """
        return any(low <= cp <= high for low, high in self._CJK_RANGES)

    def _clean_text(self, text):
        """Drop NUL, U+FFFD and control characters; turn every whitespace character into a space."""
        kept = []
        for char in text:
            code_point = ord(char)
            if code_point in (0, 0xfffd) or _is_control(char):
                continue
            kept.append(" " if _is_whitespace(char) else char)
        return "".join(kept)


class WordpieceTokenizer(object):
    """Splits words into WordPiece sub-tokens using a fixed vocabulary."""

    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """Break `text` into word pieces with greedy longest-match-first lookup.

        For example:
          input = "unaffable"
          output = ["un", "##aff", "##able"]

        Args:
          text: a single token or whitespace-separated tokens, expected to have
            gone through `BasicTokenizer` already.

        Returns:
          A list of word pieces. A word that is longer than
          `max_input_chars_per_word`, or that cannot be fully covered by
          vocabulary entries, is replaced by `unk_token`.
        """
        pieces_out = []
        for word in whitespace_tokenize(convert_to_unicode(text)):
            if len(word) > self.max_input_chars_per_word:
                pieces_out.append(self.unk_token)
                continue

            word_pieces = self._longest_match_pieces(word)
            if word_pieces is None:
                pieces_out.append(self.unk_token)
            else:
                pieces_out.extend(word_pieces)
        return pieces_out

    def _longest_match_pieces(self, word):
        """Return the greedy word-piece split of `word`, or None when some part has no match."""
        pieces = []
        word_length = len(word)
        start = 0
        while start < word_length:
            match = None
            # Try the longest remaining substring first and shrink it from the right.
            for end in range(word_length, start, -1):
                candidate = word[start:end]
                if start > 0:
                    # Pieces that continue a word carry the "##" prefix.
                    candidate = "##" + candidate
                if candidate in self.vocab:
                    match = candidate
                    break
            if match is None:
                return None
            pieces.append(match)
            start = end
        return pieces

def _is_whitespace(char):
    """Checks whether `chars` is a whitespace character."""
    # \t, \n, and \r are technically contorl characters but we treat them
    # as whitespace since they are generally considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":
        return True
    return False


def _is_control(char):
    """Checks whether `chars` is a control character."""
    # These are technically control characters but we count them as whitespace
    # characters.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    cat = unicodedata.category(char)
    if cat.startswith("C"):
        return True
    return False


def _is_punctuation(char):
    """Checks whether `chars` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyways, for
    # consistency.
    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
            (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False

In [None]:
#!pip install tensorflow-datasets
import tensorflow_datasets as tfds

#Load the Cherokee subword vocabulary from file when it exists; otherwise build
#it from the training sentences and save it (in our case the second option applies).
#Once the tokenizer is available a sample string is encoded - see the printout below.

vocab_file = '/content/drive/My Drive/machine translation/vocab_ch.txt'
if os.path.isfile(vocab_file + '.subwords'):
  # Same `tfds.deprecated.text` namespace as the build branch below.
  tokenizer_ch = tfds.deprecated.text.SubwordTextEncoder.load_from_file(vocab_file)
else: 
  tokenizer_ch = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
      (cherokee_text), target_vocab_size=2 ** 13)
  # Save under the same prefix that is checked above; saving to a different
  # location means the cached vocabulary is never found on the next run.
  os.makedirs(os.path.dirname(vocab_file), exist_ok=True)
  tokenizer_ch.save_to_file(vocab_file)

# Sample string used only to show how text is broken into subwords.
sample_string = 'Transformer jest niesamowity.'
tokenized_string = tokenizer_ch.encode(sample_string)
for ts in tokenized_string:
  print ('{} ----> {}'.format(ts, tokenizer_ch.decode([ts])))

In [None]:
#O tokenizer para inglês é criado a partir do vocab presente no bert model

tokenizer_en = FullTokenizer(
    vocab_file= 'uncased_L-12_H-768_A-12/vocab.txt',
    do_lower_case=True)

test_tokens = tokenizer_en.tokenize(english_text[-1])
test_ids = tokenizer_en.convert_tokens_to_ids(['[CLS]'] + test_tokens + ['[SEP]'])
print(test_ids)
print(tokenizer_en.convert_ids_to_tokens(test_ids))

In [None]:
#Definição do encoder onde uma sequência de palavras pode ter no máximo 50 palavras

MAX_SEQ_LENGTH = 50


def encode(en, ch, seq_length=MAX_SEQ_LENGTH):
  """Turn one (English, Cherokee) sentence pair into two lists of token ids.

  The English ids are the BERT ids of "[CLS]" + word pieces + "[SEP]". The
  Cherokee ids are the subword ids wrapped with `tokenizer_ch.vocab_size` as
  start id and `tokenizer_ch.vocab_size + 1` as end id. Lists shorter than
  `seq_length` are right-padded with zeros; longer lists are returned as they are.

  Args:
    en: tensor holding the English sentence.
    ch: tensor holding the Cherokee sentence.
    seq_length: minimum length of the returned id lists.

  Returns:
    The pair ``(english_ids, cherokee_ids)``.
  """
  def _right_pad(ids):
    missing = seq_length - len(ids)
    if missing > 0:
      return ids + list(np.zeros(missing, 'int32'))
    return ids

  english_pieces = tokenizer_en.tokenize(tf.compat.as_text(en.numpy()))
  lang1 = _right_pad(
      tokenizer_en.convert_tokens_to_ids(['[CLS]'] + english_pieces + ['[SEP]']))

  start_id = tokenizer_ch.vocab_size
  cherokee_ids = tokenizer_ch.encode(tf.compat.as_text(ch.numpy()))
  end_id = tokenizer_ch.vocab_size + 1
  lang2 = _right_pad([start_id] + cherokee_ids + [end_id])

  return lang1, lang2

In [None]:
#Encode function que vai chamar a função acima

def tf_encode(en, ch):
  """Wrap `encode` in `tf.py_function` so it can be used inside `Dataset.map`.

  Returns:
    Two int32 tensors (English ids, Cherokee ids) whose static shape is set to
    a vector of unknown length.
  """
  encoded_pair = tf.py_function(encode, [en, ch], [tf.int32, tf.int32])
  result_en, result_ch = encoded_pair
  for encoded in (result_en, result_ch):
    encoded.set_shape([None])

  return result_en, result_ch

In [None]:
#Filtro para selecionar as frases (sequências de tokens) cujo comprimento não excede MAX_SEQ_LENGTH.

def filter_max_length(x, y, max_length=MAX_SEQ_LENGTH):
  """Keep a pair only when both encoded sequences have at most `max_length` elements."""
  x_fits = tf.size(x) <= max_length
  y_fits = tf.size(y) <= max_length
  return tf.logical_and(x_fits, y_fits)

In [None]:
#Create the datasets with the pre-processing already applied.

BUFFER_SIZE = 40000
BATCH_SIZE = 64

# Training pipeline: encode -> drop over-long pairs -> cache -> shuffle -> pad/batch -> prefetch.
train_dataset = train_examples.map(tf_encode)
train_dataset = train_dataset.filter(filter_max_length)

# cache the dataset to memory to get a speedup while reading from it.
train_dataset = train_dataset.cache()
train_dataset = train_dataset.shuffle(BUFFER_SIZE).padded_batch(
    BATCH_SIZE, padded_shapes=([-1], [-1]), drop_remainder=True)
train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)

# Validation pipeline: same encoding helper as training, so the encoded tensors
# carry the static shape that tf_encode sets (the raw py_function outputs do
# not); no shuffling and no dropped remainder.
val_dataset = val_examples.map(tf_encode)
val_dataset = val_dataset.filter(filter_max_length)
val_dataset = val_dataset.padded_batch(BATCH_SIZE, padded_shapes=([-1], [-1]))

In [None]:
#Função auxiliar para o encoding posicional

def get_angles(pos, i, d_model):
  """Return ``pos / 10000 ** (2 * (i // 2) / d_model)``, the angles of the positional encoding.

  Args:
    pos: position indices.
    i: embedding dimension indices; consecutive pairs (2k, 2k+1) share a rate.
    d_model: embedding dimension, used to scale the exponent.
  """
  exponent = (2 * (i // 2)) / np.float32(d_model)
  return pos * (1 / np.power(10000, exponent))

In [None]:
# "codificação posicional" é adicionada para fornecer ao modelo algumas informações sobre a posição relativa dos tokens na frase.

def positional_encoding(position, d_model):
    """Build the sinusoidal positional encoding as a float32 tensor of shape (1, position, d_model).

    The sines of the even-indexed angle columns come first along the last
    axis, followed by the cosines of the odd-indexed columns (the two halves
    are concatenated, not interleaved).
    """
    positions = np.arange(position)[:, np.newaxis]
    dimensions = np.arange(d_model)[np.newaxis, :]
    angle_rads = get_angles(positions, dimensions, d_model)

    # sin on the even columns (2i), cos on the odd columns (2i+1)
    sine_half = np.sin(angle_rads[:, 0::2])
    cosine_half = np.cos(angle_rads[:, 1::2])

    encoding = np.concatenate([sine_half, cosine_half], axis=-1)

    # Leading axis of size 1 so the encoding broadcasts over a batch.
    return tf.cast(encoding[np.newaxis, ...], dtype=tf.float32)

In [None]:
#TODO - This was added - needs to be tested!

#Just an illustrative plot of the positional encoding

# After the positional encoding is added, tokens end up closer to each other
# based on the similarity of their meaning and on their position in the sentence, in the d-dimensional space.

n, d = 2048, 512
pos_encoding = positional_encoding(n, d)
print(pos_encoding.shape)
# Drop the leading axis of size 1 added by positional_encoding.
pos_encoding = pos_encoding[0]

# Juggle the dimensions for the plot
pos_encoding = tf.reshape(pos_encoding, (n, d//2, 2))
pos_encoding = tf.transpose(pos_encoding, (2, 1, 0))
pos_encoding = tf.reshape(pos_encoding, (d, n))

plt.pcolormesh(pos_encoding, cmap='RdBu')
plt.ylabel('Depth')
plt.xlabel('Position')
plt.colorbar()
plt.show()

In [None]:
#Criar uma máscara para ser aplicada nos tokens

def create_padding_mask(seq):
  """Mark the padding positions (token id 0) of a batch of sequences.

  Returns a float32 tensor of shape (batch_size, 1, 1, seq_len) holding 1.0
  where `seq` equals 0 and 0.0 elsewhere; the two singleton axes let it
  broadcast against the attention logits.
  """
  is_padding = tf.math.equal(seq, 0)
  mask = tf.cast(is_padding, tf.float32)
  return mask[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

In [None]:
#Criar uma máscara por antecipação - usada para mascarar os tokens futuros em uma sequência. Em outras palavras, a máscara indica quais entradas não devem ser usadas.

def create_look_ahead_mask(size):
    """Return a (size, size) mask with 1.0 strictly above the diagonal.

    Entry [i, j] is 1.0 when j > i (a future position to be hidden) and
    0.0 otherwise.
    """
    lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return 1 - lower_triangle  # (seq_len, seq_len)

In [None]:
# Função utilizada para calcular os pesos da atenção. Tem três entradas: Q (consulta), K (chave), V (valor).

def scaled_dot_product_attention(q, k, v, mask=None):
    """Calculate the attention weights and the attended values.

    q, k, v must have matching leading dimensions.
    k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
    The mask has different shapes depending on its type (padding or look ahead)
    but it must be broadcastable for addition.

    Args:
      q: query shape == (..., seq_len_q, depth)
      k: key shape == (..., seq_len_k, depth)
      v: value shape == (..., seq_len_v, depth_v)
      mask: Float tensor with shape broadcastable
            to (..., seq_len_q, seq_len_k), holding 1.0 at the positions to
            hide. Defaults to None (no masking).

    Returns:
      output, attention_weights
    """

    matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)

    # scale matmul_qk by sqrt(depth of the keys)
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    # masked positions get a large negative logit so that softmax sends
    # their weight to (almost) zero.
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)

    # softmax is normalized on the last axis (seq_len_k) so that the scores
    # add up to 1.
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)

    output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

    return output, attention_weights

In [None]:
#Cria um sistema de atenção multi-head -  permite que o modelo atenda conjuntamente a informações de diferentes subespaços de representação em diferentes posições.

class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head attention built on `scaled_dot_product_attention`.

    Queries, keys and values are each projected to `d_model` by a Dense
    layer, split into `num_heads` heads of size `d_model // num_heads`,
    attended independently, merged back and passed through a final Dense
    layer.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        # every head must receive an equal slice of the model width
        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        # input projections for queries, keys and values
        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        # output projection applied after the heads are merged
        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Reshape (batch_size, seq_len, d_model) into
        (batch_size, num_heads, seq_len, depth).
        """
        per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        """Attend from `q` over `k`/`v`.

        Returns:
            (output, attention_weights) with
            output.shape == (batch_size, seq_len_q, d_model) and
            attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k).
        """
        batch_size = tf.shape(q)[0]

        # project to d_model, then split: (batch_size, num_heads, seq_len, depth)
        q = self.split_heads(self.wq(q), batch_size)
        k = self.split_heads(self.wk(k), batch_size)
        v = self.split_heads(self.wv(v), batch_size)

        attended, attention_weights = scaled_dot_product_attention(q, k, v, mask)

        # (batch_size, num_heads, seq_len_q, depth) -> (batch_size, seq_len_q, num_heads, depth)
        heads_last = tf.transpose(attended, perm=[0, 2, 1, 3])

        # merge the heads back into a single d_model-wide vector per position
        merged = tf.reshape(heads_last, (batch_size, -1, self.d_model))

        return self.dense(merged), attention_weights

In [None]:
# For every position of the query sequence `q`, MultiHeadAttention runs all 8 attention heads
# over every position of `y` (used as both keys and values), returning a new d_model-wide vector per query position.

temp_mha = MultiHeadAttention(d_model=512, num_heads=8)
y = tf.random.uniform((1, 60, 768))  # (batch_size, encoder_sequence, 768): last dim differs from d_model; wk/wv project it to 512
q = tf.random.uniform((1, 60, 512))  # (batch_size, encoder_sequence, d_model)
out, attn = temp_mha(y, k=y, q=q, mask=None)
out.shape, attn.shape

In [None]:
# A rede de feed forward pontual consiste em duas camadas totalmente conectadas com uma ativação ReLU entre elas.

def point_wise_feed_forward_network(d_model, dff):
    """Return a two-layer feed-forward block: Dense(dff, relu) -> Dense(d_model)."""
    expand = tf.keras.layers.Dense(dff, activation='relu')  # (batch_size, seq_len, dff)
    project = tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
    return tf.keras.Sequential([expand, project])

In [None]:
#Função para construir o encoder

def build_encoder(config_file):
    """Create a BERT encoder layer (named "bert") from a stock BERT JSON config.

    Args:
        config_file: path to the `bert_config.json` file, read through
            tf.io.gfile.

    Returns:
        A `BertModelLayer` built from the parameters in the config; no
        weights are loaded here.
    """
    with tf.io.gfile.GFile(config_file, "r") as reader:
        config_json = reader.read()

    stock_params = StockBertConfig.from_json_string(config_json)
    layer_params = stock_params.to_bert_model_layer_params()
    return BertModelLayer.from_params(layer_params, name="bert")

In [None]:
# Decoder layer - masked multi-head self-attention, multi-head attention over
# the encoder output, and a point-wise feed-forward network.

class DecoderLayer(tf.keras.layers.Layer):
    """One decoder block: three sub-layers, each followed by dropout, a
    residual connection and layer normalization.
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super().__init__()

        # mha1: self-attention over the target; mha2: attention over the encoder output
        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)

        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        """Run the block.

        Args:
            x: decoder input, (batch_size, target_seq_len, d_model).
            enc_output: encoder output, (batch_size, input_seq_len, ...).
            training: forwarded to the dropout layers.
            look_ahead_mask: mask for the self-attention sub-layer.
            padding_mask: mask for the encoder-decoder attention sub-layer.

        Returns:
            (output, self_attention_weights, encoder_attention_weights);
            output has shape (batch_size, target_seq_len, d_model).
        """
        # 1) masked self-attention over the target sequence
        self_attended, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)
        self_attended = self.dropout1(self_attended, training=training)
        out1 = self.layernorm1(self_attended + x)

        # 2) attention over the encoder output, queried by the result of step 1
        cross_attended, attn_weights_block2 = self.mha2(
            enc_output, enc_output, out1, padding_mask)
        cross_attended = self.dropout2(cross_attended, training=training)
        out2 = self.layernorm2(cross_attended + out1)

        # 3) position-wise feed-forward network
        transformed = self.ffn(out2)
        transformed = self.dropout3(transformed, training=training)
        out3 = self.layernorm3(transformed + out2)

        return out3, attn_weights_block1, attn_weights_block2

In [None]:
# Smoke test: run one decoder layer on random tensors and inspect the output shape.
sample_decoder_layer = DecoderLayer(512, 8, 2048)
# Last dim is 768 rather than d_model (512); the wk/wv Dense layers of mha2 project it to d_model.
sample_encoder_output = tf.random.uniform((64, 128, 768))

# Positional arguments after the encoder output: training=False, look_ahead_mask=None, padding_mask=None
sample_decoder_layer_output, _, _ = sample_decoder_layer(
    tf.random.uniform((64, 50, 512)), sample_encoder_output,
    False, None, None)

sample_decoder_layer_output.shape  # (batch_size, target_seq_len, d_model)

In [None]:
# Decoder - embeds the target tokens, adds the positional encoding and runs
# the result through a stack of `num_layers` DecoderLayer blocks. Each block has:
#   - masked multi-head self-attention (look-ahead mask and padding mask)
#   - multi-head attention over the encoder output (padding mask)
#   - a point-wise feed-forward network

class Decoder(tf.keras.layers.Layer):
    """Stack of `DecoderLayer` blocks on top of a token embedding.

    Args:
        num_layers: number of decoder blocks.
        d_model: embedding / model width.
        num_heads: attention heads per block.
        dff: hidden width of the feed-forward networks.
        target_vocab_size: size of the target vocabulary (embedding rows).
        rate: dropout rate.
        maximum_position_encoding: number of positions in the positional
            encoding table. Defaults to None, which keeps the previous
            behaviour of sizing the table by `target_vocab_size`. Pass the
            maximum target sequence length to avoid building a table with
            one row per vocabulary entry.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size,
                 rate=0.1, maximum_position_encoding=None):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        # Backward-compatible default: the table used to be sized by the vocabulary.
        if maximum_position_encoding is None:
            maximum_position_encoding = target_vocab_size

        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, self.d_model)

        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training,
             look_ahead_mask, padding_mask):
        """Decode a batch of target token ids.

        Returns:
            (x, attention_weights): x has shape
            (batch_size, target_seq_len, d_model); attention_weights maps
            'decoder_layer{i}_block1' / 'decoder_layer{i}_block2' to the
            attention weights of each block (i starts at 1).
        """
        seq_len = tf.shape(x)[1]
        attention_weights = {}

        x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
        # scale the embeddings by sqrt(d_model) before adding the positional signal
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x, block1, block2 = self.dec_layers[i](x, enc_output, training,
                                                   look_ahead_mask, padding_mask)

            attention_weights['decoder_layer{}_block1'.format(i + 1)] = block1
            attention_weights['decoder_layer{}_block2'.format(i + 1)] = block2

        # x.shape == (batch_size, target_seq_len, d_model)
        return x, attention_weights

In [None]:
import numpy as np

# Smoke test: a 2-layer decoder fed with random inputs.
sample_decoder = Decoder(num_layers=2, d_model=512, num_heads=8, 
                         dff=2048, target_vocab_size=8000)

# Reuses `sample_encoder_output` created in the DecoderLayer smoke-test cell.
output, attn = sample_decoder(tf.random.uniform((64, 26)), 
                              enc_output=sample_encoder_output, 
                              training=False, look_ahead_mask=None, 
                              padding_mask=None)

output.shape, attn['decoder_layer2_block2'].shape

In [None]:
#Criar uma classe para configuração que é depois usada para definir particularidades do transformer

class Config(object):
  """Plain container for the decoder hyper-parameters.

  Attributes: num_layers, d_model, dff, num_heads (stored as given).
  """

  def __init__(self, num_layers, d_model, dff, num_heads):
    self.num_layers, self.d_model = num_layers, d_model
    self.dff, self.num_heads = dff, num_heads

In [None]:
#Criar o transformer

from bert.loader import map_to_stock_variable_name
# /content/drive/My Drive/machine translation/transformer/bert
class Transformer(tf.keras.Model):
  """Translation model: a BERT encoder (built by `build_encoder`) followed by
  the `Decoder` stack and a Dense projection to the target vocabulary.
  """

  def __init__(self, config,
               target_vocab_size, 
               bert_config_file,
               bert_training=False, 
               rate=0.1,
               name='transformer'):
      """Build the encoder, decoder and output projection.

      Args:
          config: object exposing num_layers, d_model, num_heads and dff.
          target_vocab_size: size of the decoder vocabulary / output logits.
          bert_config_file: path to the BERT JSON config given to `build_encoder`.
          bert_training: value assigned to `encoder.trainable`
              (False freezes the encoder weights).
          rate: dropout rate forwarded to the decoder.
          name: Keras model name.
      """
      super(Transformer, self).__init__(name=name)

      self.encoder = build_encoder(config_file=bert_config_file)
      self.encoder.trainable = bert_training

      self.decoder = Decoder(config.num_layers, config.d_model, 
                             config.num_heads, config.dff, target_vocab_size, rate)

      self.final_layer = tf.keras.layers.Dense(target_vocab_size)

  # Load the weights
  def load_stock_weights(self, bert: BertModelLayer, ckpt_file):
      """Copy the tensors of a stock BERT checkpoint into `bert`.

      Each weight of `bert` is mapped to its checkpoint name with
      `map_to_stock_variable_name`; the values are collected in the order of
      `bert.weights` and applied with `set_weights`.

      Raises:
          AssertionError: if `bert` is not a BertModelLayer or the
              checkpoint does not exist.
          ValueError: if a weight has no matching tensor in the checkpoint.
      """
      assert isinstance(bert, BertModelLayer), "Expecting a BertModelLayer instance as first argument"
      assert tf.compat.v1.train.checkpoint_exists(ckpt_file), "Checkpoint does not exist: {}".format(ckpt_file)
      ckpt_reader = tf.train.load_checkpoint(ckpt_file)

      # NOTE(review): the prefix is hard-coded to the default model name
      # ('transformer'); presumably it must match `name` passed to __init__ - confirm.
      bert_prefix = 'transformer/bert'

      weights = []
      for weight in bert.weights:
          stock_name = map_to_stock_variable_name(weight.name, bert_prefix)
          if ckpt_reader.has_tensor(stock_name):
              value = ckpt_reader.get_tensor(stock_name)
              weights.append(value)
          else:
              raise ValueError("No value for:[{}], i.e.:[{}] in:[{}]".format(weight.name, stock_name, ckpt_file))
      bert.set_weights(weights)
      print("Done loading {} BERT weights from: {} into {} (prefix:{})".format(
          len(weights), ckpt_file, bert, bert_prefix))

  def restore_encoder(self, bert_ckpt_file):
      """Load the pre-trained BERT checkpoint into this model's encoder."""
      # loading the original pre-trained weights into the BERT layer:
      self.load_stock_weights(self.encoder, bert_ckpt_file)

  def call(self, inp, tar, training, look_ahead_mask, dec_padding_mask):
      """Encode `inp`, decode `tar` against it and project to vocabulary logits.

      Returns:
          (final_output, attention_weights); final_output has shape
          (batch_size, tar_seq_len, target_vocab_size).
      """
      # The encoder runs in training mode only when it is trainable; the
      # `training` argument is forwarded to the decoder alone.
      enc_output = self.encoder(inp, training=self.encoder.trainable)  # (batch_size, inp_seq_len, d_model)

      # dec_output.shape == (batch_size, tar_seq_len, d_model)
      dec_output, attention_weights = self.decoder(
          tar, enc_output, training, look_ahead_mask, dec_padding_mask)

      final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)

      return final_output, attention_weights

In [None]:
# Decoder vocabulary: the Cherokee tokenizer vocabulary plus 2 extra ids
# (presumably the start/end tokens added during encoding - confirm against `encode`).
target_vocab_size = tokenizer_ch.vocab_size + 2
# NOTE(review): not passed to Transformer(...) in the cell that builds the model,
# so the model's default rate=0.1 is what gets used there.
dropout_rate = 0.15
config = Config(num_layers=6, d_model=512, dff=1024, num_heads=8)

In [None]:
# gs_folder_bert
# uncased_L-12_H-768_A-12
# Directory expected to contain the stock BERT config and checkpoint files.
MODEL_DIR = "uncased_L-12_H-768_A-12"
bert_config_file = os.path.join(MODEL_DIR, "bert_config.json")
bert_ckpt_file = os.path.join(MODEL_DIR, 'bert_model.ckpt')

# with tpu_strategy.scope():
transformer = Transformer(config=config,
                          target_vocab_size=target_vocab_size,
                          bert_config_file=bert_config_file)
  
# Run one forward pass on random inputs as a shape check.
# Presumably this also creates the layer weights that restore_encoder() fills below - confirm.
inp = tf.random.uniform((BATCH_SIZE, MAX_SEQ_LENGTH))
tar_inp = tf.random.uniform((BATCH_SIZE, MAX_SEQ_LENGTH))
print(tar_inp.shape) # (batch_size, tar_seq_len) 

fn_out, _ = transformer(inp, tar_inp, 
                        True,
                        look_ahead_mask=None,
                        dec_padding_mask=None)
print(fn_out.shape)  # (batch_size, tar_seq_len, target_vocab_size) 

# init bert pre-trained weights
transformer.restore_encoder(bert_ckpt_file)

In [None]:
# Model summary - (this is where we realised it would be impossible to train the model on Colab in a reasonable time)

transformer.summary()

In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Warm-up learning-rate schedule.

    lrate = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)

    The rate grows linearly for the first `warmup_steps` steps and then
    decays with the inverse square root of the step number.

    Args:
        d_model: model width used to scale the rate.
        warmup_steps: number of linear warm-up steps.
    """

    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        # Optimizers pass their integer iteration counter here; rsqrt and the
        # float multiplication below need a floating-point tensor.
        step = tf.cast(step, tf.float32)

        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [None]:
# Adam driven by the warm-up schedule defined above (default warmup_steps).
learning_rate = CustomSchedule(config.d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

In [None]:
# Plot the learning rate produced by the schedule over the first 40000 training steps

temp_learning_rate_schedule = CustomSchedule(config.d_model)
import matplotlib.pyplot as plt

plt.plot(temp_learning_rate_schedule(tf.range(40000, dtype=tf.float32)))
plt.ylabel("Learning Rate")
plt.xlabel("Train Step")

In [None]:
# Loss object used by loss_function below.
# reduction='none' keeps one loss value per token so padding can be masked out afterwards.

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

In [None]:
#Criação da função loss que vai ser usada depois durante o treino do modelo

def loss_function(real, pred):
    """Cross-entropy averaged over the non-padding target tokens.

    Args:
        real: target token ids, with 0 marking padding.
        pred: logits with a trailing vocabulary axis.

    Returns:
        Scalar loss: the sum of the per-token losses at non-padding
        positions divided by the number of such positions.
    """
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    # Normalise by the number of real tokens rather than by every position:
    # a plain mean would shrink the loss as the amount of padding grows.
    return tf.reduce_sum(loss_) / tf.reduce_sum(mask)

In [None]:
# Metrics tracked during training: running mean of the loss and token-level accuracy

train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name='train_accuracy')

In [None]:
# Training checkpoints (very useful given how long training can take: progress is not lost)

checkpoint_path = "/content/drive/My Drive/machine translation/checkpoints/train"

# The checkpoint tracks both the model and the optimizer state.
ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)

# Only the 5 most recent checkpoints are kept on disk.
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print('Latest checkpoint restored!!')

In [None]:
#Função para criar os mascaras para os tokens (tanto inputs como outputs)

def create_masks(inp, tar):
    """Build the two masks consumed by the decoder.

    Returns:
        (combined_mask, dec_padding_mask):
        combined_mask hides both future tokens and padding in the decoder's
        first (self-attention) block; dec_padding_mask hides the padded
        encoder positions in its second attention block.
    """
    # Padding of the source sequence, applied to the encoder outputs.
    dec_padding_mask = create_padding_mask(inp)

    # A target position is hidden if it lies in the future OR is padding.
    future_mask = create_look_ahead_mask(tf.shape(tar)[1])
    target_padding_mask = create_padding_mask(tar)

    return tf.maximum(target_padding_mask, future_mask), dec_padding_mask

In [None]:
#definição de aquilo que vai ser um step de treino, o que será a loss function utilizada assim como as métricas

@tf.function
def train_step(inp, tar):
    """Run one optimisation step on a batch and update the training metrics.

    The decoder is fed the target without its last token and is trained to
    predict the target shifted by one position (teacher forcing).
    """
    decoder_input, expected = tar[:, :-1], tar[:, 1:]

    combined_mask, dec_padding_mask = create_masks(inp, decoder_input)

    with tf.GradientTape() as tape:
        predictions, _ = transformer(inp, decoder_input,
                                     True,
                                     combined_mask,
                                     dec_padding_mask)
        loss = loss_function(expected, predictions)

    variables = transformer.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    train_loss(loss)
    # NOTE(review): accuracy is updated without a sample weight, so any
    # padded target positions are counted as well.
    train_accuracy(expected, predictions)

**IMPOSSÍVEL TREINAR**

Sendo um modelo com um número enormíssimo de parâmetros, é computacionalmente demasiado exigente de treinar.

**Infelizmente não foi possível treinar o modelo porém ficamos com uma visão de como funciona a famosa arquitetura BERT**

In [None]:
# Train for 11 epochs

import time

EPOCHS = 11

for epoch in range(EPOCHS):
    start = time.time()

    # Metrics are reset so that they report per-epoch values.
    train_loss.reset_states()
    train_accuracy.reset_states()

    # inp -> English, tar -> Cherokee
    for (batch, (inp, tar)) in enumerate(train_dataset):
        train_step(inp, tar)

        if batch % 500 == 0:
            print('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
                epoch + 1, batch, train_loss.result(), train_accuracy.result()))

    # `% 1` is always 0, so a checkpoint is saved after every epoch.
    if (epoch + 1) % 1 == 0:
        ckpt_save_path = ckpt_manager.save()
        print('Saving checkpoint for epoch {} at {}'.format(epoch + 1,
                                                            ckpt_save_path))

    print('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1,
                                                        train_loss.result(),
                                                        train_accuracy.result()))

    print('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))



---









## 2ª Abordagem - Transformer mais simples

**Não consegue traduzir praticamente nada**

Provavelmente o modelo não é indicado para este problema.
A baixa quantidade de dados contribui bastante para a tradução fraca ou quase nula.

In [None]:
#Library utilizada:  https://simpletransformers.ai/docs/seq2seq-model/
!pip install simpletransformers

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
# Relative input path (looks like the Kaggle dataset layout), unlike the Drive paths used in other cells.
df = pd.read_csv('../input/en-ch-translation/Ch_En_Train - complete.csv',delimiter=';')
# Enables the text-cleaning step in the preprocessing cell.
Clean = True

In [None]:
# Preprocessing applied to both columns.

import re
import string

if Clean:
    exclude = set(string.punctuation)
    digit = str.maketrans("", "", string.digits)

    def _strip_noise(sentence):
        """Remove apostrophes, then all punctuation, then digits from one sentence."""
        without_apostrophes = re.sub("'", "", sentence)
        without_punctuation = "".join(
            ch for ch in without_apostrophes if ch not in exclude
        )
        return without_punctuation.translate(digit)

    # Only English is lower-cased: lower() must not be applied to the Cherokee text
    # (the authors believe Cherokee has no upper/lower case distinction).
    df["English"] = df["English"].apply(lambda x: _strip_noise(str(x).lower()))
    df["Cherokee"] = df["Cherokee"].apply(_strip_noise)

In [None]:
df

In [None]:
# Build the (English, Cherokee) pairs and split them into train / validation / test frames

import random

# Columns are read by name so the pairs stay correct whatever the column
# order of the CSV is (and even if it carries extra columns).
text_pairs = [
    (english, "[start] " + cherokee + " [end]")
    for english, cherokee in zip(df["English"], df["Cherokee"])
]

# Seeded shuffle so the split is the same on every run; a private Random
# instance leaves the global random state untouched.
random.Random(42).shuffle(text_pairs)

# 70 % train / 15 % validation / 15 % test
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples:num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples:]

# Column names expected by simpletransformers' Seq2SeqModel.
pair_columns = ['input_text', 'target_text']
train_df = pd.DataFrame(train_pairs, columns=pair_columns)
val_df = pd.DataFrame(val_pairs, columns=pair_columns)
test_df = pd.DataFrame(test_pairs, columns=pair_columns)



In [None]:
train_df

In [None]:
#imports necessários

import logging
import pandas as pd
from sklearn.model_selection import train_test_split
from simpletransformers.seq2seq import Seq2SeqModel,Seq2SeqArgs
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [None]:
# Model hyper-parameters
# In our tests, changing these hyper-parameters made no noticeable difference to the model's performance

model_args = Seq2SeqArgs()
model_args.num_train_epochs = 30
model_args.no_save = True
model_args.evaluate_generated_text = False
model_args.evaluate_during_training = False
model_args.evaluate_during_training_verbose = True
# NOTE(review): the name suggests a RAG-specific setting; confirm it has any effect for a Marian model.
model_args.rag_embed_batch_size = 32
model_args.max_length = 120
model_args.src_lang ="en"
# NOTE(review): "ch" is used here as the Cherokee code - confirm it is the code the model/library expects.
model_args.tgt_lang ="ch"
model_args.overwrite_output_dir = True

In [None]:
from nltk.translate.bleu_score import sentence_bleu
!pip install sacrebleu
from sacrebleu.metrics import BLEU, CHRF, TER

In [None]:
bleu = BLEU()
chrf = CHRF()

In [None]:
# Create the model: pre-trained Marian checkpoint (English -> multiple languages) to be fine-tuned

model_Helsinki = Seq2SeqModel(
    encoder_decoder_type="marian",
    encoder_decoder_name="Helsinki-NLP/opus-mt-en-mul",
    args=model_args,
    #Set False or True if you are not using / using the GPU
    use_cuda=True,
)

In [None]:
#Criar função para contar o número de vezes que o valor previsto é igual ao valor real.

def count_matches(labels, preds):
    """Count the positions where the prediction equals the label.

    Both sequences are printed first; pairs are compared position by
    position and comparison stops at the end of the shorter sequence.
    """
    print(labels)
    print(preds)

    matches = 0
    for label, pred in zip(labels, preds):
        if label == pred:
            matches += 1
    return matches

In [None]:
# Train the model
# `matches=count_matches` registers count_matches as an extra evaluation metric.
# NOTE(review): evaluate_during_training is set to False in model_args, so presumably eval_data is not used here - confirm.

model_Helsinki.train_model(
    train_df, eval_data=val_df, matches=count_matches
)

In [None]:
val_df

In [None]:
n_samples = 50

# Evaluate on a random subset of the test split.
# NOTE(review): this overwrites test_df, so re-running the cell samples from the already reduced frame.
test_df = test_df.sample(n_samples)

In [None]:
src = 'INITIAL PHRASE (SOURCE): '
tgt = 'REAL TRANSLATION (TARGET): '
pred = 'AUTOMATIC TRANSLATION: '
bleu_str = 'BLEU SCORE: '
chrf_str = 'CHRF SCORE: '

In [None]:
# Predict on our test data - performance analysis

helsinki_Blue_medium = 0
helsinki_Chrf_medium = 0


for index, row in test_df.iterrows():
    source_text = row['input_text']
    reference_text = row['target_text']
    # predict() works on a list of sentences and returns a list of
    # translations: wrap the single sentence and take the single result.
    translation = model_Helsinki.predict([source_text])[0]
    # corpus_score takes a list of hypotheses and a list of reference
    # streams; `.score` is the numeric value (no string parsing needed).
    score_chrf = chrf.corpus_score([translation], [[reference_text]]).score
    helsinki_Chrf_medium += score_chrf
    print(f'{src}{source_text:30}\n{tgt}{reference_text:25}\n{pred}{translation}\n{chrf_str}{score_chrf}\n\n')
print(f"Média de CHRF SCORE: {(helsinki_Chrf_medium / n_samples)}")

In [None]:
# Export the results - if we want to
# Here it was done to confirm the complete failure of the model's predictions

# The predictions must be stored under the same column name that is
# selected for the CSV below; otherwise the selection raises KeyError.
test_df["Translated_text"] = model_Helsinki.predict(list(test_df["input_text"].values))
test_df[["input_text","Translated_text"]].to_csv("Results.csv",index=False)

**Resultados obtidos**

Vários fatores podem ter contribuido para não conseguirmos utilizar o dataset de en-ch nesta segunda tarefa:
- Inexistência de modelos pré-treinados para traduzir de inglês para cherokee
- Falha técnica não detetada
- Mau pré processamento ou alguma decisão menos boa
- Modelo demasiado simples para perceber as associações
- Quantidade muito baixa de dados para treinar

**Apesar de não termos conseguido criar um bom modelo para fazer esta tradução, trata-se de um problema muito complicado. Uma pesquisa rápida na literatura mostra-nos que traduzir entre estas duas línguas é uma tarefa complexa, que exige um conhecimento mais consolidado na área e um maior trabalho de investigação.**

## Tradução Inglês - Português | Português - Inglês

Vão ser utilizados quatro modelos diferentes e comparados entre eles de forma a percebermos o melhor para este problema. Modelos utilizados:
- mbart - large EN-PT
- mbart - large PT-EN
- T5 - EN-PT
- T5 - PT-EN

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
# The file is read without a header row (header=None) so the first sentence
# pair is kept as data instead of being consumed as column names; the columns
# are named in the next cell. Only the first two columns are loaded, so an
# extra trailing column (e.g. attribution text) cannot break that renaming.
# NOTE(review): assumes por.txt has no header line - confirm against the file.
train_dataset = pd.read_csv('/content/drive/MyDrive/por.txt', delimiter='\t',
                            header=None, usecols=[0, 1])

In [None]:
train_dataset.columns = ['En','Pt']

In [None]:
#library used: https://huggingface.co/docs/transformers/index
!pip install transformers

In [None]:
#Criação do transformer mbart - large en-pt https://huggingface.co/Narrativa/mbart-large-50-finetuned-opus-en-pt-translation

from transformers import MBart50TokenizerFast, MBartForConditionalGeneration

ckpt = 'Narrativa/mbart-large-50-finetuned-opus-en-pt-translation'

tokenizer = MBart50TokenizerFast.from_pretrained(ckpt)
model = MBartForConditionalGeneration.from_pretrained(ckpt)

tokenizer.src_lang = 'en_XX'

def translate(text):
    """Translate `text` to Portuguese with the currently loaded mBART model.

    Uses the module-level `tokenizer` and `model`; generation is forced to
    start with the 'pt_XX' language token.
    """
    encoded = tokenizer(text, return_tensors='pt')
    target_lang_id = tokenizer.lang_code_to_id['pt_XX']
    generated = model.generate(
        encoded.input_ids,
        attention_mask=encoded.attention_mask,
        forced_bos_token_id=target_lang_id,
    )
    return tokenizer.decode(generated[0], skip_special_tokens=True)

In [None]:
#Import das métricas

from nltk.translate.bleu_score import sentence_bleu
!pip install sacrebleu
from sacrebleu.metrics import BLEU, CHRF, TER

In [None]:
bleu = BLEU()
chrf = CHRF()

In [None]:
#Samples utilizadas para testar e avaliar os modelos
n_samples = 50

In [None]:
# Build the test set: a random sample of n_samples rows (n_samples is defined in the previous cell)

test_df = train_dataset.sample(n_samples)

# Labels used when printing each evaluated example
src = 'INITIAL PHRASE (SOURCE): '
tgt = 'REAL TRANSLATION (TARGET): '
pred = 'AUTOMATIC TRANSLATION: '
bleu_str = 'BLEU SCORE: '
chrf_str = 'CHRF SCORE: '

In [None]:
# Model evaluation: mBART En -> Pt

# Running sums; they are divided by n_samples when reported.
mbartenpt_medium_Bleu_Score = 0
mbartenpt_medium_Chrf_score = 0

for index, row in test_df.iterrows():
    source_text = row['En']
    reference_text = row['Pt']
    translation = translate(source_text)
    # NLTK's sentence_bleu expects token lists (a list of tokenised
    # references and a tokenised hypothesis); raw strings would be
    # iterated character by character, giving a character-level score.
    score_bleu = sentence_bleu([reference_text.split()], translation.split())
    # `.score` is the numeric chrF value (no string parsing needed).
    score_chrf = chrf.corpus_score([translation], [[reference_text]]).score
    mbartenpt_medium_Bleu_Score += score_bleu
    mbartenpt_medium_Chrf_score += score_chrf
    print(f'{src}{source_text:30}\n{tgt}{reference_text:25}\n{pred}{translation}\n{bleu_str}{score_bleu}\n{chrf_str}{score_chrf}\n\n')

print(f"Média de BLUE SCORE: {(mbartenpt_medium_Bleu_Score / n_samples)}")
print(f"Média de CHRF SCORE: {(mbartenpt_medium_Chrf_score / n_samples)}")

In [None]:
#Criação do modelo mbart-large pt-en https://huggingface.co/Narrativa/mbart-large-50-finetuned-opus-pt-en-translation

from transformers import MBart50TokenizerFast, MBartForConditionalGeneration

ckpt = 'Narrativa/mbart-large-50-finetuned-opus-pt-en-translation'

tokenizer = MBart50TokenizerFast.from_pretrained(ckpt)
model = MBartForConditionalGeneration.from_pretrained(ckpt)

tokenizer.src_lang = 'pt_XX'

def translate(text):
    """Translate `text` to English with the currently loaded mBART model.

    Uses the module-level `tokenizer` and `model`; generation is forced to
    start with the 'en_XX' language token.
    """
    encoded = tokenizer(text, return_tensors='pt')
    target_lang_id = tokenizer.lang_code_to_id['en_XX']
    generated = model.generate(
        encoded.input_ids,
        attention_mask=encoded.attention_mask,
        forced_bos_token_id=target_lang_id,
    )
    return tokenizer.decode(generated[0], skip_special_tokens=True)

In [None]:
# Model evaluation: mBART Pt -> En

# Running sums; they are divided by n_samples when reported.
mbartpten_medium_Bleu_Score = 0
mbartpten_medium_Chrf_score = 0

for index, row in test_df.iterrows():
    source_text = row['Pt']
    reference_text = row['En']
    translation = translate(source_text)
    # NLTK's sentence_bleu expects token lists (a list of tokenised
    # references and a tokenised hypothesis); raw strings would be
    # iterated character by character, giving a character-level score.
    score_bleu = sentence_bleu([reference_text.split()], translation.split())
    # `.score` is the numeric chrF value (no string parsing needed).
    score_chrf = chrf.corpus_score([translation], [[reference_text]]).score
    mbartpten_medium_Bleu_Score += score_bleu
    mbartpten_medium_Chrf_score += score_chrf
    print(f'{src}{source_text:30}\n{tgt}{reference_text:25}\n{pred}{translation}\n{bleu_str}{score_bleu}\n{chrf_str}{score_chrf}\n\n')

print(f"Média de BLUE SCORE: {(mbartpten_medium_Bleu_Score / n_samples)}")
print(f"Média de CHRF SCORE: {(mbartpten_medium_Chrf_score / n_samples)}")

In [None]:
#Criar o modelo T5 En-Pt https://huggingface.co/unicamp-dl/translation-en-pt-t5

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
  
tokenizer = AutoTokenizer.from_pretrained("unicamp-dl/translation-en-pt-t5")

model = AutoModelForSeq2SeqLM.from_pretrained("unicamp-dl/translation-en-pt-t5")

enpt_pipeline = pipeline('text2text-generation', model=model, tokenizer=tokenizer)

In [None]:
# Model evaluation: T5 En -> Pt

# Running sums; they are divided by n_samples when reported.
t5_medium_Bleu_Score = 0
t5_medium_Chrf_score = 0

for index, row in test_df.iterrows():
    source_text = row['En']
    reference_text = row['Pt']
    translation = enpt_pipeline(source_text)[0]['generated_text']
    # NLTK's sentence_bleu expects token lists (a list of tokenised
    # references and a tokenised hypothesis); raw strings would be
    # iterated character by character, giving a character-level score.
    score_bleu = sentence_bleu([reference_text.split()], translation.split())
    # `.score` is the numeric chrF value (no string parsing needed).
    score_chrf = chrf.corpus_score([translation], [[reference_text]]).score
    t5_medium_Bleu_Score += score_bleu
    t5_medium_Chrf_score += score_chrf
    print(f'{src}{source_text:30}\n{tgt}{reference_text:25}\n{pred}{translation}\n{bleu_str}{score_bleu}\n{chrf_str}{score_chrf}\n\n')

print(f"Média de BLUE SCORE: {(t5_medium_Bleu_Score / n_samples)}")
print(f"Média de CHRF SCORE: {(t5_medium_Chrf_score / n_samples)}")

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
  
tokenizer = AutoTokenizer.from_pretrained("unicamp-dl/translation-pt-en-t5")

model = AutoModelForSeq2SeqLM.from_pretrained("unicamp-dl/translation-pt-en-t5")

pten_pipeline = pipeline('text2text-generation', model=model, tokenizer=tokenizer)

In [None]:
# Model evaluation: T5 Pt -> En

# Running sums; they are divided by n_samples when reported.
t5_pt_en_medium_Bleu_Score = 0
t5_pt_en_medium_Chrf_score = 0

for index, row in test_df.iterrows():
    source_text = row['Pt']
    reference_text = row['En']
    translation = pten_pipeline(source_text)[0]['generated_text']
    # NLTK's sentence_bleu expects token lists (a list of tokenised
    # references and a tokenised hypothesis); raw strings would be
    # iterated character by character, giving a character-level score.
    score_bleu = sentence_bleu([reference_text.split()], translation.split())
    # `.score` is the numeric chrF value (no string parsing needed).
    score_chrf = chrf.corpus_score([translation], [[reference_text]]).score
    t5_pt_en_medium_Bleu_Score += score_bleu
    t5_pt_en_medium_Chrf_score += score_chrf
    print(f'{src}{source_text:30}\n{tgt}{reference_text:25}\n{pred}{translation}\n{bleu_str}{score_bleu}\n{chrf_str}{score_chrf}\n\n')

print(f"Média de BLUE SCORE: {(t5_pt_en_medium_Bleu_Score / n_samples)}")
print(f"Média de CHRF SCORE: {(t5_pt_en_medium_Chrf_score / n_samples)}")

In [None]:
# Turn the accumulated sums into per-sample averages for the charts.
# NOTE(review): each name is overwritten in place, so running this cell a
# second time divides the values by n_samples again.
mbartenpt_medium_Bleu_Score /= n_samples
mbartpten_medium_Bleu_Score /= n_samples
t5_medium_Bleu_Score /= n_samples
t5_pt_en_medium_Bleu_Score /= n_samples

mbartenpt_medium_Chrf_score /= n_samples
mbartpten_medium_Chrf_score /= n_samples
t5_medium_Chrf_score /= n_samples
t5_pt_en_medium_Chrf_score /= n_samples

In [None]:
import matplotlib.pyplot as plt

# Bar chart comparing the mean BLEU score of the four models
plt.bar(
    x=['mbart En-Pt', 'mbart Pt-En', 't5 En-Pt','t5 Pt-En'],
    height=[
        mbartenpt_medium_Bleu_Score,
        mbartpten_medium_Bleu_Score,
        t5_medium_Bleu_Score,
        t5_pt_en_medium_Bleu_Score,
    ],
    color ='maroon',
    width = 0.4,
)

In [None]:
import matplotlib.pyplot as plt

# Bar chart comparing the mean chrF score of the four models
plt.bar(
    x=['mbart En-Pt', 'mbart Pt-En', 't5 En-Pt', 't5 Pt-En'],
    height=[
        mbartenpt_medium_Chrf_score,
        mbartpten_medium_Chrf_score,
        t5_medium_Chrf_score,
        t5_pt_en_medium_Chrf_score,
    ],
    width = 0.4,
)