Machine Translation - English to German

In [47]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
except Exception:
    pass
# TensorFlow ≥2.0 is required
import tensorflow as tf
assert tf.__version__ >= "2.0"
print(tf.__version__)

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)


# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")


import random

import tensorflow as tf
import string
import re 
from keras import layers
import io

2.9.1


In [48]:
#input_text_file_path = "./preprocessed_dataset_for_dev.txt"
input_text_file_path = "./preprocessed_dataset_for_train.txt"

In [49]:
with open(input_text_file_path, encoding='utf-8') as f:
    lines = f.read().split("\n")[:-1]

text_pairs = []

for line in lines:
    english, port = line.split("\t")
    port = "[start] " + port + " [end]"
    text_pairs.append((english, port))

print(text_pairs[-1])

('the bookmarked feed shows the posts that you have bookmarked. the bookmarked feed offers easy access to useful posts.', '[start] im feed mit lesezeichen werden alle post angezeigt, die sie mit einem lesezeichen versehen haben. der feed "mit lesezeichen" bietet ihnen schnellen zugriff auf nützliche posts. [end]')


In [50]:
random.shuffle(text_pairs) #1. mistura todos os pairs

print("len(text_pairs) ->", len(text_pairs))

num_val_samples = int(0.15 * len(text_pairs))
print("15% for validation ->", num_val_samples)

num_train_samples = len(text_pairs) - 2 * num_val_samples
print("70% for train ->", num_train_samples)

train_pairs = text_pairs[:num_train_samples] #escolhe os primeiros 70% (shuffled) para treino
val_pairs = text_pairs[num_train_samples:num_train_samples + num_val_samples] #mais 15% para validação
test_pairs = text_pairs[num_train_samples + num_val_samples:] #mais 15 para teste

len(text_pairs) -> 100611
15% for validation -> 15091
70% for train -> 70429


In [51]:
print("treino: ",    len(train_pairs))
print("validação: ", len(val_pairs  ))
print("teste: ",     len(test_pairs ))

treino:  70429
validação:  15091
teste:  15091


In [52]:
strip_chars = string.punctuation # !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~

strip_chars = strip_chars.replace("[", "") # para não perder o [start] e [end]
strip_chars = strip_chars.replace("]", "") #

def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(
        lowercase, f"[{re.escape(strip_chars)}]", "")

vocab_size = 22000    # O modelo apneas vai conhecer 15000 palavras
sequence_length = 25  # cada frase vai ter 20 palavrasg


source_vectorization = layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length
)

target_vectorization = layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length + 1,
    standardize=custom_standardization,
)

print(train_pairs[0])
train_english_texts = [pair[0] for pair in train_pairs] 
print(train_english_texts[0])

train_pt_texts = [pair[1] for pair in train_pairs] 
print(train_english_texts[1])

source_vectorization.adapt(train_english_texts)
#source_vectorization.adapt([pair[0] for pair in text_pairs] )
target_vectorization.adapt(train_pt_texts)

('12345678912345', '[start] 12345678912345 [end]')
12345678912345
how often are all my accounts, contacts, leads, and db companies updated with #url1# clean jobs?


In [53]:
source_vectorization.get_vocabulary()[0:10]

['', '[UNK]', 'the', 'to', 'a', 'and', 'in', 'you', 'for', 'your']

In [54]:
target_vectorization.get_vocabulary()[0:10]

['', '[UNK]', '[start]', '[end]', 'sie', 'die', 'der', 'und', 'in', 'für']

In [55]:
glove50_file_path = "./glove.6B.50d.embedding"

embeddings_index = {}

with open(glove50_file_path, encoding="utf-8") as gloveFile:
    for line in gloveFile:
        word, coefsAsString = line.split(maxsplit=1)
        coefs = np.fromstring(coefsAsString, "f", sep=" ")
        embeddings_index[word] = coefs

print(f"EN embeddings has {len(embeddings_index)} word vectors")


EN embeddings has 400000 word vectors


In [56]:
en_embeddings_dim = 50

en_vocabulary = source_vectorization.get_vocabulary() 

word_index = dict(zip(en_vocabulary, range(len(en_vocabulary)))) # setting an "id" to head word 

embedding_matrix = np.zeros((vocab_size, en_embeddings_dim)) 
print("embedding_matrix size:", embedding_matrix.shape)

for word, i in word_index.items():
    if(i < vocab_size):
        embedding_vector = embeddings_index.get(word)

    if(embedding_vector is not None):
        embedding_matrix[i] = embedding_vector

print(embedding_matrix[3:5])

en_embedding_layer = layers.Embedding(vocab_size, en_embeddings_dim, embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix))

embedding_matrix size: (22000, 50)
[[ 0.68046999 -0.039263    0.30186    -0.17792     0.42962     0.032246
  -0.41376001  0.13228001 -0.29846999 -0.085253    0.17117999  0.22419
  -0.10046    -0.43652999  0.33418     0.67846     0.057204   -0.34448001
  -0.42785001 -0.43274999  0.55962998  0.10032     0.18677001 -0.26853999
   0.037334   -2.09319997  0.22171    -0.39868     0.20912001 -0.55725002
   3.88260007  0.47466001 -0.95657998 -0.37788001  0.20869    -0.32752001
   0.12751     0.088359    0.16350999 -0.21634001 -0.094375    0.018324
   0.21048    -0.03088    -0.19722     0.082279   -0.09434    -0.073297
  -0.064699   -0.26043999]
 [ 0.21705     0.46515    -0.46757001  0.10082     1.01349998  0.74844998
  -0.53104001 -0.26256001  0.16812     0.13181999 -0.24909    -0.44185001
  -0.21739     0.51003999  0.13448    -0.43141001 -0.03123     0.20674001
  -0.78138    -0.20148    -0.097401    0.16088    -0.61835998 -0.18504
  -0.12461    -2.25259995 -0.22321001  0.5043      0.32257    

In [57]:
batch_size = 32

def format_dataset(eng, pt):
    eng = source_vectorization(eng)
    pt = target_vectorization(pt)
    return ({
        "english": eng,
        "portuguese": pt[:, :-1],
    }, pt[:, 1:])

def make_dataset(pairs):
    eng_texts, pt_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    pt_texts = list(pt_texts)
    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, pt_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset, num_parallel_calls=4)
    return dataset.shuffle(2048).prefetch(16).cache()

train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

In [58]:
for inputs, targets in train_ds.take(1):
    print(f"inputs['english'].shape: {inputs['english'].shape}")
    print(f"inputs['portuguese'].shape: {inputs['portuguese'].shape}")
    print(f"targets.shape: {targets.shape}")
    print(inputs['english'])


inputs['english'].shape: (32, 25)
inputs['portuguese'].shape: (32, 25)
targets.shape: (32, 25)
tf.Tensor(
[[   17   158  1354     4    61    52   279    41    42     3     2    15
     46   128     7    33   313     0     0     0     0     0     0     0
      0]
 [   28   677    62   256   235   126  1995    12   647  1056   312     4
    417   108   772   394     8     2   111    56   515     2   164    20
   1471]
 [   40 14818     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0]
 [   52    88   202     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0]
 [   33   186     6   121  1645     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0]
 [   61   254   301   502   237    16   111    46     5   425     0     0
      0     0     0     0     0    

In [59]:
# Criação da classe que modela o Encoder 

# Na criação do objeto recebe 
# embed_dim: Dimensão da sequência de input 
# dense_dim: Número de nós da camada Dense
# num_heads: Número de attention heads

class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):

        super().__init__(**kwargs)
        
        self.embed_dim = embed_dim
        
        self.dense_dim = dense_dim
        
        self.num_heads = num_heads
        
        self.attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)

        self.dense_proj = keras.Sequential([layers.Dense(dense_dim, activation="relu"), layers.Dense(embed_dim),])
        
        self.layernorm_1 = layers.LayerNormalization()
        
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        
        if mask is not None:
            mask = mask[:, tf.newaxis, :]
        
        attention_output = self.attention(inputs, inputs, attention_mask=mask)

        proj_input = self.layernorm_1(inputs + attention_output)
        
        proj_output = self.dense_proj(proj_input)
        
        return self.layernorm_2(proj_input + proj_output)

    def get_config(self):
        config = super().get_config()
        
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        })
        
        return config

In [60]:
# Criação da classe que modela o Decoder 

# Na criação do objeto recebe 
# embed_dim: Dimensão da sequência de input 
# dense_dim: Número de nós da camada Dense
# num_heads: Número de attention heads

class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        
        super().__init__(**kwargs)
        
        self.embed_dim = embed_dim
        
        self.dense_dim = dense_dim
        
        self.num_heads = num_heads
        
        self.attention_1 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        
        self.attention_2 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)

        self.dense_proj = keras.Sequential([layers.Dense(dense_dim, activation="relu"), layers.Dense(embed_dim),])
        
        self.layernorm_1 = layers.LayerNormalization()
        
        self.layernorm_2 = layers.LayerNormalization()
        
        self.layernorm_3 = layers.LayerNormalization()
        
        self.supports_masking = True

    def get_config(self):
        config = super().get_config()
        
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        })
        
        return config

    def get_causal_attention_mask(self, inputs):
        
        input_shape = tf.shape(inputs)
        
        batch_size, sequence_length = input_shape[0], input_shape[1]
        
        i = tf.range(sequence_length)[:, tf.newaxis]
        
        j = tf.range(sequence_length)
        
        mask = tf.cast(i >= j, dtype="int32")
        
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        
        mult = tf.concat([tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], axis=0)
        
        return tf.tile(mask, mult)

    def call(self, inputs, encoder_outputs, mask=None):
        
        causal_mask = self.get_causal_attention_mask(inputs)
        
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)
        
        attention_output_1 = self.attention_1(
            query=inputs,
            value=inputs,
            key=inputs,
            attention_mask=causal_mask)
        
        attention_output_1 = self.layernorm_1(inputs + attention_output_1)
        
        attention_output_2 = self.attention_2(
            query=attention_output_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask)
        
        attention_output_2 = self.layernorm_2(attention_output_1 + attention_output_2)

        proj_output = self.dense_proj(attention_output_2)
        
        return self.layernorm_3(attention_output_2 + proj_output)

In [61]:
class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, input_dim, output_dim, wordEmbedding=None, **kwargs):
        super().__init__(**kwargs)

        if( wordEmbedding is not None ):
            self.token_embeddings = wordEmbedding
        else:
            self.token_embeddings = layers.Embedding(input_dim=input_dim, output_dim=output_dim)
        
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim)
        
        self.sequence_length = sequence_length
        
        self.input_dim = input_dim
        
        self.output_dim = output_dim

    def call(self, inputs):
        
        length = tf.shape(inputs)[-1]
        
        positions = tf.range(start=0, limit=length, delta=1)
        
        embedded_tokens = self.token_embeddings(inputs)
        
        embedded_positions = self.position_embeddings(positions)
        
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)

    def get_config(self):
        config = super(PositionalEmbedding, self).get_config()

        config.update({
            "output_dim": self.output_dim,
            "sequence_length": self.sequence_length,
            "input_dim": self.input_dim,
        })

        return config

In [62]:
# The complete Transformer

import keras

# Settings 

#embed_dim = 256
embed_dim = 50
dense_dim = 2048
num_heads = 8

encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="english")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim, en_embedding_layer)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)

decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="portuguese")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, dense_dim, num_heads)(x, encoder_outputs)
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)

transformer = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [63]:
keras.backend.clear_session()
best_model_file_path = "best_translator.tfmodel"
callbacks_list = [
    tf.keras.callbacks.EarlyStopping(
        monitor='val_accuracy', 
        patience = 10, 
        min_delta = 0.005, 
        restore_best_weights = True,
        mode='auto'
    ),
    keras.callbacks.ModelCheckpoint(
        filepath = best_model_file_path,
        monitor = "val_accuracy",
        verbose=0,
        save_best_only = True,
        save_weights_only = False,
        mode='max'
    )
]

In [64]:
transformer.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"])

In [65]:
history = transformer.fit(
    train_ds,
    epochs=25,
    validation_data=val_ds,
    callbacks=callbacks_list)

Epoch 1/25



INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


Epoch 2/25



INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


Epoch 3/25



INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


Epoch 4/25



INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


Epoch 5/25



INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


Epoch 6/25



INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


Epoch 7/25



INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


Epoch 8/25



INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


Epoch 9/25



INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


Epoch 10/25



INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


Epoch 11/25



INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


Epoch 12/25



INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


Epoch 13/25



INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


Epoch 14/25



INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


Epoch 15/25



INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


Epoch 16/25
Epoch 17/25



INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


Epoch 18/25



INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


Epoch 19/25



INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


Epoch 20/25
Epoch 21/25



INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25



INFO:tensorflow:Assets written to: best_translator.tfmodel\assets


INFO:tensorflow:Assets written to: best_translator.tfmodel\assets




In [69]:
# Testar o desempenho do Transformer em frases do conjunto de teste


pt_vocab = target_vectorization.get_vocabulary()
pt_index_lookup = dict(zip(range(len(pt_vocab)), pt_vocab))
max_decoded_sentence_length = 25

def decode_sequence(input_sentence):
   
    tokenized_input_sentence = source_vectorization([input_sentence])
   
    decoded_sentence = "[start]"
   
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = target_vectorization([decoded_sentence])[:, :-1]
        
        predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])

        sampled_token_index = np.argmax(predictions[0, i, :])
        
        sampled_token = pt_index_lookup[sampled_token_index]
        
        decoded_sentence += " " + sampled_token
        
        if sampled_token == "[end]":
            break
    
    return decoded_sentence

test_eng_texts = [pair[0] for pair in test_pairs]

for _ in range(20):
    input_sentence = random.choice(test_eng_texts)
    print("-")
    print(input_sentence)
    print(decode_sequence(input_sentence))

-
article versions
[start] [UNK] [end]
-
from the filters tab, under cross filters, find the cross filter you want to edit.
[start] wechseln sie auf der registerkarte [UNK] [UNK] [end]
-
for your idp issuer / entity id, enter your salesforce identity provider issuer, for example, #url1#.
[start] geben sie für ihre [UNK] ein geben sie ihre [UNK] für den [UNK] ein beispielsweise url1 [end]
-
step 2: restrict oauth connected app access (whitelist apps)
[start] datum 1 [UNK] für verbundene anwendung zugriff auf anwendungen [end]
-
edit from streams home
[start] bearbeiten auf der seite [end]
-
to add compose gmail buttons in activity history on leads and contacts, select gmail buttons.
[start] hinzufügen von [UNK] zu [UNK] in leads und benutzerdefinierten [UNK] [end]
-
contact support button
[start] [UNK] [end]
-
based on these matching criteria, here's how matching works.
[start] im folgenden finden sie die folgenden [UNK] beispielsweise [UNK] [end]
-
conflict behavior—salesforce always w

In [67]:
#for _ in range(5):
#    input_sentence = input()
#    print("-")
#    print(input_sentence)
#    print(decode_sequence(input_sentence), '\n')