# Translation of OPUS medical text from German to English
https://www.tensorflow.org/datasets/catalog/opus

In [1]:
!pip install keras_nlp -q

In [2]:
import random
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset
import keras_nlp
tfds.disable_progress_bar()
print(tf.__version__)
print(tfds.__version__)
print(keras_nlp.__version__)
seed = 54
random.seed(54)
tf.random.set_seed(seed)

2.10.0
4.7.0
0.3.1


In [3]:
dataset = tfds.load('opus', as_supervised=True)

In [4]:
text_pairs = []
for de, en in dataset['train']:
    german = de.numpy().decode().lower().strip('\n')
    english = en.numpy().decode().lower().strip('\n')
    text_pairs.append((german, english))

In [5]:
for _ in range(5):
    print(random.choice(text_pairs))
    print('')

('es werden möglicherweise nicht alle packungsgrößen in den verkehr gebracht.', 'not all pack sizes may be marketed.')

('benutzen sie die durchstechflaschen nicht, wenn die verschlusskappen locker sind oder fehlen.', 'if the caps are loose or missing, do not use the vials.')

('3.', '63 3.')

('sie dürfen das arzneimittel nach dem auf dem etikett und dem umkarton angegebenen verfalldatum nicht mehr anwenden.', 'do not use mixtard after the expiry date which is stated on the label and the carton.')

('32,4 (78/241) 51,3 (40/78) 36,7; 65,9', '32.4 (78/241)')



**Looks like we have some questionable data. In the future I may remove any pairs that don't have any letters for the alphabet in them.**

In [6]:
random.shuffle(text_pairs)
num_val_samples = int(0.05 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples : num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples :]

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

1108752 total pairs
997878 training pairs
55437 validation pairs
55437 test pairs


# Tokenize the data

**Note: In this notebook, I use the prefix 'de' for German and 'en' for English**

In [7]:
def train_word_piece(text_samples, vocab_size, reserved_tokens):
    bert_vocab_args = dict(vocab_size=vocab_size, reserved_tokens=reserved_tokens, 
                           bert_tokenizer_params={"lower_case": True},)

    word_piece_ds = tf.data.Dataset.from_tensor_slices(text_samples)
    vocab = bert_vocab_from_dataset.bert_vocab_from_dataset(word_piece_ds.batch(1000).prefetch(tf.data.AUTOTUNE), 
                                                            **bert_vocab_args)
    return vocab

In [8]:
VOCAB_SIZE = 15000
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]

de_samples = [text_pair[0] for text_pair in train_pairs]
de_vocab = train_word_piece(de_samples, VOCAB_SIZE, reserved_tokens)

en_samples = [text_pair[1] for text_pair in train_pairs]
en_vocab = train_word_piece(en_samples, VOCAB_SIZE, reserved_tokens)

In [9]:
de_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(vocabulary=de_vocab, lowercase=False)
en_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(vocabulary=en_vocab, lowercase=False)

# Format the datasets

In [10]:
MAX_SEQUENCE_LENGTH = 64
def preprocess_batch(de, en):
    batch_size = tf.shape(en)[0]

    de = de_tokenizer(de)
    en = en_tokenizer(en)

    # Pad `de` to `MAX_SEQUENCE_LENGTH`.
    de_start_end_packer = keras_nlp.layers.StartEndPacker(
        sequence_length=MAX_SEQUENCE_LENGTH,
        pad_value=de_tokenizer.token_to_id("[PAD]"),
    )
    de = de_start_end_packer(de)

    # Add special tokens (`"[START]"` and `"[END]"`) to `en` and pad it as well.
    en_start_end_packer = keras_nlp.layers.StartEndPacker(
        sequence_length=MAX_SEQUENCE_LENGTH + 1,
        start_value=en_tokenizer.token_to_id("[START]"),
        end_value=en_tokenizer.token_to_id("[END]"),
        pad_value=en_tokenizer.token_to_id("[PAD]"),
    )
    en = en_start_end_packer(en)

    return ({"encoder_inputs": de, "decoder_inputs": en[:, :-1],}, en[:, 1:])


def make_dataset(pairs, shuffle=False):
    de_texts, en_texts = zip(*pairs)
    de_texts = list(de_texts)
    en_texts = list(en_texts)
    dataset = tf.data.Dataset.from_tensor_slices((de_texts, en_texts))
    dataset = dataset.batch(64)
    dataset = dataset.map(preprocess_batch, num_parallel_calls=tf.data.AUTOTUNE)
    if shuffle:
        dataset = dataset.shuffle(2048)
    return dataset.prefetch(tf.data.AUTOTUNE)


train_ds = make_dataset(train_pairs, shuffle=True)
val_ds = make_dataset(val_pairs)

# The Model

In [11]:
EMBED_DIM = 256
INTERMEDIATE_DIM = 2048
NUM_HEADS = 8

# Encoder
encoder_inputs = tf.keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")

x = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=VOCAB_SIZE,
    sequence_length=MAX_SEQUENCE_LENGTH,
    embedding_dim=EMBED_DIM,
    mask_zero=True,
    )(encoder_inputs)

encoder_outputs = keras_nlp.layers.TransformerEncoder(
    intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS
    )(inputs=x)

encoder = tf.keras.Model(encoder_inputs, encoder_outputs)


# Decoder
decoder_inputs = tf.keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
encoded_seq_inputs = tf.keras.Input(shape=(None, EMBED_DIM), name="decoder_state_inputs")

x = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=VOCAB_SIZE,
    sequence_length=MAX_SEQUENCE_LENGTH,
    embedding_dim=EMBED_DIM,
    mask_zero=True,
    )(decoder_inputs)

x = keras_nlp.layers.TransformerDecoder(
    intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS
    )(decoder_sequence=x, encoder_sequence=encoded_seq_inputs)

x = tf.keras.layers.Dropout(0.5)(x)
decoder_outputs = tf.keras.layers.Dense(VOCAB_SIZE, activation="softmax")(x)

decoder = tf.keras.Model([decoder_inputs, encoded_seq_inputs,], decoder_outputs)

decoder_outputs = decoder([decoder_inputs, encoder_outputs])

transformer = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs, name="transformer")



# Training

In [12]:
transformer.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
                    loss="sparse_categorical_crossentropy",
                    metrics=['acc'])
es = tf.keras.callbacks.EarlyStopping(patience=9, verbose=1, restore_best_weights=True)
history = transformer.fit(train_ds, epochs=10, validation_data=val_ds, verbose=1, callbacks=[es])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Qualitative Evaluation

In [13]:
def decode_sequences(input_sentences):
    batch_size = tf.shape(input_sentences)[0]

    # Tokenize the encoder input.
    encoder_input_tokens = de_tokenizer(input_sentences).to_tensor(
        shape=(None, MAX_SEQUENCE_LENGTH)
    )

    # Define a function that outputs the next token's probability given the
    # input sequence.
    def token_probability_fn(decoder_input_tokens):
        return transformer([encoder_input_tokens, decoder_input_tokens])[:, -1, :]

    # Set the prompt to the "[START]" token.
    prompt = tf.fill((batch_size, 1), en_tokenizer.token_to_id("[START]"))

    generated_tokens = keras_nlp.utils.greedy_search(
        token_probability_fn,
        prompt,
        max_length=40,
        end_token_id=en_tokenizer.token_to_id("[END]"),
    )
    generated_sentences = en_tokenizer.detokenize(generated_tokens)
    return generated_sentences


test_de_texts = [pair[0] for pair in test_pairs]
for i in range(5):
    input_sentence = random.choice(test_de_texts)
    translated = decode_sequences(tf.constant([input_sentence]))
    translated = translated.numpy()[0].decode("utf-8")
    translated = (
        translated.replace("[PAD]", "")
        .replace("[START]", "")
        .replace("[END]", "")
        .strip()
    )
    print(f"** Example {i} **")
    print()
    print(input_sentence)
    print()
    print(translated)
    print()

** Example 0 **

90/158 (57%)

90 / 158 ( 57 % )

** Example 1 **

η έκδοση γνώμης από την chmp απαιτεί κατά κανόνα έως και 90 ημέρες, μετά την παραλαβή μιας αίτησης για τροποποίηση της άδειας κυκλοφορίας.

the chmp was also considered the matter to be in addition to the chmp .

** Example 2 **

roche austria gmbh tel: +43 (0) 1 27739

osterreich roche austria gmbh tel : + 43 ( 0 ) 1 27739

** Example 3 **

ziehen sie den kolben der spritze langsam zurück, bis das wasser die 1,1-ml-markierung erreicht.

slowly pull the plunger to the water to the 1 . 1 ml mark .

** Example 4 **

1,9%; placebo:

1 . 9 % ; placebo :



# Quantitative Evaluation

ROUGE-N is a score based on the number of matching n-grams between the reference text and the hypothesis text. ROUGE-1, ROUGE-2, and ROUGE-3 use the number of common unigrams, bigrams, and trigrams, respectively.

In [14]:
!pip install rouge_score -q

In [16]:
%%time
rouge_1 = keras_nlp.metrics.RougeN(order=1)
rouge_2 = keras_nlp.metrics.RougeN(order=2)
rouge_3 = keras_nlp.metrics.RougeN(order=3)

for test_pair in test_pairs[:30]:  ## just evaluating the first 30 because this is a very slow calculation
    input_sentence = test_pair[0]
    reference_sentence = test_pair[1]

    translated_sentence = decode_sequences(tf.constant([input_sentence]))
    translated_sentence = translated_sentence.numpy()[0].decode("utf-8")
    translated_sentence = (
        translated_sentence.replace("[PAD]", "")
        .replace("[START]", "")
        .replace("[END]", "")
        .strip()
    )

    rouge_1(reference_sentence, translated_sentence)
    rouge_2(reference_sentence, translated_sentence)
    rouge_3(reference_sentence, translated_sentence)

print("ROUGE-1 Score: ", rouge_1.result()['f1_score'].numpy())
print("ROUGE-2 Score: ", rouge_2.result()['f1_score'].numpy())
print("ROUGE-3 Score: ", rouge_3.result()['f1_score'].numpy())


ROUGE-1 Score:  0.86314577
ROUGE-2 Score:  0.62874866
ROUGE-3 Score:  0.50575006
Wall time: 51.3 s
