In [1]:
import tensorflow as tf
keras = tf.keras
import numpy as np
import matplotlib.pyplot as plt
import random
from keras_nlp.tokenizers import WordPieceTokenizer
import keras_nlp
from tensorflow.data import Dataset
import rouge_score

2023-09-20 11:05:28.175342: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-20 11:05:28.204995: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Using TensorFlow backend


In [2]:
print(tf.__version__)

2.12.1


In [3]:
BATCH_SIZE = 64
EPOCHS = 20
MAX_SEQUENCE_LENGTH = 40
ENG_VOCAB_SIZE = 15000
HIN_VOCAB_SIZE = 15000

EMBED_DIM = 256
INTERMEDIATE_DIM = 2048
NUM_HEADS = 8

In [4]:
pairs = []

with open("hin.txt") as f:
    for i in f.readlines():
        words = i.split('\t')
        pairs.append((words[0].lower(), words[1]))

print(pairs)

[('wow!', 'वाह!'), ('duck!', 'झुको!'), ('duck!', 'बतख़!'), ('help!', 'बचाओ!'), ('jump.', 'उछलो.'), ('jump.', 'कूदो.'), ('jump.', 'छलांग.'), ('hello!', 'नमस्ते।'), ('hello!', 'नमस्कार।'), ('cheers!', 'वाह-वाह!'), ('cheers!', 'चियर्स!'), ('exhale.', 'सांस छोड़।'), ('exhale.', 'सांस छोड़ो।'), ('got it?', 'समझे कि नहीं?'), ("i'm ok.", 'मैं ठीक हूँ।'), ('inhale.', 'सांस ले।'), ('inhale.', 'सांस लो।'), ('thanks!', 'धन्यवाद!'), ('we won.', 'हम जीते।'), ('awesome!', 'बहुत बढ़िया!'), ('come in.', 'अंदर आ जाओ।'), ('get out!', 'बाहर निकल जाओ!'), ('go away!', 'चले जाओ!'), ('goodbye!', 'ख़ुदा हाफ़िज़।'), ('perfect!', 'उत्तम!'), ('perfect!', 'सही!'), ('we lost.', 'हम हार गए।'), ('welcome.', 'आपका स्वागत है।'), ('welcome.', 'स्वागतम्।'), ('have fun.', 'मज़े करना।'), ('have fun.', 'मौज करना।'), ('have fun.', 'मज़े करो।'), ('i forgot.', 'मैं भूल गया।'), ('i forgot.', 'मैं भूल गई।'), ("i'll pay.", 'मैं पैसे दूंगा।'), ("i'm fine.", 'मैं ठीक हूँ।'), ("i'm full.", 'मेरा पेट भर गया है।'), ("let's go!", 'चलो

In [5]:
random.shuffle(pairs)
val_samples = int(0.15 * len(pairs))
train_samples = len(pairs) - 2 * val_samples

train_pairs = pairs[:train_samples]
val_pairs = pairs[train_samples:train_samples + val_samples]
test_pairs = pairs[train_samples + val_samples:]

print(f"Total {len(pairs)}")
print(f"Train {len(train_pairs)}")

Total 2979
Train 2087


In [6]:
def train_word_piece(text_samples, vocab_size, reserved_tokens):
    word_piece_ds = Dataset.from_tensor_slices(text_samples)
    vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
        word_piece_ds.batch(1000).prefetch(2),
        vocabulary_size=vocab_size,
        reserved_tokens=reserved_tokens,
    )
    return vocab

In [7]:
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]

eng_samples = [pair[0] for pair in train_pairs]
eng_vocab = train_word_piece(eng_samples, ENG_VOCAB_SIZE, reserved_tokens)

hin_samples = [pair[1] for pair in train_pairs]
hin_vocab = train_word_piece(hin_samples, HIN_VOCAB_SIZE, reserved_tokens)

2023-09-20 11:05:30.232089: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-09-20 11:05:30.249606: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-09-20 11:05:30.249735: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [8]:
print("English Tokens: ", eng_vocab[100:110])
print("Hindi Tokens: ", hin_vocab[100:110])

English Tokens:  ['very', 'and', 'all', 'time', 'come', 'had', 'they', '##es', 'from', 'no']
Hindi Tokens:  ['हो', 'की', 'था', '##र', '##ी', '##ने', 'बहुत', 'हूँ', '##न', '##ना']


In [9]:
eng_tokenizer = WordPieceTokenizer(
    vocabulary=eng_vocab, lowercase=False
)
hin_tokenizer = WordPieceTokenizer(
    vocabulary=hin_vocab, lowercase=False
)

In [10]:
eng_input_ex = pairs[0][0]
eng_tokens_ex = eng_tokenizer.tokenize(eng_input_ex)
print("English sentence: ", eng_input_ex)
print("Tokens: ", eng_tokens_ex)
print(
    "Recovered text after detokenizing: ",
    eng_tokenizer.detokenize(eng_tokens_ex),
)

print()

hin_input_ex = pairs[0][1]
hin_tokens_ex = hin_tokenizer.tokenize(hin_input_ex)
print("Spanish sentence: ", hin_input_ex)
print("Tokens: ", hin_tokens_ex)
print(
    "Recovered text after detokenizing: ",
    hin_tokenizer.detokenize(hin_tokens_ex),
)

English sentence:  they charged me for the broken window.
Tokens:  tf.Tensor([106  25 141 140 197  61  65  73  50  24 117 305  78  45 302 182  10], shape=(17,), dtype=int32)
Recovered text after detokenizing:  tf.Tensor(b'they charged me for the broken window .', shape=(), dtype=string)

Spanish sentence:  उन्होंने ने मुझसे टूटी हुई खिड़की के लिए पैसे माँगे।
Tokens:  tf.Tensor([467 139 214  42 301 342 210  34 215 235 259  98 118 267 411 277  79], shape=(17,), dtype=int32)
Recovered text after detokenizing:  tf.Tensor(b'\xe0\xa4\x89\xe0\xa4\xa8\xe0\xa5\x8d\xe0\xa4\xb9\xe0\xa5\x8b\xe0\xa4\x82\xe0\xa4\xa8\xe0\xa5\x87 \xe0\xa4\xa8\xe0\xa5\x87 \xe0\xa4\xae\xe0\xa5\x81\xe0\xa4\x9d\xe0\xa4\xb8\xe0\xa5\x87 \xe0\xa4\x9f\xe0\xa5\x82\xe0\xa4\x9f\xe0\xa5\x80 \xe0\xa4\xb9\xe0\xa5\x81\xe0\xa4\x88 \xe0\xa4\x96\xe0\xa4\xbf\xe0\xa4\xa1\xe0\xa4\xbc\xe0\xa4\x95\xe0\xa5\x80 \xe0\xa4\x95\xe0\xa5\x87 \xe0\xa4\xb2\xe0\xa4\xbf\xe0\xa4\x8f \xe0\xa4\xaa\xe0\xa5\x88\xe0\xa4\xb8\xe0\xa5\x87 \xe0\xa4\xae\xe0\xa4\x

In [11]:
def preprocess_batch(eng, hin):
    batch_size = tf.shape(hin)[0]

    eng = eng_tokenizer(eng)
    hin = hin_tokenizer(hin)

    eng_start_end_packer = keras_nlp.layers.StartEndPacker(
        sequence_length=MAX_SEQUENCE_LENGTH,
        pad_value=eng_tokenizer.token_to_id("[PAD]"),
    )
    eng = eng_start_end_packer(eng)

    hin_start_end_packer = keras_nlp.layers.StartEndPacker(
        sequence_length=MAX_SEQUENCE_LENGTH + 1,
        start_value=hin_tokenizer.token_to_id("[START]"),
        end_value=hin_tokenizer.token_to_id("[END]"),
        pad_value=hin_tokenizer.token_to_id("[PAD]"),
    )
    hin = hin_start_end_packer(hin)

    return (
        {
            "encoder_inputs": eng,
            "decoder_inputs": hin[:, :-1],
        },
        hin[:, 1:],
    )


def make_dataset(pairs):
    eng_texts, hin_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    hin_texts = list(hin_texts)
    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, hin_texts))
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(preprocess_batch, num_parallel_calls=tf.data.AUTOTUNE)
    return dataset.shuffle(2048).prefetch(16).cache()


train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

In [12]:
for inputs, targets in train_ds.take(1):
    print(f'inputs["encoder_inputs"].shape: {inputs["encoder_inputs"].shape}')
    print(f'inputs["decoder_inputs"].shape: {inputs["decoder_inputs"].shape}')
    print(f"targets.shape: {targets.shape}")

2023-09-20 11:05:34.185665: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [2087]
	 [[{{node Placeholder/_0}}]]
2023-09-20 11:05:34.185892: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_6' with dtype uint8 and shape [27948]
	 [[{{node Placeholder/_6}}]]


inputs["encoder_inputs"].shape: (64, 40)
inputs["decoder_inputs"].shape: (64, 40)
targets.shape: (64, 40)


2023-09-20 11:05:34.435596: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [13]:
# Encoder
encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")

x = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=ENG_VOCAB_SIZE,
    sequence_length=MAX_SEQUENCE_LENGTH,
    embedding_dim=EMBED_DIM,
    mask_zero=True,
)(encoder_inputs)

encoder_outputs = keras_nlp.layers.TransformerEncoder(
    intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS
)(inputs=x)
encoder = keras.Model(encoder_inputs, encoder_outputs)


In [14]:
# Decoder
decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, EMBED_DIM), name="decoder_state_inputs")

x = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=HIN_VOCAB_SIZE,
    sequence_length=MAX_SEQUENCE_LENGTH,
    embedding_dim=EMBED_DIM,
    mask_zero=True,
)(decoder_inputs)

x = keras_nlp.layers.TransformerDecoder(
    intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS
)(decoder_sequence=x, encoder_sequence=encoded_seq_inputs)
x = keras.layers.Dropout(0.5)(x)
decoder_outputs = keras.layers.Dense(HIN_VOCAB_SIZE, activation="softmax")(x)
decoder = keras.Model(
    [
        decoder_inputs,
        encoded_seq_inputs,
    ],
    decoder_outputs,
)
decoder_outputs = decoder([decoder_inputs, encoder_outputs])

In [15]:
transformer = keras.Model(
    [encoder_inputs, decoder_inputs],
    decoder_outputs,
    name="transformer",
)
transformer.summary()

Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 token_and_position_embedding (  (None, None, 256)   3850240     ['encoder_inputs[0][0]']         
 TokenAndPositionEmbedding)                                                                       
                                                                                                  
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 transformer_encoder (Transform  (None, None, 256)   1315072     ['token_and_position_em

In [16]:
transformer.compile(
    "rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
transformer.fit(train_ds, epochs=EPOCHS, validation_data=val_ds)

Epoch 1/20


2023-09-20 11:05:35.438354: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_6' with dtype uint8 and shape [27948]
	 [[{{node Placeholder/_6}}]]
2023-09-20 11:05:35.438576: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [2087]
	 [[{{node Placeholder/_0}}]]
2023-09-20 11:05:37.902137: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:637] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2023-09-20 11:05:37.926363: I tensorflow/compiler/xla/service/service.cc:169] XLA service 0x25cf7f40 initialized for platform CUD



2023-09-20 11:05:45.144315: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_7' with dtype uint8 and shape [41332]
	 [[{{node Placeholder/_7}}]]
2023-09-20 11:05:45.144539: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_7' with dtype uint8 and shape [41332]
	 [[{{node Placeholder/_7}}]]


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f4db030bc90>

In [17]:
def decode_sequences(input_sentences):
    batch_size = tf.shape(input_sentences)[0]

    encoder_input_tokens = eng_tokenizer(input_sentences).to_tensor(
        shape=(None, MAX_SEQUENCE_LENGTH)
    )

    def next(prompt, cache, index):
        logits = transformer([encoder_input_tokens, prompt])[:, index - 1, :]
        hidden_states = None
        return logits, hidden_states, cache

    length = 40
    start = tf.fill((batch_size, 1), hin_tokenizer.token_to_id("[START]"))
    pad = tf.fill((batch_size, length - 1), hin_tokenizer.token_to_id("[PAD]"))
    prompt = tf.concat((start, pad), axis=-1)

    generated_tokens = keras_nlp.samplers.GreedySampler()(
        next,
        prompt,
        end_token_id=hin_tokenizer.token_to_id("[END]"),
        index=1,
    )
    generated_sentences = hin_tokenizer.detokenize(generated_tokens)
    return generated_sentences


test_eng_texts = [pair[0] for pair in test_pairs]
for i in range(2):
    input_sentence = random.choice(test_eng_texts)
    translated = decode_sequences(tf.constant([input_sentence]))
    translated = translated.numpy()[0].decode("utf-8")
    translated = (
        translated.replace("[PAD]", "")
        .replace("[START]", "")
        .replace("[END]", "")
        .strip()
    )
    print(f"** Example {i} **")
    print(input_sentence)
    print(translated)
    print()

** Example 0 **
i cried all night.
मैं घर के लिए बहुत देर से मिला ।

** Example 1 **
he can't run very fast.
वह एक बहुत सुनता है ।



In [18]:
rouge_1 = keras_nlp.metrics.RougeN(order=1)
rouge_2 = keras_nlp.metrics.RougeN(order=2)

for test_pair in test_pairs[:30]:
    input_sentence = test_pair[0]
    reference_sentence = test_pair[1]

    translated_sentence = decode_sequences(tf.constant([input_sentence]))
    translated_sentence = translated_sentence.numpy()[0].decode("utf-8")
    translated_sentence = (
        translated_sentence.replace("[PAD]", "")
        .replace("[START]", "")
        .replace("[END]", "")
        .strip()
    )

    rouge_1(reference_sentence, translated_sentence)
    rouge_2(reference_sentence, translated_sentence)

print("ROUGE-1 Score: ", rouge_1.result())
print("ROUGE-2 Score: ", rouge_2.result())

AttributeError: 'dict' object has no attribute '_metric_obj'