In [2]:
import tensorflow as tf
from transformers import TFBertModel, AutoTokenizer
from transformers import DataCollatorWithPadding
from datasets import load_dataset, concatenate_datasets
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import random
import string
import time
from tqdm import tqdm
from datetime import timedelta

2024-06-12 10:08:35.131452: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-12 10:08:35.131564: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-12 10:08:35.402708: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# 2. Load the STSB dataset: https://huggingface.co/datasets/sentence-transformers/stsb
train_dataset = load_dataset("feverlash/stsb-indo-dataset", split="train")
eval_dataset = load_dataset("feverlash/stsb-indo-dataset", split="validation")
test_dataset = load_dataset("feverlash/stsb-indo-dataset", split="test")

train_data = train_dataset.map(lambda example: {'sentence1': example['sentence1'], 'sentence2': example['sentence2'], 'score': example['score'] / 5.0})
eval_data = eval_dataset.map(lambda example: {'sentence1': example['sentence1'], 'sentence2': example['sentence2'], 'score': example['score'] / 5.0})
test_data = test_dataset.map(lambda example: {'sentence1': example['sentence1'], 'sentence2': example['sentence2'], 'score': example['score'] / 5.0})

tokenizer = AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p1')

max_length = 128
batch_size = 32
eval_batch_size = 32

# 3. Tokenize the dataset
def process(first_token:str, second_token:str):
    inputs = tokenizer([first_token, second_token],
                            max_length=max_length,
                            truncation=True,
                            padding="max_length",
                            return_tensors='tf')
    return inputs

def tokenize_function(examples):
    first_sent = examples['sentence1']
    second_sent = examples['sentence2']
    tokenized_sentences = process(first_sent, second_sent)
    return tokenized_sentences

train_ds = train_data.map(tokenize_function, batched=False)
eval_ds = eval_data.map(tokenize_function, batched=False)
test_ds = test_data.map(tokenize_function, batched=False)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 4. Prepare dataset for TensorFlow
train_dataset = train_ds.to_tf_dataset(
    columns=['input_ids', 'attention_mask'],
    label_cols=['score'],
    shuffle=True,
    batch_size=batch_size
)
eval_dataset = eval_ds.to_tf_dataset(
    columns=['input_ids', 'attention_mask'],
    label_cols=['score'],
    shuffle=False,
    batch_size=eval_batch_size
)

Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 


In [4]:
def collate_fn(texts):

    input_ids = texts['input_ids']
    attention_masks = texts['attention_mask']

    features = [{'input_ids': input_id, 'attention_mask': attention_mask}
                for input_id, attention_mask in zip(input_ids, attention_masks)]

    return features

In [5]:
# class CosineSimilarityLoss(tf.keras.losses.Loss):

#     def __init__(self, loss_fn=tf.keras.losses.MeanSquaredError(name='mean_squared_error'), transform_fn=tf.identity, name="cosine_similarity_loss"):
#         super(CosineSimilarityLoss, self).__init__(name=name)
#         self.loss_fn = loss_fn
#         self.transform_fn = transform_fn

#     def call(self, y_true, y_pred):
#         # y_pred is expected to be a tuple of two embeddings
#         emb_1 = tf.stack([pred[0] for pred in y_pred])
#         emb_2 = tf.stack([pred[1] for pred in y_pred])
#         # Compute cosine similarity
#         cos_similarity = abs(tf.keras.losses.cosine_similarity(emb_1, emb_2, axis=1))
#         # Transform the cosine similarity
#         transformed_similarity = self.transform_fn(cos_similarity)
#         # Compute the loss
#         return self.loss_fn(tf.squeeze(y_true), transformed_similarity)

In [6]:
class BertForSTS(tf.keras.Model):

    def __init__(self):
        super(BertForSTS, self).__init__()
        self.bert = TFBertModel.from_pretrained('indobenchmark/indobert-base-p1')

    def call(self, input_data, training=False):
#         # Tokenize the input
#         inputs = self.tokenizer(input_data, return_tensors='tf', max_length=128, truncation=True, padding=True)
        outputs = self.bert(input_data)
        cls_output = outputs.last_hidden_state  # CLS token
        attention = input_data['attention_mask']
        mask = tf.cast(tf.expand_dims(attention, -1), dtype=tf.float32)
        masked_embeddings = cls_output * mask
        summed = tf.reduce_sum(masked_embeddings, axis=1)
        counts = tf.clip_by_value(tf.reduce_sum(mask, axis=1), clip_value_min=1e-9, clip_value_max=tf.float32.max)
        mean_pooled = summed / counts
        return mean_pooled

In [7]:
model = BertForSTS()

Some layers from the model checkpoint at indobenchmark/indobert-base-p1 were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at indobenchmark/indobert-base-p1.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [8]:
class CosineSimilarityLoss(tf.keras.losses.Loss):
    def __init__(self, loss_fn=tf.keras.losses.MeanSquaredError(name='mean_squared_error'), transform_fn=tf.identity, name="cosine_similarity_loss"):
        super(CosineSimilarityLoss, self).__init__(name=name)
        self.loss_fn = loss_fn
        self.transform_fn = transform_fn

    def call(self, y_true, y_pred):
        # y_pred is expected to be a tuple of two embeddings
        emb_1 = tf.stack([pred[0] for pred in y_pred])
        emb_2 = tf.stack([pred[1] for pred in y_pred])
        # Compute cosine similarity
        cos_similarity = abs(tf.keras.losses.cosine_similarity(emb_1, emb_2, axis=1))
        # Transform the cosine similarity
        transformed_similarity = self.transform_fn(cos_similarity)
        # Compute the loss
        return self.loss_fn(tf.squeeze(y_true), transformed_similarity)

In [9]:
# train_loss = tf.keras.metrics.Mean(name='train_loss')
# total_train_loss = 0
# for input, labels in tqdm(train_dataset.take(2)):
#     input['input_ids'] = input['input_ids']
#     input['attention_mask'] = input['attention_mask']
#     print(np.shape(input['input_ids']))
#     print(labels)
#     input = collate_fn(input)
#     print(np.shape(input))
#     embeddings = [model(feature) for feature in input]
#     print("embbedding:", np.shape(embeddings))
#     loss_fn = CosineSimilarityLoss()
#     emb_1 = tf.stack([pred[0] for pred in embeddings])
#     emb_2 = tf.stack([pred[1] for pred in embeddings])
#     print("embbedding 1 :", emb_1)
#     print("embbedding 2 :", emb_2)
#     cos_similarity = abs(tf.keras.losses.cosine_similarity(emb_1, emb_2, axis=1))
#     print(cos_similarity)
#     loss  = loss_fn(labels, embeddings)
#     total_train_loss += loss
#     print("LOSS : ",loss)
#     train_loss(loss)
# avg_loss = total_train_loss/len(train_dataset.take(2))
# avg_train_loss = train_loss.result()
# print(avg_train_loss)
# print(avg_loss)

In [10]:
epochs = 5
steps_per_epoch = len(train_dataset) // batch_size
num_train_steps = steps_per_epoch * epochs
print(len(train_dataset),steps_per_epoch, num_train_steps)

180 5 25


In [11]:
def format_time(elapsed):
    return str(timedelta(seconds=int(round((elapsed)))))

@tf.function
def train_step(model, input, labels, optimizer, loss_fn):
    with tf.GradientTape() as tape:
        embeddings = [model(feature, training=True) for feature in input]
        loss = loss_fn(labels, embeddings)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss

@tf.function
def val_step(model, input, labels, optimizer, loss_fn):
    embeddings = [model(feature, training=True) for feature in input]
    loss = loss_fn(labels, embeddings)
    return loss

def train(model, train_dataset, eval_dataset, epochs, batch_size):
    seed_val = 42

    random.seed(seed_val)
    np.random.seed(seed_val)
    tf.random.set_seed(seed_val)

    # Prepare optimizer, learning rate scheduler
    steps_per_epoch = len(train_dataset) // batch_size
    num_train_steps = steps_per_epoch * epochs
    optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
    loss_fn = CosineSimilarityLoss()

    # Metrics to track loss
    # train_loss = tf.keras.metrics.Mean(name='train_loss')
    # val_loss = tf.keras.metrics.Mean(name='val_loss')

    # Training and validation loops
    training_stats = []
    total_t0 = time.time()

    for epoch_i in range(epochs):
        print(f"======== Epoch {epoch_i + 1} / {epochs} ========")
        print('Training...')

        t0 = time.time()
        total_train_loss = 0

        for input, labels in tqdm(train_dataset):
            input['input_ids'] = input['input_ids']
            input['attention_mask'] = input['attention_mask']
            input = collate_fn(input)
            loss = train_step(model, input, labels, optimizer, loss_fn)
            total_train_loss += loss

        avg_train_loss = total_train_loss/len(train_dataset)
        training_time = format_time(time.time() - t0)

        print(f"  Average training loss: {avg_train_loss:.5f}")
        print(f"  Training epoch took: {training_time}")

        # Validation loop
        print("Running Validation...")
        t0 = time.time()
        
        total_val_loss = 0

        for input, labels in tqdm(eval_dataset):
            input['input_ids'] = input['input_ids']
            input['attention_mask'] = input['attention_mask']
            input = collate_fn(input)
            loss = val_step(model, input, labels, optimizer, loss_fn)
            total_val_loss += loss

        avg_val_loss = total_val_loss/len(eval_dataset)
        validation_time = format_time(time.time() - t0)

        print(f"  Validation Loss: {avg_val_loss:.5f}")
        print(f"  Validation took: {validation_time}")

        # Record all statistics from this epoch
        training_stats.append(
            {
                'epoch': epoch_i + 1,
                'Training Loss': avg_train_loss.numpy(),
                'Valid. Loss': avg_val_loss.numpy(),
                'Training Time': training_time,
                'Validation Time': validation_time
            }
        )

    print("Training complete!")
    print(f"Total training took {format_time(time.time() - total_t0)} (h:mm:ss)")

    return model, training_stats

In [12]:
model, training_stats = train(model, train_dataset, eval_dataset, epochs, batch_size)

Training...


I0000 00:00:1718187824.978833   11789 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
100%|██████████| 180/180 [25:21<00:00,  8.45s/it]  


  Average training loss: 0.03458
  Training epoch took: 0:25:22
Running Validation...


100%|██████████| 47/47 [10:03<00:00, 12.84s/it]  


  Validation Loss: 0.02737
  Validation took: 0:10:03
Training...


100%|██████████| 180/180 [05:43<00:00,  1.91s/it]


  Average training loss: 0.01585
  Training epoch took: 0:05:44
Running Validation...


100%|██████████| 47/47 [00:32<00:00,  1.47it/s]


  Validation Loss: 0.02619
  Validation took: 0:00:32
Training...


100%|██████████| 180/180 [05:42<00:00,  1.90s/it]


  Average training loss: 0.00899
  Training epoch took: 0:05:42
Running Validation...


100%|██████████| 47/47 [00:31<00:00,  1.47it/s]


  Validation Loss: 0.02768
  Validation took: 0:00:32
Training...


100%|██████████| 180/180 [05:40<00:00,  1.89s/it]


  Average training loss: 0.00646
  Training epoch took: 0:05:40
Running Validation...


100%|██████████| 47/47 [00:31<00:00,  1.48it/s]


  Validation Loss: 0.02838
  Validation took: 0:00:32
Training...


100%|██████████| 180/180 [05:40<00:00,  1.89s/it]


  Average training loss: 0.00529
  Training epoch took: 0:05:40
Running Validation...


100%|██████████| 47/47 [00:31<00:00,  1.48it/s]

  Validation Loss: 0.02838
  Validation took: 0:00:32
Training complete!
Total training took 1:00:19 (h:mm:ss)





In [13]:
import pandas as pd
# Create a DataFrame from our training statistics
df_stats = pd.DataFrame(data=training_stats)

# Use the 'epoch' as the row index
df_stats = df_stats.set_index('epoch')

# Display the table
df_stats

Unnamed: 0_level_0,Training Loss,Valid. Loss,Training Time,Validation Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.034582,0.027372,0:25:22,0:10:03
2,0.015855,0.026193,0:05:44,0:00:32
3,0.008994,0.027679,0:05:42,0:00:32
4,0.006458,0.028383,0:05:40,0:00:32
5,0.005293,0.028379,0:05:40,0:00:32


In [23]:
model_save_path = "./bert_sts_model/model"
model.save("my_model.h5")
print(f"Model saved to {model_save_path}")

  saving_api.save_model(


NotImplementedError: Saving the model to HDF5 format requires the model to be a Functional model or a Sequential model. It does not work for subclassed models, because such models are defined via the body of a Python method, which isn't safely serializable. Consider saving to the Tensorflow SavedModel format (by setting save_format="tf") or using `save_weights`.

In [24]:
model.save_weights('./checkpoints/my_checkpoint')

In [27]:
!zip "./checkpoints/my_checkpoint.zip"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



zip error: Nothing to do! (./checkpoints/my_checkpoint.zip)


In [15]:
test_dataset = test_ds.to_tf_dataset(
    columns=['input_ids', 'attention_mask'],
    label_cols=['score'],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)

Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 


In [16]:
def tokenizing(sentence_pair):
    input = tokenizer(sentence_pair, padding=True, truncation=True, max_length=max_length, return_tensors="tf")
    input['input_ids'] = input['input_ids']
    input['attention_mask'] = input['attention_mask']
    input['token_type_ids'] = input['token_type_ids']
    return input

In [17]:
def predict_similarity(sentence_pair):
    input = tokenizing(sentence_pair)

    # Ensure input is in the correct format
    input_ids = tf.convert_to_tensor(input['input_ids'])
    attention_mask = tf.convert_to_tensor(input['attention_mask'])
    token_type_ids = tf.convert_to_tensor(input['token_type_ids'])

    embed = model((input_ids, attention_mask), training=False)
    similiarity = tf.keras.metrics.CosineSimilarity()
    sim = abs(tf.keras.losses.cosine_similarity(embed[0], embed[1], axis=0))
    return sim, embed

In [18]:
# Prepare the data
skor = [i['score'] for i in test_data]
first_sent = [i['sentence1'] for i in test_data]
second_sent = [i['sentence2'] for i in test_data]
full_text = [[str(x), str(y)] for x,y in zip(first_sent, second_sent)]

In [19]:
example_1 = full_text[100]
print(f"Sentence 1: {example_1[0]}")
print(f"Sentence 2: {example_1[1]}")
sim, embed = predict_similarity(example_1)
print(f"Predicted similarity score: {round(sim.numpy(),2)}")

Sentence 1: Seekor kucing sedang berjalan di sekitar rumah.
Sentence 2: Seorang wanita sedang mengupas kentang.


TypeError: Exception encountered when calling layer 'bert_for_sts' (type BertForSTS).

tuple indices must be integers or slices, not str

Call arguments received by layer 'bert_for_sts' (type BertForSTS):
  • input_data=('tf.Tensor(shape=(2, 10), dtype=int32)', 'tf.Tensor(shape=(2, 10), dtype=int32)')
  • training=False