In [1]:
# MINE
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


- [sbert](https://www.sbert.net/)
- [training_overview](https://www.sbert.net/docs/sentence_transformer/training_overview.html)
- [loss_overview](https://www.sbert.net/docs/sentence_transformer/loss_overview.html)
- [training_examples](https://www.sbert.net/docs/sentence_transformer/training/examples.html)
- [matryoshka](https://www.sbert.net/examples/sentence_transformer/training/matryoshka/README.html)
- [adaptive_layer](https://www.sbert.net/examples/sentence_transformer/training/adaptive_layer/README.html)
- [training_with_prompts](https://www.sbert.net/examples/sentence_transformer/training/prompts/README.html)
- [training_with_peft](https://www.sbert.net/examples/sentence_transformer/training/peft/README.html)
- [link text](https://)

In [3]:
# @title **All needed imports**
from datasets import load_dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    SentenceTransformerModelCardData,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss, \
AdaptiveLayerLoss, MatryoshkaLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import TripletEvaluator

## **Embedding-Fine-Tuning-Matryoshka-Loss**

### **Load the Model**

In [4]:
# This is optional
model_card=SentenceTransformerModelCardData(
    language='en', license="mit",
    model_name="bge-large-en-v1.5"
)

# A must
model = SentenceTransformer(
    "BAAI/bge-large-en-v1.5",
    model_card_data=model_card
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [5]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': True, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [6]:
print(model._first_module().auto_model.config.output_hidden_states)
# Should print: True
# This needs to be adjusted when loading the model

False


In [7]:

# This is optional
model_card=SentenceTransformerModelCardData(
    language='en', license="mit",
    model_name="bge-large-en-v1.5"
)

# A must
model = SentenceTransformer(
    "BAAI/bge-large-en-v1.5",
    model_card_data=model_card,
    # This fails
    # output_hidden_states=True
)

# This works
# 2️⃣ Enable hidden states on the first module’s auto_model
model._first_module().auto_model.config.output_hidden_states = True

In [8]:
print(model._first_module().auto_model.config.output_hidden_states)
# Should print: True

True


In [9]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': True, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [10]:
# @title **Load your dataset**
dataset = load_dataset("sentence-transformers/all-nli", "triplet")
# train dataset is about 500K
# Will be using a subset of 200K
train_dataset = dataset["train"].select(range(200_000))
validation_dataset = dataset['dev']
test_dataset = dataset['test']

README.md: 0.00B [00:00, ?B/s]

triplet/train-00000-of-00001.parquet:   0%|          | 0.00/38.4M [00:00<?, ?B/s]

triplet/dev-00000-of-00001.parquet:   0%|          | 0.00/782k [00:00<?, ?B/s]

triplet/test-00000-of-00001.parquet:   0%|          | 0.00/810k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/557850 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/6584 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6609 [00:00<?, ? examples/s]

### **Data Inspection**

In [11]:
# train
dataset.column_names

{'train': ['anchor', 'positive', 'negative'],
 'dev': ['anchor', 'positive', 'negative'],
 'test': ['anchor', 'positive', 'negative']}

In [12]:
def random_examples_selector(given_dataset):
   from random import randint
   anchor = given_dataset['anchor'][randint(0, len(given_dataset))]
   positive = given_dataset['positive'][randint(0, len(given_dataset))]
   negative = given_dataset['negative'][randint(0, len(given_dataset))]

   print(f"Anchor: {anchor}")
   print(f"Positive: {positive}")
   print(f"Negative: {negative}")

In [13]:
random_examples_selector(train_dataset)

Anchor: Two women wearing hats covered in flowers are posing.
Positive: Three kids are caring for a pet.
Negative: There is a woman sitting on the sidewalk.


In [14]:
random_examples_selector(validation_dataset)

Anchor: Numbers began wafting about on the  I'd say at least five
Positive: Mandatory account proposals are likely to increase savings by forcing people to save.
Negative: The man is riding his bike in the country.


In [15]:
random_examples_selector(test_dataset)

Anchor: A patient is being worked on by doctors and nurses.
Positive: Though there is the odd bit of useful information, most of it has been covered in the growing number of other books on this subject.
Negative: Kids are taking a bath.


In [16]:
def examples_selector(given_dataset,index:int):
   anchor = given_dataset['anchor'][index]
   positive = given_dataset['positive'][index]
   negative = given_dataset['negative'][index]

   print(f"Anchor: {anchor}")
   print(f"Positive: {positive}")
   print(f"Negative: {negative}")

In [17]:
examples_selector(train_dataset,1)

Anchor: Children smiling and waving at camera
Positive: There are children present
Negative: The kids are frowning


In [18]:
examples_selector(validation_dataset,1)

Anchor: Two young children in blue jerseys, one with the number 9 and one with the number 2 are standing on wooden steps in a bathroom and washing their hands in a sink.
Positive: Two kids in numbered jerseys wash their hands.
Negative: Two kids in jackets walk to school.


In [19]:
examples_selector(test_dataset,1)

Anchor: A woman with a green headscarf, blue shirt and a very big grin.
Positive: The woman is very happy.
Negative: The woman has been shot.


### **Loss Definition**

In [21]:
base_loss = MultipleNegativesRankingLoss(model)

matryoshka_dims = [1024,768, 512, 256, 128, 64]
loss =  MatryoshkaLoss(model=model,
                         loss=base_loss,
                         matryoshka_dims=matryoshka_dims)

[Example Code](https://github.com/UKPLab/sentence-transformers/blob/master/examples/sentence_transformer/training/matryoshka/matryoshka_nli.py)

In [22]:
loss

MatryoshkaLoss(
  (model): SentenceTransformer(
    (0): Transformer({'max_seq_length': 512, 'do_lower_case': True, 'architecture': 'BertModel'})
    (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
    (2): Normalize()
  )
  (loss): MultipleNegativesRankingLoss(
    (model): SentenceTransformer(
      (0): Transformer({'max_seq_length': 512, 'do_lower_case': True, 'architecture': 'BertModel'})
      (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
      (2): Normalize()
    )
    (cross_entropy_loss): Cr

### **Setting up Weights and Biases for Logging**

In [23]:
import wandb

In [24]:
wandb.login()

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdannyai[0m ([33mdannyai-danny-the-analyst[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [25]:
import os
# set the wandb project where this run will be logged
os.environ["WANDB_PROJECT"]="embedding_fine_tuning_matryoshka_loss_bge_large_en_v1.5"

# save your trained model checkpoint to wandb
# os.environ["WANDB_LOG_MODEL"]="true" # throws an error, must use 'checkpoint' or 'end'
os.environ["WANDB_LOG_MODEL"]="checkpoint"

# turn off watch to log faster
os.environ["WANDB_WATCH"]="false"

### **Specify Training Arguments**

[Training Arguments](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.save_total_limit)

In [32]:
training_args = SentenceTransformerTrainingArguments(
    # num_train_epochs=1, # full training,
    max_steps=100, # reduced from 300 to 100
    per_device_train_batch_size=5, # reduced from 16 to 5
    per_device_eval_batch_size=5, # reduced from 16 to 5
    learning_rate=2e-5,
    warmup_ratio=0.1,
    fp16=False,  # Set to False if you get an error that your GPU can't run on FP16
    bf16=True,  # Set to True if you have a GPU that supports BF16
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    # Some optional tracking and debugging parameters:
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=3,
    logging_strategy="steps",
    logging_steps=100,
    # logging_first_step=True,
    load_best_model_at_end=True,
    seed = 30,
    output_dir = "outputs",
    run_name="embedding_fine_tuning_matryoshka_loss_bge_large_en_v1.5",
    report_to=["wandb"] # reporting to Weights and biases project
)

In [33]:
print(training_args)

SentenceTransformerTrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
batch_sampler=BatchSamplers.NO_DUPLICATES,
bf16=True,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=False,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=100,
eval_

### **Evaluator**

- [TripletEvaluator](https://www.sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.TripletEvaluator)
- [SimilarityFunction](https://www.sbert.net/docs/package_reference/sparse_encoder/SparseEncoder.html#sentence_transformers.SimilarityFunction)

In [34]:
# Validation set is 6K
# Will use 3K
max_samples = 3000
validation_dataset = validation_dataset.select(range(max_samples))
# Will use all
# validation_dataset = validation_dataset
from sentence_transformers.evaluation import SimilarityFunction, SequentialEvaluator
# Initialise the evaluator

validation_evaluators = []
for dimension in matryoshka_dims:
  validation_evaluators.append(
      TripletEvaluator(
          anchors = validation_dataset['anchor'],
          positives = validation_dataset['positive'],
          negatives = validation_dataset['negative'],
          batch_size=50,
          main_similarity_function=SimilarityFunction.COSINE,
          show_progress_bar=True,
          name=f'all-nli-val-{dimension}',
          truncate_dim = dimension
          )
  )

validation_evaluator = SequentialEvaluator(evaluators=validation_evaluators)

# Link to SequentialEvaluator docs
# https://www.sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.SequentialEvaluator

In [35]:
# @title **Create a trainer & train**

trainer = SentenceTransformerTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    evaluator=validation_evaluator,
    loss=loss
)

In [36]:
trainer.train()

Step,Training Loss,Validation Loss,All-nli-val-1024 Cosine Accuracy,All-nli-val-768 Cosine Accuracy,All-nli-val-512 Cosine Accuracy,All-nli-val-256 Cosine Accuracy,All-nli-val-128 Cosine Accuracy,All-nli-val-64 Cosine Accuracy,Sequential Score
100,2.4168,1.291984,0.950667,0.949667,0.947667,0.945333,0.939333,0.931,0.931


Batches:   0%|          | 0/60 [00:00<?, ?it/s]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (outputs/checkpoint-100)... Done. 51.6s


TrainOutput(global_step=100, training_loss=2.416847991943359, metrics={'train_runtime': 500.3744, 'train_samples_per_second': 0.999, 'train_steps_per_second': 0.2, 'total_flos': 0.0, 'train_loss': 2.416847991943359, 'epoch': 0.0025})

In [37]:
# @title **Evaluate the trained model on the test set**
test_evaluator = TripletEvaluator(
    anchors=test_dataset["anchor"],
    positives=test_dataset["positive"],
    negatives=test_dataset["negative"],
    name="all-nli-test",
)

In [38]:
test_evaluators = []
for dimension in matryoshka_dims:
  test_evaluators.append(
      TripletEvaluator(
        anchors = test_dataset['anchor'],
        positives = test_dataset['positive'],
        negatives = test_dataset['negative'],
        batch_size=50,
        main_similarity_function=SimilarityFunction.COSINE,
        show_progress_bar=True,
        name=f'all-nli-test-{dimension}',
        truncate_dim = dimension
        )
  )

test_evaluator = SequentialEvaluator(evaluators=test_evaluators)

In [39]:
test_evaluator(model)

Batches:   0%|          | 0/133 [00:00<?, ?it/s]

Batches:   0%|          | 0/133 [00:00<?, ?it/s]

Batches:   0%|          | 0/133 [00:00<?, ?it/s]

Batches:   0%|          | 0/133 [00:00<?, ?it/s]

Batches:   0%|          | 0/133 [00:00<?, ?it/s]

Batches:   0%|          | 0/133 [00:00<?, ?it/s]

Batches:   0%|          | 0/133 [00:00<?, ?it/s]

Batches:   0%|          | 0/133 [00:00<?, ?it/s]

Batches:   0%|          | 0/133 [00:00<?, ?it/s]

Batches:   0%|          | 0/133 [00:00<?, ?it/s]

Batches:   0%|          | 0/133 [00:00<?, ?it/s]

Batches:   0%|          | 0/133 [00:00<?, ?it/s]

Batches:   0%|          | 0/133 [00:00<?, ?it/s]

Batches:   0%|          | 0/133 [00:00<?, ?it/s]

Batches:   0%|          | 0/133 [00:00<?, ?it/s]

Batches:   0%|          | 0/133 [00:00<?, ?it/s]

Batches:   0%|          | 0/133 [00:00<?, ?it/s]

Batches:   0%|          | 0/133 [00:00<?, ?it/s]

{'all-nli-test-1024_cosine_accuracy': 0.9532455801963806,
 'all-nli-test-768_cosine_accuracy': 0.9515811800956726,
 'all-nli-test-512_cosine_accuracy': 0.950370728969574,
 'all-nli-test-256_cosine_accuracy': 0.9493115544319153,
 'all-nli-test-128_cosine_accuracy': 0.9452261924743652,
 'all-nli-test-64_cosine_accuracy': 0.9362989664077759,
 'sequential_score': 0.9362989664077759}

In [None]:
# Base model without finetuning (took this from previous notebooks)
# {'all-nli-val_cosine_accuracy': 0.9599999785423279}

### **Model Saving**

In [40]:
# @title **Save the trained model**
model.save_pretrained("embedding_fine_tuning_matryoshka_loss_bge_large_en_v1.5")

In [41]:
# @title **Push it to the Hugging Face Hub**
# do not create repo on hugging face
model.push_to_hub("embedding_fine_tuning_matryoshka_loss_bge_large_en_v1.5", exist_ok=True)

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  /tmp/tmp0bqux6ta/model.safetensors    :   0%|          | 3.96MB / 1.34GB            

'https://huggingface.co/DannyAI/embedding_fine_tuning_matryoshka_loss_bge_large_en_v1.5/commit/492a375531137fb18d5553e8a6f54f5dbad7d836'