In [None]:
# Mount Google Drive at /content/drive (the google.colab module is Colab-specific).
# NOTE(review): none of the cells visible in this notebook read from or write to
# /content/drive - confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


- [sbert](https://www.sbert.net/)
- [training_overview](https://www.sbert.net/docs/sentence_transformer/training_overview.html)
- [loss_overview](https://www.sbert.net/docs/sentence_transformer/loss_overview.html)
- [training_examples](https://www.sbert.net/docs/sentence_transformer/training/examples.html)
- [matryoshka](https://www.sbert.net/examples/sentence_transformer/training/matryoshka/README.html)
- [adaptive_layer](https://www.sbert.net/examples/sentence_transformer/training/adaptive_layer/README.html)
- [training_with_prompts](https://www.sbert.net/examples/sentence_transformer/training/prompts/README.html)
- [training_with_peft](https://www.sbert.net/examples/sentence_transformer/training/peft/README.html)

In [None]:
# @title **All needed imports**
from datasets import load_dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    SentenceTransformerModelCardData,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import TripletEvaluator

## **Full Embeddings Fine-tuning**

### **Load the Model**

In [None]:
# Optional: metadata attached to the model for its model card.
# NOTE(review): model_name repeats the base model's name, while the model is later
# pushed as "full_fine_tuned_bge-large-en-v1.5" - confirm which name is intended.
model_card=SentenceTransformerModelCardData(
    language='en', license="mit",
    model_name="bge-large-en-v1.5"
)

# Required: load the pretrained base model that is fine-tuned below.
model = SentenceTransformer(
    "BAAI/bge-large-en-v1.5",
    model_card_data=model_card
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [None]:
# Display the model's module stack (Transformer -> Pooling -> Normalize in the recorded output).
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': True, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [None]:
# @title **Load your dataset**
# Load the "triplet" configuration of all-nli (columns: anchor, positive, negative).
dataset = load_dataset("sentence-transformers/all-nli", "triplet")
# The train split holds 557,850 triplets; keep only the first 200,000 rows
# (select(range(...)) takes a prefix, it does not sample randomly).
train_dataset = dataset["train"].select(range(200_000))
# The 'dev' split serves as the validation set.
validation_dataset = dataset['dev']
test_dataset = dataset['test']

README.md: 0.00B [00:00, ?B/s]

triplet/train-00000-of-00001.parquet:   0%|          | 0.00/38.4M [00:00<?, ?B/s]

triplet/dev-00000-of-00001.parquet:   0%|          | 0.00/782k [00:00<?, ?B/s]

triplet/test-00000-of-00001.parquet:   0%|          | 0.00/810k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/557850 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/6584 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6609 [00:00<?, ? examples/s]

### **Data Inspection**

In [None]:
# Column names of every split (train, dev and test), not just train.
dataset.column_names

{'train': ['anchor', 'positive', 'negative'],
 'dev': ['anchor', 'positive', 'negative'],
 'test': ['anchor', 'positive', 'negative']}

In [None]:
def random_examples_selector(given_dataset):
    """Print one randomly chosen (anchor, positive, negative) triplet.

    A single row index is drawn and all three fields are read from that same
    row, so the printed texts belong together as one triplet.

    Args:
        given_dataset: A sized collection that supports integer indexing and
            returns, for each row, a mapping with the keys ``'anchor'``,
            ``'positive'`` and ``'negative'`` (e.g. a Hugging Face
            ``datasets.Dataset`` split).

    Raises:
        ValueError: If ``given_dataset`` contains no rows.
    """
    # Function-local import so the cell works without touching the imports cell.
    from random import randrange

    n_rows = len(given_dataset)
    if n_rows == 0:
        raise ValueError("given_dataset is empty; there is no example to show")

    # randrange() excludes the upper bound. randint(0, n_rows) includes it and
    # could therefore yield the out-of-range index n_rows.
    row = given_dataset[randrange(n_rows)]

    print(f"Anchor: {row['anchor']}")
    print(f"Positive: {row['positive']}")
    print(f"Negative: {row['negative']}")

In [None]:
# Spot-check the training subset: prints randomly picked anchor, positive and negative texts.
random_examples_selector(train_dataset)

Anchor: Many people busily working, watching a presentation, or talking to each other.
Positive: A blue mascot is at a sporting event.
Negative: The dog is cuddled up near a warm fire.


In [None]:
# Spot-check the validation split: prints randomly picked anchor, positive and negative texts.
random_examples_selector(validation_dataset)

Anchor: Women and children sitting outside in the shade while a group of younger children sitting inside in the shade.
Positive: A dog is licking his nose.
Negative: A boy makes a mud pie.


In [None]:
# Spot-check the test split: prints randomly picked anchor, positive and negative texts.
random_examples_selector(test_dataset)

Anchor: People sitting in a dim restaurant eating.
Positive: The Department established the Joint Forces Command in Virginia.
Negative: The girls are playing on a street.


In [None]:
def examples_selector(given_dataset,index:int):
    """Print the anchor, positive and negative texts stored at position ``index``.

    Args:
        given_dataset: Object whose ``'anchor'``, ``'positive'`` and
            ``'negative'`` entries can each be indexed by position.
        index: Position to read from each of the three columns.
    """
    # Fetch all three fields up front, in column order, before printing anything.
    anchor_text, positive_text, negative_text = (
        given_dataset[column][index]
        for column in ('anchor', 'positive', 'negative')
    )

    print(f"Anchor: {anchor_text}")
    print(f"Positive: {positive_text}")
    print(f"Negative: {negative_text}")

In [None]:
# Show the triplet stored at index 1 of the training subset.
examples_selector(train_dataset,1)

Anchor: Children smiling and waving at camera
Positive: There are children present
Negative: The kids are frowning


In [None]:
# Show the triplet stored at index 1 of the validation split.
examples_selector(validation_dataset,1)

Anchor: Two young children in blue jerseys, one with the number 9 and one with the number 2 are standing on wooden steps in a bathroom and washing their hands in a sink.
Positive: Two kids in numbered jerseys wash their hands.
Negative: Two kids in jackets walk to school.


In [None]:
# Show the triplet stored at index 1 of the test split.
examples_selector(test_dataset,1)

Anchor: A woman with a green headscarf, blue shirt and a very big grin.
Positive: The woman is very happy.
Negative: The woman has been shot.


### **Loss Definition**

In [None]:
# Training objective, bound to the model that is being fine-tuned.
loss = MultipleNegativesRankingLoss(model)

### **Setting up Weights and Biases for Logging**

In [None]:
import wandb

In [None]:
# Authenticate this runtime with Weights & Biases (the recorded run stored the key in /root/.netrc).
wandb.login()

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdannyai[0m ([33mdannyai-danny-the-analyst[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
import os
# W&B project under which this run is logged.
os.environ["WANDB_PROJECT"]="Fine-Tune-Embeddings-bge-large-en-v1.5"

# Upload model checkpoints to W&B as artifacts.
# The value "true" raised an error here; use 'checkpoint' or 'end' instead.
# In the recorded run each checkpoint upload took roughly 35-55 seconds.
os.environ["WANDB_LOG_MODEL"]="checkpoint"

# Turn off watch to log faster.
os.environ["WANDB_WATCH"]="false"

### **Specify Training Arguments**

[Training Arguments](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.save_total_limit)

In [None]:
# Hyperparameters plus evaluation, checkpointing and logging schedule for the run.
training_args = SentenceTransformerTrainingArguments(
    # num_train_epochs=1, # full training,
    # Short run: stop after 600 steps instead of a full epoch
    # (the recorded TrainOutput reports epoch 0.048 of the 200K-row training subset).
    max_steps=600,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    fp16=False,  # Set to False if you get an error that your GPU can't run on FP16
    bf16=True,  # Set to True if you have a GPU that supports BF16
    # NOTE(review): NO_DUPLICATES presumably keeps duplicate texts out of a batch, which
    # matters for in-batch-negative losses - confirm against the sampler documentation.
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    # Some optional tracking and debugging parameters:
    # evaluate, checkpoint and log on the same 100-step cadence.
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    # Cap on how many checkpoints are retained in output_dir.
    save_total_limit=3,
    logging_strategy="steps",
    logging_steps=100,
    # logging_first_step=True,
    # NOTE(review): no metric_for_best_model is set, so the Trainer's default
    # selection metric decides which checkpoint counts as "best" - confirm that is intended.
    load_best_model_at_end=True,
    seed = 30,
    output_dir = "outputs",
    run_name="Fine_Tune_Embedding_bge_large_en_v_1_5",
    report_to=["wandb"] # reporting to Weights and biases project
)

In [None]:
# Display the resolved training arguments for inspection.
training_args



### **Evaluator**

- [TripletEvaluator](https://www.sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.TripletEvaluator)
- [SimilarityFunction](https://www.sbert.net/docs/package_reference/sparse_encoder/SparseEncoder.html#sentence_transformers.SimilarityFunction)

In [None]:
# The dev split holds 6,584 triplets; only the first 3,000 are used for validation.
max_samples = 3000
validation_dataset = validation_dataset.select(range(max_samples))
# To validate on the whole dev split instead, skip the select() call above.
from sentence_transformers.evaluation import SimilarityFunction
# Build the triplet evaluator on the validation subset.
# With batch_size=50 the 3,000 texts per column are encoded in 60 batches.
val_evaluator = TripletEvaluator(
    anchors = validation_dataset['anchor'],
    positives = validation_dataset['positive'],
    negatives = validation_dataset['negative'],
    batch_size=50,
    main_similarity_function=SimilarityFunction.COSINE,
    show_progress_bar=True,
    name='all-nli-val'
)
# Note: 'main_similarity_function' is used on purpose. The library warns that the older
# 'main_distance_function' parameter is deprecated and will be removed in a future release.

In [None]:
# @title **Evaluate the base model (baseline before fine-tuning)**

In [None]:
# Baseline: score the model on the validation subset before any fine-tuning.
val_evaluator(model)
# Value noted from an earlier run: {'all-nli-val_cosine_accuracy': 0.9548906683921814}
# NOTE(review): the output recorded for this cell shows 0.9599999785423279, so the figure above looks stale.

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

{'all-nli-val_cosine_accuracy': 0.9599999785423279}

In [None]:
# @title **Create a trainer & train**

# Wire together the model, training arguments, datasets, evaluator and loss.
# NOTE(review): eval_dataset presumably feeds the "Validation Loss" column of the training
# log and evaluator the "Cosine Accuracy" column - confirm against the trainer documentation.
trainer = SentenceTransformerTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    evaluator=val_evaluator,
    loss=loss
)

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

In [None]:
# Run fine-tuning; the returned TrainOutput carries the final step count and training metrics.
trainer.train()

Step,Training Loss,Validation Loss,All-nli-val Cosine Accuracy
100,0.5862,0.270474,0.953333
200,0.498,0.252001,0.955667
300,0.4677,0.259722,0.956333
400,0.4365,0.245045,0.957333
500,0.3971,0.243823,0.959
600,0.4393,0.235998,0.960667


Batches:   0%|          | 0/60 [00:00<?, ?it/s]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (outputs/checkpoint-100)... Done. 43.3s


Batches:   0%|          | 0/60 [00:00<?, ?it/s]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (outputs/checkpoint-200)... Done. 45.7s


Batches:   0%|          | 0/60 [00:00<?, ?it/s]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (outputs/checkpoint-300)... Done. 55.1s


Batches:   0%|          | 0/60 [00:00<?, ?it/s]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (outputs/checkpoint-400)... Done. 40.8s


Batches:   0%|          | 0/60 [00:00<?, ?it/s]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (outputs/checkpoint-500)... Done. 35.6s


Batches:   0%|          | 0/60 [00:00<?, ?it/s]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (outputs/checkpoint-600)... Done. 40.9s


TrainOutput(global_step=600, training_loss=0.47081382115681963, metrics={'train_runtime': 1820.2565, 'train_samples_per_second': 5.274, 'train_steps_per_second': 0.33, 'total_flos': 0.0, 'train_loss': 0.47081382115681963, 'epoch': 0.048})

In [None]:
# @title **Evaluate the trained model on the test set**
# Triplet evaluator over the whole test split.
# NOTE(review): unlike val_evaluator, no batch_size or main_similarity_function is passed, so
# library defaults apply; the recorded metric key is still 'all-nli-test_cosine_accuracy'.
test_evaluator = TripletEvaluator(
    anchors=test_dataset["anchor"],
    positives=test_dataset["positive"],
    negatives=test_dataset["negative"],
    name="all-nli-test",
)

In [None]:
# Score the model on the test split after training.
# NOTE(review): the base model is not scored on the test split in the cells visible here,
# so this number has no pre-fine-tuning baseline to compare against.
test_evaluator(model)

{'all-nli-test_cosine_accuracy': 0.9574822187423706}

### **Model Saving**

In [None]:
# @title **Save the trained model**
# Write the model to a local directory.
# NOTE(review): this is a relative path, not a location under /content/drive - confirm
# that saving outside the mounted Drive is intended.
model.save_pretrained("embedding_model/full_fine_tuned_bge-large-en-v1.5")

In [None]:
# @title **Push it to the Hugging Face Hub**
# There is no need to create the repository on the Hub beforehand.
# NOTE(review): exist_ok=True presumably lets the push proceed when the repository
# already exists - confirm against the push_to_hub documentation.
model.push_to_hub("full_fine_tuned_bge-large-en-v1.5", exist_ok=True)

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  /tmp/tmplnovk_81/model.safetensors    :   0%|          | 28.6kB / 1.34GB            

'https://huggingface.co/DannyAI/full_fine_tuned_bge-large-en-v1.5/commit/2cbe7ecf6d34042c0fb540dc0d32b36c35ffb23c'