In [1]:
%%capture
!pip install --upgrade openai datasets sentence_transformers huggingface_hub mteb pytorch_metric_learning peft

In [None]:
import torch
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction, TranslationEvaluator
from datasets import load_dataset, Dataset
from sentence_transformers.losses import MultipleNegativesRankingLoss, GISTEmbedLoss, CoSENTLoss
from sentence_transformers import (
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    models,
    SentenceTransformer
)
import copy
import mteb
from utils import PoolingConcat
import numpy as np
from datasets import concatenate_datasets, Value

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f'Using device = {device}')


In [None]:
# Token-level encoder loaded from the TinyBERT checkpoint.
# NOTE(review): the previous comment described this as a "transformer with hashed
# embedding"; nothing in this cell applies hashing -- confirm against `utils`.
transformer = models.Transformer('huawei-noah/TinyBERT_General_4L_312D')

# Pooling layer (project-local `PoolingConcat`), sized from the encoder's
# word-embedding dimension.
word_embedding_dim = transformer.get_word_embedding_dimension()
pooling = PoolingConcat(word_embedding_dim)

# Model to be trained: encoder followed by pooling.
model = SentenceTransformer(modules=[transformer, pooling])

parameter_count = np.sum([p.numel() for p in model.parameters()])
print(f'The model has {parameter_count} parameters.')

# Guide model handed to GISTEmbedLoss below.
guide = SentenceTransformer("avsolatorio/GIST-small-Embedding-v0")

# One loss per training stage: GIST for the NLI triplets, CoSENT for the scored pairs.
gist_loss = GISTEmbedLoss(model, guide)
cosent_loss = CoSENTLoss(model)

# First Step: NLI

Each training example is a triplet: (premise, true hypothesis, contradiction).

In [None]:
# Both splits come from the "triplet" subset of the all-nli dataset.
NLI_REPO = "sentence-transformers/all-nli"
NLI_SUBSET = "triplet"

nli_train = load_dataset(NLI_REPO, NLI_SUBSET, split='train')
nli_test = load_dataset(NLI_REPO, NLI_SUBSET, split='test')

print(f'train size = {len(nli_train)} | test size = {len(nli_test)}')

In [None]:
# Stage 1: train `model` on the NLI triplets with the GIST loss.
# Logging and evaluation share one cadence (in optimizer steps).
log_step = 50

args = SentenceTransformerTrainingArguments(
    output_dir="models/mpnet-base-all-nli-triplet",
    report_to="none",
    # optimisation
    num_train_epochs=10,
    learning_rate=1e-6,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    # evaluation / logging
    eval_strategy="steps",
    eval_steps=log_step,
    logging_steps=log_step,
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    loss=gist_loss,
    train_dataset=nli_train,
    eval_dataset=nli_test,
)

trainer.train()

# **Fine Tuning**: STS-B

In [None]:
# Sentence-pair similarity data for the fine-tuning stage.
#
# Every dataset is reshaped to the columns (sentence1, sentence2, score), with the
# raw score divided by a per-dataset constant (5 for STS-B and SICK, 4 for BIOSSES).

# Previously the concatenated STS-B + BIOSSES + SICK sets were built and then
# immediately overwritten by the STS-B-only splits. The choice is now explicit;
# the default (False) keeps the behaviour that was actually in effect: STS-B only.
USE_ALL_STS_DATASETS = False

# Columns shared by all datasets once they have been reshaped.
STS_COLUMNS = ['sentence1', 'sentence2', 'score']


def _load_stsb_split(split):
    """Load one English STS-B split with its score divided by 5 and named `score`."""
    stsb = load_dataset('stsb_multi_mt', 'en', split=split)
    stsb = stsb.map(lambda row: {'similarity_score': row['similarity_score']/5})
    return stsb.rename_column("similarity_score", "score")


def _load_sick_split(split):
    """Load one SICK split reshaped to (sentence1, sentence2, score), score divided by 5."""
    sick = load_dataset("sick", trust_remote_code=True, split=split)
    sick = sick.rename_column("relatedness_score", "score")
    sick = sick.rename_column("sentence_A", "sentence1")
    sick = sick.rename_column("sentence_B", "sentence2")
    sick = sick.map(lambda row: {'score': row['score']/5})
    return sick.remove_columns(['id', 'label', 'entailment_BA', 'entailment_AB',
                                'sentence_A_dataset', 'sentence_B_dataset',
                                'sentence_A_original', 'sentence_B_original'])


def _merge_sts(parts):
    """Concatenate datasets after aligning their columns and score dtype.

    `concatenate_datasets` needs matching features, so each part is reduced to
    STS_COLUMNS and its score is cast to float64 first.
    """
    aligned = [
        part.select_columns(STS_COLUMNS).cast_column('score', Value('float64'))
        for part in parts
    ]
    return concatenate_datasets(aligned)


# STS-B
train_stsb = _load_stsb_split('train')
test_stsb = _load_stsb_split('test')

# BIOSSES (only a train split is loaded)
# NOTE(review): assumes this dataset already exposes sentence1/sentence2/score -- confirm.
train_bio = load_dataset("tabilab/biosses", split='train')
train_bio = train_bio.map(lambda row: {'score': row['score']/4})

# SICK
train_sick = _load_sick_split('train')
test_sick = _load_sick_split('test')

# Final train / test sets used by the evaluator and the trainer.
if USE_ALL_STS_DATASETS:
    train_sts = _merge_sts([train_stsb, train_bio, train_sick])
    test_sts = _merge_sts([test_stsb, test_sick])
else:
    train_sts = train_stsb
    test_sts = test_stsb

print(f'train size = {len(train_sts)} | test size = {len(test_sts)}')

In [8]:
# Evaluator over the STS test pairs; cosine similarity is the main similarity function.
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=test_sts['sentence1'],
    sentences2=test_sts['sentence2'],
    scores=test_sts['score'],
    main_similarity=SimilarityFunction.COSINE,
)

In [None]:
# Stage 2: fine-tune the NLI-trained `model` on the scored sentence pairs with
# the CoSENT loss. Logging and evaluation share one cadence (in optimizer steps).
log_step = 500

args = SentenceTransformerTrainingArguments(
      # This stage gets its own directory. It previously reused the stage-1 path
      # ("models/mpnet-base-all-nli-triplet"), so same-numbered `checkpoint-N`
      # folders written by the two runs would collide.
      output_dir="models/tinybert-sts-cosent",
      report_to="none",
      eval_strategy="steps",
      eval_steps=log_step,
      num_train_epochs=20,
      learning_rate=1e-5,
      logging_steps=log_step,
      per_device_train_batch_size=32,
      per_device_eval_batch_size=32
  )

trainer = SentenceTransformerTrainer(
      model=model,
      args=args,
      train_dataset=train_sts,
      eval_dataset=test_sts,
      loss=cosent_loss,
      # Passed in addition to eval_dataset, as in the original cell.
      evaluator=evaluator
  )

trainer.train()

In [None]:
# Sanity check: embed a few queries and documents and print every query/document similarity.
en_query = [
    'It is sunny outside',
    'Which fruit is the healthiest?',
    'cheap toys for a 5 year old boy',
]

# English documents
en_docs = [
    'I want to improve my health by eating more fruits. Which do you recommend?',
    'The weather is lovely',
    'this inexpensive action figure is the perfect gift for a young boy!',
]

queries = en_query
docs = en_docs

queries_emb = model.encode(queries)
docs_emb  = model.encode(docs)

# One row per query, one column per document.
similarities = model.similarity(queries_emb, docs_emb)
for query, query_sims in zip(queries, similarities):
    for doc, sim in zip(docs, query_sims):
        print(f'{query} -> {doc}: {round(sim.item(),2)}')
    print()

In [None]:
# Run the model on the selected MTEB tasks, restricted to English.
mteb_task_names = [
    "BIOSSES",
    "SICK-R",
    "STS12",
    "STS13",
    "STS14",
    "STS15",
    "STS16",
    "STS17",
    "STS22.v2",
    "STSBenchmark",
    "SummEval",
]
tasks = mteb.get_tasks(tasks=mteb_task_names, languages=['eng'])

evaluation = mteb.MTEB(tasks=tasks)

# Results are written under results_tiny/.
results = evaluation.run(model, output_folder="results_tiny/")