## This notebook is for testing and getting familiar with embedding models

In [3]:
from sentence_transformers import SentenceTransformer
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch

### Load a sentence transformer embedding model

In [4]:
# Instantiate a SentenceTransformer from the named pre-trained checkpoint.
embedder = SentenceTransformer("msmarco-distilbert-base-v4")
# Bare last expression: the notebook displays the model's module structure.
embedder



SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

Notice that the embedder above consists of a transformer layer followed by a pooling layer.

### Try out the embedding model

In [5]:
# Sample sentence about the EU; it is reused by the comparison cells further down.
wikipedia_text = "The European Union (EU) is a supranational political and economic union of 27 member states that are located primarily in Europe"
# encode() maps the text to its embedding; the recorded output shows a NumPy array of floats.
embeddings = embedder.encode(wikipedia_text)
embeddings

array([ 8.37452352e-01, -4.49766785e-01,  5.94369173e-01, -3.64690214e-01,
       -2.04791799e-01,  9.67684269e-01,  4.59626615e-01,  1.13824344e+00,
        6.86243951e-01, -1.44724578e-01, -8.19937170e-01, -2.27527738e-01,
       -5.06509781e-01, -2.50946224e-01,  7.02426910e-01, -4.33130145e-01,
        1.50074840e-01,  2.91990250e-01, -2.18762130e-01, -5.76998889e-01,
       -5.91868639e-01,  9.97827768e-01,  7.48944819e-01, -1.04504779e-01,
        1.31007805e-01, -1.01986721e-01,  3.01252782e-01, -8.44390094e-01,
       -1.92844257e-01, -2.89206877e-02, -3.71905029e-01,  5.61008334e-01,
       -8.65827739e-01,  7.32985139e-01,  7.27419078e-01,  9.89027470e-02,
       -1.00309134e-01, -1.09461918e-01,  1.47822231e-01,  1.59677878e-01,
        5.94619989e-01,  1.25826165e-01,  1.18856937e-01,  3.86250794e-01,
        4.10713792e-01, -2.79500365e-01,  2.73893446e-01,  6.11875176e-01,
        1.18775442e-01,  2.80967146e-01,  4.16664451e-01, -2.86158681e-01,
        1.87005639e-01,  

Two different texts can be compared by using different metrics. Cosine similarity (the inner product of the two vectors after normalizing them to unit length) is one popular metric. Try that out.

In [6]:
# A second sentence on the same topic (the EU), to be compared against wikipedia_text.
wikipedia_text_2 = "The EU has often been described as a sui generis political entity (without precedent or comparison) combining the characteristics of both a federation and a confederation."
embeddings_2 = embedder.encode(wikipedia_text_2)

# Calculate the similarity between the two embeddings
def cosine(u, v):
    """Return the cosine similarity between two vectors.

    The dot product of ``u`` and ``v`` is divided by the product of their
    norms as computed by ``np.linalg.norm``. There is no guard against
    zero-norm inputs, so an all-zero vector leads to a division by zero.
    """
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    return np.dot(u, v) / norm_product

# Similarity of the two EU sentences; the value is displayed as the cell output.
cosine(embeddings, embeddings_2)


0.47032338

In [7]:
# A sentence on an unrelated topic (programming) as a contrast to the EU sentences.
text_about_python = "Python is an interpreted, high-level and general-purpose programming language. Python's design philosophy emphasizes code readability with its notable use of significant indentation."
embeddings_python = embedder.encode(text_about_python)

# Similarity between the first EU sentence and the Python sentence.
cosine(embeddings, embeddings_python)

0.010332608

### TODO

Calculate the similarity of some other text samples.

## Calculate the similarity by constructing the pooling "by hand"

This example is exactly the same as the one in the Hugging Face model card.

In [8]:

def create_embeddings(sentences, model_name):
    """Embed sentences with a Hugging Face model using hand-written mean pooling.

    The tokenizer and model are loaded from ``model_name`` on every call, the
    sentences are tokenized as one padded/truncated batch, and the token
    embeddings are averaged with the attention mask as weights, so that
    positions masked out by the tokenizer do not affect the average.

    Args:
        sentences: Text input passed straight to the tokenizer (the notebook
            passes a list of strings).
        model_name: Identifier handed to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        A torch tensor holding the pooled embeddings
        (presumably one row per input sentence - TODO confirm).
    """
    # Load the tokenizer and the model for the requested checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    # Tokenize everything as a single batch of PyTorch tensors.
    batch = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

    # Forward pass without gradient tracking.
    with torch.no_grad():
        outputs = model(**batch)

    # Mean pooling: the first element of the model output contains all token embeddings.
    token_vectors = outputs[0]
    # Expand the attention mask to the size of the token embeddings so it can act as a weight.
    mask = batch['attention_mask'].unsqueeze(-1).expand(token_vectors.size()).float()
    masked_sum = torch.sum(token_vectors * mask, 1)
    # Clamp the denominator so a fully masked row cannot divide by zero.
    mask_total = torch.clamp(mask.sum(1), min=1e-9)
    return masked_sum / mask_total



### First, calculate the similarity between the sentences by using the msmarco-distilbert-base-v4 model

In [9]:
# Embed all three sample texts in one batch with the hand-built pipeline, using the msmarco checkpoint.
sentence_embeddings = create_embeddings([wikipedia_text, wikipedia_text_2, text_about_python], 'sentence-transformers/msmarco-distilbert-base-v4')

# Index 0 and 1 are the two EU sentences; index 2 is the Python sentence (order of the list above).
print("Similarity between wiki sentences:", cosine(sentence_embeddings[0], sentence_embeddings[1]))
print("Similarity between wiki and python sentences:", cosine(sentence_embeddings[0], sentence_embeddings[2]))


Similarity between wiki sentences: 0.47032377
Similarity between wiki and python sentences: 0.010332571


### Do the same thing with the distilbert base model

In [10]:
# Same three texts, but embedded with the plain distilbert-base-uncased checkpoint.
# Note: this assigns to sentence_embeddings again, replacing any earlier value of that name.
sentence_embeddings = create_embeddings([wikipedia_text, wikipedia_text_2, text_about_python], 'distilbert/distilbert-base-uncased')

# Index 0 and 1 are the two EU sentences; index 2 is the Python sentence (order of the list above).
print("Similarity between wiki sentences:", cosine(sentence_embeddings[0], sentence_embeddings[1]))
print("Similarity between wiki and python sentences:", cosine(sentence_embeddings[0], sentence_embeddings[2]))


Similarity between wiki sentences: 0.85148394
Similarity between wiki and python sentences: 0.58959365


### TODO

What do you notice about the results?

### Create dataset for evaluating embedding models

In [11]:
from datasets import load_dataset
from sentence_transformers.readers import InputExample
from sentence_transformers import SentenceTransformer, models
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Divisor used to normalize the gold similarity scores to the range 0 ... 1.
MAX_STS_SCORE = 5.0


def to_input_examples(rows):
    """Convert STS benchmark rows into ``InputExample`` objects.

    Args:
        rows: Iterable of mappings that provide the keys ``"sentence1"``,
            ``"sentence2"`` and ``"score"``.

    Returns:
        A list with one ``InputExample`` per row. ``texts`` holds the sentence
        pair and ``label`` is the score divided by ``MAX_STS_SCORE``.
    """
    return [
        InputExample(
            texts=[row["sentence1"], row["sentence2"]],
            label=float(row["score"]) / MAX_STS_SCORE,  # Normalize score to range 0 ... 1
        )
        for row in rows
    ]


# Load the STS benchmark dataset
data = load_dataset("mteb/stsbenchmark-sts")

# Both splits go through the same conversion, so they cannot drift apart.
test_samples = to_input_examples(data["test"])
train_samples = to_input_examples(data["train"])

### Construct our own embedding model with SentenceTransformer

In [12]:

# Wrap the plain DistilBERT checkpoint as the token-embedding module; max_seq_length=256 limits the input length.
word_embedding_model = models.Transformer('distilbert/distilbert-base-uncased', max_seq_length=256)
# Pooling module sized to the transformer's embedding dimension.
# The pooling mode is left at the library default (presumably mean pooling - TODO confirm).
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

# Chain the modules (transformer first, then pooling) into one sentence-embedding model.
test_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

### Evaluate the model accuracy (without fine-tuning)

In [20]:

# Build an evaluator from the STS test pairs and their normalized labels.
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name="sts-test")
# Calling the evaluator on a model returns a single score, displayed as the cell output
# (presumably a correlation between embedding similarity and the gold labels - confirm in the library docs).
test_evaluator(test_model)

0.6823797788593509

In [21]:
# Same evaluation for the pre-trained msmarco embedder, for comparison with test_model.
test_evaluator(embedder)

0.7885316636814821

## Fine-tune the distilbert base model

In [15]:
# Peek at the first two training examples (InputExample objects).
train_samples[:2]

[<sentence_transformers.readers.InputExample.InputExample at 0x355dd3d10>,
 <sentence_transformers.readers.InputExample.InputExample at 0x355dd3dd0>]

In [16]:
# The sentence pair stored in the first training example.
train_samples[0].texts

['A plane is taking off.', 'An air plane is taking off.']

In [17]:
# Its label: the similarity score after normalization to the range 0 ... 1.
train_samples[0].label

1.0

In [23]:
from pathlib import Path

from torch.utils.data import DataLoader
from sentence_transformers import losses, evaluation

# Number of training examples to use (a slice from the start of train_samples).
n_samples = 5000

# Where fit() writes its output. This is a path relative to the working directory,
# so the notebook runs on any machine instead of depending on one user's home
# directory. Adjust it here if the model should be stored somewhere else.
output_dir = Path("output") / "distilbert-base-uncased-sts"

train_dataloader = DataLoader(train_samples[:n_samples], shuffle=True, batch_size=16)

# Loss computed on the model being fine-tuned; labels are the normalized similarity scores.
train_loss = losses.CosineSimilarityLoss(test_model)

# NOTE(review): this evaluator is built from test_samples, so the evaluation that runs
# during training uses the same data as the final test score. Consider a separate
# validation split for this purpose.
evaluator = evaluation.EmbeddingSimilarityEvaluator([s.texts[0] for s in test_samples], [s.texts[1] for s in test_samples], [s.label for s in test_samples])

# Tune the model
# NOTE(review): evaluation_steps=10 presumably runs the evaluator every 10 training
# steps, which is frequent for a full evaluation set - confirm and raise it if training is slow.
test_model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100, evaluator=evaluator, evaluation_steps=10, output_path=str(output_dir))


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/313 [00:00<?, ?it/s]

In [24]:
# Re-run the test evaluation on test_model after the fit() call above.
test_evaluator(test_model)

0.8108965168539495