In [7]:
import torch
import numpy as np
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [8]:

# Load all-MiniLM-L6-v2 through the high-level sentence-transformers wrapper.
# (Described by the original author as the most downloaded sentence transformer on HuggingFace.)
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')



In [17]:
# Two near-identical sentences used as the example inputs.
sentences = [
    "This is an example sentence",
    "Here is an example sentence",
]

In [18]:
# Encode every sentence in one call; the result holds one embedding row per sentence.
embeddings = model.encode(sentences)

In [19]:
# Each input sentence gets its own 384-dimensional embedding, so two sentences
# give an array of shape (2, 384).
embeddings.shape

(2, 384)

In [20]:
# Look at the raw embedding vector for the first sentence.
embeddings[0]

array([ 6.76568821e-02,  6.34958595e-02,  4.87130657e-02,  7.93049410e-02,
        3.74480337e-02,  2.65276711e-03,  3.93748507e-02, -7.09845778e-03,
        5.93614802e-02,  3.15370001e-02,  6.00980520e-02, -5.29051535e-02,
        4.06067818e-02, -2.59308070e-02,  2.98427828e-02,  1.12691033e-03,
        7.35149160e-02, -5.03819808e-02, -1.22386612e-01,  2.37028431e-02,
        2.97265183e-02,  4.24768664e-02,  2.56337654e-02,  1.99519354e-03,
       -5.69190867e-02, -2.71598585e-02, -3.29035483e-02,  6.60248846e-02,
        1.19007066e-01, -4.58791442e-02, -7.26214424e-02, -3.25839929e-02,
        5.23413643e-02,  4.50552590e-02,  8.25300720e-03,  3.67023498e-02,
       -1.39415013e-02,  6.53919429e-02, -2.64272932e-02,  2.06431447e-04,
       -1.36643406e-02, -3.62809524e-02, -1.95043329e-02, -2.89738476e-02,
        3.94270346e-02, -8.84090513e-02,  2.62426864e-03,  1.36713749e-02,
        4.83063087e-02, -3.11565604e-02, -1.17329173e-01, -5.11690117e-02,
       -8.85287449e-02, -

In [21]:
# Cosine similarity between the two sentence embeddings.
# This is the foundation of RAG: embed your dataset, embed the query, find the
# dataset embeddings most similar to the query embedding, then use the original
# text behind those embeddings to answer the query.
cosine_similarity = torch.nn.functional.cosine_similarity(
    torch.tensor(embeddings[0:1]),  # slicing keeps a leading batch axis: (1, 384)
    torch.tensor(embeddings[1:2]),
    dim=1,  # compare along the feature axis (this is also the default)
)
cosine_similarity


tensor([0.8809])

In [22]:
# The sentence-transformers library hides a lot of the work involved in producing embeddings.
# Let's look at what goes on under the hood by using the transformers library directly;
# this is closer to how you work with LLMs.
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

In [23]:
# Load the tokenizer and the bare transformer for the same checkpoint from the HuggingFace Hub.
# NOTE: this rebinds `model`, which previously held the SentenceTransformer wrapper;
# re-run the SentenceTransformer cell before calling `model.encode(...)` again.
checkpoint = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)



In [55]:
# Many sentence-transformers models are built on the BERT architecture
# (though recently some are based on LLMs); displaying the model shows its layers.
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 384, padding_idx=0)
    (position_embeddings): Embedding(512, 384)
    (token_type_embeddings): Embedding(2, 384)
    (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-5): 6 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=384, out_features=384, bias=True)
            (key): Linear(in_features=384, out_features=384, bias=True)
            (value): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)


In [24]:
# The config shows hidden_size = 384, which is the dimensionality of the embeddings.
model.config

BertConfig {
  "_name_or_path": "sentence-transformers/all-MiniLM-L6-v2",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.45.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [27]:
# A key setting here is model_max_length=512: the model can only handle inputs of up to
# 512 tokens (this matches max_position_embeddings in the model config).
tokenizer

BertTokenizerFast(name_or_path='sentence-transformers/all-MiniLM-L6-v2', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [43]:
# Let's use a single longer input to take a closer look at what happens with tokens.
# Uncomment the second input to see truncation happen (it is 233 * 3 tokens, more than the 512-token limit).
inputs = [
  """
  Books bombarded his shoulder, his arms, his upturned face.  A book lit, almost obediently, like a white pigeon, in his hands, wings fluttering.  In the dim, wavering light, a page hung open and it was like a snowy feather, the words delicately painted thereon.  In all the rush and fervor, Montage had only an instant to read a line, but it blazed in his mind for the next minute as if stamped there with fiery steel.  “Time has fallen asleep in the afternoon sunshine.”  He dropped the book.  Immediately, another fell into his arms.
  """,
  # """
  # The tent he lived in stood right smack up against the wall of the shallow, dull-colored forest separating his own squadron from Dunbar’ s.  Immediately alongside was the abandoned railroad ditch that carried the pipe that carried the aviation gasoline down to the fuel trucks at the airfield.  Thanks to Orr, his roommate, it was the most luxurious tent in the squadron.  Each time Yossarian returned from one of his holidays in the hospital or rest leaves in Rome, he was surprised by some new comfort Orr had installed in his absence - running water, wood-burning fireplace, cement floor.  Yossarian had chosen the site, and he and Orr had raised the tent to get her.  Orr, who was a grinning pygmy with pilot’s wings and thick, wavy brown hair parted in the middle, furnished all the knowledge, while Yossarian, who was taller, stronger, broader, and faster, did most of the work.  Just the two of them lived there, although the tent was big enough for six.  When summer came, Orr rolled up the side flaps to allow a breeze that never blew to flush away the air baking inside.
  # The tent he lived in stood right smack up against the wall of the shallow, dull-colored forest separating his own squadron from Dunbar’ s.  Immediately alongside was the abandoned railroad ditch that carried the pipe that carried the aviation gasoline down to the fuel trucks at the airfield.  Thanks to Orr, his roommate, it was the most luxurious tent in the squadron.  Each time Yossarian returned from one of his holidays in the hospital or rest leaves in Rome, he was surprised by some new comfort Orr had installed in his absence - running water, wood-burning fireplace, cement floor.  Yossarian had chosen the site, and he and Orr had raised the tent to get her.  Orr, who was a grinning pygmy with pilot’s wings and thick, wavy brown hair parted in the middle, furnished all the knowledge, while Yossarian, who was taller, stronger, broader, and faster, did most of the work.  Just the two of them lived there, although the tent was big enough for six.  When summer came, Orr rolled up the side flaps to allow a breeze that never blew to flush away the air baking inside.
  # The tent he lived in stood right smack up against the wall of the shallow, dull-colored forest separating his own squadron from Dunbar’ s.  Immediately alongside was the abandoned railroad ditch that carried the pipe that carried the aviation gasoline down to the fuel trucks at the airfield.  Thanks to Orr, his roommate, it was the most luxurious tent in the squadron.  Each time Yossarian returned from one of his holidays in the hospital or rest leaves in Rome, he was surprised by some new comfort Orr had installed in his absence - running water, wood-burning fireplace, cement floor.  Yossarian had chosen the site, and he and Orr had raised the tent to get her.  Orr, who was a grinning pygmy with pilot’s wings and thick, wavy brown hair parted in the middle, furnished all the knowledge, while Yossarian, who was taller, stronger, broader, and faster, did most of the work.  Just the two of them lived there, although the tent was big enough for six.  When summer came, Orr rolled up the side flaps to allow a breeze that never blew to flush away the air baking inside.
  # """
]

In [44]:
# Tokenize the inputs into a batch of PyTorch tensors.
encoded_input = tokenizer(
    inputs,
    padding=True,         # pad shorter inputs so the batch is rectangular
    truncation=True,      # cut inputs that exceed the model's maximum length
    return_tensors='pt',  # return PyTorch tensors
)

In [45]:
# (batch size, number of tokens) of the tokenized batch.
encoded_input["input_ids"].shape

torch.Size([1, 127])

In [46]:
# Compute token embeddings with a forward pass through the transformer.
# torch.no_grad() disables gradient tracking, which is not needed for inference.
with torch.no_grad():
    model_output = model(**encoded_input)

In [47]:
#Mean Pooling - Take attention mask into account for correct averaging
# basically we average the embeddings, but only for the tokens that are not padding tokens
def mean_pooling(model_output, attention_mask):
    """Average token embeddings into one vector per input, ignoring masked tokens.

    Args:
        model_output: Indexable model output whose first element holds the
            token embeddings, shaped (batch, tokens, hidden).
        attention_mask: Tensor shaped (batch, tokens); positions with value 0
            (padding) contribute nothing to the average.

    Returns:
        Tensor shaped (batch, hidden) with the mask-weighted mean of the token
        embeddings for each input.
    """
    token_embeddings = model_output[0]
    # Broadcast the mask over the hidden axis so it can weight every feature.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    weighted_sum = (token_embeddings * mask).sum(dim=1)
    # Clamp the token count so a fully masked row divides by 1e-9 instead of 0.
    token_count = mask.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / token_count


In [53]:
# The model returns one 384-dimensional embedding per token: (batch, tokens, hidden size).
model_output[0].shape

torch.Size([1, 127, 384])

In [48]:
# Perform pooling: average the per-token embeddings into a single embedding vector per input.
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])


In [49]:

# Normalize embeddings.
# This step is often taken for granted or glossed over, so to spell it out:
# each embedding is rescaled to unit L2 length (p=2) along the feature axis (dim=1).
# This matters for cosine similarity: for unit-length vectors, cosine similarity
# reduces to a plain dot product.
normalized_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

In [50]:
# One 384-dimensional, unit-length embedding for the single input.
normalized_embeddings.shape

torch.Size([1, 384])