- Code originally from James Briggs's YouTube video ['Sentence Similarity With Transformers and PyTorch'](https://www.youtube.com/watch?v=jVPd7lEvjtg)

- The code has been rewritten by codenavy94 for study purposes


In [1]:
# Hugging Face checkpoint used for both the tokenizer and the encoder.
model_name = "sentence-transformers/bert-base-nli-mean-tokens"

# Example inputs. The first sentence is the reference that the others are
# compared against at the end of the notebook.
sentences = [
    "Three years later, the coffin was still full of Jello.",
    "The fish dreamed of escaping the fishbowl and into the toilet where he saw his friend go.",
    "The person box was packed with jelly many dozens of months later.",
    "Standing on one's head at job interviews forms a lasting impression.",
    "It took him a month to finish the meal.",
    "He found a leprechaun in his walnut shell.",
]

In [2]:
from transformers import AutoTokenizer, AutoModel
import torch

In [3]:
# Load the tokenizer and the pretrained model for the checkpoint named in
# `model_name`. The progress bars in this cell's output come from downloading
# the checkpoint files.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

Downloading: 100%|██████████| 399/399 [00:00<00:00, 133kB/s]
Downloading: 100%|██████████| 625/625 [00:00<00:00, 208kB/s]
Downloading: 100%|██████████| 226k/226k [00:00<00:00, 275kB/s]  
Downloading: 100%|██████████| 455k/455k [00:00<00:00, 484kB/s]  
Downloading: 100%|██████████| 2.00/2.00 [00:00<00:00, 1.00kB/s]
Downloading: 100%|██████████| 112/112 [00:00<00:00, 37.9kB/s]
Downloading: 100%|██████████| 418M/418M [00:13<00:00, 33.0MB/s] 


In [4]:
# Container for the model inputs, filled in by the tokenization cell:
# the token ids and the matching attention mask for every sentence.
tokens = {'input_ids':[], 'attention_mask':[]}

In [5]:
# Tokenize every sentence in one batched call. Calling the tokenizer directly
# replaces the per-sentence `encode_plus` loop; `encode_plus` is deprecated in
# favour of `__call__`.
encoded = tokenizer(sentences,
                    max_length=128,        # how many tokens per sequence (sentence)
                    truncation=True,       # cut sequences longer than max_length
                    padding='max_length',  # pad shorter sequences up to max_length
                    return_tensors='pt')

# Assign (rather than append) one 1-D tensor per sentence, so that re-running
# this cell does not accumulate duplicate rows in `tokens`.
tokens['input_ids'] = list(encoded['input_ids'])
tokens['attention_mask'] = list(encoded['attention_mask'])

In [7]:
# Combine the per-sentence 1-D tensors into a single 2-D tensor with one row
# per sentence (a list of tensors becomes one tensor with an extra dimension).
# torch.stack expects a sequence of tensors, so each value is passed through
# list() first: an already-stacked tensor is split back into its rows, which
# makes re-running this cell safe.
for key in ('input_ids', 'attention_mask'):
    tokens[key] = torch.stack(list(tokens[key]))

In [9]:
# Expected shape: (number of sentences, max_length), i.e. one row of token ids
# per sentence. Note that the second axis is the padded token count, not the
# embedding size (768).
tokens['input_ids'].shape

torch.Size([6, 128])

In [10]:
# Inference only: disable autograd so that no computation graph is built or
# kept in memory during the forward pass.
with torch.no_grad():
    outputs = model(**tokens)

In [12]:
# List the fields available on the model output.
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [15]:
# Per-token hidden states taken from the `last_hidden_state` field of the output.
embeddings = outputs.last_hidden_state
# Shape: (6 sentences, 128 tokens, 768 = hidden-state size of BERT).
# Each token is represented by a hidden-state vector of size (768,).
embeddings.shape

torch.Size([6, 128, 768])

In [16]:
# Padding tokens are not real tokens, so their embeddings must not contribute
# to the sentence vector. The attention mask marks which positions are real
# tokens and which are padding; it is used to zero out the padded positions
# before pooling.
attention = tokens['attention_mask']
attention.shape

torch.Size([6, 128])

In [18]:
# Append a trailing axis to the attention mask and broadcast it to the shape of
# `embeddings`, then cast to float so the two can be multiplied element-wise.
mask = attention[..., None].expand_as(embeddings).float()

In [20]:
# Element-wise product: positions where the mask is zero end up as zero vectors.
mask_embeddings = torch.mul(embeddings, mask)

In [21]:
# Raw token embeddings, shown for comparison with the masked version.
embeddings

tensor([[[-0.0692,  0.6230,  0.0354,  ...,  0.8033,  1.6314,  0.3281],
         [ 0.0367,  0.6842,  0.1946,  ...,  0.0848,  1.4747, -0.3008],
         [-0.0121,  0.6543, -0.0727,  ..., -0.0326,  1.7717, -0.6812],
         ...,
         [ 0.6712,  1.1871,  0.2726,  ...,  0.9307,  0.9777, -0.2166],
         [ 0.5063,  1.1734,  0.3907,  ...,  0.8818,  0.9201, -0.2802],
         [ 0.3268,  1.0662,  0.2948,  ...,  0.9246,  1.0575, -0.3765]],

        [[-0.3212,  0.8251,  1.0554,  ..., -0.1855,  0.1517,  0.3937],
         [-0.7146,  1.0297,  1.1217,  ...,  0.0331,  0.2382, -0.1563],
         [-0.2352,  1.1353,  0.8594,  ..., -0.4310, -0.0272, -0.2968],
         ...,
         [-0.0894,  0.5312,  1.1392,  ..., -0.2256, -0.5657,  0.1937],
         [-0.2446,  0.5845,  1.1385,  ..., -0.2867, -0.5586, -0.1296],
         [-0.1811,  0.6800,  1.2574,  ..., -0.2590, -0.5041, -0.0792]],

        [[-0.7576,  0.8399, -0.3792,  ...,  0.1271,  1.2514,  0.1365],
         [-0.6591,  0.7613, -0.4662,  ...,  0

In [22]:
# Same values as `embeddings`, except that among the 6 x 128 token vectors the
# 768-sized vectors of tokens that are not real (padding) are replaced with 0.
mask_embeddings

tensor([[[-0.0692,  0.6230,  0.0354,  ...,  0.8033,  1.6314,  0.3281],
         [ 0.0367,  0.6842,  0.1946,  ...,  0.0848,  1.4747, -0.3008],
         [-0.0121,  0.6543, -0.0727,  ..., -0.0326,  1.7717, -0.6812],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000, -0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000, -0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000, -0.0000]],

        [[-0.3212,  0.8251,  1.0554,  ..., -0.1855,  0.1517,  0.3937],
         [-0.7146,  1.0297,  1.1217,  ...,  0.0331,  0.2382, -0.1563],
         [-0.2352,  1.1353,  0.8594,  ..., -0.4310, -0.0272, -0.2968],
         ...,
         [-0.0000,  0.0000,  0.0000,  ..., -0.0000, -0.0000,  0.0000],
         [-0.0000,  0.0000,  0.0000,  ..., -0.0000, -0.0000, -0.0000],
         [-0.0000,  0.0000,  0.0000,  ..., -0.0000, -0.0000, -0.0000]],

        [[-0.7576,  0.8399, -0.3792,  ...,  0.1271,  1.2514,  0.1365],
         [-0.6591,  0.7613, -0.4662,  ...,  0

In [23]:
# Mean pooling, step 1.
# At this point every token still has its own latent vector of size (768,).
# A sentence embedding is obtained by averaging the hidden states of the 128
# token positions; this cell computes the numerator of that average.

# Summing over the token axis collapses (6, 128, 768) to (6, 768):
# each sentence is now represented by a single (768,) vector.
summed = mask_embeddings.sum(dim=1)
summed.shape  # the second (token) dimension has been summed away

torch.Size([6, 768])

In [24]:
# Mean pooling, step 2: sum the mask over the token axis to get the denominator
# of the average. The lower bound of 1e-9 prevents a division by zero.
counts = mask.sum(dim=1).clamp(min=1e-9)
counts.shape

torch.Size([6, 768])

In [25]:
# Mean pooling, step 3: divide the summed vectors by the counts.
mean_pooled = torch.div(summed, counts)
mean_pooled.shape  # <= This is our sentence vector!

torch.Size([6, 768])

In [26]:
# The pooled sentence embeddings: one 768-sized vector per sentence.
mean_pooled

tensor([[ 0.0745,  0.8637,  0.1795,  ...,  0.7734,  1.7247, -0.1803],
        [-0.3715,  0.9729,  1.0840,  ..., -0.2552, -0.2759,  0.0358],
        [-0.5030,  0.7950, -0.1240,  ...,  0.1441,  0.9704, -0.1791],
        [-0.0132,  0.9773,  1.4516,  ..., -0.8462, -1.4004, -0.4118],
        [-0.2019,  0.0597,  0.8603,  ..., -0.0100,  0.8431, -0.0841],
        [-0.2131,  1.0175, -0.8833,  ...,  0.7371,  0.1947, -0.3011]],
       grad_fn=<DivBackward0>)

In [27]:
from sklearn.metrics.pairwise import cosine_similarity

In [28]:
# Convert the pooled sentence vectors to NumPy for scikit-learn. The isinstance
# guard keeps this cell re-runnable: after the first run `mean_pooled` is
# already an ndarray, which has no `.detach()`.
if isinstance(mean_pooled, torch.Tensor):
    # .cpu() is a no-op for tensors already on the CPU and keeps the conversion
    # working if the tensors live on another device.
    mean_pooled = mean_pooled.detach().cpu().numpy()

# Cosine similarity of the first sentence against each of the remaining ones.
# Both arguments are 2-D: one row for the reference, one row per other sentence.
cosine_similarity(
    mean_pooled[:1],
    mean_pooled[1:]
)

array([[0.3308891 , 0.7219258 , 0.17475498, 0.4470964 , 0.55483633]],
      dtype=float32)

In [None]:
# Same six example sentences as defined at the top of the notebook
# (this cell has not been executed).
sentences = [
    "Three years later, the coffin was still full of Jello.",
    "The fish dreamed of escaping the fishbowl and into the toilet where he saw his friend go.",
    "The person box was packed with jelly many dozens of months later.",
    "Standing on one's head at job interviews forms a lasting impression.",
    "It took him a month to finish the meal.",
    "He found a leprechaun in his walnut shell."
]