We will be using this blog for reference - https://towardsdatascience.com/bert-for-measuring-text-similarity-eec91c6bf9e1

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel


: 

In [34]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

In [35]:
# To encode a single sentence. Since encode_plus encodes a single sentence, we have to encode 
# all sentneces using a for loop. Instead we can use batch_encode_plus to encode an array of
# sentences
# initialize dictionary to store tokenized sentences
# tokens = {'input_ids': [], 'attention_mask': []}

# for sentence in sentences:
#     # encode each sentence and append to dictionary
#     new_tokens = tokenizer.encode_plus(sentence, max_length=128,
#                                        truncation=True, padding='max_length',
#                                        return_tensors='pt')
#     tokens['input_ids'].append(new_tokens['input_ids'][0])
#     tokens['attention_mask'].append(new_tokens['attention_mask'][0])

# # reformat list of tensors into single tensor
# tokens['input_ids'] = torch.stack(tokens['input_ids'])
# tokens['attention_mask'] = torch.stack(tokens['attention_mask'])

In [36]:
sentences = [
    "chocolates are my favourite items.",
    "The fish dreamed of escaping the fishbowl and into the toilet where he saw his friend go.",
    "The person box was packed with jelly many dozens of months later.",
    "white chocolates and dark chocolates are favourites for many people.",
    "I love chocolates"
]
sentences

['chocolates are my favourite items.',
 'The fish dreamed of escaping the fishbowl and into the toilet where he saw his friend go.',
 'The person box was packed with jelly many dozens of months later.',
 'white chocolates and dark chocolates are favourites for many people.',
 'I love chocolates']

In [37]:
encoding = tokenizer.batch_encode_plus(sentences, add_special_tokens = True, truncation = True, padding = "max_length", return_attention_mask = True, return_tensors = "pt")

In [38]:
attention_mask = encoding['attention_mask']

In [39]:
#outputs[0] has the last hidden_state and outputs[1] has the pooler_output
outputs = model(**encoding)

In [40]:
# we are interested in the embedddings , i.e. the last hidden state 
embeddings = outputs[0] 
embeddings.size()

torch.Size([5, 512, 768])

#### We are interested in embeddings of dimension 768 for all the five sentences. Final embeddings  required should be in the shape 5 X 768. In order to get that,  we have to perform a mean pooling. We multiply each of our values in the embeddings with attention mask adn then divide by the sum.



In [41]:
#let us ge tthe attention_mask from our encoding.
attention_mask = encoding['attention_mask']

In [42]:
mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
mask.shape

torch.Size([5, 512, 768])

In [43]:
mask

tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 

In [44]:
masked_embeddings = embeddings * mask
masked_embeddings.shape

torch.Size([5, 512, 768])

In [45]:
# sum the embeddings along the axis =1 .So we get 5 X 768

summed = torch.sum(masked_embeddings, 1)
summed.shape

torch.Size([5, 768])

In [46]:
summed_mask = torch.clamp(mask.sum(1), min=1e-9)
summed_mask.shape

torch.Size([5, 768])

In [47]:
mean_pooled = summed / summed_mask
mean_pooled.shape

torch.Size([5, 768])

In [48]:
mean_pooled

tensor([[ 0.2974,  0.1260,  0.1364,  ...,  0.1278,  0.0883, -0.2417],
        [ 0.0527, -0.1373,  0.1023,  ..., -0.0057,  0.0762,  0.2454],
        [ 0.0273, -0.1436,  0.4123,  ..., -0.2376,  0.2174, -0.1170],
        [ 0.0664,  0.2313,  0.5076,  ..., -0.2370, -0.0501,  0.0009],
        [ 0.3953,  0.5431,  0.0312,  ...,  0.0463,  0.1247,  0.1673]],
       grad_fn=<DivBackward0>)

In [49]:
from sklearn.metrics.pairwise import cosine_similarity
# convert from PyTorch tensor to numpy array
mean_pooled = mean_pooled.detach().numpy()

# calculate
cosine_similarity(
    [mean_pooled[0]],
    mean_pooled[1:]
)

array([[0.51168066, 0.5200645 , 0.7674701 , 0.8000995 ]], dtype=float32)

References:
https://towardsdatascience.com/bert-for-measuring-text-similarity-eec91c6bf9e1
https://towardsdatascience.com/why-are-there-so-many-tokenization-methods-for-transformers-a340e493b3a8
     

#### We can see that the cosine similarity for the first sentence with respect to fourth and fifth sentences are high