<a href="https://colab.research.google.com/github/erikapaceep/NLP/blob/main/sentence_vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Sentence Vec

In [43]:
text = 'hello word what a great time to be alive'

In [None]:
!pip install transformers

In [4]:
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
from transformers import AutoTokenizer, AutoModel 
import torch

In [6]:
# initalize the model and the tokenizer
model_name = 'sentence-transformers/bert-base-nli-mean-tokens'

In [8]:
# initialize our tokenizer model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [44]:
# tokenize our text using our tokenizer
tokens = tokenizer.encode_plus(text, max_length=128,
                               truncation=True,
                               padding='max_length',
                               return_tensors='pt')

In [42]:
tokens

{'input_ids': tensor([[ 101, 7592, 2773, 2054, 1037, 2307, 2051, 2000, 2022, 4142, 1010, 7632,
          102,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0,

In [13]:
# now that the input are tokenized we can pass it through our model
output = model(**tokens)

In [None]:
output
# Extracting the last hidden state: the model output  and inside that we have this tensor called the last hidden state

In [None]:
# this is the last hidden state
output[0]

In [18]:
# this is also the last hidden state
embeddings = output.last_hidden_state
embeddings

tensor([[[ 0.1143, -0.2471,  1.6116,  ...,  0.4679, -0.2416,  0.0699],
         [ 0.6115,  0.1260,  1.6738,  ...,  0.5420, -0.0708,  0.0985],
         [ 0.4344, -0.0689,  1.4304,  ...,  0.3937, -0.1738, -0.1908],
         ...,
         [ 0.3213, -0.0943,  1.1681,  ...,  0.3382, -0.3864,  0.0301],
         [ 0.2114, -0.1746,  1.1498,  ...,  0.3368, -0.4444, -0.0992],
         [ 0.2228, -0.2167,  1.2706,  ...,  0.3394, -0.4571, -0.1503]]],
       grad_fn=<NativeLayerNormBackward0>)

In [20]:
embeddings.shape

torch.Size([1, 128, 768])

Now we need to perform a mean pooling operation in order to create a single vector encoding and convert these form the current embedddings to sentence embedding all vector

In [22]:
# convert the embeddings from BERT into sentence vectors using a mean pooling operation:
# to do this we need to multiply our embedding tensor by the attention mask value so that we ignore the non teal token (that correspond to 0 the attention mask tensor)

attention_mask = tokens['attention_mask']
attention_mask.shape # we have one value for each one of our input

torch.Size([1, 128])

In [30]:
#we want an attention mask to be the same size as the embedding shape
mask = attention_mask.unsqueeze(-1).expand(embeddings.shape).float()

In [39]:
# Now that attention mask is the right shape we jsut need to multiply the embedding by the attention mask
masked_embeddings = embeddings * mask
masked_embeddings[0]

tensor([[ 0.1143, -0.2471,  1.6116,  ...,  0.4679, -0.2416,  0.0699],
        [ 0.6115,  0.1260,  1.6738,  ...,  0.5420, -0.0708,  0.0985],
        [ 0.4344, -0.0689,  1.4304,  ...,  0.3937, -0.1738, -0.1908],
        ...,
        [ 0.0000, -0.0000,  0.0000,  ...,  0.0000, -0.0000,  0.0000],
        [ 0.0000, -0.0000,  0.0000,  ...,  0.0000, -0.0000, -0.0000],
        [ 0.0000, -0.0000,  0.0000,  ...,  0.0000, -0.0000, -0.0000]],
       grad_fn=<SelectBackward0>)

In [38]:
# if we compare this wtih respect to what embeddings was before - the masked embeddings maked the last values 
embeddings

tensor([[[ 0.1143, -0.2471,  1.6116,  ...,  0.4679, -0.2416,  0.0699],
         [ 0.6115,  0.1260,  1.6738,  ...,  0.5420, -0.0708,  0.0985],
         [ 0.4344, -0.0689,  1.4304,  ...,  0.3937, -0.1738, -0.1908],
         ...,
         [ 0.3213, -0.0943,  1.1681,  ...,  0.3382, -0.3864,  0.0301],
         [ 0.2114, -0.1746,  1.1498,  ...,  0.3368, -0.4444, -0.0992],
         [ 0.2228, -0.2167,  1.2706,  ...,  0.3394, -0.4571, -0.1503]]],
       grad_fn=<NativeLayerNormBackward0>)

In [46]:
# now we need to sum the embeddings
summed = torch.sum(masked_embeddings, 1)
summed.shape

torch.Size([1, 768])

In [55]:
counts = torch.clamp(mask.sum(1), min=1e-9)
counts.shape

torch.Size([1, 768])

In [None]:
meaned = torch.mean(masked_embeddings, 1)
meaned

In [58]:
# this is our sentence vector
mean_pooles = summed/counts
mean_pooles

tensor([[ 4.5706e-01, -1.3274e-01,  1.5678e+00,  9.7545e-02,  1.0961e-01,
         -4.5339e-01, -4.4104e-01,  7.1403e-01,  4.2600e-01, -8.5828e-01,
         -1.2381e-01, -3.5462e-02,  3.2522e-01,  1.0569e-01,  3.1507e-01,
          2.5949e-01, -4.4319e-01, -8.2659e-01,  3.9691e-01, -9.9923e-01,
         -5.7139e-01, -5.8446e-01, -3.6041e-01, -6.3283e-01,  5.3884e-01,
         -6.4072e-01,  1.6995e-02, -8.8052e-01, -2.6783e-01,  3.1300e-01,
         -4.7900e-01,  2.2191e-01,  8.6288e-01, -7.6205e-01,  5.7911e-02,
          1.5526e+00, -1.5126e-01, -8.1886e-02, -1.6821e-01, -1.1182e-01,
          1.1469e+00, -4.6340e-01,  9.2029e-01,  1.7389e-01, -1.2793e+00,
          9.6926e-02, -6.6143e-01,  2.6481e-01,  5.9966e-01, -8.8305e-01,
          3.3510e-01, -7.8292e-01, -8.6958e-01,  2.7097e-02, -7.1468e-01,
          4.0136e-01,  2.2119e-01, -6.1775e-01,  1.7207e-01,  2.2425e-01,
          6.8732e-01, -4.4249e-01,  5.0432e-01,  4.2942e-01, -9.7778e-01,
          5.5970e-01,  1.1844e+00, -2.