In [14]:
from transformers import BertTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the tokenizer and the encoder from the SAME checkpoint so the
# vocabulary / special-token ids fed to the model are the ones it was
# trained with (previously the tokenizer came from 'bert-base-uncased').
MODEL_NAME = 'sentence-transformers/bert-base-nli-mean-tokens'

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME, do_lower_case=True)
model = AutoModel.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

In [26]:
# Sentences to embed. The first entry is the query; the remaining entries
# are the candidates it is compared against further down.
texts = [
    "She is religious, but does not believe in one specific religion.",
    "She is Christian and believes in God.",
    "He is religious, but doesn't believe in one religion.",
    "She is religious and believes in many religions.",
]

In [27]:
# Accumulators for the per-sentence encodings: one list per model input.
tokens = dict(input_ids=[], attention_mask=[])

In [28]:
# Encode every sentence to a fixed length of 128 tokens (truncating longer
# inputs, padding shorter ones) and collect the results per model input.
for text in texts:
    # Calling the tokenizer directly replaces the deprecated encode_plus();
    # it accepts the same keyword arguments.
    new_tokens = tokenizer(text, max_length=128,
                           truncation=True, padding='max_length',
                           return_tensors='pt')
    # Index [0] takes the first (only) row of each returned tensor so the
    # accumulated entries can be stacked into one batch afterwards.
    tokens['input_ids'].append(new_tokens['input_ids'][0])
    tokens['attention_mask'].append(new_tokens['attention_mask'][0])

In [29]:
# Turn each list of per-sentence tensors into a single stacked tensor,
# replacing the list in place under the same key.
for field in ('input_ids', 'attention_mask'):
    tokens[field] = torch.stack(tokens[field])

In [30]:
# Inference only: disable gradient tracking so the forward pass does not
# build an autograd graph (less memory, and the results need no gradients).
with torch.no_grad():
    outputs = model(**tokens)

# Show which tensors the model returned.
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [31]:
# Per-token hidden states from the model output; these are pooled into one
# vector per sentence in the next step.
embeddings = outputs['last_hidden_state']

In [32]:
# Mean pooling: average the token embeddings of each sentence while
# ignoring positions where the attention mask is 0.
attention_mask = tokens['attention_mask']

# Broadcast the mask to the embeddings' shape so it can zero out the
# masked positions element-wise.
mask = attention_mask[..., None].expand_as(embeddings).float()
masked_embeddings = mask * embeddings

# Sum over dimension 1 for both the masked embeddings and the mask itself;
# the clamp keeps the divisor strictly positive.
summed = masked_embeddings.sum(dim=1)
summed_mask = mask.sum(dim=1).clamp(min=1e-9)

mean_pooled = torch.div(summed, summed_mask)

In [33]:
# Convert the pooled embeddings to a NumPy array for scikit-learn.
# torch.as_tensor() makes the cell safe to re-run: if `mean_pooled` is
# already an ndarray it is wrapped back into a tensor first instead of
# failing on the missing .detach(). .cpu() makes the conversion work
# regardless of which device the tensor lives on.
mean_pooled = torch.as_tensor(mean_pooled).detach().cpu().numpy()

In [34]:
# Cosine similarity of the first sentence embedding (kept 2-D via the
# [:1] slice) against every remaining sentence embedding.
cosine_similarity(mean_pooled[:1], mean_pooled[1:])

array([[0.48953307, 0.91896856, 0.5036423 ]], dtype=float32)