# Experiments with BERT for measuring word similarities in context

# Calculate some predictions for masked and unmasked tokens

In [39]:
import torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM

# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
# import logging
# logging.basicConfig(level=logging.INFO)

In [40]:
# Build the tokenizer (and its vocabulary) from the pre-trained 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased'
)

In [41]:
# Example sentence; the [CLS] / [SEP] markers are written into the string by hand
text = "[CLS] I fed my cat some of it and he damn near passed out [SEP]"
# Split the raw string into the tokenizer's tokens
tokenized_text = tokenizer.tokenize(text=text)

In [42]:
# Replace one token with '[MASK]' so that `BertForMaskedLM` can later try to predict it back
masked_index = 4  # position of 'cat' in the tokenized sentence
tokenized_text[masked_index] = '[MASK]'
print(tokenized_text)

# Map every token to its vocabulary id
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# Segment ("sentence A" / "sentence B") ids: the input is one sentence, so every token gets 0.
# NOTE(review): all-zero segment ids are presumably what the model assumes by default - confirm in the model docs.
segments_ids = [0 for _ in tokenized_text]

# Add a batch dimension of size 1 to both id lists and convert them to PyTorch tensors
tokens_tensor, segments_tensors = (
    torch.tensor([ids]) for ids in (indexed_tokens, segments_ids)
)


['[CLS]', 'i', 'fed', 'my', '[MASK]', 'some', 'of', 'it', 'and', 'he', 'damn', 'near', 'passed', 'out', '[SEP]']


In [43]:
# Instantiate the plain BERT encoder from the pre-trained weights and switch it
# straight to evaluation mode: this deactivates the DropOut modules, which is
# IMPORTANT for getting reproducible results during evaluation.
model = BertModel.from_pretrained('bert-base-uncased').eval()

# Display the module structure as the cell output
model


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [45]:
# Run the encoder over the masked sentence without tracking gradients
with torch.no_grad():
    # See the model docstrings for the details of the inputs and outputs
    outputs = model(input_ids=tokens_tensor, token_type_ids=segments_tensors)

# The first element of the output is the hidden state of the last layer of the Bert model
encoded_layers = outputs[0]

# Sanity check: one vector of size `hidden_size` per input token, for a batch of one sequence,
# i.e. a tensor of shape (batch size, sequence length, model hidden dimension)
expected_shape = (1, len(indexed_tokens), model.config.hidden_size)
assert tuple(encoded_layers.shape) == expected_shape
print(encoded_layers.shape)


torch.Size([1, 15, 768])


In [59]:
# Rebind `model` to BERT with the masked-language-modelling head (pre-trained weights),
# already switched to evaluation mode
model = BertForMaskedLM.from_pretrained('bert-base-uncased').eval()

# Display the module structure as the cell output
model

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [51]:
# Score every vocabulary entry at every position of the masked sentence
with torch.no_grad():
    outputs = model(input_ids=tokens_tensor, token_type_ids=segments_tensors)
predictions = outputs[0]

print(predictions.shape)
# Look up the highest-scoring vocabulary entry at the masked position
predicted_index = predictions[0, masked_index].argmax().item()
(predicted_token,) = tokenizer.convert_ids_to_tokens([predicted_index])
print(predicted_token)

torch.Size([1, 15, 30522])
brother


In [52]:
import numpy as np
# Find the k highest predictions
def get_top_predictions(pred_tensor, pred_index, k=5, tok=None):
    """Print the k highest-scoring vocabulary entries for one sequence position.

    Args:
        pred_tensor: tensor indexed as ``pred_tensor[0, pred_index]`` to obtain
            the per-vocabulary scores of one position (only the first batch
            element is inspected).
        pred_index: position in the sequence whose scores are ranked.
        k: number of entries to report; clamped to the number of available scores.
        tok: object exposing ``convert_ids_to_tokens``; when omitted, the
            notebook-level ``tokenizer`` is used.

    Returns:
        None. The ids, tokens and scores are printed, best first.
    """
    if tok is None:
        # Fall back to the tokenizer created earlier in the notebook
        tok = tokenizer

    position_scores = pred_tensor[0, pred_index]
    # Never ask for more entries than there are scores (and never a negative count)
    k = max(0, min(k, position_scores.shape[-1]))

    # torch.topk returns values and indices already sorted from highest to lowest
    top_values, sorted_indexes = torch.topk(position_scores, k)

    print(f"Ordered top predicted ids: {sorted_indexes}")
    top_tokens = tok.convert_ids_to_tokens(sorted_indexes.tolist())
    print(f"Ordered top predicted tokens: {top_tokens}")
    # Values come from `pred_tensor` itself, not from a global `predictions` variable
    print(f"Ordered top predicted values: {top_values}")
# Show the best candidates for the masked position
get_top_predictions(predictions, pred_index=masked_index)

Ordered top predicted ids: tensor([2567, 3129, 2269, 3611, 2365])
Ordered top predicted tokens: ['brother', 'husband', 'father', 'dad', 'son']
Ordered top predicted values: tensor([11.0595, 10.8847, 10.6277, 10.5208,  9.9940])


In [53]:
# Compare the scores that a few hand-picked words receive at the masked position.
# The printed values are the model's scores as-is (they exceed 1), not probabilities.
probe_tokens = ['cat', 'dog', 'sister', 'wife', 'window', 'the', 'phone', 'bottle']
probe_ids = tokenizer.convert_tokens_to_ids(probe_tokens)
print(f"Probed tokens: {probe_tokens}")
print(f"Probed values: {predictions[0, masked_index, probe_ids]}")

Probed tokens: ['cat', 'dog', 'sister', 'wife', 'window', 'the', 'phone', 'bottle']
Probed values: tensor([6.8761, 8.4708, 5.6236, 6.1946, 0.6963, 0.8745, 2.6098, 0.9079])


In [54]:
# Inspect the top-scoring candidates at a position that was NOT masked:
# index 1 holds the token "i"
get_top_predictions(predictions, pred_index=1)

Ordered top predicted ids: tensor([1045, 2057, 2027, 2074, 1000])
Ordered top predicted tokens: ['i', 'we', 'they', 'just', '"']
Ordered top predicted values: tensor([16.4698,  9.1743,  6.7145,  6.0044,  5.6862])


## Getting a sentence score (a rough proxy for sentence probability) without using any mask

In [82]:
# Same example sentence as before, this time left completely unmasked
text = "[CLS] I fed my cat some of it and he damn near passed out [SEP]"
tokenized_text = tokenizer.tokenize(text=text)

# Map every token to its vocabulary id
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# Segment ("sentence A" / "sentence B") ids: the input is one sentence, so every token gets 0.
# NOTE(review): all-zero segment ids are presumably what the model assumes by default - confirm in the model docs.
segments_ids = [0 for _ in tokenized_text]

# Add a batch dimension of size 1 to both id lists and convert them to PyTorch tensors
tokens_tensor, segments_tensors = (
    torch.tensor([ids]) for ids in (indexed_tokens, segments_ids)
)


In [83]:
# Score every vocabulary entry at every position of the unmasked sentence
with torch.no_grad():
    outputs = model(input_ids=tokens_tensor, token_type_ids=segments_tensors)
predictions = outputs[0]

# Sentence score: the sum, over all positions, of the score the model gives to the
# token that is actually present there. These are the model's scores as-is, so the
# total is a score and not a probability.
sent_score = 0
for i, curr_index in enumerate(indexed_tokens):
    scores_here = predictions[0, i]

    # Print the model's favourite token at this position, just for fun
    predicted_index = scores_here.argmax().item()
    predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
    print(f"Predicted token {predicted_token}\twith value: {scores_here.max()}")

    # Accumulate the score of the token that really occurs at this position
    sent_score += scores_here[curr_index]

print(sent_score)

Predicted token .	with value: 4.458500862121582
Predicted token i	with value: 11.762747764587402
Predicted token fed	with value: 13.628535270690918
Predicted token my	with value: 20.001148223876953
Predicted token cat	with value: 15.935842514038086
Predicted token some	with value: 16.707128524780273
Predicted token of	with value: 19.702524185180664
Predicted token it	with value: 15.31606388092041
Predicted token and	with value: 19.96385955810547
Predicted token he	with value: 20.69625473022461
Predicted token a	with value: 8.45186710357666
Predicted token near	with value: 14.07364273071289
Predicted token passed	with value: 12.538019180297852
Predicted token .	with value: 16.093671798706055
Predicted token .	with value: 17.31720542907715
tensor(184.3773)
