In [0]:
# workbook written with google colab
# trying out the example from https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/
# on a new sentence
# main purpose of this workbook is to get more familiar with using bert

In [0]:
#!pip install pytorch-pretrained-bert

In [0]:
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

# Load the pre-trained tokenizer (vocabulary) for the 'bert-base-uncased'
# checkpoint. The same checkpoint name is used later in this notebook when
# the model weights are loaded.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [4]:
# Example sentence that uses the word "bow" in several different senses.
text = "She wore a lovely bow in her hair as she took a bow after she tied a bow to the back of the lovely dress she wore, as they faced the bow of the ship"

# Wrap the sentence in the special start/end marker tokens.
marked_text = "[CLS] {} [SEP]".format(text)

# Break the marked-up sentence into tokens.
tokenized_text = tokenizer.tokenize(marked_text)

# Look up the vocabulary index of every token.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Show each token next to its vocabulary index.
for token_str, vocab_index in zip(tokenized_text, indexed_tokens):
    print('{:<12} {:>6,}'.format(token_str, vocab_index))

[CLS]           101
she           2,016
wore          5,078
a             1,037
lovely        8,403
bow           6,812
in            1,999
her           2,014
hair          2,606
as            2,004
she           2,016
took          2,165
a             1,037
bow           6,812
after         2,044
she           2,016
tied          5,079
a             1,037
bow           6,812
to            2,000
the           1,996
back          2,067
of            1,997
the           1,996
lovely        8,403
dress         4,377
she           2,016
wore          5,078
,             1,010
as            2,004
they          2,027
faced         4,320
the           1,996
bow           6,812
of            1,997
the           1,996
ship          2,911
[SEP]           102


In [5]:
# Single-sentence input: every token gets the same segment id ("1").
# NOTE(review): BERT's first-sentence segment id is conventionally 0 -
# confirm that all-ones is what is intended here.
segments_ids = [1 for _ in tokenized_text]

print(segments_ids)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [6]:
# Load the pre-trained BERT weights.
model = BertModel.from_pretrained('bert-base-uncased')

# Turn the id lists into PyTorch tensors. Wrapping each list in an outer
# list gives the tensors a leading batch dimension of size 1.
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

# Switch the model to evaluation mode (inference only, no training-time
# behaviour such as dropout). This is the cell's last expression, so the
# module structure it returns is displayed as the cell output.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Lin

In [0]:
# Run the forward pass with gradient tracking switched off (inference only).
# The first return value holds the hidden states of every encoder layer;
# the second return value is not used in this notebook.
with torch.no_grad():
    encoded_layers, _unused_output = model(
        tokens_tensor, token_type_ids=segments_tensors
    )

In [8]:
# Combine the per-layer outputs into a single tensor. `stack` adds a new
# leading dimension (the default, dim 0) that indexes the layers.
token_embeddings = torch.stack(encoded_layers)

token_embeddings.shape

torch.Size([12, 1, 38, 768])

In [9]:
# Drop dimension 1, the size-1 "batches" dimension.
token_embeddings = token_embeddings.squeeze(dim=1)

token_embeddings.shape

torch.Size([12, 38, 768])

In [10]:
# Arrange the hidden states as [tokens, layers, features].
#
# The tensor is rebuilt from `encoded_layers` instead of permuting
# `token_embeddings` in place: a cell of the form `x = x.permute(1, 0, 2)`
# swaps the dimensions back again every time it is re-run, so the layout
# silently depended on how many times the cell had been executed.
token_embeddings = (
    torch.stack(encoded_layers, dim=0)  # [layers, batch, tokens, features]
    .squeeze(dim=1)                     # drop the size-1 batch dimension
    .permute(1, 0, 2)                   # swap the layer and token dimensions
)

token_embeddings.size()

torch.Size([38, 12, 768])

In [11]:
# One vector per token, built by concatenating (appending together) that
# token's hidden states from the last four layers, last layer first.
#
# `token_embeddings` is a [38 x 12 x 768] tensor, so each `token_layers`
# slice is a [12 x 768] tensor and each concatenated vector holds
# 4 * 768 = 3072 values.
token_vecs_cat = [
    torch.cat([token_layers[-k] for k in (1, 2, 3, 4)], dim=0)
    for token_layers in token_embeddings
]

print ('Shape is: %d x %d' % (len(token_vecs_cat), len(token_vecs_cat[0])))

Shape is: 38 x 3072


In [12]:
# One vector per token, built by summing that token's hidden states from
# the last four layers.
#
# `token_embeddings` is a [38 x 12 x 768] tensor, so each `token_layers`
# slice is a [12 x 768] tensor and each summed vector holds 768 values.
token_vecs_sum = [
    token_layers[-4:].sum(dim=0)
    for token_layers in token_embeddings
]

print ('Shape is: %d x %d' % (len(token_vecs_sum), len(token_vecs_sum[0])))

Shape is: 38 x 768


In [0]:
# Sentence embedding: the average of the final layer's token vectors.
#
# `encoded_layers` holds one [1 x 38 x 768] tensor per encoder layer.
# Index -1 selects the final layer however many layers the model has (a
# hard-coded 11 is only the final layer of a 12-layer model); [0] selects
# the single batch entry, so `token_vecs` is a [38 x 768] tensor.
token_vecs = encoded_layers[-1][0]

# Average over the token dimension, leaving one vector of 768 values.
sentence_embedding = torch.mean(token_vecs, dim=0)

In [14]:
# List every token with its position so the occurrences of "bow" can be found.
for position, token_str in enumerate(tokenized_text):
    print(position, token_str)

0 [CLS]
1 she
2 wore
3 a
4 lovely
5 bow
6 in
7 her
8 hair
9 as
10 she
11 took
12 a
13 bow
14 after
15 she
16 tied
17 a
18 bow
19 to
20 the
21 back
22 of
23 the
24 lovely
25 dress
26 she
27 wore
28 ,
29 as
30 they
31 faced
32 the
33 bow
34 of
35 the
36 ship
37 [SEP]


In [15]:
# Sanity check: the token at each of these positions should be "bow".
for bow_position in (5, 13, 18, 33):
    print(tokenized_text[bow_position])

bow
bow
bow
bow


In [16]:
print('First 5 vector values for each instance of "bow".')
print('')

# (label, token position) pairs, one per occurrence of "bow"; the labels are
# printed exactly as written, including their trailing spaces.
for label, position in (
    ("wore a lovely bow   ", 5),
    ("took a bow  ", 13),
    ("tied a bow   ", 18),
    ("bow of the ship  ", 33),
):
    print(label, str(token_vecs_sum[position][:5]))

First 5 vector values for each instance of "bow".

wore a lovely bow    tensor([ 5.9100, -0.1691,  0.5826, -2.7905, -0.2299])
took a bow   tensor([ 3.5386,  0.2097,  0.5809, -1.8370, -0.4172])
tied a bow    tensor([ 5.9890,  0.5890,  0.3699,  0.2781, -1.2229])
bow of the ship   tensor([ 0.0821, -4.4067,  0.8191,  0.1743,  0.1624])


In [17]:
from scipy.spatial.distance import cosine


def token_similarity(first_idx, second_idx):
    """Return the cosine similarity of the summed vectors at two token positions.

    scipy's `cosine` is a distance, so the similarity is 1 minus that value.
    """
    return 1 - cosine(token_vecs_sum[first_idx], token_vecs_sum[second_idx])


# The word "bow" in "took a bow" vs "wore a lovely bow" (different meanings).
wore_took = token_similarity(13, 5)

# The word "bow" in "tied a bow" vs "wore a lovely bow" (similar meaning).
wore_tied = token_similarity(18, 5)

# "bow of the ship" vs "wore a lovely bow".
wore_ship = token_similarity(33, 5)

# "bow of the ship" vs "tied a bow".
tied_ship = token_similarity(33, 18)

# Report each similarity to two decimal places.
for message, similarity in (
    ('Vector similarity for wore bow and took bow:  %.2f', wore_took),
    ('Vector similarity for wore bow and tied bow:  %.2f', wore_tied),
    ('Vector similarity for wore bow and bow of ship:  %.2f', wore_ship),
    ('Vector similarity for tied bow and bow of ship:  %.2f', tied_ship),
):
    print(message % similarity)


Vector similarity for wore bow and took bow:  0.80
Vector similarity for wore bow and tied bow:  0.82
Vector similarity for wore bow and bow of ship:  0.52
Vector similarity for tied bow and bow of ship:  0.54


In [18]:
# Cosine similarity between the two appearances of "lovely"
# (token positions 24 and 4).
lovely_similarity = 1 - cosine(token_vecs_sum[24], token_vecs_sum[4])
print(lovely_similarity)

0.8442650437355042
