In [1]:
# Demonstrate creating word embeddings using BERT

import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel
from scipy.spatial.distance import cosine

In [2]:
# Load the tokenizer that belongs to the pretrained `bert-base-uncased`
# checkpoint (12 layers / 768 hidden units / ~110M parameters).
# The vocabulary is fetched online (see the download progress in the output).

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

100%|██████████| 231508/231508 [00:00<00:00, 922096.77B/s]


In [3]:
# Number of entries in the tokenizer's vocabulary.
len(tokenizer.vocab)

30522

In [4]:
# Peek at an arbitrary slice of 20 vocabulary entries. Entries that start with
# "##" are sub-word pieces that continue a preceding piece instead of starting a word.
list(tokenizer.vocab.keys())[5000:5020]

['knight',
 'lap',
 'survey',
 'ma',
 '##ow',
 'noise',
 'billy',
 '##ium',
 'shooting',
 'guide',
 'bedroom',
 'priest',
 'resistance',
 'motor',
 'homes',
 'sounded',
 'giant',
 '##mer',
 '150',
 'scenes']

In [5]:
# BERT's input format uses two special marker tokens:
#   [CLS] marks the start of the input (used for classification),
#   [SEP] marks the end of a sentence / separates sentences.
text = "Here is the sentence I want embeddings for."
marked_text = "".join(("[CLS] ", text, " [SEP]"))

# Break the marked sentence into vocabulary tokens.
# Pieces prefixed with "##" continue the previous piece: a word that is not in
# the vocabulary as a whole (here "embeddings") is split into known sub-word
# pieces instead of being replaced by an unknown token. A vector for the whole
# word can then be approximated by averaging the vectors of its pieces.
tokenized_text = tokenizer.tokenize(marked_text)

# Display the tokens.
tokenized_text

['[CLS]',
 'here',
 'is',
 'the',
 'sentence',
 'i',
 'want',
 'em',
 '##bed',
 '##ding',
 '##s',
 'for',
 '.',
 '[SEP]']

In [6]:
# A sentence that uses "bank" three times: twice in the financial sense
# and once in the river-side sense.
text = (
    "After stealing money from the bank vault, the bank robber was seen "
    "fishing on the Mississippi river bank."
)

# A static embedding such as Word2Vec gives "bank" a single vector regardless
# of context; BERT yields a separate, context-dependent vector per occurrence.

# Add the special markers and tokenize in one step.
tokenized_text = tokenizer.tokenize("".join(("[CLS] ", text, " [SEP]")))
tokenized_text

['[CLS]',
 'after',
 'stealing',
 'money',
 'from',
 'the',
 'bank',
 'vault',
 ',',
 'the',
 'bank',
 'robber',
 'was',
 'seen',
 'fishing',
 'on',
 'the',
 'mississippi',
 'river',
 'bank',
 '.',
 '[SEP]']

In [7]:
# Look up each token string's integer index in the vocabulary.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Print the tokens and their indices as two aligned columns.
for piece, piece_id in zip(tokenized_text, indexed_tokens):
    print('{:<12} {:>6,}'.format(piece, piece_id))

[CLS]           101
after         2,044
stealing     11,065
money         2,769
from          2,013
the           1,996
bank          2,924
vault        11,632
,             1,010
the           1,996
bank          2,924
robber       27,307
was           2,001
seen          2,464
fishing       5,645
on            2,006
the           1,996
mississippi   5,900
river         2,314
bank          2,924
.             1,012
[SEP]           102


In [8]:
# BERT takes one segment id per token, saying which sentence the token belongs to.
# There is only one sentence here, so all 22 tokens get the same id ("1").
# NOTE(review): single-sentence inputs are conventionally given segment id 0;
# all-1 is what the recorded outputs of this notebook were produced with -
# confirm before changing.

segments_ids = [1] * len(tokenized_text)
segments_ids

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [9]:
# Convert the inputs to PyTorch tensors. Each list is wrapped in an outer list
# so that the tensor gets a leading batch dimension of size 1 (shape [1, 22]).
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

In [10]:
# Inspect the token-id tensor (one row of 22 vocabulary indices).
tokens_tensor

tensor([[  101,  2044, 11065,  2769,  2013,  1996,  2924, 11632,  1010,  1996,
          2924, 27307,  2001,  2464,  5645,  2006,  1996,  5900,  2314,  2924,
          1012,   102]])

In [11]:
# Inspect the matching segment-id tensor (all ones: a single sentence).
segments_tensors

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

In [12]:
# Load the pre-trained model weights (the output shows the download progress).
model = BertModel.from_pretrained('bert-base-uncased')

# Switch the model to evaluation mode. This turns off training-only behaviour
# such as the Dropout layers listed in the structure printed below, so the
# forward pass is deterministic. It does not by itself disable gradient
# tracking; that is done with torch.no_grad() when the model is called.
# `eval()` returns the model, so this last expression also displays its structure.
model.eval()

100%|██████████| 407873900/407873900 [00:53<00:00, 7694267.09B/s]


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Lin

In [13]:
# Run the forward pass to obtain the hidden states of every encoder layer.
# no_grad() disables gradient tracking, which is not needed for inference
# and saves memory. The second return value is not used in this notebook.
with torch.no_grad():
    encoded_layers, _ = model(tokens_tensor, segments_tensors)

In [14]:
# Check the number of layers, batches, tokens and hidden units.
# `encoded_layers` holds one tensor per layer; each tensor is laid out as
# [batches, tokens, hidden units], so its three sizes can be read off directly.
print("Number of layers:", len(encoded_layers))
print("Number of batches:", encoded_layers[0].size(0))
print("Number of tokens:", encoded_layers[0].size(1))
print("Number of hidden units:", encoded_layers[0].size(2))

Number of layers:12
Number of batches:1
Number of tokens:22
Number of hidden units:768


In [15]:
# The model hands back its per-layer outputs as a plain Python list ...
print('     Type of encoded_layers: ', type(encoded_layers))

# ... and every item of that list is a torch tensor with the same shape.
print('Tensor shape for each layer: ', encoded_layers[0].shape)

Type of encoded_layers:<class 'list'>
Tensor shape for each layer:torch.Size([1, 22, 768])


In [16]:
# Combine the per-layer tensors into a single tensor. `stack` adds a new
# leading dimension (dim 0 by default), one entry per layer, giving
# [layers, batches, tokens, hidden units].
token_embeddings = torch.stack(encoded_layers)
token_embeddings.shape

torch.Size([12, 1, 22, 768])

In [17]:
# Drop dimension 1 (the "batches", i.e. sentences): there is only one sentence,
# so that dimension has size 1 and carries no information.
token_embeddings = token_embeddings.squeeze(dim=1)
token_embeddings.shape

torch.Size([12, 22, 768])

In [18]:
# Reorder from [layers, tokens, hidden units] to [tokens, layers, hidden units]
# so that indexing by token position yields all of that token's layer vectors.
# Note: swapping dimensions 0 and 1 is its own inverse, so running this cell a
# second time would undo the reordering.
token_embeddings = token_embeddings.transpose(0, 1)

# token_embeddings is now a [22 x 12 x 768] tensor.
token_embeddings.shape

torch.Size([22, 12, 768])

In [19]:
# Build one vector per token by summing that token's hidden states from the
# last four encoder layers.
#
# token_embeddings is a [22 x 12 x 768] tensor, so iterating over it yields one
# [12 x 768] tensor per token. `[-4:]` keeps the last four layers and the sum
# over dim 0 collapses them into a single 768-value vector for that token.
token_vecs_sum = [torch.sum(layers[-4:], dim=0) for layers in token_embeddings]

# The result is a list of 22 vectors with 768 values each.
print('Shape is: %d x %d' % (len(token_vecs_sum), len(token_vecs_sum[0])))

Shape is: 22 x 768


In [20]:
# Reminder of the tokens: list each one next to its position so that the
# positions of the three "bank" tokens can be looked up.
for position, piece in enumerate(tokenized_text):
    print(position, piece)

0[CLS]
1after
2stealing
3money
4from
5the
6bank
7vault
8,
9the
10bank
11robber
12was
13seen
14fishing
15on
16the
17mississippi
18river
19bank
20.
21[SEP]


In [21]:
# Peek at the start of the vector for each occurrence of "bank".
# The positions (6, 10, 19) are those of the "bank" tokens in tokenized_text.
print('First 5 vector values for each instance of "bank".')
for label, position in (
    ("bank vault   ", 6),
    ("bank robber  ", 10),
    ("river bank   ", 19),
):
    # print() applies str() to the tensor slice itself.
    print(label, token_vecs_sum[position][:5])

First 5 vector values for each instance of "bank".
bank vaulttensor([ 2.1319, -2.1413, -1.6260,  0.8638,  3.3173])
bank robbertensor([ 1.1868, -1.5298, -1.3770,  1.0648,  3.1446])
river banktensor([ 1.1295, -1.4724, -0.7296, -0.0901,  2.4970])


In [22]:
# Examine the complete vector (768 values) for one occurrence of "bank".
token_vecs_sum[6] # position 6 is the "bank" in "bank vault"

tensor([ 2.1319e+00, -2.1413e+00, -1.6260e+00,  8.6377e-01,  3.3173e+00,
         1.7965e-01, -4.4853e+00,  3.1215e+00, -9.7403e-01, -3.1780e+00,
         1.0455e-01, -1.5481e+00,  4.7579e-01,  1.1703e+00, -4.4859e+00,
         2.0283e-01,  9.5524e-01,  4.2386e+00,  4.7911e+00,  1.9296e+00,
        -1.5251e+00, -1.4261e-01,  2.7351e+00,  1.1919e-01,  1.9293e+00,
         6.8548e-02,  3.7796e+00,  1.6841e+00,  1.7592e+00,  8.8806e-01,
         3.5501e+00, -1.5417e-01,  1.1845e+00,  4.4052e-01, -9.8483e-01,
        -1.4193e+00, -2.6208e+00,  1.2208e+00, -1.1315e+00,  3.3494e-01,
         3.6034e-01, -2.5285e+00, -7.8882e-01,  2.3313e+00, -5.6662e-01,
         3.8081e-01, -2.1388e+00,  1.0505e+00, -5.0555e+00,  1.5860e+00,
        -7.0210e-01,  3.4588e+00, -7.7145e+00, -2.5656e+00, -4.1447e-01,
         1.6298e+00, -3.3544e+00, -3.5672e+00, -9.2165e-01, -1.6571e+00,
         4.8018e+00, -6.6727e-01,  4.0046e+00, -4.6979e+00, -9.1355e-01,
        -2.4762e+00,  3.4751e+00,  8.3189e-01, -1.7

In [23]:
# Compare two occurrences of "bank" that share the financial sense.
# scipy's `cosine` (imported in the notebook's import cell) returns the cosine
# *distance*, so 1 - distance is the cosine similarity: values closer to 1
# mean the two vectors point in more similar directions.

# "bank robber" vs "bank vault"
same_bank = 1 - cosine(token_vecs_sum[10], token_vecs_sum[6])
same_bank

0.945675253868103

In [24]:
# Compare two occurrences of "bank" with different senses (financial vs. river side).
# `cosine` returns a distance, so 1 - distance is the cosine similarity.
# "bank robber" vs "river bank"

diff_bank = 1 - cosine(token_vecs_sum[10], token_vecs_sum[19])
diff_bank

0.6797333359718323