In [1]:
from transformers import BertModel, BertTokenizer
import torch

In [2]:
# Example sentence whose tokens are embedded in the cells below.
sentence = "She is a Machine Learning Engineer and works at Silicon Valley in California."

# Checkpoint identifier; the pretrained BERT encoder is loaded from it.
model_ckpt = "bert-base-uncased"
model = BertModel.from_pretrained(model_ckpt)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
# Load the tokenizer from the same checkpoint as the model (model_ckpt) so the
# vocabulary, and therefore the token ids, cannot drift from the model's.
tokenizer = BertTokenizer.from_pretrained(model_ckpt)

# Split the raw sentence into tokens (the uncased vocabulary lower-cases them).
tokens = tokenizer.tokenize(sentence)
print(tokens)

# Wrap the sequence in the start/end special tokens ...
tokens = ['[CLS]'] + tokens + ['[SEP]']
# ... and append two padding tokens by hand to show what padding looks like.
tokens = tokens + ['[PAD]'] * 2
print(tokens)

['she', 'is', 'a', 'machine', 'learning', 'engineer', 'and', 'works', 'at', 'silicon', 'valley', 'in', 'california', '.']
['[CLS]', 'she', 'is', 'a', 'machine', 'learning', 'engineer', 'and', 'works', 'at', 'silicon', 'valley', 'in', 'california', '.', '[SEP]', '[PAD]', '[PAD]']


In [10]:
# Attention mask: 1 marks a real token, 0 marks a '[PAD]' position.
attention_mask = [int(token != '[PAD]') for token in tokens]
print(attention_mask)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]


In [11]:
# Unique token ids
# Map every token string to its integer id in the tokenizer vocabulary.
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(token_ids)

# Convert to tensors with a leading batch dimension of size 1.
token_ids = torch.as_tensor(token_ids).reshape(1, -1)
# reshape(1, -1) instead of unsqueeze(0): this cell rebinds attention_mask from
# itself, so re-running it with unsqueeze would stack a second batch dimension
# (shape [1, 1, seq_len]). as_tensor also accepts an existing tensor without
# the copy-construct warning that torch.tensor(tensor) emits.
attention_mask = torch.as_tensor(attention_mask).reshape(1, -1)
print(token_ids)
print(attention_mask)

[101, 2016, 2003, 1037, 3698, 4083, 3992, 1998, 2573, 2012, 13773, 3028, 1999, 2662, 1012, 102, 0, 0]
tensor([[  101,  2016,  2003,  1037,  3698,  4083,  3992,  1998,  2573,  2012,
         13773,  3028,  1999,  2662,  1012,   102,     0,     0]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])


In [16]:
# Get embeddings
# Inference only: run the forward pass without autograd so no computation
# graph is built and the returned tensors carry no grad_fn.
with torch.no_grad():
    output = model(token_ids, attention_mask=attention_mask)

# last_hidden_state: one vector per input position; the 18 positions include
# the [CLS], [SEP] and [PAD] tokens, not just the words of the sentence.
print(output[0])
print(output[0].shape)  # [1, 18, 768] -> (batch, num tokens, embed_dim)
print("\n\n")
# pooler_output: a single vector for the whole sequence
print(output[1])
print(output[1].shape)  # [1, 768]


tensor([[[-0.1099,  0.0468, -0.6595,  ..., -0.2623,  0.5234,  0.2463],
         [ 0.2299, -0.4289, -0.8433,  ...,  0.1221,  0.3654, -0.2318],
         [-0.1189,  0.1354, -0.0965,  ..., -0.4777,  0.1248,  0.2993],
         ...,
         [-0.0959,  0.2789, -0.8420,  ...,  0.3402, -0.3266, -0.5629],
         [ 0.0823,  0.2564, -0.2476,  ...,  0.3251,  0.2154, -0.1537],
         [ 0.0115,  0.2023, -0.3834,  ...,  0.4589,  0.3258, -0.4677]]],
       grad_fn=<NativeLayerNormBackward0>)
torch.Size([1, 18, 768])



tensor([[-9.5204e-01, -5.6682e-01, -9.3476e-01,  9.0912e-01,  8.6666e-01,
         -3.3698e-01,  9.4771e-01,  5.7378e-01, -8.6329e-01, -1.0000e+00,
         -7.5686e-01,  9.6853e-01,  9.8878e-01,  5.4817e-01,  9.6468e-01,
         -8.5410e-01, -2.4779e-01, -7.3305e-01,  4.6576e-01, -7.4243e-01,
          8.0826e-01,  1.0000e+00, -5.6346e-02,  4.6798e-01,  6.2356e-01,
          9.9699e-01, -8.7362e-01,  9.6176e-01,  9.7230e-01,  8.6774e-01,
         -8.1800e-01,  4.6473e-01, -9.9430e