In [1]:
from transformers import BertModel, BertTokenizer
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [24]:
# Load the pretrained BERT encoder. The warning printed below lists `cls.*`
# checkpoint weights that BertModel does not use; the message itself states this
# is expected when the checkpoint was trained with a different architecture/task.
model = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [39]:
# Tokenizer loaded from the same checkpoint name as the model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Vocabulary size, rendered with a thousands separator.
format(len(tokenizer.vocab), ',')

'30,522'

In [3]:
text = 'A simple sentence!'
# encode() turns the string into a list of integer token ids (printed below).
tokens = tokenizer.encode(text)
print(tokens)

[101, 1037, 3722, 6251, 999, 102]


In [4]:
# decode() maps the ids back to text; the result is lowercased and shows the
# [CLS] / [SEP] markers that wrap the encoded sentence.
tokenizer.decode(tokens)

'[CLS] a simple sentence! [SEP]'

In [5]:
# A longer, two-sentence example; rebinds `text` and `tokens` for the cells that follow.
text = 'My friend told me about this class and I love it so far! She was right.'
tokens = tokenizer.encode(text)
tokens

[101,
 2026,
 2767,
 2409,
 2033,
 2055,
 2023,
 2465,
 1998,
 1045,
 2293,
 2009,
 2061,
 2521,
 999,
 2016,
 2001,
 2157,
 1012,
 102]

In [6]:
# Raw text next to its token count; the count includes the [CLS] and [SEP] ids.
text, len(tokens)

('My friend told me about this class and I love it so far! She was right.', 20)

In [7]:
# Decode every id on its own to see which piece of text it stands for.
for token_id in tokens:
    piece = tokenizer.decode([token_id])
    print(f'token: {token_id}, subword: {piece}')

token: 101, subword: [CLS]
token: 2026, subword: my
token: 2767, subword: friend
token: 2409, subword: told
token: 2033, subword: me
token: 2055, subword: about
token: 2023, subword: this
token: 2465, subword: class
token: 1998, subword: and
token: 1045, subword: i
token: 2293, subword: love
token: 2009, subword: it
token: 2061, subword: so
token: 2521, subword: far
token: 999, subword: !
token: 2016, subword: she
token: 2001, subword: was
token: 2157, subword: right
token: 1012, subword: .
token: 102, subword: [SEP]


In [8]:
# Is the (lowercased) name a single entry in the vocabulary?
# Spelled to match the 'Sunggon' used in the following cells; the earlier
# 'sunngon' was a typo and tested a different string.
'sunggon' in tokenizer.vocab

False

In [9]:
# A name that is not a single vocabulary entry still encodes without error:
# it comes back as several ids (decoded piece by piece in the next cell).
text_with_unknown_words = 'Sunggon loves a beautiful day'
tokens_with_unknown_words = tokenizer.encode(text_with_unknown_words)
tokens_with_unknown_words

[101, 7042, 7446, 7459, 1037, 3376, 2154, 102]

In [10]:
# Show how the unfamiliar name was broken into pieces.
for token_id in tokens_with_unknown_words:
    piece = tokenizer.decode([token_id])
    print(f'token: {token_id}, subword: {piece}')

token: 101, subword: [CLS]
token: 7042, subword: sung
token: 7446, subword: ##gon
token: 7459, subword: loves
token: 1037, subword: a
token: 3376, subword: beautiful
token: 2154, subword: day
token: 102, subword: [SEP]


In [16]:
# Encoded on its own, 'gon' splits differently than it does inside 'Sunggon'.
for token_id in tokenizer.encode('gon'):
    piece = tokenizer.decode([token_id])
    print(f'token: {token_id}, subword: {piece}')

token: 101, subword: [CLS]
token: 2175, subword: go
token: 2078, subword: ##n
token: 102, subword: [SEP]


In [20]:
# A second sentence with two out-of-vocabulary words ('Sunggon', 'awesomesauce').
text_with_unknown_words = 'Sunggon is our instructor for this awesomesauce class'
tokens_with_unknown_words = tokenizer.encode(text_with_unknown_words)

# Print each id beside the piece of text it decodes to.
for token_id in tokens_with_unknown_words:
    piece = tokenizer.decode([token_id])
    print(f'token: {token_id}, subword: {piece}')

token: 101, subword: [CLS]
token: 7042, subword: sung
token: 7446, subword: ##gon
token: 2003, subword: is
token: 2256, subword: our
token: 9450, subword: instructor
token: 2005, subword: for
token: 2023, subword: this
token: 12476, subword: awesome
token: 23823, subword: ##sau
token: 3401, subword: ##ce
token: 2465, subword: class
token: 102, subword: [SEP]


In [21]:
text = 'My friend told me about this class and I love it so far! She was right.'
# encode_plus() returns a dict-like result with 'input_ids', 'token_type_ids'
# and 'attention_mask' (see output) rather than a bare list of ids.
# NOTE(review): this rebinds `tokens` from a list of ids to that dict-like object.
tokens = tokenizer.encode_plus(text)
tokens

{'input_ids': [101, 2026, 2767, 2409, 2033, 2055, 2023, 2465, 1998, 1045, 2293, 2009, 2061, 2521, 999, 2016, 2001, 2157, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [22]:
# Calling the tokenizer directly yields the same three fields as encode_plus() above.
tokenizer(text)

{'input_ids': [101, 2026, 2767, 2409, 2033, 2055, 2023, 2465, 1998, 1045, 2293, 2009, 2061, 2521, 999, 2016, 2001, 2157, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [23]:
# Two sentences that use 'python' in different senses (animal vs. programming
# language); their ids are fed to the model in the cells below.
python_pet = tokenizer.encode('I love my pet python')
python_language = tokenizer.encode('I love coding in python')

In [27]:
# Forward pass on a batch of one: nesting the id list gives the (1, seq_len) input.
model(torch.tensor([python_pet]))

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.0626,  0.3430, -0.0584,  ..., -0.1488,  0.3335,  0.5332],
         [ 0.2981,  0.3265, -0.1502,  ..., -0.2316,  0.8080,  0.3994],
         [ 1.3638,  1.0808,  0.6733,  ..., -0.0417,  0.6030,  0.1758],
         ...,
         [ 0.4767,  0.2946,  0.8069,  ..., -0.5178,  0.4988,  0.6098],
         [-0.1752,  0.1880, -0.6628,  ...,  0.4838,  0.0873, -0.0886],
         [ 0.7509,  0.2521, -0.0804,  ...,  0.0018, -0.5243, -0.3450]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-8.6265e-01, -3.3994e-01, -3.2708e-01,  6.2681e-01,  2.3546e-01,
         -7.8475e-02,  8.5182e-01,  2.0823e-01, -2.1635e-01, -9.9996e-01,
         -1.3629e-01,  7.3364e-01,  9.8811e-01,  7.5972e-02,  9.2794e-01,
         -5.3177e-01, -9.9756e-02, -5.6424e-01,  3.6173e-01, -4.2933e-01,
          6.8106e-01,  9.9767e-01,  4.9564e-01,  2.2357e-01,  4.7007e-01,
          9.0762e-01, -6.6177e-01,  9.3297e-01,  9.5279e-01,  6.876

In [29]:
# Shape of the first output field (last_hidden_state): one vector per input token.
model(torch.tensor([python_pet]))[0].shape

torch.Size([1, 7, 768])

In [30]:
# Select the vector at token position 5 for every item in the batch.
model(torch.tensor([python_pet]))[0][:, 5].shape

torch.Size([1, 768])

In [31]:
# Contextual vector for 'python' in the pet sentence.
# Position 5 is the 6th token; the assert guards that hard-coded index.
assert tokenizer.decode([python_pet[5]]) == 'python'
with torch.no_grad():  # inference only: no autograd graph is built, so no .detach() is needed
    python_pet_embedding = (
        model(torch.tensor(python_pet).unsqueeze(0)).last_hidden_state[:, 5, :].numpy()
    )

In [32]:
# Contextual vector for 'python' in the coding sentence.
# Position 5 is the 6th token; the assert guards that hard-coded index.
assert tokenizer.decode([python_language[5]]) == 'python'
with torch.no_grad():  # inference only: no autograd graph is built, so no .detach() is needed
    python_language_embedding = (
        model(torch.tensor(python_language).unsqueeze(0)).last_hidden_state[:, 5, :].numpy()
    )

In [33]:
# Reference vectors for single words encoded on their own.
# Index 1 is taken as the word itself, right after [CLS] at index 0
# (assumes each word is a single vocabulary entry — TODO confirm).
with torch.no_grad():  # inference only: no autograd graph is built, so no .detach() is needed
    snake_alone_embedding = (
        model(torch.tensor(tokenizer.encode('snake')).unsqueeze(0)).last_hidden_state[:, 1, :].numpy()
    )
    programming_alone_embedding = (
        model(torch.tensor(tokenizer.encode('programming')).unsqueeze(0)).last_hidden_state[:, 1, :].numpy()
    )

In [34]:
# 'python' (pet sentence) compared with 'snake' and with 'programming'.
(
    cosine_similarity(python_pet_embedding, snake_alone_embedding),
    cosine_similarity(python_pet_embedding, programming_alone_embedding),
)

(array([[0.6928656]], dtype=float32), array([[0.49864388]], dtype=float32))

In [36]:
# Sanity check (a vector against itself), then 'programming' against 'snake'.
(
    cosine_similarity(programming_alone_embedding, programming_alone_embedding),
    cosine_similarity(programming_alone_embedding, snake_alone_embedding),
)

(array([[1.]], dtype=float32), array([[0.6274053]], dtype=float32))

In [37]:
# 'programming' against 'python' (pet sentence): the same pair as an earlier cell
# with the arguments swapped.
cosine_similarity(X=programming_alone_embedding, Y=python_pet_embedding)

array([[0.49864388]], dtype=float32)