## Importing the Libraries

In [22]:
from transformers import BertModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
# the tutorial used scipi's cosine function but that didn't work, and Bing suggested this instead

## Defining the model and the tokenizer

In [2]:
model_name = "bert-base-cased"

model = BertModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

## Tokenize the text

In [5]:
text = "Tokenize me this please"
encoded_inputs = tokenizer(text, return_tensors="pt")
# wrapped in a tensor object
output = model(**encoded_inputs)


## Set the last hidden state

In [7]:
last_hidden_state = output.last_hidden_state
# the final state before ending customizations
pooler_output = output.pooler_output

print(last_hidden_state.shape)
# this represents the vector space for each token
print(pooler_output.shape)
# this represents a new vector space for the entire sentence. This is useful for sentiment classification

torch.Size([1, 8, 768])
torch.Size([1, 768])


## A function to show that the same token can get a unique, context-specific encoding

In [19]:
def predict(text):
    encoded_inputs = tokenizer(text, return_tensors="pt")
    return model(**encoded_inputs)[0]

sentence1 = "There was a fly drinking from my soup"
sentence2 = "To become a commercial pikot, you have to fly for 1500 hours"

tokens1 = tokenizer.tokenize(sentence1)
tokens2 = tokenizer.tokenize(sentence2)


out1 = predict(sentence1)
out2 = predict(sentence2)

emb1 = out1[0:, tokens1.index("fly"), :].detach()
emb2 = out2[0:, tokens2.index("fly"), :].detach()

print(emb1.shape)
print(emb2.shape)

print(emb1)
print(emb2)

torch.Size([1, 768])
torch.Size([1, 768])
tensor([[ 1.3092e-01, -1.0668e-01,  9.2522e-02,  1.3643e-01, -2.0440e-02,
         -4.5883e-01,  2.8127e-01,  6.1601e-02, -3.5428e-01, -2.1475e-01,
         -2.5499e-01,  5.1753e-01, -4.3148e-01, -6.3329e-02, -5.3246e-01,
         -3.2249e-01, -2.1388e-01,  1.3354e-01, -1.9712e-01, -2.6240e-01,
         -1.4709e-01, -1.0367e-01, -2.5171e-01,  5.1877e-01,  9.3922e-02,
         -5.8072e-02, -2.8924e-01,  3.1419e-01, -1.9856e-02,  8.9696e-02,
         -4.5797e-01, -7.0561e-02,  5.0785e-01,  3.4331e-02, -1.3408e-01,
          1.4696e-01,  2.7969e-01,  3.0273e-02,  1.2643e-01, -4.4700e-01,
          2.9013e-01,  4.3903e-02, -2.0638e-01, -1.5396e-01,  2.4499e-01,
         -6.9515e-01,  4.1689e-01, -6.1323e-02, -3.0181e-01, -1.1547e-01,
         -3.1198e-01, -1.6191e-01, -1.8917e-01,  1.5833e-01,  1.3775e-01,
         -3.2496e-01,  2.6430e-01, -1.0327e-01,  2.0264e-01,  6.1191e-01,
         -6.3990e-02,  5.9297e-01,  4.8045e-01,  1.2240e-01, -3.5769e-

## Calculating the cosine distance between the two vectors

In [28]:
emb1 = emb1.reshape(-1, emb1.shape[-1])
emb2 = emb2.reshape(-1, emb2.shape[-1])
# this wasn't necessary in the tutorial, but it was here. I don't know why.

print(cosine_similarity(emb1, emb2))

[[0.5706227]]
