## Importing the Libraries

In [None]:
from transformers import BertModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
# The tutorial used SciPy's cosine function, but that didn't work here; Bing suggested sklearn's cosine_similarity instead.

## Defining the model and the tokenizer

In [None]:
# Name of the pretrained checkpoint; the same name is used below for both
# the model and the tokenizer so that the two match.
model_name = "bert-base-cased"

# Load the pretrained model and its tokenizer for that checkpoint.
model = BertModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

## Tokenize the text

In [None]:
text = "Tokenize me this please"
# return_tensors="pt" asks the tokenizer to return PyTorch tensors
# (rather than plain Python lists) so the result can be fed to the model.
encoded_inputs = tokenizer(text, return_tensors="pt")
# Unpack the encoded fields as keyword arguments for the model's forward pass.
output = model(**encoded_inputs)


## Get the last hidden state and the pooled output

In [None]:
# Hidden states produced by the model's final layer.
last_hidden_state = output.last_hidden_state
# Pooled representation of the whole input sequence.
pooler_output = output.pooler_output

print(last_hidden_state.shape)
# Per the Hugging Face BertModel docs this should be
# (batch_size, sequence_length, hidden_size): one vector per token.
print(pooler_output.shape)
# Per the Hugging Face BertModel docs this should be (batch_size, hidden_size):
# one vector for the entire sentence, which is useful for sentence-level
# tasks such as sentiment classification.

## A function to show that the same token can get a unique, context-specific encoding

In [None]:
def predict(text):
    """Encode ``text`` and run it through the module-level BERT model.

    The text is tokenized with the notebook's ``tokenizer`` (PyTorch tensors
    requested via ``return_tensors="pt"``) and passed to ``model``.

    Returns:
        The first element of the model output. For ``BertModel`` this is
        documented as the last hidden state (one vector per input token,
        special tokens included).
    """
    model_inputs = tokenizer(text, return_tensors="pt")
    model_output = model(**model_inputs)
    return model_output[0]

sentence1 = "There was a fly drinking from my soup"
sentence2 = "To become a commercial pilot, you have to fly for 1500 hours"

# Word pieces only: tokenize() does not add the [CLS]/[SEP] special tokens.
tokens1 = tokenizer.tokenize(sentence1)
tokens2 = tokenizer.tokenize(sentence2)


out1 = predict(sentence1)
out2 = predict(sentence2)

# predict() encodes each sentence WITH special tokens ([CLS] comes first), so
# positions in tokens1/tokens2 are shifted relative to the rows of out1/out2.
# Look "fly" up in the exact token sequence the model receives instead;
# otherwise the embedding of the token *before* "fly" would be selected.
model_tokens1 = tokenizer.convert_ids_to_tokens(tokenizer(sentence1)["input_ids"])
model_tokens2 = tokenizer.convert_ids_to_tokens(tokenizer(sentence2)["input_ids"])
fly_pos1 = model_tokens1.index("fly")
fly_pos2 = model_tokens2.index("fly")

# The `0:` slice keeps the leading (batch) axis, so each embedding stays 2-D.
# detach() drops the autograd graph; only the values are needed from here on.
emb1 = out1[0:, fly_pos1, :].detach()
emb2 = out2[0:, fly_pos2, :].detach()

print(emb1.shape)
print(emb2.shape)

print(emb1)
print(emb2)

## Calculating the cosine similarity between the two vectors

In [None]:
# Make sure both embeddings are 2-D with the hidden dimension last.
# sklearn's cosine_similarity operates on 2-D inputs (one row per sample),
# whereas SciPy's `cosine` (used in the tutorial) takes 1-D vectors and
# returns a distance rather than a similarity.
emb1 = emb1.reshape(-1, emb1.shape[-1])
emb2 = emb2.reshape(-1, emb2.shape[-1])
# NOTE(review): the `0:` slice used when emb1/emb2 were extracted appears to
# leave them 2-D already, so this reshape is likely a no-op safeguard - confirm.

# Prints the pairwise similarity matrix; with one row per input this should be
# a single value in [-1, 1], where values near 1 mean the two "fly" embeddings
# point in nearly the same direction.
print(cosine_similarity(emb1, emb2))