In [1]:
# Pretrained BERT checkpoints (including the one loaded below): https://huggingface.co/google-bert/models

In [2]:
from transformers import BertTokenizer, BertModel
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the tokenizer and the encoder from a single checkpoint name so the
# vocabulary and the weights cannot drift apart if the checkpoint is changed.
MODEL_NAME = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME)

In [4]:
# Sample sentence used to walk through tokenization and the model outputs below.
text = "Replace me by any text you'd like."

In [5]:
# Tokenize the sample text; return_tensors='pt' requests PyTorch tensors.
encoded_input = tokenizer(text, return_tensors='pt')

In [6]:
# Inspect the encoding produced by the tokenizer.
encoded_input

{'input_ids': tensor([[ 101, 5672, 2033, 2011, 2151, 3793, 2017, 1005, 1040, 2066, 1012,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [7]:
# Map the ids of the first (and only) sequence in the batch back to text,
# which makes the special tokens added by the tokenizer visible.
first_sequence_ids = encoded_input['input_ids'][0].tolist()
tokenizer.decode(first_sequence_ids)

"[CLS] replace me by any text you'd like. [SEP]"

In [8]:
# Run the encoder on the tokenized sample, unpacking the encoding's entries
# as keyword arguments.
output = model(**encoded_input)
# List the fields available on the returned output object.
output.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [9]:
# Compare the shape of the per-token hidden states with the shape of the
# pooled, one-vector-per-sequence output.
(output.last_hidden_state.shape, output.pooler_output.shape)

(torch.Size([1, 12, 768]), torch.Size([1, 768]))

In [10]:
# Display the module tree of the loaded model.
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [11]:
# Call the module itself rather than .forward() so any registered hooks run,
# and pass the whole encoding (attention mask and token type ids included),
# consistent with the earlier forward pass.
# A non-None grad_fn shows the pooled output is still attached to the
# autograd graph; its type names the last operation that produced it.
model(**encoded_input).pooler_output.grad_fn

<TanhBackward0 at 0x111704130>

In [12]:
# Three sentences built from nearly the same words. The first leaves open who
# has the telescope; the second and third each spell out one reading.
inp1 = "the man saw the girl with the telescope"
inp2 = "the man with the telescope saw the girl"
inp3 = "the man saw the girl having the telescope"

In [13]:
def sentence_embedding(sentence):
    """Return the model's pooled output for ``sentence`` as a 1-D NumPy array.

    The sentence is tokenized into PyTorch tensors and passed through
    ``model``; the first (only) row of ``pooler_output`` is then detached
    from the autograd graph so it can be converted to NumPy.
    """
    encoded = tokenizer(sentence, return_tensors='pt')
    pooled = model(**encoded)['pooler_output'][0]
    return pooled.detach().numpy()


# One pooled vector per sentence, produced by the same pipeline.
out1 = sentence_embedding(inp1)
out2 = sentence_embedding(inp2)
out3 = sentence_embedding(inp3)

In [14]:
# Stack the pooled vectors into one matrix, one row per sentence.
embeddings = np.stack((out1, out2, out3))

In [15]:
# Labels for the plotted points: the sentences, in the same order as the rows
# of `embeddings`.
labels = [inp1, inp2, inp3]

In [21]:
import plotly.express as px
from sklearn.decomposition import PCA

# Project onto two principal components for a 2-D scatter plot. With only
# three sentences the centered data has rank at most 2, so a third component
# would carry no variance. svd_solver="full" keeps the projection
# deterministic instead of letting the solver be chosen automatically.
pca = PCA(n_components=2, svd_solver="full")
embeddings_2d = pca.fit_transform(embeddings)

# Create interactive scatter plot
fig = px.scatter(
    x=embeddings_2d[:, 0],
    y=embeddings_2d[:, 1],
    hover_name=labels,  # sentence appears on hover only; text= would also draw it on the plot
    title="PCA Visualization with Hover Labels"
)

fig.update_traces(marker=dict(size=8, opacity=0.7),
                  hovertemplate="Label: %{hovertext}<br>PC1: %{x}<br>PC2: %{y}<extra></extra>")

# Name the axes so the figure can be read without hovering.
fig.update_layout(xaxis_title="PC1", yaxis_title="PC2")

fig.show()