In [1]:
import torch
from transformers import BertTokenizer, BertModel

In [2]:
# Load the pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [4]:
# Input text
text = "He is a nurse."

# Tokenize the input text
tokens = tokenizer.tokenize(text)

# Convert tokens to IDs
input_ids = tokenizer.convert_tokens_to_ids(tokens)

# Convert the input IDs to a PyTorch tensor
input_tensor = torch.tensor([input_ids])

In [5]:
# Get the BERT model embeddings
with torch.no_grad():
    outputs = model(input_tensor)

# Get the embeddings for the first (and only) input
embeddings = outputs[0][0]

# Print the embeddings for each token
for token, embedding in zip(tokens, embeddings):
    print(token, embedding)

he tensor([-4.6338e-01, -3.0996e-01,  1.6054e-01,  8.0972e-01,  4.8232e-01,
        -4.0423e-01,  2.6335e-02,  7.5363e-02,  1.0247e-01, -4.6137e-01,
        -1.2992e-01, -1.4313e-01,  3.2640e-01,  2.1742e-01, -8.0770e-01,
         3.5144e-01,  8.7717e-02,  9.2970e-02, -7.7151e-02, -5.0190e-01,
         3.4620e-01, -1.8899e-01,  3.6843e-01, -2.1992e-01, -1.2888e-02,
         1.2670e-01,  2.1332e-01,  1.4199e-01, -3.3493e-01,  1.9363e-01,
         2.1231e-01, -9.2829e-03, -8.6492e-02,  2.8928e-01, -2.5328e-01,
        -8.7520e-01,  3.5050e-01,  1.4615e-01,  1.9863e-01, -3.1830e-01,
        -3.0778e-01, -1.4636e-01, -7.7228e-02,  1.0241e-02,  5.2721e-01,
        -2.0565e-01, -2.0468e-01,  2.6904e-01, -2.9444e-02, -4.5367e-01,
        -5.1624e-01,  2.5022e-01, -2.3693e-01,  5.9179e-01, -2.5898e-02,
         1.6132e-01,  2.7476e-02, -1.4668e-01, -4.1934e-01,  1.3714e-01,
         1.9671e-01, -3.4640e-01, -4.2966e-03, -7.8448e-02,  5.8164e-01,
         1.9243e-01,  4.4958e-01, -7.3122e-01, -

In [6]:
# Input occupation words
occupations = ["doctor", "nurse", "engineer", "teacher"]

# Create input texts for different genders
male_texts = [f"He is a {occupation}." for occupation in occupations]
female_texts = [f"She is a {occupation}." for occupation in occupations]

# Tokenize and convert input texts to IDs
male_input_ids = [tokenizer.encode(text, add_special_tokens=True) for text in male_texts]
female_input_ids = [tokenizer.encode(text, add_special_tokens=True) for text in female_texts]

# Convert the input IDs to PyTorch tensors
male_input_tensors = [torch.tensor([input_ids]) for input_ids in male_input_ids]
female_input_tensors = [torch.tensor([input_ids]) for input_ids in female_input_ids]


In [7]:
# Get the BERT model embeddings
with torch.no_grad():
    male_outputs = [model(input_tensor).last_hidden_state for input_tensor in male_input_tensors]
    female_outputs = [model(input_tensor).last_hidden_state for input_tensor in female_input_tensors]

# Calculate the average embeddings for each occupation and gender
male_avg_embeddings = [torch.mean(embeddings, dim=1) for embeddings in male_outputs]
female_avg_embeddings = [torch.mean(embeddings, dim=1) for embeddings in female_outputs]

# Calculate the cosine similarity between male and female average embeddings for each occupation
similarities = [torch.cosine_similarity(male_avg_emb, female_avg_emb) for male_avg_emb, female_avg_emb in
                zip(male_avg_embeddings, female_avg_embeddings)]

# Print the cosine similarities
for occupation, similarity in zip(occupations, similarities):
    print(f"Occupation: {occupation}, Cosine Similarity: {similarity.item()}")


Occupation: doctor, Cosine Similarity: 0.9412516951560974
Occupation: nurse, Cosine Similarity: 0.9453706741333008
Occupation: engineer, Cosine Similarity: 0.917951762676239
Occupation: teacher, Cosine Similarity: 0.9249525666236877


In [14]:
!pip install plotly

Collecting plotly
  Downloading plotly-5.15.0-py2.py3-none-any.whl (15.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.5/15.5 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hCollecting tenacity>=6.2.0 (from plotly)
  Using cached tenacity-8.2.2-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, plotly
Successfully installed plotly-5.15.0 tenacity-8.2.2


In [17]:
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import plotly.graph_objects as go

In [15]:
# Get the BERT embeddings for a list of words
words = ["doctor", "nurse", "engineer", "teacher", "musician", "artist"]
word_input_ids = [tokenizer.encode(word, add_special_tokens=True) for word in words]
word_input_tensors = [torch.tensor([input_ids]) for input_ids in word_input_ids]

# Get the BERT model embeddings
with torch.no_grad():
    word_outputs = [model(input_tensor).last_hidden_state.squeeze(0).numpy() for input_tensor in word_input_tensors]

# Concatenate the embeddings for all words
all_embeddings = np.concatenate(word_outputs)

In [20]:
# Apply t-SNE to reduce the dimensionality of the embeddings
tsne_embeddings = TSNE(n_components=3, perplexity=5, random_state=42).fit_transform(all_embeddings)

# Create an interactive 3D scatter plot using Plotly
fig = go.Figure(data=go.Scatter3d(
    x=tsne_embeddings[:, 0],
    y=tsne_embeddings[:, 1],
    z=tsne_embeddings[:, 2],
    mode='markers',
    text=words,
    marker=dict(
        size=8,
        color=np.arange(len(words)),
        colorscale='Viridis',
        opacity=0.8
    )
))

# Set plot layout
fig.update_layout(
    title="BERT Embeddings - t-SNE Visualization",
    scene=dict(
        xaxis_title="t-SNE Dimension 1",
        yaxis_title="t-SNE Dimension 2",
        zaxis_title="t-SNE Dimension 3"
    )
)

# Show the interactive plot
fig.show()