# Visualize Word Embeddings via Tensorboard

## Setup

In [36]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForMaskedLM

## Get Model

In [37]:
model_name = "bert-base-uncased"
model_type = AutoModelForMaskedLM

In [39]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = model_type.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Get Vocabulary - Vector pairs

In [40]:
tokenizer.add_special_tokens({"pad_token":"[PAD]"})

0

In [18]:
model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=128)

Embedding(32128, 4096)

In [13]:
def embed_words(tokenizer:AutoTokenizer, embedder, num_samples:int = None):
    if num_samples:
        temp = list(tokenizer.vocab.keys())
        out_labels = []
        for idx in torch.randperm(len(tokenizer.vocab))[:num_samples]:
            out_labels.append(temp[idx])
    else:
        out_labels = list(tokenizer.vocab.keys())

    # batch encode
    encodedInputs = tokenizer(out_labels, return_tensors="pt", max_length=3, truncation=True, padding=True)
    embeddedOutputs = embedder(encodedInputs.input_ids)
    return embeddedOutputs[:, 1], tokenizer.batch_decode(encodedInputs.input_ids)

In [19]:
embedder = model.get_input_embeddings()
embedder

Embedding(32128, 4096)

In [20]:
embeddings, labels = embed_words(tokenizer, embedder)
embeddings.shape

torch.Size([32001, 4096])

## Write Results in Tensorboard

In [21]:
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter("./experiment")

In [22]:
writer.add_embedding(embeddings, labels, global_step=2)
writer.close()