# BERT Step by Step: Positional Embeddings

In natural language processing, positional embeddings play a crucial role in understanding the sequential nature of language data. Word embeddings capture semantic relationships between words but lack the ability to encode sequential information.

Positional embeddings complement word embeddings by encoding the position or order of words in a sequence. They provide a way for models to differentiate between words based not only on their meanings but also on their positions within the input sequence.

* [*Yu-An Wang, Yun-Nung Chen*. What Do Position Embeddings Learn? An Empirical Study of Pre-Trained Language Model Positional Encoding](https://arxiv.org/abs/2010.04903)

In [None]:
import matplotlib
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'svg'

import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoConfig, AutoTokenizer
from transformers import BertForPreTraining

In [None]:
# Name of the pretrained checkpoint; the model, tokenizer and config below
# are all loaded from this same identifier.
model_checkpoint = 'bert-base-uncased'

In [None]:
# Load the configuration, the tokenizer and the pretrained model,
# all from the same checkpoint so that they are consistent with each other.
config = AutoConfig.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = BertForPreTraining.from_pretrained(model_checkpoint)

In [None]:
# Tokenize a sample sentence; `return_tensors="pt"` yields a PyTorch tensor.
encoding = tokenizer.encode("let's tokenize something?", return_tensors="pt")

# Look up the word embedding of every token id in the encoded sequence.
word_embedding_layer = model.bert.embeddings.word_embeddings
seq_embedding = word_embedding_layer(encoding)
seq_embedding.shape   # (batch_size, seq_len, hidden_size)

In [None]:
# Display the embeddings submodule, which holds the `word_embeddings` and
# `position_embeddings` layers used in this notebook.
model.bert.embeddings

In [None]:
# Last dimension of the embedding tensors used in this notebook.
config.hidden_size              # size of the embeddings

In [None]:
# Upper bound on the sequence length taken from the model configuration.
config.max_position_embeddings  # max seq_len

In [None]:
# Number of tokens in the encoded sequence (last dimension of `encoding`).
seq_len = encoding.shape[-1]

# Position embeddings for the first `seq_len` positions of our sequence
pos_embedding_seq = model.bert.embeddings.position_embeddings.weight[:seq_len]

# Reshape as a batch of 1 and keep the result (previously the reshaped tensor
# was discarded), so it has the same (batch_size, seq_len, hidden_size)
# layout as the word embeddings.
pos_embedding_seq = pos_embedding_seq.view((1, seq_len, config.hidden_size))
pos_embedding_seq.shape

In [None]:
# Element-wise sum of the word embeddings and the position embeddings
# for the same positions of the sequence.
seq_embedding + pos_embedding_seq

In [None]:
# Use a wide, short canvas so each embedding vector is drawn as a thin strip.
matplotlib.rcParams['figure.figsize'] = (12, 1)

# Convert once, outside the loop: the full position-embedding table and the
# word embedding of the token at index 2 of the first (only) sequence.
position_table = model.bert.embeddings.position_embeddings.weight.detach().numpy()
token_vector = seq_embedding[0, 2].detach().numpy()

# One figure per sampled position: position embedding in red,
# the same token embedding in blue for comparison.
for i in (0, 1, 2, 10, 100, 200, 300, 400, 500):
    plt.plot(position_table[i], alpha=0.5, c='red')
    plt.plot(token_vector, alpha=0.5, c='blue')
    plt.xlim([0, config.hidden_size])
    plt.ylim([-0.15, 0.15])
    plt.show()

# Switch to a more conventional figure size for the plots that follow.
matplotlib.rcParams['figure.figsize'] = (6, 4)

In [None]:
# Pairwise cosine similarity between all position embedding vectors.
position_weights = model.bert.embeddings.position_embeddings.weight.detach().numpy()
similarity_matrix = cosine_similarity(position_weights)

# Show the similarity matrix as an image, with both axes indexed by position.
plt.matshow(similarity_matrix, cmap='Blues')
plt.xlabel('Position')
plt.ylabel('Position')

# Put the x-axis ticks and its label at the top of the plot.
x_axis = plt.gca().xaxis
x_axis.tick_top()
x_axis.set_label_position('top')
plt.show()