In [None]:
from transformers import GPT2LMHeadModel

In [None]:
# Load the pretrained "gpt2" checkpoint (the 124M-parameter model) from
# Hugging Face and grab its parameter dictionary.
model_hf = GPT2LMHeadModel.from_pretrained("gpt2")
state_dict_hf = model_hf.state_dict()

# List every entry of the state dict next to the shape of its tensor.
for param_name in state_dict_hf:
    print(param_name, state_dict_hf[param_name].shape)

In [None]:
# here, wte is the embedding matrix for the tokens (vocab_size: 50257, embedding_dim: 768)
# wpe is the learnt positional embedding matrix (seq_len: 1024, embedding_dim: 768)
# - If I visualize the positional embeddings, I see sinusoidal-like structure emerge:
#   - Each row represents a position, and optimization has shaped the rows to resemble sine- and cosine-like curves
#   - Unlike the original Transformer paper where sinusoidal embeddings were fixed, GPT-2 trains these parameters from scratch and they converge to similar patterns
#   - Some channels show noisy curves, suggesting the model wasn't fully trained to convergence
#   - Overall the embeddings capture positional information effectively

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Show the whole positional-embedding matrix as a grayscale image.
# Rows of the matrix are positions and columns are embedding channels, so the
# vertical axis is the position and the horizontal axis is the channel.
fig, ax = plt.subplots()
ax.imshow(state_dict_hf["transformer.wpe.weight"], cmap="gray")
ax.set(
    title="GPT-2 positional embeddings (wpe)",
    xlabel="embedding channel",
    ylabel="position",
);  # trailing ';' keeps the cell from echoing the return value

In [None]:
# Plot a few individual embedding channels (columns of the positional-embedding
# matrix) as a function of position. Each curve gets a legend entry so the
# channels can be told apart.
fig, ax = plt.subplots()
for channel in (150, 500, 600):
    ax.plot(
        state_dict_hf["transformer.wpe.weight"][:, channel],
        label=f"channel {channel}",
    )
ax.set(
    title="Selected GPT-2 positional-embedding channels",
    xlabel="position",
    ylabel="embedding value",
)
ax.legend();  # trailing ';' keeps the cell from echoing the Legend repr

In [None]:
# Generating text with the pretrained GPT-2 model

In [None]:
from transformers import pipeline, set_seed

# Build a Hugging Face text-generation pipeline backed by the "gpt2" checkpoint.
generator = pipeline(task="text-generation", model="gpt2")

# Seed the random number generators before sampling (presumably so the
# continuations are repeatable across runs). The call is kept after pipeline
# construction, as in the original cell, so the RNG state at generation time
# is unchanged.
set_seed(42)

# Prompt to continue, and the sampling settings: five continuations, each
# capped at a total length of 50 tokens.
start_text = "Hello, I'm a language model,"
generation_kwargs = {"max_length": 50, "num_return_sequences": 5}
generator(start_text, **generation_kwargs)