# Transformers Feature Extraction

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

In [6]:
# Load the language model and its tokenizer from the same checkpoint.
# A single constant is the one source of truth, so the two cannot drift apart
# when the checkpoint is swapped for another one.
MODEL_NAME = "gpt2"
lm = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Encode input text.
# Calling the tokenizer directly (instead of `.encode`) also returns the
# attention mask, which is then handed to the model explicitly rather than
# being left for the model to infer.
input_text = "Thinking fast and slow"
encoded = tokenizer(input_text, return_tensors="pt")
input_ids = encoded["input_ids"]  # kept under its original name for later cells

# Forward pass. Gradients are disabled because the activations are only read,
# never back-propagated through.
with torch.no_grad():
    output = lm(**encoded, output_hidden_states=True)

# Extract hidden states: the per-layer activations requested via
# `output_hidden_states=True` above.
hidden_states = output.hidden_states


In [12]:
# Report the container type and the number of entries in `hidden_states`.
# NOTE(review): per the Hugging Face model-output docs the tuple holds the
# embedding output plus one entry per transformer block, so the printed count
# treats the embeddings as "layer zero" — the block count is one less.
print(
    f"output.hidden_states is a {type(hidden_states)} of length {len(hidden_states)},",
    f"where {len(hidden_states)} is the number of layers in the model, starting from the zero-th layer (input embeddings)",
    sep="\n",
)

output.hidden_states is a <class 'tuple'> of length 13,
where 13 is the number of layers in the model, starting from the zero-th layer (input embeddings)


In [14]:
# Take the entry at index 6 of the hidden-state tuple and describe each axis
# of the resulting tensor in a single multi-line report.
layer_6 = hidden_states[6]
print(
    f"layer_6 is a {type(layer_6)} of shape {layer_6.shape}, where",
    f"- {layer_6.shape[0]} is the number of input sequences (batch size)",
    f"- {layer_6.shape[1]} is the number of tokens in the input sequences (if multiple sequences are provided, shorter ones are padded)",
    f"- {layer_6.shape[2]} is the number of dimensions in the hidden states",
    sep="\n",
)

layer_6 is a <class 'torch.Tensor'> of shape torch.Size([1, 5, 768]), where
- 1 is the number of input sequences (batch size)
- 5 is the number of tokens in the input sequences (if multiple sequences are provided, shorter ones are padded)
- 768 is the number of dimensions in the hidden states
