## Understanding Transformers and GPT

## Transformers 101 (Hands-On)

Goal:
- Understand the Transformer pipeline using a pretrained model
- Run inference on a real dataset
- (Optional) fine-tune for a few steps

We will use:
- Dataset: AG News (4-class news classification)
- Model: RoBERTa-base fine-tuned on AG News (pretrained Transformer encoder)

### 1. Install & Imports

In [1]:
!pip -q install transformers datasets accelerate evaluate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

### 2. Dataset Downloading

In [3]:
# Load the AG News dataset from the Hugging Face Hub.
dataset = load_dataset("ag_news")
# Last expression in the cell: the notebook displays the available splits.
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

In [4]:
# Inspect a sample: each record holds a 'text' string and an integer 'label'.
dataset["train"][0]

{'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.",
 'label': 2}

In [6]:
# Class names in label-id order, so label_names[i] is the name for label id i.
label_names = dataset["train"].features["label"].names
label_names

['World', 'Sports', 'Business', 'Sci/Tech']

### 3. Load Tokenizer + Pretrained Model

In [7]:
# Checkpoint that is already fine-tuned for AG News classification.
model_name = "textattack/roberta-base-ag-news" # 0.94 accuracy on the AG News dataset (figure taken from the original note; not verified here)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# num_labels=4 matches the four AG News classes.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=4
)

# Switch to evaluation mode (disables dropout). As the last expression of the
# cell, the returned model is also displayed by the notebook.
model.eval()

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/754 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at textattack/roberta-base-ag-news were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

### 4. Tokenization: Text → Input IDs + Attention Mask

In [16]:
text = "SpaceX unveils a new methodology"
# return_tensors="pt" asks for PyTorch tensors; the result holds
# 'input_ids' and 'attention_mask'.
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

inputs

{'input_ids': tensor([[    0, 27404,  1000, 36685,  5290,    10,    92, 18670,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [17]:
# Reviewing tokens: map the ids back to sub-word tokens to see how the text
# was split (<s> and </s> are the start/end markers added by the tokenizer).
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
tokens

['<s>', 'Space', 'X', 'Ġunve', 'ils', 'Ġa', 'Ġnew', 'Ġmethodology', '</s>']

### 5. Inference: Get Logits -> Probabilities -> Prediction

In [18]:
# Forward pass with gradient tracking disabled; only the logits are needed.
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits

# Normalise the class scores into probabilities, then take the most likely class.
probs = logits.softmax(dim=-1)
pred_id = torch.argmax(probs, dim=-1).item()

predicted_name = label_names[pred_id]
class_probs = probs.squeeze().tolist()

print("Text:", text)
print("Predicted label:", pred_id, "->", predicted_name)
print("Probabilities:", class_probs)

Text: SpaceX unveils a new methodology
Predicted label: 3 -> Sci/Tech
Probabilities: [0.01621493324637413, 0.0001677013497101143, 0.0003490319359116256, 0.9832683205604553]


In [12]:
# Slice the first 8 test rows once; slicing a Dataset returns a dict that maps
# each column name to a list of values.
batch = dataset["test"][:8]
batch_texts = batch["text"]
batch_labels = batch["label"]

# padding=True pads every text to the longest one in the batch so they can be
# stacked into a single tensor.
batch_inputs = tokenizer(batch_texts, return_tensors="pt", truncation=True, padding=True)

# Inference only: no gradients needed.
with torch.no_grad():
    logits = model(**batch_inputs).logits
    # argmax over the class dimension gives the predicted label id per text.
    preds = logits.argmax(dim=-1).tolist()

# Print each text (first 200 characters) next to its true and predicted label.
for t, y, p in zip(batch_texts, batch_labels, preds):
    print("="*80)
    print("Text:", t[:200], "...")
    print("True:", y, label_names[y])
    print("Pred:", p, label_names[p])

Text: Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul. ...
True: 2 Business
Pred: 2 Business
Text: The Race is On: Second Private Team Sets Launch Date for Human Spaceflight (SPACE.com) SPACE.com - TORONTO, Canada -- A second\team of rocketeers competing for the  #36;10 million Ansari X Prize, a co ...
True: 3 Sci/Tech
Pred: 3 Sci/Tech
Text: Ky. Company Wins Grant to Study Peptides (AP) AP - A company founded by a chemistry researcher at the University of Louisville won a grant to develop a method of producing better peptides, which are s ...
True: 3 Sci/Tech
Pred: 3 Sci/Tech
Text: Prediction Unit Helps Forecast Wildfires (AP) AP - It's barely dawn when Mike Fitzpatrick starts his shift with a blur of colorful maps, figures and endless charts, but already he knows what the day w ...
True: 3 Sci/Tech
Pred: 3 Sci/Tech
Text: Calif. Aims to Limit Farm-Related Smog (AP) AP - Sout

## GPT


Goal:
- Load a pretrained GPT-2 model and tokenizer
- See how text becomes token IDs
- Run a forward pass to get logits (next-token distribution)
- Compute causal language modeling loss
- Generate text with sampling

We use:
- Model: distilgpt2 (fast), optionally gpt2
- Dataset: WikiText-2 (free)

## 1. Install and Imports

In [19]:
!pip -q install transformers datasets accelerate

In [20]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset

## 2. Load Tokenizer + Pretrained GPT-2

In [21]:
# Pick the compute device first so the model can be placed on it right after loading.
device = "cuda" if torch.cuda.is_available() else "cpu"

model_name = "distilgpt2"  # change to "gpt2" if you have GPU
tokenizer = AutoTokenizer.from_pretrained(model_name)
# This tokenizer has no pad_token, so the end-of-sequence token is reused for padding.
tokenizer.pad_token = tokenizer.eos_token

# Load the pretrained weights, move them to the device and switch to evaluation mode.
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
model.eval()

print("Device:", device)
print("Vocab size:", tokenizer.vocab_size)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Device: cpu
Vocab size: 50257


## 3. Tokenization: Text → Token IDs

In [22]:
text = "Transformers are amazing because"
# Encode the prompt as PyTorch tensors and move the ids to the model's device.
enc = tokenizer(text, return_tensors="pt")
input_ids = enc["input_ids"].to(device)

# input_ids has a leading batch dimension; index 0 is the single prompt.
print("Text:", text)
print("input_ids shape:", input_ids.shape)
print("Token IDs:", input_ids[0].tolist())
print("Tokens:", tokenizer.convert_ids_to_tokens(input_ids[0]))

Text: Transformers are amazing because
input_ids shape: torch.Size([1, 5])
Token IDs: [41762, 364, 389, 4998, 780]
Tokens: ['Transform', 'ers', 'Ġare', 'Ġamazing', 'Ġbecause']


## 4. Forward Pass: Logits (Next-token Scores)

In [23]:
# Forward pass without gradient tracking (inference only).
with torch.no_grad():
    out = model(input_ids=input_ids)

# (B, T, V) = (batch, tokens, vocabulary size): one score per vocabulary
# entry at every position of the prompt.
logits = out.logits  # (B, T, V)
print("logits shape:", logits.shape)

logits shape: torch.Size([1, 5, 50257])


In [None]:
# Same prompt as in the tokenization cell, repeated here for reference.
text = "Transformers are amazing because"

In [24]:
# Next Token Probabilities
# The scores at the final position describe the token that would follow the prompt.
last_logits = logits[0, -1]  # (V,)
probs = last_logits.softmax(dim=-1)

# Keep the 10 most probable candidates as plain Python lists.
topk = probs.topk(k=10)
top_ids, top_probs = topk.indices.tolist(), topk.values.tolist()

print("Top-10 next token predictions:")
for tid, p in zip(top_ids, top_probs):
    print(f"{tokenizer.decode([tid])!r:12s}  p={p:.4f}")

Top-10 next token predictions:
' they'       p=0.4544
' of'         p=0.0586
' the'        p=0.0507
' it'         p=0.0465
' you'        p=0.0405
' we'         p=0.0372
' their'      p=0.0250
' when'       p=0.0186
' there'      p=0.0165
' I'          p=0.0152


In [25]:
# Causal language-modeling loss of the model on the previous input_ids.
# This is self-supervised learning: the text supplies its own labels.
# Passing labels=input_ids is enough (per the Transformers docs, the labels
# are shifted inside the model so each position is scored against the next token).
with torch.no_grad():
    out = model(input_ids=input_ids, labels=input_ids)

print("Loss:", out.loss.item())

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Loss: 6.066323280334473


In [26]:
ids = input_ids[0].tolist()
tokens = tokenizer.convert_ids_to_tokens(ids)

# Pair every token with its successor: the successor is the prediction target.
# At most the first 12 pairs are printed.
print("t | input token -> target token")
for t, (current, target) in enumerate(zip(tokens[:12], tokens[1:])):
    print(f"{t:2d} | {current!r:15s} -> {target!r}")

t | input token -> target token
 0 | 'Transform'     -> 'ers'
 1 | 'ers'           -> 'Ġare'
 2 | 'Ġare'          -> 'Ġamazing'
 3 | 'Ġamazing'      -> 'Ġbecause'


## 5. Text Generation: Greedy Decoding vs. Sampling

In [27]:
## Greedy
# do_sample=False selects greedy decoding: the highest-scoring token is
# chosen at every step.
prompt = "The meaning of life is"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

greedy = model.generate(
    **inputs,
    max_new_tokens=40,
    do_sample=False
)

# greedy[0] holds the prompt tokens followed by the generated continuation.
print(tokenizer.decode(greedy[0]))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The meaning of life is to be understood as a whole, and to be understood as a whole, and to be understood as a whole, and to be understood as a whole, and to be understood as a whole, and


In [30]:
## Probabilistic or Sampling
# do_sample=True draws each next token at random instead of taking the top one.
# top_k=50 restricts the draw to the 50 highest-scoring tokens, and
# temperature=0.8 rescales the scores before sampling.
# Reuses `inputs` (the encoded prompt) from the greedy cell.
sampled = model.generate(
    **inputs,
    max_new_tokens=25,
    do_sample=True,
    temperature=0.8,
    top_k=50
)

print(tokenizer.decode(sampled[0]))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The meaning of life is that we are not as conscious of what we are really really. The real life, as we have become, is not merely


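## HOLD ON!! WHAT IS TEMPERATURE?

Temperature `T` rescales the logits before the softmax (`softmax(logits / T)`). With `T < 1` the distribution becomes sharper, so the most likely tokens dominate and the text is more conservative. With `T > 1` it becomes flatter, so less likely tokens are sampled more often and the text is more varied (and more error-prone). Compare the output below (`temperature=2.0`) with the previous one (`temperature=0.8`).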

In [None]:
## Probabilistic or Sampling (higher temperature)
# Same sampling call as with temperature=0.8, but with temperature=2.0 for comparison.
sampled = model.generate(
    **inputs,
    max_new_tokens=25,
    do_sample=True,
    temperature=2.0,
    top_k=50
)

print(tokenizer.decode(sampled[0]))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The meaning of life is that it changes when an external change happens. This applies not only in our personal life to an external life in this specific country


## 6. Conclusions

- GPT-2 is a decoder-only Transformer trained with next-token prediction.
- Tokenization produces `input_ids`.
- The model outputs `logits` with shape (batch, time, vocab).
- Causal LM loss predicts token t+1 from tokens up to t.
- Generation is iterative: forward pass → sample next token → append.
- Fine-tuning adapts the model quickly with a small dataset subset.

# Let's start with: Nanochat GPT

In [None]:
from nanochat.gpt import GPT, GPTConfig
from nanochat.tokenizer import get_tokenizer

# NOTE: this rebinds `tokenizer`, which the GPT-2 cells bound to the Hugging Face
# tokenizer. From here on it refers to the nanochat tokenizer.
tokenizer = get_tokenizer()

In [None]:
# A deliberately small configuration: 4 layers, 4 heads, 256-dimensional embeddings.
# The vocabulary size is taken from the tokenizer.
# n_kv_head is set equal to n_head (presumably one key/value head per
# attention head; confirm against nanochat's GPTConfig).
config = GPTConfig(
    vocab_size=tokenizer.get_vocab_size(),
    n_layer=4,
    n_head=4,
    n_kv_head=4,
    n_embd=256,
)
# NOTE: this rebinds `model`, replacing the GPT-2 model loaded earlier.
model = GPT(config)

In [None]:
# Sum the element counts of all parameter tensors and report the total in millions.
print(f"Nanochat has almost {round(sum(p.numel() for p in model.parameters()) /1e6, 2)} MM parameters")

In [None]:
# A minimal two-turn chat: one user question and the assistant answer.
# This single example is what the following cells tokenize and train on.
conversation = {
    "messages": [
        {"role": "user", "content": "What is a transformer?"},
        {"role": "assistant", "content": "A transformer is a neural network based on attention."}
    ]
}

In [None]:
# Turn the conversation into token ids plus a per-token loss mask.
# Presumably the mask is 1 on the tokens the model is trained to produce
# (the assistant reply) and 0 elsewhere; confirm in nanochat's tokenizer.
ids, loss_mask = tokenizer.render_conversation(conversation)

print("Number of tokens:", len(ids))
print("Loss tokens:", sum(loss_mask))

In [None]:
import torch

# Add a batch dimension to the token ids and to the loss mask.
input_ids = torch.tensor(ids).unsqueeze(0)  # (1, T)
loss_mask_t = torch.tensor(loss_mask).unsqueeze(0)

logits = model(input_ids)

# Next-token alignment: the prediction at position t is scored against token
# t+1, so drop the last prediction, the first token and the first mask entry.
logits = logits[:, :-1, :]
targets = input_ids[:, 1:]
loss_mask_t = loss_mask_t[:, 1:]

# Log-probability the model assigned to each actual next token.
log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
target_log_probs = torch.gather(log_probs, -1, targets.unsqueeze(-1)).squeeze(-1)

# Mean negative log-likelihood over the positions selected by the mask.
masked_nll = -(target_log_probs * loss_mask_t).sum()
loss = masked_nll / loss_mask_t.sum()
loss

In [None]:
# Minimal training demo: 100 AdamW steps on the single rendered conversation.
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

# Loop invariants, computed once instead of on every step.
# NOTE: loss_mask_t must already be the shifted mask (the [:, 1:] slice taken
# in the loss cell), so that it lines up with `targets`.
targets = input_ids[:, 1:]
mask = loss_mask_t
mask_total = mask.sum()

for step in range(100):
    optimizer.zero_grad()

    # Predictions for positions 0..T-2 are scored against tokens 1..T-1.
    logits = model(input_ids)[:, :-1, :]

    log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
    target_log_probs = log_probs.gather(-1, targets.unsqueeze(-1)).squeeze(-1)
    # Mean negative log-likelihood over the masked positions only.
    loss = -(target_log_probs * mask).sum() / mask_total

    loss.backward()
    optimizer.step()

    if step % 10 == 0:
        print(f"step {step} | loss {loss.item():.4f}")

In [None]:
from nanochat.engine import Engine

engine = Engine(model, tokenizer)

print("Model output:")
print("-" * 40)

# Use a dedicated name for the prompt tokens. Rebinding `input_ids` here would
# replace the (1, T) tensor that the loss and training cells rely on, and
# break them if they are re-run after this cell.
prompt_ids = tokenizer.render_for_completion(conversation)

# Stream the generation: print each decoded piece as it arrives and keep the
# pieces so the full text is available in `output` afterwards.
pieces = []
for token_ids, _token_mask in engine.generate(
    prompt_ids,
    max_tokens=50,
    temperature=0.8,
    top_k=40,
):
    text_piece = tokenizer.decode(token_ids)
    # flush=True so the text shows up immediately instead of being buffered.
    print(text_piece, end="", flush=True)
    pieces.append(text_piece)
output = "".join(pieces)

print("\n" + "-" * 40)