# Lecture 9: Fine-tuning

Lecture 9 | CMU ANLP Spring 2025 | Instructor: Sean Welleck


This is a notebook for [CMU CS11-711 Advanced NLP](https://cmu-l3.github.io/anlp-spring2025/).

In [48]:
data = open('names.txt').read().splitlines()
data[1000:1010]

['paityn',
 'evalyn',
 'luz',
 'nathalia',
 'winnie',
 'chandler',
 'ciara',
 'danica',
 'nailah',
 'rilynn']

Load the model

In [19]:
import transformers
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model = "HuggingFaceTB/SmolLM2-135M"

tokenizer = AutoTokenizer.from_pretrained(model)
model = AutoModelForCausalLM.from_pretrained(model)

# Add a new pad token to the tokenizer
tokenizer.add_special_tokens({
    "pad_token": "<|pad|>",
    "bos_token": "<|startoftext|>",
})

# Add the new pad token to the model
model.resize_token_embeddings(len(tokenizer))

Create dataset and dataloaders

In [None]:
from torch.utils.data import Dataset, DataLoader
class NamesDataset(Dataset):
    def __init__(self, names, tokenizer, max_length=128):
        self.names = names
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.names)

    def __getitem__(self, idx):
        name = self.tokenizer.bos_token + self.names[idx] + self.tokenizer.eos_token
        return name

    def collate_fn(self, batch):
        # pad based on the longest sequence in the batch using padding "longest"
        inputs = tokenizer(
            batch, 
            padding="longest", 
            return_tensors="pt", 
            truncation=True
        )
        return inputs


# Split into train, dev, test
import random
random.seed(123)
random.shuffle(data)

n1 = int(0.8 * len(data))
n2 = int(0.9 * len(data))

train_data = data[:n1]
dev_data = data[n1:n2]
test_data = data[n2:]

# Create the datasets and dataloaders
train_dataset = NamesDataset(train_data, tokenizer)
dev_dataset = NamesDataset(dev_data, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=train_dataset.collate_fn)
dev_loader = DataLoader(dev_dataset, batch_size=8, shuffle=False, collate_fn=dev_dataset.collate_fn)

In [20]:
# example batch
next(iter(train_loader))

{'input_ids': tensor([[49153,    90,  2214,  7650,     0],
        [49153,  4933,   693,   254,     0],
        [49153,  5802,   552,    81,     0],
        [49153, 14151, 10333,     0, 49152],
        [49153, 10328,   373,  5488,     0],
        [49153,  6925,   792,    95,     0],
        [49153,   541,   552,    85,     0],
        [49153,    92,  1111,  1287,     0],
        [49153, 38093, 18133,     0, 49152],
        [49153,    91,  9331,     0, 49152],
        [49153,    88,  1131,   332,     0],
        [49153,    99, 45896,     0, 49152],
        [49153,  1317, 15034,     0, 49152],
        [49153,   763,   105,   391,     0],
        [49153, 21900,   992,     0, 49152],
        [49153,   264, 10675,     0, 49152]]), 'attention_mask': tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 

In [21]:
tokenizer.decode([49153,  3298,  7712,     0, 49152])

'<|startoftext|>marilla<|endoftext|><|pad|>'

Training loop

In [None]:
import torch.optim as optim
import torch.nn as nn

# Count model parameters
print(f"Model parameters: {sum(p.numel() for p in model.parameters())}")

# Hyperparameters
learning_rate = 0.0001
num_epochs = 1

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Learning rate schedule (cosine with warmup)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=len(train_loader) * num_epochs, eta_min=0)

# Training loop
for epoch in range(num_epochs):
    
    model.train()
    total_loss = 0

    for i, batch in enumerate(train_loader):
        optimizer.zero_grad()
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]

        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        scheduler.step()

        # Print average loss every 10 batches
        if i % 10 == 0:
            avg_loss = total_loss / (i + 1)
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i}/{len(train_loader)}], Loss: {avg_loss:.4f}')


    avg_loss = total_loss / len(train_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')

    # Evaluate validation loss
    eval_loss = 0
    model.eval()
    with torch.no_grad():

        for i, batch in enumerate(dev_loader):
            input_ids = batch["input_ids"]
            attention_mask = batch["attention_mask"]

            outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
            loss = outputs.loss

            eval_loss += loss.item()

    avg_eval_loss = eval_loss / len(dev_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Validation Loss: {avg_eval_loss:.4f}')

Model parameters: 134516160
Epoch [1/1], Step [0/1602], Loss: 10.6860
Epoch [1/1], Step [10/1602], Loss: 7.4223
Epoch [1/1], Step [20/1602], Loss: 6.8610
Epoch [1/1], Step [30/1602], Loss: 6.5487
Epoch [1/1], Step [40/1602], Loss: 6.2980
Epoch [1/1], Step [50/1602], Loss: 6.0415
Epoch [1/1], Step [60/1602], Loss: 5.8234
Epoch [1/1], Step [70/1602], Loss: 5.5985
Epoch [1/1], Step [80/1602], Loss: 5.3880
Epoch [1/1], Step [90/1602], Loss: 5.1745
Epoch [1/1], Step [100/1602], Loss: 4.9964
Epoch [1/1], Step [110/1602], Loss: 4.8135
Epoch [1/1], Step [120/1602], Loss: 4.6661
Epoch [1/1], Step [130/1602], Loss: 4.5397
Epoch [1/1], Step [140/1602], Loss: 4.4444
Epoch [1/1], Step [150/1602], Loss: 4.3554
Epoch [1/1], Step [160/1602], Loss: 4.2737
Epoch [1/1], Step [170/1602], Loss: 4.1982
Epoch [1/1], Step [180/1602], Loss: 4.1369
Epoch [1/1], Step [190/1602], Loss: 4.0737
Epoch [1/1], Step [200/1602], Loss: 4.0226
Epoch [1/1], Step [210/1602], Loss: 3.9771
Epoch [1/1], Step [220/1602], Loss: 

Generation

In [45]:

prompt = "<|startoftext|>"
input_ids = tokenizer(prompt, return_tensors="pt")
output = model.generate(
    input_ids["input_ids"], 
    do_sample=True,
    temperature=1,
    max_length=20, 
    num_return_sequences=10, 
    pad_token_id=tokenizer.pad_token_id
)

for i, sample_output in enumerate(output):
    print(f"Sample {i+1}: {tokenizer.decode(sample_output, skip_special_tokens=True)}")

Sample 1: chyna
Sample 2: cody
Sample 3: jayn
Sample 4: sabin
Sample 5: amruth
Sample 6: shari
Sample 7: kaylie
Sample 8: maelynn
Sample 9: kyran
Sample 10: zamarion
