# Lecture 4: Recurrent Neural Networks

Lecture 4 | CMU ANLP Spring 2025 | Instructor: Sean Welleck

### Part 1: Recurrent language model

This is a notebook for [CMU CS11-711 Advanced NLP](https://cmu-l3.github.io/anlp-spring2025/) that trains a recurrent language model.

In [26]:
# Read one name per line. The context manager closes the file handle as soon
# as the contents have been read instead of leaving it open.
with open('names.txt') as f:
    data = f.read().splitlines()
data[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [27]:
# Vocabulary: the 26 lowercase letters occupy indices 0-25, followed by the
# two special tokens.
token_to_index = dict(zip('abcdefghijklmnopqrstuvwxyz', range(26)))
token_to_index.update({
    '[S]': 26,    # shared start/stop marker
    '[PAD]': 27,  # padding token
})

# Reverse lookup (index -> token)
index_to_token = dict(zip(token_to_index.values(), token_to_index.keys()))

In [28]:
import torch

def build_dataset(data, vocab=None):
    """Convert raw strings into next-token-prediction index sequences.

    Every item is wrapped in '[S]' boundary tokens. The input sequence is
    all tokens except the last, and the target sequence is the same tokens
    shifted left by one position.

    Args:
        data: iterable of strings, one example per string.
        vocab: optional token -> index mapping. When omitted, the
            module-level ``token_to_index`` is used, so existing
            ``build_dataset(data)`` calls behave as before.

    Returns:
        (X, Y): two parallel lists of index lists, where ``Y[k][t]`` is the
        token that follows ``X[k][t]``.

    Raises:
        KeyError: if an item contains a token that is not in the vocabulary.
    """
    if vocab is None:
        vocab = token_to_index
    boundary = vocab['[S]']

    X, Y = [], []
    for item in data:
        indices = [boundary, *(vocab[token] for token in item), boundary]
        X.append(indices[:-1])
        Y.append(indices[1:])
    return X, Y

# Shuffle once with a fixed seed so the train/dev/test partition is reproducible
import random
random.seed(123)
random.shuffle(data)

# Cut points: first 80% -> train, next 10% -> dev, remaining 10% -> test
n1, n2 = (int(fraction * len(data)) for fraction in (0.8, 0.9))

X_train, Y_train = build_dataset(data[:n1])
X_dev, Y_dev = build_dataset(data[n1:n2])
X_test, Y_test = build_dataset(data[n2:])

len(X_train), len(Y_train)

(25626, 25626)

In [29]:
# Show the first two training inputs with their lengths, plus the length of
# the longest input in the training split
X_train[0], len(X_train[0]), X_train[1], len(X_train[1]), max(len(x) for x in X_train)

([26, 11, 20, 0, 13, 13], 6, [26, 18, 7, 0, 8, 13], 6, 16)

In [30]:
# Write our own RNN cell 
import torch.nn as nn

class RNNCell(torch.nn.Module):
    """One step of a simple tanh recurrent cell.

    Computes ``h_new = tanh(Wh(h) + Wx(x))``, where ``Wh`` and ``Wx`` are
    ``torch.nn.Linear`` layers.
    """

    def __init__(self, input_size, hidden_size):
        """Create the hidden-to-hidden and input-to-hidden linear maps.

        Args:
            input_size: size of the last dimension of the input ``x``.
            hidden_size: size of the last dimension of the hidden state ``h``.
        """
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.Wh = torch.nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.Wx = torch.nn.Linear(in_features=input_size, out_features=hidden_size)
        self.activation = torch.nn.Tanh()

    def forward(self, x, h):
        """Return the next hidden state given input ``x`` and previous state ``h``."""
        recurrent_term = self.Wh(h)
        input_term = self.Wx(x)
        return self.activation(recurrent_term + input_term)


In [31]:

class RNNLM(nn.Module):
    """Recurrent language model: token embedding -> RNNCell -> vocabulary logits."""

    def __init__(self, vocab_size, hidden_size):
        """Build the embedding table, recurrent cell and output projection.

        Args:
            vocab_size: number of distinct token indices (embedding rows and
                output logits).
            hidden_size: width of both the token embeddings and the hidden state.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.rnn = RNNCell(hidden_size, hidden_size)
        self.output = nn.Linear(hidden_size, vocab_size)
        self.hidden_size = hidden_size

    def forward(self, x, hidden=None):
        """Run the model over a batch of token-index sequences.

        Args:
            x: integer tensor of token indices, indexed as (batch, seq_len).
            hidden: optional initial hidden state of shape
                (batch, 1, hidden_size). A zero state is used when omitted.

        Returns:
            (outs, hidden): ``outs`` holds one row of logits per position,
            concatenated along dim 1 to (batch, seq_len, vocab_size);
            ``hidden`` is the state after the last position.
        """
        if hidden is None:
            hidden = self.init_hidden(x.size(0))

        x = self.embedding(x)

        outs = []
        for i in range(x.size(1)):
            # Slice with i:i+1 (not i) to keep the length-1 time dimension,
            # matching the (batch, 1, hidden_size) hidden state.
            hidden = self.rnn(x[:, i:i+1], hidden)
            outs.append(self.output(hidden))

        outs = torch.cat(outs, dim=1)
        return outs, hidden

    def init_hidden(self, batch_size):
        """Return an all-zero hidden state of shape (batch_size, 1, hidden_size).

        The state is allocated with the same device and dtype as the model's
        parameters, so the model keeps working after ``.to(device)`` or a
        dtype change instead of mixing it with a default CPU tensor.
        """
        return self.embedding.weight.new_zeros(batch_size, 1, self.hidden_size)

In [32]:
# Smoke test: push a single training sequence through a small, untrained model
model = RNNLM(vocab_size=len(token_to_index), hidden_size=32)

x = torch.tensor([X_train[0]])

# Leaving `hidden` at its default (None) starts from the zero state
output, hidden = model(x)
output.shape, hidden.shape

(torch.Size([1, 6, 28]), torch.Size([1, 1, 32]))

In [8]:
# Stack the first two training inputs into one tensor. This only works because
# both sequences have the same length (6, per the inspection output earlier);
# sequences of different lengths need padding first.
x = torch.tensor(X_train[:2])

In [9]:
def pad_batch(X_batch, Y_batch, pad_index):
    """Right-pad a batch of index sequences to a common length.

    Args:
        X_batch: list of input index lists.
        Y_batch: list of target index lists, parallel to ``X_batch``; each
            target is expected to be no longer than the longest input.
        pad_index: integer written into every position past a sequence's end.

    Returns:
        (X_padded, Y_padded): two ``torch.long`` tensors of shape
        (batch, max_len), where ``max_len`` is the longest sequence in
        ``X_batch``. An empty batch yields two (0, 0) tensors.
    """
    # default=0 lets an empty batch produce empty tensors instead of making
    # max() raise ValueError.
    max_len = max((len(x) for x in X_batch), default=0)
    X_padded = torch.full((len(X_batch), max_len), pad_index, dtype=torch.long)
    Y_padded = torch.full((len(Y_batch), max_len), pad_index, dtype=torch.long)
    for i, (x, y) in enumerate(zip(X_batch, Y_batch)):
        # Explicit dtype keeps zero-length rows integral (torch.tensor([])
        # would otherwise default to float).
        X_padded[i, :len(x)] = torch.tensor(x, dtype=torch.long)
        Y_padded[i, :len(y)] = torch.tensor(y, dtype=torch.long)
    return X_padded, Y_padded

# Pad the first four training examples, then show the inputs both as indices
# and as tokens
xp, yp = pad_batch(X_train[:4], Y_train[:4], pad_index=token_to_index['[PAD]'])

print(xp)
for x in xp:
    print([index_to_token[idx] for idx in x.tolist()])

tensor([[26, 11, 20,  0, 13, 13, 27, 27, 27, 27],
        [26, 18,  7,  0,  8, 13, 27, 27, 27, 27],
        [26, 17, 20, 15,  4, 17, 19, 27, 27, 27],
        [26, 12, 14, 10, 18,  7,  0,  6, 13,  0]])
['[S]', 'l', 'u', 'a', 'n', 'n', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
['[S]', 's', 'h', 'a', 'i', 'n', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
['[S]', 'r', 'u', 'p', 'e', 'r', 't', '[PAD]', '[PAD]', '[PAD]']
['[S]', 'm', 'o', 'k', 's', 'h', 'a', 'g', 'n', 'a']


In [10]:
# Run the model on a padded batch of two sequences, starting from the zero state
X_batch, Y_batch = pad_batch(X_train[:2], Y_train[:2], pad_index=token_to_index['[PAD]'])

output, hidden = model(X_batch)
output.shape, hidden.shape

(torch.Size([2, 6, 28]), torch.Size([2, 1, 32]))

In [None]:
import torch.optim as optim

model = RNNLM(vocab_size=len(token_to_index), hidden_size=96)
# Count model parameters
print(f"Model parameters: {sum(p.numel() for p in model.parameters())}")

# Hyperparameters
learning_rate = 0.001
num_epochs = 10
batch_size = 16

# Loss function and optimizer
# NOTE: We ignore the loss whenever the target token is a padding token
criterion = nn.CrossEntropyLoss(ignore_index=token_to_index['[PAD]'])

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    # Reshuffle the data, applying the same permutation to inputs and targets
    perm = torch.randperm(len(X_train)).tolist()
    X_train = [X_train[i] for i in perm]
    Y_train = [Y_train[i] for i in perm]

    model.train()
    total_loss = 0
    num_train_batches = 0
    for i in range(0, len(X_train), batch_size):
        X_batch = X_train[i:i+batch_size]
        Y_batch = Y_train[i:i+batch_size]
        X_batch, Y_batch = pad_batch(X_batch, Y_batch, token_to_index['[PAD]'])

        # Forward pass
        outputs, _ = model(X_batch) # [batch_size, seq_len, vocab_size]
        outputs = outputs.view(-1, len(token_to_index)) # [batch_size * seq_len, vocab_size]
        Y_batch = Y_batch.view(-1) # [batch_size * seq_len]
        loss = criterion(outputs, Y_batch)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_train_batches += 1

    # Average over the batches actually processed. Dividing by
    # len(X_train) // batch_size would omit the final partial batch from the
    # divisor and would be zero for a split smaller than one batch.
    avg_loss = total_loss / max(num_train_batches, 1)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')

    # Evaluate validation loss
    eval_loss = 0
    num_eval_batches = 0
    model.eval()
    with torch.no_grad():
        for i in range(0, len(X_dev), batch_size):
            X_batch = X_dev[i:i+batch_size]
            Y_batch = Y_dev[i:i+batch_size]
            X_batch, Y_batch = pad_batch(X_batch, Y_batch, token_to_index['[PAD]'])

            outputs, _ = model(X_batch)
            outputs = outputs.view(-1, len(token_to_index))
            Y_batch = Y_batch.view(-1)
            loss = criterion(outputs, Y_batch)

            eval_loss += loss.item()
            num_eval_batches += 1
    # Same batch-count divisor as for the training loss above
    avg_eval_loss = eval_loss / max(num_eval_batches, 1)
    print(f'Epoch [{epoch+1}/{num_epochs}], Validation Loss: {avg_eval_loss:.4f}')


Model parameters: 24028
Epoch [1/10], Loss: 2.2432
Epoch [1/10], Validation Loss: 2.1708
Epoch [2/10], Loss: 2.1317
Epoch [2/10], Validation Loss: 2.1361
Epoch [3/10], Loss: 2.0956
Epoch [3/10], Validation Loss: 2.1093
Epoch [4/10], Loss: 2.0723
Epoch [4/10], Validation Loss: 2.0942
Epoch [5/10], Loss: 2.0562
Epoch [5/10], Validation Loss: 2.0905
Epoch [6/10], Loss: 2.0436
Epoch [6/10], Validation Loss: 2.0814
Epoch [7/10], Loss: 2.0332
Epoch [7/10], Validation Loss: 2.0784
Epoch [8/10], Loss: 2.0250
Epoch [8/10], Validation Loss: 2.0732
Epoch [9/10], Loss: 2.0181
Epoch [9/10], Validation Loss: 2.0747
Epoch [10/10], Loss: 2.0111
Epoch [10/10], Validation Loss: 2.0660


In [15]:
# Sample from the model
def sample(model, context, max_length=100):
    """Sample a continuation from the model, one token at a time.

    Puts ``model`` into eval mode as a side effect.

    Args:
        model: a language model called as ``model(x, hidden)`` and returning
            ``(logits, hidden)``.
        context: list of token indices to condition on; a '[S]' start token
            is prepended automatically. May be empty.
        max_length: maximum number of tokens to generate.

    Returns:
        The generated tokens joined into a string. The context itself and
        the terminating '[S]' token are not included.
    """
    model.eval()
    pad_index = token_to_index['[PAD]']
    output = []
    with torch.no_grad():
        x = torch.tensor([[token_to_index['[S]']] + context])
        hidden = None
        for _ in range(max_length):
            logits, hidden = model(x, hidden)
            # Only the last position predicts the next token.
            next_logits = logits[0, -1].clone()
            # '[PAD]' is batching filler, not a real symbol; give it zero
            # probability so it can never be sampled into the output.
            next_logits[pad_index] = float('-inf')
            probs = next_logits.softmax(dim=0)
            y = torch.multinomial(probs, 1)
            token = index_to_token[y.item()]
            if token == '[S]':
                break
            output.append(token)
            # Feed the sampled token back in; the carried hidden state already
            # summarizes everything before it.
            x = y.view(1, 1)
    return ''.join(output)

In [24]:
# Draw ten samples with an empty context, so generation starts from '[S]' alone
for i in range(10):
    print(sample(model, []))

blander
javor
savina
zanlyn
edmice
korio
pailo
madalyn
hourtal
kloie


In [25]:
# Prompted generation: condition on the prompt's characters, then prepend the
# prompt when printing, because sample() returns only the newly generated tokens
prompt = 's'
for i in range(10):
    out = sample(model, [token_to_index[tok] for tok in prompt])
    print(prompt + out)

saper
surah
sayee
starun
shyala
sudelee
sillie
samsen
shiros
ses


### Suggested Exercises

1. Use `nn.RNN` instead of our `RNNCell`. Do you have to change anything in the implementation?
2. Change `nn.RNN` to `nn.GRU`. Do you have to change anything else in the implementation? Does the loss improve?
3. Change `nn.RNN` to `nn.LSTM`. Do you have to change anything else in the implementation? Does the loss improve?
4. Vary the hyperparameters (e.g., hidden size, batch size, learning rate, number of epochs). Can you find any consistent relationships between hyperparameter(s) and the loss?
5. When the validation loss begins to increase while the training loss keeps decreasing, we have evidence of **overfitting**. Can you induce overfitting by changing the hyperparameters?
6. Train a recurrent model on a more complex dataset. Use a tokenizer learned with BPE (either one that you train yourself or a pre-existing one).