# Lecture 3: Language Modeling Fundamentals

Lecture 3 | CMU ANLP Spring 2025 | Instructor: Sean Welleck

#### Part 2: Feedforward neural language model

This is a notebook for [CMU CS11-711 Advanced NLP](https://cmu-l3.github.io/anlp-spring2025/) that trains a feedforward neural language model in the style of [Bengio et al., 2003, A Neural Probabilistic Language Model](https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf).

In [1]:
# Read one name per line. The `with` block closes the file handle as soon as
# the contents are loaded instead of leaving it open until garbage collection.
with open('names.txt') as f:
    data = f.read().splitlines()
data[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [2]:
# Vocabulary: the 26 lowercase letters get ids 0-25, in alphabetical order.
token_to_index = dict(zip('abcdefghijklmnopqrstuvwxyz', range(26)))
# '[S]' is the extra boundary token; it takes the next free id (26).
token_to_index['[S]'] = len(token_to_index)
# Reverse lookup: id -> token.
index_to_token = {index: token for token, index in token_to_index.items()}

#### Build the dataset

Our dataset consists of $(x, y)$ pairs, where $x$ is an $(n-1)$-token context and $y$ is the token that immediately follows it. In the code below, $n-1$ is `context_size`.

In [3]:
import torch

# Number of preceding tokens in each context (the n-1 of the (n-1)-token context).
context_size = 5

def build_dataset(data):
    """Convert a list of strings into (context, next-token) training pairs.

    Every string is split into characters and terminated with the '[S]'
    boundary token. The context starts as `context_size` copies of '[S]' and
    slides one position to the right after each target token.

    Uses the module-level `token_to_index` and `context_size`.

    Args:
        data: iterable of strings whose characters are all keys of
            `token_to_index`.

    Returns:
        (X, Y): X is an integer tensor of shape (num_pairs, context_size)
        holding context token ids; Y is an integer tensor of shape
        (num_pairs,) holding the id of the token following each context.

    Raises:
        KeyError: if a string contains a character not in `token_to_index`.
    """
    boundary = token_to_index['[S]']
    X, Y = [], []
    for item in data:
        context = [boundary] * context_size
        for token in list(item) + ['[S]']:
            target = token_to_index[token]
            X.append(context)
            Y.append(target)
            # Build a new list (no in-place edit) so rows already stored in X
            # are not affected when the window slides.
            context = context[1:] + [target]

    # Explicit dtype and shape: for an empty `data`, torch.tensor([]) would
    # otherwise produce a float tensor of shape (0,), which cannot be fed to
    # an embedding layer or concatenated with the other splits.
    X = torch.tensor(X, dtype=torch.long).reshape(-1, context_size)
    Y = torch.tensor(Y, dtype=torch.long)
    return X, Y

# Split into train, dev, test
import random
random.seed(123)
# Shuffle a copy rather than `data` itself: shuffling in place makes this cell
# non-idempotent (re-running it would permute the already-shuffled list and
# silently produce a different train/dev/test split).
shuffled_data = list(data)
random.shuffle(shuffled_data)

# 80% train / 10% dev / 10% test, split at the level of whole names.
n1 = int(0.8 * len(shuffled_data))
n2 = int(0.9 * len(shuffled_data))

X_train, Y_train = build_dataset(shuffled_data[:n1])
X_dev, Y_dev = build_dataset(shuffled_data[n1:n2])
X_test, Y_test = build_dataset(shuffled_data[n2:])

X_train.shape, Y_train.shape

(torch.Size([182427, 5]), torch.Size([182427]))

### Define the model

In [None]:
import torch.nn as nn

class MLPLM(nn.Module):
    """Feedforward (MLP) language model over a fixed-size token context.

    Each of the `context_size` token ids is embedded, the embeddings are
    concatenated into one vector, passed through a single ReLU hidden layer,
    and projected to unnormalized scores (logits) over the vocabulary.

    Args:
        vocab_size: number of distinct token ids (rows of the embedding table
            and size of the output layer).
        context_size: number of tokens in each input context.
        embedding_size: dimension of each token embedding.
        hidden_size: width of the hidden layer.
    """

    def __init__(self, vocab_size, context_size, embedding_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.fc1 = nn.Linear(context_size * embedding_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Compute next-token logits.

        Args:
            x: integer tensor of token ids with shape
                (batch_size, context_size), or (context_size,) for a single
                unbatched context.

        Returns:
            Logits of shape (batch_size, vocab_size), or (vocab_size,) for an
            unbatched input. No softmax is applied.
        """
        x = self.embedding(x)        # (..., context_size, embedding_size)
        # Concatenate the per-token embeddings. Flattening only the last two
        # dims keeps any leading batch dims intact.
        x = x.flatten(start_dim=-2)  # (..., context_size * embedding_size)
        x = torch.relu(self.fc1(x))  # (..., hidden_size)
        x = self.fc2(x)              # (..., vocab_size)
        return x


In [5]:
# Throwaway, untrained model used only to check tensor shapes.
model = MLPLM(
    vocab_size=len(token_to_index),
    context_size=context_size,
    embedding_size=64,
    hidden_size=64,
)

# First two training contexts.
x = X_train[:2]
x

tensor([[26, 26, 26, 26, 26],
        [26, 26, 26, 26, 11]])

In [6]:
# Call the module itself rather than .forward(): nn.Module.__call__ runs any
# registered hooks before/after forward, whereas .forward() skips them.
model(x).shape

torch.Size([2, 27])

### Training

In [7]:
import torch.optim as optim

model = MLPLM(len(token_to_index), context_size, 64, 64)
print(f"Model parameters: {sum(p.numel() for p in model.parameters())}")

# Hyperparameters
learning_rate = 0.001
num_epochs = 10
batch_size = 32

# Loss function and optimizer
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()  # mean loss over the examples in a batch

num_examples = len(X_train)

# Training loop
for epoch in range(num_epochs):
    # Draw a fresh random visiting order each epoch. Indexing through `perm`
    # (instead of overwriting X_train / Y_train) leaves the dataset tensors
    # untouched, so re-running this cell starts from the same data.
    perm = torch.randperm(num_examples)

    model.train()
    total_loss = 0.0
    for i in range(0, num_examples, batch_size):
        batch_idx = perm[i:i+batch_size]
        X_batch = X_train[batch_idx]
        Y_batch = Y_train[batch_idx]

        # Forward pass
        outputs = model(X_batch)
        loss = criterion(outputs, Y_batch)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # `loss` is a per-batch mean; weight it by the batch size so the
        # (possibly shorter) final batch contributes proportionally.
        total_loss += loss.item() * len(X_batch)

    # Per-example average over the epoch. Dividing by
    # `len(X_train) // batch_size` undercounts the batches whenever the last
    # batch is partial, and is zero when the dataset is smaller than a batch.
    avg_loss = total_loss / max(num_examples, 1)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')

Model parameters: 24027
Epoch [1/10], Loss: 2.2374
Epoch [2/10], Loss: 2.1336
Epoch [3/10], Loss: 2.1027
Epoch [4/10], Loss: 2.0855
Epoch [5/10], Loss: 2.0739
Epoch [6/10], Loss: 2.0660
Epoch [7/10], Loss: 2.0598
Epoch [8/10], Loss: 2.0554
Epoch [9/10], Loss: 2.0519
Epoch [10/10], Loss: 2.0481


### Generation

In [8]:
# Sample from the model
def sample(model, context, max_length=100):
    """Generate a string by sampling one token at a time from `model`.

    The model is put in eval mode. At each step the current context window is
    fed to the model, a token id is drawn from the softmax of the returned
    logits, and the window slides one position to include the drawn token.
    Generation stops after the '[S]' token is drawn or after `max_length`
    tokens, whichever comes first.

    Args:
        model: callable module mapping a (1, context_length) tensor of token
            ids to (1, vocab_size) logits.
        context: list of token ids used as the initial context window.
        max_length: maximum number of tokens to generate.

    Returns:
        The generated tokens joined into one string; a drawn '[S]' token is
        included at the end.
    """
    model.eval()
    generated = []
    window = torch.tensor(context).unsqueeze(0)  # add a batch dim of size 1
    with torch.no_grad():
        for _ in range(max_length):
            next_probs = torch.softmax(model(window), dim=-1)
            next_id = torch.multinomial(next_probs, num_samples=1)
            # Drop the oldest token and append the newly sampled one.
            window = torch.cat([window[:, 1:], next_id], dim=1)

            piece = index_to_token[next_id.item()]
            generated.append(piece)
            if piece == '[S]':
                break
    return ''.join(generated)

In [9]:
# Unconditional samples: start every generation from an all-'[S]' context.
boundary_id = token_to_index['[S]']
for _ in range(10):
    generated_name = sample(model, [boundary_id] * context_size)
    print(generated_name)

tahi[S]
araniko[S]
calen[S]
milezian[S]
malayie[S]
terrohette[S]
keven[S]
medanie[S]
famoum[S]
leion[S]


### Conditional generation

In [10]:
prompt = 's'
# The model only sees `context_size` tokens, so keep at most the last
# `context_size` prompt characters. Without the truncation, a longer prompt
# gives a negative pad count and an over-long context that the model rejects.
prompt_ids = [token_to_index[c] for c in prompt][-context_size:]
# Left-pad short prompts with the '[S]' boundary token, as in training.
padding = [token_to_index['[S]']] * (context_size - len(prompt_ids))
for i in range(10):
    out = sample(model, padding + prompt_ids)
    print(prompt + out)

shyanna[S]
serai[S]
soaddon[S]
soadi[S]
sureel[S]
suer[S]
shell[S]
stere[S]
shaeree[S]
saralynneeson[S]
