### Neural Probabilistic Language Model, Bengio et al. (2003) 

In [None]:
# =========================
# Neural Probabilistic Language Model 
# Bengio et al. (2003) 
# =========================

# 1. Imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# 2. Config
# Seed PyTorch's global RNG first so that parameter initialisation and
# DataLoader shuffling are repeatable on "Restart Kernel -> Run All".
SEED = 42
torch.manual_seed(SEED)

n = 4        # n-gram order: n-1 = 3 context words are used to predict 1 target word
m = 10       # embedding dimension [m x 1]
h = 16       # hidden layer dimension [h x 1]
epochs = 10  # number of full passes over the training pairs
lr = 0.01    # learning rate handed to the optimizer


### Toy dataset

In [20]:
# -----------------------------
# 3. Toy Corpus (~20 sentences)
# -----------------------------
corpus = [
    # cats and dogs
    "the cat sat down",
    "the cat ate food",
    "the dog sat down",
    "the dog ate food",
    "a cat chased a mouse",
    "the dog chased the cat",
    "a dog barked loudly",
    "the cat meowed softly",
    # birds and fish
    "the bird sang sweetly",
    "a bird flew away",
    "the fish swam fast",
    "a fish jumped high",
    # people
    "the boy played ball",
    "the girl sang song",
    "a boy read book",
    "a girl wrote letter",
    # sky
    "the sun shines bright",
    "the moon glows softly",
    "the stars twinkle bright",
    "a cat slept quietly",
]


# 4. Preprocessing
# Distinct whitespace-separated words over the whole corpus.
tokens = {word for sentence in corpus for word in sentence.split()}

# Ids follow alphabetical order, so the mapping is the same on every run.
word2idx = dict(zip(sorted(tokens), range(len(tokens))))

# Inverse lookup (id -> word), built from word2idx so the two always agree.
idx2word = dict(zip(word2idx.values(), word2idx.keys()))

V = len(word2idx)   # vocabulary size |V|

# make context-target pairs for n-gram model
def make_ngrams(corpus, n, vocab=None):
    """Build (context, target) training pairs from a list of sentences.

    Every window of ``n`` consecutive words in a sentence becomes one
    context, and the word immediately after it becomes the target.
    Sentences with fewer than ``n + 1`` words contribute no pairs.

    Args:
        corpus: iterable of sentences; each is split on whitespace.
        n: number of context words per sample (the caller passes the
            n-gram order minus one).
        vocab: optional word -> index mapping. When omitted, the
            notebook-level ``word2idx`` is used.

    Returns:
        Tuple ``(X, y)`` of ``torch.long`` tensors: ``X`` has shape
        ``[num_samples, n]`` and ``y`` has shape ``[num_samples]``.

    Raises:
        ValueError: if ``n`` is negative.
        KeyError: if a word used in a context or target is not in the vocabulary.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")
    if vocab is None:
        vocab = word2idx

    contexts, targets = [], []
    for sentence in corpus:
        words = sentence.split()
        for start in range(len(words) - n):
            contexts.append([vocab[w] for w in words[start:start + n]])
            targets.append(vocab[words[start + n]])

    if not contexts:
        # torch.tensor([]) would be a float32 tensor of shape (0,); return
        # correctly typed and shaped empty tensors instead.
        return (torch.empty((0, n), dtype=torch.long),
                torch.empty((0,), dtype=torch.long))

    # Explicit dtype: these are indices into an embedding table.
    return (torch.tensor(contexts, dtype=torch.long),
            torch.tensor(targets, dtype=torch.long))

# The model conditions on n-1 words, so that is the context length requested here.
X, y = make_ngrams(corpus, n - 1)


# Summary of the data, followed by the first training pair as ids and as words.
print("Vocabulary size:", V)
print("Number of training samples:", len(X))

first_context = X[0].tolist()
first_target = y[0].item()
print('Example context:', first_context, '-> target:', first_target)
print('Example context words:', [idx2word[i] for i in first_context])
print('Example target word:', idx2word[first_target])


Vocabulary size: 42
Number of training samples: 22
Example context: [39, 9, 30] -> target: 12
Example context words: ['the', 'cat', 'sat']
Example target word: down


### Model architecture and data loader

In [17]:
# 5. Dataset/Dataloader
class NGramDataset(Dataset):
    """Map-style dataset that pairs each context with its target.

    Args:
        X: indexable collection of contexts, one entry per sample.
        y: indexable collection of targets, aligned with ``X``.

    Raises:
        ValueError: if ``X`` and ``y`` do not have the same length.
    """

    def __init__(self, X, y):
        # Fail here rather than with an IndexError (or silent truncation)
        # part-way through an epoch.
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        self.X = X
        self.y = y

    def __len__(self):
        """Number of (context, target) samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(context, target)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]

# Wrap the training pairs and serve them in shuffled mini-batches of 4.
train_dataset = NGramDataset(X, y)
train_loader = DataLoader(dataset=train_dataset, batch_size=4, shuffle=True)


# 6. Model definition 
class NPLM(nn.Module):
    """Feed-forward n-gram language model with a direct context-to-output path.

    For a context of n-1 word ids the logits are

        y = b + W·x + U·tanh(d + H·x)

    where x is the concatenation of the context words' embeddings taken
    from the table C.

    Args:
        V: vocabulary size (embedding rows and output classes).
        m: embedding dimension.
        n: n-gram order; the model consumes n-1 context words.
        h: number of hidden units.
    """

    def __init__(self, V, m, n, h):
        super().__init__()
        context_dim = (n - 1) * m  # length of the concatenated context vector x

        # C: one m-dimensional row per vocabulary word.
        self.C = nn.Embedding(V, m)

        # W: direct connection x -> logits, weight shape [V, (n-1)·m].
        self.W = nn.Linear(context_dim, V, bias=False)

        # H: x -> hidden layer, weight shape [h, (n-1)·m].
        self.H = nn.Linear(context_dim, h, bias=False)

        # U: hidden layer -> logits, weight shape [V, h].
        self.U = nn.Linear(h, V, bias=False)

        # The linear layers carry no bias of their own; b (output, length V)
        # and d (hidden, length h) are kept as explicit zero-initialised parameters.
        self.b = nn.Parameter(torch.zeros(V))
        self.d = nn.Parameter(torch.zeros(h))

    def forward(self, x):
        """Return log-probabilities over the vocabulary for each context.

        Args:
            x: tensor of word ids; the code below treats it as [batch, n-1].

        Returns:
            Tensor [batch, V] of log-probabilities (log-softmax over dim 1),
            i.e. exponentiating a row gives values that sum to 1.
        """
        # Look up C(w) for every context word: [batch, n-1, m].
        embedded = self.C(x)

        # Concatenate the n-1 embeddings of each sample: [batch, (n-1)·m].
        features = embedded.view(len(embedded), -1)

        # Hidden activations tanh(d + H·x): [batch, h].
        hidden = torch.tanh(self.d + self.H(features))

        # Logits b + W·x + U·hidden: [batch, V].
        logits = self.b + self.W(features) + self.U(hidden)

        # Normalise in log space; this is the input format nn.NLLLoss expects.
        return logits.log_softmax(dim=1)

# Build the model from the vocabulary size and the hyper-parameters defined above.
model = NPLM(V=V, m=m, n=n, h=h)

# Print model architecture and parameters
def count_parameters(model):
    """Return the total number of scalar entries across all of ``model``'s parameters."""
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total

# Report the hyper-parameters, then a per-matrix breakdown of the parameter count.
print("\nModel Architecture:")
print("==================")
print(f"Vocabulary size (V): {V}")
print(f"Context size (n-1): {n-1}")
print(f"Embedding dim (m): {m}")
print(f"Hidden dim (h): {h}")
print("\nModel Parameters:")
print("==================")
print(f"Embedding matrix (C): {V*m:,d}")
print(f"Context matrix (W): {V*(n-1)*m:,d}")
print(f"Hidden matrix (H): {h*(n-1)*m:,d}")
print(f"Output matrix (U): {V*h:,d}")
print(f"Biases (b, d): {V + h:,d}")
print(f"Total parameters: {count_parameters(model):,d}")

# The breakdown is computed from formulas while the total is read from the
# model itself; check that they agree so the report cannot silently go stale.
expected_total = V*m + V*(n-1)*m + h*(n-1)*m + V*h + (V + h)
assert expected_total == count_parameters(model), (
    f"parameter breakdown sums to {expected_total}, "
    f"but the model has {count_parameters(model)} parameters"
)


Model Architecture:
Vocabulary size (V): 42
Context size (n-1): 3
Embedding dim (m): 10
Hidden dim (h): 16

Model Parameters:
Embedding matrix (C): 420
Context matrix (W): 1,260
Hidden matrix (H): 480
Output matrix (U): 672
Biases (b, d): 58
Total parameters: 2,890


### Training loop

In [18]:
# 7. Training loop
# NLLLoss takes log-probabilities, which is what the model's forward pass
# returns (it ends in log_softmax).
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

log_every = 1  # print the accumulated loss every `log_every` epochs

# NOTE: this cell updates the existing `model` in place, so running it again
# continues training from the current weights (with a fresh optimizer).
model.train()  # explicit training mode, in case the model was switched to eval earlier

for epoch in range(epochs):
    # Sum of the per-batch mean losses over one pass through train_loader.
    total_loss = 0.0
    for context, target in train_loader:
        optimizer.zero_grad()            # clear gradients from the previous step
        output = model(context)          # [batch, V] log-probabilities
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    if (epoch + 1) % log_every == 0:
        print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")


Epoch 1, Loss: 23.0056
Epoch 2, Loss: 14.9684
Epoch 3, Loss: 9.8470
Epoch 4, Loss: 6.3263
Epoch 5, Loss: 3.8484
Epoch 6, Loss: 2.4036
Epoch 7, Loss: 1.6115
Epoch 8, Loss: 0.9805
Epoch 9, Loss: 0.6869
Epoch 10, Loss: 0.4971


### Probability of next word

In [19]:
# -----------------------------
# 8. Prediction Function
# -----------------------------
def predict_next(context_words, k=5, *, net=None, vocab=None):
    """Return the ``k`` most probable next words for a context.

    Args:
        context_words: sequence of context words. The number of words must
            match what the model's input layers were built for.
        k: number of candidates to return; capped at the vocabulary size.
        net: model mapping a ``[1, len(context_words)]`` tensor of word ids
            to one row of scores over the vocabulary. Defaults to the
            notebook-level ``model``.
        vocab: word -> index mapping. Defaults to the notebook-level
            ``word2idx`` (with ``idx2word`` as its inverse).

    Returns:
        List of ``(word, probability)`` tuples, most probable first.

    Raises:
        KeyError: if any context word is missing from the vocabulary.
    """
    if net is None:
        net = model
    if vocab is None:
        vocab, index_to_word = word2idx, idx2word
    else:
        index_to_word = {index: word for word, index in vocab.items()}

    # Report every unknown word at once instead of failing on the first.
    unknown = [w for w in context_words if w not in vocab]
    if unknown:
        raise KeyError(f"words not in vocabulary: {unknown}")

    # Explicit dtype: these are indices into the embedding table.
    context_idxs = torch.tensor([[vocab[w] for w in context_words]], dtype=torch.long)

    # Run in eval mode without gradients, then restore the previous mode so
    # calling this between training runs has no lasting effect.
    was_training = net.training
    net.eval()
    try:
        with torch.no_grad():
            output = net(context_idxs)
            # softmax is unchanged by the constant shift log_softmax applies,
            # so this recovers probabilities from log-probabilities.
            probs = torch.softmax(output, dim=1)
            # topk raises if k exceeds the number of classes.
            top_probs, top_indices = torch.topk(probs, min(k, probs.size(1)), dim=1)
    finally:
        net.train(was_training)

    # Pair each predicted index with its word and probability.
    return [
        (index_to_word[index], prob)
        for index, prob in zip(top_indices[0].tolist(), top_probs[0].tolist())
    ]

# Test a few predictions
# Contexts to probe: the first three open sentences from the corpus, the last
# is a word combination that does not occur in it.
test_contexts = [
    ["the", "cat", "sat"],
    ["a", "cat", "chased"],
    ["a", "dog", "barked"],
    ["the", "stars", "glows"],
]

# Print the ranked candidates for each context in turn.
for context in test_contexts:
    predictions = predict_next(context)
    print(f"\nContext: {context}")
    for i, (word, prob) in enumerate(predictions, start=1):
        print(f"  {i}. '{word}' ({prob:.2f})")


Context: ['the', 'cat', 'sat']
  1. 'down' (0.92)
  2. 'softly' (0.04)
  3. 'food' (0.01)
  4. 'ball' (0.01)
  5. 'sweetly' (0.01)

Context: ['a', 'cat', 'chased']
  1. 'a' (0.94)
  2. 'the' (0.02)
  3. 'quietly' (0.01)
  4. 'high' (0.01)
  5. 'softly' (0.01)

Context: ['a', 'dog', 'barked']
  1. 'loudly' (0.94)
  2. 'a' (0.01)
  3. 'letter' (0.01)
  4. 'away' (0.01)
  5. 'mouse' (0.01)

Context: ['the', 'stars', 'glows']
  1. 'softly' (0.38)
  2. 'fast' (0.17)
  3. 'bright' (0.14)
  4. 'ball' (0.07)
  5. 'food' (0.04)
