# Recitation 2: Build and train neural networks by PyTorch
_Date_: 2025-09-11

## Recall from data preprocessing

In [1]:
import torch
from typing import Dict, List, Tuple
from torch.utils.data import Dataset

In [2]:
# Below corpus is generated by AI
corpus = """
The old bookstore on the corner smelled of paper and dust
Have you ever wondered what lies beyond the farthest star
Please hand me the blue folder from the top shelf
Although the weather was cold, we enjoyed our walk along the beach
What an incredible view from the mountaintop
He practiced the piano for an hour every day; his dedication was admirable
The new software update will be installed automatically tonight
She brewed a cup of tea and watched the rain fall outside her window
Innovation often arises from the intersection of different fields of study
The children laughed as the puppy chased its tail in circles
Can we reschedule our meeting for early next week
The project was a success, but there were many challenges along the way
"""

# Generate synthetic labels
labels = ['y', 'n', 'n', 'n', 'y', 'y', 'n', 'n', 'y', 'y', 'n', 'n']

In [3]:
def tokenize(instance: str) -> List[str]:
    """Tokenize a text data instance into a list features"""
    return instance.lower().strip().split()


def build_vocabulary(tokens: List[str], most_common: int) -> Dict[str, int]:
    from collections import Counter

    word_freq = Counter(tokens).most_common(most_common)
    vocab = [word for word, _ in word_freq]
    vocab.extend(['<PAD>', '<UNK>'])

    return {w: i for i, w in enumerate(vocab)}


def build_label_map(labels: List[str]) -> Dict[str, int]:
    label_set = set(labels)

    return {label: i for i, label in enumerate(label_set)}


def to_sparse_vector(instance: str, vocab: Dict[str, int]) -> List[int]:
    """Encode a sentence to a sparse vector by multi-hot encoding"""
    features = tokenize(instance)

    return [1 if elem in set(features) else 0 for elem in vocab]


class CustomDataset(Dataset):
    def __init__(self, corpus: str, labels: List[str], k: int):
        # Step 1: Split corpus into sentences, and tokens
        self.instances = corpus.strip().split('\n')
        self.labels = labels
        tokens = []

        for instance in self.instances:
            tokens.extend(tokenize(instance))

        # Step 2: Build vocabulary and label map
        self.vocab = build_vocabulary(tokens, k)
        self.label_map = build_label_map(labels)

    def __len__(self) -> int:
        return len(self.instances)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        instance = self.instances[idx]
        
        label_val = torch.tensor(self.label_map[self.labels[idx]], dtype=torch.long)
        
        sparse_vec = torch.tensor(to_sparse_vector(instance, self.vocab), dtype=torch.float)

        return sparse_vec, label_val

## Implement logistic regression

### Review: Logistic regression
Start with a linear model for predicting label $y$, where $y \in \cal{Y}$, $$z = \vec{\theta} \cdot f(\vec{x}, y)$$ where $z \in \mathbb{R}$, $\vec{\theta}$ is the weight vector and $f(\vec{x}, y)$ is the feature vector.

However, if we want a model that directly maps the feature vector $f(\vec{x}, y)$ to a probability that falls in the range $[0,1]$?

First, make the scalar score positive by exponentiation, $$z' = \exp \big({\vec{\theta} \cdot f(\vec{x}, y)}\big)$$

Then, normalize it to obtain a probability-like distribution, $$\tilde{z} = \frac{\exp \big({\vec{\theta} \cdot f(\vec{x}, y)}\big)}{\sum_{y' \in \cal{Y}} \exp \big({\vec{\theta} \cdot f(\vec{x}, y')}\big)}$$

* Linear discriminative model

### `torch.nn.Module`

#### Introduction
`nn.Module` is the base class for implementing any module within in a neural network.

In other words, any module, layer, functions that constitutes a neural network is a subclass of `nn.Module`.

#### Implementation
Basically, you need to implement two functions:
* `__init__`: initialize the architecture, parameters for a neural network
* `forward`: the function for forward pass

There're some modules you need to know for PA0:
* `nn.Linear`: apply linear transformation to the input tensor
  * $A \cdot \Theta^{T} = B$
  * Given the input tensor $A$ with shape $(m, n)$ and want the transformed tensor $B$ with shape $(m, k)$, what would be the shape of the $\Theta^{T}$?
  * How to apply it to linear model $\vec{y} = \Theta \cdot \vec{x}$?
* `nn.Softmax`: apply softmax function to an n-dimensional tensor
* `nn.Sequential`: a sequential module container which stores modules in the order they're passed in

#### Modes
A deep learning model acts two modes:
* learning: model adjusts its weights for fitting seen data
* inference: model estimates predicted values based on existing weights

Recall from recitation 0 (@8/28/2025), PyTorch provides an architecture tracking and saving gradients for each parameter in a computational graph. Then it enables `nn.Module` class have "learning" and "inference" modes via instance functions:
* `train()`: enable the model execute certain functions required for training (e.g. dropout)
* `eval()`: disable those certains functions for training


### Exercise: multinomial logistic regression using `nn.Module`

In [4]:
import torch.nn as nn

In [5]:
class LogisticRegression(nn.Module):
    def __init__(self, input_dim: int, out_dim: int):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, out_dim),
            nn.LogSoftmax(dim=0)
        )

    def forward(self, input_tensor):
        return self.model(input_tensor)

In [6]:
corpus_ds = CustomDataset(corpus, labels, 5)

i = 5
sparse_vec, label_val = corpus_ds[i]

model = LogisticRegression(input_dim=len(corpus_ds.vocab), out_dim=len(corpus_ds.label_map))
logits = model(sparse_vec)

logits


tensor([-0.7183, -0.6686], grad_fn=<LogSoftmaxBackward0>)

In [7]:
# loss_func = nn.NLLLoss()
# loss_val = loss_func(logits, label_val)
# print(loss_val.item())

## Optimization loop

* The loop optimizes model parameters given training dataset.
* An optimization loop consists of two sub-loops: train loop and test loop
  * As many other kinds of loops, the optimization loop has several iterations called "epochs"
* Each sub-loop has following components:
  * Model
  * Dataloader
  * Loss/objective function
  * Optimizer

### Pseudocode
```
for batch in dataloader:
    set the model in either train/eval mode
    compute logits y_hat
    compute loss between (y_hat) and (y from batch)

    if train_loop:
        loss function computes current gradients for weights
        optimizer computes new weights based on current weights, current gradients and learning rate
        optimizer resets/updates weights for the model

    if test_loop:
        make prediction by argmax(y_hat)
```
---
### References
* _Optimization loop_: https://docs.pytorch.org/tutorials/beginner/basics/optimization_tutorial.html
* _Auto differentiation_: https://docs.pytorch.org/tutorials/beginner/basics/autogradqs_tutorial.html
* _Understand `optimizer.step()`_: https://medium.com/@whyamit404/what-does-optimizer-step-do-in-pytorch-83f0fb0cbfe5

### Exercise: implement optimization loop

In [7]:
from torch.utils.data import DataLoader, random_split

In [8]:
# Exercise: implement train loop using SGD optimizer and negative log-liklihood loss
# ``nn.NLLLoss``: negative log-likelihood loss
# ``torch.optim.SGD``: SGD optimizer

def train_loop(
    model,
    dataloader,
    optimizer,
    loss_fn
):
    model.train()

    for i, (X, y) in enumerate(dataloader):
        y_hat = model(X)
        loss = loss_fn(y_hat, y)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        print(f"Batch {i+1} loss: {loss.item():.5f}")

In [9]:
# Exercise: implement test loop
# NOTE: use ``torch.no_grad`` context for inference

def test_loop(
    model,
    dataloader,
    loss_fn
):
    model.eval()

    eval_loss = []

    with torch.no_grad():
        for X, y in dataloader:
            y_hat = model(X)
            pred_label = y_hat.argmax(1).item()

            print(f"True: {y.item()} | Prediction: {pred_label}")

In [10]:
epochs = 3
lr = 1e-3

ds = CustomDataset(corpus, labels, 5)
model = LogisticRegression(input_dim=len(ds.vocab), out_dim=len(ds.label_map))

train_ds, test_ds = random_split(ds, [0.8, 0.2])

for i in range(epochs):
    print(f"\nEpoch {i+1}\n{'-' * 30}")
    train_loop(
        model=model,
        dataloader=DataLoader(train_ds, batch_size=2),
        optimizer=torch.optim.SGD(model.parameters(), lr=lr),
        loss_fn=nn.NLLLoss()
    )
    print('---')
    test_loop(
        model=model,
        dataloader=DataLoader(test_ds),
        loss_fn=nn.NLLLoss()
    )



Epoch 1
------------------------------
Batch 1 loss: 0.81180
Batch 2 loss: 0.69355
Batch 3 loss: 0.70306
Batch 4 loss: 0.69465
Batch 5 loss: 0.69315
---
True: 0 | Prediction: 0
True: 1 | Prediction: 0

Epoch 2
------------------------------
Batch 1 loss: 0.81163
Batch 2 loss: 0.69355
Batch 3 loss: 0.70305
Batch 4 loss: 0.69464
Batch 5 loss: 0.69315
---
True: 0 | Prediction: 0
True: 1 | Prediction: 0

Epoch 3
------------------------------
Batch 1 loss: 0.81147
Batch 2 loss: 0.69355
Batch 3 loss: 0.70304
Batch 4 loss: 0.69463
Batch 5 loss: 0.69315
---
True: 0 | Prediction: 0
True: 1 | Prediction: 0
