### Step 1. Importing all important libraries

In [84]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Step 2: Load the data

In [85]:
# Load the MNIST training split from CSV. Later cells treat column 0 as the
# label and the remaining columns as pixel values.
# NOTE(review): absolute path, presumably a Colab runtime; adjust when running elsewhere.
df_train = pd.read_csv("/content/mnist_train.csv")


Training Data + Normalise [0-1]

In [86]:
# Labels live in column 0; every other column is a pixel value.
# Pixels are assumed to be 0..255, so dividing by 255 rescales them to [0, 1] (TODO confirm range).
y_train_np = df_train.iloc[:, 0].to_numpy().astype("int64")
X_train_np = df_train.iloc[:, 1:].to_numpy().astype("float32") / 255.0


Testing Data + Normalise

In [87]:
# Load the test split and apply the same preprocessing as the training data:
# column 0 -> int64 labels, remaining columns -> float32 pixels divided by 255.
df_test = pd.read_csv("/content/mnist_test.csv")
y_test_np  = df_test.iloc[:, 0].to_numpy().astype("int64")
X_test_np  = df_test.iloc[:, 1:].to_numpy().astype("float32") / 255.0

**Convert to "Torch Tensors"**

In [88]:
# Convert the NumPy arrays to torch tensors.
# torch is imported here because this is the first cell that uses it: the
# notebook's other `import torch` statements only appear in later cells, so on
# a fresh kernel (Restart & Run All) this cell would otherwise raise NameError.
import torch

# torch.from_numpy shares memory with the source array (no copy is made).
X_train = torch.from_numpy(X_train_np)          # shape (N, 784)
y_train = torch.from_numpy(y_train_np)          # shape (N,)
X_test  = torch.from_numpy(X_test_np)
y_test  = torch.from_numpy(y_test_np)

In [89]:
# Labels as int64 (the dtype later used for scatter_/gather indexing).
y_train = y_train.long()
y_test  = y_test.long()
# Inputs as a contiguous 2-D (samples, features) layout.
X_train = X_train.view(X_train.size(0), -1).contiguous()
X_test  = X_test.view(X_test.size(0), -1).contiguous()

In [90]:
# Everything in this notebook runs on the CPU.
device = torch.device("cpu")
# Newly created floating-point tensors default to float32.
torch.set_default_dtype(torch.float32)

### Task 2

In [91]:
import torch
import torch.nn.functional as F
import torch.nn as nn
torch.manual_seed(0)  # seed torch's global RNG so later torch.randn / torch.randperm draws are repeatable

<torch._C.Generator at 0x7a3a11b08750>

In [92]:
# Reshape the training inputs to (N, 784). The target shape is passed
# positionally because np.reshape's `newshape=` keyword is deprecated in
# NumPy >= 2.1; the positional form works on both old and new NumPy versions.
X = np.reshape(X_train, (X_train.shape[0], 784))

**Softmax**: transforms a vector of raw model outputs, known as logits, into a probability distribution.
- The raw outputs of a neural network's final layer are often called "logits".
- Those values can range from $-\infty$ to $+\infty$.
- Softmax takes a set of numbers and converts them into probabilities that sum to 1: $\mathrm{softmax}(z)_i = e^{z_i} / \sum_j e^{z_j}$.
- It works on an entire vector of values rather than on each element independently.
- It is typically applied to the final layer of a network designed for multi-class classification.
- Use it when there are multiple possible categories and the model must indicate the probability of each category.


In [93]:
# Quick check that the labels span the expected class ids.
print("label range:", y_train.min().item(), "to", y_train.max().item())


label range: 0 to 9


In [94]:
import csv
import math
import random
from pathlib import Path
import torch
# Seed Python's built-in RNG.
random.seed(0)


# Layer sizes (as specified in the homework):
# input width comes from the data; 100 hidden units; 10 output classes.
in_dim = X_train.size(1)
hidden, out_dim = 100, 10

In [95]:
# Parameters of the two affine layers.
# Weights: standard-normal draws scaled by sqrt(2 / (fan_in + fan_out))
# (Glorot/Xavier-style scaling); biases start at zero.
# .float() keeps every parameter in float32.
W1 = (torch.randn(in_dim, hidden) * math.sqrt(2.0 / (in_dim + hidden))).float()
b1 = torch.zeros(1, hidden).float()
W2 = (torch.randn(hidden, out_dim) * math.sqrt(2.0 / (hidden + out_dim))).float()
b2 = torch.zeros(1, out_dim).float()

**Forward Pass**

In [109]:
def iterate_minibatches(X, y, batch_size=256, shuffle=True):
    """Yield ``(X_batch, y_batch)`` pairs that together cover the data exactly once.

    Args:
        X: Samples, indexable along dim 0 by a 1-D index tensor.
        y: Targets aligned with ``X`` along dim 0.
        batch_size: Positive number of rows per batch; the last batch may be smaller.
        shuffle: If True, rows are visited in a random order drawn from torch's
            global RNG; otherwise in their stored order.

    Yields:
        ``(X[b], y[b])`` for consecutive chunks ``b`` of the row order.

    Raises:
        ValueError: If ``batch_size`` is not positive, or ``X`` and ``y`` have
            different lengths. (Raised when iteration starts, since this is a
            generator.)
    """
    # A negative step would make range() empty and silently yield no batches.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    N = X.shape[0]
    if y.shape[0] != N:
        raise ValueError(f"X and y must have the same length, got {N} and {y.shape[0]}")
    # randperm(N) is already a shuffled arange(N); no extra indexing needed.
    idx = torch.randperm(N) if shuffle else torch.arange(N)
    for s in range(0, N, batch_size):
        b = idx[s:s + batch_size]   # slicing clamps at N, so the tail batch is just shorter
        yield X[b], y[b]

def sigmoid(z):
    """Element-wise logistic function ``1 / (1 + exp(-z))``."""
    denominator = 1.0 + torch.exp(-z)
    return 1.0 / denominator

def sigmoid_deriv(a):
    """Derivative of the sigmoid expressed through its output.

    ``a`` must already be ``sigmoid(z)``; the result is ``a * (1 - a)``.
    """
    complement = 1.0 - a
    return a * complement

# keepdim=True leaves the reduced axis in place (with size 1) instead of
# dropping it, so the per-row statistics broadcast against the (B, C) input.
def softmax(z):
    """Row-wise softmax over dim 1.

    The row maximum is subtracted before exponentiating so large logits do
    not overflow; this does not change the result.
    """
    row_max = z.max(dim=1, keepdim=True).values
    exps = torch.exp(z - row_max)
    row_total = exps.sum(dim=1, keepdim=True)
    return exps / row_total

def accuracy(probs, y):
    """Fraction of rows whose highest-scoring class equals the label.

    Args:
        probs: 2-D scores/probabilities, one row per sample.
        y: Class indices, one per row of ``probs``.

    Returns:
        The mean hit rate as a Python float.

    NOTE(review): a later cell defines another ``accuracy(X, y)`` that runs the
    forward pass itself; once that cell executes it shadows this definition.
    """
    predicted = probs.argmax(dim=1)
    hits = (predicted == y).float()
    return hits.mean().item()

In [97]:
def forward(X):
    """Forward pass of the two-layer network using the module-level W1, b1, W2, b2.

    Args:
        X: Batch of inputs, one row per sample (B rows).

    Returns:
        ``(probs, cache)`` where ``probs`` holds the per-row class
        probabilities and ``cache`` is ``(X, z1, a1, z2, probs)``, the
        intermediates that ``backward`` unpacks.
    """
    hidden_pre = X @ W1 + b1               # (B, hidden) pre-activation
    hidden_act = sigmoid(hidden_pre)       # (B, hidden) sigmoid activation
    logits = hidden_act @ W2 + b2          # (B, classes) raw scores
    class_probs = softmax(logits)          # (B, classes) rows sum to 1
    return class_probs, (X, hidden_pre, hidden_act, logits, class_probs)

# Smoke test on a four-sample batch: the output should have one row per
# sample and every row of probabilities should add up to 1.
with torch.no_grad():
    p, _ = forward(X_train[0:4])
    row_totals = p.sum(dim=1)
    print("forward shapes:", p.shape)     # expect (4, 10)
    print("row sums (should be 1):", row_totals)

forward shapes: torch.Size([4, 10])
row sums (should be 1): tensor([1.0000, 1.0000, 1.0000, 1.0000])


**Cross-entropy** (or some other loss function) is needed in order to train the network.
- The forward pass gives you probabilities p.
- To update the weights, we need to know how wrong the predictions are; that is what the loss measures.
- In classification with a softmax output, the natural choice is the cross-entropy loss.

In [103]:
def one_hot(y, C=10):
    """One-hot encode integer class labels.

    Args:
        y: 1-D tensor of class indices (used as scatter indices).
        C: Number of classes, i.e. the width of the encoding.

    Returns:
        float32 tensor of shape ``(len(y), C)`` on ``y``'s device with a 1.0
        in column ``y[i]`` of row ``i`` and zeros elsewhere.
    """
    n_rows = y.shape[0]
    encoded = torch.zeros((n_rows, C), dtype=torch.float32, device=y.device)
    column_index = y.view(-1, 1)            # (n_rows, 1) column of class ids
    encoded.scatter_(1, column_index, 1.0)
    return encoded

def cross_entropy_from_probs(probs, y_onehot, eps=1e-12):
    """Mean cross-entropy between predicted probabilities and one-hot targets.

    Args:
        probs: Predicted class probabilities, one row per sample.
        y_onehot: One-hot targets with the same shape as ``probs``.
        eps: Lower clamp applied to ``probs`` so ``log`` never sees 0.

    Returns:
        Scalar tensor: the batch average of ``-sum(y * log(p))`` per row.
    """
    safe_probs = torch.clamp(probs, eps, 1.0)
    row_log_lik = (y_onehot * torch.log(safe_probs)).sum(dim=1)
    return -row_log_lik.mean()
def logsumexp(x, dim=-1, keepdim=False):
    """Compute ``log(sum(exp(x)))`` along ``dim``.

    The maximum along ``dim`` is subtracted before exponentiating and added
    back afterwards, which avoids overflow for large inputs.

    Args:
        x: Input tensor.
        dim: Dimension to reduce.
        keepdim: If True the reduced dimension is kept with size 1,
            otherwise it is squeezed out.
    """
    peak = x.max(dim=dim, keepdim=True).values
    shifted_total = torch.exp(x - peak).sum(dim=dim, keepdim=True)
    result = peak + torch.log(shifted_total)
    if keepdim:
        return result
    return result.squeeze(dim)

def cross_entropy_from_logits(logits, y):
    """Mean cross-entropy computed directly from logits.

    Args:
        logits: (B, C) raw scores.
        y: (B,) class indices.

    Returns:
        Scalar tensor: the average negative log-softmax value at the target class.
    """
    log_norm = logsumexp(logits, dim=1, keepdim=True)      # (B, 1) log partition
    log_softmax = logits - log_norm                        # (B, C)
    target_log_prob = log_softmax.gather(1, y.view(-1, 1)).squeeze(1)
    return (-target_log_prob).mean()



In [99]:
# Loss on the first eight training samples with freshly initialised weights.
# For 10 classes a near-uniform prediction gives about ln(10) ≈ 2.30.
with torch.no_grad():
    probs, _ = forward(X_train[:8])
    yoh = one_hot(y_train[:8], C=out_dim)
    loss = cross_entropy_from_probs(probs, yoh)
print(f"loss (sanity): {float(loss):.4f}")

loss (sanity): 2.4021


**Backward Pass**

In [104]:
def backward(cache, y):
    """Gradients of the mean cross-entropy loss w.r.t. W1, b1, W2, b2.

    Args:
        cache: ``(X, z1, a1, z2, probs)`` as returned by ``forward``.
        y: (B,) class indices for the same batch.

    Returns:
        ``(dW1, db1, dW2, db2)`` with the same shapes as the corresponding
        parameters.
    """
    X, _z1, a1, _z2, probs = cache
    B = y.shape[0]
    # Take the class count from the network output rather than hard-coding 10.
    Y = one_hot(y, C=probs.shape[1])     # (B, C)
    # Softmax + cross-entropy: d(loss)/d(logits) = probs - Y; dividing by B
    # matches the batch mean taken in the loss.
    dz2 = (probs - Y) / B                # (B, C)
    dW2 = a1.T @ dz2                     # (hidden, C)
    db2 = dz2.sum(dim=0, keepdim=True)   # (1, C)
    da1 = dz2 @ W2.T                     # (B, hidden)
    # Sigmoid derivative expressed through its output a1.
    dz1 = da1 * (a1 * (1.0 - a1))        # (B, hidden)
    dW1 = X.T @ dz1                      # (in_dim, hidden)
    db1 = dz1.sum(dim=0, keepdim=True)   # (1, hidden)

    return dW1, db1, dW2, db2
# Shape check of the gradients on a 32-sample batch.
probs, cache = forward(X_train[0:32])
grads = backward(cache, y_train[0:32])
print([grad.shape for grad in grads])  # expect [(784,100), (1,100), (100,10), (1,10)]


[torch.Size([784, 100]), torch.Size([1, 100]), torch.Size([100, 10]), torch.Size([1, 10])]


SGD

In [106]:
def step(grads, lr):
    """Apply one plain gradient-descent update to the module-level parameters.

    Args:
        grads: ``(dW1, db1, dW2, db2)`` in the order returned by ``backward``.
        lr: Learning rate multiplying each gradient.

    The parameters W1, b1, W2, b2 are modified in place.
    """
    global W1, b1, W2, b2
    grad_W1, grad_b1, grad_W2, grad_b2 = grads
    with torch.no_grad():
        # The four updates are independent of one another.
        W2 -= lr * grad_W2
        b2 -= lr * grad_b2
        W1 -= lr * grad_W1
        b1 -= lr * grad_b1

Accuracy

In [108]:
# accuracy
@torch.no_grad()
def accuracy(X, y):
    """Classification accuracy of the current network on ``(X, y)``.

    Runs ``forward`` on ``X``, takes the arg-max class per row and returns the
    fraction matching ``y`` as a Python float.

    NOTE(review): this re-uses the name of the earlier ``accuracy(probs, y)``
    helper and replaces it once this cell has run.
    """
    predicted = forward(X)[0].argmax(dim=1)
    correct = (predicted == y).float()
    return correct.mean().item()

# simple minibatch iterator
# NOTE(review): the "Forward Pass" helper cell defines a function with this
# same name; this later definition is the one in effect after the cell runs.
# Keep the two in sync or drop one of them.
def iterate_minibatches(X, y, batch_size=256, shuffle=True):
    """Yield ``(X_batch, y_batch)`` pairs that together cover the data exactly once.

    Args:
        X: Samples, indexable along dim 0 by a 1-D index tensor.
        y: Targets aligned with ``X`` along dim 0.
        batch_size: Positive number of rows per batch; the last batch may be smaller.
        shuffle: If True, rows are visited in a random order drawn from torch's
            global RNG; otherwise in their stored order.

    Yields:
        ``(X[b], y[b])`` for consecutive chunks ``b`` of the row order.

    Raises:
        ValueError: If ``batch_size`` is not positive, or ``X`` and ``y`` have
            different lengths. (Raised when iteration starts, since this is a
            generator.)
    """
    # A negative step would make range() empty and silently yield no batches.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    N = X.shape[0]
    if y.shape[0] != N:
        raise ValueError(f"X and y must have the same length, got {N} and {y.shape[0]}")
    # randperm(N) is already a shuffled arange(N); no extra indexing needed.
    idx = torch.randperm(N) if shuffle else torch.arange(N)
    for s in range(0, N, batch_size):
        b = idx[s:s + batch_size]   # slicing clamps at N, so the tail batch is just shorter
        yield X[b], y[b]

# hyperparameters
lr = 0.05
epochs = 20
batch_size = 256

# Minibatch gradient descent. Each epoch makes one shuffled pass over the
# training set, then reports the epoch's average loss and the accuracy on the
# full train and test sets.
# NOTE: step() updates W1/b1/W2/b2 in place, so re-running this cell continues
# training from the current weights rather than starting over.
print("training 784→100→10 …")
for ep in range(1, epochs + 1):
    # Accumulate the loss per sample (not per batch) so the shorter final
    # batch is not over-weighted in the epoch average.
    total_loss, n_seen = 0.0, 0
    for xb, yb in iterate_minibatches(X_train, y_train, batch_size=batch_size, shuffle=True):
        probs, cache = forward(xb)
        y_onehot = one_hot(yb, C=out_dim)
        loss = cross_entropy_from_probs(probs, y_onehot)
        grads = backward(cache, yb)
        step(grads, lr)
        n_batch = xb.shape[0]
        total_loss += float(loss) * n_batch
        n_seen += n_batch

    tr_acc = accuracy(X_train, y_train)
    te_acc = accuracy(X_test,  y_test)
    print(f"Epoch {ep:02d} | loss={total_loss/max(1,n_seen):.3f} | "
          f"train_acc={tr_acc:.3f} | test_acc={te_acc:.3f}")


training 784→100→10 …
Epoch 01 | loss=0.209 | train_acc=0.941 | test_acc=0.940
Epoch 02 | loss=0.207 | train_acc=0.942 | test_acc=0.940
Epoch 03 | loss=0.205 | train_acc=0.943 | test_acc=0.941
Epoch 04 | loss=0.204 | train_acc=0.943 | test_acc=0.941
Epoch 05 | loss=0.202 | train_acc=0.944 | test_acc=0.942
Epoch 06 | loss=0.201 | train_acc=0.944 | test_acc=0.942
Epoch 07 | loss=0.199 | train_acc=0.945 | test_acc=0.942
Epoch 08 | loss=0.198 | train_acc=0.945 | test_acc=0.943
Epoch 09 | loss=0.197 | train_acc=0.945 | test_acc=0.943
Epoch 10 | loss=0.195 | train_acc=0.945 | test_acc=0.943
Epoch 11 | loss=0.194 | train_acc=0.946 | test_acc=0.944
Epoch 12 | loss=0.193 | train_acc=0.946 | test_acc=0.944
Epoch 13 | loss=0.191 | train_acc=0.946 | test_acc=0.944
Epoch 14 | loss=0.190 | train_acc=0.947 | test_acc=0.944
Epoch 15 | loss=0.189 | train_acc=0.947 | test_acc=0.945
Epoch 16 | loss=0.187 | train_acc=0.947 | test_acc=0.945
Epoch 17 | loss=0.186 | train_acc=0.948 | test_acc=0.946
Epoch 18 