# COM2028 Coursework 25/26

**Topics:**  
1. Linear algebra (under/overdetermined systems)  
2. Probability
3. Linear regression  
4. Logistic regression
5. Unsupervised learning – K-means clustering 
6. Classification – Convolutional Neural Networks (PyTorch)

**Instructions**
- Implement code only in **solution** cells marked with `# YOUR CODE HERE` and `raise NotImplementedError()`.
- Do not change function signatures or make modifications on locked cells.
- **DO NOT ERASE OR CREATE CELLS**.
- All datasets are generated **inside the notebook**.
- The autograder will re-generate **hidden datasets** with different seeds/parameters to prevent hardcoding.
- **Public tests** are visible.
- **Hidden tests** will run on different seeds, parameters, and augmentation.
- **Do not** import extra libraries other than those already imported.
- **Do not** load or save external files—everything must run in this notebook.
- **Reproducibility:** Use provided RNG helpers.
- **Do not** set global seeds in your solutions.

**Academic integrity:** The design includes randomized hidden tests, gradient checks, and banned import checks to detect shortcuts.

In [None]:
# DO NOT MODIFY: Imports, utilities, and global configuration
import math, sys, os, inspect
import numpy as np
import matplotlib.pyplot as plt
from dataclasses import dataclass
from typing import Tuple, Callable, Dict, Any
from PIL import Image, ImageDraw

np.set_printoptions(precision=5, suppress=True)

# Substring blacklist applied to the source text of student functions.
# This is a cheap heuristic, not a security boundary: it also matches tokens
# that appear only in comments or docstrings, and it cannot see aliased imports.
BANNED_TOKENS = {
    'global': ['sklearn', 'tensorflow', 'keras', 'scikit-image'],
    'q1': [
        'np.linalg.pinv',
        'np.linalg.lstsq',
        'scipy.linalg.lstsq',
        'scipy.linalg.pinv',
        'np.linalg.svd',
        'np.linalg.solve',
    ],
}

def _check_banned(func, scope='global'):
    """Raise AssertionError if the source of ``func`` contains a banned token.

    Parameters
    ----------
    func : callable
        Function whose source text is scanned with a plain substring search.
    scope : str
        Key into BANNED_TOKENS; its tokens are checked first, followed by the
        'global' tokens. An unknown scope falls back to the 'global' list only.

    Returns
    -------
    None
        Also returned silently when the source cannot be retrieved.
    """
    try:
        source_text = inspect.getsource(func)
    except Exception:
        # Best effort: built-ins and dynamically created callables have no source.
        return
    candidates = BANNED_TOKENS.get(scope, []) + BANNED_TOKENS['global']
    offending = next((tok for tok in candidates if tok in source_text), None)
    if offending is not None:
        raise AssertionError(f"Use of banned token '{offending}' detected in {func.__name__}.")

# Test helpers
def rel_error(x, y, eps=1e-12):
    """Return the symmetric relative error ||x - y|| / (||x|| + ||y|| + eps).

    ``eps`` keeps the ratio finite when both inputs have zero norm.
    """
    norm = np.linalg.norm
    difference = norm(x - y)
    scale = norm(x) + norm(y) + eps
    return difference / scale

def numeric_grad(f: Callable[[np.ndarray], float], x: np.ndarray, eps: float = 1e-6) -> np.ndarray:
    """Estimate the gradient of ``f`` at ``x`` with central finite differences.

    Parameters
    ----------
    f : callable
        Scalar-valued function of an array with the same shape as ``x``.
    x : array-like
        Point at which to differentiate. A float64 ndarray is perturbed in
        place one entry at a time and restored after each evaluation; any
        other input is first converted to a float64 copy, so the caller's
        object is never touched.
    eps : float
        Half-width of the finite-difference step.

    Returns
    -------
    grad : float64 ndarray with the shape of ``x``
        Entry i is (f(x + eps*e_i) - f(x - eps*e_i)) / (2*eps).
    """
    # Work in float64: with an integer array the "+ eps" would be truncated on
    # write-back and the resulting gradient would be identically zero.
    point = np.asarray(x, dtype=np.float64)
    grad = np.zeros(point.shape, dtype=np.float64)
    for i in range(point.size):
        old = point.flat[i]
        try:
            point.flat[i] = old + eps
            f_plus = f(point)
            point.flat[i] = old - eps
            f_minus = f(point)
        finally:
            # Restore the entry even if f raises, so x is never left perturbed.
            point.flat[i] = old
        grad.flat[i] = (f_plus - f_minus) / (2 * eps)
    return grad

# RNG helper to avoid students overriding global state
@dataclass
class RNG:
    """Seeded sampling helper that never touches NumPy's global random state.

    Every method builds a brand-new ``np.random.default_rng(seed)`` before
    sampling, so each call starts from the beginning of the same stream.

    NOTE(review): as a consequence, repeated calls with the same arguments
    return identical values, and draws from different methods are not
    independent of each other. Confirm this is the intended contract.
    """
    # Seed handed to np.random.default_rng on every call.
    seed: int

    def _fresh(self):
        """Return a new Generator seeded with ``self.seed``."""
        return np.random.default_rng(self.seed)

    def rand(self, *shape):
        """Uniform samples in [0, 1) with the given shape."""
        return self._fresh().random(shape)

    def normal(self, *shape, mean=0.0, std=1.0):
        """Gaussian samples with the given shape, mean and standard deviation."""
        return self._fresh().normal(mean, std, size=shape)

    def integers(self, low, high=None, size=None):
        """Random integers; arguments are forwarded to Generator.integers."""
        return self._fresh().integers(low, high, size=size)

    def choice(self, a, size=None, replace=True):
        """Random selection from ``a``; arguments are forwarded to Generator.choice."""
        return self._fresh().choice(a, size=size, replace=replace)


: 

## Q1. Linear algebra: under-/overdetermined solutions of system of equations (20 marks)

**Task: Implement**

- Underdetermined (m < n) solution to system of equations $\mathbf{A x} = \mathbf{y}$.
- Overdetermined (m > n) solution to system of equations $\mathbf{A x} \approx \mathbf{y}$.
  
**Restriction (Q1): You cannot use direct solvers np.linalg.pinv, np.linalg.lstsq, np.linalg.solve, np.linalg.svd.**

In [None]:
# DO NOT MODIFY: Q1 data generators

def gen_underdetermined(m=10, n=15, seed=0):
    """Build a noise-free system A x = y with a random (m, n) Gaussian matrix.

    Returns ``(A, y, x_true)`` where ``y = A @ x_true`` exactly. The matrix is
    drawn before ``x_true`` from one Generator seeded with ``seed``.
    """
    gen = np.random.default_rng(seed)
    A = gen.normal(size=(m, n))
    x_true = gen.normal(size=(n,))
    return A, A @ x_true, x_true

def gen_overdetermined(m=80, n=5, noise=0.1, seed=1):
    """Build a noisy system A x ≈ y with a random (m, n) Gaussian matrix.

    Returns ``(A, y, x_true)`` where ``y = A @ x_true`` plus zero-mean Gaussian
    noise of standard deviation ``noise``. Draw order from the seeded
    Generator is: matrix, true solution, noise.
    """
    gen = np.random.default_rng(seed)
    A = gen.normal(size=(m, n))
    x_true = gen.normal(size=(n,))
    clean = A @ x_true
    perturbation = gen.normal(scale=noise, size=(m,))
    return A, clean + perturbation, x_true


### Q1.1 Implement underdetermined(A, y) returning solution x that satisfies $\mathbf{A x} = \mathbf{y}$ ($m<n$) (10 marks)

**Hint: Look at solution from the lecture slides, but DO NOT call direct solvers np.linalg.pinv, np.linalg.lstsq, np.linalg.svd, np.linalg.solve directly.**

In [None]:
def underdetermined(A: np.ndarray, y: np.ndarray) -> np.ndarray:
    """Minimum-norm solution of A x = y for a wide system (m < n).

    Evaluates x = A^T (A A^T)^{-1} y with an explicit inverse of the (m, m)
    Gram matrix, so it relies on A A^T being invertible.

    Parameters
    ----------
    A : (m, n) np.ndarray
    y : (m,) np.ndarray

    Returns
    -------
    x : (n,) np.ndarray
        The minimum-norm solution.
    """

    # YOUR CODE HERE
    gram_inverse = np.linalg.inv(A @ A.T)
    multipliers = gram_inverse @ y

    # Map the m multipliers back into the n-dimensional solution space.
    return A.T @ multipliers


In [None]:
# Q1.1 Public Tests (2 marks)
# Compares the student's solution with an SVD-based pseudo-inverse reference
# on an 8x12 system generated with seed 42.

A, y, _ = gen_underdetermined(8, 12, 42)

def ref_min_norm(A, y):
    # Reference minimum-norm solution V diag(1/s) U^T y; singular values at or
    # below 1e-12 are treated as zero.
    U, s, Vt = np.linalg.svd(A, full_matrices=False)
    s_inv = np.array([1/si if si>1e-12 else 0.0 for si in s])
    return Vt.T @ np.diag(s_inv) @ U.T @ y

x = underdetermined(A, y)
# The scan covers the whole function source, including comments and docstrings.
_check_banned(underdetermined, scope='q1')
ref = ref_min_norm(A, y)
assert x.shape == (12,)
assert rel_error(x, ref) < 1e-7
print("Passed tests: +2 marks")

In [None]:
# Q1.1 Hidden Tests (4 marks)




In [None]:
# Q1.1 Hidden Tests (4 marks)


### Q1.2 Implement overdetermined(A, y) returning solution x that satisfies $\mathbf{A x} \approx \mathbf{y}$  ($m>n$) (10 marks).

**Hint: Look at solution from the lecture slides, but DO NOT call direct solvers np.linalg.pinv, np.linalg.lstsq, np.linalg.svd, np.linalg.solve directly.**

In [None]:
def overdetermined(A: np.ndarray, y: np.ndarray) -> np.ndarray:
    """Least-squares solution of A x ≈ y for a tall system (m > n).

    Evaluates the normal-equation formula x = (A^T A)^{-1} A^T y with an
    explicit inverse of the (n, n) matrix A^T A, so it relies on A^T A being
    invertible.

    Parameters
    ----------
    A : (m, n) np.ndarray
    y : (m,) np.ndarray

    Returns
    -------
    x : (n,) np.ndarray
        The minimiser of ||A x - y||_2.
    """
    # YOUR CODE HERE
    normal_matrix = A.T @ A
    rhs = A.T @ y

    return np.linalg.inv(normal_matrix) @ rhs
  

In [None]:
# Q1.2 Public Tests (2 marks)
# Compares the student's solution with a lightly regularised normal-equation
# reference on a 120x6 system (noise 0.05, seed 7).
A, y, _ = gen_overdetermined(120, 6, 0.05, 7)
x = overdetermined(A, y)
_check_banned(overdetermined, scope='q1')
# The 1e-4 ridge term makes the reference slightly biased; the 1e-3 tolerance
# below is loose enough to absorb it.
ATA = A.T @ A + 1e-4*np.eye(6)
ATy = A.T @ y
ref = np.linalg.solve(ATA, ATy)
assert rel_error(x, ref) < 1e-3
print("Passed tests: +2 marks")

In [None]:
# Q1.2 Hidden Tests (4 marks)


In [None]:
# Q1.2 Hidden Tests (4 marks)


## Q2. Probability: Conditional Probability and Bayes' Theorem (5 marks)

A factory has two machines, **M1** and **M2**, producing widgets:
- Machine M1 produces P(M1) of all widgets, and Machine M2 produces P(M2).
- M1 produces defective widgets P(defective | M1) of the time, while M2 produces defective widgets P(defective | M2) of the time.

We pick a widget at random and observe that it is defective.

**Task:**
1. Write a function `bayes_defective(p_m1, p_m2, p_def_m1, p_def_m2)` that returns the probability that the defective widget came from Machine M1 given:
   - `p_m1`: prior probability of M1
   - `p_m2`: prior probability of M2
   - `p_def_m1`: P(defective | M1)
   - `p_def_m2`: P(defective | M2)

**Hint:**  
$$
P(M_1 \mid D) = \frac{P(D \mid M_1) P(M_1)}{P(D \mid M_1) P(M_1) + P(D \mid M_2) P(M_2)}
$$

**Restriction (Q2): Do not import/call sklearn, tensorflow, keras, scikit-image.**

In [None]:
def bayes_defective(p_m1: float, p_m2: float, p_def_m1: float, p_def_m2: float) -> float:
    """
    Compute the posterior probability that a defective item came from machine 1
    using Bayes' theorem:

        P(M1 | D) = P(D | M1) P(M1) / (P(D | M1) P(M1) + P(D | M2) P(M2))

    Parameters
    ----------
    p_m1 : float
        Prior probability that an item is produced by machine 1.
    p_m2 : float
        Prior probability that an item is produced by machine 2.
    p_def_m1 : float
        Probability that an item is defective given it was produced by machine 1.
    p_def_m2 : float
        Probability that an item is defective given it was produced by machine 2.

    Returns
    -------
    float
        Posterior probability that a defective item was produced by machine 1.
    """
    # YOUR CODE HERE
    # (placeholder: no implementation yet, so the function currently returns None)

In [None]:
# Q2 Public Tests (1 mark)
_check_banned(bayes_defective)
# Expected value: 0.01*0.6 / (0.01*0.6 + 0.02*0.4) = 0.006 / 0.014 ≈ 0.42857
p = bayes_defective(0.6, 0.4, 0.01, 0.02)
assert abs(p - 0.42857) < 1e-4, f"Expected ~0.42857, got {p}"
print("Passed tests: +1 mark")

In [None]:
# Q2 Hidden Tests (4 marks)


## Q3. Linear regression (20 marks)

- We provide code to generate synthetic data for the linear regression problem

**Task: implement**
- closed-form ridge and
- gradient descent (supporting both full-batch and mini-batch updates).

**Restriction (Q3): Do not import/call sklearn, tensorflow, keras, scikit-image.**

In [None]:
# DO NOT MODIFY: Q3 data generator
def gen_linreg(n_samples=500, n_features=10, noise=0.5, seed=0):
    """Generate a synthetic linear-regression problem.

    Returns ``(X, y, w_true)`` with Gaussian features X of shape
    (n_samples, n_features) and ``y = X @ w_true`` plus zero-mean Gaussian
    noise of standard deviation ``noise``. Draw order from the seeded
    Generator is: features, true weights, noise.
    """
    gen = np.random.default_rng(seed)
    X = gen.normal(size=(n_samples, n_features))
    w_true = gen.normal(size=(n_features,))
    signal = X @ w_true
    residual = gen.normal(scale=noise, size=(n_samples,))
    return X, signal + residual, w_true


### Q3.1 — Closed-form ridge regression (10 marks)

**Task.** Implement closed-form ridge regression.  
You should write a function that takes a matrix `X` (shape `n x d`), targets `y` (`n`), and a regularization strength `reg` (≥ 0), and returns the fitted weight vector `w` (shape `d`). 

**Notes / requirements**
- Use the standard closed-form solution:
  $$\mathbf{w} = (\mathbf{X}^\top \mathbf{X} + \lambda \mathbf{I})^{-1}\mathbf{X}^\top \mathbf{y},$$

Implement the function in the code cell below.

In [None]:
def ridge_closed_form(X: np.ndarray, y: np.ndarray, reg: float = 0.0) -> np.ndarray:
    """
    Closed form ridge regression: w = (X^T X + reg * I)^{-1} X^T y.

    Parameters
    ----------
    X : (n, d) design matrix
    y : (n,) target vector
    reg : L2 regularization strength (lambda), >= 0

    Returns
    -------
    w : (d,) learned weights
    """
    # YOUR CODE HERE
    # (placeholder: no implementation yet, so the function currently returns None)
 

In [None]:
# Q3.1 Public Tests (2 marks)
# Compares the student's closed form with a direct solve of the same
# regularised normal equations (800 samples, 12 features, reg = 1e-3).
X, y, _ = gen_linreg(800, 12, 0.1, 10)
w_cf = ridge_closed_form(X, y, reg=1e-3)
_check_banned(ridge_closed_form)
ref = np.linalg.solve(X.T @ X + 1e-3*np.eye(X.shape[1]), X.T @ y)
assert w_cf.shape == (X.shape[1],)
assert rel_error(w_cf, ref) < 1e-7
print("Passed tests: +2 marks")

In [None]:
# Q3.1 Hidden Tests (4 marks)
    

In [None]:
# Q3.1 Hidden Tests (4 marks)

### Q3.2 — Gradient Descent Regression (10 marks)

**Task.** In this question, you will implement linear regression trained by gradient descent, with support for **optional L2 regularization (ridge regression)**.  
Unlike Q3.1, which relied on the closed-form solution, here you will explicitly optimize the loss function using iterative updates.

**Requirements**
- Implement a function that fits a linear regression model to data `X, y` using **full-batch gradient descent (GD)** or **mini-batch stochastic gradient descent (SGD)**.  
- The objective function is:
  $$\mathcal{L}(\mathbf{w}) = \frac{1}{2n}\|\mathbf{X}\mathbf{w} - \mathbf{y}\|^2_2 + \frac{\lambda}{2}\|\mathbf{w}\|^2_2,$$
  where $\lambda \geq 0$ controls the strength of L2 regularization (set $\lambda = 0$ for plain linear regression).
- In this function, the gradient is defined by
  $$\nabla_w \mathcal{L}(\mathbf{w}) = \frac{1}{n}\mathbf{X}^{\top}(\mathbf{X}\mathbf{w}-\mathbf{y})+\lambda\mathbf{w},$$
  and the update rule for GD and SGD is
  $$\mathbf{w} \leftarrow \mathbf{w} - \eta \nabla_w \mathcal{L}(\mathbf{w}),$$ where $\eta$ is the learning rate.
- Your implementation should:
  - Accept hyperparameters such as learning rate, number of epochs, batch size, and regularization strength.
  - Return both the learned weights and the training loss history.
  - Work for full-batch GD, or mini-batch SGD regimes (depending on batch size).

**Hints**
- Vectorize your gradient computations; avoid explicit Python loops over samples.
- Test your implementation on small synthetic data to ensure that the loss decreases over iterations and that the solution approaches the closed-form result from Q3.1 when using full-batch gradient descent with a small learning rate and many iterations.

**Restriction (Q3): Do not import/call sklearn, tensorflow, keras, scikit-image.**

In [None]:
def linreg_gd(X: np.ndarray, y: np.ndarray, lr: float = 0.1, reg: float = 0.0,
              epochs: int = 200, batch_size: int | None = None,
              w0: np.ndarray | None = None, seed: int = 0) -> Tuple[np.ndarray, dict]:
    """
    Linear regression with optional L2 regularization via gradient descent / mini-batch SGD.

    Objective (as stated in the task description):
        L(w) = (1 / (2n)) * ||X w - y||^2 + (reg / 2) * ||w||^2
    Gradient:
        grad L(w) = (1 / n) * X^T (X w - y) + reg * w
    Update:
        w <- w - lr * grad L(w)

    Parameters
    ----------
    X : (n, d) design matrix
    y : (n,) target vector
    lr : learning rate
    reg : L2 regularization strength (lambda)
    epochs : number of passes over the training set
    batch_size : if None, use full-batch GD; else mini-batch SGD with given batch size
    w0 : optional initial weights (d,)
    seed : RNG seed for batch shuffling

    Returns
    -------
    w : (d,) learned weights
    history : dict with key 'loss' containing per-epoch full-data loss values
    """
    # YOUR CODE HERE
    # (placeholder: no implementation yet, so the function currently returns None)

In [None]:
# Q3.2 Public Tests (2 marks)
# Checks that full-batch gradient descent lands near the closed-form ridge
# solution on a small problem (20 samples, 9 features).
_check_banned(linreg_gd)
X, y, _ = gen_linreg(20, 9, 0.3, 10)
w_gd, hist = linreg_gd(X, y, lr=0.2, reg=1e-3, epochs=200, batch_size=None, seed=0)
# NOTE(review): GD runs with reg=1e-3 but the reference below uses reg=1e-2.
# Under the stated GD objective (1/(2n))||Xw-y||^2 + (reg/2)||w||^2 the
# minimiser solves (X^T X + n*reg*I) w = X^T y, i.e. the matching closed-form
# value would be n*reg = 20 * 1e-3 = 2e-2. Confirm the intended value; the
# 1e-2 tolerance may be masking the mismatch.
w_cf = ridge_closed_form(X, y, reg=1e-2)
assert 'loss' in hist and len(hist['loss']) >= 50
assert rel_error(w_gd, w_cf) < 1e-2
print("Passed tests: +2 marks")

In [None]:
# Q3.2 Hidden Tests (4 marks)


In [None]:
# Q3.2 Hidden Tests (4 marks)


## Q4. Logistic regression (25 marks)

In this question, you will implement logistic regression from scratch, including all the core components needed for binary classification:
- The sigmoid function to map linear scores to probabilities.
- The logistic loss function (cross-entropy) with L2 regularization for weight decay.
- The gradient of the loss with respect to model parameters.
- A training loop that uses gradient descent to optimize the weights.

You will also perform numerical gradient checks to validate your implementation and ensure correctness.

**Important restrictions**:
- **Restriction (Q4): Do not import/call sklearn, tensorflow, keras, scikit-image, or any pre-built logistic regression functions.**
- You may use NumPy for vectorized operations.
- Your implementation should be efficient and handle reasonably sized datasets.

In [None]:
# DO NOT MODIFY: Q4 data generator
def gen_logreg(n_pos=400, n_neg=400, dim=3, separation=1.5, seed=0):
    """Generate a shuffled two-class Gaussian dataset for logistic regression.

    The positive class is centred at a random mean drawn with standard
    deviation ``separation``; the negative class is centred at its mirror
    image. Both classes use identity covariance.

    Returns ``(X, y)`` with X of shape (n_pos + n_neg, dim) and float labels
    y in {1.0, 0.0}, permuted with the same seeded Generator.
    """
    gen = np.random.default_rng(seed)
    centre = gen.normal(scale=separation, size=(dim,))
    identity = np.eye(dim)
    # Draw order matters for reproducibility: positives, negatives, permutation.
    positives = gen.multivariate_normal(centre, identity, size=n_pos)
    negatives = gen.multivariate_normal(-centre, identity, size=n_neg)
    X = np.vstack([positives, negatives])
    y = np.concatenate([np.ones(n_pos), np.zeros(n_neg)])
    order = gen.permutation(X.shape[0])
    return X[order], y[order]


### Q4.1 Implementation of the sigmoid function (2 marks)

The sigmoid function is a key building block in logistic regression and many neural network models. It maps any real-valued input $\mathbf{z}$ to a value in the range $(0,1)$, making it suitable for representing probabilities:
$$ \sigma(\mathbf{z}) = \frac{1}{1 + \exp(-\mathbf{z})}$$
Your task is to implement the sigmoid function in a numerically stable way.

Why is this important? For large positive or negative values of $\mathbf{z}$, the naive implementation can cause overflow or underflow in the exponential calculation. For example:
- If $\mathbf{z}$ is very large (e.g., 1000), $e^{-\mathbf{z}}$ underflows to 0.
- If $\mathbf{z}$ is very negative (e.g., -1000), $e^{-\mathbf{z}}$ overflows to infinity.

To avoid these issues, you should:
- Use conditional logic or algebraic tricks to keep values in a safe range.
- Ensure the function works for scalars and NumPy arrays.

Expected behavior:
- Input: NumPy array of any shape.
- Output: same shape, with values in $(0,1)$.


In [None]:
def sigmoid(z: np.ndarray) -> np.ndarray:
    """
    Compute the sigmoid function σ(z) = 1 / (1 + exp(-z)) in a numerically stable way.

    "Stable" here means no overflow for large |z|: exp(-z) overflows for very
    negative z, so the naive formula should not be evaluated there directly.

    Parameters
    ----------
    z : np.ndarray
        Input array (can be scalar, vector, or matrix).

    Returns
    -------
    np.ndarray
        Element-wise sigmoid values in the range (0, 1), same shape as ``z``.
    """
    # YOUR CODE HERE
    # (placeholder: no implementation yet, so the function currently returns None)

In [None]:
# Q4.1 Public Tests (2 marks)
_check_banned(sigmoid)
# X, y are not used by the assertions in this cell.
X, y = gen_logreg(300, 300, 4, 2.0, 2)
# The extreme inputs (+/-100) only need to stay inside [0, 1]; the midpoint
# must be exactly one half up to 1e-8.
z = np.array([-100.0, -1.0, 0.0, 1.0, 100.0])
s = sigmoid(z)
assert np.all(s >= 0) and np.all(s <= 1)
assert np.isclose(s[2], 0.5, atol=1e-8)
print("Passed tests: +2 marks")

### Q4.2: Logistic Loss with L2 Regularization and Gradient (12 marks)

In this part, you will implement the **binary logistic regression loss** (average cross-entropy) with **L2 regularization** and its **gradient** with respect to the model parameters.

#### Model and Notation
Given a dataset of $n$ examples with features $\mathbf{x}_i \in \mathbb{R}^d$ and labels $\mathbf{y}_i \in \{0,1\}$, the logistic regression model predicts:
$$
\mathbf{z}_i = \mathbf{w}^\top \mathbf{x}_i, \qquad \hat{\mathbf{y}}_i = \sigma(\mathbf{z}_i) = \frac{1}{1 + e^{-\mathbf{z}_i}}.
$$

#### Loss (average negative log-likelihood)
The **average logistic loss** with **L2 regularization** is:
$$
\mathcal{L}(\mathbf{w}) = -\frac{1}{n}\sum_{i=1}^{n}\Big[\mathbf{y}_i \log \hat{\mathbf{y}}_i + (1-\mathbf{y}_i)\log(1-\hat{\mathbf{y}}_i)\Big] \;+\; \frac{\lambda}{2}\,\lVert \mathbf{w} \rVert_2^2.
$$

A numerically **stable** and equivalent expression (recommended for implementation) is:
$$
\mathcal{L}(\mathbf{w}) \;=\; \frac{1}{n}\sum_{i=1}^{n}\Big(\operatorname{softplus}(\mathbf{z}_i) - \mathbf{y}_i \mathbf{z}_i\Big) \;+\; \frac{\lambda}{2}\,\lVert \mathbf{w} \rVert_2^2,
$$
where $\operatorname{softplus}(\mathbf{z}) = \log\big(1 + e^{\mathbf{z}}\big)$. 

#### Gradients
Let $ \hat{\mathbf{y}} = \sigma(\mathbf{z}) $. The gradients of the loss are:
$$
\nabla_{\mathbf{w}} \mathcal{L} \;=\; \frac{1}{n} \mathbf{X}^\top (\hat{\mathbf{y}} - \mathbf{y}) \;+\; \lambda \mathbf{w}.
$$

#### Requirements & I/O
- Inputs:
  - `X`: shape `(n, d)` — feature matrix (NumPy array).
  - `y`: shape `(n,)` with values in `{0, 1}` — labels.
  - `w`: shape `(d,)` — weight vector.
  - `reg` (the L2 strength $\lambda$): non-negative scalar.
- Outputs:
  - `loss`: scalar float — the average regularized loss.
  - `grad_w`: shape `(d,)` — gradient w.r.t. `w`.

#### Numerical Stability Tips
- If you compute via probabilities, clip with a small epsilon (e.g., `p = np.clip(p, 1e-12, 1-1e-12)`) before taking logs to avoid `log(0)`.
- Vectorize computations; avoid Python loops for performance.

In [None]:
def logreg_loss_grad(w: np.ndarray, X: np.ndarray, y: np.ndarray, reg: float = 0.0) -> Tuple[float, np.ndarray]:
    """
    Compute the binary logistic regression average loss with L2 regularization and its gradient.

    Loss:
        L(w) = (1/n) * sum_i [ softplus(z_i) - y_i * z_i ] + reg * ||w||^2
        where z = X @ w, softplus(t) = log(1 + exp(t)) computed stably.

    Gradient:
        ∇L(w) = (1/n) * X^T (σ(z) - y) + 2 * reg * w
        where σ(z) = 1 / (1 + exp(-z)) computed stably.

    NOTE(review): the task description preceding this cell defines the penalty
    as (lambda / 2) * ||w||^2 with gradient term lambda * w, which differs from
    the reg * ||w||^2 / 2 * reg * w pair written above by a factor of two.
    Either pair is self-consistent and passes a numerical gradient check;
    confirm which convention the hidden tests expect.

    Parameters
    ----------
    w : (d,) array
        Parameter vector.
    X : (n, d) array
        Design matrix.
    y : (n,) array with values in {0,1}
        Binary targets.
    reg : float >= 0
        L2 regularization strength.

    Returns
    -------
    loss : float
        The average regularized logistic loss.
    grad : (d,) array
        The gradient of the loss w.r.t. w.
    """
    # YOUR CODE HERE
    # (placeholder: no implementation yet, so the function currently returns None)


In [None]:
# Q4.2 Public Tests (2 marks)
# Checks the analytic gradient against central finite differences at w = 0.
_check_banned(logreg_loss_grad)
X, y = gen_logreg(200, 200, 5, 1.3, 10)
w = np.zeros(X.shape[1])
loss, grad = logreg_loss_grad(w, X, y, reg=1e-3)
assert np.isfinite(loss) and np.all(np.isfinite(grad))
# A copy of w is passed because numeric_grad perturbs its argument in place.
# At w = 0 the regularisation term contributes nothing to the gradient, so
# this check cannot tell the two penalty conventions apart.
ng = numeric_grad(lambda ww: logreg_loss_grad(ww, X, y, reg=1e-3)[0], w.copy())
assert rel_error(grad, ng) < 1e-5
print("Passed tests: +2 marks")

In [None]:
# Q4.2 Hidden Tests (5 marks)

In [None]:
# Q4.2 Hidden Tests (5 marks)

### Q4.3: Training a Logistic Regression Model (11 marks)

In this part, you will implement a **gradient descent training loop** for binary logistic regression using the stable loss and gradient function you wrote in the previous question (`logreg_loss_grad`).

#### **What you need to do**
- Initialize the model parameters:
  - `w`: weight vector of shape `(d,)` (e.g., zeros or small random values).
- For a given number of epochs:
  1. Compute the **loss** and **gradients** using your `logreg_loss_grad` function.
  2. Update the parameters using **gradient descent**:
     $$
     \mathbf{w} \leftarrow \mathbf{w} - \eta \cdot \nabla_{\mathbf{w}}
     $$
     where $\eta$ is the learning rate.
  3. Record the **full-dataset loss** at each epoch for monitoring convergence.

#### **Inputs**
- `X`: shape `(n, d)` — feature matrix.
- `y`: shape `(n,)` — binary labels in `{0,1}`.
- `epochs`: number of passes over the dataset.
- `lr`: learning rate.
- `reg`: L2 regularization strength.

#### **Outputs**
- `w`: learned weight vector of shape `(d,)`.
- `history`: dictionary with:
  - `loss`: list of per-epoch loss values (full dataset).

#### **Requirements**
- Use **full-batch gradient descent** (compute loss and gradient on the entire dataset each epoch).
- Do **not** use scikit-learn or any external ML library.
- Ensure your implementation is **vectorized** for efficiency.

#### **Hints**
- Start with a small learning rate and check if the loss decreases.
- Use your previous implementation of `logreg_loss_grad` for stability.
- You can print or plot the loss curve to verify convergence.

In [None]:
def train_logreg(X: np.ndarray,
                 y: np.ndarray,
                 lr: float = 0.1,
                 reg: float = 0.0,
                 epochs: int = 200,
                 w0: np.ndarray | None = None) -> Tuple[np.ndarray, dict]:
    """
    Train binary logistic regression with L2 regularization via (full-batch) gradient descent.

    Minimizes (average) loss:
        L(w) = (1/n) * sum_i [ softplus(z_i) - y_i * z_i ] + reg * ||w||^2
        where z = X @ w and softplus(t) = log(1 + exp(t)) computed stably in logreg_loss_grad.

    NOTE(review): the task description preceding this cell writes the penalty
    as (lambda / 2) * ||w||^2; the formula above omits the 1/2. Confirm which
    convention the hidden tests expect and keep it identical to the one used
    in logreg_loss_grad.

    Each epoch performs one update w <- w - lr * grad on the whole dataset.

    Parameters
    ----------
    X : (n, d) ndarray
        Design matrix.
    y : (n,) ndarray of {0,1}
        Binary labels.
    lr : float
        Learning rate (step size).
    reg : float >= 0
        L2 regularization strength (lambda).
    epochs : int
        Number of passes over the dataset.
    w0 : (d,) ndarray or None
        Optional initialization; if None, initializes to zeros.

    Returns
    -------
    w : (d,) ndarray
        Learned parameter vector.
    history : dict
        Contains key 'loss' -> list of per-epoch full-dataset loss values.
    """
    # YOUR CODE HERE
    # (placeholder: no implementation yet, so the function currently returns None)



In [None]:
# Q4 Visible Tests (1 mark)
# Trains on an imbalanced set (200 positives, 300 negatives) and requires
# more than 95% training accuracy at a 0.5 decision threshold.
_check_banned(train_logreg)
seed=10
X, y = gen_logreg(200, 300, 5, 1.3, seed)
w_tr, hist = train_logreg(X, y, lr=0.2, reg=1e-2, epochs=300)
# Probabilities are recomputed here with the plain formula rather than the
# student's sigmoid, so this cell only depends on train_logreg.
probs = 1/(1+np.exp(-(X@w_tr)))
yhat = (probs >= 0.5).astype(int)
acc = (yhat == y).mean()
assert acc > 0.95
print("Passed tests: +1 mark")

In [None]:
# Q4 Hidden Tests (4 marks)

In [None]:
# Q4 Hidden Tests (6 marks)


## Q5. Unsupervised Learning — K-means Clustering (10 marks)

In this question you will implement functions to cluster polygon images into k clusters (k = number of shape types).

Key constraints:
- You must implement feature extraction from images that is robust to translation/scale/rotation/noise (e.g., normalized moments, eccentricity, compactness, edge density, radial profiles — your choice).
- You must implement k-means.
- Your solution should generalize to a hidden dataset (different seed, stronger noise/rotation ranges).
- **Restriction (Q5): Do not import/call sklearn, tensorflow, keras, scikit-image, or any pre-built K-means functions.**

In [None]:
# DO NOT MODIFY THIS CELL - SETUP FOR REMAINING QUESTIONS
import math
import time
import random
from typing import Tuple, List, Optional, Dict

import numpy as np
from PIL import Image, ImageDraw, ImageFilter
import matplotlib.pyplot as plt

# Handy utilities
def show_grid(images: np.ndarray, ncols=8, title=None):
    """
    Display a batch of grayscale images in a grid of matplotlib axes.

    images: (N, H, W) or (N, 1, H, W) in 0..1 float
    ncols: maximum number of columns; capped at N
    title: optional figure title
    """
    frames = images.copy()
    # Drop a singleton channel axis so every frame is a plain (H, W) array.
    if frames.ndim == 4 and frames.shape[1] == 1:
        frames = frames[:, 0]
    count, _, _ = frames.shape
    ncols = min(ncols, count)
    nrows = int(math.ceil(count / ncols))
    fig, axes = plt.subplots(nrows, ncols, figsize=(1.8*ncols, 1.8*nrows))
    # plt.subplots returns a bare Axes or a 1-D array for small grids;
    # normalise to a 2-D array before iterating.
    grid = np.array(axes).reshape(nrows, ncols)
    for slot, ax in enumerate(grid.flat):
        if slot < count:
            ax.imshow(frames[slot], cmap="gray", vmin=0, vmax=1)
        # Hide the axis frame for image slots and unused trailing slots alike.
        ax.axis("off")
    if title:
        fig.suptitle(title)
    plt.tight_layout()
    plt.show()

# Image generator: polygons with translation, rotation, scale, thickness, and noise.
def _regular_polygon_vertices(n_sides: int, radius: float, rotation: float) -> List[Tuple[float, float]]:
    """Return the vertices of a regular polygon centred at the origin.

    Vertex k sits at angle ``rotation + 2*pi*k/n_sides`` (radians) on a circle
    of the given ``radius`` (pixels); no translation is applied here.
    """
    vertices = []
    for k in range(n_sides):
        theta = rotation + 2 * math.pi * k / n_sides
        vertices.append((radius * math.cos(theta), radius * math.sin(theta)))
    return vertices

def _transform_points(pts, tx: float, ty: float, scale: float) -> List[Tuple[float, float]]:
    """Scale every (x, y) point about the origin, then shift it by (tx, ty)."""
    moved = []
    for px, py in pts:
        moved.append((scale * px + tx, scale * py + ty))
    return moved

def render_polygon_image(
    n_sides: int,
    img_size: int = 64,
    radius: float = 20,
    rotation_deg: float = 0,
    translate: Tuple[float, float] = (0, 0),
    scale: float = 1.0,
    thickness: int = 1,
    fill: bool = True,
    blur: float = 0.0,
    noise_std: float = 0.02,
    antialias: int = 2,
    rng: Optional[np.random.Generator] = None,
) -> np.ndarray:
    """
    Render one regular polygon as a noisy grayscale image.

    The shape is drawn on a canvas ``antialias`` times larger than the output
    and then downsampled, so ``radius``, ``translate``, ``thickness`` and
    ``blur`` are all expressed in output pixels.

    Args:
        n_sides: number of polygon vertices.
        img_size: output height and width in pixels.
        radius: circumscribed-circle radius before ``scale`` is applied.
        rotation_deg: rotation of the first vertex, in degrees.
        translate: (tx, ty) offset of the polygon centre from the image centre.
        scale: multiplier applied to the vertex coordinates about the centre.
        thickness: outline width; with ``fill=True`` the outline is drawn only
            when this is positive, with ``fill=False`` it is always drawn
            (at least 1 canvas pixel wide).
        fill: draw a solid polygon if True, otherwise only its outline.
        blur: Gaussian blur radius; skipped when not positive.
        noise_std: standard deviation of additive Gaussian noise; skipped when
            not positive.
        antialias: supersampling factor; no resize is done when it is <= 1.
        rng: optional Generator used for the noise. When None the noise comes
            from NumPy's global random state (the previous behaviour), which
            means the image is not reproducible from a seed alone.

    Returns:
        A single-channel float32 image in [0,1] with shape (1, H, W).
    """
    H = W = img_size
    big = img_size * antialias
    img = Image.new("L", (big, big), color=0)
    draw = ImageDraw.Draw(img)

    rot = math.radians(rotation_deg)
    base_pts = _regular_polygon_vertices(n_sides, radius * antialias, rot)
    cx = cy = big // 2
    tx, ty = translate
    pts = _transform_points(base_pts, cx + tx * antialias, cy + ty * antialias, scale)

    if fill:
        draw.polygon(pts, fill=255)
    if not fill or thickness > 0:
        # Close the outline by returning to the first vertex.
        draw.line(pts + [pts[0]], fill=255, width=max(1, thickness * antialias))

    if blur > 0:
        img = img.filter(ImageFilter.GaussianBlur(radius=blur * antialias))

    # Downsample for antialiasing
    if antialias > 1:
        img = img.resize((W, H), resample=Image.Resampling.LANCZOS)

    arr = np.asarray(img).astype(np.float32) / 255.0

    if noise_std > 0:
        if rng is None:
            noise = np.random.normal(0, noise_std, size=arr.shape)
        else:
            noise = rng.normal(0, noise_std, size=arr.shape)
        arr = arr + noise.astype(np.float32)

    arr = np.clip(arr, 0.0, 1.0)
    return arr[None, ...]  # (1,H,W)

def sample_polygons_dataset(
    n_per_class: int,
    classes: List[int] = [3, 4, 5, 6],
    img_size: int = 64,
    noise_std: float = 0.03,
    max_rotation_deg: float = 180,
    max_translate: float = 8.0,
    min_scale: float = 0.8,
    max_scale: float = 1.2,
    thickness: int = 1,
    blur: float = 0.0,
    antialias: int = 2,
    rng: Optional[np.random.Generator] = None,
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Sample a shuffled dataset of filled polygon images with random pose.

    For every entry of ``classes`` (a number of sides), ``n_per_class`` images
    are rendered with rotation, translation, scale and radius drawn uniformly
    from ``rng``; the radius range is fixed at [16, 22).

    NOTE(review): the per-pixel noise is added inside render_polygon_image,
    which is called here without a generator, so two calls with the same
    ``rng`` seed produce the same shapes but not the same noise.

    Returns X: (n,1,h,w) float32 in [0,1], y: (n,) int64 class indices from
    [0..len(classes)-1]. A Generator seeded with 123 is used when ``rng`` is None.
    """
    gen = np.random.default_rng(123) if rng is None else rng
    images, targets = [], []
    for class_index, n_sides in enumerate(classes):
        for _ in range(n_per_class):
            # The order of these draws defines the dataset for a given seed.
            rotation_deg = float(gen.uniform(-max_rotation_deg, max_rotation_deg))
            shift_x = float(gen.uniform(-max_translate, max_translate))
            shift_y = float(gen.uniform(-max_translate, max_translate))
            scale_factor = float(gen.uniform(min_scale, max_scale))
            base_radius = float(gen.uniform(16, 22))
            images.append(
                render_polygon_image(
                    n_sides=n_sides,
                    img_size=img_size,
                    radius=base_radius,
                    rotation_deg=rotation_deg,
                    translate=(shift_x, shift_y),
                    scale=scale_factor,
                    thickness=thickness,
                    fill=True,
                    blur=blur,
                    noise_std=noise_std,
                    antialias=antialias,
                )
            )
            targets.append(class_index)
    X = np.stack(images, axis=0).astype(np.float32)
    y = np.array(targets, dtype=np.int64)
    order = gen.permutation(len(y))
    return X[order], y[order]

# Demo: sample 8 images for each of the four shape classes and display them.
# All keyword arguments below repeat the defaults of sample_polygons_dataset.
rng_demo = np.random.default_rng(42)
X_demo, y_demo = sample_polygons_dataset(
    n_per_class=8,
    classes=[3,4,5,6],
    img_size=64,
    noise_std=0.03,
    max_rotation_deg=180,
    max_translate=8.0,
    min_scale=0.8,
    max_scale=1.2,
    thickness=1,
    blur=0.0,
    antialias=2,
    rng=rng_demo
)
print("Demo dataset:", X_demo.shape, y_demo.shape)
# 4 classes x 8 images = 32 images, so this slice shows the whole demo set.
show_grid(X_demo[:32], ncols=8, title="Sample polygon images")

In [None]:
# DO NOT MODIFY THIS CELL - SETUP FOR REMAINING QUESTIONS
def silhouette_score(features: np.ndarray, labels: np.ndarray) -> float:
    """
    Compute the mean silhouette score without external libs.

    For sample i, ``a`` is its mean Euclidean distance to the other members of
    its own cluster and ``b`` is the smallest mean distance to the members of
    any other cluster; the silhouette is (b - a) / max(a, b).

    Conventions:
    - A sample that is alone in its cluster scores 0 (the standard definition),
      so singleton clusters cannot inflate the mean.
    - ``a`` averages over all other cluster members, including exact
      duplicates at distance 0.
    - If only one cluster exists, ``b`` is taken as 0.
    - An empty input, or a sample with a == b == 0, scores 0.

    Args:
        features: (N, d) array-like of feature vectors.
        labels: (N,) array-like of integer cluster assignments.

    Returns:
        Mean silhouette over all samples, as a Python float.
    """
    X = np.asarray(features, dtype=np.float64)
    y = np.asarray(labels, dtype=np.int64)
    N = X.shape[0]
    if N == 0:
        return 0.0

    # Full pairwise Euclidean distance matrix: O(N^2 * d) memory, fine for the
    # small N used in the tests.
    dists = np.sqrt(((X[:, None, :] - X[None, :, :]) ** 2).sum(axis=2))

    # Index lists per cluster, computed once instead of once per sample.
    members = {int(c): np.flatnonzero(y == c) for c in np.unique(y)}

    scores = np.zeros(N, dtype=np.float64)
    for i in range(N):
        own_label = int(y[i])
        own = members[own_label]
        if own.size <= 1:
            continue  # singleton cluster: silhouette is 0 by definition
        # dists[i, i] is exactly 0, so the sum over the whole cluster equals
        # the sum over the other members.
        a = dists[i, own].sum() / (own.size - 1)
        rival_means = [dists[i, idx].mean()
                       for label, idx in members.items() if label != own_label]
        b = min(rival_means) if rival_means else 0.0
        denom = max(a, b)
        if denom > 0:
            scores[i] = (b - a) / denom
    return float(scores.mean())


### Q5.1  Implement feature extraction (translation/scale/rotation/noise robustness) (4 marks)

In this question, you will design a **classical vision** feature extractor for our polygon images (triangles, squares, pentagons, hexagons). The goal is to produce **compact, informative, and robust** features that a simple downstream model (here, the k-means clustering of Q5.2) can separate, **without** using deep learning libraries.

**Task**: implement extract_features(X) -> (n, d) float32 array.

Input: X: (n, 1, h, w) with values in [0,1].

Output: Return features that are reasonably invariant to translation/scale/rotation/noise.

You may use numpy only (i.e., no scikit-image).

Hints (you don't need all of these):
- Normalized central moments up to 2nd order.
- Eccentricity from covariance of foreground pixels.
- Edge density (Sobel or simple finite differences).
- Radial profile histogram from centroid.
- Area and compactness: perimeter^2 / area (estimate perimeter via simple gradient)

Your features MUST be deterministic and not rely on fixed seeds.

In [None]:
def extract_features(X: np.ndarray) -> np.ndarray:
    """
    Extract hand-crafted features from a batch of polygon images.

    Return features that are reasonably invariant to translation/scale/rotation/noise.
    The features must be deterministic and must not rely on fixed seeds.

    Args:
        X: (n, 1, h, w), float32 in [0,1]
    Returns:
        F: (n, d), float32 -- one d-dimensional feature row per input image
    """
    # YOUR CODE HERE
    # Fail loudly until implemented, instead of silently returning None.
    raise NotImplementedError()

In [None]:
# Q5.1 Visible Tests (4 marks)
# Run the lightweight banned-token check on the submitted function's source.
_check_banned(extract_features)
# Small fixed-seed dataset: 6 images per class with mild noise and augmentation.
rng_vis = np.random.default_rng(2025)
Xv, _ = sample_polygons_dataset(
    n_per_class=6, classes=[3,4,5,6], img_size=64, noise_std=0.02,
    max_rotation_deg=40, max_translate=4.0, min_scale=0.9, max_scale=1.1,
    rng=rng_vis
)
Fv = extract_features(Xv)
# Only the container type and the (n, d) layout are checked here;
# neither the dtype nor the quality of the features is tested in this cell.
assert isinstance(Fv, np.ndarray), "Features must be a numpy array"
assert Fv.ndim == 2 and Fv.shape[0] == Xv.shape[0], "Feature shape must be (n, d)"
print("Passed tests: +4 marks")

### Q5.2 Implement K-means (6 marks)

Implement a simple k-means clustering using the notes from the lectures:
- Inputs: features (n,d), k, max_iter, tol, rng (np.random.Generator)
- Return: labels (n,) int in [0..k-1], and centers (k, d) with k clusters, each with d dimensions
- Use k-means++ or random init. Must be deterministic given rng.

In [None]:

def kmeans(features: np.ndarray, k: int, max_iter: int = 100, tol: float = 1e-4,
           rng: Optional[np.random.Generator] = None) -> Tuple[np.ndarray, np.ndarray]:
    """
    Cluster feature vectors into k groups with k-means.

    Initialisation may be k-means++ or random, but the result must be
    deterministic for a given `rng`.

    Args:
        features: (n, d) array
        k: number of clusters
        max_iter: maximum k-means iterations
        tol: convergence tolerance (max L2 shift of centres)
        rng: np.random.Generator for determinism; if None, a default is used

    Returns:
        labels: (n,) integer cluster assignments in [0..k-1]
        centers: (k, d) cluster centers
    """
    # YOUR CODE HERE
    # Fail loudly until implemented, instead of silently returning None
    # (which would surface later as an opaque tuple-unpacking TypeError).
    raise NotImplementedError()


In [None]:
# Q5.2 Visible Tests (1 mark)
# Run the lightweight banned-token check on the submitted function's source.
_check_banned(kmeans)
# Same fixed-seed, mildly augmented dataset as the Q5.1 visible test.
rng_vis = np.random.default_rng(2025)
Xv, _ = sample_polygons_dataset(
    n_per_class=6, classes=[3,4,5,6], img_size=64, noise_std=0.02,
    max_rotation_deg=40, max_translate=4.0, min_scale=0.9, max_scale=1.1,
    rng=rng_vis)
Fv = extract_features(Xv)
# k=4 matches the four polygon classes requested above; the generator is seeded for determinism.
labels_v, centers_v = kmeans(Fv, k=4, max_iter=50, rng=np.random.default_rng(0))
assert labels_v.shape == (Xv.shape[0],), "Labels must be (n,)"
# At least 3 of the 4 clusters must actually be used.
assert len(np.unique(labels_v)) >= 3, "K-means should produce multiple clusters"
print("Passed tests: +1 mark")

In [None]:
# Q5.2 Visible Tests (1 mark)
# Run the lightweight banned-token check on the submitted function's source.
_check_banned(kmeans)
# Same fixed-seed, mildly augmented dataset as the other Q5 visible tests.
rng_vis = np.random.default_rng(2025)
Xv, _ = sample_polygons_dataset(
    n_per_class=6, classes=[3,4,5,6], img_size=64, noise_std=0.02,
    max_rotation_deg=40, max_translate=4.0, min_scale=0.9, max_scale=1.1,
    rng=rng_vis
)
Fv = extract_features(Xv)
labels_v, centers_v = kmeans(Fv, k=4, max_iter=50, rng=np.random.default_rng(0))
# Cluster quality is scored on the student's own features with the notebook's silhouette helper.
sv = silhouette_score(Fv, labels_v)
assert np.isfinite(sv), "Silhouette must be finite"
# Easy regime -> should be at least weakly positive
assert sv > 0.05, f"Silhouette too low in easy regime: {sv:.3f}"
print("Passed tests: +1 mark")

In [None]:
# Q5 Hidden Tests (4 marks)



## Q6 Classification with a CNN (20 marks)

Build and train a CNN to classify polygons into four classes: triangle, square, pentagon, hexagon.

Constraints:
- Train on synthetic images generated in-notebook (no external data).
- Keep the model modest (< 150k parameters) and trainable on CPU in ~30–120 seconds.
- Use a training/validation split derived from different seeds and augmentations.
- Hidden tests will generate harder datasets (different seed, stronger noise, rotations, and scales).
- You are allowed to use torch.
  

In [None]:
# DO NOT MODIFY THIS CELL

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Reproducibility helper
def set_all_seeds(seed: int = 123):
    """
    Seed Python's `random`, NumPy's global RNG, and PyTorch with one value.

    Args:
        seed: integer seed applied to every generator listed above.
    """
    # Imported locally so the helper does not depend on another cell
    # having imported `random` beforehand.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # The cuDNN determinism flags are intentionally left unset:
    # this assignment is CPU-only.

# Seed the global RNGs once when this cell is executed.
set_all_seeds(7)

# Default device passed to the training helper and used by the test cells below.
DEVICE = torch.device("cpu")

class PolygonsDataset(Dataset):
    """
    Dataset wrapper around in-memory polygon images and their labels.

    `X` must be a 4-D array whose second axis has size 1; `y` is indexed with
    the same index as `X`. Each item is an `(image_tensor, label_tensor)` pair.
    """

    def __init__(self, X: np.ndarray, y: np.ndarray):
        # Reject anything that is not laid out as (n, 1, H, W).
        assert X.ndim == 4 and X.shape[1] == 1
        self.X, self.y = X, y

    def __len__(self):
        # The number of samples is the size of the leading axis.
        n_samples = self.X.shape[0]
        return n_samples

    def __getitem__(self, idx):
        sample = self.X[idx]        # single image, (1, H, W)
        target = int(self.y[idx])   # plain Python int class index
        # Wrap both as torch tensors; the label is always int64.
        image_t = torch.from_numpy(sample)
        label_t = torch.tensor(target, dtype=torch.long)
        return image_t, label_t

def make_split(
    n_train_per_class=60,
    n_val_per_class=20,
    classes=None,
    img_size=64,
    train_noise=0.04,
    val_noise=0.05,
    rng_train=None,
    rng_val=None,
):
    """
    Generate a training set and a validation set of synthetic polygon images.

    Both sets are drawn with `sample_polygons_dataset` using the same
    augmentation ranges (rotation up to 180 degrees, translation up to 10.0,
    scale in [0.75, 1.25]); they differ in sample count, noise level and RNG.

    Parameters
    ----------
    n_train_per_class, n_val_per_class : int
        Number of images generated per class for each set.
    classes : list of int, optional
        Polygon classes to generate. Defaults to `[3, 4, 5, 6]`.
    img_size : int
        Image size passed to the generator.
    train_noise, val_noise : float
        Noise standard deviation for the training / validation images.
    rng_train, rng_val : np.random.Generator, optional
        Generators for each set. When omitted, a fresh generator seeded with
        101 (train) or 202 (validation) is created on every call, so repeated
        default calls return the same data.

    Returns
    -------
    ((Xtr, ytr), (Xva, yva))
        Training pair and validation pair as returned by `sample_polygons_dataset`.
    """
    # Defaults are built per call: a Generator (or list) created in the
    # signature would be shared and mutated across calls.
    if classes is None:
        classes = [3, 4, 5, 6]
    if rng_train is None:
        rng_train = np.random.default_rng(101)
    if rng_val is None:
        rng_val = np.random.default_rng(202)

    Xtr, ytr = sample_polygons_dataset(
        n_per_class=n_train_per_class, classes=classes, img_size=img_size,
        noise_std=train_noise, max_rotation_deg=180, max_translate=10.0,
        min_scale=0.75, max_scale=1.25, rng=rng_train
    )
    Xva, yva = sample_polygons_dataset(
        n_per_class=n_val_per_class, classes=classes, img_size=img_size,
        noise_std=val_noise, max_rotation_deg=180, max_translate=10.0,
        min_scale=0.75, max_scale=1.25, rng=rng_val
    )
    return (Xtr, ytr), (Xva, yva)


### Q6.1 Model definition (2 marks)

**Build a CNN model with <150k parameters.**

Tips:
- Small conv stack + batch norm + dropout.
- Global average pooling or small FC head.
- Keep strides modest to preserve signal.
- Input is (1, 64, 64).


In [None]:
class StudentNet(nn.Module):
    """
    A compact convolutional neural network for classifying simple 64×64 grayscale shapes.

    **Intended use**
    ----------------
    - Input: tensors of shape **(n, 1, 64, 64)** with values in **[0, 1]** (float32).
    - Output: **logits** of shape **(n, num_classes)**. Apply `softmax` externally if you need
      probabilities, e.g., `F.softmax(logits, dim=1)`.

    **Architecture**
    ----------------
    Feature extractor (`self.features`):
      - **Blocks 1 to m** (you decide on the number and shape of convolution blocks)

    Head:
      - Global Average Pooling to (n, k, 1, 1) → Flatten to (n, k)
      - Dropout(p=0.20)
      - Linear(k → num_classes)
    
    **Initialization**
    ------------------
    - Conv weights: Kaiming normal (fan_out, ReLU).
    - BatchNorm: weight=1, bias=0.
    - Linear: N(0, 0.01), bias=0.

    Parameters
    ----------
    num_classes : int, default=4
        Number of output classes (size of the final linear layer).
    """
    def __init__(self, num_classes: int = 4):
        super().__init__()
        # YOUR CODE HERE
        # Fail loudly until the layers are defined.
        raise NotImplementedError()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Input of shape **(n, 1, 64, 64)** with values in [0, 1], dtype float32.

        Returns
        -------
        torch.Tensor
            **Logits** of shape **(n, num_classes)**. Apply `softmax` externally if probabilities
            are required; e.g., `F.softmax(logits, dim=1)`.
        """
        # YOUR CODE HERE
        # Fail loudly until implemented, instead of silently returning None.
        raise NotImplementedError()
        
def build_model(num_classes: int = 4) -> nn.Module:
    """
    Factory function to construct a `StudentNet`.

    Parameters
    ----------
    num_classes : int, default=4
        Number of output classes.

    Returns
    -------
    nn.Module
        An instance of `StudentNet(num_classes=num_classes)`.
    """
    # YOUR CODE HERE
    # Fail loudly until implemented, instead of silently returning None.
    raise NotImplementedError()


### Q6.2 Training loop (18 marks)

Implement a function to train the model, called `train_model`, with:
- Adam or SGD optimizer
- Cross-entropy loss
- Reasonable Learning Rates and Number of Epochs (e.g., between 5 and 15)
- Returns trained model and a history dict (loss/acc per epoch)

In [None]:
def train_model(
    model: nn.Module,
    train_loader: DataLoader,
    val_loader: DataLoader,
    epochs: int = 8,
    lr: float = 1e-3,
    weight_decay: float = 0.0,
    device: torch.device = DEVICE,
) -> Tuple[nn.Module, Dict[str, List[float]]]:
    """
    Train a classification model for a fixed number of epochs using CrossEntropy loss and Adam,
    and return the trained model together with per-epoch metrics.

    This routine performs a standard supervised training loop:
    - Moves the model to the specified `device`.
    - For each epoch:
      * **Training phase**: sets `model.train()`, iterates over `train_loader`, computes logits,
        CrossEntropy loss, backpropagates, applies gradient clipping (max-norm=5.0), and steps the
        Adam optimizer. Accumulates **training loss** and **accuracy**.
      * **Validation phase**: sets `model.eval()`, disables grad, iterates over `val_loader`,
        computes logits and CrossEntropy loss, and accumulates **validation loss** and **accuracy**.
    - Records per-epoch metrics in a `history` dictionary:
        `{'train_loss': [...], 'val_loss': [...], 'train_acc': [...], 'val_acc': [...]}`.

    **Data/Model expectations**
    - `train_loader` / `val_loader` must yield batches `(xb, yb)` where:
        * `xb` is a float-like tensor of shape `(n, c, h, w)` (or `(n, d)` for MLPs) compatible with the model.
        * `yb` is a `LongTensor` of shape `(n,)` with class indices in `[0, num_classes-1]`.
    - The model’s `forward` must return **logits** of shape `(n, num_classes)` compatible with
      `torch.nn.CrossEntropyLoss`.

    Parameters
    ----------
    model : nn.Module
        The neural network to train (moved to `device` at the start).
    train_loader : DataLoader
        Dataloader for training batches; must produce `(inputs, targets)`.
    val_loader : DataLoader
        Dataloader for validation batches; must produce `(inputs, targets)`.
    epochs : int, default=8
        Number of full passes over `train_loader`.
    lr : float, default=1e-3
        Learning rate for Adam.
    weight_decay : float, default=0.0
        L2 weight decay for Adam (applied to parameters via optimizer).
    device : torch.device, default=DEVICE
        Device on which to run training/validation (e.g., `torch.device('cuda')` or `'cpu'`).

    Returns
    -------
    model : nn.Module
        The trained model (still residing on `device`).
    history : Dict[str, List[float]]
        Per-epoch metrics:
        - `history['train_loss']`: mean CrossEntropy on training set each epoch.
        - `history['val_loss']`  : mean CrossEntropy on validation set each epoch.
        - `history['train_acc']` : training accuracy each epoch in `[0,1]`.
        - `history['val_acc']`   : validation accuracy each epoch in `[0,1]`.

    Notes
    -----
    - **Criterion / Optimizer**: Uses `nn.CrossEntropyLoss()` and `torch.optim.Adam` with the provided
      `lr` and `weight_decay`.
    - **Gradient clipping**: Clips gradients by global norm at 5.0 to improve stability on noisy batches.
    - **Dtypes & device**: Inputs are cast to `float` and moved to `device`; targets are moved as-is.
    - **Determinism**: If you require reproducibility, set seeds and deterministic flags **outside**
      this function (e.g., `torch.manual_seed`, `np.random.seed`, `torch.backends.cudnn.deterministic = True`).
    
    Example
    -------
    >>> device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    >>> model = StudentNet(num_classes=4).to(device)
    >>> model, history = train_model(model, train_loader, val_loader,
    ...                              epochs=10, lr=1e-3, weight_decay=1e-4, device=device)
    >>> history['val_acc'][-1]
    0.92
    """
    # YOUR CODE HERE
    # Fail loudly until implemented, instead of silently returning None
    # (which would surface later as an opaque tuple-unpacking TypeError).
    raise NotImplementedError()


In [None]:
# Q6 Visible Tests (1 mark)

# Re-seed the global RNGs so model initialisation and shuffling are repeatable for this cell.
set_all_seeds(10)
# Visible split: explicit generators (seeds 333 / 444) instead of make_split's defaults.
(Xtr, ytr), (Xva, yva) = make_split(
    n_train_per_class=40, n_val_per_class=12, img_size=64,
    train_noise=0.03, val_noise=0.04,
    rng_train=np.random.default_rng(333),
    rng_val=np.random.default_rng(444)
)
train_ds = PolygonsDataset(Xtr, ytr)
val_ds = PolygonsDataset(Xva, yva)
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=64, shuffle=False)

model = build_model(num_classes=4).to(DEVICE)

# Param budget
# Counts every parameter tensor registered on the model.
n_params = sum(p.numel() for p in model.parameters())
assert n_params < 150_000, f"Model too large: {n_params} parameters"
print("Passed tests: +1 mark")

In [None]:
# Q6 Visible Tests (1 mark)

# Re-seed the global RNGs so model initialisation and shuffling are repeatable for this cell.
set_all_seeds(10)
# Visible split: explicit generators (seeds 333 / 444) instead of make_split's defaults.
(Xtr, ytr), (Xva, yva) = make_split(
    n_train_per_class=40, n_val_per_class=12, img_size=64,
    train_noise=0.03, val_noise=0.04,
    rng_train=np.random.default_rng(333),
    rng_val=np.random.default_rng(444)
)
train_ds = PolygonsDataset(Xtr, ytr)
val_ds = PolygonsDataset(Xva, yva)
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=64, shuffle=False)

model = build_model(num_classes=4).to(DEVICE)

# Forward shape check
# A single untrained forward pass on the first training batch; no gradients are needed.
xb, yb = next(iter(train_loader))
with torch.no_grad():
    logits = model(xb.to(DEVICE).float())
# One row of 4 class logits per input image.
assert logits.shape == (xb.shape[0], 4), f"Logits shape incorrect: {logits.shape}"
print("Passed tests: +1 mark")

In [None]:
# Q6 Visible Tests (2 marks)

# Re-seed the global RNGs so model initialisation and shuffling are repeatable for this cell.
set_all_seeds(10)
# Visible split: explicit generators (seeds 333 / 444) instead of make_split's defaults.
(Xtr, ytr), (Xva, yva) = make_split(
    n_train_per_class=40, n_val_per_class=12, img_size=64,
    train_noise=0.03, val_noise=0.04,
    rng_train=np.random.default_rng(333),
    rng_val=np.random.default_rng(444)
)
train_ds = PolygonsDataset(Xtr, ytr)
val_ds = PolygonsDataset(Xva, yva)
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=64, shuffle=False)

model = build_model(num_classes=4).to(DEVICE)

# Quick training: few epochs, must beat chance (0.25 for 4 classes); the enforced threshold is 0.3
model, hist = train_model(model, train_loader, val_loader, epochs=4, lr=1e-3, weight_decay=0.0, device=DEVICE)
assert "val_acc" in hist and len(hist["val_acc"]) >= 1
# Only the validation accuracy of the final recorded epoch is checked.
acc_last = hist["val_acc"][-1]
assert acc_last >= 0.3, f"Validation accuracy too low on visible split: {acc_last:.3f}"
print("Passed tests: +2 marks")

In [None]:
# Q6 Hidden Tests (5 marks)


In [None]:
# Q6 Hidden Tests (5 marks)


In [None]:
# Q6 Hidden Tests (6 marks)
