# Assignment 04

**Citation:** The following questions and supporting code were originally developed as part of Dr. Eric Eaton's [Introduction to Machine Learning Course at U Penn](https://www.cis.upenn.edu/~cis5190/fall2014/). 


In [None]:
from typing import Tuple

import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

import torch
from torch import nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset

from utils import load_dataset

import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'

**1.1** $k$-Means Clustering


Given a dataset $x_1,..., x_n \in \mathbb{R}^{d}$ and an integer $1 \leq k \leq n$, recall the following $k$-means objective function
\begin{align}
    \min_{\pi_1, ..., \pi_k} \sum_{i=1}^{k} \sum_{j \in \pi_i} \| x_j - \mu_{i} \|_{2}^{2} \ , \quad \mu_i = \frac{1}{|\pi_i|} \sum_{j \in \pi_i} x_j \ .
\end{align}

Above, $\{\pi_i\}_{i=1}^{k}$ is a partition of $\{1, 2, ..., n\}$. Finding a global minimizer of the objective function above is NP-hard. (To be more precise, it is NP-hard in $d$ when $k=2$, and NP-hard in $k$ when $d=2$; see the references on the Wikipedia page for $k$-means for more details.) Nevertheless, the commonly used algorithm we discussed in lecture (Lloyd's algorithm) typically works well in practice.


**Note on Time to Run** --- The runtime of a good implementation for this problem should be fairly fast (a few minutes); if you find it taking upwards of one hour, please check your implementation! (Hint: **For loops are costly.** Can you vectorize it or use Numpy operations to make it faster in some ways? If not, is looping through data-points or through centers faster?)



**a.** [5 points] The $k$-Means clustering algorithm we covered in class is called Lloyd's algorithm.  Implement Lloyd's algorithm for solving the $k$-means objective above. Do not use any off-the-shelf implementations, such as those found in ```scikit-learn```. Include your code in your submission.
    
**b.** [5 points] Run the algorithm on the *training* dataset of MNIST with $k=10$. Visualize (and include in your report) the cluster centers as a set of $k$ $28\times 28$ images.

**c.** [5 points] For $k=\{2, 4, 8, 16, 32, 64\}$ run the algorithm on the *training* dataset to obtain centers $\{\mu_{i}\}_{i=1}^k$. If $\{(x_i,y_i)\}_{i=1}^n$ and $\{(x_i',y_i')\}_{i=1}^m$ denote the training and test sets, respectively, plot the training error $\frac{1}{n} \sum_{i=1}^n \min_{j=1,\dots,k} \| \mu_j - x_i \|_2$ and test error $\frac{1}{m} \sum_{i=1}^m \min_{j=1,\dots,k} \| \mu_j - x_i' \|_2$ as a function of $k$ on the same plot.

**What to Submit:**
 - **For part (a):** Lloyd's algorithm code
 - **For part (b):** 10 images of cluster centers.
 - **For part (c):** Plot of training and test error as function of k.
 - Code in this jupyter notebook

In [None]:
def calculate_centers(
    data: np.ndarray, classifications: np.ndarray, num_centers: int
) -> np.ndarray:
    """
    Update step of Lloyd's algorithm: recompute every center as the mean of the points assigned to it.

    Args:
        data (np.ndarray): Array of shape (n, d). Training data set.
        classifications (np.ndarray): Array of shape (n,) of integers in {0, 1, ..., num_centers - 1}.
            The data point at index i belongs to cluster classifications[i].
        num_centers (int): Number of centers.
            Used to pre-allocate the output array (faster than appending to a list).

    Returns:
        np.ndarray: Float array of shape (num_centers, d) containing the new centers.
            A cluster with no assigned points yields a row of NaN (mean of an empty selection).
    """
    num_features = data.shape[1]
    centers = np.zeros((num_centers, num_features))  # (k, d)
    for label, center_row in enumerate(centers):
        # Boolean mask picks the members of this cluster; average them feature-wise.
        members = data[classifications == label]
        center_row[:] = members.mean(axis=0)
    return centers


def cluster_data(data: np.ndarray, centers: np.ndarray) -> np.ndarray:
    """
    Assignment step of Lloyd's algorithm: map every data point to its nearest center.

    The search is vectorized. For a point x and center c,
    ||x - c||^2 = ||x||^2 - 2 x.c + ||c||^2, and ||x||^2 does not depend on c,
    so the nearest center is the one minimizing ||c||^2 - 2 x.c. That reduces the
    work to one matrix product per chunk instead of one Python iteration per point.
    Assignments can differ from a brute-force distance computation only when two
    centers are equidistant from a point to within floating-point rounding.

    Args:
        data (np.ndarray): Array of shape (n, d). Data set to assign.
        centers (np.ndarray): Array of shape (k, d). Each row is a center.

    Returns:
        np.ndarray: Array of integers of shape (n,), each entry in {0, 1, ..., k - 1}.
            Entry j at index i means that the j-th center is the closest to data[i].
            Ties are resolved in favour of the lowest center index.
    """
    num_points = data.shape[0]
    classifications = np.zeros(num_points, dtype=int)  # (n,), integer type because it's an index

    centers = np.asarray(centers, dtype=float)
    center_sq_norms = np.sum(centers * centers, axis=1)  # (k,)

    # Process rows in chunks so the (chunk, k) score matrix stays small for any n.
    chunk_rows = 8192
    for start in range(0, num_points, chunk_rows):
        stop = start + chunk_rows
        cross_terms = data[start:stop] @ centers.T  # (chunk, k)
        scores = center_sq_norms[np.newaxis, :] - 2.0 * cross_terms
        classifications[start:stop] = np.argmin(scores, axis=1)
    return classifications


def calculate_error(data: np.ndarray, centers: np.ndarray) -> float:
    """Evaluate trained centers on a dataset via the mean k-means objective.

    Each point is matched to its nearest center (via `cluster_data`) and the
    squared Euclidean distance to that center is averaged over all points.

    Args:
        data (np.ndarray): Array of shape (n, d). Dataset to evaluate centers on.
        centers (np.ndarray): Array of shape (k, d). Each row is a center.
            These should be trained on the training dataset.

    Returns:
        float: Mean squared distance from each point to its closest center.
    """
    nearest_centers = centers[cluster_data(data, centers)]  # (n, d)
    residuals = data - nearest_centers
    squared_distances = np.linalg.norm(residuals, axis=1) ** 2  # (n,)
    return np.mean(squared_distances)


def lloyd_algorithm(
    data: np.ndarray, num_centers: int, epsilon: float = 10e-3
) -> np.ndarray:
    """Main part of Lloyd's Algorithm.

    Alternates the assignment step (`cluster_data`) and the update step
    (`calculate_centers`) until no coordinate of any center moves by more
    than `epsilon` between two consecutive iterations.

    Args:
        data (np.ndarray): Array of shape (n, d). Training data set.
        num_centers (int): Number of centers to train/cluster around.
        epsilon (float, optional): Epsilon for stopping condition.
            Training stops when max(abs(centers - previous_centers)) is smaller or equal to epsilon.
            Defaults to 10e-3 (i.e. 0.01).

    Returns:
        np.ndarray: Array of shape (num_centers, d) containing trained centers.

    Note:
        - Centers are initialized to the first `num_centers` data points.
        - A cluster that ends up with no assigned points keeps its previous center.
    """
    # Copy so the working centers never alias the caller's data.
    centers = np.array(data[:num_centers], dtype=float)

    while True:
        prev_centers = centers
        classifications = cluster_data(data, prev_centers)
        centers = calculate_centers(data, classifications, num_centers)

        # An empty cluster has no mean; leaving an undefined (NaN) row in place
        # would make the convergence test below fail forever, so such a cluster
        # simply stays where it was.
        is_empty = np.bincount(classifications, minlength=num_centers) == 0
        if is_empty.any():
            centers[is_empty] = prev_centers[is_empty]

        if np.max(np.abs(centers - prev_centers)) <= epsilon:
            return centers

In [None]:
def Q1b():
    """Part (b) of the k-means problem.

    Runs Lloyd's Algorithm on the MNIST training set with k=10 and displays the
    10 resulting centers as 28x28 grayscale images in a 2x5 grid.

    NOTE: This code takes a while to run. For debugging purposes you might want to
        cluster only a subset (e.g. the first 10000 training points).
        CHANGE IT BACK before submission.
    """
    (train_images, _), (_, _) = load_dataset("mnist")

    # b) Cluster the training set with k=10
    learned_centers = lloyd_algorithm(data=train_images, num_centers=10)

    # One subplot per center, each reshaped back to image form
    _, grid = plt.subplots(2, 5, figsize=(12, 5))
    for index, panel in enumerate(grid.flat):
        center_image = learned_centers[index].reshape(28, 28)
        panel.imshow(center_image, cmap="gray")
        panel.set_title(f"Center {index}")
        panel.axis("off")
    plt.tight_layout()
    plt.show()

Q1b()

In [None]:
# Plot the training error and test error as a function of k
def Q1c():
    """Part (c) of the k-means problem.

    For each k in 2, 4, 8, 16, 32, 64: trains centers on the MNIST training set,
    evaluates `calculate_error` on both the training and the test set, prints
    the two values, and finally plots both error curves against k on one figure.
    """
    (x_train, _), (x_test, _) = load_dataset("mnist")
    ks = [2, 4, 8, 16, 32, 64]
    train_errors, test_errors = [], []

    for k in tqdm(ks):
        centers = lloyd_algorithm(data=x_train, num_centers=k)
        # Same centers are scored on both splits
        train_error = calculate_error(x_train, centers)
        test_error = calculate_error(x_test, centers)
        print(f"k={k}, Train Error: {train_error}, Test Error: {test_error}")

        train_errors.append(train_error)
        test_errors.append(test_error)

    for curve, curve_label in ((train_errors, "Train Error"), (test_errors, "Test Error")):
        plt.plot(ks, curve, label=curve_label)
    plt.xlabel("k")
    plt.ylabel("Error")
    plt.title("Training and Test Errors")
    plt.legend()
    plt.show()

Q1c()

**1.2** PCA

Let's do PCA on MNIST dataset and reconstruct the digits in the dimensionality-reduced PCA basis. You will compute your PCA basis using the training dataset only, and evaluate the quality of the basis on the test set, similar to the k-means reconstructions above. 
We have $n_{train}=50,000$ training examples of size $28 \times 28$. Begin by flattening each example to a vector to obtain $X_{train} \in \mathbb{R}^{50,000 \times d}$ and $X_{test} \in \mathbb{R}^{10,000 \times d}$ for $d= 784$.

Let $\mu \in \mathbb{R}^{d}$ denote the average of the training examples in $X_{train}$, i.e., $\mu = \frac{1}{n_{train}} X_{train}^\top \mathbf{1}$, where $\mathbf{1} \in \mathbb{R}^{n_{train}}$ is the all-ones vector. Now let $\Sigma =  (X_{train} - \mathbf{1} \mu^\top)^\top (X_{train} - \mathbf{1} \mu^\top)/50000$ denote the sample covariance matrix of the training examples, and let $\Sigma = UDU^\top$ denote the eigenvalue decomposition of $\Sigma$.


**a.** [2 points]
    If $\lambda_i$ denotes the $i$th largest eigenvalue of $\Sigma$, what are the eigenvalues $\lambda_1$, $\lambda_2$, $\lambda_{10}$, $\lambda_{30}$, and $\lambda_{50}$? What is the sum of eigenvalues $\sum_{i=1}^d{\lambda_i}$?
 
**b.** [5 points]
    Let $x \in \mathbb{R}^d$ and $k \in \{1,2,\dots,d\}$. Write a formula for the rank-$k$ PCA approximation of $x$.
 
**c.** [5 points] Using this approximation, plot the reconstruction error from $k=1$ to $100$ (the $X$-axis is $k$ and the $Y$-axis is the mean-squared error reconstruction error) on the training set and the test set (using the $\mu$ and the basis learned from the training set). 
    On a separate plot, plot  $1-\frac{\sum_{i=1}^{k}{\lambda_i}}{\sum_{i=1}^{d}{\lambda_i}}$ from $k=1$ to $100$.
    
**d.** [3 points]
    Now let us get a sense of what the top PCA directions are capturing. Display the first $10$ eigenvectors as images, and provide a brief interpretation of what you think they capture.
    
**e.** [3 points]
    Finally, visualize a set of reconstructed digits from the training set for different values of $k$. In particular provide the reconstructions for digits $2,6,7$ with values $k = 5, 15, 40, 100$ (just choose an image from each digit arbitrarily). Show the original image side-by-side with its reconstruction. Provide a brief interpretation, in terms of your perceptions of the quality of these reconstructions and the dimensionality you used.
    
**What to Submit:**

- **For part (a):** Eigenvalues 1, 2, 10, 30, and 50 and the sum. At least 6 leading digits.
- **For part (b):** The Formula. If you are defining new variables/matrices make sure their definition is stated
        clearly.
- **For part (c):** Plot containing reconstruction error on train and test sets. Plot of $1-\frac{\sum_{i=1}^{k}{\lambda_i}}{\sum_{i=1}^{d}{\lambda_i}}$
- **For part (d):** 10 eigenvectors as images.
- **For part (e):** 15 total images, including 3 original and 12 reconstructed ones. Each reconstructed image
        corresponds to a certain digit (2, 6 or 7) and k value (5, 15, 40 or 100).
- Code for parts c-e in this jupyter notebook.

In [None]:
def reconstruct_demean(uk: np.ndarray, demean_data: np.ndarray) -> np.ndarray:
    """Project demeaned data onto the columns of `uk` and map it back to the original space.

    Args:
        uk (np.ndarray): First k eigenvectors as columns. Shape (d, k).
        demean_data (np.ndarray): Demeaned data (centered at 0). Shape (n, d).

    Returns:
        np.ndarray: Array of shape (n, d).
            Each row corresponds to the same row of demean_data,
            first compressed to k coefficients and then reconstructed from them.
    """
    # (n, d) @ (d, k) --> (n, k) coefficients, then (n, k) @ (k, d) --> (n, d)
    return demean_data @ uk @ uk.T


def reconstruction_error(uk: np.ndarray, demean_data: np.ndarray) -> float:
    """Mean squared L-2 error incurred by compressing and reconstructing data with `uk`.

    Args:
        uk (np.ndarray): First k eigenvectors as columns. Shape (d, k).
        demean_data (np.ndarray): Demeaned data (centered at 0). Shape (n, d).

    Returns:
        float: Squared L-2 distance between each row and its reconstruction,
            averaged over the n rows.
    """
    residual = demean_data - reconstruct_demean(uk, demean_data)  # (n, d)
    squared_norms = np.linalg.norm(residual, axis=1) ** 2  # (n,)
    return np.mean(squared_norms)


def calculate_eigen(demean_data: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """
    Eigen-decomposition of the sample covariance matrix of demeaned data.

    The covariance is formed as demean_data.T @ demean_data / n, matching the
    definition of Sigma in the problem statement. Without the division by n
    the eigenvalues would be those of the scatter matrix, i.e. n times larger.

    Args:
        demean_data (np.ndarray): Demeaned data (centered at 0). Shape (n, d)

    Returns:
        Tuple[np.ndarray, np.ndarray]: Tuple of two numpy arrays representing:
            1. Eigenvalues array with shape (d,), in ascending order
               (the order produced by np.linalg.eigh).
            2. Matrix with eigenvectors as columns with shape (d, d);
               column i corresponds to eigenvalue i.
    """
    num_samples = demean_data.shape[0]
    covariance = demean_data.T @ demean_data / num_samples  # (d, n) @ (n, d) --> (d, d)
    # eigh is the right solver here because the covariance matrix is symmetric.
    eigenvalues, eigenvectors = np.linalg.eigh(covariance)
    return eigenvalues, eigenvectors

def main():
    """
    Main function of PCA problem. It loads MNIST, computes the eigenvalues/-vectors
    of the demeaned training data, and then answers the questions from the problem statement.

    Part A:
        - Prints the 1st, 2nd, 10th, 30th and 50th largest eigenvalues
        - Prints the sum of eigenvalues

    Part C:
        - Plots reconstruction error as a function of k (# of eigenvectors used),
            for k = 1, ..., 100. Two lines: one for train, one for test.
        - Plots 1 - (sum of the k largest eigenvalues) / (sum of all eigenvalues),
            i.e. the share of the eigenvalue mass remaining after the k-th eigenvalue,
            for k = 1, ..., 100.

    Part D:
        - Visualizes the first 10 eigenvectors as 28x28 grayscale images.

    Part E:
        - For each of digits 2, 6, 7 plots the original image, and images reconstructed from PCA with
            k values of 5, 15, 40, 100.
    """
    (x_tr, y_tr), (x_test, _) = load_dataset("mnist")

    # a) Sort and report the eigenvalues
    x_mean = np.mean(x_tr, axis=0)
    demeaned_train = x_tr - x_mean # Subtract the mean sample
    demeaned_test = x_test - x_mean # Demean test using the train mean
    eigenvalues, eigenvectors = calculate_eigen(demeaned_train)

    sorted_eig_indices = np.argsort(eigenvalues)[::-1] # Sort the eigenvalues from biggest to smallest
    eigenvalues = eigenvalues[sorted_eig_indices]
    eigenvectors = eigenvectors[:, sorted_eig_indices]

    for i in [1, 2, 10, 30, 50]:
        print(f"{i}th largest Eigenvalue: {eigenvalues[i-1]}")
    print(f"Sum of Eigenvalues: {np.sum(eigenvalues)}")

    # c) Plot the reconstruction error as a function of k
    k_values = range(1, 101)
    train_errors = []
    test_errors = []

    for k in tqdm(k_values):
        uk = eigenvectors[:, :k] # Get the first k biggest eigenvectors
        train_errors.append(reconstruction_error(uk, demeaned_train))
        test_errors.append(reconstruction_error(uk, demeaned_test))

    # The problem statement asks for 1 - sum_{i<=k} lambda_i / sum_i lambda_i:
    # the fraction NOT captured by the top k, which decreases towards 0.
    captured = np.cumsum(eigenvalues[: len(k_values)]) / np.sum(eigenvalues)
    ratios = 1 - captured

    plt.plot(k_values, train_errors, label="Train Error")
    plt.plot(k_values, test_errors, label="Test Error")
    plt.xlabel("k")
    plt.ylabel("Mean Squared Error")
    plt.title("Reconstruction Error")
    plt.legend()
    plt.show()

    plt.plot(k_values, ratios)
    plt.xlabel("k")
    plt.ylabel("Ratio")
    plt.title("Eigenvalue Sum Ratio")
    plt.show()

    # d) Display the first 10 eigenvectors as images
    fig, axs = plt.subplots(2, 5, figsize=(12, 5))
    for i, ax in enumerate(axs.flat):
        ax.imshow(eigenvectors[:, i].reshape(28, 28), cmap="gray")
        ax.set_title(f"Eigenvector {i}")
        ax.axis("off")
    plt.tight_layout()
    plt.show()

    # e) Display the original and reconstructed images from the training set for digits 2, 6, 7 using k = 5, 15, 40, 100
    digits = [2, 6, 7]
    ks = [5, 15, 40, 100]

    fig, axs = plt.subplots(len(digits), len(ks) + 1, figsize=(15, 10)) # Prepare the empty figure

    for i, digit in enumerate(digits):
        # The first training image with this label serves as the sample for the whole row.
        sample_image = x_tr[y_tr == digit][0]
        demeaned_sample = sample_image - x_mean # Demean using the training set mean

        axs[i, 0].imshow(sample_image.reshape(28, 28), cmap="gray") # Plot the original image
        axs[i, 0].set_title(f"Original digit {digit}")
        axs[i, 0].axis("off")

        for j, k in enumerate(ks):
            uk = eigenvectors[:, :k] # Get the first k eigenvectors
            # Add the mean back so the reconstruction lives in pixel space again.
            reconstructed = (reconstruct_demean(uk, demeaned_sample) + x_mean).reshape(28, 28)

            axs[i, j + 1].imshow(reconstructed, cmap="gray") # Plot the reconstructed image
            axs[i, j + 1].set_title(f"Reconstructed k={k}")
            axs[i, j + 1].axis("off")

    plt.tight_layout()
    plt.show()

main()


**1.3** Unsupervised Learning with Autoencoders

In this exercise, we will train two simple autoencoders to perform dimensionality reduction on MNIST. As discussed in lecture, autoencoders are a long-studied neural network architecture comprised of an encoder component to summarize the latent features of input data and a decoder component to try and reconstruct the original data from the latent features.

**Weight Initialization and PyTorch**
Last assignment, we had you refrain from using ```torch.nn``` modules. For this assignment, we will use these modules, and recommend using ```nn.Linear``` for your linear layers. You will not need to initialize the weights yourself; the default initialization in PyTorch will be sufficient for this problem. *Hint: we also recommend using the ```nn.Sequential``` module to organize your network class and simplify the process of writing the forward pass. However, you may choose to organize your code however you'd like.*

**Training**
Use ```optim.Adam``` for this question. Feel free to experiment with different learning rates, though you can use $5 \cdot 10^{-5}$ as mentioned in the code. Use mean
squared error (```nn.MSELoss()``` or ```F.mse_loss()```) for the loss function.


**a.** [10 points] Use a network with a single linear layer. Let $W_{\text{e}} \in \mathbb{R}^{h \times d}$ and $W_{\text{d}} \in \mathbb{R}^{d\times h}$. Given some $x \in \mathbb{R}^d$, the forward pass is formulated as 

$$
      \mathcal{F}_{1}(x) = W_{\mathrm{d}} W_{\mathrm{e}} x.
$$

Run experiments for $h \in \{ 32, 64, 128 \}$. For each of the different $h$ values, report your final training error and visualize a set of 10 reconstructed digits, side-by-side with the original image. *Note:* we omit the bias term in the formulation for notational convenience since ```nn.Linear``` learns bias parameters alongside weight parameters by default.
    
**b.** [10 points] Use a single-layer network with non-linearity. Let $W_{\mathrm{e}} \in \mathbb{R}^{h \times d}$, $W_{\mathrm{d}} \in \mathbb{R}^{d\times h}$, and activation $\sigma: \mathbb{R} \longmapsto \mathbb{R}$, where $\sigma$ is the ReLU function. Given some $x \in \mathbb{R}^d$, the forward pass is formulated as 

$$
      \mathcal{F}_{2}(x) = \sigma(W_{\text{d}} \sigma(W_{\mathrm{e}} x)).
$$

Report the same findings as asked for in part a (for $h \in \{ 32,64,128 \}$).
    
**c.** [5 points] Now, evaluate $\mathcal{F}_1(x)$ and $\mathcal{F}_2(x)$ (use $h=128$ here) on the test set. Provide the test reconstruction errors in a table.
    
**d.** [5 points] In a few sentences, compare the quality of the reconstructions from these two autoencoders with those of PCA from problem 1.2. You may need to re-run your code for PCA using the ranks $k \in \{32, 64, 128\}$ to match the $h$ values used above.
 
**What to Submit:**


- **For parts (a, b):** Final training error and set of 10 reconstructed images of digits, side-by-side with the
original image (10 images for each h).
- **For part (c):** Errors of networks from part a and b on testing set.
- **For part (d):** 2-3 sentences on differences in quality of solutions between PCA and Autoencoders, with
example images
- Code for parts a-c


In [None]:
def F1(h: int) -> nn.Module:
    """Build model F1: the purely linear autoencoder W_d * W_e * x from the spec.

    Note:
        - Bias is not written in the equation but is used:
            nn.Linear includes a bias term by default.

    Args:
        h (int): Dimensionality of the encoding (the hidden layer).

    Returns:
        nn.Module: A freshly initialized autoencoder mapping 784 -> h -> 784.
    """
    image_dim = 28 * 28
    layers = [
        nn.Linear(image_dim, h),  # W_e (encoder)
        nn.Linear(h, image_dim),  # W_d (decoder)
    ]
    return nn.Sequential(*layers)


def F2(h: int) -> nn.Module:
    """Build model F2: the non-linear autoencoder ReLU(W_d * ReLU(W_e * x)) from the spec.

    Note:
        - Bias is not written in the equation but is used:
            nn.Linear includes a bias term by default.

    Args:
        h (int): Dimensionality of the encoding (the hidden layer).

    Returns:
        nn.Module: A freshly initialized autoencoder mapping 784 -> h -> 784,
            with a ReLU after the encoder and another after the decoder.
    """
    image_dim = 28 * 28
    layers = [
        nn.Linear(image_dim, h),  # W_e (encoder)
        nn.ReLU(),
        nn.Linear(h, image_dim),  # W_d (decoder)
        nn.ReLU(),
    ]
    return nn.Sequential(*layers)


def train(
    model: nn.Module, optimizer: Adam, train_loader: DataLoader, epochs: int = 40
) -> float:
    """
    Train an autoencoder for a fixed number of epochs and return the mean squared
    error reconstruction loss observed during the last epoch.

    The returned loss is averaged per sample: every batch loss is weighted by the
    number of samples in that batch, so a shorter final batch does not skew it.

    NOTE(review): a later cell of this notebook defines another function named
    `train` with a different signature; whichever cell ran last wins.

    Args:
        model (Module): Model to train. Either F1, or F2 in this problem.
        optimizer (Adam): Optimizer that will adjust parameters of the model.
            Hint: You can try using learning rate of 5e-5.
        train_loader (DataLoader): DataLoader with training data.
            Each item it yields is a 1-tuple `(x_batch,)`, hence the tuple unpacking below.
        epochs (int, optional): Number of passes over `train_loader`. Defaults to 40.

    Returns:
        float: Training error/loss of the final epoch.

    Raises:
        ValueError: If `epochs` is smaller than 1 (there would be no last epoch to report).
        ZeroDivisionError: If `train_loader` yields no samples.
    """
    if epochs < 1:
        raise ValueError(f"epochs must be at least 1, got {epochs}")

    # Set model to training mode
    model.train()

    for _ in tqdm(range(epochs)):
        weighted_loss_sum = 0.0
        num_samples = 0
        for (x_batch,) in train_loader:
            optimizer.zero_grad() # Reset gradients
            reconstructed = model(x_batch) # Get the network outputs
            loss = F.mse_loss(reconstructed, x_batch) # Batch-mean MSE reconstruction loss
            loss.backward() # Back-propagation
            optimizer.step() # Update weights

            batch_size = x_batch.shape[0]
            weighted_loss_sum += loss.item() * batch_size # Undo the batch mean before accumulating
            num_samples += batch_size

        mean_epoch_loss = weighted_loss_sum / num_samples

    return mean_epoch_loss # Return the MSE for the final epoch


def evaluate(model: nn.Module, loader: DataLoader) -> float:
    """Evaluates a model on a provided dataset and returns its average loss.

    The model is switched to evaluation mode and gradients are disabled.
    Every batch loss is weighted by the number of samples in the batch, so the
    result is the mean squared error over the whole dataset even when the last
    batch is shorter than the others.

    Args:
        model (Module): TRAINED Model to evaluate. Either F1, or F2 in this problem.
        loader (DataLoader): DataLoader with some data.
            Each item it yields is a 1-tuple `(x_batch,)`.

    Returns:
        float: Mean Squared Error on the provided dataset.

    Raises:
        ZeroDivisionError: If `loader` yields no samples.
    """
    # Set the model to evaluation mode
    model.eval()

    weighted_loss_sum = 0.0
    num_samples = 0

    with torch.no_grad():
        for (x_batch,) in loader:
            reconstructed = model(x_batch)
            loss = F.mse_loss(reconstructed, x_batch) # Batch-mean MSE
            batch_size = x_batch.shape[0]
            weighted_loss_sum += loss.item() * batch_size # Undo the batch mean before accumulating
            num_samples += batch_size

    return weighted_loss_sum / num_samples


def Q3():
    """
    Main function of autoencoders problem.

    It:
        A. Trains an F1 model with hs 32, 64, 128, prints the loss of the last epoch
            and visualizes reconstructions of 10 images below the original images.
        B. Does the same as A, but with the F2 model.
        C. Prints the reconstruction error (MSE) on the test set for the h=128
            models from parts A and B.
        D. Reconstructs the same 10 images with PCA using 32, 64 and 128 components
            for a visual comparison with the autoencoders.

    Note:
        - images_to_visualize holds one training image per digit 0-9,
            as a FloatTensor of shape (10, 784).
    """
    (x_train, y_train), (x_test, _) = load_dataset("mnist")
    x = torch.from_numpy(x_train).float()
    x_test = torch.from_numpy(x_test).float()

    # Neat little line that gives you one image per digit for visualization in parts a and b
    images_to_visualize = x[[np.argwhere(y_train == i)[0][0] for i in range(10)]]

    train_loader = DataLoader(TensorDataset(x), batch_size=32, shuffle=True)
    # Evaluation gains nothing from shuffling; a fixed order keeps the reported
    # test error reproducible from run to run.
    test_loader = DataLoader(TensorDataset(x_test), batch_size=32, shuffle=False)

    hidden_sizes = (32, 64, 128)
    test_scores = {} # Stores evaluation results from part c)

    def make_figure_with_originals(num_rows):
        """Create a (num_rows + 1) x 10 grid whose first row shows the original images."""
        _, axes = plt.subplots(num_rows + 1, 10, figsize=(20, (num_rows + 1) * 2))
        for i in range(10):
            axes[0, i].imshow(images_to_visualize[i].reshape(28, 28), cmap="gray") # Original image
            axes[0, i].axis("off")
            axes[0, i].set_title(f"Original {i}")
        return axes

    # a) Train F1 model and visualize the reconstructions
    def train_and_visualize(model_type, hs=hidden_sizes):
        """Train one `model_type` autoencoder per hidden size and plot its reconstructions."""
        axes = make_figure_with_originals(len(hs))

        for h_idx, h in enumerate(hs):
            model = model_type(h)
            optimizer = Adam(model.parameters(), lr=5e-5) # Use Adam and the recommended learning rate

            final_loss = train(model, optimizer, train_loader)
            print(f"{model_type.__name__} with h = {h}, Final training loss: {final_loss:.6f}")

            # Only the h=128 models are scored on the test set (part c)
            if h == 128:
                test_scores[model_type.__name__] = evaluate(model, test_loader)

            with torch.no_grad():
                reconstructed = model(images_to_visualize)

                for i in range(10): # Plot each of the reconstructed images
                    image = reconstructed[i].reshape(28, 28).cpu().numpy()
                    axes[h_idx + 1, i].imshow(image, cmap="gray")
                    axes[h_idx + 1, i].axis("off")
                    axes[h_idx + 1, i].set_title(f"h = {h}")

        plt.tight_layout()
        plt.suptitle(f"{model_type.__name__} Reconstructions", y=1.02, fontsize=16)
        plt.show()

    train_and_visualize(model_type=F1)

    # b) Do the same as a) but with F2
    train_and_visualize(model_type=F2)

    # c) Evaluate F1 and F2 on the test set using h=128
    for model_name, test_loss in test_scores.items(): # Read from the results that we stored before
        print(f"{model_name}: {test_loss:.6f}")

    # d) Run PCA using h = [32, 64, 128] and plot the result
    x_mean = np.mean(x_train, axis=0)
    demeaned_train = x_train - x_mean # Subtract the mean sample
    eigenvalues, eigenvectors = calculate_eigen(demeaned_train)

    sorted_eig_indices = np.argsort(eigenvalues)[::-1] # Sort the eigenvalues from biggest to smallest
    eigenvectors = eigenvectors[:, sorted_eig_indices]

    axes = make_figure_with_originals(len(hidden_sizes))
    # Demean with the training mean once; the samples do not depend on h
    demeaned_samples = images_to_visualize.numpy() - x_mean # (10, 784)

    for h_idx, h in enumerate(hidden_sizes): # For each h
        uk = eigenvectors[:, :h]
        # Add the mean back so the reconstructions live in pixel space again
        reconstructions = reconstruct_demean(uk, demeaned_samples) + x_mean

        for i in range(10): # Plot each reconstructed image
            axes[h_idx + 1, i].imshow(reconstructions[i].reshape(28, 28), cmap="gray")
            axes[h_idx + 1, i].axis("off")
            axes[h_idx + 1, i].set_title(f"h={h}")

    plt.tight_layout()
    plt.suptitle("PCA Reconstructions", y=1.02, fontsize=16)
    plt.show()

Q3()

**1.4** Image Classification on CIFAR-10

In this problem we will explore different deep learning architectures for image classification on the CIFAR-10 dataset. Make sure that you are familiar with tensors, two-dimensional convolutions (```nn.Conv2d```) and fully-connected layers (```nn.Linear```), ReLU non-linearities (```F.relu```), pooling (```nn.MaxPool2d```), and tensor reshaping (```view```).

A few preliminaries:

- Each network $f$ maps an image $x^\text{in} \in \mathbb{R}^{32 \times 32 \times 3}$ (3 channels for RGB) to an output $f(x^\text{in}) = x^\text{out} \in \mathbb{R}^{10}$. The class label is predicted as $\arg\max_{i=0,1,\dots,9} x_{i}^\text{out}$. An error occurs if the predicted label differs from the true label for a given image. 
- The network is trained via multiclass cross-entropy loss. 
- Create a validation dataset by appropriately partitioning the train dataset. *Hint*: look at the documentation for ```torch.utils.data.random_split```. Make sure to tune hyperparameters like network architecture and step size on the validation dataset. Do **NOT** validate your hyperparameters on the test dataset.
- At the end of each epoch (one pass over the training data), compute and print the training and validation classification accuracy.
- While one would usually train a network for hundreds of epochs to reach convergence and maximize accuracy, this can be prohibitively time-consuming, so feel free to train for just a dozen or so epochs. 

For parts (a) and (b), apply a hyperparameter tuning method (e.g. random search, grid search, etc.) using the validation set, report the hyperparameter configurations you evaluated and the best set of hyperparameters from this set, and plot the training and validation classification accuracy as a function of epochs. Produce a separate line or plot for each hyperparameter configuration evaluated (top 5 configurations is sufficient to keep the plots clean). Finally, evaluate your best set of hyperparameters on the test data and report the test accuracy. 


**Note 1:** Please refer to the notebook with starter code for this problem and the notebook provided with lecture 19. That notebook provides a complete end-to-end example of loading data, training a model using a simple network with a fully-connected output and no hidden layers (logistic regression), and performing evaluation using canonical Pytorch. We recommend using this as a template for your implementations of the models below.

**Note 2:** If you are attempting this problem and do not have access to GPU we highly recommend using Google Colab. The provided notebook includes instructions on how to use GPU in Google Colab.


Here are the network architectures you will construct and compare. 

**a.** [18 points] **Fully-connected output, 1 fully-connected hidden layer:** this network has one hidden layer denoted as $x^\mathrm{hidden} \in \mathbb{R}^{M}$ where $M$ will be a hyperparameter you choose ($M$ could be in the hundreds). The nonlinearity applied to the hidden layer will be the ```relu``` ($\mathrm{relu}(x) = \max\{0,x\}$). This network can be written as
$$x^{out} = W_2 \mathrm{relu}(W_1 \mathrm{vect}(x^{in}) +b_1) + b_2$$

where $W_1 \in \mathbb{R}^{M \times 3072}$, $b_1 \in \mathbb{R}^M$, $W_2 \in \mathbb{R}^{10 \times M}$, $b_2 \in \mathbb{R}^{10}$, and $\mathrm{vect}(\cdot)$ flattens a tensor into a vector.
Tune the different hyperparameters and train for a sufficient number of epochs to achieve a *validation accuracy* of at least 50\%. Provide the hyperparameter configuration used to achieve this performance.
  
**b.** [18 points] **Convolutional layer with max-pool and fully-connected output:** for a convolutional layer $W_1$ with filters of size $k \times k \times 3$, and $M$ filters (reasonable choices are $M=100$, $k=5$), we have that $\mathrm{Conv2d}(x^\text{in}, W_1) \in \mathbb{R}^{(33-k) \times (33-k) \times M}$.
  
- Each convolution will have its own offset applied to each of the output pixels of the convolution; we denote this as $\mathrm{Conv2d}(x^\text{in}, W) + b_1$ where $b_1$ is parameterized in $\mathbb{R}^M$. Apply a ```relu``` activation to the result of the convolutional layer. 

- Next, use a max-pool of size $N \times N$ (a reasonable choice is $N=14$ to pool to $2 \times 2$ with $k=5$) we have that $\textrm{MaxPool}( \mathrm{relu}( \mathrm{Conv2d}(x^\text{in}, W_1)+b_1)) \in \mathbb{R}^{\lfloor\frac{33-k}{N}\rfloor \times \lfloor\frac{33-k}{N}\rfloor \times M}$.

- We will then apply a fully-connected layer to the output to get a final network given as

\begin{align*}
 x^{output} = W_2 \mathrm{vect}(\textrm{MaxPool}( \mathrm{relu}( \mathrm{Conv2d}(x^\text{input}, W_1)+b_1))) + b_2
\end{align*}
          
where $W_2 \in \mathbb{R}^{10 \times M (\lfloor\frac{33-k}{N}\rfloor)^2}$, $b_2 \in \mathbb{R}^{10}$.
  
The parameters $M,k,N$ (in addition to the step size and momentum) are all hyperparameters, but you can choose a reasonable value. Tune the different hyperparameters (number of convolutional filters, filter sizes, dimensionality of the fully-connected layers, step size, etc.) and train for a sufficient number of epochs to achieve a *validation accuracy* of at least 65\%. Provide the hyperparameter configuration used to achieve this performance.
Make sure to save the best model during the hyperparameter tuning so that you can evaluate test accuracy without retraining.
  
The number of hyperparameters to tune, combined with the slow training times, will hopefully give you a taste of how difficult it is to construct networks with good generalization performance. State-of-the-art networks can have dozens of layers, each with their own hyperparameters to tune. Additional hyperparameters you are welcome to play with if you are so inclined, include: changing the activation function, replace max-pool with average-pool, adding more convolutional or fully connected layers, and experimenting with batch normalization or dropout.

**What to Submit:**

- **Parts a-b:** Plot of training and validation accuracy for each TOP 5 hyperparameter configurations evaluated. (10 lines total). If it took less than 5 hyperparameter configurations to pass performance threshold plot all hyperparameter configurations evaluated. List of the hyperparameter values you searched over, and your search method (random, grid, etc.).
- **Parts a-b:** Values of best performing hyperparameters, and accuracy of best models on test data.
- Code

In [None]:
def fc_network(M: int) -> nn.Module:
    """Build the single-hidden-layer classifier Linear(ReLU(Linear(x))).

    Both linear layers keep their default bias terms. The input width is
    fixed at 3072 features and the output width at 10 logits.

    Args:
        M (int): Width of the hidden layer.

    Returns:
        nn.Module: A freshly initialized ``nn.Sequential`` with three stages
            (Linear -> ReLU -> Linear).
    """
    return nn.Sequential(
        nn.Linear(in_features=3072, out_features=M),  # input -> hidden
        nn.ReLU(),
        nn.Linear(in_features=M, out_features=10),  # hidden -> logits
    )


def conv_network(k: int, m: int, n: int) -> nn.Module:
    """Build the network Linear(flatten(MaxPool(relu(Conv2d(x))))).

    The final linear layer is sized as ``m * ((33 - k) // n) ** 2`` input
    features, i.e. ``m`` channels times the squared side length left after
    the convolution and the (flooring) max-pool. Bias terms are left at their
    defaults in both the convolution and the linear layer.

    Args:
        k (int): Side length of each square convolutional filter.
        m (int): Number of convolutional filters (output channels).
        n (int): Side length of the max-pool window.

    Returns:
        nn.Module: A freshly initialized ``nn.Sequential`` with five stages
            (Conv2d -> ReLU -> MaxPool2d -> Flatten -> Linear).
    """
    pooled_side = (33 - k) // n  # spatial side length after conv + pool
    stages = [
        nn.Conv2d(in_channels=3, out_channels=m, kernel_size=k),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=n),
        nn.Flatten(),
        nn.Linear(in_features=m * pooled_side ** 2, out_features=10),
    ]
    return nn.Sequential(*stages)
    
def train(
    model: nn.Module, optimizer: Adam, train_loader: DataLoader, val_loader: DataLoader, epochs: int = 40, flatten_image=False
) -> Tuple[list, list, float, nn.Module]:
    """Train a classifier with cross-entropy loss and restore its best weights.

    After every epoch the model is scored on both loaders with ``evaluate``.
    The weights from the epoch with the highest validation accuracy are
    snapshotted and loaded back into ``model`` before returning.

    Args:
        model (Module): Model to train; updated in place.
        optimizer (Adam): Optimizer bound to ``model``'s parameters.
        train_loader (DataLoader): Yields ``(images, labels)`` training batches.
        val_loader (DataLoader): Yields ``(images, labels)`` validation batches.
        epochs (int): Number of passes over ``train_loader``.
        flatten_image (bool): When True, each image batch is reshaped to
            ``(-1, 3072)`` before being fed to the model (needed for the
            fully connected network).

    Returns:
        tuple: ``(train_accuracies, val_accuracies, best_val_accuracy, model)``
            where the first two are per-epoch lists, the third is the highest
            validation accuracy seen, and ``model`` carries the weights of
            that best epoch. With ``epochs == 0`` the lists are empty, the
            best accuracy is 0 and the model is returned untouched.
    """
    train_accuracies = []
    val_accuracies = []
    best_val_accuracy = 0
    best_state = None

    for epoch in tqdm(range(epochs)):
        model.train()  # Set model to training mode
        for (images, labels) in train_loader:
            optimizer.zero_grad()
            inputs = images.view(-1, 3072) if flatten_image else images
            loss = F.cross_entropy(model(inputs), labels)  # Multiclass cross-entropy
            loss.backward()
            optimizer.step()

        train_accuracy = evaluate(model, train_loader, flatten_image)
        val_accuracy = evaluate(model, val_loader, flatten_image)
        train_accuracies.append(train_accuracy)
        val_accuracies.append(val_accuracy)

        print(f"Epoch {epoch}, Train Accuracy: {train_accuracy:.6f}, Validation Accuracy: {val_accuracy:.6f}")

        # Snapshot the weights of the best epoch so far. The first epoch is
        # always snapshotted so there is something to restore even if the
        # validation accuracy never rises above 0.
        if best_state is None or val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            # state_dict() hands back references to the live parameter
            # tensors, so each one must be cloned; a shallow dict copy would
            # keep changing as training continues.
            best_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}

    # Restore the best epoch's weights (nothing to restore if no epoch ran)
    if best_state is not None:
        model.load_state_dict(best_state)

    return train_accuracies, val_accuracies, best_val_accuracy, model

def evaluate(model: nn.Module, loader: DataLoader, flatten_image=False) -> float:
    """Compute the classification accuracy of a model over a dataset.

    The model is switched to evaluation mode and left in that mode; gradients
    are disabled while predictions are made.

    Args:
        model (Module): Model to score.
        loader (DataLoader): Iterable yielding ``(images, labels)`` batches.
        flatten_image (bool): When True, each image batch is reshaped to
            ``(-1, 3072)`` before being fed to the model (needed for the
            fully connected network).

    Returns:
        float: Fraction of samples whose highest-scoring class equals the
            label, or 0.0 when the loader yields no samples.
    """
    model.eval()  # Set model to evaluation mode
    correct = 0
    total = 0
    with torch.no_grad():  # No gradients needed while scoring
        for (images, labels) in loader:
            inputs = images.view(-1, 3072) if flatten_image else images
            predicted = model(inputs).argmax(dim=1)  # Highest-scoring class per sample
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    if total == 0:
        # An empty loader has no defined accuracy; report 0.0 rather than
        # raising ZeroDivisionError.
        return 0.0
    return correct / total

In [None]:
from torchvision import datasets
from torchvision.transforms import ToTensor

def main():
    """Grid-search the FC and conv classifiers on CIFAR-10 and report results.

    Steps, in order:
      1. Download CIFAR-10 into the current directory and split 20% of the
         training images off as a validation set.
      2. Train one ``fc_network`` per (M, learning rate) pair, then one
         ``conv_network`` per kept (k, m, n) triple, 20 epochs each.
      3. For each architecture, plot train/validation accuracy of the five
         runs with the highest best-validation accuracy (saved to
         "fc_accuracy.png" / "conv_accuracy.png") and print the winning
         hyperparameters together with that model's test accuracy.
    """

    def draw_curves(curves, title, filename):
        # Each entry of `curves` is (train_label, val_label, train_accs,
        # val_accs): train is drawn solid, validation dashed. The figure is
        # saved to `filename` and then shown.
        plt.figure(figsize=(12, 8))
        for train_label, val_label, train_accs, val_accs in curves:
            plt.plot(train_accs, linestyle='-', label=train_label)
            plt.plot(val_accs, linestyle='--', label=val_label)
        plt.xlabel("Epoch")
        plt.ylabel("Accuracy")
        plt.title(title)
        plt.legend()
        plt.grid(True)
        plt.savefig(filename)
        plt.show()

    # ---- Data ----
    cifar_train = datasets.CIFAR10(root=".", train=True, download=True, transform=ToTensor())
    cifar_test = datasets.CIFAR10(root=".", train=False, download=True, transform=ToTensor())
    batch_size = 64

    # Hold out 20% of the training images for validation
    n_val = int(0.2 * len(cifar_train))
    n_fit = len(cifar_train) - n_val
    fit_split, val_split = torch.utils.data.random_split(cifar_train, [n_fit, n_val])

    train_loader = DataLoader(fit_split, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_split, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(cifar_test, batch_size=batch_size, shuffle=False)

    # ---- Fully connected network: grid over hidden size and learning rate ----
    fc_grid = [
        (hidden, lr)
        for hidden in [128, 256, 512, 1024]
        for lr in [5e-4, 1e-3, 5e-3]
    ]
    fc_runs = []
    fc_best_val_accuracy = 0
    fc_best_model = None
    fc_best_config = None

    for hidden, lr in fc_grid:
        net = fc_network(hidden)
        opt = Adam(net.parameters(), lr=lr)
        train_accs, val_accs, run_best, fitted = train(
            net, opt, train_loader, val_loader, epochs=20, flatten_image=True
        )
        fc_runs.append((hidden, lr, train_accs, val_accs, run_best))

        # Remember the overall winner across all configurations
        if run_best > fc_best_val_accuracy:
            fc_best_val_accuracy = run_best
            fc_best_model = fitted
            fc_best_config = (hidden, lr)

    # Rank runs by their best validation accuracy and plot (at most) the top five
    fc_runs.sort(key=lambda run: run[4], reverse=True)
    draw_curves(
        [
            (f'Train, M={hidden}, lr={lr}', f'Val, M={hidden}, lr={lr}', train_accs, val_accs)
            for hidden, lr, train_accs, val_accs, _ in fc_runs[:5]
        ],
        "FC Train and Val Accuracy",
        "fc_accuracy.png",
    )

    test_accuracy = evaluate(fc_best_model, test_loader, flatten_image=True)
    print("FC results:")
    print(f"Best Hyperparameters: M={fc_best_config[0]}, lr={fc_best_config[1]}")
    print(f"Best Validation Accuracy: {fc_best_val_accuracy:.6f}")
    print(f"Test Accuracy: {test_accuracy:.6f}")

    # ---- Conv network: grid over kernel size, filter count and pool size ----
    lr = 5e-4  # Learning rate is held fixed for the conv search
    # Only keep triples whose pool size divides (33 - k) exactly; every other
    # combination is skipped rather than trained.
    conv_grid = [
        (k, m, n)
        for k in [3, 5, 7]
        for m in [32, 64, 100, 128]
        for n in [4, 8]
        if (33 - k) % n == 0
    ]
    conv_runs = []
    conv_best_val_accuracy = 0
    conv_best_model = None
    conv_best_config = None

    for k, m, n in conv_grid:
        net = conv_network(k, m, n)
        opt = Adam(net.parameters(), lr=lr)
        train_accs, val_accs, run_best, fitted = train(
            net, opt, train_loader, val_loader, epochs=20, flatten_image=False
        )
        conv_runs.append((k, m, n, train_accs, val_accs, run_best))

        # Remember the overall winner across all configurations
        if run_best > conv_best_val_accuracy:
            conv_best_val_accuracy = run_best
            conv_best_model = fitted
            conv_best_config = (k, m, n)

    # Rank runs by their best validation accuracy and plot (at most) the top five
    conv_runs.sort(key=lambda run: run[5], reverse=True)
    draw_curves(
        [
            (f'Train, k={k}, m={m}, n={n}', f'Val, k={k}, m={m}, n={n}', train_accs, val_accs)
            for k, m, n, train_accs, val_accs, _ in conv_runs[:5]
        ],
        "Conv Train and Val Accuracy",
        "conv_accuracy.png",
    )

    test_accuracy = evaluate(conv_best_model, test_loader)
    print("Conv results:")
    print(f"Best Hyperparameters: k={conv_best_config[0]}, m={conv_best_config[1]}, n={conv_best_config[2]}")
    print(f"Best Validation Accuracy: {conv_best_val_accuracy:.6f}")
    print(f"Test Accuracy: {test_accuracy:.6f}")

# Run the CIFAR-10 hyperparameter searches when this cell is executed.
main()

**1.5** Random Fourier Features

Kernel methods such as Support Vector Machines are considered memory-based learners. Rather than learning a mapping from a set of input features $\mathcal{X} \subset \mathbb{R}^d$ to outputs in $\mathcal{Y}$, they *remember* all training examples $(\mathbf{x_i}, y_i)$ and learn a corresponding weight for each of them.

\begin{align*}
   \hat{f}(\mathbf{x}) = \sum_{i=1}^N \omega_i k(\mathbf{x_i}, \mathbf{x})
\end{align*} 

After learning the weight vector $\boldsymbol{\omega} = [\omega_1, ..., \omega_N]$, we can make predictions on unseen samples using the *kernel function* $k$ between all training samples and $\mathbf{x}$. Kernel methods are attractive because they rely on the *kernel trick*. Any positive definite function $k(\mathbf{x}, \mathbf{x}')$ with $\mathbf{x}, \mathbf{x}' \in \mathbb{R}^d$ defines a function $\psi$ mapping $\mathbb{R}^d$ to a higher-dimensional space such that the inner product between datapoints can be quickly computed as $\langle \psi(\mathbf{x}), \psi(\mathbf{x}') \rangle = k(\mathbf{x}, \mathbf{x}')$. In essence, the kernel trick is an efficient way to learn a linear decision boundary in a higher-dimensional space than that of $\mathcal{X}$.


The kernel trick can be prohibitively expensive for large datasets. This is because the memory-based algorithm accesses the data through evaluations of the kernel matrix $k(x, x')$ which grows in proportion to the dataset size $N$.

Instead of relying on the implicit feature mapping $\psi$ provided by the kernel trick, suppose we can approximate the kernel function $k$ as the inner product of two vectors in $\mathbb{R}^D$. Mathematically, we would like to find a mapping $\mathbf{z}$:

\begin{align*}
    \mathbf{z} : \mathbb{R}^d \rightarrow \mathbb{R}^D \qquad \text{such that} \qquad k(x, x') = \langle \psi(x), \psi(x') \rangle \approx \langle \mathbf{z}(x), \mathbf{z}(x') \rangle
\end{align*}

With this approximation, we no longer require the *kernel trick* to express $\langle \psi(\mathbf{x}), \psi(\mathbf{x}')\rangle$ as $k(\mathbf{x}, \mathbf{x}')$. Rather, we can approximate it by directly computing the tractable inner product $\langle \mathbf{z}(\mathbf{x}), \mathbf{z}(\mathbf{x}')\rangle$.

\begin{align*}
    \hat{f}(\mathbf{x}) = \sum_{i=1}^N \omega_i k(\mathbf{x_i}, \mathbf{x}) = \sum_{i=1}^N \omega_i \langle \psi(\mathbf{x_i}), \psi(\mathbf{x}) \rangle \approx \sum_{i=1}^N \omega_i \langle \mathbf{z}(\mathbf{x_i}), \mathbf{z}(\mathbf{x})\rangle = \left(\sum_{i=1}^N \omega_i \mathbf{z}(\mathbf{x_i})^T \right) \mathbf{z}(\mathbf{x}) = \beta^T \mathbf{z}(\mathbf{x})
\end{align*}

Assuming $\mathbf{z}(\mathbf{x}) = \sigma(M \mathbf{x} + b)$ for some nonlinear function $\sigma$, this "approximate" SVM *can potentially be evaluated much more quickly* than the kernel SVM. To see why, note that the left-hand side requires evaluating $k(\mathbf{x_i}, \mathbf{x})$ for all $i \in \{1,\dots,N\}$, in general, if $\omega$ is not sparse. On the other hand, the right-hand side just requires computing $\mathbf{z}(\mathbf{x})=\sigma(M \mathbf{x} + b)$, which is dominated by the time to compute a $D \times d$ matrix-vector product, and then an inner product with $\beta$, which is in $\mathbb{R}^D$. Thus, the total computation time for the left-hand side scales linearly with $N$, and the right-hand side scales with just $d$ and $D$, independent of $N$!
When training the approximate SVM we also get similar computational savings if $N \gg \max\{d,D\}$. 
Evaluating the approximate model involves multiplying the $D \times d$ matrix $M$ by $\mathbf{x}$, adding the $D$-dimensional bias vector $b$, and taking the inner product with the $D$-dimensional weight vector $\beta$. This runtime does not involve the dataset size $N$!


In this problem, we'll be empirically comparing a **s**upport **v**ector **m**achine classifier (**SVM**) to a classifier with **r**andom **F**ourier **f**eatures  (**RFF**). Let's get our hands dirty with some code! 


**a.** [5 points] Implement both an **SVM** and an **RFF** classifier on a subset of 10000 samples from the MNIST dataset. Use the RBF kernel with parameter $\gamma$ for both models.

**Note:** Although you may use any library of your choice for this problem, we recommend looking at ```sklearn```'s [**SVC**](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html) and [**RBFSampler**](https://scikit-learn.org/stable/modules/generated/sklearn.kernel_approximation.RBFSampler.html) classes. Pay attention to the **kernel** and **gamma** parameters of both classes

**b.** [5 points] Plot the test accuracy of the **RFF** for $D \in \{100, 500, 1000, 5000, 10000, 15000, 20000\}$. Draw a horizontal line for the test accuracy of the standard **SVM**. What do you observe? What value of $D$ would you pick for **RFF**? For your choice of $D$, what's the gap between the testing accuracy of both models? (Note: read part **c** before completing this; you will need to record the runtime of these training runs.)

**c.** [5 points] Repeat the same experiment as in part **b**, but instead of recording the test accuracy of both models, record the _runtime (in milliseconds) of training them_. What do you observe now as $D$ increases? Would you still pick the same value of $D$? What is the trade-off between the runtime and correctness of the **RFF**?


**Note:** To record the time elapsed between two points in your code, you can use python's [time](https://docs.python.org/3/library/time.html) library. Only record the runtime required to learn the corresponding model on the training data. Learning an RFF with Sklearn involves a 2-stage approach: fitting the RBFSampler and fitting a linear model to the resulting RFF features in $\mathbb{R}^D$. The runtime should include both steps.

**What to Submit:**

- **Part b:** Plot of test accuracy of SVM and RFF as a function of the number of Fourier features $D$. 2-3 sentences describing observed behavior, the gap between models, and justifying choice of $D$.
- **Part c:** Plot of model training runtime of SVM and RFF as a function of the number of Fourier features $D$. 2-3 sentences describing observed behavior, justifying new choice of $D$, and describing the runtime-correctness tradeoff.
- Code in this jupyter notebook

In [None]:
import time
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import SGDClassifier

def train_svm(X_train, y_train, gamma=0.01):
    """Fit an RBF-kernel SVC and measure how long the fit takes.

    Args:
        X_train: Training features, passed straight to ``SVC.fit``.
        y_train: Training labels, passed straight to ``SVC.fit``.
        gamma: RBF kernel coefficient handed to ``SVC``.

    Returns:
        tuple: ``(svm_model, total_time)`` where ``svm_model`` is the fitted
            ``SVC`` and ``total_time`` is the elapsed time in milliseconds
            for constructing and fitting it.
    """
    # perf_counter() is the clock intended for measuring short durations;
    # unlike time.time() it is unaffected by system clock adjustments.
    start_time = time.perf_counter()
    svm_model = SVC(kernel='rbf', gamma=gamma)  # Use the RBF kernel with gamma
    svm_model.fit(X_train, y_train)
    total_time = (time.perf_counter() - start_time) * 1000  # seconds -> milliseconds
    return svm_model, total_time

def train_rff(X_train, y_train, D=100, gamma=0.01):
    """Fit a random-Fourier-feature classifier and measure how long it takes.

    The data is first mapped to ``D`` random features with ``RBFSampler``,
    then an ``SGDClassifier`` is fitted on those features. Both stages are
    included in the reported time.

    Args:
        X_train: Training features.
        y_train: Training labels.
        D: Number of random features (``n_components`` of ``RBFSampler``).
        gamma: RBF kernel coefficient handed to ``RBFSampler``.

    Returns:
        tuple: ``((rbf_feature, sgd_model), total_time)`` where the inner pair
            is the fitted sampler and the fitted linear classifier, and
            ``total_time`` is the elapsed time in milliseconds.
    """
    # perf_counter() is the clock intended for measuring short durations;
    # unlike time.time() it is unaffected by system clock adjustments.
    start_time = time.perf_counter()
    rbf_feature = RBFSampler(gamma=gamma, n_components=D)  # Random feature map with D components
    X_features = rbf_feature.fit_transform(X_train)  # Map the data into the D-dimensional feature space
    sgd_model = SGDClassifier(max_iter=1000, tol=1e-3)  # Linear classifier on the random features
    sgd_model.fit(X_features, y_train)
    total_time = (time.perf_counter() - start_time) * 1000  # seconds -> milliseconds
    return (rbf_feature, sgd_model), total_time

def evaluate_model(model, X_test, y_test):
    """Return the test accuracy of either kind of trained model.

    Args:
        model: Either a single estimator exposing ``score`` (as returned by
            ``train_svm``) or a ``(rbf_feature, sgd_model)`` tuple (as
            returned by ``train_rff``).
        X_test: Test features.
        y_test: Test labels.

    Returns:
        Whatever the underlying ``score`` call returns for the test data.
    """
    # Plain estimator: score it directly on the raw features
    if not isinstance(model, tuple):
        return model.score(X_test, y_test)

    # RFF pair: transform the features first, then score the linear model
    sampler, classifier = model
    return classifier.score(sampler.transform(X_test), y_test)

def Q5(gamma=0.01):
    """Compare an RBF-kernel SVM against RFF classifiers of increasing size.

    Draws 8000 training and 2000 test samples at random (without
    replacement) from the MNIST splits returned by ``load_dataset``, fits one
    SVM and one RFF model per value of D, prints each model's accuracy and
    training time, and shows two plots (accuracy vs D, runtime vs D) with the
    SVM drawn as a horizontal baseline.

    Args:
        gamma: RBF kernel coefficient shared by the SVM and all RFF models.

    Returns:
        dict: Keys ``'svm_accuracy'``, ``'svm_runtime'``,
            ``'rff_accuracies'``, ``'rff_runtimes'`` and
            ``'component_values'`` (the list of D values tried).
    """

    def compare_plot(rff_values, svm_value, ylabel, title):
        # RFF metric as a curve over D, SVM metric as a red horizontal line
        plt.figure(figsize=(10, 6))
        plt.plot(Ds, rff_values, marker='o', label='RFF')
        plt.axhline(y=svm_value, color='r', linestyle='-', label='SVM')
        plt.xlabel('D')
        plt.ylabel(ylabel)
        plt.title(title)
        plt.grid(True, linestyle='--', alpha=0.75)
        plt.legend()
        plt.show()

    (x_train, y_train), (x_test, y_test) = load_dataset("mnist")

    # Subsample 8000 training points ...
    train_indices = np.random.choice(len(x_train), 8000, replace=False)
    x_train, y_train = x_train[train_indices], y_train[train_indices]

    # ... and 2000 test points
    test_indices = np.random.choice(len(x_test), 2000, replace=False)
    x_test, y_test = x_test[test_indices], y_test[test_indices]

    # Baseline: exact-kernel SVM
    svm_model, svm_runtime = train_svm(x_train, y_train, gamma)
    svm_accuracy = evaluate_model(svm_model, x_test, y_test)
    print(f"SVM Accuracy: {svm_accuracy:.6f}, Runtime: {svm_runtime:.2f} ms")

    # RFF models, one per number of random features
    Ds = [100, 500, 1000, 5000, 10000, 15000, 20000]
    rff_accuracies, rff_runtimes = [], []
    for D in tqdm(Ds):
        rff_model, rff_runtime = train_rff(x_train, y_train, D, gamma)
        rff_accuracy = evaluate_model(rff_model, x_test, y_test)
        rff_accuracies.append(rff_accuracy)
        rff_runtimes.append(rff_runtime)
        print(f"RFF Accuracy: {rff_accuracy:.6f}, Runtime: {rff_runtime:.2f} ms")

    compare_plot(rff_accuracies, svm_accuracy, 'Test Accuracy', 'Test Accuracy vs D')
    compare_plot(rff_runtimes, svm_runtime, 'Runtime (ms)', 'Runtime vs D')

    # Hand everything back for further analysis
    return {
        'svm_accuracy': svm_accuracy,
        'svm_runtime': svm_runtime,
        'rff_accuracies': rff_accuracies,
        'rff_runtimes': rff_runtimes,
        'component_values': Ds
    }

# Run the SVM-vs-RFF comparison with the default gamma and keep the returned metrics.
results = Q5()