In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

class smallMLP(nn.Module):
  def __init__(self, input_size, output_size=10):
    super().__init__()
    self.fc1 = nn.Linear(input_size, 128)
    self.fc2 = nn.Linear(128, 64)
    self.fc3 = nn.Linear(64, output_size)
    self.relu = nn.ReLU()
    self.softmax = nn.Softmax(dim=-1)

  def forward(self, x):
    x = self.relu(self.fc1(x))
    x = self.relu(self.fc2(x))
    x = self.softmax(self.fc3(x))
    return x


# Step 1: Define a toy dataset
class ToyDataset(Dataset):
    def __init__(self, num_samples=1000, input_size=20, num_classes=10):
        self.num_samples = num_samples
        self.input_size = input_size
        self.num_classes = num_classes

        # Generate random input data and corresponding labels
        self.data = torch.randn(num_samples, input_size)
        self.labels = torch.randint(0, num_classes, (num_samples,))

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

# Training Loop -> IMPORTANT !
def train_model(model, train_loader, criterion, optimizer, num_epoch=5, device='cpu'):
  model.to(device)
  model.train()

  for epoch in range(num_epoch):
    total_loss, total, correct = 0, 0, 0
    for batch_x, batch_y in train_loader:
      batch_x, batch_y = batch_x.to(device), batch_y.to(device)

      outputs = model(batch_x)
      loss = criterion(outputs, batch_y)
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      total_loss += loss.item()
      _, predicted = torch.max(outputs, dim=-1)
      correct += (predicted==batch_y).sum().item()
      total += batch_y.size(0)

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader):.4f}, Accuracy: {correct/total:.4f}")

def evaluate_model(model, val_loader, device='cpu'):
  model.to(device)
  model.eval()
  correct, total = 0, 0

  with torch.no_grad():
    for batch_x, batch_y in val_loader:
      batch_x, batch_y = batch_x.to(device), batch_y.to(device)
      outputs = model(batch_x)
      _, predicted = torch.max(outputs, dim=-1)
      correct += (predicted==batch_y).sum().item()
      total += batch_y.size(0)

  print(f"Test Accuracy: {correct/total:.4f}")

if __name__ == "__main__":
    # Parameters
    input_size = 20
    num_classes = 10
    batch_size = 32
    num_epochs = 5
    learning_rate = 0.001

    # Create toy datasets
    train_dataset = ToyDataset(num_samples=1000, input_size=input_size, num_classes=num_classes)
    test_dataset = ToyDataset(num_samples=200, input_size=input_size, num_classes=num_classes)

    # Data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    model = smallMLP(input_size, num_classes)
    criterion = nn.CrossEntropyLoss() #important
    optimizer = optim.Adam(model.parameters(), lr=learning_rate) #important

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    train_model(model, train_loader, criterion, optimizer, num_epochs, device)
    evaluate_model(model, test_loader, device)

Epoch [1/5], Loss: 2.3031, Accuracy: 0.1070
Epoch [2/5], Loss: 2.2997, Accuracy: 0.1500
Epoch [3/5], Loss: 2.2961, Accuracy: 0.1610
Epoch [4/5], Loss: 2.2916, Accuracy: 0.1530
Epoch [5/5], Loss: 2.2834, Accuracy: 0.1610
Test Accuracy: 0.1150


In [13]:
# Cell 1 – Install/upgrade the required packages (run once)
!pip install --upgrade numpy torch torchvision pillow



In [None]:
# Transfer Learning

"""
In relu.py, we implemented a simple neural network with a single hidden layer using the ReLU activation function.

Now, let's move on to the next task. The goal is to extract features from a pretrained AlexNet model and use those features to train a two-layer fully connected network on the CIFAR-10 dataset.

Specifically, you need to:
1. Extract features from the pretrained AlexNet model up to the fc2 layer (before the final classification layer).
2. Build a two-layer fully connected network that takes the extracted features as input and classifies the CIFAR-10 dataset (10 classes). Use the ReLU activation function in the hidden layer.
3. Define a custom loss function that combines standard cross-entropy loss with an L1 regularization term on the model parameters.
4. Train the two-layer fully connected network on the CIFAR-10 dataset using the custom loss function.

Here's some useful code hint to consider:
class SampleNetwork(nn.Module):
    def __init__(self):
        super(SampleNetwork, self).__init__()
        # TODO: Initialization code goes here

    def forward(self, input):
        # TODO: Forward pass code goes here
        pass
"""
import argparse
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10


def load_cifar10_dataset() -> CIFAR10:
    """
    Load CIFAR-10 dataset and apply transformations accordingly

    Returns:
        train_dataset (torchvision.datasets.cifar.CIFAR10): CIFAR-10 train dataset
    """
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ])
    train_dataset = CIFAR10(
        root='./data',
        train=True,
        transform=transform,
        download=True
    )
    return train_dataset

class SampleNetwork(nn.Module):
    def __init__(self, alexnet):
        super(SampleNetwork, self).__init__()    
        self.alexnet = alexnet
        self.fc1 = nn.Linear(4096, 512)
        self.fc2 = nn.Linear(512, 10)
        self.relu = nn.ReLU()

    def forward(self, input):
        # TODO: Forward pass code goes here
        out1 = self.alexnet.features(input)
        out2 = self.alexnet.avgpool(out1)
        out2 = torch.flatten(out2, 1)
        out3 = self.alexnet.classifier[:-1](out2)

        out4 = self.fc1(out3)
        out5 = self.relu(out4)
        return self.fc2(out5)

def main(args: argparse.Namespace) -> None:
    """
    Main function of the script

    Args:
        args (argparse.Namespace): Hyperparameters
    """
    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load pretrained AlexNet
    alexnet = models.alexnet(pretrained=True).to(device)
    # Freeze AlexNet parameters
    for param in alexnet.parameters():
        param.requires_grad = False
    alexnet.eval()

    print(alexnet)
    


    # Load CIFAR-10 dataset
    train_dataset = load_cifar10_dataset()
    train_loader = DataLoader(
        train_dataset,
        batch_size=32,
        shuffle=True
    )

    # TODO Put your code here
    MyModel = SampleNetwork(alexnet).to(device)

    # Set optimizer - only optimize parameters that require gradients
    parameters = [p for p in MyModel.parameters() if p.requires_grad]
    
    assert parameters is not None, "Define parameters for optimizer"
    optimizer = torch.optim.Adam(
        parameters,
        lr=args.learning_rate
    )

    # Training loop
    for _ in range(args.n_epoch):
        for inputs, labels in train_loader:
            # TODO Your code goes here
            inputs, labels = inputs.to(device), labels.to(device)
            output = MyModel(inputs)
            print(output.shape, )
            loss_fn = torch.nn.CrossEntropyLoss()  # TODO Put your code here
            loss = loss_fn(output, labels)           

            
            # Backpropagation and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print(f'Loss: {loss.item():.4f}')


main(argparse.Namespace(learning_rate=0.001, batch_size=32, n_epoch=3))

AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
 

100%|██████████| 170M/170M [00:07<00:00, 21.6MB/s] 


torch.Size([32, 10])
Loss: 2.3156
torch.Size([32, 10])
Loss: 2.1478
torch.Size([32, 10])
Loss: 2.2522
torch.Size([32, 10])
Loss: 1.7823
torch.Size([32, 10])
Loss: 1.5049
torch.Size([32, 10])
Loss: 1.6993
torch.Size([32, 10])
Loss: 1.4952
torch.Size([32, 10])
Loss: 1.6149
torch.Size([32, 10])
Loss: 1.4362
torch.Size([32, 10])
Loss: 1.2093
torch.Size([32, 10])
Loss: 0.9687
torch.Size([32, 10])
Loss: 1.2187
torch.Size([32, 10])
Loss: 1.7074
torch.Size([32, 10])
Loss: 1.6654
torch.Size([32, 10])
Loss: 1.6335
torch.Size([32, 10])
Loss: 1.6670
torch.Size([32, 10])
Loss: 1.3739
torch.Size([32, 10])
Loss: 1.1115
torch.Size([32, 10])
Loss: 1.8116
torch.Size([32, 10])
Loss: 1.4260
torch.Size([32, 10])
Loss: 1.4808
torch.Size([32, 10])
Loss: 1.3258
torch.Size([32, 10])
Loss: 1.0661
torch.Size([32, 10])
Loss: 1.1188
torch.Size([32, 10])
Loss: 1.0541
torch.Size([32, 10])
Loss: 1.0684
torch.Size([32, 10])
Loss: 1.0180
torch.Size([32, 10])
Loss: 0.9201
torch.Size([32, 10])
Loss: 1.2838
torch.Size([32

In [None]:
# K-Fold Cross Validation

import numpy as np

def k_fold_cv(X, y, k=5, seed=42):
    np.random.seed(seed)
    idx = np.random.permutation(len(X))
    fold_size = len(X) // k
    scores = []

    for i in range(k):
        start = i * fold_size
        end = (i + 1) * fold_size if i < k - 1 else len(X)
        test_idx = idx[start:end]
        train_idx = np.concatenate([idx[:start], idx[end:]])

        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Reinitialize your model here
        model = lambda x: np.dot(x, np.linalg.pinv(X_train) @ y_train)  # Example

        y_pred = model(X_test)
        acc = np.mean((y_pred > 0.5) == y_test)
        scores.append(acc)

    return np.mean(scores)

# Usage
X = np.random.rand(100, 5)
y = np.random.randint(0, 2, 100)
print("Accuracy:", k_fold_cv(X, y))

In [2]:
# K-Means (fixed with minimal changes)
import math
import random

# Datapoints (x1, x2, y) — we only use x1,x2 for clustering
class Kmeans:
    '''Implementing Kmeans algorithm.'''

    def __init__(self, n_clusters, max_iter=100, random_state=123):
        self.number_clusters = n_clusters 
        self.iteration = max_iter
        self.centroids = []
        if random_state:
            random.seed(random_state)
        
    def initialize_centroids(self, X):
        # FIX 1: Actually pick real data points (kmeans++ style is better, but at least not random [0-1])
        # Simple fix: randomly select k points from X
        self.centroids = random.sample(X, self.number_clusters)
        # Extract only the coordinates (x1, x2), ignore y
        self.centroids = [[x[0], x[1]] for x in self.centroids]
        return self.centroids
        
    def compute_centroids(self, X, labels):
        number_points_per_cluster = [0] * self.number_clusters
        # FIX 2: Don't do [[0,0]] * k — this creates references to same list!
        self.centroids = [[0.0, 0.0] for _ in range(self.number_clusters)]
        
        for i in range(len(X)):
            x1, x2, y = X[i]
            label = labels[i]
            number_points_per_cluster[label] += 1
            self.centroids[label][0] += x1
            self.centroids[label][1] += x2
            
        for i in range(self.number_clusters):
            if number_points_per_cluster[i] > 0:  # avoid division by zero
                self.centroids[i][0] /= number_points_per_cluster[i]
                self.centroids[i][1] /= number_points_per_cluster[i]
            # else: keep previous centroid (or could reinitialize)
            
        return self.centroids

    def compute_distance(self, X, centroids):
        res = []
        for i in range(len(X)):
            x1, x2, y = X[i]
            distances = []
            for c1, c2 in centroids:           
                distance = math.sqrt((x1 - c1)**2 + (x2 - c2)**2)
                distances.append(distance)
            res.append(distances)
        return res

    def find_closest_cluster(self, distance):
        ans = []
        for i in range(len(distance)):
            res = 0
            smallest = float("inf")
            for j in range(len(distance[i])):
                if distance[i][j] < smallest:
                    smallest = distance[i][j]
                    res = j
            ans.append(res)
        return ans

    def compute_sse(self, X, labels):
        res = 0
        for i in range(len(X)):
            x1, x2, y = X[i]
            label = labels[i]
            c1, c2 = self.centroids[label]
            res += (x1 - c1)**2 + (x2 - c2)**2
        return res
            
    def fit(self, X):
        self.initialize_centroids(X)
        for i in range(self.iteration):
            labels = self.predict(X)
            error = self.compute_sse(X, labels)
            print(f"Iteration {i+1}, SSE: {error:.4f}")
            
            old_centroids = [c[:] for c in self.centroids]
            self.compute_centroids(X, labels)
            
            # Optional: early stopping if centroids don't move
            if old_centroids == self.centroids:
                print("Converged early.")
                break
        
    def predict(self, X):
        # X can be list of [x1,x2,y] or just [x1,x2]
        distances = self.compute_distance(X, self.centroids)
        return self.find_closest_cluster(distances)


# Test
X = [[1, 1, 100], [1.1, 1.1, 200], [10, 10, 10000]]
k = 2
kmean = Kmeans(k, random_state=42)
kmean.fit(X)
print("Final centroids:", kmean.centroids)
print("Prediction for [1.2, 1.2]:", kmean.predict([[1.2, 1.2, 100]]))

Iteration 1, SSE: 0.0200
Iteration 2, SSE: 0.0100
Converged early.
Final centroids: [[10.0, 10.0], [1.05, 1.05]]
Prediction for [1.2, 1.2]: [1]
