# Setup and Imports

In [None]:
!pip3 -qq install -r ../requirements.txt

In [None]:
import numpy as np
import pandas as pd
from PIL import Image
import os

# Network Implementation

In [140]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def sigmoid_derivative(z):
    s = sigmoid(z)
    
    return s * (1 - s)

def tanh(z):
    return np.tanh(z)

def tanh_derivative(z):
    return 1 - np.tanh(z)**2

def relu(z):
    return np.maximum(0, z)

def relu_derivative(z):
    return (z > 0).astype(float)

In [141]:
def svm_loss(scores, y):
    """
    Compute the multi-class SVM loss.
    scores: numpy array of shape (N, C)
    y: numpy array of shape (N,) with labels in range 0...C-1
    Returns the average loss and the margin matrix.
    """
    N = scores.shape[0]
    
    # Selecting the correct class scores
    correct_class_scores = scores[np.arange(N), y].reshape(-1, 1)
    
    # Computing margins for all classes
    margins = np.maximum(0, scores - correct_class_scores + 1)
    margins[np.arange(N), y] = 0  # Excluding the correct class in loss
    loss = np.sum(margins) / N
    
    return loss, margins

def svm_loss_gradient(margins, y):
    """
    Compute the gradient of the SVM loss with respect to scores.
    """
    N = margins.shape[0]
    binary = (margins > 0).astype(float)
    row_sum = np.sum(binary, axis=1)
    binary[np.arange(N), y] = -row_sum
    dScores = binary / N
    
    return dScores

In [142]:
def initialize_parameters(input_dim, hidden_dim, output_dim):
    """
    Initialize weights and biases for a network with one hidden layer.
    """
    W1 = np.random.randn(hidden_dim, input_dim) * 0.01
    b1 = np.zeros((hidden_dim, 1))
    W2 = np.random.randn(output_dim, hidden_dim) * 0.01
    b2 = np.zeros((output_dim, 1))
    
    return W1, b1, W2, b2

In [143]:
def forward(X, W1, b1, W2, b2, activation_func):
    """
    Compute the forward pass through the network.
    Returns: Z1 (pre-activation), A1 (activation output), and scores.
    """
    Z1 = np.dot(X, W1.T) + b1.T
    A1 = activation_func(Z1)
    scores = np.dot(A1, W2.T) + b2.T  # raw output scores for each class

    return Z1, A1, scores

def backward(X, y, Z1, A1, scores, W2, activation_deriv, margins):
    """
    Compute the backward pass through the network.
    Returns gradients for W1, b1, W2, and b2.
    """
    # Gradient of loss with respect to scores
    dScores = svm_loss_gradient(margins, y)

    # print(dScores.shape)    # For, testing
    
    # Gradients for the output layer parameters
    dW2 = np.dot(A1.T, dScores).T
    db2 = np.sum(dScores, axis=0, keepdims=True).T
    dX2 = np.dot(dScores, W2)

    # print(db2.shape)    # For, testing
    # print(dW2.shape)   
    
    # Backprop into activation function
    # dA1 = activation_deriv(dX2)
    
    dA1 = np.dot(dScores, W2)      # This is the gradient flowing back from the output.
    dZ1 = dA1 * activation_deriv(Z1)  # Multiply by the derivative evaluated at Z1.


    # Backprop into hidden layer
    # dZ1 = np.dot(dA1, activation_deriv(Z1).T).T
    # dW1 = np.dot(dA1.T, X)
    # db1 = np.sum(dA1, axis=0, keepdims=True).T

    dW1 = np.dot(dZ1.T, X)
    db1 = np.sum(dZ1, axis=0, keepdims=True).T

    # print(dW1.shape)    # For, testing
    # print(db1.shape)
    
    return dW1, db1, dW2, db2

In [144]:
def update_parameters(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate):
    """
    Update parameters using a simple gradient descent step.
    """
    W1 -= learning_rate * dW1
    b1 -= learning_rate * db1
    W2 -= learning_rate * dW2
    b2 -= learning_rate * db2

    return W1, b1, W2, b2

In [145]:
def predict(X, W1, b1, W2, b2, activation_func):
    """
    Predict class labels for input data X.
    """
    _, _, scores = forward(X, W1, b1, W2, b2, activation_func)

    return np.argmax(scores, axis=1)

def compute_accuracy(y_pred, y_true):
    """
    Compute the accuracy given predictions and true labels.
    """

    return np.mean(y_pred == y_true)

In [146]:
def train_network(X, y, activation='relu', epochs=1000, learning_rate=1e-2,
                  hidden_dim=32, print_every=10):
    """
    Train the neural network.
    Parameters:
      X: Input data of shape (N, input_dim)
      y: Labels (N,)
      activation: Choice of activation function ('relu', 'sigmoid', 'tanh')
      epochs: Number of training iterations
      learning_rate: Gradient descent learning rate
      hidden_dim: Number of neurons in the hidden layer
      print_every: Print loss and accuracy every 'print_every' epochs
    Returns trained parameters: W1, b1, W2, b2
    """
    activation_funcs = {
        'sigmoid': (sigmoid, sigmoid_derivative),
        'tanh': (tanh, tanh_derivative),
        'relu': (relu, relu_derivative)
    }

    if activation not in activation_funcs:
        raise ValueError("Unsupported activation function. Choose from 'relu', 'sigmoid', or 'tanh'.")
    
    act_func, act_deriv = activation_funcs[activation]
    
    input_dim = X.shape[1]
    output_dim = 10  # CIFAR-10 has 10 classes
    W1, b1, W2, b2 = initialize_parameters(input_dim, hidden_dim, output_dim)
    
    from tqdm import tqdm_notebook

    for epoch in tqdm_notebook(range(epochs)):
        # Forward pass
        Z1, A1, scores = forward(X, W1, b1, W2, b2, act_func)
        loss, margins = svm_loss(scores, y)
        
        # Backward pass
        dW1, db1, dW2, db2 = backward(X, y, Z1, A1, scores, W2, act_deriv, margins)
        
        # Updating parameters
        W1, b1, W2, b2 = update_parameters(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate)
        
        if epoch % print_every == 0:
            y_pred = predict(X, W1, b1, W2, b2, act_func)
            acc = compute_accuracy(y_pred, y)
            print(f"Epoch {epoch}: Loss = {loss:.4f}, Accuracy = {acc:.4f}")
    
    return W1, b1, W2, b2

In [None]:
def load_cifar10_data(path):
    """
    Loads the CIFAR-10 dataset from the given directory.
    
    Args:
        path (str): Path to the CIFAR-10 dataset directory.
    
    Returns:
        X_train (numpy array): Training images of shape (N, 3072) and normalized.
        y_train (numpy array): Training labels of shape (N,).
        X_test (numpy array): Testing images of shape (N, 3072) and normalized.
    """
    
    # Loading training labels
    labels_path = os.path.join(path, "train_labels.csv")
    labels_df = pd.read_csv(labels_path)
    
    # Extracting filenames and labels
    train_filenames = labels_df['id'].values
    train_labels = labels_df['label'].values
    
    train_dir = os.path.join(path, "train")
    test_dir = os.path.join(path, "test")
    
    # Loading training images
    X_train = []
    
    for filename in train_filenames:
        img_path = os.path.join(train_dir, filename)
        img = Image.open(img_path).convert("RGB")
        img = np.array(img, dtype=np.float32) / 255.0  # Normalizing pixels to [0,1]
        X_train.append(img.flatten())  # Flattening to (3072,)
    
    X_train = np.array(X_train)  # Shape: (N, 3072)
    
    from sklearn.preprocessing import LabelEncoder

    # Encoding labels (convert category names to numerical values)
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(train_labels)  # Converts to integers (0-9)
    
    # Loading test images
    X_test = []
    test_filenames = sorted(os.listdir(test_dir))  # Assuming all images in test directory
    
    for filename in test_filenames:
        img_path = os.path.join(test_dir, filename)
        img = Image.open(img_path).convert("RGB")
        img = np.array(img, dtype=np.float32) / 255.0
        X_test.append(img.flatten())
    
    X_test = np.array(X_test)  # Shape: (M, 3072)
    
    return X_train, y_train, X_test

In [None]:
!unzip ../data/cifar-10.zip -d ../data/cifar-10

import py7zr

with py7zr.SevenZipFile('../data/cifar-10/train.7z', mode='r') as archive:
    archive.extractall(path="../data/cifar-10/train")  

with py7zr.SevenZipFile('../data/cifar-10/test.7z', mode='r') as archive:
    archive.extractall(path="../data/cifar-10/test")  

# Driver Code

In [None]:
if __name__ == "__main__":
    # For demonstration, creating dummy data
    np.random.seed(42)
    num_samples = 100
    input_dim = 3072  # 32 x 32 x 3 flattened image
    X_dummy = np.random.rand(num_samples, input_dim)
    y_dummy = np.random.randint(0, 10, size=num_samples)
    
    # Training the network with ReLU activation
    print("Training with ReLU activation:")
    W1_relu, b1_relu, W2_relu, b2_relu = train_network(
        X_dummy, y_dummy, hidden_dim=32, print_every=10
    )
    
    print("\nTraining with Sigmoid activation:")
    W1_sig, b1_sig, W2_sig, b2_sig = train_network(
        X_dummy, y_dummy, activation='sigmoid', hidden_dim=32, print_every=10
    )
    
    print("\nTraining with Tanh activation:")
    W1_tanh, b1_tanh, W2_tanh, b2_tanh = train_network(
        X_dummy, y_dummy, activation='tanh', hidden_dim=32, print_every=10
    )

Training with ReLU activation:


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for epoch in tqdm_notebook(range(epochs)):


  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 0: Loss = 9.0046, Accuracy = 0.1500
Epoch 10: Loss = 7.0999, Accuracy = 0.2400
Epoch 20: Loss = 6.9992, Accuracy = 0.1600
Epoch 30: Loss = 6.9588, Accuracy = 0.1700
Epoch 40: Loss = 6.2982, Accuracy = 0.3100
Epoch 50: Loss = 5.3769, Accuracy = 0.6100
Epoch 60: Loss = 3.6907, Accuracy = 0.8000
Epoch 70: Loss = 4.0706, Accuracy = 0.5300
Epoch 80: Loss = 3.3932, Accuracy = 0.3100
Epoch 90: Loss = 1.0197, Accuracy = 0.8600
Epoch 100: Loss = 1.2522, Accuracy = 0.2900
Epoch 110: Loss = 0.0119, Accuracy = 1.0000
Epoch 120: Loss = 0.0000, Accuracy = 1.0000
Epoch 130: Loss = 0.0000, Accuracy = 1.0000
Epoch 140: Loss = 0.0000, Accuracy = 1.0000
Epoch 150: Loss = 0.0000, Accuracy = 1.0000
Epoch 160: Loss = 0.0000, Accuracy = 1.0000
Epoch 170: Loss = 0.0000, Accuracy = 1.0000
Epoch 180: Loss = 0.0000, Accuracy = 1.0000
Epoch 190: Loss = 0.0000, Accuracy = 1.0000
Epoch 200: Loss = 0.0000, Accuracy = 1.0000
Epoch 210: Loss = 0.0000, Accuracy = 1.0000
Epoch 220: Loss = 0.0000, Accuracy = 1.0000

  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 0: Loss = 8.9371, Accuracy = 0.1400
Epoch 10: Loss = 7.2178, Accuracy = 0.1500
Epoch 20: Loss = 7.1360, Accuracy = 0.1600
Epoch 30: Loss = 7.0873, Accuracy = 0.1500
Epoch 40: Loss = 7.0979, Accuracy = 0.1600
Epoch 50: Loss = 7.0451, Accuracy = 0.1700
Epoch 60: Loss = 7.0655, Accuracy = 0.1600
Epoch 70: Loss = 7.0235, Accuracy = 0.1700
Epoch 80: Loss = 6.9957, Accuracy = 0.1600
Epoch 90: Loss = 6.9414, Accuracy = 0.1700
Epoch 100: Loss = 6.9072, Accuracy = 0.3200
Epoch 110: Loss = 6.8656, Accuracy = 0.3700
Epoch 120: Loss = 6.8273, Accuracy = 0.3500
Epoch 130: Loss = 6.7560, Accuracy = 0.4300
Epoch 140: Loss = 6.6907, Accuracy = 0.4100
Epoch 150: Loss = 6.6467, Accuracy = 0.4700
Epoch 160: Loss = 6.5435, Accuracy = 0.5500
Epoch 170: Loss = 6.4455, Accuracy = 0.5600
Epoch 180: Loss = 6.3593, Accuracy = 0.5700
Epoch 190: Loss = 6.2164, Accuracy = 0.5700
Epoch 200: Loss = 6.0906, Accuracy = 0.5700
Epoch 210: Loss = 5.9234, Accuracy = 0.5700
Epoch 220: Loss = 5.7396, Accuracy = 0.5900

  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 0: Loss = 8.9878, Accuracy = 0.1400
Epoch 10: Loss = 7.1421, Accuracy = 0.1500
Epoch 20: Loss = 7.0656, Accuracy = 0.3000
Epoch 30: Loss = 6.8524, Accuracy = 0.4100
Epoch 40: Loss = 6.6143, Accuracy = 0.5300
Epoch 50: Loss = 6.0951, Accuracy = 0.6400
Epoch 60: Loss = 5.8027, Accuracy = 0.6000
Epoch 70: Loss = 5.7977, Accuracy = 0.4900
Epoch 80: Loss = 2.5684, Accuracy = 0.2800
Epoch 90: Loss = 2.5504, Accuracy = 0.4000
Epoch 100: Loss = 2.1852, Accuracy = 0.2900
Epoch 110: Loss = 0.4759, Accuracy = 0.6900
Epoch 120: Loss = 0.0011, Accuracy = 1.0000
Epoch 130: Loss = 0.0000, Accuracy = 1.0000
Epoch 140: Loss = 0.0000, Accuracy = 1.0000
Epoch 150: Loss = 0.0000, Accuracy = 1.0000
Epoch 160: Loss = 0.0000, Accuracy = 1.0000
Epoch 170: Loss = 0.0000, Accuracy = 1.0000
Epoch 180: Loss = 0.0000, Accuracy = 1.0000
Epoch 190: Loss = 0.0000, Accuracy = 1.0000
Epoch 200: Loss = 0.0000, Accuracy = 1.0000
Epoch 210: Loss = 0.0000, Accuracy = 1.0000
Epoch 220: Loss = 0.0000, Accuracy = 1.0000

In [None]:
def train_network(X, y, activation='relu', epochs=1000, learning_rate=1e-2,
                  hidden_dim=64, print_every=10):
    """
    Train the neural network using the given dataset.
    Parameters:
      X: Training data of shape (N, 3072)
      y: Labels (N,)
      activation: Activation function ('relu', 'sigmoid', 'tanh')
      epochs: Number of training iterations
      learning_rate: Learning rate for optimization
      hidden_dim: Number of hidden layer neurons
      print_every: Print loss and accuracy every 'print_every' epochs
    Returns trained parameters: W1, b1, W2, b2
    """
    activation_funcs = {
        'sigmoid': (sigmoid, sigmoid_derivative),
        'tanh': (tanh, tanh_derivative),
        'relu': (relu, relu_derivative)
    }

    if activation not in activation_funcs:
        raise ValueError("Unsupported activation function. Choose from 'relu', 'sigmoid', or 'tanh'.")
    
    act_func, act_deriv = activation_funcs[activation]
    
    input_dim = X.shape[1]  # 3072 (32x32x3)
    output_dim = 10  # CIFAR-10 has 10 classes
    W1, b1, W2, b2 = initialize_parameters(input_dim, hidden_dim, output_dim)
    
    from tqdm import tqdm

    for epoch in tqdm(range(epochs), desc="Training Progress"):
        # Forward pass
        Z1, A1, scores = forward(X, W1, b1, W2, b2, act_func)
        loss, margins = svm_loss(scores, y)
        
        # Backward pass
        dW1, db1, dW2, db2 = backward(X, y, Z1, A1, scores, W2, act_deriv, margins)
        
        # Updating parameters
        W1, b1, W2, b2 = update_parameters(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate)
        
        if epoch % print_every == 0:
            y_pred = predict(X, W1, b1, W2, b2, act_func)
            acc = compute_accuracy(y_pred, y)
            print(f"Epoch {epoch}: Loss = {loss:.4f}, Accuracy = {acc:.4f}")
    
    return W1, b1, W2, b2

In [None]:
def test_network(X_test, W1, b1, W2, b2, activation='relu'):
    """
    Test the trained network on test data.
    Parameters:
      X_test: Test dataset (M, 3072)
      W1, b1, W2, b2: Trained model parameters
      activation: Activation function used during training
    Returns:
      Predicted labels for the test set
    """
    activation_funcs = {
        'sigmoid': sigmoid,
        'tanh': tanh,
        'relu': relu
    }

    if activation not in activation_funcs:
        raise ValueError("Unsupported activation function. Choose from 'relu', 'sigmoid', or 'tanh'.")
    
    act_func = activation_funcs[activation]
    
    # Forward pass
    _, _, scores = forward(X_test, W1, b1, W2, b2, act_func)
    predictions = np.argmax(scores, axis=1)  # Getting class with highest score

    return predictions


In [None]:
if __name__ == "__main__":
    # Loading CIFAR-10 data
    dataset_path = "./data/cifar-10"  
    X_train, y_train, X_test = load_cifar10_data(dataset_path)

    # Training the network using ReLU activation
    print("Training with ReLU activation:")
    W1_relu, b1_relu, W2_relu, b2_relu = train_network(
        X_train, y_train, hidden_dim=64, print_every=10
    )

    # Evaluating the model on test data
    print("\nEvaluating on Test Data:")
    test_predictions = test_network(X_test, W1_relu, b1_relu, W2_relu, b2_relu)
    print(f"Predicted Labels for Test Data: {test_predictions[:10]}")  # Print first 10 predictions