In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/t10k-labels-idx1-ubyte
/kaggle/input/t10k-images-idx3-ubyte
/kaggle/input/fashion-mnist_test.csv
/kaggle/input/fashion-mnist_train.csv
/kaggle/input/train-labels-idx1-ubyte
/kaggle/input/train-images-idx3-ubyte


In [3]:
import numpy as np
import os
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def load_idx_file(file_path):
    with open(file_path, 'rb') as f:
        data = np.frombuffer(f.read(), dtype=np.uint8)
    return data

def preprocess_images(images):
    """Preprocess images by normalizing and flattening them."""
    images = images.astype(np.float32) / 255.0  # Normalize images to [0, 1] range
    images = images.reshape(images.shape[0], -1)  # Flatten images
    return images

def load_and_preprocess_data(data_path):
    """Load and preprocess dataset."""
    train_images = load_idx_file(os.path.join(data_path, "train-images-idx3-ubyte"))[16:].reshape(-1, 28, 28)
    train_labels = load_idx_file(os.path.join(data_path, "train-labels-idx1-ubyte"))[8:]
    test_images = load_idx_file(os.path.join(data_path, "t10k-images-idx3-ubyte"))[16:].reshape(-1, 28, 28)
    test_labels = load_idx_file(os.path.join(data_path, "t10k-labels-idx1-ubyte"))[8:]
    
    train_images = preprocess_images(train_images)
    test_images = preprocess_images(test_images)
    
    val_size = int(0.2 * len(train_images))
    val_images, val_labels = train_images[:val_size], train_labels[:val_size]
    train_images, train_labels = train_images[val_size:], train_labels[val_size:]
    
    return train_images, train_labels, val_images, val_labels, test_images, test_labels

def create_batches(images, labels, batch_size):
    indices = np.arange(len(images))
    np.random.shuffle(indices)
    for start_idx in range(0, len(images), batch_size):
        end_idx = min(start_idx + batch_size, len(images))
        batch_indices = indices[start_idx:end_idx]
        yield images[batch_indices], labels[batch_indices]

class MLP:
    def __init__(self, input_size, layers, activations, initialization="random", dropout=0.0, learning_rate=0.01):
        self.layers = layers
        self.activations = activations
        self.dropout = dropout
        self.learning_rate = learning_rate
        self.params = self.initialize_weights(input_size, layers, initialization)
    
    def initialize_weights(self, input_size, layers, initialization):
        params = {}
        previous_size = input_size
        for i, layer_size in enumerate(layers):
            if initialization == "he":
                params[f'W{i}'] = np.random.randn(previous_size, layer_size) * np.sqrt(2 / previous_size)
            elif initialization == "glorot":
                params[f'W{i}'] = np.random.randn(previous_size, layer_size) * np.sqrt(1 / previous_size)
            else:
                params[f'W{i}'] = np.random.randn(previous_size, layer_size) * 0.01
            params[f'b{i}'] = np.zeros((1, layer_size))
            # For hidden layers, initialize batch norm parameters
            if i < len(layers) - 1:
                params[f'gamma{i}'] = np.ones((1, layer_size))
                params[f'beta{i}'] = np.zeros((1, layer_size))
            previous_size = layer_size
        return params

    def activation(self, x, func):
        if func == "relu":
            return np.maximum(0, x)
        elif func == "leaky_relu":
            return np.where(x > 0, x, 0.01 * x)
        elif func == "tanh":
            return np.tanh(x)
        elif func == "gelu":
            return 0.5 * x * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3))))
        # For any unsupported activation, return input as is.
        return x

    def activation_derivative(self, z, func):
        if func == "relu":
            return (z > 0).astype(np.float32)
        elif func == "leaky_relu":
            return np.where(z > 0, 1, 0.01)
        elif func == "tanh":
            return 1 - np.tanh(z) ** 2
        elif func == "gelu":
            # Approximate derivative for gelu
            c = 0.7978845608028654  # sqrt(2/pi)
            d = 0.044715
            tanh_out = np.tanh(c * (z + d * np.power(z, 3)))
            return 0.5 * (1 + tanh_out) + 0.5 * z * (1 - np.power(tanh_out, 2)) * (c + 3 * d * np.power(z, 2))
        return np.ones_like(z)

    def batchnorm_forward(self, z, layer_index, eps=1e-5):
        # Compute mean and variance over the batch (axis=0)
        mu = np.mean(z, axis=0, keepdims=True)
        var = np.var(z, axis=0, keepdims=True)
        z_norm = (z - mu) / np.sqrt(var + eps)
        gamma = self.params[f'gamma{layer_index}']
        beta = self.params[f'beta{layer_index}']
        out = gamma * z_norm + beta
        # Save values needed for backward pass
        bn_cache = (z, z_norm, mu, var, gamma, beta, eps)
        return out, bn_cache

    def batchnorm_backward(self, dout, bn_cache):
        z, z_norm, mu, var, gamma, beta, eps = bn_cache
        m = z.shape[0]
        dgamma = np.sum(dout * z_norm, axis=0, keepdims=True)
        dbeta = np.sum(dout, axis=0, keepdims=True)
        dz_norm = dout * gamma
        dvar = np.sum(dz_norm * (z - mu) * -0.5 * np.power(var + eps, -1.5), axis=0, keepdims=True)
        dmu = np.sum(dz_norm * -1/np.sqrt(var + eps), axis=0, keepdims=True) + dvar * np.mean(-2 * (z - mu), axis=0, keepdims=True)
        dz = dz_norm / np.sqrt(var + eps) + dvar * 2 * (z - mu) / m + dmu / m
        return dz, dgamma, dbeta

    def forward(self, x):
        cache = {"A0": x}
        for i in range(len(self.layers)):
            # Linear step
            z_linear = np.dot(cache[f'A{i}'], self.params[f'W{i}']) + self.params[f'b{i}']
            # For hidden layers, apply batch norm before activation
            if i < len(self.layers) - 1:
                z, bn_cache = self.batchnorm_forward(z_linear, i)
                cache[f'bn{i+1}'] = bn_cache
            else:
                z = z_linear
            cache[f'Z{i+1}'] = z
            # Activation step
            a = self.activation(z, self.activations[i])
            # Apply dropout on hidden layers if dropout > 0
            if i < len(self.layers) - 1 and self.dropout > 0:
                dropout_mask = (np.random.rand(*a.shape) > self.dropout) / (1 - self.dropout)
                a *= dropout_mask
                cache[f'dropout{i+1}'] = dropout_mask
            cache[f'A{i+1}'] = a
        return cache

    def compute_accuracy(self, predictions, labels):
        return np.mean(np.argmax(predictions, axis=1) == labels)

    def backward(self, cache, labels):
        m = labels.shape[0]
        grads = {}
        # Compute initial gradient from loss (assumes softmax cross-entropy loss)
        dA = cache[f'A{len(self.layers)}']
        dA[np.arange(m), labels] -= 1
        dA /= m
        
        for i in reversed(range(len(self.layers))):
            # If dropout was applied in the forward pass, backpropagate through it.
            if i < len(self.layers) - 1 and f'dropout{i+1}' in cache:
                dA = dA * cache[f'dropout{i+1}']
            # Backprop through activation
            dZ = dA * self.activation_derivative(cache[f'Z{i+1}'], self.activations[i])
            # If batch norm was applied, backprop through it
            if i < len(self.layers) - 1 and f'bn{i+1}' in cache:
                dZ, dgamma, dbeta = self.batchnorm_backward(dZ, cache[f'bn{i+1}'])
                grads[f'dgamma{i}'] = dgamma
                grads[f'dbeta{i}'] = dbeta
            A_prev = cache[f'A{i}']
            grads[f'dW{i}'] = np.dot(A_prev.T, dZ)
            grads[f'db{i}'] = np.sum(dZ, axis=0, keepdims=True)
            dA = np.dot(dZ, self.params[f'W{i}'].T)
        return grads

    def update_params(self, grads):
        for i in range(len(self.layers)):
            self.params[f'W{i}'] -= self.learning_rate * grads[f'dW{i}']
            self.params[f'b{i}'] -= self.learning_rate * grads[f'db{i}']
            if i < len(self.layers) - 1 and f'dgamma{i}' in grads:
                self.params[f'gamma{i}'] -= self.learning_rate * grads[f'dgamma{i}']
                self.params[f'beta{i}'] -= self.learning_rate * grads[f'dbeta{i}']

    def compute_loss(self, predictions, labels, epsilon=1e-12):
        m = labels.shape[0]
        # Compute softmax probabilities
        probs = np.exp(predictions - np.max(predictions, axis=1, keepdims=True))
        probs /= np.sum(probs, axis=1, keepdims=True)
        probs = np.clip(probs, epsilon, 1.0)  # Prevent log(0) issues
        log_likelihood = -np.log(probs[np.arange(m), labels])
        return np.sum(log_likelihood) / m

    def train(self, train_data, val_data, epochs=10):
        for epoch in range(epochs):
            total_loss, total_val_loss = 0, 0
            total_val_acc = 0
            for images, labels in train_data:
                cache = self.forward(images)
                loss = self.compute_loss(cache[f'A{len(self.layers)}'], labels)
                grads = self.backward(cache, labels)
                self.update_params(grads)
                total_loss += loss
            
            for val_images, val_labels in val_data:
                val_cache = self.forward(val_images)
                val_loss = self.compute_loss(val_cache[f'A{len(self.layers)}'], val_labels)
                val_acc = self.compute_accuracy(val_cache[f'A{len(self.layers)}'], val_labels)
                total_val_loss += val_loss
                total_val_acc += val_acc
            
            avg_train_loss = total_loss / len(train_data)
            avg_val_loss = total_val_loss / len(val_data)
            avg_val_acc = total_val_acc / len(val_data)
            print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {avg_val_acc:.4f}")

    

    def evaluate(self, test_data):
        correct, total = 0, 0
        all_predictions = []
        all_labels = []
    
        for images, labels in test_data:
            cache = self.forward(images)
            predictions = np.argmax(cache[f'A{len(self.layers)}'], axis=1)
            correct += np.sum(predictions == labels)
            total += labels.shape[0]
            
            all_predictions.extend(predictions)
            all_labels.extend(labels)
    
        accuracy = accuracy_score(all_labels, all_predictions)
        precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_predictions, average='weighted')
    
        print(f"Test Accuracy: {accuracy * 100:.2f}%")
        print(f"Precision: {precision:.2f}")
        print(f"Recall: {recall:.2f}")
        print(f"F1 Score: {f1:.2f}")


# Load dataset with preprocessing
data_path = "/kaggle/input/"
train_images, train_labels, val_images, val_labels, test_images, test_labels = load_and_preprocess_data(data_path)

BATCH_SIZE = 32
train_batches = list(create_batches(train_images, train_labels, BATCH_SIZE))
val_batches = list(create_batches(val_images, val_labels, BATCH_SIZE))
test_batches = list(create_batches(test_images, test_labels, BATCH_SIZE))



In [4]:
## first we try to fix initialisation then other things

In [5]:
mlp = MLP(input_size=28*28, layers=[256, 128, 64, 10], activations=["gelu", "gelu" , "gelu", "softmax"], initialization="he", dropout=0.1, learning_rate=0.05)
mlp.train(train_batches, val_batches, epochs=20)
mlp.evaluate(test_batches)

mlp = MLP(input_size=28*28, layers=[256, 128, 64, 10], activations=["gelu", "gelu" , "gelu", "softmax"], initialization="random", dropout=0.1, learning_rate=0.05)
mlp.train(train_batches, val_batches, epochs=20)
mlp.evaluate(test_batches)

mlp = MLP(input_size=28*28, layers=[256, 128, 64, 10], activations=["gelu", "gelu" , "gelu", "softmax"], initialization="glorot", dropout=0.1, learning_rate=0.05)
mlp.train(train_batches, val_batches, epochs=20)
mlp.evaluate(test_batches)


Epoch 1, Train Loss: 1.8562, Val Loss: 1.8064, Val Accuracy: 0.7913
Epoch 2, Train Loss: 1.7821, Val Loss: 1.7703, Val Accuracy: 0.8142
Epoch 3, Train Loss: 1.7578, Val Loss: 1.7492, Val Accuracy: 0.8291
Epoch 4, Train Loss: 1.7434, Val Loss: 1.7440, Val Accuracy: 0.8315
Epoch 5, Train Loss: 1.7321, Val Loss: 1.7333, Val Accuracy: 0.8367
Epoch 6, Train Loss: 1.7241, Val Loss: 1.7273, Val Accuracy: 0.8408
Epoch 7, Train Loss: 1.7170, Val Loss: 1.7204, Val Accuracy: 0.8453
Epoch 8, Train Loss: 1.7101, Val Loss: 1.7162, Val Accuracy: 0.8489
Epoch 9, Train Loss: 1.7053, Val Loss: 1.7087, Val Accuracy: 0.8487
Epoch 10, Train Loss: 1.7003, Val Loss: 1.7096, Val Accuracy: 0.8539
Epoch 11, Train Loss: 1.6963, Val Loss: 1.7081, Val Accuracy: 0.8563
Epoch 12, Train Loss: 1.6931, Val Loss: 1.6991, Val Accuracy: 0.8577
Epoch 13, Train Loss: 1.6887, Val Loss: 1.6973, Val Accuracy: 0.8588
Epoch 14, Train Loss: 1.6850, Val Loss: 1.6958, Val Accuracy: 0.8599
Epoch 15, Train Loss: 1.6819, Val Loss: 1.6

In [6]:
## now other combinations as required, we fix random as it performed best above

In [8]:
mlp = MLP(input_size=28*28, layers=[256, 128, 64, 10], activations=["gelu", "gelu" , "gelu", "softmax"], initialization="random", dropout=0.2, learning_rate=0.05)
mlp.train(train_batches, val_batches, epochs=20)
mlp.evaluate(test_batches)

mlp = MLP(input_size=28*28, layers=[256, 128, 64, 10], activations=["gelu", "gelu" , "gelu", "softmax"], initialization="random", dropout=0.2, learning_rate=0.05)
mlp.train(train_batches, val_batches, epochs=20)
mlp.evaluate(test_batches)

mlp = MLP(input_size=28*28, layers=[256, 128, 64, 10], activations=["gelu", "gelu" , "gelu", "softmax"], initialization="random", dropout=0.1, learning_rate=0.05)
mlp.train(train_batches, val_batches, epochs=20)
mlp.evaluate(test_batches)

mlp = MLP(input_size=28*28, layers=[256, 128, 64, 10], activations=["gelu", "gelu" , "gelu", "softmax"], initialization="random", dropout=0.05, learning_rate=0.05)
mlp.train(train_batches, val_batches, epochs=20)
mlp.evaluate(test_batches)

mlp = MLP(input_size=28*28, layers=[256, 128, 64, 10], activations=["gelu", "gelu" , "gelu", "softmax"], initialization="random", dropout=0.01, learning_rate=0.05)
mlp.train(train_batches, val_batches, epochs=20)
mlp.evaluate(test_batches)

mlp = MLP(input_size=28*28, layers=[256, 128, 64, 10], activations=["gelu", "gelu" , "gelu", "softmax"], initialization="random", dropout=0.3, learning_rate=0.05)
mlp.train(train_batches, val_batches, epochs=20)
mlp.evaluate(test_batches)

mlp = MLP(input_size=28*28, layers=[256, 128, 64, 10], activations=["gelu", "gelu" , "gelu", "softmax"], initialization="random", dropout=0.4, learning_rate=0.05)
mlp.train(train_batches, val_batches, epochs=20)
mlp.evaluate(test_batches)

mlp = MLP(input_size=28*28, layers=[256, 128, 64, 10], activations=["gelu", "gelu" , "gelu", "softmax"], initialization="random", dropout=0.5, learning_rate=0.05)
mlp.train(train_batches, val_batches, epochs=20)
mlp.evaluate(test_batches)

mlp = MLP(input_size=28*28, layers=[256, 128, 64, 10], activations=["gelu", "gelu" , "gelu", "softmax"], initialization="random", dropout=0.1, learning_rate=0.01)
mlp.train(train_batches, val_batches, epochs=20)
mlp.evaluate(test_batches)

mlp = MLP(input_size=28*28, layers=[256, 128, 64, 10], activations=["gelu", "gelu" , "gelu", "softmax"], initialization="random", dropout=0.1, learning_rate=0.001)
mlp.train(train_batches, val_batches, epochs=20)
mlp.evaluate(test_batches)

mlp = MLP(input_size=28*28, layers=[256, 128, 64, 10], activations=["gelu", "gelu" , "gelu", "softmax"], initialization="random", dropout=0.1, learning_rate=0.2)
mlp.train(train_batches, val_batches, epochs=20)
mlp.evaluate(test_batches)

mlp = MLP(input_size=28*28, layers=[256, 128, 64, 10], activations=["gelu", "gelu" , "gelu", "softmax"], initialization="random", dropout=0.1, learning_rate=0.3)
mlp.train(train_batches, val_batches, epochs=20)
mlp.evaluate(test_batches)

mlp = MLP(input_size=28*28, layers=[256, 128, 64, 10], activations=["gelu", "gelu" , "gelu", "softmax"], initialization="random", dropout=0.1, learning_rate=0.4)
mlp.train(train_batches, val_batches, epochs=20)
mlp.evaluate(test_batches)

mlp = MLP(input_size=28*28, layers=[256, 128, 64, 10], activations=["gelu", "gelu" , "gelu", "softmax"], initialization="random", dropout=0.1, learning_rate=0.05)
mlp.train(train_batches, val_batches, epochs=20)
mlp.evaluate(test_batches)

mlp = MLP(input_size=28*28, layers=[256, 128, 64, 10], activations=["relu", "relu" , "relu", "softmax"], initialization="random", dropout=0.1, learning_rate=0.05)
mlp.train(train_batches, val_batches, epochs=20)
mlp.evaluate(test_batches)

mlp = MLP(input_size=28*28, layers=[256, 128, 64, 10], activations=["leaky_relu", "leaky_relu" , "leaky_relu", "softmax"], initialization="random", dropout=0.1, learning_rate=0.05)
mlp.train(train_batches, val_batches, epochs=20)
mlp.evaluate(test_batches)

mlp = MLP(input_size=28*28, layers=[256, 128, 64, 10], activations=["tanh", "tanh" , "tanh", "softmax"], initialization="random", dropout=0.1, learning_rate=0.05)
mlp.train(train_batches, val_batches, epochs=20)
mlp.evaluate(test_batches)

mlp = MLP(input_size=28*28, layers=[512,256, 128, 64, 10], activations=["gelu","gelu", "gelu" , "gelu", "softmax"], initialization="random", dropout=0.1, learning_rate=0.05)
mlp.train(train_batches, val_batches, epochs=20)
mlp.evaluate(test_batches)

mlp = MLP(input_size=28*28, layers=[1024,512,256, 128, 64, 10], activations=["gelu","gelu","gelu", "gelu" , "gelu", "softmax"], initialization="random", dropout=0.1, learning_rate=0.05)
mlp.train(train_batches, val_batches, epochs=20)
mlp.evaluate(test_batches)

mlp = MLP(input_size=28*28, layers=[128, 64, 10], activations=["gelu","gelu","softmax"], initialization="random", dropout=0.1, learning_rate=0.05)
mlp.train(train_batches, val_batches, epochs=20)
mlp.evaluate(test_batches)

mlp = MLP(input_size=28*28, layers=[64, 10], activations=["gelu","softmax"], initialization="random", dropout=0.1, learning_rate=0.05)
mlp.train(train_batches, val_batches, epochs=20)
mlp.evaluate(test_batches)

Epoch 1, Train Loss: 1.7732, Val Loss: 1.7309, Val Accuracy: 0.8302
Epoch 2, Train Loss: 1.7200, Val Loss: 1.7188, Val Accuracy: 0.8438
Epoch 3, Train Loss: 1.7044, Val Loss: 1.7044, Val Accuracy: 0.8502
Epoch 4, Train Loss: 1.6948, Val Loss: 1.7022, Val Accuracy: 0.8548
Epoch 5, Train Loss: 1.6874, Val Loss: 1.6932, Val Accuracy: 0.8628
Epoch 6, Train Loss: 1.6796, Val Loss: 1.6858, Val Accuracy: 0.8606
Epoch 7, Train Loss: 1.6749, Val Loss: 1.6869, Val Accuracy: 0.8638
Epoch 8, Train Loss: 1.6698, Val Loss: 1.6879, Val Accuracy: 0.8669
Epoch 9, Train Loss: 1.6658, Val Loss: 1.6853, Val Accuracy: 0.8681
Epoch 10, Train Loss: 1.6626, Val Loss: 1.6761, Val Accuracy: 0.8702
Epoch 11, Train Loss: 1.6598, Val Loss: 1.6772, Val Accuracy: 0.8711
Epoch 12, Train Loss: 1.6563, Val Loss: 1.6650, Val Accuracy: 0.8720
Epoch 13, Train Loss: 1.6531, Val Loss: 1.6793, Val Accuracy: 0.8718
Epoch 14, Train Loss: 1.6503, Val Loss: 1.6791, Val Accuracy: 0.8717
Epoch 15, Train Loss: 1.6473, Val Loss: 1.6