In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


In [4]:
import torchvision.transforms as transforms

transform = transforms.Compose([             
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

In [5]:
train_dataset = datasets.FashionMNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.FashionMNIST(root='./data', train=False, download=True, transform=transform)


val_size = int(0.1 * len(train_dataset))
train_size = len(train_dataset) - val_size
train_dataset, val_dataset = random_split(train_dataset, [train_size, val_size])


train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 26.4M/26.4M [00:00<00:00, 119MB/s]


Extracting ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 29.5k/29.5k [00:00<00:00, 4.23MB/s]


Extracting ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 4.42M/4.42M [00:00<00:00, 61.9MB/s]


Extracting ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 5.15k/5.15k [00:00<00:00, 8.30MB/s]


Extracting ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw



In [6]:
import torch
import torch.nn as nn
import math

class MLP(nn.Module):
    def __init__(self, input_size, layers, activations, dropout, learning_rate, flag=False, device = "cpu"):
        super(MLP, self).__init__()
        self.input_size = input_size
        self.layers = layers  
        self.activations = activations 
        self.dropout_rate = dropout
        self.learning_rate = learning_rate
        self.flag = flag
        self.device = device
        self.build_model()

    def build_model(self):
        self.weights = nn.ParameterList()
        self.biases = nn.ParameterList()
        self.batch_norm_params = []
        layer_sizes = [self.input_size] + self.layers

        for i in range(len(layer_sizes) - 1):
            W = nn.Parameter(torch.randn(layer_sizes[i], layer_sizes[i+1]) * math.sqrt(2 / (layer_sizes[i] + layer_sizes[i+1])))
            b = nn.Parameter(torch.zeros(layer_sizes[i+1]))
            self.weights.append(W)
            self.biases.append(b)

            if i < len(layer_sizes) - 2:
                gamma = nn.Parameter(torch.ones(layer_sizes[i+1]))
                beta = nn.Parameter(torch.zeros(layer_sizes[i+1]))
                self.batch_norm_params.append((gamma, beta))

        self.dropout = nn.Dropout(p=self.dropout_rate)

    def batch_norm_forward(self, x, gamma, beta, eps):
        # Ensure input and parameters are on the same device
        x = x.to(self.device)
        gamma = gamma.to(self.device)
        beta = beta.to(self.device)
        
        mean = x.mean(dim=0, keepdim=True)
        var = x.var(dim=0, keepdim=True, unbiased=False)
        x_norm = (x - mean) / torch.sqrt(var + eps)
        out = gamma * x_norm + beta
        cache = (x, x_norm, mean, var, gamma, beta, eps)
        return out, cache

    def batch_norm_backward(self, dout, cache):
        x, x_norm, mean, var, gamma, beta, eps = cache
        N, D = dout.shape

        dx_norm = dout * gamma
        dvar = torch.sum(dx_norm * (x - mean) * -0.5 * torch.pow(var + eps, -1.5), dim=0)
        dmean = torch.sum(dx_norm * -1 / torch.sqrt(var + eps), dim=0) + dvar * torch.mean(-2 * (x - mean), dim=0)

        dx = (dx_norm / torch.sqrt(var + eps)) + (dvar * 2 * (x - mean) / N) + (dmean / N)
        dgamma = torch.sum(dout * x_norm, dim=0)
        dbeta = torch.sum(dout, dim=0)

        return dx, dgamma, dbeta

    def forward(self, x):
        self.a_values = []  
        self.z_values = []  
        self.bn_caches = []
        a = x  

        for i in range(len(self.weights)):
            W = self.weights[i]
            b = self.biases[i]
            
            z = torch.matmul(a, W) + b
            self.z_values.append(z)
            
            if i < len(self.weights) - 1:
                gamma, beta = self.batch_norm_params[i]
                z, bn_cache = self.batch_norm_forward(z, gamma, beta,1e-5)
                self.bn_caches.append(bn_cache)
            
            activation = self.activations[i]

            if i == len(self.weights) - 1 and self.flag == True:
                a = z
            else:
                if activation == "relu":
                    a = self.relu(z)
                elif activation == "leaky_relu":
                    a = self.leaky_relu(z)
                elif activation == "tanh":
                    a = self.tanh(z)
                elif activation == "gelu":
                    a = self.gelu(z)
                elif activation == "softmax":
                    a = self.softmax(z)
                else:
                    raise ValueError(f"Unsupported activation: {activation}")

                if i < len(self.weights) - 1:
                    a = self.dropout(a)

            self.a_values.append(a)

        return a

    def relu(self, z):
        return torch.maximum(z, torch.zeros_like(z))

    def relu_derivative(self, z):
        return torch.where(z > 0, torch.ones_like(z), torch.zeros_like(z))

    def leaky_relu(self, z, negative_slope=0.01):
        return torch.where(z > 0, z, negative_slope * z)

    def leaky_relu_derivative(self, z, negative_slope=0.01):
        return torch.where(z > 0, torch.ones_like(z), negative_slope * torch.ones_like(z))

    def tanh(self, z):
        return torch.tanh(z)

    def tanh_derivative(self, z):
        return 1 - torch.tanh(z) ** 2

    def gelu(self, z):
        return 0.5 * z * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (z + 0.044715 * torch.pow(z, 3))))

    def gelu_derivative(self, z):
        tanh_part = torch.tanh(math.sqrt(2.0 / math.pi) * (z + 0.044715 * torch.pow(z, 3)))
        factor = 0.5 * (1.0 + tanh_part)
        derivative = factor + (z * (1 - tanh_part ** 2) * (math.sqrt(2.0 / math.pi) + 0.134145 * z ** 2))
        return derivative

    def softmax(self, z):
        z_exp = torch.exp(z - torch.max(z, dim=1, keepdim=True)[0])
        return z_exp / torch.sum(z_exp, dim=1, keepdim=True)

    def softmax_derivative(self, z):
        s = self.softmax(z)
        return s * (1 - s)

    def apply_dropout(self, a, rate):
        if rate > 0:
            dropout_mask = (torch.rand_like(a) > rate).float()
            a = dropout_mask * a / (1.0 - rate)
        return a

    def train_model(self, train_batches, val_batches, epochs):
        for epoch in range(epochs):
            self.train()  
            total_loss = 0

            for x_batch, y_batch in train_batches:
                x_batch = torch.tensor(x_batch, dtype=torch.float32)
                y_batch = torch.tensor(y_batch, dtype=torch.long)

                outputs = self.forward(x_batch)

                loss = self.compute_loss(outputs, y_batch)
                total_loss += loss.item()

                self.backward(x_batch, y_batch)

                with torch.no_grad():
                    for i in range(len(self.weights)):
                        self.weights[i] -= self.learning_rate * self.grad_weights[i]
                        self.biases[i] -= self.learning_rate * self.grad_biases[i]

                    for i, (gamma, beta) in enumerate(self.batch_norm_params):
                        gamma_grad = self.grad_batch_norm_params[i][0]
                        beta_grad = self.grad_batch_norm_params[i][1]
                        gamma -= self.learning_rate * gamma_grad
                        beta -= self.learning_rate * beta_grad

            avg_loss = total_loss / len(train_batches)
            print(f'Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}')

            val_accuracy = self.evaluate(val_batches)
            print(f'Validation Accuracy: {val_accuracy:.2f}%\n')

    def backward(self, x, y):
        m = y.shape[0]

        self.grad_weights = [torch.zeros_like(W) for W in self.weights]
        self.grad_biases = [torch.zeros_like(b) for b in self.biases]
        self.grad_batch_norm_params = []

        a_final = self.a_values[-1]
        delta = self.loss_derivative(a_final, y)

        for i in reversed(range(len(self.weights))):
            a_prev = x if i == 0 else self.a_values[i - 1]

            if i < len(self.weights) - 1:
                delta, dgamma, dbeta = self.batch_norm_backward(delta, self.bn_caches[i])
                self.grad_batch_norm_params.insert(0, (dgamma, dbeta))

            self.grad_weights[i] = torch.matmul(a_prev.T, delta) / m
            self.grad_biases[i] = torch.sum(delta, dim=0) / m

            if i != 0:
                W = self.weights[i]
                z = self.z_values[i - 1]

                activation = self.activations[i - 1]
                if activation == "relu":
                    delta = torch.matmul(delta, W.T) * self.relu_derivative(z)
                elif activation == "leaky_relu":
                    delta = torch.matmul(delta, W.T) * self.leaky_relu_derivative(z)
                elif activation == "tanh":
                    delta = torch.matmul(delta, W.T) * self.tanh_derivative(z)
                elif activation == "gelu":
                    delta = torch.matmul(delta, W.T) * self.gelu_derivative(z)
                elif activation == "softmax":
                    delta = torch.matmul(delta, W.T) * self.softmax_derivative(z)
                else:
                    raise ValueError(f"Unsupported activation: {activation}")

                delta = delta * self.dropout_derivative(self.a_values[i - 1])

    def dropout_derivative(self, a):
        return (a != 0).float()

    def loss_derivative(self, a_final, y):
        y_one_hot = torch.zeros_like(a_final)
        y_one_hot.scatter_(1, y.unsqueeze(1), 1)
        return a_final - y_one_hot

    def compute_loss(self, outputs, targets):
        epsilon = 1e-12
        outputs = torch.clamp(outputs, epsilon, 1. - epsilon)

        targets_one_hot = torch.zeros_like(outputs)
        targets_one_hot.scatter_(1, targets.unsqueeze(1), 1)

        loss = -torch.mean(torch.sum(targets_one_hot * torch.log(outputs), dim=1))
        return loss

    def evaluate(self, test_batches):
        self.eval()
        correct = 0
        total = 0

        with torch.no_grad():
            for x_batch, y_batch in test_batches:
                x_batch = torch.tensor(x_batch, dtype=torch.float32)
                y_batch = torch.tensor(y_batch, dtype=torch.long)

                outputs = self.forward(x_batch)

                _, predicted = torch.max(outputs.data, 1)

                total += y_batch.size(0)
                correct += (predicted == y_batch).sum().item()

        accuracy = 100 * correct / total
        return accuracy



In [7]:
class CNNModel(nn.Module):
    def __init__(self, kernel_sizes, strides, paddings, init_method="random"):
        super(CNNModel, self).__init__()
        self.kernel_sizes = kernel_sizes
        self.strides = strides
        self.paddings = paddings
        self.init_method = init_method
        self.input_size = 28
        self.conv1 = nn.Conv2d(1, 10, kernel_size=kernel_sizes[0], stride=strides[0], padding=paddings[0])
        self.bn1 = nn.BatchNorm2d(10)
        
        self.conv2 = nn.Conv2d(10, 20, kernel_size=kernel_sizes[1], stride=strides[1], padding=paddings[1])
        self.bn2 = nn.BatchNorm2d(20)
        
        self.conv3 = nn.Conv2d(20, 40, kernel_size=kernel_sizes[2], stride=strides[2], padding=paddings[2])
        self.bn3 = nn.BatchNorm2d(40)
        
        self.conv4 = nn.Conv2d(40,64, kernel_size=kernel_sizes[3], stride=strides[3], padding=paddings[3])
        self.bn4 = nn.BatchNorm2d(64)
        
        self.conv5 = nn.Conv2d(64, 128, kernel_size=kernel_sizes[4], stride=strides[4], padding=paddings[4])
        self.bn5 = nn.BatchNorm2d(128)
        
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.op_size = self.calculate_output_size()
        self.mlp = MLP(
            input_size=self.op_size * self.op_size * 128,
            layers=[ 128, 64, 10],
            activations=["relu", "relu", "softmax"],
            dropout=0.1,
            learning_rate=0.05,
            flag=True,
            device = device
        )
        self.dropout = nn.Dropout(0.45)
        
        self.initialize_weights()
        
        
    def calculate_output_size(self):
        size = self.input_size
        
        for i in range(2):
            size = ((size + 2 * self.paddings[i] - self.kernel_sizes[i]) // self.strides[i]) + 1
            size = ((size - 2) // 2) + 1  # Assuming max pooling with kernel size 2 and stride 2
        size = ((size - 2) // 2) + 1
        return size
      
    
        return size
    def initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
                if self.init_method == "xavier":
                    nn.init.xavier_uniform_(m.weight)
                elif self.init_method == "he":
                    nn.init.kaiming_uniform_(m.weight, nonlinearity='relu')
                else:
                    nn.init.uniform_(m.weight, -0.1, 0.1)
                
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
        
    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.pool(x)
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.pool(x)
        x = F.relu(self.bn3(self.conv3(x)))
        x = self.dropout(x)
        x = F.relu(self.bn4(self.conv4(x)))
        x = self.dropout(x)
        x = F.relu(self.bn5(self.conv5(x)))
        x = self.dropout(x)
        x = self.pool(x)
        x = x.view(-1, 128 * self.op_size * self.op_size)
        x = self.mlp.forward(x)
        return x


In [8]:
kernel_sizes = [3, 3, 3, 3, 3]
strides = [1, 1, 1, 1, 1]
paddings = [1, 1, 1, 1, 1]
model = CNNModel(kernel_sizes, strides, paddings, "he").to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [9]:
def train(model, train_loader, criterion, optimizer, epoch):
    model.train()
    running_loss = 0.0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if batch_idx % 100 == 99:
            print(f'Epoch {epoch}, Batch {batch_idx + 1}, Loss: {running_loss / 100:.6f}')
            running_loss = 0.0


from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def validate(model, val_loader, criterion):
    model.eval()
    val_loss = 0.0
    correct = 0
    all_predictions = []
    all_targets = []

    with torch.no_grad():
        for data, target in val_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            val_loss += criterion(output, target).item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
            
            all_predictions.extend(pred.cpu().numpy())
            all_targets.extend(target.cpu().numpy())

    val_loss /= len(val_loader.dataset)
    val_accuracy = 100. * correct / len(val_loader.dataset)

    precision, recall, f1, _ = precision_recall_fscore_support(all_targets, all_predictions, average='weighted')
    accuracy = accuracy_score(all_targets, all_predictions)

    print(f'loss: {val_loss:.6f}, Accuracy: {val_accuracy:.2f}%')
    print(f'Precision: {precision:.2f}')
    print(f'Recall: {recall:.2f}')
    print(f'F1 Score: {f1:.2f}')

    return val_loss, val_accuracy


In [10]:
num_epochs = 20
best_val_loss = float('inf')
for epoch in range(1, num_epochs + 1):
    train(model, train_loader, criterion, optimizer, epoch)
    val_loss, val_accuracy = validate(model, val_loader, criterion)
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_model.pth')

print('Training complete!')

Epoch 1, Batch 100, Loss: 1.552390
Epoch 1, Batch 200, Loss: 0.952110
Epoch 1, Batch 300, Loss: 0.810496
Epoch 1, Batch 400, Loss: 0.752942
Epoch 1, Batch 500, Loss: 0.721206
Epoch 1, Batch 600, Loss: 0.692704
Epoch 1, Batch 700, Loss: 0.633869
Epoch 1, Batch 800, Loss: 0.626900
Epoch 1, Batch 900, Loss: 0.601603
Epoch 1, Batch 1000, Loss: 0.596461
Epoch 1, Batch 1100, Loss: 0.613794
Epoch 1, Batch 1200, Loss: 0.557662
Epoch 1, Batch 1300, Loss: 0.525827
Epoch 1, Batch 1400, Loss: 0.537548
Epoch 1, Batch 1500, Loss: 0.530760
Epoch 1, Batch 1600, Loss: 0.505219
loss: 0.012892, Accuracy: 85.18%
Precision: 0.85
Recall: 0.85
F1 Score: 0.85
Epoch 2, Batch 100, Loss: 0.500540
Epoch 2, Batch 200, Loss: 0.487344
Epoch 2, Batch 300, Loss: 0.471153
Epoch 2, Batch 400, Loss: 0.477912
Epoch 2, Batch 500, Loss: 0.478380
Epoch 2, Batch 600, Loss: 0.441333
Epoch 2, Batch 700, Loss: 0.441646
Epoch 2, Batch 800, Loss: 0.456779
Epoch 2, Batch 900, Loss: 0.433252
Epoch 2, Batch 1000, Loss: 0.448515
Epoch

In [11]:
model.load_state_dict(torch.load('best_model.pth'))
print("test performance metrics")
test_loss, test_accuracy = validate(model, test_loader, criterion)

test performance metrics


  model.load_state_dict(torch.load('best_model.pth'))


loss: 0.007499, Accuracy: 91.39%
Precision: 0.91
Recall: 0.91
F1 Score: 0.91
