In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/digit-recognizer/sample_submission.csv
/kaggle/input/digit-recognizer/train.csv
/kaggle/input/digit-recognizer/test.csv


# 1. Import and Setup

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import random

# 2. Random Seeds for Reproducibility

In [3]:
def set_seed(seed):
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

set_seed(2023)

# 3. Load Dataset

In [4]:
train_df = pd.read_csv("/kaggle/input/digit-recognizer/train.csv")
test_df = pd.read_csv("/kaggle/input/digit-recognizer/test.csv")

print("Train CSV shape:", train_df.shape)
print("Test CSV shape:", test_df.shape)

Train CSV shape: (42000, 785)
Test CSV shape: (28000, 784)


# 4. Creat a Custom Dataset

In [5]:
class myDataset(Dataset):
    def __init__(self, df, transform=None, is_test=False):
        """
        df: DataFrame of shape (N, 785) if labeled, or (N, 784) if test.
        transform: torchvision transform for data augmentation/preprocessing.
        is_test: Whether this Dataset is for the test set (no labels).
        """
        self.is_test = is_test
        self.transform = transform

        if not self.is_test:
            self.labels = df['label'].values
            self.data = df.drop(columns=['label']).values.astype(np.uint8).reshape(-1, 28, 28)
        else:
            self.labels = None
            self.data = df.values.astype(np.uint8).reshape(-1, 28, 28)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img = self.data[idx]

        if self.transform:
            img = self.transform(img)

        if self.is_test:
            return img
        else:
            return img, self.labels[idx]

# 5. Define Transforms

In [6]:
def get_transforms(is_train=True, rotation=10):
    transform_list = [transforms.ToPILImage()]
    
    if is_train and rotation > 0:
        transform_list.append(transforms.RandomRotation(degrees=rotation))

    transform_list.extend([
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,))
    ])

    return transforms.Compose(transform_list)

# transform
train_transform = get_transforms(is_train=True, rotation=15)
test_transform = get_transforms(is_train=False)

# 6. Create PyTorch Datasets & DataLoaders

In [7]:
# Full training and testing dataset
train_dataset = myDataset(train_df, transform=train_transform, is_test=False)
test_dataset = myDataset(test_df, transform=test_transform, is_test=True)

# Random split with train 80% and validation 20%
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_dataset, val_dataset = random_split(train_dataset, [train_size, val_size])

# For validation use test_transform
val_dataset.dataset.transform = test_transform

print("Train dataset size:", len(train_dataset))
print("Validation dataset size:", len(val_dataset))
print("Kaggle test dataset size:", len(test_dataset))

# Create DataLoaders for each dataset
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

Train dataset size: 33600
Validation dataset size: 8400
Kaggle test dataset size: 28000


# 7. Define CNN Model

In [8]:
class ModuleListCNN(nn.Module):
    def __init__(self):
        super(ModuleListCNN, self).__init__()
        
        # Define CNN Blocks
        self.conv_blocks = nn.ModuleList([
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(32, 32, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),
            nn.Dropout(0.25)
        ])

        self.fc_blocks = nn.ModuleList([
            nn.Linear(32 * 14 * 14, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(0.25),
            nn.Linear(128, 10)
        ])

    def forward(self, x):
        for layer in self.conv_blocks:
            x = layer(x)  # forward
        
        x = x.view(x.size(0), -1)  # Flatten

        for layer in self.fc_blocks:
            x = layer(x)

        return x

# Check for CUDA availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Instantiate the model
model = ModuleListCNN().to(device)

Using device: cpu


# 8. Define Loss and Optimizer

In [9]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Optional learning rate scheduler: step down by factor 0.1 every 10 epochs
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

# 9. Train and Validation

In [10]:
import time

# Function to train one epoch
def train_one_epoch(model, train_loader, optimizer, criterion, device, scaler):
    model.train()
    running_loss, correct, total = 0.0, 0, 0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        with torch.cuda.amp.autocast():  # Automatic Mixed Precision (AMP) for faster training
            outputs = model(images)
            loss = criterion(outputs, labels)

        scaler.scale(loss).backward()  # Scale loss to prevent underflow/overflow
        scaler.step(optimizer)
        scaler.update()

        running_loss += loss.item()
        correct += (outputs.argmax(1) == labels).sum().item()
        total += labels.size(0)

    return running_loss / len(train_loader), 100.0 * correct / total


# Function to validate the model
def validate(model, val_loader, criterion, device):
    model.eval()
    running_loss, correct, total = 0.0, 0, 0

    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)

            running_loss += loss.item()
            correct += (outputs.argmax(1) == labels).sum().item()
            total += labels.size(0)

    return running_loss / len(val_loader), 100.0 * correct / total


# Function to train and validate the model
def train_and_validate(model, train_loader, val_loader, optimizer, criterion, scheduler, device, num_epochs=15):
    scaler = torch.amp.GradScaler()  # Enable Automatic Mixed Precision (AMP)
    best_val_acc = 0.0  # Store the best validation accuracy
    start_time = time.time()  # Start timer

    for epoch in range(1, num_epochs + 1):
        train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, criterion, device, scaler)
        val_loss, val_acc = validate(model, val_loader, criterion, device)

        # Update learning rate
        if scheduler:
            scheduler.step()

        # Display training progress
        print(f"Epoch [{epoch}/{num_epochs}] | "
              f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}% | "
              f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}%")

        # Save the best model based on validation accuracy
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), "best_model.pth")

        # Monitor GPU memory usage
        if torch.cuda.is_available():
            print(f"GPU Memory Usage: {torch.cuda.memory_reserved(device) / 1e9:.2f} GB")

    total_time = time.time() - start_time
    print(f"Training Completed in {total_time:.2f} seconds")


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ModuleListCNN().to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

# Start training
train_and_validate(model, train_loader, val_loader, optimizer, criterion, scheduler, device, num_epochs=15)

  with torch.cuda.amp.autocast():  # Automatic Mixed Precision (AMP) for faster training


Epoch [1/15] | Train Loss: 0.2696 | Train Acc: 91.75% | Val Loss: 0.0749 | Val Acc: 97.80%
Epoch [2/15] | Train Loss: 0.0800 | Train Acc: 97.53% | Val Loss: 0.0545 | Val Acc: 98.38%
Epoch [3/15] | Train Loss: 0.0572 | Train Acc: 98.21% | Val Loss: 0.0493 | Val Acc: 98.61%
Epoch [4/15] | Train Loss: 0.0428 | Train Acc: 98.59% | Val Loss: 0.0473 | Val Acc: 98.65%
Epoch [5/15] | Train Loss: 0.0362 | Train Acc: 98.82% | Val Loss: 0.0413 | Val Acc: 98.85%
Epoch [6/15] | Train Loss: 0.0213 | Train Acc: 99.35% | Val Loss: 0.0412 | Val Acc: 98.96%
Epoch [7/15] | Train Loss: 0.0186 | Train Acc: 99.38% | Val Loss: 0.0410 | Val Acc: 98.93%
Epoch [8/15] | Train Loss: 0.0169 | Train Acc: 99.40% | Val Loss: 0.0398 | Val Acc: 98.95%
Epoch [9/15] | Train Loss: 0.0149 | Train Acc: 99.49% | Val Loss: 0.0415 | Val Acc: 99.01%
Epoch [10/15] | Train Loss: 0.0127 | Train Acc: 99.59% | Val Loss: 0.0447 | Val Acc: 98.93%
Epoch [11/15] | Train Loss: 0.0095 | Train Acc: 99.65% | Val Loss: 0.0407 | Val Acc: 99.0

In [11]:
def create_submission():
    sample_submission = pd.read_csv("/kaggle/input/digit-recognizer/sample_submission.csv")
    predictions = []

    model.eval()
    with torch.no_grad():
        for images in test_loader:
            images = images.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            predictions.extend(predicted.cpu().numpy())

    sample_submission['Label'] = predictions
    sample_submission.to_csv("submission.csv", index=False)
    print("Submission file created: submission.csv")

create_submission()

Submission file created: submission.csv
