In [27]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from torchvision import datasets, transforms
from tqdm import tqdm
from torch.optim import Adam, SGD
from torch.utils.data import Dataset, DataLoader

In [28]:
df = pd.read_csv("fmnist_small.csv")

In [29]:
# train_test split
x = df.iloc[:, 1:].values
y = df.iloc[:, 0].values

In [30]:
# — convert arrays to (N, C, H, W) (grayscale → C=1)
x = x.reshape(-1, 1, 28, 28).astype(np.float32) / 255.0
y = y.astype(np.int64)

In [31]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [32]:
# datasets and dataloaders

class dataset(Dataset):

    def __init__(self, features, labels):
        # features and labels expected as numpy arrays; convert to tensors here
        self.features = torch.tensor(features, dtype = torch.float32)
        self.labels = torch.tensor(labels, dtype = torch.long)

    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, index):
        # features[index] is already a torch.Tensor because we converted above
        x = self.features[index]
        y = self.labels[index]
        return x, y
    
train_ds = dataset(X_train, y_train)

In [33]:
# Decide tuneable hyperparameters
batch_size = 64
learning_rate = 0.001
num_epochs = 10
kernal_size = 3
num_filters = 32
dropout_rate = 0.5
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

In [None]:
# define model with tuneable hyperparameters
class cnnmodel(nn.Module):
    def __init__(self, kernal_size, num_filters, dropout_rate):
        super(cnnmodel, self).__init__()
        self.kernal_size = kernal_size
        self.num_filters = num_filters

        self.conv1 = nn.Conv2d(1, num_filters, kernel_size=kernal_size)
        self.pool = nn.MaxPool2d(2, 2)

        # compute output spatial size after conv + pool for 28x28 input
        conv_out_size = (28 - kernal_size + 1)  # conv output spatial dim
        pooled_size = conv_out_size // 2       # after 2x2 max pool
        self.flatten_dim = num_filters * pooled_size * pooled_size

        self.fc1 = nn.Linear(self.flatten_dim, 128)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = x.view(-1, self.flatten_dim)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x


In [35]:
model = cnnmodel(kernal_size, num_filters, dropout_rate)
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=learning_rate)

In [36]:
# training loop
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for features, labels in tqdm(train_loader):
        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    epoch_loss = running_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")

100%|██████████| 75/75 [00:01<00:00, 53.70it/s]
100%|██████████| 75/75 [00:01<00:00, 53.70it/s]


Epoch [1/10], Loss: 1.1453


100%|██████████| 75/75 [00:01<00:00, 58.55it/s]
100%|██████████| 75/75 [00:01<00:00, 58.55it/s]


Epoch [2/10], Loss: 0.7035


100%|██████████| 75/75 [00:01<00:00, 58.67it/s]
100%|██████████| 75/75 [00:01<00:00, 58.67it/s]


Epoch [3/10], Loss: 0.6395


100%|██████████| 75/75 [00:01<00:00, 54.59it/s]
100%|██████████| 75/75 [00:01<00:00, 54.59it/s]


Epoch [4/10], Loss: 0.5733


100%|██████████| 75/75 [00:01<00:00, 55.73it/s]
100%|██████████| 75/75 [00:01<00:00, 55.73it/s]


Epoch [5/10], Loss: 0.5128


100%|██████████| 75/75 [00:01<00:00, 55.70it/s]
100%|██████████| 75/75 [00:01<00:00, 55.70it/s]


Epoch [6/10], Loss: 0.4884


100%|██████████| 75/75 [00:01<00:00, 54.04it/s]
100%|██████████| 75/75 [00:01<00:00, 54.04it/s]


Epoch [7/10], Loss: 0.4624


100%|██████████| 75/75 [00:01<00:00, 56.21it/s]
100%|██████████| 75/75 [00:01<00:00, 56.21it/s]


Epoch [8/10], Loss: 0.4249


100%|██████████| 75/75 [00:01<00:00, 55.18it/s]
100%|██████████| 75/75 [00:01<00:00, 55.18it/s]


Epoch [9/10], Loss: 0.4050


100%|██████████| 75/75 [00:01<00:00, 56.57it/s]

Epoch [10/10], Loss: 0.3812





In [None]:
# velidation loop
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for features, labels in DataLoader(dataset(X_test, y_test), batch_size=64):
        outputs = model(features)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f"Validation Accuracy: {accuracy:.2f}%")

Validation Accuracy: 84.58%


In [38]:
# Optuna hyperparameter tuning
# Installs: pip install optuna (run in your environment if not installed)
import optuna
from optuna.trial import TrialState
import time

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Use a small number of epochs for tuning; increase for final training
TUNING_EPOCHS = 3

# Keep references to training/validation data already prepared above
X_tr = X_train
y_tr = y_train
X_val = X_test
y_val = y_test


def objective(trial):
    # suggestions
    kernel_size = trial.suggest_categorical("kernel_size", [3, 5])
    num_filters = trial.suggest_categorical("num_filters", [16, 32, 64])
    dropout_rate = trial.suggest_float("dropout_rate", 0.2, 0.6)
    lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128])

    # create loaders for this trial
    train_ds = dataset(X_tr, y_tr)
    val_ds = dataset(X_val, y_val)
    train_loader_loc = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_loader_loc = DataLoader(val_ds, batch_size=256, shuffle=False)

    # instantiate model for this trial (note: class uses parameter name `kernal_size`)
    model_t = cnnmodel(kernal_size=kernel_size, num_filters=num_filters, dropout_rate=dropout_rate).to(device)
    optimizer_t = Adam(model_t.parameters(), lr=lr)
    criterion_t = nn.CrossEntropyLoss()

    # quick training loop
    for epoch in range(TUNING_EPOCHS):
        model_t.train()
        for features, labels in train_loader_loc:
            features = features.to(device)
            labels = labels.to(device)

            optimizer_t.zero_grad()
            outputs = model_t(features)
            loss = criterion_t(outputs, labels)
            loss.backward()
            optimizer_t.step()

    # validation
    model_t.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for features, labels in val_loader_loc:
            features = features.to(device)
            labels = labels.to(device)
            outputs = model_t(features)
            _, preds = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (preds == labels).sum().item()

    accuracy = correct / total
    # report intermediate result to optuna (useful if pruning enabled)
    trial.report(accuracy, step=0)
    return accuracy

# Create or load study and run optimization
study_name = "cnn_hyperparam_study"
storage_name = "sqlite:///optuna_cnn_study.db"
study = optuna.create_study(direction="maximize", study_name=study_name, storage=storage_name, load_if_exists=True)
print(f"Starting optimization: study name={study_name}, storage={storage_name}")
start = time.time()
study.optimize(objective, n_trials=20, show_progress_bar=True)
end = time.time()
print(f"Optimization finished in {end-start:.1f}s")

# Results summary
print("Number of finished trials:", len(study.trials))
best = study.best_trial
print("Best trial value:", best.value)
print("Best params:")
for k, v in best.params.items():
    print(f"  {k}: {v}")

# Optional: print top 5 trials
trials_df = study.trials_dataframe()
print(trials_df.sort_values(by="value", ascending=False).head())

# Save study explicitly (already stored in SQLite when using storage argument).
print(f"Study saved to {storage_name}")

# Visualization instructions
print("To visualize results (in notebook):\n  import optuna\n  import optuna.visualization as vis\n  vis.plot_optimization_history(study)\n  vis.plot_param_importances(study)")


  from .autonotebook import tqdm as notebook_tqdm
[I 2025-12-06 19:24:19,511] A new study created in RDB with name: cnn_hyperparam_study
[I 2025-12-06 19:24:19,511] A new study created in RDB with name: cnn_hyperparam_study


Starting optimization: study name=cnn_hyperparam_study, storage=sqlite:///optuna_cnn_study.db


  0%|          | 0/20 [00:00<?, ?it/s]



[W 2025-12-06 19:24:19,662] Trial 0 failed with parameters: {'kernel_size': 3, 'num_filters': 64, 'dropout_rate': 0.381732899455972, 'lr': 0.0034870692570935957, 'batch_size': 64} because of the following error: RuntimeError('mat1 and mat2 shapes cannot be multiplied (128x5408 and 10816x128)').
Traceback (most recent call last):
  File "/home/haider/code/pytorch-notes/.venv/lib/python3.12/site-packages/optuna/study/_optimize.py", line 205, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_3751/1678711674.py", line 46, in objective
    outputs = model_t(features)
              ^^^^^^^^^^^^^^^^^
  File "/home/haider/code/pytorch-notes/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/haider/code/pytorch-notes/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _

RuntimeError: mat1 and mat2 shapes cannot be multiplied (128x5408 and 10816x128)