<a href="https://colab.research.google.com/github/dikshant182004/Kaggle-Notebooks/blob/main/Hyperparameter_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split

In [2]:
from tensorflow.keras.datasets import fashion_mnist

(df_train, y_train), (df_test, y_test) = fashion_mnist.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
[1m29515/29515[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
[1m26421880/26421880[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
[1m5148/5148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz
[1m4422102/4422102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step


In [3]:
df_train = df_train.reshape(-1, 28*28)  # (60000, 784)
df_test = df_test.reshape(-1, 784)      # (10000, 784)


In [4]:
X_train, X_test, y_train1, y_test1 = train_test_split(df_train, y_train, test_size=0.2, random_state=42)

In [5]:
X_train = X_train/255.0
X_test = X_test/255.0

In [37]:
class CustomDataset(Dataset):

  def __init__(self, features, labels):

    # Convert to PyTorch tensors
    self.features = torch.tensor(features, dtype=torch.float32)
    self.labels = torch.tensor(labels, dtype=torch.long)

  def __len__(self):
    return len(self.features)

  def __getitem__(self, index):
    return self.features[index], self.labels[index]

In [38]:
train_dataset = CustomDataset(X_train, y_train1)

In [39]:
test_dataset = CustomDataset(X_test, y_test1)

In [32]:
len(test_dataset)

12000

In [19]:
len(train_dataset)

48000

In [29]:
train_dataset[11007]

(tensor([0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.2824,
         0.5333, 0.0824, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0157, 0.0000, 0.1608,
         0.6706, 0.7098, 0.7176, 0.8275, 0.7961, 0.7255, 0.6824, 0.5765, 0.4275,
         0.2431, 0.1373, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0196, 0.0000,
         0.3020, 0.5922, 0.6157, 0.6980, 0.7647, 0.7608, 0.7843, 0.8000, 0.8078,
         0.8000, 0.7843, 0.6

In [26]:
# Check for GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [11]:
class MyNN(nn.Module):
  def __init__(self, input_dim, output_dim, num_hidden_layers, neuron_per_layer, dropout_rate):

    super().__init__()
    layers=[]
    for i in range(num_hidden_layers):

      layers.append(nn.Linear(input_dim,neuron_per_layer))
      layers.append(nn.BatchNorm1d(neuron_per_layer))
      layers.append(nn.ReLU())
      layers.append(nn.Dropout(dropout_rate))

      input_dim = neuron_per_layer

    layers.append(nn.Linear(neuron_per_layer,output_dim))

    self.model = nn.Sequential(*layers)

  def forward(self,X):
    return self.model(X)

In [22]:
# lets create the objective funtion for the hyperparameter tuning using the optuna
def objective(trial):

  num_hidden_layer= trial.suggest_int("num_hidden_layer",1,5)
  neuron_per_layer = trial.suggest_int("neuron_per_layer",8,128,step=8)
  epochs= trial.suggest_int("epochs",10,50,step=10)
  learning_rate= trial.suggest_float("lr",1e-5,1e-1,log=True)
  dropout_rate = trial.suggest_float("dropout_rate", 0.1, 0.5, step=0.1)
  batch_size = trial.suggest_categorical("batch_size", [16, 32, 64, 128])
  optimizer_name = trial.suggest_categorical("optimizer", ['Adam', 'SGD', 'RMSprop'])
  weight = trial.suggest_float("weight_decay", 1e-5, 1e-3, log=True)

  model=MyNN(784,10,num_hidden_layer,neuron_per_layer,dropout_rate)
  model.to(device)

  # optimizer selection
  criteria = nn.CrossEntropyLoss()
  train_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle= True,pin_memory=True)
  test_loader = DataLoader(test_dataset, batch_size = batch_size,shuffle= True,pin_memory=True)

  if optimizer_name == 'Adam':
    optimizer= optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight)
  elif optimizer_name == 'SGD':
    optimizer =optim.SGD(model.parameters(), lr=learning_rate, weight_decay=weight)
  else:
    optimizer = optim.RMSprop(model.parameters(), lr=learning_rate, weight_decay=weight)
 # training loop

  for epoch in range(epochs):
    model.train()
    for batch_features, batch_labels in train_loader:

      # move data to gpu
      batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)

      # forward pass
      outputs = model(batch_features)

      # calculate loss
      loss = criteria(outputs, batch_labels)

      # back pass
      optimizer.zero_grad()
      loss.backward()

      # update grads
      optimizer.step()

  # evaluation
  model.eval()
  # evaluation on test data
  total = 0
  correct = 0

  with torch.no_grad():

    for batch_features, batch_labels in test_loader:

      # move data to gpu
      batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)

      outputs = model(batch_features)

      _, predicted = torch.max(outputs, 1)

      total = total + batch_labels.shape[0]

      correct = correct + (predicted == batch_labels).sum().item()

    accuracy = correct/total

  return accuracy



In [14]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.9.0 optuna-4.5.0


In [23]:
import optuna
study = optuna.create_study(direction='maximize')  # here the sampler is by default set for bayesian optimization

[I 2025-09-18 12:36:46,608] A new study created in memory with name: no-name-fa1b8f7a-52ee-4067-9e50-b7e036dea325


In [25]:
torch.manual_seed(23)

<torch._C.Generator at 0x7c443db87d50>

In [40]:
study.optimize(objective,n_trials=10)

[I 2025-09-18 12:50:18,623] Trial 2 finished with value: 0.5390833333333334 and parameters: {'num_hidden_layer': 5, 'neuron_per_layer': 40, 'epochs': 20, 'lr': 3.161655699551644e-05, 'dropout_rate': 0.5, 'batch_size': 32, 'optimizer': 'RMSprop', 'weight_decay': 1.1697841327356769e-05}. Best is trial 2 with value: 0.5390833333333334.
[I 2025-09-18 12:52:00,078] Trial 3 finished with value: 0.8671666666666666 and parameters: {'num_hidden_layer': 1, 'neuron_per_layer': 96, 'epochs': 20, 'lr': 1.0860587288364432e-05, 'dropout_rate': 0.1, 'batch_size': 16, 'optimizer': 'RMSprop', 'weight_decay': 0.00013149070336756935}. Best is trial 3 with value: 0.8671666666666666.
[I 2025-09-18 12:53:23,310] Trial 4 finished with value: 0.6648333333333334 and parameters: {'num_hidden_layer': 4, 'neuron_per_layer': 120, 'epochs': 10, 'lr': 9.455947068450502e-05, 'dropout_rate': 0.4, 'batch_size': 16, 'optimizer': 'SGD', 'weight_decay': 2.8040376050576257e-05}. Best is trial 3 with value: 0.867166666666666

In [41]:
study.best_params

{'num_hidden_layer': 4,
 'neuron_per_layer': 128,
 'epochs': 30,
 'lr': 2.0664381549521216e-05,
 'dropout_rate': 0.1,
 'batch_size': 128,
 'optimizer': 'Adam',
 'weight_decay': 2.569929646535556e-05}

In [42]:
study.best_value

0.885

lets have some vizualizationn in which optuna is very good at

In [43]:
import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances, plot_parallel_coordinate, plot_slice


In [44]:
fig = plot_optimization_history(study)
fig.show()


In [45]:
fig = plot_param_importances(study)
fig.show()


In [46]:
fig = plot_parallel_coordinate(study)
fig.show()


In [47]:
fig = plot_slice(study)
fig.show()


these vizualizations tell what the bayesian optimization have done

In [50]:
study.trials_dataframe().head(1)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_batch_size,params_dropout_rate,params_epochs,params_lr,params_neuron_per_layer,params_num_hidden_layer,params_optimizer,params_weight_decay,state
0,0,,2025-09-18 12:36:49.228057,2025-09-18 12:37:06.993906,0 days 00:00:17.765849,128,0.2,30,6.5e-05,96,3,Adam,0.000986,FAIL


Making a CNN model to get better accuracy on the image data

In [51]:
# we will use the old data variables and new dataset class
class Custom1Dataset(Dataset):

  def __init__(self,features, labels):

    # here we are reshaping becz it accepts the images
    self.features= torch.tensor(features, dtype=torch.float32).reshape(-1,1,28,28)
    self.labels = torch.tensor(labels,dtype = torch.long)

  def __len__(self):
    return len(self.features)

  def __getitem__(self,index):
    return self.features[index], self.labels[index]


In [52]:
train_dataset1=Custom1Dataset(X_train,y_train1)
test_dataset1=Custom1Dataset(X_test,y_test1)

In [53]:
train_loader1 = DataLoader(train_dataset1, batch_size=32, shuffle=True, pin_memory=True)
test_loader1 = DataLoader(test_dataset1, batch_size=32, shuffle=False, pin_memory=True)

In [55]:
class MyNN1(nn.Module):
  def __init__(self,input_features):
    super().__init__()

    self.features = nn.Sequential(
        # input features will be 1 becz all images are grayscale
        nn.Conv2d(input_features, 32, kernel_size=3, padding='same'),
        nn.ReLU(),
        nn.BatchNorm2d(32),
        nn.MaxPool2d(kernel_size=2, stride=2),

        nn.Conv2d(32, 64, kernel_size=3, padding='same'),
        nn.ReLU(),
        nn.BatchNorm2d(64),
        nn.MaxPool2d(kernel_size=2, stride=2)
    )
    self.classifier = nn.Sequential(
        nn.Flatten(),
        nn.Linear(64*7*7,128),
        nn.ReLU(),
        nn.Dropout(p=0.3),

        nn.Linear(128,64),
        nn.ReLU(),
        nn.Dropout(p=0.3),

        nn.Linear(64,10)
    )
  def forward(self,x):
    x= self.features(x)
    x= self.classifier(x)

    return x

In [56]:
model = MyNN1(1)

model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

In [57]:
# training loop

for epoch in range(100):

  total_epoch_loss = 0

  for batch_features, batch_labels in train_loader1:

    # move data to gpu
    batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)

    # forward pass
    outputs = model(batch_features)

    # calculate loss
    loss = criterion(outputs, batch_labels)

    # back pass
    optimizer.zero_grad()
    loss.backward()

    # update grads
    optimizer.step()

    total_epoch_loss = total_epoch_loss + loss.item()

  avg_loss = total_epoch_loss/len(train_loader1)
  print(f'Epoch: {epoch + 1} , Loss: {avg_loss}')


Epoch: 1 , Loss: 0.5935467355797688
Epoch: 2 , Loss: 0.33909142171343165
Epoch: 3 , Loss: 0.2867447137112419
Epoch: 4 , Loss: 0.25689152539893984
Epoch: 5 , Loss: 0.23333648443656663
Epoch: 6 , Loss: 0.21301973814268907
Epoch: 7 , Loss: 0.19564378340604405
Epoch: 8 , Loss: 0.18249855184058347
Epoch: 9 , Loss: 0.1683897037593027
Epoch: 10 , Loss: 0.15513307672490675
Epoch: 11 , Loss: 0.14672104610626896
Epoch: 12 , Loss: 0.13466630857965597
Epoch: 13 , Loss: 0.12539533761919786
Epoch: 14 , Loss: 0.11605499756491433
Epoch: 15 , Loss: 0.10803626714662339
Epoch: 16 , Loss: 0.10478724659327418
Epoch: 17 , Loss: 0.0931312717066612
Epoch: 18 , Loss: 0.0893853658461012
Epoch: 19 , Loss: 0.08224685460907252
Epoch: 20 , Loss: 0.07862021414764846
Epoch: 21 , Loss: 0.07381244138542874
Epoch: 22 , Loss: 0.07002224295648436
Epoch: 23 , Loss: 0.06269291441258974
Epoch: 24 , Loss: 0.060207639704050964
Epoch: 25 , Loss: 0.05943135926728913
Epoch: 26 , Loss: 0.0525837372475847
Epoch: 27 , Loss: 0.048299

In [58]:
model.eval()

MyNN1(
  (features): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=same)
    (1): ReLU()
    (2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=same)
    (5): ReLU()
    (6): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (classifier): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=3136, out_features=128, bias=True)
    (2): ReLU()
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=128, out_features=64, bias=True)
    (5): ReLU()
    (6): Dropout(p=0.3, inplace=False)
    (7): Linear(in_features=64, out_features=10, bias=True)
  )
)

In [60]:
# evaluation on test data
total = 0
correct = 0

with torch.no_grad():

  for batch_features, batch_labels in test_loader1:

    # move data to gpu
    batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)

    outputs = model(batch_features)

    _, predicted = torch.max(outputs, 1)

    total = total + batch_labels.shape[0]

    correct = correct + (predicted == batch_labels).sum().item()

print(correct/total)

0.9274166666666667


In [61]:
# evaluation on training data
total = 0
correct = 0

with torch.no_grad():

  for batch_features, batch_labels in train_loader1:

    # move data to gpu
    batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)

    outputs = model(batch_features)

    _, predicted = torch.max(outputs, 1)

    total = total + batch_labels.shape[0]

    correct = correct + (predicted == batch_labels).sum().item()

print(correct/total)

0.9995833333333334


Now lets hypertune the cnn model parameters and use the data augmentation conept too

*********************************************

***


In [41]:
from torchvision import transforms

# using Data Augmentation for the training dataset
train_transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.RandomRotation(10),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomAffine(0, translate=(0.1,0.1)),# distortion
    transforms.ToTensor()  # converting to tensor
])

test_transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.ToTensor() # converting to tensor
])

In [42]:
class CustomDataset2(Dataset):
  def __init__(self,features, labels, transform=None):
    self.features = torch.tensor(features, dtype = torch.float32).reshape(-1,1,28,28)
    self.labels = torch.tensor(labels,dtype = torch.long)
    self.transform= transform

  def __getitem__(self,index):
    # we extract one (1, 28, 28) image and its label.
    # squeeze(0) removes the channel dimension → becomes (28, 28).
    #   Then you call .numpy() and apply transform becz transformations are done on numpy array
    feature, label = self.features[index], self.labels[index]
    if self.transform:
      feature= self.transform(feature.squeeze(0).numpy())
    return feature, label

  def __len__(self):
    return len(self.features)

In [43]:
train_dataset = CustomDataset2(X_train, y_train1,transform=train_transform)
test_dataset = CustomDataset2(X_test, y_test1, transform=test_transform)

In [45]:
test_dataset[0]

(tensor([[[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0078, 0.0157, 0.0000, 0.0000, 0.5765, 0.4902, 0.4157,
           0.4431, 0.1059, 0.0000, 0.0039, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000, 0.0000, 0.0039, 0.0039, 0.0000, 0.0000,
           0.0000, 0.0118, 0.0000, 0.1059, 0.7765, 0.2314, 0.0000, 0.0000,
           0.0627, 0.4863, 0.1569, 0.0000, 

In [46]:
# Now here we will be using the dynamic cnn class whose architecture is not
# fixed as it is constructed in runtime which is highly useful in hyperparameter tuning

class DynamicCNN(nn.Module):
  def __init__(self, num_conv, num_filters, kernel_size,num_fc, fc_layer_size, dropout_rate):

    super(DynamicCNN,self).__init__()
    # Inheriting the dynamic cnn class constructer so that it can be changed flexibly

    layers =[]
    in_channels =1 # as images are grayscale

    for _ in range(num_conv):
      layers.append(nn.Conv2d(in_channels,num_filters,kernel_size=kernel_size, padding = 'same'))
      layers.append(nn.ReLU())
      layers.append(nn.BatchNorm2d(num_filters))
      layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
      in_channels = num_filters

    self.features= nn.Sequential(*layers)

    # fc layers
    fc_layers = [nn.Flatten()]
    input_size = num_filters *(28 // (2** num_conv))**2
    for _ in range(num_fc):
      fc_layers.append(nn.Linear(input_size, fc_layer_size))
      fc_layers.append(nn.ReLU())
      fc_layers.append(nn.Dropout(dropout_rate))
      input_size = fc_layer_size

    self.classifier = nn.Sequential(*fc_layers)

  def forward(self,x):
    x= self.features(x)
    x= self.classifier(x)
    return x


In [47]:
def objective(trial):
    num_conv_layers = trial.suggest_int('num_conv_layers', 1, 3)
    num_filters = trial.suggest_categorical('num_filters', [16, 32, 64, 128])
    kernel_size = trial.suggest_categorical('kernel_size', [3, 5])
    num_fc_layers = trial.suggest_int('num_fc_layers', 1, 3)
    fc_layer_size = trial.suggest_categorical('fc_layer_size', [64, 128, 256])
    dropout_rate = trial.suggest_uniform('dropout_rate', 0.2, 0.5)
    weight_decay = trial.suggest_loguniform('weight_decay', 1e-5, 1e-2)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-2)
    optimizer_name = trial.suggest_categorical('optimizer', ['SGD', 'Adam', 'RMSprop'])
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128])
    num_epochs = trial.suggest_int('num_epochs', 10, 30)

    # Model
    model = DynamicCNN(num_conv_layers, num_filters, kernel_size, num_fc_layers, fc_layer_size, dropout_rate).to(device)

    # Data
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Optimizer
    if optimizer_name == 'SGD':
        optimizer = optim.SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    elif optimizer_name == 'Adam':
        optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    elif optimizer_name == 'RMSprop':
        optimizer = optim.RMSprop(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # Loss function
    criterion = nn.CrossEntropyLoss()

    # Training loop
    for epoch in range(num_epochs):
        model.train()
        for batch_features, batch_labels in train_loader:
            batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)
            optimizer.zero_grad()
            outputs = model(batch_features)
            loss = criterion(outputs, batch_labels)
            loss.backward()
            optimizer.step()

    # Validation loop
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_features, batch_labels in test_loader:
            batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)
            outputs = model(batch_features)
            _, predicted = torch.max(outputs, 1)
            total += batch_labels.size(0)
            correct += (predicted == batch_labels).sum().item()

    accuracy = correct / total
    return accuracy

In [48]:
# we will also add the early stopping technique here using pruner
import optuna
pruner = optuna.pruners.MedianPruner()
study= optuna.create_study(direction='maximize',pruner= pruner)
study.optimize(objective,n_trials=50)

[I 2025-09-19 08:18:43,431] A new study created in memory with name: no-name-08bf1313-1cb0-43b0-8db0-28dad3a002c8
  dropout_rate = trial.suggest_uniform('dropout_rate', 0.2, 0.5)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-5, 1e-2)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-2)
[I 2025-09-19 08:23:23,385] Trial 0 finished with value: 0.8500833333333333 and parameters: {'num_conv_layers': 2, 'num_filters': 16, 'kernel_size': 3, 'num_fc_layers': 1, 'fc_layer_size': 128, 'dropout_rate': 0.4053246738803355, 'weight_decay': 1.8007192221312434e-05, 'learning_rate': 0.00010071978201500468, 'optimizer': 'Adam', 'batch_size': 128, 'num_epochs': 16}. Best is trial 0 with value: 0.8500833333333333.
[I 2025-09-19 08:31:12,818] Trial 1 finished with value: 0.8821666666666667 and parameters: {'num_conv_layers': 3, 'num_filters': 32, 'kernel_size': 3, 'num_fc_layers': 3, 'fc_layer_size': 64, 'dropout_rate': 0.21879065621742474, 'weight_decay': 0.0004035211030

KeyboardInterrupt: 

In [24]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/400.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.9.0 optuna-4.5.0


In [49]:
study.best_params  # we can furthur increase the training time on a gpu provided machine

{'num_conv_layers': 3,
 'num_filters': 128,
 'kernel_size': 5,
 'num_fc_layers': 2,
 'fc_layer_size': 64,
 'dropout_rate': 0.2756873865086421,
 'weight_decay': 0.005301246207601579,
 'learning_rate': 0.00010333627635943423,
 'optimizer': 'RMSprop',
 'batch_size': 32,
 'num_epochs': 29}

In [50]:
study.best_value

0.91775

Now lets use the pretrained models like VGG16 for our fashion mnest data in pytorch

In [51]:
# lets make  a custom transform before passing data to the pretrained model

from torchvision.transforms import transforms

# its from the VGG16 documentation of what kind of image data is accepts
custom_transform= transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485,0.456,0.406],std=[0.229,0.224,0.225])
])

In [52]:
from PIL import Image
import numpy as np

class CustomDataset3(Dataset):

  def __init__(self,features,labels,transform):
    self.features= features
    self.labels= labels
    self.transform = transform

  def __len__(self):
    return len(self.features)

  def __getitem__(self,index):
    image = self.features[index].reshape(28,28)

    image= image.astype(np.uint8)

    # change black&white to color -> (H,W,C) -> (C,H,W)
    image = np.stack([image]*3,axis = -1)

    image = Image.fromarray(image) # into PIL image
    image = self.transform(image)

    return image,torch.tensor(self.labels[index], dtype=torch.long)


In [54]:
train_dataset1 = CustomDataset3(X_train, y_train1, transform=custom_transform)
test_dataset1 = CustomDataset3(X_test, y_test1, transform=custom_transform)

In [55]:
train_loader = DataLoader(train_dataset1, batch_size=32, shuffle=True, pin_memory=True)
test_loader = DataLoader(test_dataset1, batch_size=32, shuffle=False, pin_memory=True)

In [56]:
import torchvision.models as models

vgg16 = models.vgg16(pretrained=True)



Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /root/.cache/torch/hub/checkpoints/vgg16-397923af.pth


100%|██████████| 528M/528M [00:07<00:00, 73.9MB/s]


In [57]:
vgg16

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

In [58]:
for param in vgg16.features.parameters():
  param.requires_grad=False

In [59]:
vgg16.classifier = nn.Sequential(
    nn.Linear(25088, 1024),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(1024, 512),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(512, 10)
)

In [60]:
vgg16 = vgg16.to(device)

In [61]:
learning_rate = 0.0001
epochs = 10

In [62]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(vgg16.classifier.parameters(), lr=learning_rate)

In [None]:
# training loop

for epoch in range(epochs):

  total_epoch_loss = 0

  for batch_features, batch_labels in train_loader:

    # move data to gpu
    batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)

    # forward pass
    outputs = vgg16(batch_features)

    # calculate loss
    loss = criterion(outputs, batch_labels)

    # back pass
    optimizer.zero_grad()
    loss.backward()

    # update grads
    optimizer.step()

    total_epoch_loss = total_epoch_loss + loss.item()

  avg_loss = total_epoch_loss/len(train_loader)
  print(f'Epoch: {epoch + 1} , Loss: {avg_loss}')


In [1]:
model.eval()

NameError: name 'model' is not defined

In [None]:
# evaluation on test data
total = 0
correct = 0

with torch.no_grad():

  for batch_features, batch_labels in test_loader:

    # move data to gpu
    batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)

    outputs = vgg16(batch_features)

    _, predicted = torch.max(outputs, 1)

    total = total + batch_labels.shape[0]

    correct = correct + (predicted == batch_labels).sum().item()

print(correct/total)

In [None]:
# evaluation on training data
total = 0
correct = 0

with torch.no_grad():

  for batch_features, batch_labels in train_loader:

    # move data to gpu
    batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)

    outputs = model(batch_features)

    _, predicted = torch.max(outputs, 1)

    total = total + batch_labels.shape[0]

    correct = correct + (predicted == batch_labels).sum().item()

print(correct/total)

gpu credits are over may run it after some time