<a href="https://colab.research.google.com/github/fazal735/DL_A2/blob/main/DL_A2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
from torchvision.transforms import transforms
import torchvision
from torch.utils.data import DataLoader,SubsetRandomSampler
from sklearn.model_selection import train_test_split

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used by this notebook
# live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Image folders on the mounted Drive.
# `train_data_dir` is split into train/validation subsets by train_data();
# the dataset's `val` folder is what this notebook treats as its test set.
train_data_dir = '/content/drive/MyDrive/inaturalist_12K/train'
test_data_dir = '/content/drive/MyDrive/inaturalist_12K/val'

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [None]:

# train data loading
def train_data(train_data_dir, data_augmentation, batch_size=32, val_fraction=0.2, seed=42):
  """Build the training and validation DataLoaders from one image folder.

  The folder is read with ``torchvision.datasets.ImageFolder`` and its sample
  indices are split once with ``train_test_split``.

  Args:
    train_data_dir: Root directory in ImageFolder layout (one sub-folder per class).
    data_augmentation: ``'Yes'`` enables random crop / flip / colour jitter /
      rotation for the *training* subset; any other value uses the plain
      resize pipeline.
    batch_size: Batch size for both loaders.
    val_fraction: Fraction of the samples held out for validation.
    seed: ``random_state`` passed to ``train_test_split`` so the split is reproducible.

  Returns:
    ``(train_loader, validation_loader)``.
  """
  normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

  # Deterministic pipeline: always used for validation, and for training when
  # augmentation is switched off.
  plain_transform = transforms.Compose([
      transforms.Resize((224, 224)),
      transforms.ToTensor(),
      normalize,
  ])

  if data_augmentation == 'Yes':
    train_transform = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
        transforms.RandomRotation(30),
        transforms.ToTensor(),
        normalize,
    ])
  else:
    train_transform = plain_transform

  train_set = torchvision.datasets.ImageFolder(train_data_dir, transform=train_transform)

  # Validation must never see random augmentation, otherwise the reported
  # accuracy is measured on distorted images. A second ImageFolder view over the
  # same directory is only built when the training transform actually differs.
  # NOTE(review): this relies on ImageFolder enumerating files in the same order
  # on both scans so that indices refer to the same images — confirm for the
  # installed torchvision version.
  if train_transform is plain_transform:
    val_set = train_set
  else:
    val_set = torchvision.datasets.ImageFolder(train_data_dir, transform=plain_transform)

  train_index, val_index = train_test_split(
      list(range(len(train_set))), test_size=val_fraction, random_state=seed)

  train_loader = DataLoader(train_set, batch_size=batch_size,
                            sampler=SubsetRandomSampler(train_index))
  validation_loader = DataLoader(val_set, batch_size=batch_size,
                                 sampler=SubsetRandomSampler(val_index))

  # len() of a DataLoader is its number of batches, not its number of samples.
  print('Train data size:', len(train_loader))
  print('Validation data size:', len(validation_loader))

  return train_loader, validation_loader

#test data loading
def test_data(test_data_dir,data_augmentation):
  """Return a DataLoader over the held-out image folder.

  Images are resized to 224x224, converted to tensors and normalised; no
  random augmentation is applied. ``data_augmentation`` is accepted but not
  used by this function.
  """
  pipeline = transforms.Compose([
      transforms.Resize((224, 224)),
      transforms.ToTensor(),
      transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
  ])

  # Read the folder (ImageFolder layout) and batch it without a sampler.
  image_folder = torchvision.datasets.ImageFolder(test_data_dir, transform=pipeline)
  return DataLoader(image_folder, batch_size=32)

## Model definition

In [None]:

class model(nn.Module):
  """Five-block convolutional image classifier.

  Every block runs Conv2d -> activation -> MaxPool2d(stride=2) -> Dropout2d ->
  BatchNorm2d, in that order. The head is flatten -> Linear -> BatchNorm1d ->
  activation -> Dropout -> Linear.

  Args:
    num_filters: Output channels of the five conv layers (first five entries are used).
    filter_size: Square kernel size of each conv layer (first five entries are used).
    activation: Activation module applied after every conv layer and after the
      first fully connected layer.
    stride: Conv stride (int), shared by all conv layers.
    padding: Conv padding (int), shared by all conv layers.
    pool_size: Max-pool kernel, an int or an ``(h, w)`` pair; pooling stride is 2.
    fc_size: Width of the hidden fully connected layer.
    nom_o_classes: Number of output classes.
    dropout: Dropout probability used in every dropout layer.
    in_channels: Channels of the input image.
    batch_norm: ``'Yes'``/``'No'`` (case-insensitive; ``True``/``False`` also
      work). When disabled every batch-norm layer is replaced by ``nn.Identity``.
    input_size: Spatial size of the input, an int or an ``(h, w)`` pair. The
      fully connected layer is sized for exactly this input.

  Raises:
    ValueError: If the kernels/pooling shrink the feature map below 1x1.
  """

  NUM_CONV_BLOCKS = 5

  def __init__(self,num_filters=[32,64,128,256,512],filter_size=[3,3,5,5,7],activation=nn.ReLU(),
               stride=1, padding=1, pool_size=(2,2),fc_size=512,nom_o_classes=10,
               dropout=0,in_channels=3,batch_norm='YES',input_size=224):
    super(model,self).__init__()
    self.num_filters=num_filters
    self.filter_size=filter_size
    self.activation=activation
    self.stride=stride
    self.padding=padding
    self.pool_size=pool_size
    self.fc_size=fc_size
    self.nom_o_classes=nom_o_classes
    self.dropout=dropout
    self.channels=in_channels
    # The flag used to be accepted and then ignored; it now really toggles batch norm.
    self.use_batch_norm = str(batch_norm).strip().lower() in ('yes', 'true', '1')

    def conv(in_ch, idx):
      return nn.Conv2d(in_ch, self.num_filters[idx], kernel_size=self.filter_size[idx],
                       stride=self.stride, padding=self.padding)

    def norm2d(channels):
      return nn.BatchNorm2d(channels) if self.use_batch_norm else nn.Identity()

    # Convolutional blocks. Attribute names and creation order are kept so that
    # existing checkpoints and seeded initialisations still line up.
    self.conv_layer1 = conv(self.channels, 0)
    self.batch_norm1 = norm2d(self.num_filters[0])
    self.dropout1 = nn.Dropout2d(self.dropout)

    self.conv_layer2 = conv(self.num_filters[0], 1)
    self.batch_norm2 = norm2d(self.num_filters[1])
    self.dropout2 = nn.Dropout2d(self.dropout)

    self.conv_layer3 = conv(self.num_filters[1], 2)
    self.batch_norm3 = norm2d(self.num_filters[2])
    self.dropout3 = nn.Dropout2d(self.dropout)

    self.conv_layer4 = conv(self.num_filters[2], 3)
    self.batch_norm4 = norm2d(self.num_filters[3])
    self.dropout4 = nn.Dropout2d(self.dropout)

    self.conv_layer5 = conv(self.num_filters[3], 4)
    self.batch_norm5 = norm2d(self.num_filters[4])
    self.dropout5 = nn.Dropout2d(self.dropout)

    self.pool=nn.MaxPool2d(self.pool_size,stride=2)

    # Element-wise dropout for the (N, fc_size) activations. Dropout1d would
    # interpret a 2-D tensor as unbatched (C, L) and blank out whole samples.
    self.dropout_layer = nn.Dropout(self.dropout)

    # Track the feature-map size with the exact integer (floor) arithmetic that
    # Conv2d and MaxPool2d use, for any stride and pooling kernel.
    height, width = self._pair(input_size)
    pool_h, pool_w = self._pair(self.pool_size)
    for idx in range(self.NUM_CONV_BLOCKS):
      kernel = self.filter_size[idx]
      height = self._out_len(self._out_len(height, kernel, self.padding, self.stride), pool_h, 0, 2)
      width = self._out_len(self._out_len(width, kernel, self.padding, self.stride), pool_w, 0, 2)
      if height < 1 or width < 1:
        raise ValueError(
            f'feature map vanishes after conv block {idx + 1}; '
            'use smaller kernels, more padding or a larger input_size')

    # Fully connected head
    self.fc = nn.Linear(self.num_filters[4] * height * width, self.fc_size)
    self.fc_bn = nn.BatchNorm1d(self.fc_size) if self.use_batch_norm else nn.Identity()

    # Output layer
    self.output_layer = nn.Linear(self.fc_size, self.nom_o_classes)

  @staticmethod
  def _pair(value):
    """Return ``value`` as an ``(h, w)`` tuple, duplicating a scalar."""
    if isinstance(value, (tuple, list)):
      return tuple(value)
    return (value, value)

  @staticmethod
  def _out_len(length, kernel, padding, stride):
    """Output length of a conv/pool window: floor((L + 2p - k) / s) + 1."""
    return (length + 2 * padding - kernel) // stride + 1

  def forward(self,x):
    """Map a batch of images (N, in_channels, H, W) to class logits (N, nom_o_classes)."""
    blocks = (
        (self.conv_layer1, self.dropout1, self.batch_norm1),
        (self.conv_layer2, self.dropout2, self.batch_norm2),
        (self.conv_layer3, self.dropout3, self.batch_norm3),
        (self.conv_layer4, self.dropout4, self.batch_norm4),
        (self.conv_layer5, self.dropout5, self.batch_norm5),
    )
    for conv, drop, norm in blocks:
      x = conv(x)
      x = self.activation(x)
      x = self.pool(x)
      x = drop(x)
      x = norm(x)

    x = torch.flatten(x, 1)
    x = self.fc(x)
    x = self.fc_bn(x)
    x = self.activation(x)
    x = self.dropout_layer(x)
    x = self.output_layer(x)

    return x

# Build the network with its default hyper-parameters, place it on the selected
# device, and show its layer summary.
model1 = model().to(device)
print(model1)

model(
  (activation): ReLU()
  (conv_layer1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (batch_norm1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout1): Dropout2d(p=0, inplace=False)
  (conv_layer2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (batch_norm2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout2): Dropout2d(p=0, inplace=False)
  (conv_layer3): Conv2d(64, 128, kernel_size=(5, 5), stride=(1, 1), padding=(1, 1))
  (batch_norm3): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout3): Dropout2d(p=0, inplace=False)
  (conv_layer4): Conv2d(128, 256, kernel_size=(5, 5), stride=(1, 1), padding=(1, 1))
  (batch_norm4): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout4): Dropout2d(p=0, inplace=False)
  (conv_layer5): Conv2d(256, 512, kernel_size=(7, 7), stride=(1, 1), p

In [None]:
# Module-level training settings.
epochs=100
learning_rate=0.1

# Cross-entropy criterion bound to the name `loss`.
loss=nn.CrossEntropyLoss()
# This optimizer is constructed over model1.parameters() only, so stepping it
# cannot update the weights of any other model instance.
optimizer=torch.optim.SGD(model1.parameters(),lr=learning_rate)

## Model training function (earlier version, commented out)

In [None]:
# loss_metric=nn.CrossEntropyLoss()
# optimizer=torch.optim.SGD(model1.parameters(),lr=learning_rate)

# def training(model1,data):

#   model1.train(True)
#   training_loss=0.0
#   true_label=0
#   total_train=0

#   for input, label in data:
#     input = input.to(device)
#     label = label.to(device)
#     optimizer.zero_grad()

#     output=model1(input)
#     loss=loss_metric(output,label)
#     loss.backward()
#     optimizer.step()
#     training_loss += loss.item()
#     _,predicted=torch.max(output.data,1)
#     total_train += label.size(0)
#     true_label += (predicted==label).sum().item()

#   train_accuracy=100*true_label/total_train
#   return train_accuracy,training_loss,model1


## Model training function (active version); the testing function (on validation data) follows in the next cell

In [None]:
# Module-level criterion and optimizer.
loss_metric=nn.CrossEntropyLoss()
# Built over model1.parameters() only: stepping this optimizer cannot change the
# weights of any other model instance.
optimizer=torch.optim.SGD(model1.parameters(),lr=learning_rate)

def training(model1, data, epoch=0, optimizer=None, criterion=None):
    """Train ``model1`` for one pass over ``data``.

    Args:
        model1: The ``nn.Module`` to optimise.
        data: Sized iterable (e.g. a DataLoader) yielding ``(inputs, labels)`` batches.
        epoch: Epoch number shown in the progress lines.
        optimizer: Optimizer that owns ``model1``'s parameters. When omitted, a
            plain ``SGD`` over ``model1.parameters()`` with the module-level
            ``learning_rate`` is used. Plain SGD keeps no state between steps,
            so building it per call is equivalent to reusing one instance.
        criterion: Loss module; defaults to ``nn.CrossEntropyLoss()``.

    Returns:
        ``(train_accuracy, avg_loss, model1)``: accuracy in percent, the mean
        per-batch loss, and the same model object.

    Raises:
        ValueError: If ``data`` yields no samples.
    """
    if criterion is None:
        criterion = nn.CrossEntropyLoss()
    if optimizer is None:
        # Always optimise the model that was passed in. A shared module-level
        # optimizer bound to a different model would silently leave this
        # model's weights untouched.
        optimizer = torch.optim.SGD(model1.parameters(), lr=learning_rate)

    # Send batches to wherever the model's weights live.
    first_param = next(model1.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    model1.train(True)
    training_loss = 0.0
    true_label = 0
    total_train = 0

    total_batches = len(data)

    for batch_idx, (input, label) in enumerate(data):
        input = input.to(target_device)
        label = label.to(target_device)
        optimizer.zero_grad()

        output = model1(input)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()

        training_loss += loss.item()
        _, predicted = torch.max(output.data, 1)
        total_train += label.size(0)
        true_label += (predicted == label).sum().item()

        # Print progress every 10 batches and on the final batch
        if (batch_idx + 1) % 10 == 0 or (batch_idx + 1) == total_batches:
            print(f'Epoch: {epoch} [{batch_idx+1}/{total_batches}] Loss: {loss.item():.4f}')

    if total_train == 0 or total_batches == 0:
        raise ValueError('training data yielded no samples')

    train_accuracy = 100 * true_label / total_train
    avg_loss = training_loss / total_batches

    print(f"Training completed - Accuracy: {train_accuracy:.2f}%, Avg Loss: {avg_loss:.4f}")

    # Return the average that was just reported rather than the running sum.
    return train_accuracy, avg_loss, model1

In [None]:
def test_on_valid_data(model, test_data):
    """Evaluate ``model`` on ``test_data`` and return its accuracy in percent.

    The model is switched to eval mode and gradients are disabled while the
    batches are scored. The accuracy is also printed.
    """
    model.eval()

    hits, seen = 0, 0
    with torch.no_grad():
        for images, targets in test_data:
            images = images.to(device)
            targets = targets.to(device)

            # Index of the largest logit per sample is the predicted class.
            predictions = torch.max(model(images), 1).indices
            hits += (predictions == targets).sum().item()
            seen += targets.size(0)

    valid_accuracy = 100 * hits / seen
    print(f'Validation Accuracy: {valid_accuracy:.2f}%')
    return valid_accuracy


In [None]:
def model_train_val(model, train_data, val_data,epochs,device=device):
    """Train for ``epochs`` epochs, validating and logging to W&B after each one.

    Training is delegated to ``training`` and validation to
    ``test_on_valid_data``. ``device`` is accepted but not used here.

    NOTE: a later cell defines another function with this same name; once that
    cell has run, it replaces this definition.
    """
    print("model train val")
    for epoch in range(epochs):
        # Forward the 1-based epoch number so the per-batch progress lines
        # agree with the "Epoch N/M" lines printed below.
        train_accuracy, avg_loss, model = training(model, train_data, epoch + 1)
        # Print training loss and accuracy
        print(f'Epoch {epoch+1}/{epochs}, Train Loss: {avg_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%')

        # Validation loop
        val_accuracy = test_on_valid_data(model, val_data)
        # Print validation accuracy
        print(f'Epoch {epoch+1}/{epochs}, Validation Accuracy: {val_accuracy:.2f}%')

        # One log call per epoch: every wandb.log call advances the run's step,
        # so separate calls would scatter one epoch's metrics over several steps.
        wandb.log({
            'Train loss': avg_loss,
            'Train accuracy': train_accuracy,
            'val_accuracy': val_accuracy,
            'epoch': epoch,
        })


In [None]:

import time
import traceback

def _make_optimizer(name, parameters, learning_rate):
    """Return the torch optimizer called ``name`` ('Adam' or 'SGD', case-insensitive)."""
    factories = {'adam': torch.optim.Adam, 'sgd': torch.optim.SGD}
    key = str(name).lower()
    if key not in factories:
        raise ValueError(f"Unsupported optimizer: {name!r}")
    return factories[key](parameters, lr=learning_rate)


def _train_one_epoch(model, loader, criterion, optimizer, device, epoch):
    """Run one optimisation pass over ``loader``; return (accuracy %, mean batch loss)."""
    model.train()
    loss_sum, correct, seen = 0.0, 0, 0
    total_batches = len(loader)

    for batch_idx, (images, labels) in enumerate(loader, start=1):
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()

        logits = model(images)
        batch_loss = criterion(logits, labels)
        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item()
        correct += (torch.max(logits, 1).indices == labels).sum().item()
        seen += labels.size(0)

        # Progress line every 10 batches and on the last batch.
        if batch_idx % 10 == 0 or batch_idx == total_batches:
            print(f'Epoch: {epoch} [{batch_idx}/{total_batches}] Loss: {batch_loss.item():.4f}')

    if seen == 0 or total_batches == 0:
        raise ValueError('training loader yielded no samples')
    return 100 * correct / seen, loss_sum / total_batches


def model_train_val(model, train_data, val_data, epochs, device=device,
                    learning_rate=0.001, optimizer_name='Adam'):
    """Train ``model`` for ``epochs`` epochs, validating and logging each epoch.

    The criterion and optimizer created here are the ones that drive the
    weight updates, and the optimizer is built over the parameters of the
    ``model`` argument.

    Args:
        model: Network to train.
        train_data: Loader of ``(inputs, labels)`` training batches.
        val_data: Loader passed to ``test_on_valid_data`` after each epoch.
        epochs: Number of epochs.
        device: Device the training batches are moved to; the model is
            expected to already be on it.
        learning_rate: Optimizer learning rate.
        optimizer_name: ``'Adam'`` or ``'SGD'``.

    Returns:
        The trained model (same object as ``model``).

    Raises:
        ValueError: If ``optimizer_name`` is not supported.

    An exception raised inside an epoch is printed with its traceback and the
    loop continues with the next epoch, so a failed run still returns normally.
    """
    print("Starting model_train_val function")

    criterion = nn.CrossEntropyLoss()
    optimizer = _make_optimizer(optimizer_name, model.parameters(), learning_rate)

    print(f"Beginning training for {epochs} epochs")
    for epoch in range(epochs):
        print(f"Starting epoch {epoch+1}/{epochs}")

        # Capture start time to monitor duration
        start_time = time.time()

        try:
            print("Calling training function...")
            train_accuracy, avg_loss = _train_one_epoch(
                model, train_data, criterion, optimizer, device, epoch + 1)
            print(f"Training complete for epoch {epoch+1}")

            # Print training metrics
            print(f'Epoch {epoch+1}/{epochs}, Train Loss: {avg_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%')

            print("Starting validation...")
            val_accuracy = test_on_valid_data(model, val_data)
            print(f"Validation complete for epoch {epoch+1}")

            # Print validation metrics
            print(f'Epoch {epoch+1}/{epochs}, Validation Accuracy: {val_accuracy:.2f}%')

            # Calculate epoch duration
            epoch_time = time.time() - start_time
            print(f"Epoch {epoch+1} completed in {epoch_time:.2f} seconds")

            # Logging is best-effort: a W&B failure must not abort training.
            try:
                print("Logging to wandb...")
                wandb.log({
                    'epoch': epoch,
                    'train_loss': avg_loss,
                    'train_accuracy': train_accuracy,
                    'val_accuracy': val_accuracy,
                    'epoch_time': epoch_time
                })
                print("Successfully logged to wandb")
            except Exception as e:
                print(f"Error logging to wandb: {str(e)}")

        except Exception as e:
            print(f"Error during epoch {epoch+1}: {str(e)}")
            traceback.print_exc()  # This will print the full stack trace

    print("Model training and validation completed")
    return model




In [None]:
!pip install wandb



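[REDACTED — this line held what looked like a W&B API key. Revoke that key and never store credentials in the notebook; enter the key at the `wandb.login()` prompt or set the `WANDB_API_KEY` environment variable instead.]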

> Add blockquote



In [None]:
import wandb
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmfazal735[0m ([33mmfazal735-iit-madras-foundation[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:


# Candidate values for every hyper-parameter explored by the sweep.
# (A [7,7,7,7,7] kernel option was considered and is currently disabled.)
_search_space = {
    'kernel_size': [[3,3,3,3,3],[3,5,5,7,7],[3,5,3,5,7],[5,5,5,5,5]],
    'dropout': [0.3, 0.2],
    'activation': ['relu','mish','silu', 'gelu'],
    'num_dense': [128, 256],
    'batch_norm': ['Yes','No'],
    'filter_org': [[128,128,64,64,32],[32,64,128,256,512],[32,32,32,32,32],[32,64,64,128,128]],
    'learning_rate': [0.001,0.0001],
    'optimizer': ['Adam','SGD'],
    'data_aug': ['No', 'Yes'],
}

# Bayesian search that maximises the logged `val_accuracy` metric.
sweep_config = {
    'method': 'bayes',
    'metric': {'name': 'val_accuracy', 'goal': 'maximize'},
    'parameters': {name: {'values': options} for name, options in _search_space.items()},
}

# Register the sweep with W&B and keep its id for the agent.
sweep_id = wandb.sweep(sweep=sweep_config, project='DL A2')

Create sweep with ID: 65pmdmkr
Sweep URL: https://wandb.ai/mfazal735-iit-madras-foundation/DL%20A2/sweeps/65pmdmkr


In [None]:
def main():
    """Run one sweep trial: build a model from ``wandb.config``, train and validate it.

    Raises:
        ValueError: If the sampled activation name is not supported.
    """
    # Activation name -> module class; instantiated per run.
    activations = {'relu': nn.ReLU, 'gelu': nn.GELU, 'silu': nn.SiLU, 'mish': nn.Mish}

    with wandb.init(project='DL A2') as run:
        config = wandb.config

        run_name="ks"+str(config.kernel_size)+"ac-"+(config.activation)+"_drop-"+str(config.dropout)+"_fs-"+str(config.filter_org)+"_bn-"+str(config.batch_norm)+"_dence-"+str(config.num_dense)
        run.name=run_name

        # Fail with a clear message instead of an unbound `activ` further down.
        if config.activation not in activations:
            raise ValueError(f"Unsupported activation: {config.activation!r}")
        activ = activations[config.activation]()

        # Forward the sampled batch_norm setting as well, so the model receives
        # every architecture option the sweep chose.
        model_= model(num_filters=config.filter_org, filter_size=config.kernel_size,
                      activation=activ, stride=1,padding=1, pool_size=(2,2), fc_size=config.num_dense,
                      nom_o_classes=10,dropout = config.dropout,
                      batch_norm=config.batch_norm).to(device)

        train, validation = train_data(train_data_dir,data_augmentation= config.data_aug)

        # NOTE(review): the sweep also samples `learning_rate` and `optimizer`,
        # but this call does not forward them, so they have no effect on the run.
        model_train_val(model=model_, train_data=train, val_data=validation, epochs = 10)

# Launch an agent that executes `main` for 5 sweep trials.
wandb.agent(sweep_id, function= main,count= 5)


[34m[1mwandb[0m: Agent Starting Run: 3z4tehjr with config:
[34m[1mwandb[0m: 	activation: silu
[34m[1mwandb[0m: 	batch_norm: Yes
[34m[1mwandb[0m: 	data_aug: No
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	filter_org: [32, 32, 32, 32, 32]
[34m[1mwandb[0m: 	kernel_size: [3, 5, 5, 7, 7]
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_dense: 128
[34m[1mwandb[0m: 	optimizer: Adam


Train data size: 250
Validation data size: 63
Starting model_train_val function
Beginning training for 10 epochs
Starting epoch 1/10
Calling training function...
Epoch: 0 [10/250] Loss: 2.3019
Epoch: 0 [20/250] Loss: 2.2779
Epoch: 0 [30/250] Loss: 2.4157
Epoch: 0 [40/250] Loss: 2.3473
Epoch: 0 [50/250] Loss: 2.3740
Epoch: 0 [60/250] Loss: 2.3937
Epoch: 0 [70/250] Loss: 2.3173
Epoch: 0 [80/250] Loss: 2.3553
Epoch: 0 [90/250] Loss: 2.2892
Epoch: 0 [100/250] Loss: 2.3549
Epoch: 0 [110/250] Loss: 2.3009
Epoch: 0 [120/250] Loss: 2.3221
Epoch: 0 [130/250] Loss: 2.3941
Epoch: 0 [140/250] Loss: 2.3710
Epoch: 0 [150/250] Loss: 2.2201
Epoch: 0 [160/250] Loss: 2.2903
Epoch: 0 [170/250] Loss: 2.2598
Epoch: 0 [180/250] Loss: 2.2723
Epoch: 0 [190/250] Loss: 2.4065
Epoch: 0 [200/250] Loss: 2.2486
Epoch: 0 [210/250] Loss: 2.2906
Epoch: 0 [220/250] Loss: 2.3872
Epoch: 0 [230/250] Loss: 2.3618
Epoch: 0 [240/250] Loss: 2.2794
Epoch: 0 [250/250] Loss: 2.4222
Training completed - Accuracy: 10.09%, Avg Loss

0,1
epoch,▁▂▃▃▄▅▆▆▇█
epoch_time,█▁▁▁▁▁▁▁▁▁
train_accuracy,▄▆▆▅▂▆█▁█▁
train_loss,▃▃▅█▃▂▁▆▄▄
val_accuracy,▃▁▆█▆▄▅▅▄▃

0,1
epoch,9.0
epoch_time,152.71961
train_accuracy,9.81373
train_loss,591.14812
val_accuracy,12.15


[34m[1mwandb[0m: Agent Starting Run: nx92y76v with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_norm: No
[34m[1mwandb[0m: 	data_aug: No
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	filter_org: [128, 128, 64, 64, 32]
[34m[1mwandb[0m: 	kernel_size: [5, 5, 5, 5, 5]
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_dense: 256
[34m[1mwandb[0m: 	optimizer: Adam


Train data size: 250
Validation data size: 63
Starting model_train_val function
Beginning training for 10 epochs
Starting epoch 1/10
Calling training function...
Epoch: 0 [10/250] Loss: 2.3253
Epoch: 0 [20/250] Loss: 2.3256
Epoch: 0 [30/250] Loss: 2.4987
Epoch: 0 [40/250] Loss: 2.3765
Epoch: 0 [50/250] Loss: 2.4426
Epoch: 0 [60/250] Loss: 2.3667
Epoch: 0 [70/250] Loss: 2.4269
Epoch: 0 [80/250] Loss: 2.3560
Epoch: 0 [90/250] Loss: 2.4383
Epoch: 0 [100/250] Loss: 2.3689
Epoch: 0 [110/250] Loss: 2.3276
Epoch: 0 [120/250] Loss: 2.3698
Epoch: 0 [130/250] Loss: 2.3546
Epoch: 0 [140/250] Loss: 2.4239
Epoch: 0 [150/250] Loss: 2.2556
Epoch: 0 [160/250] Loss: 2.3860
Epoch: 0 [170/250] Loss: 2.4016
Epoch: 0 [180/250] Loss: 2.3100
Epoch: 0 [190/250] Loss: 2.5424
Epoch: 0 [200/250] Loss: 2.3582
Epoch: 0 [210/250] Loss: 2.4372
Epoch: 0 [220/250] Loss: 2.3425
Epoch: 0 [230/250] Loss: 2.3198
Epoch: 0 [240/250] Loss: 2.5009
Epoch: 0 [250/250] Loss: 2.4475
Training completed - Accuracy: 9.66%, Avg Loss: