In [1]:
!pip install wandb



In [2]:
!wandb login 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

[34m[1mwandb[0m: [wandb.login()] Using explicit session credentials for https://api.wandb.ai.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
import torch
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

# 1. Define data transformations
# Normalization parameters for CIFAR-10
mean = (0.4914, 0.4822, 0.4465)
std = (0.2023, 0.1994, 0.2010)

transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])

transform_val = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])

# 2. Load the CIFAR-10 dataset
train_dataset = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transform_train
)

val_dataset = torchvision.datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=transform_val
)

# 3. Create PyTorch DataLoaders
batch_size = 128
num_workers = 2 # Using 2 workers for demonstration, adjust based on system capabilities

train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers
)

val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False, # No need to shuffle validation data
    num_workers=num_workers
)

print("Data preparation complete. Training and validation DataLoaders created.")

Data preparation complete. Training and validation DataLoaders created.


In [5]:
import torch.nn as nn
import torch.nn.functional as F

# 1. Define the CNN Model
class CNNModel(nn.Module):
    def __init__(self):
        super(CNNModel, self).__init__()
        # Convolutional Block 1
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2) # Output: 32x16x16

        # Convolutional Block 2
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2) # Output: 64x8x8

        # Convolutional Block 3
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2) # Output: 128x4x4

        # Fully Connected Layers
        # The output of the last conv layer is 128 channels * 4 * 4 spatial dimensions
        self.fc1 = nn.Linear(128 * 4 * 4, 512)
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(512, 10) # 10 classes for CIFAR-10

    def forward(self, x):
        # Conv Block 1
        x = self.pool1(F.relu(self.bn1(self.conv1(x))))
        # Conv Block 2
        x = self.pool2(F.relu(self.bn2(self.conv2(x))))
        # Conv Block 3
        x = self.pool3(F.relu(self.bn3(self.conv3(x))))

        # Flatten the tensor for the fully connected layers
        x = x.view(x.size(0), -1)

        # Fully Connected Layers
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

# 2. Instantiate the model
model = CNNModel()

# 3. Move the model to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print(f"CNNModel instantiated and moved to {device}.")
print(model)

CNNModel instantiated and moved to cuda.
CNNModel(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn3): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=2048, out_features=512, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc2): Linear(in_features=512, out_features=10, bias=True)
)


In [6]:
!pip install thop

Collecting thop
  Downloading thop-0.1.1.post2209072238-py3-none-any.whl.metadata (2.7 kB)
Downloading thop-0.1.1.post2209072238-py3-none-any.whl (15 kB)
Installing collected packages: thop
Successfully installed thop-0.1.1.post2209072238


In [7]:
from thop import profile

# Create a dummy input tensor (batch_size=1, 3 channels, 32x32 image for CIFAR-10)
dummy_input = torch.randn(1, 3, 32, 32).to(device)

# Calculate FLOPs and parameters
macs, params = profile(model, inputs=(dummy_input, ))

# Convert FLOPs to GFLOPs
gflops = macs / 1e9

print(f"Model FLOPs: {gflops:.2f} GFLOPs")
print(f"Model parameters: {params/1e6:.2f} M")

[INFO] Register count_convNd() for <class 'torch.nn.modules.conv.Conv2d'>.
[INFO] Register count_normalization() for <class 'torch.nn.modules.batchnorm.BatchNorm2d'>.
[INFO] Register zero_ops() for <class 'torch.nn.modules.pooling.MaxPool2d'>.
[INFO] Register count_linear() for <class 'torch.nn.modules.linear.Linear'>.
[INFO] Register zero_ops() for <class 'torch.nn.modules.dropout.Dropout'>.
Model FLOPs: 0.01 GFLOPs
Model parameters: 1.15 M


In [9]:
import wandb

# 1. Define hyperparameters
hyperparameters = {
    "learning_rate": 0.001,
    "epochs": 10,  # Placeholder, will be adjusted during actual training
    "batch_size": batch_size, # Reusing batch_size from earlier setup
    "optimizer": "Adam", # Common choice
    "loss_function": "CrossEntropyLoss", # Common choice for multi-class classification
    "model_architecture": "CNNModel",
    "dataset": "CIFAR-10",
    "num_workers": num_workers # Reusing num_workers from earlier setup
}

# 2. Initialize a Weights & Biases run
# Replace 'your_username_or_team' with your actual W&B entity if applicable
wandb.init(
    project="CIFAR10_CNN_Training",
    entity=None, # Set to your W&B entity name if you have one, otherwise keep None
    config=hyperparameters
)

print("WandB run initialized with the following hyperparameters:")
for key, value in hyperparameters.items():
    print(f"  {key}: {value}")


WandB run initialized with the following hyperparameters:
  learning_rate: 0.001
  epochs: 10
  batch_size: 128
  optimizer: Adam
  loss_function: CrossEntropyLoss
  model_architecture: CNNModel
  dataset: CIFAR-10
  num_workers: 2


In [10]:
import torch.optim as optim
import torch.nn as nn

# 1. Set the number of epochs in the hyperparameters dictionary
hyperparameters["epochs"] = 25

# 2. Define the loss function
criterion = nn.CrossEntropyLoss()

# 3. Define the optimizer
optimizer = optim.Adam(model.parameters(), lr=hyperparameters["learning_rate"])

# 4. Move the model to the specified device (already done in previous step, but ensuring here)
model.to(device)

print(f"Training will run for {hyperparameters['epochs']} epochs.")
print(f"Loss Function: {hyperparameters['loss_function']}")
print(f"Optimizer: {hyperparameters['optimizer']}")

# 5. Implement the main training loop
for epoch in range(hyperparameters["epochs"]):
    # Training Phase
    model.train()  # Set model to training mode
    running_loss = 0.0
    correct_train = 0
    total_train = 0

    for i, data in enumerate(train_loader, 0):
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        _, predicted = torch.max(outputs.data, 1)
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()

    train_loss = running_loss / len(train_loader)
    train_accuracy = 100 * correct_train / total_train

    # Log training loss and accuracy to WandB
    wandb.log({"train_loss": train_loss, "train_accuracy": train_accuracy}, step=epoch)

    # Validation Phase
    model.eval()  # Set model to evaluation mode
    val_loss = 0.0
    correct_val = 0
    total_val = 0

    with torch.no_grad():  # Disable gradient calculations during validation
        for data in val_loader:
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            val_loss += loss.item()

            _, predicted = torch.max(outputs.data, 1)
            total_val += labels.size(0)
            correct_val += (predicted == labels).sum().item()

    val_loss /= len(val_loader)
    val_accuracy = 100 * correct_val / total_val

    # Log validation loss and accuracy to WandB with commit=True
    wandb.log({"val_loss": val_loss, "val_accuracy": val_accuracy}, step=epoch, commit=True)

    print(f"Epoch {epoch+1}/{hyperparameters['epochs']} - Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.2f}%, Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.2f}%")

print("Finished Training")

# End the WandB run
wandb.finish()

Training will run for 25 epochs.
Loss Function: CrossEntropyLoss
Optimizer: Adam
Epoch 1/25 - Train Loss: 1.5809, Train Acc: 42.18%, Val Loss: 1.2059, Val Acc: 55.49%
Epoch 2/25 - Train Loss: 1.2319, Train Acc: 55.51%, Val Loss: 1.0690, Val Acc: 62.07%
Epoch 3/25 - Train Loss: 1.0879, Train Acc: 61.17%, Val Loss: 0.9131, Val Acc: 67.97%
Epoch 4/25 - Train Loss: 1.0051, Train Acc: 64.42%, Val Loss: 0.8294, Val Acc: 70.77%
Epoch 5/25 - Train Loss: 0.9486, Train Acc: 66.61%, Val Loss: 0.7724, Val Acc: 72.54%
Epoch 6/25 - Train Loss: 0.9020, Train Acc: 68.43%, Val Loss: 0.8067, Val Acc: 71.18%
Epoch 7/25 - Train Loss: 0.8683, Train Acc: 69.63%, Val Loss: 0.7325, Val Acc: 74.58%
Epoch 8/25 - Train Loss: 0.8324, Train Acc: 71.29%, Val Loss: 0.7433, Val Acc: 74.17%
Epoch 9/25 - Train Loss: 0.8065, Train Acc: 71.85%, Val Loss: 0.7029, Val Acc: 75.36%
Epoch 10/25 - Train Loss: 0.7750, Train Acc: 73.10%, Val Loss: 0.6796, Val Acc: 76.00%
Epoch 11/25 - Train Loss: 0.7562, Train Acc: 73.68%, Val L

0,1
train_accuracy,▁▃▄▅▅▆▆▆▆▇▇▇▇▇▇▇▇████████
train_loss,█▆▅▄▄▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁
val_accuracy,▁▃▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇████▇██
val_loss,█▇▅▄▄▄▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▂▁▁

0,1
train_accuracy,80.76
train_loss,0.5628
val_accuracy,82.15
val_loss,0.52388


In [12]:
import torch.optim as optim
import torch.nn as nn
import wandb

# 1. Re-initialize a Weights & Biases run
# Re-using hyperparameters from previous setup
wandb.init(
    project="CIFAR10_CNN_Training",
    entity=None, # Set to your W&B entity name if you have one, otherwise keep None
    config=hyperparameters
)
print("WandB run re-initialized.")

# 2. Set the number of epochs in the hyperparameters dictionary
hyperparameters["epochs"] = 25

# 3. Define the loss function
criterion = nn.CrossEntropyLoss()

# 4. Define the optimizer
optimizer = optim.Adam(model.parameters(), lr=hyperparameters["learning_rate"])

# 5. Move the model to the specified device (already done in previous step, but ensuring here)
model.to(device)

# 6. Call wandb.watch() to log gradients and weights
#    This should be called after the model has been moved to the device and wandb.init()
wandb.watch(model, log='all')

print(f"Training will run for {hyperparameters['epochs']} epochs.")
print(f"Loss Function: {hyperparameters['loss_function']}")
print(f"Optimizer: {hyperparameters['optimizer']}")
print("WandB is now watching model gradients and weights.")

# 7. Implement the main training loop
for epoch in range(hyperparameters["epochs"]):
    # Training Phase
    model.train()  # Set model to training mode
    running_loss = 0.0
    correct_train = 0
    total_train = 0

    for i, data in enumerate(train_loader, 0):
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        _, predicted = torch.max(outputs.data, 1)
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()

    train_loss = running_loss / len(train_loader)
    train_accuracy = 100 * correct_train / total_train

    # Log training loss and accuracy to WandB
    wandb.log({"train_loss": train_loss, "train_accuracy": train_accuracy}, step=epoch)

    # Validation Phase
    model.eval()  # Set model to evaluation mode
    val_loss = 0.0
    correct_val = 0
    total_val = 0

    with torch.no_grad():  # Disable gradient calculations during validation
        for data in val_loader:
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            val_loss += loss.item()

            _, predicted = torch.max(outputs.data, 1)
            total_val += labels.size(0)
            correct_val += (predicted == labels).sum().item()

    val_loss /= len(val_loader)
    val_accuracy = 100 * correct_val / total_val

    # Log validation loss and accuracy to WandB with commit=True
    wandb.log({"val_loss": val_loss, "val_accuracy": val_accuracy}, step=epoch, commit=True)

    print(f"Epoch {epoch+1}/{hyperparameters['epochs']} - Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.2f}%, Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.2f}%")

print("Finished Training")

# End the WandB run
wandb.finish()


WandB run re-initialized.
Training will run for 25 epochs.
Loss Function: CrossEntropyLoss
Optimizer: Adam
WandB is now watching model gradients and weights.
Epoch 1/25 - Train Loss: 0.5561, Train Acc: 80.92%, Val Loss: 0.5132, Val Acc: 82.16%
Epoch 2/25 - Train Loss: 0.5504, Train Acc: 81.26%, Val Loss: 0.5165, Val Acc: 82.42%
Epoch 3/25 - Train Loss: 0.5368, Train Acc: 81.62%, Val Loss: 0.5511, Val Acc: 81.98%
Epoch 4/25 - Train Loss: 0.5338, Train Acc: 81.74%, Val Loss: 0.5186, Val Acc: 82.33%
Epoch 5/25 - Train Loss: 0.5298, Train Acc: 81.92%, Val Loss: 0.5710, Val Acc: 80.74%
Epoch 6/25 - Train Loss: 0.5203, Train Acc: 82.51%, Val Loss: 0.4867, Val Acc: 83.34%
Epoch 7/25 - Train Loss: 0.5094, Train Acc: 82.56%, Val Loss: 0.5180, Val Acc: 82.13%
Epoch 8/25 - Train Loss: 0.5060, Train Acc: 82.57%, Val Loss: 0.5070, Val Acc: 82.32%
Epoch 9/25 - Train Loss: 0.5027, Train Acc: 82.76%, Val Loss: 0.4940, Val Acc: 83.29%
Epoch 10/25 - Train Loss: 0.4918, Train Acc: 83.06%, Val Loss: 0.492

0,1
train_accuracy,▁▂▂▂▃▄▄▄▄▄▅▅▅▆▆▆▆▆▇▇▇▇███
train_loss,██▇▇▇▆▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁
val_accuracy,▄▄▄▄▁▆▄▄▆▆▆▇▇▅███▇▆█▇█▅▇█
val_loss,▄▄▇▄█▂▄▄▃▃▂▂▂▅▁▂▂▂▃▁▁▂▄▂▂

0,1
train_accuracy,85.276
train_loss,0.42985
val_accuracy,84.16
val_loss,0.4867
