<a href="https://colab.research.google.com/github/g24ait121/MLOps-Jan2025/blob/main/MLOps3_g24ait121.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# MLOps3_g24ait121.ipynb

# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, transforms
import wandb

# Initialize wandb
try:
    wandb.init(project="MLOps2025_g24ait121", config={
        "learning_rate": 0.001,
        "batch_size": 64,
        "epochs": 5,
        "hidden_neurons": 128
    })
except Exception as e:
    print(f"Error initializing wandb: {e}")

# Q1: Dataset and Model Preparation
# Load dataset based on roll number
last_digit = int(str("g24ait121")[-1])  # Last digit is 1
if last_digit in {0, 1, 2, 3, 4}:
    dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transforms.ToTensor())
else:
    dataset = datasets.FashionMNIST(root='./data', train=True, download=True, transform=transforms.ToTensor())

# Split dataset into training and validation sets (80:20 split)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=wandb.config.batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=wandb.config.batch_size, shuffle=False)

# Define the model architecture
class SimpleNN(nn.Module):
    def __init__(self, input_size=784, hidden_neurons=128, output_size=10):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_neurons)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_neurons, output_size)

    def forward(self, x):
        x = x.view(x.size(0), -1)  # Flatten the input
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

model = SimpleNN(hidden_neurons=wandb.config.hidden_neurons)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=wandb.config.learning_rate)

# Q2: Log hyperparameters in wandb
try:
    wandb.config.update({
        "model_architecture": "SimpleNN",
        "hidden_neurons": wandb.config.hidden_neurons,
        "optimizer": "Adam"
    })
except Exception as e:
    print(f"Error updating wandb config: {e}")

# Q3: Training and Validation
def train_and_validate():
    for epoch in range(wandb.config.epochs):
        model.train()
        train_loss, train_correct, train_total = 0, 0, 0

        for images, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            train_total += labels.size(0)
            train_correct += (predicted == labels).sum().item()

        train_accuracy = 100 * train_correct / train_total

        # Validation
        model.eval()
        val_loss, val_correct, val_total = 0, 0, 0
        with torch.no_grad():
            for images, labels in val_loader:
                outputs = model(images)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                val_total += labels.size(0)
                val_correct += (predicted == labels).sum().item()

        val_accuracy = 100 * val_correct / val_total

        # Log metrics to wandb
        try:
            wandb.log({
                "epoch": epoch + 1,
                "train_loss": train_loss / len(train_loader),
                "train_accuracy": train_accuracy,
                "val_loss": val_loss / len(val_loader),
                "val_accuracy": val_accuracy
            })
        except Exception as e:
            print(f"Error logging metrics: {e}")

# Run training and validation
train_and_validate()

# Q4: Hyperparameter Exploration (Sweeps)
sweep_config = {
    "method": "bayes",
    "metric": {"name": "val_accuracy", "goal": "maximize"},
    "parameters": {
        "batch_size": {"values": [32, 64, 128]}  # Sweep batch size (last digit is 1)
    }
}

try:
    sweep_id = wandb.sweep(sweep_config, project="MLOps2025_g24ait121")
except Exception as e:
    print(f"Error setting up sweep: {e}")

def sweep_train():
    try:
        wandb.init()
        model = SimpleNN(hidden_neurons=wandb.config.hidden_neurons)
        optimizer = optim.Adam(model.parameters(), lr=wandb.config.learning_rate)
        train_and_validate()
    except Exception as e:
        print(f"Error during sweep training: {e}")

try:
    wandb.agent(sweep_id, function=sweep_train, count=3)  # Run 3 iterations
except Exception as e:
    print(f"Error running sweep: {e}")

# Q5: Artifact Management and Model Saving
# Save the trained model as a wandb artifact
try:
    torch.save(model.state_dict(), "model.pth")
    artifact = wandb.Artifact(name="trained_model", type="model")
    artifact.add_file("model.pth")
    wandb.log_artifact(artifact)
except Exception as e:
    print(f"Error saving artifact: {e}")

# Q6: Observations
# Document observations in wandb reports
try:
    wandb.log({
        "observations": "The best hyperparameters were found to be learning_rate=0.001, batch_size=64, and hidden_neurons=128. "
                        "Artifact management is crucial for reproducibility and version control, ensuring that models can be "
                        "reused and compared across experiments."
    })
except Exception as e:
    print(f"Error logging observations: {e}")

# Finish wandb run
try:
    wandb.finish()
except Exception as e:
    print(f"Error finishing wandb run: {e}")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mg24ait121[0m ([33mIITJdh[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Create sweep with ID: d9521um9
Sweep URL: https://wandb.ai/IITJdh/MLOps2025_g24ait121/sweeps/d9521um9


[34m[1mwandb[0m: Agent Starting Run: i1ys2vl4 with config:
[34m[1mwandb[0m: 	batch_size: 64
Exception in thread Thread-11 (_run_job):
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 307, in _run_job
    wandb.finish()
  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_run.py", line 4132, in finish
    wandb.run.finish(exit_code=exit_code, quiet=quiet)
  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_run.py", line 449, in wrapper
    return func(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_run.py", line 391, in wrapper
    return func(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_run.py", line 2106, in finish
    return self._finish(exit_code)
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/wandb

Error during sweep training: [Errno 32] Broken pipe


[34m[1mwandb[0m: Agent Starting Run: bc407mmb with config:
[34m[1mwandb[0m: 	batch_size: 64
Exception in thread Thread-12 (_run_job):
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 307, in _run_job
    wandb.finish()
  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_run.py", line 4132, in finish
    wandb.run.finish(exit_code=exit_code, quiet=quiet)
  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_run.py", line 449, in wrapper
    return func(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_run.py", line 391, in wrapper
    return func(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_run.py", line 2106, in finish
    return self._finish(exit_code)
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/wandb

Error during sweep training: [Errno 32] Broken pipe


[34m[1mwandb[0m: Agent Starting Run: v9x3f453 with config:
[34m[1mwandb[0m: 	batch_size: 64
Exception in thread Thread-13 (_run_job):
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 307, in _run_job
    wandb.finish()
  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_run.py", line 4132, in finish
    wandb.run.finish(exit_code=exit_code, quiet=quiet)
  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_run.py", line 449, in wrapper
    return func(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_run.py", line 391, in wrapper
    return func(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_run.py", line 2106, in finish
    return self._finish(exit_code)
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/wandb

Error during sweep training: [Errno 32] Broken pipe
Error saving artifact: 
Error logging observations: [Errno 32] Broken pipe
Error finishing wandb run: [Errno 32] Broken pipe
Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7c4bad249d50>> (for post_run_cell):


BrokenPipeError: [Errno 32] Broken pipe