In [1]:
import torch
import mlflow

from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

from mlflow.models.signature import infer_signature

In [2]:
# MLflow tracking configuration, gathered in one place so it is easy to tune.
MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"    # MLflow server reached through its REST API
MLFLOW_EXPERIMENT_NAME = "Deep Learning Experiment"
SYSTEM_METRICS_SAMPLING_INTERVAL = 1             # value handed to set_system_metrics_sampling_interval

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
# set_experiment creates the experiment when it does not exist yet.
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

# IMPORTANT: Enable system metrics monitoring
mlflow.config.enable_system_metrics_logging()
mlflow.config.set_system_metrics_sampling_interval(SYSTEM_METRICS_SAMPLING_INTERVAL)

In [3]:
# Folder holding the FashionMNIST files; both splits share it.
# NOTE(review): this is an absolute, machine-specific path — consider a configurable data directory.
fashion_mnist_root = "/Users/debajyotidas/Library/CloudStorage/OneDrive-Personal/Online Courses/PyTorch/FashionMNIST"

# Training split (downloaded from open datasets when requested via download=True)
training_data = datasets.FashionMNIST(
    root=fashion_mnist_root,
    train=True,
    download=True,
    transform=ToTensor(),
)

# Test split, same location and transform
test_data = datasets.FashionMNIST(
    root=fashion_mnist_root,
    train=False,
    download=True,
    transform=ToTensor(),
)

In [4]:
# Shared hyper-parameter and loss/activation modules used by the cells below
batch_size = 64
softmax = nn.Softmax(dim=1)         # normalises along dim 1
loss_fn = nn.CrossEntropyLoss()     # train/test feed this criterion the raw logits
logloss_fn = nn.NLLLoss()           # train/test feed this criterion the log of the predicted probabilities

# Pick the compute backend in priority order: CUDA, then MPS, otherwise CPU
backend = "cpu"
if torch.cuda.is_available():
    backend = "cuda"
    device_name = torch.cuda.get_device_name(0)
elif torch.backends.mps.is_available():
    backend = "mps"
device = torch.device(backend)

# Alternative using accelerate library if PyTorch version >= 2.6
# device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
print(f"Using {device} device")

Using mps device


In [None]:
# Define Model
class NeuralNetwork(nn.Module):
    """Multi-layer perceptron that classifies 28x28 images into 10 classes.

    The forward pass returns both the raw logits and the softmax probabilities,
    so callers can pair the logits with CrossEntropyLoss or the (log of the)
    probabilities with NLLLoss.
    """

    def __init__(self):
        super().__init__()
        # Flatten each image of size 28x28 pixels to a vector of 784 elements
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features=28 * 28, out_features=512),  # 784 input features -> 512 nodes
            nn.ReLU(),                                         # Activation function
            nn.Linear(in_features=512, out_features=512),      # 512 nodes -> 512 nodes
            nn.ReLU(),
            nn.Linear(in_features=512, out_features=10),       # 512 nodes -> 10 output nodes (one per class)
        )
        # The softmax lives on the module instead of being read from a
        # notebook-level global, so the model works wherever the class is
        # available. Softmax has no parameters or buffers, so the state_dict
        # keys are the same as before.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Run the forward pass.

        Args:
            x: batch of images; everything after the first (batch) dimension
                is flattened and must total 784 elements per sample.

        Returns:
            Tuple ``(logits, pred_probab)``: the raw scores from the last
            linear layer and their softmax over dim 1.
        """
        x = self.flatten(x)                     # Flatten the input images
        logits = self.linear_relu_stack(x)      # Pass the flattened images through the stack of layers
        pred_probab = self.softmax(logits)      # Per-class probabilities
        return logits, pred_probab
    
# Build the network first, then move its parameters onto the selected device
model = NeuralNetwork()
model = model.to(device)

In [6]:
def train(dataloader, model, loss_fn, optimizer, losstype, epoch):
    """Train ``model`` for one pass over ``dataloader`` and log running metrics to MLflow.

    Args:
        dataloader: iterable of ``(X, y)`` batches; ``len(dataloader)`` and
            ``len(dataloader.dataset)`` are both used.
        model: module whose forward pass returns ``(logits, probabilities)``.
            The probabilities are assumed to be ``softmax(logits, dim=1)``,
            which is what ``NeuralNetwork`` in this notebook produces.
        loss_fn: criterion applied to the logits when ``losstype`` is
            ``'crossentropy'``, otherwise applied to log-probabilities
            (e.g. ``nn.NLLLoss``).
        optimizer: optimizer that updates ``model``'s parameters.
        losstype: ``'crossentropy'`` selects the logits path; any other value
            selects the log-probability path.
        epoch: zero-based epoch index, used to compute the MLflow step.

    Every 100th batch the running mean loss and running accuracy are logged
    under ``batch_loss`` / ``batch_accuracy`` and the current batch loss is printed.
    Batches are moved to the notebook-level ``device``.
    """
    size = len(dataloader.dataset)
    batches_per_epoch = len(dataloader)
    model.train()

    # initialize accumulators
    train_loss = 0.0
    total = 0
    correct = 0

    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # clear previous gradients
        optimizer.zero_grad()

        logits, pred_prob = model(X)    # Forward pass

        # Compute prediction error
        if losstype == 'crossentropy':
            # CrossEntropyLoss combines LogSoftmax and NLLLoss, so it takes the logits directly
            loss = loss_fn(logits, y)
        else:
            # The criterion expects log-probabilities. log_softmax on the logits
            # is used instead of log(softmax(logits)): when a probability
            # underflows to 0 the latter yields -inf and an infinite loss.
            loss = loss_fn(torch.log_softmax(logits, dim=1), y)

        # Backpropagation
        loss.backward()
        optimizer.step()

        # Calculate metrics
        loss_val = loss.item()
        train_loss += loss_val
        predicted = pred_prob.argmax(dim=1)
        total += y.size(0)
        correct += predicted.eq(y).sum().item()

        if batch % 100 == 0:
            current = (batch + 1) * len(X)
            batch_loss = train_loss / (batch + 1)   # mean loss over the batches seen so far
            batch_acc = 100.0 * correct / total     # accuracy (%) over the samples seen so far
            mlflow.log_metrics({"batch_loss": batch_loss,
                                "batch_accuracy": batch_acc},
                               step=epoch * batches_per_epoch + batch)
            print(f"loss: {loss_val:>7f}  [{current:>5d}/{size:>5d}]")

In [7]:
def test(dataloader, model, loss_fn, losstype, epoch):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()

    test_loss, total, correct = 0, 0, 0
    signature = None

    with torch.no_grad():
        for X,y in dataloader:
            
            X, y = X.to(device), y.to(device)

            logits, pred_prob = model(X)

            if losstype == 'crossentropy':
                test_loss += loss_fn(logits, y).item()              
            else:
                test_loss += loss_fn(torch.log(pred_prob), y).item()

            # Infer signature only once from first batch
            if signature is None:
                signature = infer_signature(X.cpu().numpy(), 
                                           (logits.cpu().numpy(), 
                                            pred_prob.cpu().numpy()))

            total += y.size(0)
            correct += (logits.argmax(1)==y).type(torch.float).sum().item()
        
        test_loss /= num_batches
        correct /= size
        accuracy = 100.0 * correct / total
        print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

        # Log epoch metrics
        mlflow.log_metrics({"val_loss": test_loss,
                            "val_accuracy": accuracy},
                            step=epoch)
        
        # Log checkpoint at the end of each epoch
        mlflow.pytorch.log_model(model, name=f"checkpoint_{epoch}", signature=signature)

    return signature

In [8]:
# Hyper-parameters for the run; the whole dict is logged to MLflow by the training cell
params = dict(
    epochs=5,
    learning_rate=1e-3,
    batch_size=64,
    losstype="nll",
    optimizer="SGD",
    model_type="MLP",
    hidden_units=[512, 512],
)

# Plain SGD over every model parameter, using the configured learning rate
learning_rate = params["learning_rate"]
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [9]:
# Wrap both splits in data loaders that share the configured batch size
loader_batch_size = params["batch_size"]
train_dataloader = DataLoader(dataset=training_data, batch_size=loader_batch_size)
test_dataloader = DataLoader(dataset=test_data, batch_size=loader_batch_size)

In [10]:
# Training loop
# Choose the criterion that matches the configured loss type once, rather than
# duplicating the train/test calls in two branches.
criterion = loss_fn if params["losstype"] == 'crossentropy' else logloss_fn

# Defined up front so the final log_model call still has a value when
# params["epochs"] is 0 and test() never runs.
signature = None

with mlflow.start_run() as run:
    # Log training parameters
    mlflow.log_params(params)
    for t in range(params["epochs"]):
        print(f"Epoch {t+1}\n-------------------------------")
        train(train_dataloader, model, criterion, optimizer, params["losstype"], t)
        # test() logs the validation metrics and a per-epoch checkpoint, and
        # returns the signature inferred from its first batch.
        signature = test(test_dataloader, model, criterion, params["losstype"], t)

    # Log the final trained model
    model_info = mlflow.pytorch.log_model(model, name="final_model", signature=signature)

    print("Done!")

2025/12/08 23:53:43 INFO mlflow.system_metrics.system_metrics_monitor: Skip logging GPU metrics. Set logger level to DEBUG for more details.
2025/12/08 23:53:43 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


Epoch 1
-------------------------------
loss: 2.310027  [   64/60000]
loss: 2.299719  [ 6464/60000]
loss: 2.280293  [12864/60000]
loss: 2.272435  [19264/60000]
loss: 2.247988  [25664/60000]
loss: 2.225052  [32064/60000]
loss: 2.235577  [38464/60000]
loss: 2.197560  [44864/60000]
loss: 2.198009  [51264/60000]




loss: 2.169366  [57664/60000]
Test Error: 
 Accuracy: 38.7%, Avg loss: 2.159209 

Epoch 2
-------------------------------
loss: 2.174826  [   64/60000]
loss: 2.165810  [ 6464/60000]
loss: 2.107644  [12864/60000]
loss: 2.129941  [19264/60000]
loss: 2.060283  [25664/60000]
loss: 2.005828  [32064/60000]
loss: 2.044127  [38464/60000]
loss: 1.959670  [44864/60000]
loss: 1.978841  [51264/60000]




loss: 1.909675  [57664/60000]
Test Error: 
 Accuracy: 46.3%, Avg loss: 1.899466 

Epoch 3
-------------------------------
loss: 1.938183  [   64/60000]
loss: 1.909106  [ 6464/60000]
loss: 1.791040  [12864/60000]
loss: 1.841928  [19264/60000]
loss: 1.706908  [25664/60000]
loss: 1.665682  [32064/60000]
loss: 1.701965  [38464/60000]
loss: 1.591639  [44864/60000]
loss: 1.632536  [51264/60000]




loss: 1.533903  [57664/60000]
Test Error: 
 Accuracy: 56.5%, Avg loss: 1.537029 

Epoch 4
-------------------------------
loss: 1.611799  [   64/60000]
loss: 1.571731  [ 6464/60000]
loss: 1.417023  [12864/60000]
loss: 1.494902  [19264/60000]
loss: 1.363826  [25664/60000]
loss: 1.360286  [32064/60000]
loss: 1.384856  [38464/60000]
loss: 1.299815  [44864/60000]
loss: 1.341639  [51264/60000]




loss: 1.253221  [57664/60000]
Test Error: 
 Accuracy: 62.6%, Avg loss: 1.266855 

Epoch 5
-------------------------------
loss: 1.352241  [   64/60000]
loss: 1.327758  [ 6464/60000]
loss: 1.156206  [12864/60000]
loss: 1.264927  [19264/60000]
loss: 1.140751  [25664/60000]
loss: 1.159068  [32064/60000]
loss: 1.189131  [38464/60000]
loss: 1.118571  [44864/60000]
loss: 1.159230  [51264/60000]




loss: 1.090358  [57664/60000]
Test Error: 
 Accuracy: 64.5%, Avg loss: 1.099311 



2025/12/08 23:54:47 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2025/12/08 23:54:47 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


Done!
🏃 View run rumbling-rat-480 at: http://127.0.0.1:5000/#/experiments/5/runs/25d41228443c4a508cc541d7c0c227da
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/5


In [None]:
# Human-readable label for each of the 10 class indices
classes = [
    "T-shirt/top",
    "Trouser",
    "Pullover",
    "Dress",
    "Coat",
    "Sandal",
    "Shirt",
    "Sneaker",
    "Bag",
    "Ankle boot",
]

# Fetch the final model logged by the training run through the PyTorch flavor
loaded_model = mlflow.pytorch.load_model(model_info.model_uri)
loaded_model.eval()

# Classify one test sample and compare against its label
sample_image, sample_label = test_data[10]
with torch.no_grad():
    x = sample_image.unsqueeze(0).to(device)    # add a leading batch dimension
    logits, probabilities = loaded_model(x)
    predicted_index = logits.argmax(1).item()
    predicted = classes[predicted_index]
    actual = classes[sample_label]
    print(f'Predicted: "{predicted}", Actual: "{actual}"')

Downloading artifacts:   0%|          | 0/6 [00:00<?, ?it/s]

Predicted: "Coat", Actual: "Coat"


The model is also saved the standard PyTorch way, as a `state_dict` checkpoint.

In [11]:
# Saving Models: write only the learned parameters (state_dict) to disk
# NOTE(review): this is an absolute, machine-specific path — consider a configurable output directory.
checkpoint_path = "/Users/debajyotidas/Library/CloudStorage/OneDrive-Personal/Online Courses/PyTorch/FashionMNIST/FashionMNISTMLFlow.pth"
model_state = model.state_dict()
torch.save(model_state, checkpoint_path)
print("Saved PyTorch Model State to FashionMNISTMLFlow.pth")

Saved PyTorch Model State to FashionMNISTMLFlow.pth


In [12]:
# Loading Models: rebuild the architecture, then restore the saved parameters into it
weights_path = "/Users/debajyotidas/Library/CloudStorage/OneDrive-Personal/Online Courses/PyTorch/FashionMNIST/FashionMNISTMLFlow.pth"
model = NeuralNetwork().to(device)
saved_state = torch.load(weights_path, weights_only=True)
model.load_state_dict(saved_state)

<All keys matched successfully>

In [None]:
# Human-readable label for each of the 10 class indices
classes = [
    "T-shirt/top",
    "Trouser",
    "Pullover",
    "Dress",
    "Coat",
    "Sandal",
    "Shirt",
    "Sneaker",
    "Bag",
    "Ankle boot",
]

# Classify the first test sample with the model restored from the state_dict
model.eval()
x, y = test_data[0]
with torch.no_grad():
    # Add the batch dimension explicitly, as the MLflow inference cell does,
    # rather than relying on the sample's leading dimension being read as the batch.
    x = x.unsqueeze(0).to(device)
    # The model returns (logits, probabilities); only the logits are needed here.
    logits, _ = model(x)
    # .item() turns the index tensor into a plain int before indexing the list.
    predicted, actual = classes[logits.argmax(1).item()], classes[y]
    print(f'Predicted: "{predicted}", Actual: "{actual}"')

Predicted: "Ankle boot", Actual: "Ankle boot"
