In [8]:
# Basic Libraries
import mlflow, warnings, sys
import numpy as np

import mlflow.pytorch

# Deep Learning
import torch, torchvision
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

warnings.filterwarnings("ignore")

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [3]:
# The set_experiment API creates a new experiment if it doesn't exist.
mlflow.set_experiment("Deep Learning Experiment")

# IMPORTANT: Enable system metrics monitoring
mlflow.config.enable_system_metrics_logging()
mlflow.config.set_system_metrics_sampling_interval(1)

In [4]:
# Load and prepare data
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
)
train_dataset = datasets.FashionMNIST(
    "data", train=True, download=True, transform=transform
)
test_dataset = datasets.FashionMNIST("data", train=False, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1000)

In [5]:
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits


model = NeuralNetwork().to(device)

In [6]:
# Training parameters
params = {
    "epochs": 5,
    "learning_rate": 1e-3,
    "batch_size": 64,
    "optimizer": "SGD",
    "model_type": "MLP",
    "hidden_units": [512, 512],
}

# Define optimizer and loss function
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=params["learning_rate"])

In [7]:
with mlflow.start_run() as run:
    # Log training parameters
    mlflow.log_params(params)

    for epoch in range(params["epochs"]):
        model.train()
        train_loss, correct, total = 0, 0, 0

        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)

            # Forward pass
            optimizer.zero_grad()
            output = model(data)
            loss = loss_fn(output, target)

            # Backward pass
            loss.backward()
            optimizer.step()

            # Calculate metrics
            train_loss += loss.item()
            _, predicted = output.max(1)
            total += target.size(0)
            correct += predicted.eq(target).sum().item()

            # Log batch metrics (every 100 batches)
            if batch_idx % 100 == 0:
                batch_loss = train_loss / (batch_idx + 1)
                batch_acc = 100.0 * correct / total
                mlflow.log_metrics(
                    {"batch_loss": batch_loss, "batch_accuracy": batch_acc},
                    step=epoch * len(train_loader) + batch_idx,
                )

        # Calculate epoch metrics
        epoch_loss = train_loss / len(train_loader)
        epoch_acc = 100.0 * correct / total

        # Validation
        model.eval()
        val_loss, val_correct, val_total = 0, 0, 0
        with torch.no_grad():
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                loss = loss_fn(output, target)

                val_loss += loss.item()
                _, predicted = output.max(1)
                val_total += target.size(0)
                val_correct += predicted.eq(target).sum().item()

        # Calculate and log epoch validation metrics
        val_loss = val_loss / len(test_loader)
        val_acc = 100.0 * val_correct / val_total

        # Log epoch metrics
        mlflow.log_metrics(
            {
                "train_loss": epoch_loss,
                "train_accuracy": epoch_acc,
                "val_loss": val_loss,
                "val_accuracy": val_acc,
            },
            step=epoch,
        )
        
        # âœ… CORRECTO: usar artifact_path en lugar de name
        mlflow.pytorch.log_model(
            model, 
            artifact_path=f"checkpoint_epoch_{epoch}"
        )

        print(
            f"Epoch {epoch+1}/{params['epochs']}, "
            f"Train Loss: {epoch_loss:.4f}, Train Acc: {epoch_acc:.2f}%, "
            f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%"
        )

    # âœ… CORRECTO: Log the final trained model with signature
    example_input = next(iter(train_loader))[0][:1].to(device)
    signature = mlflow.models.infer_signature(
        example_input.cpu().numpy(),
        model(example_input).detach().cpu().numpy()
    )
    
    model_info = mlflow.pytorch.log_model(
        model, 
        artifact_path="final_model",
        signature=signature,
        input_example=example_input.cpu().numpy()
    )
    
    print(f"\nâœ… Model logged: {model_info.model_uri}")

2025/12/30 11:04:39 INFO mlflow.system_metrics.system_metrics_monitor: Skip logging GPU metrics. Set logger level to DEBUG for more details.
2025/12/30 11:04:39 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


Epoch 1/5, Train Loss: 1.7224, Train Acc: 55.29%, Val Loss: 1.2241, Val Acc: 64.77%




Epoch 2/5, Train Loss: 1.0085, Train Acc: 68.13%, Val Loss: 0.8856, Val Acc: 69.91%




Epoch 3/5, Train Loss: 0.8073, Train Acc: 73.14%, Val Loss: 0.7659, Val Acc: 74.02%




Epoch 4/5, Train Loss: 0.7146, Train Acc: 76.42%, Val Loss: 0.6938, Val Acc: 76.80%




Epoch 5/5, Train Loss: 0.6523, Train Acc: 78.53%, Val Loss: 0.6426, Val Acc: 78.20%


2025/12/30 11:07:19 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2025/12/30 11:07:19 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!



âœ… Model logged: models:/m-309d3c8ed2814c7bbe31657df2474fd7


In [10]:
experiment = mlflow.get_experiment_by_name("Deep Learning Experiment")
runs = mlflow.search_runs(
    experiment_ids=[experiment.experiment_id],
    order_by=["start_time DESC"],
    max_results=1
)

run_id = runs.iloc[0]['run_id']
print(f"ðŸ“‹ Using Run ID: {run_id}")
print(f"   Val Accuracy: {runs.iloc[0]['metrics.val_accuracy']:.4f}")
print(f"   Val Loss: {runs.iloc[0]['metrics.val_loss']:.4f}\n")

ðŸ“‹ Using Run ID: 2ed17e245b8a4ae8abe05ccfce0a39ee
   Val Accuracy: 78.2000
   Val Loss: 0.6426



In [11]:
model = mlflow.pytorch.load_model(f"runs:/{run_id}/final_model")


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/8 [00:00<?, ?it/s]

In [12]:
model.to(device)
model.eval()

# Resume the previous run to log test metrics
with mlflow.start_run(run_id=run.info.run_id) as run:
    # Evaluate the model on the test set
    test_loss, test_correct, test_total = 0, 0, 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
        output = model(data)
        loss = loss_fn(output, target)

        test_loss += loss.item()
        _, predicted = output.max(1)
        test_total += target.size(0)
        test_correct += predicted.eq(target).sum().item()

    # Calculate and log final test metrics
    test_loss = test_loss / len(test_loader)
    test_acc = 100.0 * test_correct / test_total

    mlflow.log_metrics({"test_loss": test_loss, "test_accuracy": test_acc})
    print(f"Final Test Accuracy: {test_acc:.2f}%")


2025/12/30 11:17:02 INFO mlflow.system_metrics.system_metrics_monitor: Skip logging GPU metrics. Set logger level to DEBUG for more details.
2025/12/30 11:17:02 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.
2025/12/30 11:17:04 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2025/12/30 11:17:04 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


Final Test Accuracy: 78.40%
