<p>
CAS on Advanced Machine Learning <br>
Data Science Lab, University of Bern, 2024<br>
Prepared by Dr. Mykhailo Vladymyrov.

</p>

This work is licensed under a <a href="https://creativecommons.org/licenses/by-sa/4.0/">Creative Commons Attribution-ShareAlike 4.0 International License</a>.

# Installations

In [None]:
# on colab:
# !pip install mlflow
# !pip install optuna

# Imports

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import einops as eo
import pathlib as pl

import matplotlib.cm as cm
from matplotlib import collections  as mc
from matplotlib import animation
%matplotlib inline

from scipy.stats import norm
from scipy.stats import entropy

import pandas as pd
import pickle
from PIL import Image
from time import time as timer
#import umap

from IPython.display import HTML
from IPython.display import Audio
import IPython

import tqdm.auto as tqdm

import torch
from torchvision import datasets, transforms
from torch import nn
from torch import optim
import torch.nn.functional as F

import sys
is_colab = 'google.colab' in sys.modules

import mlflow 
import optuna

# Setup

In [2]:
if is_colab:
    # mount google drive
    from google.colab import drive
    drive.mount('/content/drive')

    root_path = '/content/drive/My Drive/Colab Notebooks/CAS_AML_M3'
    root_path = pl.Path(root_path)
    root_path.mkdir(exist_ok=True, parents=True)
else:
    root_path = pl.Path.cwd()


In [3]:
BATCH_SIZE = 128
LAYERS = []

In [4]:
# create simple torch model for digit classsification

m, s = 0.5, 0.5

# Define a transform to normalize the data
transform = transforms.Compose([transforms.ToTensor(),
                                transforms.Normalize((m,), (s,)),
                                transforms.Lambda(lambda x: torch.flatten(x))])

# Download and load the training data
trainset = datasets.FashionMNIST('~/.pytorch/FMNIST_data/', download=True, train=True, transform=transform)
train_loader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True, drop_last=True)

# Download and load the test data
validset = datasets.FashionMNIST('~/.pytorch/FMNIST_data/', download=True, train=False, transform=transform)
valid_loader = torch.utils.data.DataLoader(validset, batch_size=64, shuffle=False, drop_last=True)

# Print shapes of the datasets
print('Train dataset shape:', len(trainset), 'total images and labels')
print('Test dataset shape:', len(validset), 'total images and labels')

Train dataset shape: 60000 total images and labels
Test dataset shape: 10000 total images and labels


In [5]:
# FCN
class MyModel(nn.Module):
    def __init__(self, n_input, n_hiddens, n_output):
        super(MyModel, self).__init__()

        self.ls = []
        n_prev = n_input
        for i, n_out in enumerate(n_hiddens):
          l = nn.Linear(n_prev, n_out)  # for hidden layer we create a linear projection form n_prev features to n_out features
          n_prev = n_out
          self.add_module(f'lin_{i}_{n_out}', l)
          self.ls.append(l)

        self.lout = nn.Linear(n_prev, n_output)  # also we need the output layer

    def forward(self, x):
        h = x
        for li in self.ls:  # for each layer we apply the linear projection and the activation fuinction (ReLU)
          h = li(h)
          h = torch.relu(h)

        logits = self.lout(h)
        # Apply softmax activation per row, to get the class pseudoprobabilities
        probs = F.softmax(logits, dim=1)

        # Prediction: argmax for classification
        pred = torch.argmax(probs, dim=1)  # find the element with highest value in each row

        return logits, probs, pred


In [6]:
n_hidden = []
model = MyModel(n_input=784, n_hiddens=n_hidden, n_output=10)  # 784 input features for 28x28 images, 10 output classes

In [7]:
def get_npars(model):
    """
    Returns the total number of parameters in the given PyTorch model.
    For each parmaeter in the model, the function multiplies the elements of the shape
    of the parameter tensor to get the total number of parameters.
    
    Args:
        model (torch.nn.Module): The PyTorch model to get the number of parameters for.
    
    Returns:
        int: The total number of parameters in the model.
    """
    n = np.sum([np.prod(list(p.shape)) for p in model.parameters()])
    return int(n)

In [8]:
get_npars(model)

7850

In [9]:
def train(model, trainloader, criterion, optimizer):
    model.train()
    running_loss = 0.0
    n_correct = 0
    for images, labels in trainloader:
        optimizer.zero_grad()  # reset gradients
        output, prob, pred = model(images)
        loss = criterion(output, labels)
        loss.backward()    # compute gradients
        optimizer.step()   # update parameters with gradients
        running_loss += loss.item()
        correct = torch.sum(pred == labels)
        n_correct += correct
    return running_loss / len(trainloader), n_correct/len(trainloader)/trainloader.batch_size

In [10]:
def test(model, testloader, criterion):
    model.eval()
    running_loss = 0.0
    n_correct = 0
    with torch.no_grad():
        for images, labels in testloader:
            output, prob, pred = model(images)
            loss = criterion(output, labels)
            running_loss += loss.item()
            correct = torch.sum(pred == labels)
            n_correct += correct
    return running_loss / len(testloader), n_correct/len(testloader)/testloader.batch_size

## Training loop example:

In [None]:
num_epochs = 5

# Loss function
criterion = nn.CrossEntropyLoss()
# Optimizer (e.g., Adam)
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(num_epochs):
    train_loss, train_acc = train(model, train_loader, criterion, optimizer)
    valid_loss, valid_acc = test(model, valid_loader, criterion)

    print(f'Epoch {epoch+1}/{num_epochs}, Valid Loss: {train_loss:.4f}, Valid Loss: {valid_loss:.4f}, Train Acc: {train_acc:.4f}, Valid Acc: {valid_acc:.4f}')


# MLFlow

In [12]:
num_epochs = 7

mlflow_log_path = root_path / 'mlflow'
mlflow_log_path.mkdir(exist_ok=True, parents=True)
mlflow_log_path =  str(mlflow_log_path)  # path to mlflow folder where mlflow will log the experiments
# autolog with mlflow
mlflow_log_path = r'file:///' + mlflow_log_path  # note the tripple slashes
print(mlflow_log_path)


mlflow.set_tracking_uri(mlflow_log_path)  # tell mlflow where to log the experiments
mlflow.pytorch.autolog()  # autolog with mlflow - log all the parameters, metrics, etc.

exp_name = 'FMNIST_CNN'  # name of the experiment
run_name = '1_layer'     # name of the run - multiple runs can be logged under the same experiment. Be sure to use different, clear, yet short names for different runs.


# create model - 1 layer, cross-entropy loss, Adam optimizer
n_hidden = []
model = MyModel(n_input=784, n_hiddens=n_hidden, n_output=10)  # 784 input features for 28x28 images, 10 output classes
# Loss function
criterion = nn.CrossEntropyLoss()
# Optimizer (e.g., Adam)
optimizer = optim.Adam(model.parameters(), lr=0.001)

nested = True
mlflow.set_experiment(exp_name)
with mlflow.start_run(run_name=run_name, nested=nested):
    mlflow.log_param('num_params', get_npars(model))  # you can log any parameters

    for epoch in range(num_epochs):
        train_loss, train_acc = train(model, train_loader, criterion, optimizer)
        valid_loss, valid_acc = test(model, valid_loader, criterion)

        print(f'Epoch {epoch+1}/{num_epochs}, Valid Loss: {train_loss:.4f}, Valid Loss: {valid_loss:.4f}, Train Acc: {train_acc:.4f}, Valid Acc: {valid_acc:.4f}')

        # Log metrics to MLflow at each iteration
        mlflow.log_metric('train_loss', train_loss, step=epoch)
        mlflow.log_metric('valid_loss', valid_loss, step=epoch)
        mlflow.log_metric('train_acc', train_acc, step=epoch)
        mlflow.log_metric('valid_acc', valid_acc, step=epoch)
        
    mlflow.pytorch.log_model(model, 'model_trained')  # you can save the model as a model artifact



file:///d:\docs\DSL\Teaching\CAS_AML\CAS_AML_M3\mlflow
Epoch 1/5, Valid Loss: 0.5475, Valid Loss: 0.4713, Train Acc: 0.8086, Valid Acc: 0.8311
Epoch 2/5, Valid Loss: 0.4242, Valid Loss: 0.4257, Train Acc: 0.8482, Valid Acc: 0.8454
Epoch 3/5, Valid Loss: 0.3883, Valid Loss: 0.4156, Train Acc: 0.8603, Valid Acc: 0.8464
Epoch 4/5, Valid Loss: 0.3683, Valid Loss: 0.4020, Train Acc: 0.8677, Valid Acc: 0.8547
Epoch 5/5, Valid Loss: 0.3511, Valid Loss: 0.4004, Train Acc: 0.8722, Valid Acc: 0.8550


In [14]:
# if run locally, use the following command from the terminal (copy & paste output to terminal)
print(f'mlflow server --host 127.0.0.1 --port 8080 --backend-store-uri {mlflow_log_path}')

mlflow server --host 127.0.0.1 --port 8080 --backend-store-uri file:///d:\docs\DSL\Teaching\CAS_AML\CAS_AML_M3\mlflow


On colab - copy the mlflow folder to local machine, and adjust the path to the mlflow folder.


## Exercise 1

1. Run same with 2 layer NN (e.g. set n_hidden = [32] and a distinct run name)
2. Copy the mlflow directory locally (or mount google drive)
3. Then:
    * explre the effect of n_hidden on the training time
    * explore the effect of n_hidden on performance
    * explore the effect of number of parameters vs performance
    * ... explore :)

# Optuna

In [20]:
def train_model_wrapper(num_epochs, run_name, n_hidden, lr, trial=None):
    mlflow_log_path = root_path / 'mlflow'
    mlflow_log_path.mkdir(exist_ok=True, parents=True)
    mlflow_log_path =  str(mlflow_log_path)  # path to mlflow folder where mlflow will log the experiments
    # autolog with mlflow
    mlflow_log_path = r'file:///' + mlflow_log_path  # note the tripple slashes
    print(mlflow_log_path)


    mlflow.set_tracking_uri(mlflow_log_path)  # tell mlflow where to log the experiments
    mlflow.pytorch.autolog()  # autolog with mlflow - log all the parameters, metrics, etc.

    exp_name = 'FMNIST_CNN'  # name of the experiment
    
    # create model - 1 layer, cross-entropy loss, Adam optimizer
    n_hidden = []
    model = MyModel(n_input=784, n_hiddens=n_hidden, n_output=10)  # 784 input features for 28x28 images, 10 output classes
    # Loss function
    criterion = nn.CrossEntropyLoss()
    # Optimizer (e.g., Adam)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    nested = True
    mlflow.set_experiment(exp_name)
    valid_acc_list = []
    with mlflow.start_run(run_name=run_name, nested=nested):
        mlflow.log_param('num_params', get_npars(model))  # you can log any parameters
        mlflow.log_param('num_hidden', len(n_hidden))
        mlflow.log_param('n_hidden', ','.join(n_hidden))

        mlflow.log_param('lr', lr)
        mlflow.log_param('num_epochs', num_epochs)


        for epoch in range(num_epochs):
            train_loss, train_acc = train(model, train_loader, criterion, optimizer)
            valid_loss, valid_acc = test(model, valid_loader, criterion)

            print(f'Epoch {epoch+1}/{num_epochs}, Valid Loss: {train_loss:.4f}, Valid Loss: {valid_loss:.4f}, Train Acc: {train_acc:.4f}, Valid Acc: {valid_acc:.4f}')

            # Log metrics to MLflow at each iteration
            mlflow.log_metric('train_loss', train_loss, step=epoch)
            mlflow.log_metric('valid_loss', valid_loss, step=epoch)
            mlflow.log_metric('train_acc', train_acc, step=epoch)
            mlflow.log_metric('valid_acc', valid_acc, step=epoch)

            valid_acc_list.append(valid_acc)

            # if trial is not None:
            #     trial.report(valid_acc, epoch)

            # # Handle pruning based on the intermediate value.
            # if trial.should_prune():
            #     raise optuna.TrialPruned()
            
        
        mlflow.pytorch.log_model(model, 'model_trained')  # you can save the model as a model artifact
    return np.max(valid_acc_list)  # best validation accuracy

In [21]:
# use optuna to find the best parameters for the model - i.e. elements of the array n_hidden

def objective(trial):
    n_ep = 9
    n_hidden_layers = trial.suggest_int("n_hidden", 0, 3)
    n_hidden = []
    for i in range(n_hidden_layers):
        # select between 16 and 2048 in log scale
        n_neurons_log = trial.suggest_int(f"n_hidden_{i}", 4, 10)
        n_neurons = 2 ** n_neurons_log
        n_hidden.append(n_neurons)

    #lr = trial.suggest_float("lr", 3e-5, 3e-3, log=True)
    lr = 1e-3

    rname=f"optuna2_{n_hidden_layers}_{n_hidden}_{lr}"
    acc = train_model_wrapper(num_epochs=n_ep, n_hidden=n_hidden, lr=lr, run_name=rname, trial=trial)

    return acc



In [22]:
path_optuna_db_dir = root_path / "optuna_db"
path_optuna_db_dir.mkdir(exist_ok=True)
path_optuna_db = path_optuna_db_dir / "db.sqlite3"
path_optuna_db = "sqlite:///" + str(path_optuna_db)

study = optuna.create_study(storage=path_optuna_db,
                            direction="maximize",
                            study_name="architecture_opt")

study.optimize(objective, n_trials=120, timeout=3600*2, show_progress_bar=True)

DuplicatedStudyError: Another study with name 'arch_opt2' already exists. Please specify a different name, or reuse the existing one by setting `load_if_exists` (for Python API) or `--skip-if-exists` flag (for CLI).

In [18]:
# if run locally - paste the output of the print to the terminal:
print(f'optuna-dashboard {path_optuna_db}')

# otherwise copy file locally and adjust the path accordingly

optuna-dashboard sqlite:///d:\docs\DSL\Teaching\CAS_AML\CAS_AML_M3\optuna_db\db.sqlite3
