<img src="img/hpe_logo.png" alt="HPE Logo" width="125">

# HPE ML Platform Workshop - Model Training

<img src='img/platform_step02_training.png' width='1200'/>

In [None]:
import os
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from configparser import ConfigParser
from sklearn.model_selection import train_test_split

# Torch modules
import torch
from torch import nn
from torch import optim
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torchvision.transforms.functional as TF

# Image modules
from PIL import Image
import matplotlib.pyplot as plt
%matplotlib inline

# Import functions for downloading data
from utils.load_data import download_data

# Import model util functions
from utils.model_utils import set_seed, plot_example, PairedRandomHorizontalFlip, PairedRandomAffine, PairedToTensor, DoubleConv,  InConv, Down, Up, OutConv, UNet

# Import MLDE packages
from determined.experimental import client as det
from determined import pytorch

# Import MLDM packages
import pachyderm_sdk
from pachyderm_sdk.api import pfs
from pachyderm_sdk.api.pfs import File, FileType

# Remove warnings
import warnings
warnings.filterwarnings('ignore')

<br/>
<div style="font-size:20px;color:maroon;font-family: 'Courier New';font-weight:bold">
    Important: set your MLDM Project name
</div>

In [None]:
# MLDM project that holds the repo; used to build the "{project}/{repo}@{branch}" URI below.
project = "brain-mri-workshop" # change the project name

In [None]:
# Load Data and Set Variables
config_path = "./utils/config.ini"
config_obj = ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Fail early with a clear message instead of a
# confusing KeyError on 'PDK_INFO' further down.
if not config_obj.read(config_path):
    raise FileNotFoundError(f"Could not read configuration file: {config_path}")

# All workshop settings live in the [PDK_INFO] section.
pdk_info = config_obj['PDK_INFO']

mldm_host = pdk_info['mldm_host']
mldm_port = pdk_info['mldm_port']
token = pdk_info['token']
repo = pdk_info['repo']
branch = pdk_info['branch']
download_dir = pdk_info['download_dir']
images_dir = pdk_info['images_dir']

<h2>Part 1: Processing, Loading and Analyzing Data </h2>

<img src='img/platform_step01_data.png' width='1200'/>

In [None]:
# Connect to Instance
# Host, port and token all come from the [PDK_INFO] section of config.ini.
# NOTE(review): ConfigParser returns strings, so mldm_port is a str here —
# confirm the client accepts that or cast to int.
mldm_client = pachyderm_sdk.Client(mldm_host, mldm_port, token)

In [None]:
# List Files in the Repository
# Walk every entry of the repo branch, echo its path, and tally images,
# masks and everything else (counted as folders).
files = []
c_file = c_mask = c_folder = 0

repo_uri = f"{project}/{repo}@{branch}"
for entry in mldm_client.pfs.walk_file(file=File.from_uri(repo_uri)):
    entry_path = entry.file.path
    print(f"'{entry_path}'")
    # Check the mask suffix first: mask paths also contain ".tif".
    if "_mask.tif" in entry_path:
        c_mask += 1
        continue
    if ".tif" in entry_path:
        c_file += 1
        continue
    c_folder += 1
# NOTE(review): presumably discounts two non-patient entries (e.g. the root
# and the top-level data folder) — TODO confirm.
c_folder -= 2

In [None]:
# Report the image and mask tallies gathered while walking the repository
# (the folder tally, c_folder, is not reported).
print(f"--> Total Images: {c_file}")
print(f"--> Total Masks: {c_mask}")

In [None]:
# Download Pre-Processed Files for local testing
# download_data comes from utils.load_data; its return value is kept in `files`.
# NOTE: `files` is rebound by the os.walk loop in a later cell.
files = download_data(mldm_client, repo, branch, project, download_dir)

In [None]:
# Root of the downloaded dataset: the "data1" folder under the download directory.
# Used below as Config.data_dir.
ROOT = f"{download_dir}/data1"
ROOT

<h3> Data Exploration </h3>

In [None]:
class Config:
    """Notebook-wide settings, read as class attributes (e.g. ``Config.seed``)."""

    # --- data preprocessing ---
    data_dir = ROOT               # dataset root on local disk
    logdir = 'logdir'
    validation_fraction = 0.15    # share of all samples used for validation
    test_fraction = 0.10          # share of all samples used for testing
    train_batch = 16
    valid_batch = 32
    test_batch = 32

    # --- model setup ---
    input_dim = 256
    input_ch = 3
    output_dim = 256
    output_ch = 1

    # --- training ---
    seed = 21
    learning_rate = 0.01
    epochs = 10
    # Use the GPU when one is visible to PyTorch, otherwise the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# set_seed is imported from utils.model_utils; called here with Config.seed
# before the data split (presumably seeds the RNGs in use — verify in utils).
set_seed(Config.seed)

In [None]:
# Build an index of (directory, image, mask) for every mask file on disk.
dirs, images, masks = [], [], []
# Loop names are chosen so they do not overwrite the notebook-level `files`
# variable returned by download_data().
for root, _folders, filenames in os.walk(Config.data_dir):
    for filename in filenames:
        # save only images with corresponding masks
        if 'mask' in filename:
            # Directory is stored relative to the dataset root (prefix stripped).
            dirs.append(root.replace(Config.data_dir, ''))
            masks.append(filename)
            # The image shares the mask's name without the "_mask" part.
            images.append(filename.replace('_mask', ''))

PathDF = pd.DataFrame({'directory': dirs, 'images': images, 'masks': masks})
PathDF.head()

In [None]:
# Fraction of all samples held out from training (validation + test).
train2rest = Config.validation_fraction + Config.test_fraction
# Share of the held-out samples that goes to validation.
test2valid = Config.validation_fraction/train2rest

# First split: training set vs. held-out rest.
train_df, rest = train_test_split(
    PathDF, random_state=Config.seed,
    test_size=train2rest
)

# Second split of the held-out rest. train_test_split returns
# (remainder, test_size portion): the remainder becomes the test set and the
# `test_size` portion (test2valid) becomes the validation set.
test_df, valid_df = train_test_split(
    rest, random_state=Config.seed,
    test_size=test2valid
)

print('Train:', train_df.shape[0])
print('Valid:', valid_df.shape[0])
print('Test:', test_df.shape[0])

In [None]:
# Show a few image/mask examples from the test split (same rows as before,
# one call per index instead of four copy-pasted lines).
for example_idx in (4, 8, 13, 16):
    plot_example(Config.data_dir, example_idx, test_df)

&nbsp;

# Usual Model Training Process (without MLDE)

<small> <i> Source: https://github.com/MedMNIST/experiments/blob/main/MedMNIST2D/train_and_eval_pytorch.py </i> </small> <br/>
<small> Full Model Porting tutorial: https://www.youtube.com/watch?v=DHm8FdKN3x0 </small>

<img src="./img/02_mlde/06.png" alt="HP and Device Settings" width=800></img>

<img src="./img/02_mlde/07.png" alt="Metrics Settings" width=800></img>

<img src="./img/02_mlde/08.png" alt="Epoch loop" width=800></img>

## But wait, there's more:
- Hyperparameter Tuning
- Distributed Training
- Checkpoint Management
- Reproducibility
- Auditability

## Focusing on What Matters

<img src="./img/02_mlde/10.png" alt="MLDE BYOM" width=800></img>

<img src="./img/02_mlde/09.png" alt="Experiment Components" width=800></img>

### Inspect configuration file

In [None]:
!cat ./experiment/const.yaml

<br/>
<div style="font-size:20px;color:maroon;font-family: 'Courier New';font-weight:bold">
    Important: Edit the <i>const.yaml</i> file and review the values for:
    <ul><li>workspace</li>
        <li>project</li>
        <li>The MLDM config data in the 'pachyderm' block</li>
    </ul>
</div>

### Create a new Experiment using the const.yaml file

In [None]:
# Submit the experiment described by const.yaml; model_dir is the local
# folder passed along with the experiment (./experiment/).
exp = det.create_experiment(config="./experiment/const.yaml", model_dir="./experiment/")
print(f"started experiment {exp.id}")

### (optional) Wait for Experiment to complete and print exit status

In [None]:
# Block this cell until exp.wait() returns, then print the status it reported.
exit_status = exp.wait()
print(f"experiment completed with status {exit_status}")

### Get the best Checkpoint from the Experiment and print uuid

In [None]:
# Ask the experiment for its top checkpoint and keep its uuid for reference.
best_checkpoint = exp.top_checkpoint()
best_checkpoint_uuid = best_checkpoint.uuid
print(f"Best checkpoint was {best_checkpoint_uuid}")

In [None]:
# Keep the experiment id; the next cell looks the experiment up again with it.
experiment_id = exp.id

### Download the checkpoint and load it as new model

In [None]:
# Look the experiment up by id and take its top checkpoint.
checkpoint = det.get_experiment(experiment_id).top_checkpoint()
# download() returns the local path that is handed to the loader below.
path = checkpoint.download()
# Rebuild the trial from the checkpoint files and keep only its model.
mlde_model = pytorch.load_trial_from_checkpoint_path(path).model

### Generate a few predictions to test the new model

In [None]:
import torchvision

In [None]:
def plot_new_predictions(data_dir, model, idx, device, test_df):
    """Plot an image, its ground-truth mask and the model's predicted mask.

    Args:
        data_dir: Root folder of the dataset on local disk.
        model: Trained segmentation model (a ``torch.nn.Module``).
        idx: Positional row index into ``test_df`` of the sample to plot.
        device: Torch device to run inference on. The model and the input
            are moved to it; pass ``None`` to leave both where they are.
        test_df: DataFrame with 'directory', 'images' and 'masks' columns.

    The image is resized to fit a 256x256 square (aspect ratio preserved),
    padded to exactly 256x256, and fed through the model as a batch of one.
    """
    row = test_df.iloc[idx]
    base_path = data_dir + '/' + row['directory']
    img_path = os.path.join(base_path, row['images'])
    mask_path = os.path.join(base_path, row['masks'])

    size = 256
    shape = [1, size, size]

    img = Image.open(img_path)
    mask = Image.open(mask_path)

    # Scale so the longer side equals `size`, then pad the shorter side
    # symmetrically (any odd pixel goes to the right/bottom).
    scale = min(size / img.width, size / img.height)
    new_width, new_height = int(img.width * scale), int(img.height * scale)
    diff_width, diff_height = size - new_width, size - new_height
    preprocess = torchvision.transforms.Compose([
        torchvision.transforms.Resize(size=(new_height, new_width)),
        torchvision.transforms.Pad(
            padding=(
                diff_width // 2,
                diff_height // 2,
                diff_width // 2 + diff_width % 2,
                diff_height // 2 + diff_height % 2,
            )
        ),
        torchvision.transforms.ToTensor(),
    ])
    # Add the batch dimension expected by the model.
    x = preprocess(img).unsqueeze(0)

    # Keep model and input on the same device; otherwise a GPU-resident
    # model fails on a CPU input.
    if device is not None:
        model = model.to(device)
        x = x.to(device)

    # Inference only: switch to eval mode (so layers such as batch norm or
    # dropout behave deterministically) and skip gradient tracking. The
    # previous training flag is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            preds = model(x)
    finally:
        model.train(was_training)

    # Bring the first prediction back to the CPU as float32 for plotting.
    pred_output = preds[0].detach().cpu().float().reshape(shape)

    plot_images = {'Image': img,
                   'Mask': mask,
                   'Predicted Mask': pred_output.permute(1, 2, 0)}

    fig, axes = plt.subplots(1, 3, figsize=(16, 4))
    for ax, (title, picture) in zip(axes, plot_images.items()):
        ax.imshow(picture)
        ax.set_title(title)
    plt.show()

In [None]:
# Plot predictions of the single-trial model for a few test samples
# (same rows as before, one call per index instead of four copy-pasted lines).
for example_idx in (4, 8, 13, 16):
    plot_new_predictions(Config.data_dir, mlde_model, example_idx, Config.device, test_df)

&nbsp;

## Run a Distributed Training Experiment

### Inspect configuration file

In [None]:
!cat ./experiment/distributed.yaml

<br/>
<div style="font-size:20px;color:maroon;font-family: 'Courier New';font-weight:bold">
    Important: Edit the <i>distributed.yaml</i> file and set the values for:
    <ul><li>workspace</li>
        <li>project</li>
        <li>The MLDM config data in the 'pachyderm' block</li>
    </ul>
</div>

### Create a new Experiment using the distributed.yaml file

In [None]:
# Submit the experiment described by distributed.yaml; model_dir is the local
# folder passed along with the experiment (./experiment/).
# NOTE: this rebinds `exp`, replacing the handle of the previous experiment.
exp = det.create_experiment(config="./experiment/distributed.yaml", model_dir="./experiment/")
print(f"started experiment {exp.id}")

### (optional) Wait for Experiment to complete and print exit status

In [None]:
# Block this cell until exp.wait() returns, then print the status it reported.
exit_status = exp.wait()
print(f"experiment completed with status {exit_status}")

### Get the best Checkpoint from the Experiment and print uuid

In [None]:
# Ask the experiment for its top checkpoint and keep its uuid for reference.
best_checkpoint = exp.top_checkpoint()
best_checkpoint_uuid = best_checkpoint.uuid
print(f"Best checkpoint was {best_checkpoint_uuid}")

In [None]:
# Keep the experiment id; the next cell looks the experiment up again with it.
experiment_id = exp.id

### Download the checkpoint and load it as new model

In [None]:
# Look the experiment up by id and take its top checkpoint.
checkpoint = det.get_experiment(experiment_id).top_checkpoint()
# download() returns the local path that is handed to the loader below.
path = checkpoint.download()
# Rebuild the trial from the checkpoint files and keep only its model.
dist_model = pytorch.load_trial_from_checkpoint_path(path).model

### Generate a few predictions to test the new model

In [None]:
# Plot predictions of the distributed-training model for a few test samples
# (same rows as before, one call per index instead of four copy-pasted lines).
for example_idx in (4, 8, 13, 16):
    plot_new_predictions(Config.data_dir, dist_model, example_idx, Config.device, test_df)

## Run a Hyperparameter Search Training Experiment

### Inspect configuration file

In [None]:
!cat ./experiment/search.yaml

<br/>
<div style="font-size:20px;color:maroon;font-family: 'Courier New';font-weight:bold">
    Important: Edit the <i>search.yaml</i> file and set the values for:
    <ul><li>workspace</li>
        <li>project</li>
        <li>The MLDM config data in the 'pachyderm' block</li>
    </ul>
</div>

### Preview the Trial Plan before creating the experiment

In [None]:
!det preview-search ./experiment/search.yaml

### Create a new Experiment using the search.yaml file

In [None]:
# Submit the experiment described by search.yaml; model_dir is the local
# folder passed along with the experiment (./experiment/).
# NOTE: this rebinds `exp`, replacing the handle of the previous experiment.
exp = det.create_experiment(config="./experiment/search.yaml", model_dir="./experiment/")
print(f"started experiment {exp.id}")

### (optional) Wait for Experiment to complete and print exit status

In [None]:
# NOTE(review): the wait is disabled in this section, yet the next cell calls
# exp.top_checkpoint() right away. Presumably the search must have produced
# at least one checkpoint by then — uncomment to block until it finishes.
# TODO confirm.
#exit_status = exp.wait()
#print(f"experiment completed with status {exit_status}")

### Get the best Checkpoint from the Experiment and print uuid

In [None]:
# Ask the experiment for its top checkpoint and keep its uuid for reference.
# NOTE(review): the wait cell above is commented out, so this runs without
# waiting for the search — confirm that is intended.
best_checkpoint = exp.top_checkpoint()
best_checkpoint_uuid = best_checkpoint.uuid
print(f"Best checkpoint was {best_checkpoint_uuid}")

In [None]:
# Keep the experiment id; the next cell looks the experiment up again with it.
experiment_id = exp.id

### Download the checkpoint and load it as new model

In [None]:
# Look the experiment up by id and take its top checkpoint.
checkpoint = det.get_experiment(experiment_id).top_checkpoint()
# download() returns the local path that is handed to the loader below.
path = checkpoint.download()
# Rebuild the trial from the checkpoint files and keep only its model.
hp_model = pytorch.load_trial_from_checkpoint_path(path).model

### Generate a few predictions to test the new model

In [None]:
# Plot predictions of the hyperparameter-search model for a few test samples
# (same rows as before, one call per index instead of four copy-pasted lines).
for example_idx in (4, 8, 13, 16):
    plot_new_predictions(Config.data_dir, hp_model, example_idx, Config.device, test_df)

<h2> Part 3: Deploying Models to Production </h2>

<img src='img/platform_step03_deployment.png' width='1200'/>

### Retrieving Predictions from the Production Instance

In [None]:
import json
import base64
import requests
import uuid

In [None]:
# Serving settings from the [PDK_INFO] section of config.ini; they are used
# below to build the prediction URL (host, port and model name).
model_name = config_obj['PDK_INFO']['model_name']
ingress_host = config_obj['PDK_INFO']['ingress_host']
ingress_port = config_obj['PDK_INFO']['ingress_port']

In [None]:
# Set the service hostname to your deployment
# This value is sent as the HTTP "Host" header of the prediction request.
service_hostname = "brain-mri-ws-deploy.models.example.com"

In [None]:
# Function to calculate intersection over union of a prediction
def iou(pred, label):
    """Return the intersection over union (IoU) of ``pred`` and ``label``.

    Both inputs are array-like objects supporting ``*`` and ``.sum()``.
    When both sum to zero (nothing predicted, nothing labelled) the result
    is defined as 1.
    """
    pred_total, label_total = pred.sum(), label.sum()
    # Two empty masks agree perfectly; this also avoids dividing by zero.
    if pred_total == 0 and label_total == 0:
        return 1
    overlap = (pred * label).sum()
    return overlap / (pred_total + label_total - overlap)

# Function to create tensor for image and mask
# NOTE(review): this definition shadows the PairedToTensor imported from
# utils.model_utils at the top of the notebook.
def PairedToTensor(sample):
    """Turn an ``(image, mask)`` pair into two float tensors scaled by 1/255.

    The image's last axis is moved to the front; the mask gets a new
    trailing axis which is then moved to the front as well.
    """
    picture, annotation = sample
    # Last axis first for the image.
    picture_first = np.moveaxis(np.array(picture), -1, 0)
    # Append an axis to the mask, then move it to the front too.
    annotation_first = np.moveaxis(np.expand_dims(annotation, -1), -1, 0)
    return torch.FloatTensor(picture_first) / 255, torch.FloatTensor(annotation_first) / 255

In [None]:
# Load image and mask
# Build the paths from the configured dataset root instead of a hard-coded
# "data/data1" prefix, so this cell follows download_dir from config.ini.
sample_dir = os.path.join(Config.data_dir, "TCGA_CS_6290_20000917")
image = Image.open(os.path.join(sample_dir, "TCGA_CS_6290_20000917_10.tif"))
mask = Image.open(os.path.join(sample_dir, "TCGA_CS_6290_20000917_10_mask.tif"))

# Create tuple
sample = (image, mask)

# Create tensors from tuple
tensor_sample = PairedToTensor(sample)

# Create JSON payload for request
# Only the image tensor is sent; values are rounded to 4 decimals to keep
# the JSON body small.
data = np.array(tensor_sample[0])
data_shape = list(data.shape)
request = {
    "inputs": [{
        "name": str(uuid.uuid4()),
        "shape": data_shape,
        "datatype": "FP32",
        "data": np.round(data, 4).tolist()
    }]
}

# Show image that will be submitted
plt.figure(figsize=(7,7))
plt.title('Submitted Image: ')
plt.imshow(tensor_sample[0].permute(1, 2, 0))

### Create request for Prediction

In [None]:
# Prediction endpoint of the deployed model, reached through the ingress.
url = f"http://{ingress_host}:{ingress_port}/v1/models/{model_name}:predict"
# The Host header carries the hostname of the deployed service.
headers = {"Host": service_hostname, "Content-Type": "application/json"}
# Serialize the request dictionary to a JSON string.
payload = json.dumps(request)

In [None]:
# Submit request, extract prediction in JSON, transform to Tensor
# A timeout keeps the cell from hanging forever on an unreachable endpoint,
# and raise_for_status() surfaces HTTP errors directly instead of a KeyError
# on the missing "outputs" field.
response = requests.post(url, data=payload, headers=headers, timeout=300)
response.raise_for_status()
response_json = response.json()

# The prediction comes back as a flat list; restore the (1, 256, 256) mask shape.
shape = [1,256,256]
values = response_json["outputs"][0]["data"]
output = torch.Tensor(np.array(values).reshape(shape))

### Display ground truth and prediction, call IoU function and display IoU

In [None]:
# Ground-truth mask next to the predicted mask, followed by their IoU.
fig, axes = plt.subplots(1, 2, figsize=(15, 15))
mask_panels = [
    (tensor_sample[1], 'Mask (Ground Truth):'),
    (output, 'Mask (Prediction):'),
]
for ax, (mask_tensor, caption) in zip(axes, mask_panels):
    # Move the leading axis last, as imshow expects.
    ax.imshow(mask_tensor.permute(1, 2, 0), alpha=0.8)
    ax.title.set_text(caption)
print(f'Intersection over Union (IoU): {iou(output, tensor_sample[1])}')

In [None]:
# Display ground truth and prediction overlaid on the submitted image, then print the IoU
fig, axes = plt.subplots(1, 2, figsize=(15, 15))
overlay_panels = [
    (tensor_sample[1], 'Full Image (Ground Truth):'),
    (output, 'Full Image (Prediction):'),
]
for ax, (overlay, caption) in zip(axes, overlay_panels):
    # Draw the submitted image first, then the semi-transparent mask on top.
    ax.imshow(tensor_sample[0].permute(1, 2, 0))
    ax.imshow(overlay.permute(1, 2, 0), alpha=0.4)
    ax.title.set_text(caption)
print(f'Intersection over Union (IoU): {iou(output, tensor_sample[1])}')

<h2> Bringing It All Together </h2>
<img src='img/big_picture.png' width='1200'/>

# Congratulations! The Model Training lab is complete!