In [2]:
# using optimization to find the optimal mean and variance for normal initialization
from copy import deepcopy
from hydra import compose, initialize
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import mlflow
import optuna
import numpy as np
from tqdm import tqdm
from typing import Tuple
from omegaconf.omegaconf import OmegaConf
from torch.utils.data import DataLoader

from model import UCCDRNModel
from dataset import CamelyonDatasetSeparatedBin, CamelyonDataset
from utils import get_or_create_experiment, parse_experiment_runs_to_optuna_study
torch.autograd.set_detect_anomaly(True)

cfg_name = "train_camelyon_ucc_drn"
with initialize(version_base=None, config_path="../configs"):
    cfg = compose(config_name=cfg_name)
x = np.arange(-0.5,0.6,0.1)


In [3]:
def set_random_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    torch.backends.cudnn.deterministic = True

def init_model_and_optimizer(args, model_cfg, device):
    model = UCCDRNModel(model_cfg).to(device)
    optimizer = torch.optim.Adam(
        model.parameters(), lr=args.learning_rate)
    return model, optimizer

def load_model_and_optimizer(experiment_id, run_id):
    model = torch.load(f"mlruns/{experiment_id}/{run_id}/artifacts/best_model/data/model.pth", weights=False)
    optimizer = torch.load(f"mlruns/{experiment_id}/{run_id}/artifacts/optimizer.pt", weights=False)
    return model, optimizer

def init_dataloader(args):
    train_dataset_len = args.train_num_steps * args.batch_size
    train_dataset = CamelyonDataset(
        mode="train",
        patch_size=args.patch_size,
        num_instances=args.num_instances,
        dataset_len = 200000*args.batch_size
    )
    val_dataset_len = args.val_num_steps * args.batch_size
    val_dataset = CamelyonDataset(
        mode="val",
        patch_size=args.patch_size,
        num_instances=args.num_instances,
    )
    # create dataloader
    train_loader = DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        shuffle=True,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        shuffle=False,
    )
    return train_loader, val_loader

def evaluate(model, val_loader, device):
    model.eval()
    val_ae_loss_list = []
    val_ucc_loss_list = []
    val_acc_list = []
    with torch.no_grad():
        for batch_samples, batch_labels in val_loader:
            batch_samples = batch_samples.to(device)
            batch_labels = batch_labels.to(device)

            ucc_logits, reconstruction = model(batch_samples, return_reconstruction=True)

            ucc_loss = F.cross_entropy(ucc_logits, batch_labels)
            val_ucc_loss_list.append(ucc_loss.item())

            ae_loss = F.mse_loss(batch_samples, reconstruction)
            val_ae_loss_list.append(ae_loss.item())

            # acculate accuracy
            # _, batch_labels = torch.max(batch_labels, dim=1)
            
            _, ucc_predicts = torch.max(ucc_logits, dim=1)
            acc = torch.sum(ucc_predicts == batch_labels).item() / len(batch_labels)
            val_acc_list.append(acc)
    return {
                "eval_ae_loss": np.round(np.mean(val_ae_loss_list), 5),
                "eval_ucc_loss": np.round(np.mean(val_ucc_loss_list), 5),
                "eval_ucc_acc": np.round(np.mean(val_acc_list), 5)
            }

def train(args, model, optimizer, lr_scheduler, train_loader, val_loader, device, step=0):
    print("training")
    # mlflow.pytorch.log_model(model, "init_model")
    # output_dir = hydra.core.hydra_config.HydraConfig.get().runtime.output_dir

    model.train()
    best_eval_acc = 0
    if step == 0:
        mlflow.pytorch.log_model(
            model,
            artifact_path = "best_model"
        )
    for batch_samples, batch_labels in train_loader:
        batch_samples = batch_samples.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        ucc_logits, reconstruction = model(batch_samples, return_reconstruction=True)
        ucc_loss = F.cross_entropy(ucc_logits, batch_labels)
        ae_loss = F.mse_loss(batch_samples, reconstruction)
        loss = (1-model.alpha)*ucc_loss + model.alpha*ae_loss

        loss.backward()

        optimizer.step()

        step += 1

        if step%20 ==0:
            with torch.no_grad():
                metric_dict = {}
                grad_log = {name: torch.mean(param.grad).cpu().item(
                ) for name, param in model.named_parameters() if isinstance(param.grad, torch.Tensor)}
                if step == 20:
                    encoder_grad_log = [grad for name, grad in grad_log.items() if "encoder" in name and "weight" in name]
                    if max(encoder_grad_log)<1e-9:
                        break
                mlflow.log_metrics(grad_log, step=step)
                metric_dict["train_ae_loss"] = np.round(ae_loss.detach().item(), 5)
                _, pred = torch.max(ucc_logits, dim=1)
                accuracy = torch.sum(pred.flatten() == batch_labels.flatten())/len(batch_labels)
                metric_dict["train_ucc_loss"] = np.round(ucc_loss.detach().item(), 5)
                metric_dict["train_ucc_acc"] = np.round(float(accuracy), 5)
                metric_dict["loss"] = np.round(float(loss), 5)
                print(f"Step {step}:", metric_dict)
            mlflow.log_metrics(metric_dict, step=step)

        if step % args.save_interval == 0:
            eval_metric_dict = evaluate(
                model,
                val_loader,
                device)
            print(f"step: {step}," + ",".join([f"{key}: {value}"for key, value in eval_metric_dict.items()]))
            mlflow.log_metrics(eval_metric_dict, step=step)
            eval_acc = eval_metric_dict["eval_ucc_acc"]
            if eval_acc > best_eval_acc or eval_acc==1.0:
                best_eval_acc = eval_acc
                mlflow.log_metric("best_eval_acc", best_eval_acc)
                mlflow.pytorch.log_model(model, artifact_path="best_model")
                torch.save(optimizer, "optimizer.pt")
                mlflow.log_artifact("optimizer.pt")
            if step == 200000:
                break
            model.train()

    print("Training finished!!!")
    return best_eval_acc

In [None]:
mlflow.set_tracking_uri("file:///Users/tanguanyu/UCC-DRN-Pytorch/camelyon/mlruns")

run_name = "camelyon-ucc-drn-search-init"
experiment_id = get_or_create_experiment(experiment_name=run_name)
mlflow.set_experiment(experiment_id=experiment_id)
for lower_bound in x:
    for upper_bound in x:
        if lower_bound >= upper_bound:
            continue
        with mlflow.start_run(nested=True) as run:
            cfg.model.drn.init_lower_bound = float(lower_bound)
            cfg.model.drn.init_upper_bound = float(upper_bound)
            mlflow.log_params({
                "init_W_lower_bound": float(lower_bound),
                "init_W_upper_bound": float(upper_bound)
            })
            print(cfg.model.drn)
            cfg.args.learning_rate = 0.001
            mlflow.log_dict(dict(OmegaConf.to_object(cfg)), "config.yaml")
            args = cfg.args
            device = torch.device("cuda" if torch.cuda.is_available() else "mps")
            model, optimizer = init_model_and_optimizer(args, cfg, device)
            train_loader, val_loader = init_dataloader(args)
            artifact_path = run.info.artifact_uri
            mlflow.pytorch.log_model(
                    model,
                    artifact_path = "init_model")
            best_acc = train(args, model, optimizer, None,
                            train_loader, val_loader, device)

{'num_bins': 11, 'hidden_q': 100, 'num_layers': 1, 'num_nodes': 12, 'init_method': 'uniform', 'init_upper_bound': -0.4, 'init_lower_bound': -0.5, 'output_bins': 2}




training




Training finished!!!
{'num_bins': 11, 'hidden_q': 100, 'num_layers': 1, 'num_nodes': 12, 'init_method': 'uniform', 'init_upper_bound': -0.30000000000000004, 'init_lower_bound': -0.5, 'output_bins': 2}




training




Training finished!!!
{'num_bins': 11, 'hidden_q': 100, 'num_layers': 1, 'num_nodes': 12, 'init_method': 'uniform', 'init_upper_bound': -0.20000000000000007, 'init_lower_bound': -0.5, 'output_bins': 2}




training




Step 20: {'train_ae_loss': np.float64(1.00029), 'train_ucc_loss': np.float64(0.73202), 'train_ucc_acc': np.float64(0.375), 'loss': np.float64(0.86615)}
Step 40: {'train_ae_loss': np.float64(1.0001), 'train_ucc_loss': np.float64(0.69207), 'train_ucc_acc': np.float64(0.53125), 'loss': np.float64(0.84608)}
Step 60: {'train_ae_loss': np.float64(0.9998), 'train_ucc_loss': np.float64(0.7093), 'train_ucc_acc': np.float64(0.4375), 'loss': np.float64(0.85455)}
Step 80: {'train_ae_loss': np.float64(1.00302), 'train_ucc_loss': np.float64(0.707), 'train_ucc_acc': np.float64(0.4375), 'loss': np.float64(0.85501)}
Step 100: {'train_ae_loss': np.float64(0.97623), 'train_ucc_loss': np.float64(0.71442), 'train_ucc_acc': np.float64(0.375), 'loss': np.float64(0.84532)}
Step 120: {'train_ae_loss': np.float64(0.93069), 'train_ucc_loss': np.float64(0.71237), 'train_ucc_acc': np.float64(0.375), 'loss': np.float64(0.82153)}
Step 140: {'train_ae_loss': np.float64(0.85447), 'train_ucc_loss': np.float64(0.69128),



Step 1020: {'train_ae_loss': np.float64(0.78434), 'train_ucc_loss': np.float64(0.69361), 'train_ucc_acc': np.float64(0.46875), 'loss': np.float64(0.73898)}
Step 1040: {'train_ae_loss': np.float64(0.76507), 'train_ucc_loss': np.float64(0.69307), 'train_ucc_acc': np.float64(0.5), 'loss': np.float64(0.72907)}
Step 1060: {'train_ae_loss': np.float64(0.77669), 'train_ucc_loss': np.float64(0.69277), 'train_ucc_acc': np.float64(0.65625), 'loss': np.float64(0.73473)}
Step 1080: {'train_ae_loss': np.float64(0.75577), 'train_ucc_loss': np.float64(0.69287), 'train_ucc_acc': np.float64(0.59375), 'loss': np.float64(0.72432)}
Step 1100: {'train_ae_loss': np.float64(0.78665), 'train_ucc_loss': np.float64(0.69165), 'train_ucc_acc': np.float64(0.5625), 'loss': np.float64(0.73915)}
Step 1120: {'train_ae_loss': np.float64(0.80385), 'train_ucc_loss': np.float64(0.69486), 'train_ucc_acc': np.float64(0.40625), 'loss': np.float64(0.74935)}
Step 1140: {'train_ae_loss': np.float64(0.76778), 'train_ucc_loss': n



Step 2020: {'train_ae_loss': np.float64(0.75705), 'train_ucc_loss': np.float64(0.69247), 'train_ucc_acc': np.float64(0.53125), 'loss': np.float64(0.72476)}
Step 2040: {'train_ae_loss': np.float64(0.76287), 'train_ucc_loss': np.float64(0.69295), 'train_ucc_acc': np.float64(0.6875), 'loss': np.float64(0.72791)}
Step 2060: {'train_ae_loss': np.float64(0.75292), 'train_ucc_loss': np.float64(0.6936), 'train_ucc_acc': np.float64(0.4375), 'loss': np.float64(0.72326)}
Step 2080: {'train_ae_loss': np.float64(0.78882), 'train_ucc_loss': np.float64(0.69078), 'train_ucc_acc': np.float64(0.625), 'loss': np.float64(0.7398)}
Step 2100: {'train_ae_loss': np.float64(0.79082), 'train_ucc_loss': np.float64(0.6989), 'train_ucc_acc': np.float64(0.28125), 'loss': np.float64(0.74486)}
Step 2120: {'train_ae_loss': np.float64(0.73962), 'train_ucc_loss': np.float64(0.6909), 'train_ucc_acc': np.float64(0.625), 'loss': np.float64(0.71526)}
Step 2140: {'train_ae_loss': np.float64(0.77086), 'train_ucc_loss': np.flo



Step 9020: {'train_ae_loss': np.float64(0.68925), 'train_ucc_loss': np.float64(0.6801), 'train_ucc_acc': np.float64(0.5), 'loss': np.float64(0.68468)}
Step 9040: {'train_ae_loss': np.float64(0.70447), 'train_ucc_loss': np.float64(0.6476), 'train_ucc_acc': np.float64(0.65625), 'loss': np.float64(0.67603)}
Step 9060: {'train_ae_loss': np.float64(0.70601), 'train_ucc_loss': np.float64(0.64189), 'train_ucc_acc': np.float64(0.625), 'loss': np.float64(0.67395)}
Step 9080: {'train_ae_loss': np.float64(0.69714), 'train_ucc_loss': np.float64(0.74425), 'train_ucc_acc': np.float64(0.375), 'loss': np.float64(0.7207)}
Step 9100: {'train_ae_loss': np.float64(0.71511), 'train_ucc_loss': np.float64(0.63144), 'train_ucc_acc': np.float64(0.6875), 'loss': np.float64(0.67328)}
Step 9120: {'train_ae_loss': np.float64(0.70272), 'train_ucc_loss': np.float64(0.70008), 'train_ucc_acc': np.float64(0.5), 'loss': np.float64(0.7014)}
Step 9140: {'train_ae_loss': np.float64(0.70338), 'train_ucc_loss': np.float64(0.



Step 10020: {'train_ae_loss': np.float64(0.69853), 'train_ucc_loss': np.float64(0.61388), 'train_ucc_acc': np.float64(0.71875), 'loss': np.float64(0.6562)}
Step 10040: {'train_ae_loss': np.float64(0.67951), 'train_ucc_loss': np.float64(0.76755), 'train_ucc_acc': np.float64(0.40625), 'loss': np.float64(0.72353)}
Step 10060: {'train_ae_loss': np.float64(0.68975), 'train_ucc_loss': np.float64(0.71429), 'train_ucc_acc': np.float64(0.46875), 'loss': np.float64(0.70202)}
Step 10080: {'train_ae_loss': np.float64(0.68334), 'train_ucc_loss': np.float64(0.69724), 'train_ucc_acc': np.float64(0.53125), 'loss': np.float64(0.69029)}
Step 10100: {'train_ae_loss': np.float64(0.70287), 'train_ucc_loss': np.float64(0.64477), 'train_ucc_acc': np.float64(0.6875), 'loss': np.float64(0.67382)}
Step 10120: {'train_ae_loss': np.float64(0.69969), 'train_ucc_loss': np.float64(0.68153), 'train_ucc_acc': np.float64(0.5625), 'loss': np.float64(0.69061)}
Step 10140: {'train_ae_loss': np.float64(0.69604), 'train_ucc



Step 15020: {'train_ae_loss': np.float64(0.70253), 'train_ucc_loss': np.float64(0.62914), 'train_ucc_acc': np.float64(0.65625), 'loss': np.float64(0.66584)}
Step 15040: {'train_ae_loss': np.float64(0.68516), 'train_ucc_loss': np.float64(0.69021), 'train_ucc_acc': np.float64(0.59375), 'loss': np.float64(0.68768)}
Step 15060: {'train_ae_loss': np.float64(0.69925), 'train_ucc_loss': np.float64(0.63993), 'train_ucc_acc': np.float64(0.6875), 'loss': np.float64(0.66959)}
Step 15080: {'train_ae_loss': np.float64(0.70433), 'train_ucc_loss': np.float64(0.65919), 'train_ucc_acc': np.float64(0.59375), 'loss': np.float64(0.68176)}
Step 15100: {'train_ae_loss': np.float64(0.71064), 'train_ucc_loss': np.float64(0.6459), 'train_ucc_acc': np.float64(0.625), 'loss': np.float64(0.67827)}
Step 15120: {'train_ae_loss': np.float64(0.69243), 'train_ucc_loss': np.float64(0.61828), 'train_ucc_acc': np.float64(0.71875), 'loss': np.float64(0.65535)}
Step 15140: {'train_ae_loss': np.float64(0.67891), 'train_ucc_

{'args': {'dataset': 'camelyon', 'model_dir': 'saved_models/', 'model_name': 'camelyon_ucc_drn', 'num_instances': 32, 'ucc_start': 1, 'ucc_end': 4, 'batch_size': 5, 'num_samples_per_class': 5, 'num_workers': 4, 'learning_rate': 0.0001, 'num_bins': 11, 'num_features': 10, 'train_num_steps': 100000, 'val_num_steps': 200, 'save_interval': 1000, 'patch_size': 32, 'seed': 22}, 'model': {'kde_model': {'num_bins': 11, 'sigma': 0.1}, 'num_channels': 3, 'encoder': {'conv_input_channel': 3, 'conv_output_channel': 16, 'block1_output_channel': 32, 'block1_num_layer': 1, 'block2_output_channel': 64, 'block2_num_layer': 1, 'block3_output_channel': 128, 'block3_num_layer': 1, 'flatten_size': 8192, 'num_features': 16}, 'decoder': {'linear_size': 8192, 'reshape_size': [128, 8, 8], 'block1_output_channel': 128, 'block1_num_layer': 1, 'block2_output_channel': 64, 'block2_num_layer': 1, 'block3_output_channel': 32, 'block3_num_layer': 1, 'output_channel': 3}, 'drn': {'num_bins': 11, 'hidden_q': 100, 'num_layers': 2, 'num_nodes': 9, 'init_method': 'uniform', 'init_upper_bound': 0.5, 'init_lower_bound': -0.5, 'output_bins': 4}, 'ucc_classifier': 'None', 'loss': {'alpha': 0.5}}}

In [5]:
def load_model_and_optimizer(experiment_id, run_id):
    model = torch.load(f"mlruns/{experiment_id}/{run_id}/artifacts/best_model/data/model.pth", weights_only=False, map_location="mps")
    optimizer = torch.optim.Adam(lr=0.0012, params=model.parameters())
    # optimizer.load_state_dict(torch.load(f"mlruns/{experiment_id}/{run_id}/artifacts/optimizer.pt", weights_only=False, map_location="mps").state_dict())

    with open(f"mlruns/{experiment_id}/{run_id}/metrics/eval_ucc_acc") as file:
        lines = file.readlines()
        step = int(lines[-1].split(" ")[-1])
    return model, optimizer, step

def resume_training(run_id):
    mlflow.set_tracking_uri("mlruns")
    run_name = "camelyon-ucc-drn"
    experiment = mlflow.set_experiment(run_name)
    experiment_id = experiment.experiment_id
    cfg_name = "train_camelyon_ucc_drn"
    with initialize(version_base=None, config_path="../configs"):
        cfg = compose(config_name=cfg_name)

    args = cfg.args
    model, optimizer, step = load_model_and_optimizer(experiment_id, run_id)
    train_loader, val_loader = init_dataloader(args)
    device = torch.device("cuda" if torch.cuda.is_available() else "mps")
    print(optimizer)
    print(step)
    with mlflow.start_run(run_id=run_id, nested=True):
        mlflow.set_experiment(experiment_id=experiment_id)
        best_acc = train(args, model, optimizer, None,
                    train_loader, val_loader, device, step=step)

resume_training("67affc095c864c1ba34f32214201b08c")

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.0012
    maximize: False
    weight_decay: 0
)
201000
training
{'train_ae_loss': np.float64(1.00001), 'train_ucc_loss': np.float64(0.68627), 'train_ucc_acc': np.float64(0.71875), 'loss': np.float64(0.84314)}
{'train_ae_loss': np.float64(1.0), 'train_ucc_loss': np.float64(0.69135), 'train_ucc_acc': np.float64(0.5625), 'loss': np.float64(0.84568)}
{'train_ae_loss': np.float64(1.00002), 'train_ucc_loss': np.float64(0.69038), 'train_ucc_acc': np.float64(0.59375), 'loss': np.float64(0.8452)}
{'train_ae_loss': np.float64(0.99998), 'train_ucc_loss': np.float64(0.69322), 'train_ucc_acc': np.float64(0.5), 'loss': np.float64(0.8466)}
{'train_ae_loss': np.float64(1.0), 'train_ucc_loss': np.float64(0.69326), 'train_ucc_acc': np.float64(0.5), 'loss': np.float64(0.84663)}
{'train_ae_loss': np.float64(0.99999), 'train_ucc_loss':



OSError: [Errno 30] Read-only file system: '/content'

In [None]:
mlflow.set_tracking_uri("mlruns")
run_name = "camelyon-ucc-drn"
experiment = mlflow.set_experiment(run_name)
experiment_id = experiment.experiment_id
cfg_name = "train_camelyon_ucc_drn"

In [None]:
experiment_id

In [7]:
import os

prefix_to_replace = "/content/gdrive/MyDrive/UCCDRNPytorch/"
prefix_replacement = "/Users/tanguanyu/UCC-DRN-Pytorch/"

In [20]:
for root, dirs, files in  os.walk("mlruns/152105657986962541"):
    for d in dirs:
        if d=="models":
            for rt, ds, _ in os.walk(f"{root}/models"):
                for d_ in ds:
                    with open(f"{rt}/{d_}/meta.yaml", "r") as file:
                        string = file.read()
                        string = string.replace(prefix_to_replace, prefix_replacement)
                    with open(f"{rt}/{d_}/meta.yaml", "w") as file:
                        file.write(string)
                break
        else:
            with open(f"{root}/{d}/meta.yaml", "r") as file:
                string = file.read()
                string = string.replace(prefix_to_replace, prefix_replacement)
            with open(f"{root}/{d}/meta.yaml", "w") as file:
                file.write(string)
    break

In [14]:
string.replace(prefix_to_replace, prefix_replacement)

"artifact_uri: /Users/tanguanyu/UCC-DRN-Pytorch/camelyon/mlruns/152105657986962541/2601d759316f40c78dc0aa8a8f21b5ad/artifacts\nend_time: 1749875771310\nentry_point_name: ''\nexperiment_id: '152105657986962541'\nlifecycle_stage: active\nrun_id: 2601d759316f40c78dc0aa8a8f21b5ad\nrun_name: funny-stork-278\nsource_name: ''\nsource_type: 4\nsource_version: ''\nstart_time: 1749875771065\nstatus: 4\ntags: []\nuser_id: root\n"

In [15]:
with open("test.yaml", "w") as file:
    file.write(string)

In [16]:
print(os.getcwd())

/Users/tanguanyu/UCC-DRN-Pytorch/camelyon


In [30]:
import yaml
for root, dirs, files in  os.walk("mlruns/152105657986962541"):
    for d in dirs:
        if d=="models":
            pass
        else:
            with open(f"{root}/{d}/meta.yaml", "r") as file:
                obj = yaml.safe_load(file)
            if "run_uuid" not in obj:
                obj["run_uuid"] = obj["run_id"]
                with open(f"{root}/{d}/meta.yaml", "w") as file:
                    file.write(yaml.safe_dump(obj))
    break

In [29]:
yaml.safe_dump(obj)

"artifact_uri: /Users/tanguanyu/UCC-DRN-Pytorch/camelyon/mlruns/152105657986962541/2601d759316f40c78dc0aa8a8f21b5ad/artifacts\nend_time: 1749875771310\nentry_point_name: ''\nexperiment_id: '152105657986962541'\nlifecycle_stage: active\nrun_id: 2601d759316f40c78dc0aa8a8f21b5ad\nrun_name: funny-stork-278\nsource_name: ''\nsource_type: 4\nsource_version: ''\nstart_time: 1749875771065\nstatus: 4\ntags: []\nuser_id: root\n"

In [36]:
filess = []
for root, dirs, files in  os.walk("mlruns/152105657986962541"):
    for d in dirs:
        if d=="models":
            pass
        else:
            loss_file_path = f"{root}/{d}/metrics/loss"
            if os.path.exists(loss_file_path):
                with open(loss_file_path, "r") as file:
                    string = file.read()
                if len(string)==0:
                    filess.append(f"{root}/{d}")
            else:
                filess.append(f"{root}/{d}")
    break

In [39]:
import shutil
for f in filess:
    shutil.rmtree(f)