In [1]:
# using optimization to find the optimal mean and variance for normal initialization
from copy import deepcopy
from hydra import compose, initialize
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import mlflow
import optuna
import numpy as np
from tqdm import tqdm
from typing import Tuple
from omegaconf.omegaconf import OmegaConf
from torch.utils.data import DataLoader

from model import UCCDRNModel
from dataset import CamelyonDatasetSeparatedBin, CamelyonDataset
from utils import get_or_create_experiment, parse_experiment_runs_to_optuna_study
# NOTE(review): anomaly detection is a debugging aid; PyTorch documents that it
# slows execution, so consider turning it off for long training runs.
torch.autograd.set_detect_anomaly(True)

# Compose the Hydra config once; later cells read `cfg.args` and `cfg.model`.
cfg_name = "train_camelyon_ucc_drn"
with initialize(version_base=None, config_path="../configs"):
    cfg = compose(config_name=cfg_name)


In [2]:
def set_random_seed(seed):
    """Seed every random number generator this notebook may draw from.

    Args:
        seed: Integer seed applied to Python's ``random``, NumPy and PyTorch
            (CPU and all CUDA devices).
    """
    import random  # local import: the notebook's import cell does not provide it

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every CUDA device, not only the current one; ignored without CUDA.
    torch.cuda.manual_seed_all(seed)
    # Deterministic kernels are only honoured if the cuDNN autotuner, which may
    # pick a different algorithm per run, is switched off as well.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def init_model_and_optimizer(args, model_cfg, device):
    """Create a fresh UCC-DRN model on ``device`` together with its Adam optimizer.

    Args:
        args: Config node; only ``args.learning_rate`` is read here.
        model_cfg: Configuration object handed to ``UCCDRNModel``.
        device: Target device for the model parameters.

    Returns:
        Tuple ``(model, optimizer)``.
    """
    network = UCCDRNModel(model_cfg)
    network = network.to(device)
    adam = torch.optim.Adam(network.parameters(), lr=args.learning_rate)
    return network, adam

def load_model_and_optimizer(experiment_id, run_id):
    """Load the pickled best model and optimizer stored under an MLflow run.

    Args:
        experiment_id: MLflow experiment id (directory name under ``mlruns``).
        run_id: MLflow run id (directory name under the experiment).

    Returns:
        Tuple ``(model, optimizer)`` exactly as they were pickled.
    """
    artifact_dir = f"mlruns/{experiment_id}/{run_id}/artifacts"
    # The keyword is `weights_only`; the former `weights=False` was forwarded to
    # the unpickler and made the load fail.
    # SECURITY: weights_only=False unpickles arbitrary objects, so only load
    # checkpoints produced by this project.
    model = torch.load(f"{artifact_dir}/best_model/data/model.pth", weights_only=False)
    optimizer = torch.load(f"{artifact_dir}/optimizer.pt", weights_only=False)
    return model, optimizer

def init_dataloader(args):
    """Build the training and validation dataloaders over ``CamelyonDataset``.

    Args:
        args: Config node providing ``batch_size``, ``patch_size``,
            ``num_instances`` and ``num_workers``.

    Returns:
        Tuple ``(train_loader, val_loader)``; only the training loader shuffles.
    """
    # NOTE(review): the training length is pinned to 200000 batches here and
    # does not follow args.train_num_steps -- confirm the override is intended.
    train_num_batches = 200000
    train_dataset = CamelyonDataset(
        mode="train",
        patch_size=args.patch_size,
        num_instances=args.num_instances,
        dataset_len=train_num_batches * args.batch_size,
    )
    # No dataset_len is passed for validation, so the dataset's own default applies.
    val_dataset = CamelyonDataset(
        mode="val",
        patch_size=args.patch_size,
        num_instances=args.num_instances,
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        shuffle=True,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        shuffle=False,
    )
    return train_loader, val_loader

def evaluate(model, val_loader, device):
    """Run one pass over ``val_loader`` and report averaged validation metrics.

    The model is switched to eval mode and left in that mode. Every metric is
    the plain mean of the per-batch values, rounded to five decimals.

    Args:
        model: Callable as ``model(x, return_reconstruction=True)`` returning
            ``(logits, reconstruction)``.
        val_loader: Iterable of ``(samples, labels)`` batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Dict with keys ``eval_ae_loss``, ``eval_ucc_loss`` and ``eval_ucc_acc``.
    """
    model.eval()
    per_batch = {"eval_ae_loss": [], "eval_ucc_loss": [], "eval_ucc_acc": []}
    with torch.no_grad():
        for samples, labels in val_loader:
            samples, labels = samples.to(device), labels.to(device)
            logits, recon = model(samples, return_reconstruction=True)

            per_batch["eval_ucc_loss"].append(F.cross_entropy(logits, labels).item())
            per_batch["eval_ae_loss"].append(F.mse_loss(samples, recon).item())

            # Accuracy: share of samples whose highest-scoring class equals the label.
            predicted = logits.max(dim=1).indices
            hits = (predicted == labels).sum().item()
            per_batch["eval_ucc_acc"].append(hits / len(labels))
    return {name: np.round(np.mean(values), 5) for name, values in per_batch.items()}

def train(args, model, optimizer, lr_scheduler, train_loader, val_loader, device, step=0,
          max_steps=300000):
    """Train ``model`` on ``train_loader`` and checkpoint the best model to MLflow.

    Must be called inside an active MLflow run. Training metrics and mean
    parameter gradients are logged every 20 steps; every ``args.save_interval``
    steps the model is evaluated and, when the accuracy improves (or equals
    1.0), the model and optimizer are logged as run artifacts.

    Args:
        args: Config node; ``args.save_interval`` is read here.
        model: Network exposing ``alpha`` and callable as
            ``model(x, return_reconstruction=True)``.
        optimizer: Optimizer stepping the model parameters.
        lr_scheduler: Accepted for interface compatibility; currently unused.
        train_loader: Iterable of ``(samples, labels)`` training batches.
        val_loader: Iterable of validation batches passed to ``evaluate``.
        device: Device the batches are moved to.
        step: Global step to start counting from (non-zero when resuming).
        max_steps: Training stops at the first evaluation whose step is at
            least this value.

    Returns:
        Best validation accuracy seen during this call. The tracker starts at 0
        on every call, so a resumed run re-saves at its first evaluation.
    """
    print("training")

    model.train()
    best_eval_acc = 0
    if step == 0:
        mlflow.pytorch.log_model(model, artifact_path="best_model")

    for batch_samples, batch_labels in train_loader:
        batch_samples = batch_samples.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        ucc_logits, reconstruction = model(batch_samples, return_reconstruction=True)
        ucc_loss = F.cross_entropy(ucc_logits, batch_labels)
        ae_loss = F.mse_loss(batch_samples, reconstruction)
        # model.alpha weights the reconstruction term against the classification term.
        loss = (1 - model.alpha) * ucc_loss + model.alpha * ae_loss

        loss.backward()
        optimizer.step()
        step += 1

        if step % 20 == 0:
            with torch.no_grad():
                grad_log = {
                    name: torch.mean(param.grad).cpu().item()
                    for name, param in model.named_parameters()
                    if isinstance(param.grad, torch.Tensor)
                }
                mlflow.log_metrics(grad_log, step=step)

                _, pred = torch.max(ucc_logits, dim=1)
                accuracy = torch.sum(pred.flatten() == batch_labels.flatten()) / len(batch_labels)
                metric_dict = {
                    "train_ae_loss": np.round(ae_loss.detach().item(), 5),
                    "train_ucc_loss": np.round(ucc_loss.detach().item(), 5),
                    "train_ucc_acc": np.round(float(accuracy), 5),
                    "loss": np.round(float(loss), 5),
                }
                print(metric_dict)
            mlflow.log_metrics(metric_dict, step=step)

        if step % args.save_interval == 0:
            eval_metric_dict = evaluate(model, val_loader, device)
            print(f"step: {step}," + ",".join([f"{key}: {value}"for key, value in eval_metric_dict.items()]))
            mlflow.log_metrics(eval_metric_dict, step=step)
            eval_acc = eval_metric_dict["eval_ucc_acc"]
            if eval_acc > best_eval_acc or eval_acc == 1.0:
                best_eval_acc = eval_acc
                mlflow.log_metric("best_eval_acc", best_eval_acc, step=step)
                # Log under the run-relative "best_model" path (the same one used
                # for the initial model) so the checkpoint loaders can find it.
                mlflow.pytorch.log_model(model, artifact_path="best_model")
                torch.save(optimizer, "optimizer.pt")
                # The second argument of log_artifact is a *directory*; omitting
                # it stores the file as artifacts/optimizer.pt.
                mlflow.log_artifact("optimizer.pt")
            # `>=` so a resumed run that starts beyond the limit still stops.
            if step >= max_steps:
                break
            # evaluate() leaves the model in eval mode; switch back.
            model.train()

    print("Training finished!!!")
    return best_eval_acc

In [3]:
# NOTE(review): absolute, machine-specific tracking URI; later cells use the
# relative "mlruns" store instead -- consider unifying them.
mlflow.set_tracking_uri("file:///Users/tanguanyu/UCC-DRN-Pytorch/camelyon/mlruns")
run_name = "camelyon-ucc-drn-split-bin"
experiment_id = get_or_create_experiment(experiment_name=run_name)
mlflow.set_experiment(experiment_id=experiment_id)

with mlflow.start_run(nested=True) as run:
    print(cfg.model.drn)
    # Overrides the learning rate from the composed config for this run.
    cfg.args.learning_rate = 0.001
    mlflow.log_dict(dict(OmegaConf.to_object(cfg)), "config.yaml")
    args = cfg.args

    # Prefer CUDA, then Apple MPS, and fall back to CPU instead of assuming
    # that MPS exists whenever CUDA does not.
    if torch.cuda.is_available():
        device = torch.device("cuda")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
    else:
        device = torch.device("cpu")

    model, optimizer = init_model_and_optimizer(args, cfg, device)
    train_loader, val_loader = init_dataloader(args)
    # Snapshot of the untrained model (previously logged twice in a row).
    mlflow.pytorch.log_model(model, artifact_path="init_model")
    best_acc = train(args, model, optimizer, None,
                    train_loader, val_loader, device)

{'num_bins': 11, 'hidden_q': 100, 'num_layers': 1, 'num_nodes': 12, 'init_method': 'uniform', 'init_upper_bound': 0.5, 'init_lower_bound': -0.5, 'output_bins': 2}




training




{'train_ae_loss': np.float64(1.00031), 'train_ucc_loss': np.float64(0.70792), 'train_ucc_acc': np.float64(0.4375), 'loss': np.float64(0.85412)}
{'train_ae_loss': np.float64(1.00008), 'train_ucc_loss': np.float64(0.69598), 'train_ucc_acc': np.float64(0.5), 'loss': np.float64(0.84803)}
{'train_ae_loss': np.float64(1.0001), 'train_ucc_loss': np.float64(0.7125), 'train_ucc_acc': np.float64(0.375), 'loss': np.float64(0.8563)}
{'train_ae_loss': np.float64(1.00005), 'train_ucc_loss': np.float64(0.68533), 'train_ucc_acc': np.float64(0.59375), 'loss': np.float64(0.84269)}
{'train_ae_loss': np.float64(1.00003), 'train_ucc_loss': np.float64(0.68656), 'train_ucc_acc': np.float64(0.59375), 'loss': np.float64(0.84329)}
{'train_ae_loss': np.float64(1.00005), 'train_ucc_loss': np.float64(0.70294), 'train_ucc_acc': np.float64(0.375), 'loss': np.float64(0.8515)}
{'train_ae_loss': np.float64(1.00004), 'train_ucc_loss': np.float64(0.70943), 'train_ucc_acc': np.float64(0.28125), 'loss': np.float64(0.85474)



{'train_ae_loss': np.float64(0.99998), 'train_ucc_loss': np.float64(0.69451), 'train_ucc_acc': np.float64(0.40625), 'loss': np.float64(0.84724)}
{'train_ae_loss': np.float64(0.99995), 'train_ucc_loss': np.float64(0.69565), 'train_ucc_acc': np.float64(0.375), 'loss': np.float64(0.8478)}
{'train_ae_loss': np.float64(0.99999), 'train_ucc_loss': np.float64(0.69451), 'train_ucc_acc': np.float64(0.46875), 'loss': np.float64(0.84725)}
{'train_ae_loss': np.float64(1.0), 'train_ucc_loss': np.float64(0.69196), 'train_ucc_acc': np.float64(0.53125), 'loss': np.float64(0.84598)}
{'train_ae_loss': np.float64(0.99999), 'train_ucc_loss': np.float64(0.69983), 'train_ucc_acc': np.float64(0.3125), 'loss': np.float64(0.84991)}
{'train_ae_loss': np.float64(1.00001), 'train_ucc_loss': np.float64(0.69723), 'train_ucc_acc': np.float64(0.375), 'loss': np.float64(0.84862)}
{'train_ae_loss': np.float64(1.00003), 'train_ucc_loss': np.float64(0.7023), 'train_ucc_acc': np.float64(0.28125), 'loss': np.float64(0.8511

KeyboardInterrupt: 

{'args': {'dataset': 'camelyon', 'model_dir': 'saved_models/', 'model_name': 'camelyon_ucc_drn', 'num_instances': 32, 'ucc_start': 1, 'ucc_end': 4, 'batch_size': 5, 'num_samples_per_class': 5, 'num_workers': 4, 'learning_rate': 0.0001, 'num_bins': 11, 'num_features': 10, 'train_num_steps': 100000, 'val_num_steps': 200, 'save_interval': 1000, 'patch_size': 32, 'seed': 22}, 'model': {'kde_model': {'num_bins': 11, 'sigma': 0.1}, 'num_channels': 3, 'encoder': {'conv_input_channel': 3, 'conv_output_channel': 16, 'block1_output_channel': 32, 'block1_num_layer': 1, 'block2_output_channel': 64, 'block2_num_layer': 1, 'block3_output_channel': 128, 'block3_num_layer': 1, 'flatten_size': 8192, 'num_features': 16}, 'decoder': {'linear_size': 8192, 'reshape_size': [128, 8, 8], 'block1_output_channel': 128, 'block1_num_layer': 1, 'block2_output_channel': 64, 'block2_num_layer': 1, 'block3_output_channel': 32, 'block3_num_layer': 1, 'output_channel': 3}, 'drn': {'num_bins': 11, 'hidden_q': 100, 'num_layers': 2, 'num_nodes': 9, 'init_method': 'uniform', 'init_upper_bound': 0.5, 'init_lower_bound': -0.5, 'output_bins': 4}, 'ucc_classifier': 'None', 'loss': {'alpha': 0.5}}}

In [5]:
def load_model_and_optimizer(experiment_id, run_id, device=None, learning_rate=0.0012):
    """Load a run's best model, build a fresh Adam optimizer and find the resume step.

    The saved optimizer state is not restored; a new optimizer is created.

    Args:
        experiment_id: MLflow experiment id (directory name under ``mlruns``).
        run_id: MLflow run id (directory name under the experiment).
        device: ``map_location`` for the checkpoint. ``None`` picks CUDA, then
            MPS, then CPU, whichever is available first.
        learning_rate: Learning rate of the newly created optimizer.

    Returns:
        Tuple ``(model, optimizer, step)`` where ``step`` is the last
        whitespace-separated field on the final line of the run's
        ``metrics/eval_ucc_acc`` file.

    Raises:
        ValueError: If the metrics file contains no records.
    """
    if device is None:
        # A fixed "mps" target fails on hosts without Apple MPS.
        if torch.cuda.is_available():
            device = "cuda"
        elif torch.backends.mps.is_available():
            device = "mps"
        else:
            device = "cpu"

    run_dir = f"mlruns/{experiment_id}/{run_id}"
    # SECURITY: weights_only=False unpickles arbitrary objects, so only load
    # checkpoints produced by this project.
    model = torch.load(
        f"{run_dir}/artifacts/best_model/data/model.pth",
        weights_only=False,
        map_location=device,
    )
    optimizer = torch.optim.Adam(lr=learning_rate, params=model.parameters())

    metric_path = f"{run_dir}/metrics/eval_ucc_acc"
    with open(metric_path) as file:
        records = [line.split() for line in file if line.strip()]
    if not records:
        raise ValueError(f"no evaluation records found in {metric_path}")
    step = int(records[-1][-1])
    return model, optimizer, step

def resume_training(run_id):
    """Continue training an existing MLflow run from its saved best model.

    Uses the relative ``mlruns`` store, the ``camelyon-ucc-drn`` experiment and
    the ``train_camelyon_ucc_drn`` Hydra config, all fixed inside the function.

    Args:
        run_id: Id of the MLflow run to reopen and continue.

    Returns:
        Best validation accuracy reported by ``train`` for the resumed segment.
    """
    mlflow.set_tracking_uri("mlruns")
    run_name = "camelyon-ucc-drn"
    experiment = mlflow.set_experiment(run_name)
    experiment_id = experiment.experiment_id
    cfg_name = "train_camelyon_ucc_drn"
    with initialize(version_base=None, config_path="../configs"):
        cfg = compose(config_name=cfg_name)

    args = cfg.args
    model, optimizer, step = load_model_and_optimizer(experiment_id, run_id)
    train_loader, val_loader = init_dataloader(args)
    # Send batches to the device the checkpoint was actually loaded onto, so
    # model and data cannot end up on different devices.
    device = next(model.parameters()).device
    print(optimizer)
    print(step)
    # The experiment is already active from set_experiment above; it does not
    # need to be set again inside the run.
    with mlflow.start_run(run_id=run_id, nested=True):
        best_acc = train(args, model, optimizer, None,
                    train_loader, val_loader, device, step=step)
    return best_acc

# Reopen this MLflow run and continue training from its last logged evaluation step.
resume_training("67affc095c864c1ba34f32214201b08c")

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.0012
    maximize: False
    weight_decay: 0
)
201000
training
{'train_ae_loss': np.float64(1.00001), 'train_ucc_loss': np.float64(0.68627), 'train_ucc_acc': np.float64(0.71875), 'loss': np.float64(0.84314)}
{'train_ae_loss': np.float64(1.0), 'train_ucc_loss': np.float64(0.69135), 'train_ucc_acc': np.float64(0.5625), 'loss': np.float64(0.84568)}
{'train_ae_loss': np.float64(1.00002), 'train_ucc_loss': np.float64(0.69038), 'train_ucc_acc': np.float64(0.59375), 'loss': np.float64(0.8452)}
{'train_ae_loss': np.float64(0.99998), 'train_ucc_loss': np.float64(0.69322), 'train_ucc_acc': np.float64(0.5), 'loss': np.float64(0.8466)}
{'train_ae_loss': np.float64(1.0), 'train_ucc_loss': np.float64(0.69326), 'train_ucc_acc': np.float64(0.5), 'loss': np.float64(0.84663)}
{'train_ae_loss': np.float64(0.99999), 'train_ucc_loss':



OSError: [Errno 30] Read-only file system: '/content'

In [None]:
# Point MLflow at the relative local store and activate the experiment by name;
# `experiment_id` is kept as a module-level value for inspection below.
mlflow.set_tracking_uri("mlruns")
run_name = "camelyon-ucc-drn"
experiment = mlflow.set_experiment(run_name)
experiment_id = experiment.experiment_id
cfg_name = "train_camelyon_ucc_drn"

In [None]:
# Display the resolved experiment id.
experiment_id

In [7]:
import os

# Path prefix stored in the run metadata (presumably the Google Drive mount the
# runs were logged under -- confirm) and the local prefix that replaces it.
prefix_to_replace = "/content/gdrive/MyDrive/UCCDRNPytorch/"
prefix_replacement = "/Users/tanguanyu/UCC-DRN-Pytorch/"

In [20]:
def _rewrite_meta_prefix(meta_path):
    """Replace the stale path prefix inside one ``meta.yaml`` file.

    Returns the resulting file content, or ``None`` when ``meta_path`` does not
    exist. The file is rewritten only when the content actually changes.
    """
    if not os.path.isfile(meta_path):
        return None
    with open(meta_path, "r") as file:
        content = file.read()
    updated = content.replace(prefix_to_replace, prefix_replacement)
    if updated != content:
        with open(meta_path, "w") as file:
            file.write(updated)
    return updated


# Only the direct children of the experiment directory are visited (the walk
# is stopped after its first level).
for root, dirs, files in os.walk("mlruns/152105657986962541"):
    for d in dirs:
        if d == "models":
            # Logged models keep one meta.yaml per sub-directory of models/.
            model_root = f"{root}/models"
            model_dirs = next(os.walk(model_root), (model_root, [], []))[1]
            meta_paths = [f"{model_root}/{name}/meta.yaml" for name in model_dirs]
        else:
            meta_paths = [f"{root}/{d}/meta.yaml"]
        for meta_path in meta_paths:
            rewritten = _rewrite_meta_prefix(meta_path)
            if rewritten is not None:
                # `string` stays a module-level name: the following cells
                # display it and write it out.
                string = rewritten
    break

In [14]:
# Preview the prefix substitution on the current contents of `string`.
string.replace(prefix_to_replace, prefix_replacement)

"artifact_uri: /Users/tanguanyu/UCC-DRN-Pytorch/camelyon/mlruns/152105657986962541/2601d759316f40c78dc0aa8a8f21b5ad/artifacts\nend_time: 1749875771310\nentry_point_name: ''\nexperiment_id: '152105657986962541'\nlifecycle_stage: active\nrun_id: 2601d759316f40c78dc0aa8a8f21b5ad\nrun_name: funny-stork-278\nsource_name: ''\nsource_type: 4\nsource_version: ''\nstart_time: 1749875771065\nstatus: 4\ntags: []\nuser_id: root\n"

In [15]:
# Scratch check: dump the current `string` to test.yaml in the working directory.
with open("test.yaml", "w") as file:
    file.write(string)

In [16]:
# Show the working directory that the relative "mlruns/..." paths resolve against.
print(os.getcwd())

/Users/tanguanyu/UCC-DRN-Pytorch/camelyon


In [30]:
import yaml
# Back-fill a `run_uuid` key (copied from `run_id`) in each run's meta.yaml.
# Only direct children of the experiment directory are visited.
for root, dirs, files in os.walk("mlruns/152105657986962541"):
    for d in dirs:
        if d == "models":
            continue
        meta_path = f"{root}/{d}/meta.yaml"
        # Skip directories that carry no run metadata instead of crashing.
        if not os.path.isfile(meta_path):
            continue
        with open(meta_path, "r") as file:
            obj = yaml.safe_load(file)
        # Nothing to do for empty/non-mapping files, files that already have
        # the key, or files without a run_id to copy from.
        if not isinstance(obj, dict) or "run_uuid" in obj or "run_id" not in obj:
            continue
        obj["run_uuid"] = obj["run_id"]
        # Serialise before opening for write so a dump failure cannot leave a
        # truncated meta.yaml behind.
        serialized = yaml.safe_dump(obj)
        with open(meta_path, "w") as file:
            file.write(serialized)
    break

In [29]:
# Display the YAML serialisation of the last `obj` loaded above.
yaml.safe_dump(obj)

"artifact_uri: /Users/tanguanyu/UCC-DRN-Pytorch/camelyon/mlruns/152105657986962541/2601d759316f40c78dc0aa8a8f21b5ad/artifacts\nend_time: 1749875771310\nentry_point_name: ''\nexperiment_id: '152105657986962541'\nlifecycle_stage: active\nrun_id: 2601d759316f40c78dc0aa8a8f21b5ad\nrun_name: funny-stork-278\nsource_name: ''\nsource_type: 4\nsource_version: ''\nstart_time: 1749875771065\nstatus: 4\ntags: []\nuser_id: root\n"

In [36]:
# Collect run directories that never logged a `loss` metric (file missing or
# empty). Only direct children of the experiment directory are inspected.
filess = []
for root, dirs, files in os.walk("mlruns/152105657986962541"):
    for d in dirs:
        if d == "models":
            continue
        run_dir = f"{root}/{d}"
        # The list is later handed to shutil.rmtree, so restrict it to real run
        # directories (those with a meta.yaml); any other folder is left alone.
        if not os.path.isfile(f"{run_dir}/meta.yaml"):
            continue
        loss_file_path = f"{run_dir}/metrics/loss"
        if not os.path.exists(loss_file_path) or os.path.getsize(loss_file_path) == 0:
            filess.append(run_dir)
    break

In [39]:
import shutil
# Permanently delete the collected run directories (irreversible).
for f in filess:
    # Skip entries that are already gone so the cell can be re-run safely.
    if os.path.isdir(f):
        shutil.rmtree(f)