In [2]:
# Grid search over the lower/upper bounds used to initialise the DRN weights
# (cfg.model.drn.init_lower_bound / cfg.model.drn.init_upper_bound).
from copy import deepcopy
from hydra import compose, initialize
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import mlflow
import numpy as np
from tqdm import tqdm
from typing import Tuple
from omegaconf.omegaconf import OmegaConf
from torch.utils.data import DataLoader

from model import UCCDRNModel
from dataset import CamelyonDatasetSeparatedBin, CamelyonDataset
from utils import get_or_create_experiment, parse_experiment_runs_to_optuna_study
# Enable autograd anomaly detection globally for every backward pass in this notebook.
# NOTE(review): PyTorch documents this as a debugging mode that slows execution;
# consider turning it off for long sweeps.
torch.autograd.set_detect_anomaly(True)

# Compose the Hydra config named below from ../configs (relative to this notebook).
cfg_name = "train_camelyon_ucc_drn"
with initialize(version_base=None, config_path="../configs"):
    cfg = compose(config_name=cfg_name)
# Candidate values for the initialisation bounds: -0.25 up to 0.30 in steps of 0.05.
# The float step leaves tiny rounding errors (the displayed array holds -5.55e-17
# where 0 is expected); the sweep rounds each value to 5 decimals before use.
x = np.arange(-0.25,0.35,0.05)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Display the candidate bound values that the sweep iterates over.
x

array([-2.50000000e-01, -2.00000000e-01, -1.50000000e-01, -1.00000000e-01,
       -5.00000000e-02, -5.55111512e-17,  5.00000000e-02,  1.00000000e-01,
        1.50000000e-01,  2.00000000e-01,  2.50000000e-01,  3.00000000e-01])

In [3]:
def set_random_seed(seed):
    """Seed the torch (CPU and CUDA) and NumPy random generators with ``seed``.

    Also switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed passed unchanged to every generator.
    """
    # Same order as before: torch CPU, torch CUDA, then NumPy.
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, np.random.seed):
        seed_fn(seed)
    torch.backends.cudnn.deterministic = True

def init_model_and_optimizer(args, model_cfg, device):
    """Create a fresh model on ``device`` together with its Adam optimizer.

    Args:
        args: Object exposing ``learning_rate`` (used as the Adam ``lr``).
        model_cfg: Configuration handed to the ``UCCDRNModel`` constructor.
        device: Target device for the model.

    Returns:
        Tuple ``(model, optimizer)``.
    """
    net = UCCDRNModel(model_cfg)
    net = net.to(device)
    adam = torch.optim.Adam(net.parameters(), lr=args.learning_rate)
    return net, adam

def load_model_and_optimizer(experiment_id, run_id):
    """Load the pickled best model and optimizer of an MLflow run from ``mlruns/``.

    NOTE: a later cell in this notebook redefines ``load_model_and_optimizer``
    with a three-value return; once that cell has run, it shadows this one.

    Args:
        experiment_id: MLflow experiment ID (directory name under ``mlruns/``).
        run_id: MLflow run ID (directory name under the experiment).

    Returns:
        Tuple ``(model, optimizer)`` exactly as they were saved.
    """
    artifact_dir = f"mlruns/{experiment_id}/{run_id}/artifacts"
    # Fix: the keyword is ``weights_only`` (``weights`` is not a torch.load argument).
    # weights_only=False unpickles arbitrary Python objects, so only point this
    # at run directories you trust.
    model = torch.load(f"{artifact_dir}/best_model/data/model.pth", weights_only=False)
    optimizer = torch.load(f"{artifact_dir}/optimizer.pt", weights_only=False)
    return model, optimizer

def init_dataloader(args):
    """Build the training and validation ``DataLoader`` objects.

    Args:
        args: Object exposing ``train_num_steps``, ``val_num_steps``,
            ``batch_size``, ``patch_size``, ``num_instances`` and ``num_workers``.

    Returns:
        Tuple ``(train_loader, val_loader)``; only the training loader shuffles.
    """
    # NOTE(review): computed but never used -- the training dataset length below
    # is hard-coded to 200000 * batch_size instead. Confirm which one is intended.
    train_dataset_len = args.train_num_steps * args.batch_size
    shared_dataset_kwargs = dict(
        patch_size=args.patch_size,
        num_instances=args.num_instances,
    )
    train_dataset = CamelyonDataset(
        mode="train",
        dataset_len=200000*args.batch_size,
        **shared_dataset_kwargs,
    )
    # NOTE(review): also unused; the validation dataset gets no explicit length.
    val_dataset_len = args.val_num_steps * args.batch_size
    val_dataset = CamelyonDataset(mode="val", **shared_dataset_kwargs)

    # Both loaders share batch size and worker count; they differ only in shuffling.
    shared_loader_kwargs = dict(
        batch_size=args.batch_size,
        num_workers=args.num_workers,
    )
    train_loader = DataLoader(train_dataset, shuffle=True, **shared_loader_kwargs)
    val_loader = DataLoader(val_dataset, shuffle=False, **shared_loader_kwargs)
    return train_loader, val_loader

def evaluate(model, val_loader, device):
    """Evaluate ``model`` on every batch of ``val_loader`` without gradients.

    The model is switched to eval mode and is left in eval mode on return.

    Args:
        model: Callable as ``model(samples, return_reconstruction=True)`` and
            returning ``(ucc_logits, reconstruction)``.
        val_loader: Iterable of ``(samples, labels)`` batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Dict with ``eval_ae_loss``, ``eval_ucc_loss`` and ``eval_ucc_acc``: the
        mean over batches of the per-batch value, rounded to 5 decimals.
    """
    model.eval()
    ae_losses, ucc_losses, accuracies = [], [], []
    with torch.no_grad():
        for samples, labels in val_loader:
            samples, labels = samples.to(device), labels.to(device)
            logits, recon = model(samples, return_reconstruction=True)

            ucc_losses.append(F.cross_entropy(logits, labels).item())
            ae_losses.append(F.mse_loss(samples, recon).item())

            # Per-batch accuracy: fraction of argmax predictions equal to the labels.
            predicted = torch.max(logits, dim=1)[1]
            n_correct = torch.sum(predicted == labels).item()
            accuracies.append(n_correct / len(labels))

    def _rounded_mean(values):
        return np.round(np.mean(values), 5)

    return {
        "eval_ae_loss": _rounded_mean(ae_losses),
        "eval_ucc_loss": _rounded_mean(ucc_losses),
        "eval_ucc_acc": _rounded_mean(accuracies),
    }

def train(args, model, optimizer, lr_scheduler, train_loader, val_loader, device, step=0):
    """Run the training loop over ``train_loader`` and log progress to MLflow.

    For every batch the loss is
    ``(1 - model.alpha) * cross_entropy + model.alpha * mse_reconstruction``.
    Every 20 steps the per-parameter mean gradients and the batch metrics are
    logged. Every ``args.save_interval`` steps the model is evaluated on
    ``val_loader``; when the evaluation accuracy improves (or equals 1.0) the
    model and the optimizer are logged as run artifacts.

    Must be called inside an active MLflow run.

    Args:
        args: Object exposing ``save_interval``.
        model: Model exposing ``alpha`` and callable as
            ``model(samples, return_reconstruction=True)``.
        optimizer: Optimizer stepping ``model``'s parameters.
        lr_scheduler: Accepted for interface compatibility; not used here.
        train_loader: Iterable of ``(samples, labels)`` training batches.
        val_loader: Iterable of validation batches, passed to ``evaluate``.
        device: Device the batches are moved to.
        step: Global step to continue counting from (0 for a fresh run).

    Returns:
        The best evaluation accuracy seen during this call (0 if none).
    """
    print("training")

    model.train()
    # NOTE: starts from 0 on every call, including resumed runs (step > 0), so the
    # first non-zero evaluation of a resumed run is recorded as a new best.
    best_eval_acc = 0
    if step == 0:
        # Fresh run: log the untrained model under "best_model" up front
        # (presumably so the artifact exists even if no evaluation improves).
        mlflow.pytorch.log_model(
            model,
            artifact_path = "best_model"
        )
    for batch_samples, batch_labels in train_loader:
        batch_samples = batch_samples.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        ucc_logits, reconstruction = model(batch_samples, return_reconstruction=True)
        ucc_loss = F.cross_entropy(ucc_logits, batch_labels)
        ae_loss = F.mse_loss(batch_samples, reconstruction)
        loss = (1-model.alpha)*ucc_loss + model.alpha*ae_loss

        loss.backward()

        optimizer.step()

        step += 1

        if step%20 ==0:
            with torch.no_grad():
                metric_dict = {}
                # Signed mean gradient of every parameter that received a gradient.
                grad_log = {name: torch.mean(param.grad).cpu().item(
                ) for name, param in model.named_parameters() if isinstance(param.grad, torch.Tensor)}
                if step == 1000 and ae_loss.detach().item()>0.99:
                    # Give up early when the reconstruction loss is still > 0.99 and the
                    # encoder weight gradients have vanished. Compare magnitudes: the
                    # logged means are signed, so without abs() any negative mean
                    # (however large) would pass the "< 1e-9" test. An empty list
                    # (no encoder weight gradients found) no longer raises in max().
                    encoder_grad_magnitudes = [abs(grad) for name, grad in grad_log.items() if "encoder" in name and "weight" in name]
                    if encoder_grad_magnitudes and max(encoder_grad_magnitudes)<1e-9:
                        break
                mlflow.log_metrics(grad_log, step=step)
                metric_dict["train_ae_loss"] = np.round(ae_loss.detach().item(), 5)
                _, pred = torch.max(ucc_logits, dim=1)
                accuracy = torch.sum(pred.flatten() == batch_labels.flatten())/len(batch_labels)
                metric_dict["train_ucc_loss"] = np.round(ucc_loss.detach().item(), 5)
                metric_dict["train_ucc_acc"] = np.round(float(accuracy), 5)
                metric_dict["loss"] = np.round(float(loss), 5)
                print(f"Step {step}:", metric_dict)
            mlflow.log_metrics(metric_dict, step=step)

        if step % args.save_interval == 0:
            eval_metric_dict = evaluate(
                model,
                val_loader,
                device)
            print(f"step: {step}," + ",".join([f"{key}: {value}"for key, value in eval_metric_dict.items()]))
            mlflow.log_metrics(eval_metric_dict, step=step)
            eval_acc = eval_metric_dict["eval_ucc_acc"]
            if eval_acc > best_eval_acc or eval_acc==1.0:
                best_eval_acc = eval_acc
                mlflow.log_metric("best_eval_acc", best_eval_acc)
                mlflow.pytorch.log_model(model, artifact_path="best_model")
                # The whole optimizer object is pickled (not just its state_dict).
                torch.save(optimizer, "optimizer.pt")
                mlflow.log_artifact("optimizer.pt")
            # Hard stop; only checked on evaluation steps.
            if step == 200000:
                break
            # evaluate() leaves the model in eval mode; switch back for training.
            model.train()

    print("Training finished!!!")
    return best_eval_acc

In [8]:

# Sweep over (lower_bound, upper_bound) pairs taken from `x` and train one model
# per pair, each in its own MLflow run. Relies on `cfg` and `x` from the first cell.
# NOTE(review): machine-specific absolute tracking URI; adjust when running elsewhere.
mlflow.set_tracking_uri("file:\\D:\\UCC-DRN-Pytorch\\camelyon\\mlruns")

run_name = "camelyon-ucc-drn-search-init"
experiment_id = get_or_create_experiment(experiment_name=run_name)
mlflow.set_experiment(experiment_id=experiment_id)

# Resume point of the sweep: every pair that comes before
# (last_lower_bound, last_upper_bound) in iteration order is skipped.
last_lower_bound = 0
last_upper_bound = 0.3

# Fix: only fall back to MPS when it is actually available, otherwise use the CPU
# (previously "mps" was selected unconditionally whenever CUDA was missing).
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# NOTE(review): set_random_seed is defined above but never called, so runs are
# not seeded -- confirm whether that is intended for this comparison.
for lower_bound in x:
    # Round away float-step noise from np.arange (e.g. -5.55e-17 -> -0.0).
    lower_bound = np.round(lower_bound, 5)
    if lower_bound<last_lower_bound:
        continue
    for upper_bound in x:
        upper_bound = np.round(upper_bound, 5)
        if lower_bound==last_lower_bound and upper_bound<last_upper_bound:
            continue
        # Only proper intervals (lower < upper) are trained.
        if lower_bound >= upper_bound:
            continue
        with mlflow.start_run(nested=True) as run:
            cfg.model.drn.init_lower_bound = float(lower_bound)
            cfg.model.drn.init_upper_bound = float(upper_bound)
            mlflow.log_params({
                "init_W_lower_bound": float(lower_bound),
                "init_W_upper_bound": float(upper_bound)
            })
            print(cfg.model.drn)
            print(experiment_id)
            # Override the configured learning rate for this sweep.
            cfg.args.learning_rate = 0.001
            mlflow.log_dict(dict(OmegaConf.to_object(cfg)), "config.yaml")
            args = cfg.args
            # The full config (not only cfg.model) is handed to the model constructor.
            model, optimizer = init_model_and_optimizer(args, cfg, device)
            train_loader, val_loader = init_dataloader(args)
            artifact_path = run.info.artifact_uri
            mlflow.pytorch.log_model(
                    model,
                    artifact_path = "init_model")
            best_acc = train(args, model, optimizer, None,
                            train_loader, val_loader, device)

{'num_bins': 11, 'hidden_q': 100, 'num_layers': 2, 'num_nodes': 9, 'init_method': 'uniform', 'init_upper_bound': 0.3, 'init_lower_bound': -0.0, 'output_bins': 2}
716864409634968403
(100, 11)
(100, 100)
(2, 100)




training




Step 20: {'train_ae_loss': 1.00051, 'train_ucc_loss': 0.69157, 'train_ucc_acc': 0.53125, 'loss': 0.84604}
Step 40: {'train_ae_loss': 1.00022, 'train_ucc_loss': 0.67581, 'train_ucc_acc': 0.625, 'loss': 0.83802}
Step 60: {'train_ae_loss': 1.00017, 'train_ucc_loss': 0.7009, 'train_ucc_acc': 0.46875, 'loss': 0.85054}
Step 80: {'train_ae_loss': 1.00009, 'train_ucc_loss': 0.70868, 'train_ucc_acc': 0.40625, 'loss': 0.85438}
Step 100: {'train_ae_loss': 1.00003, 'train_ucc_loss': 0.68349, 'train_ucc_acc': 0.59375, 'loss': 0.84176}
Step 120: {'train_ae_loss': 1.00008, 'train_ucc_loss': 0.70915, 'train_ucc_acc': 0.375, 'loss': 0.85461}
Step 140: {'train_ae_loss': 1.00002, 'train_ucc_loss': 0.69125, 'train_ucc_acc': 0.53125, 'loss': 0.84563}
Step 160: {'train_ae_loss': 1.00006, 'train_ucc_loss': 0.69128, 'train_ucc_acc': 0.53125, 'loss': 0.84567}
Step 180: {'train_ae_loss': 1.00003, 'train_ucc_loss': 0.70636, 'train_ucc_acc': 0.375, 'loss': 0.8532}
Step 200: {'train_ae_loss': 1.00001, 'train_ucc_l



training




Step 20: {'train_ae_loss': 1.00054, 'train_ucc_loss': 0.6969, 'train_ucc_acc': 0.4375, 'loss': 0.84872}
Step 40: {'train_ae_loss': 1.00023, 'train_ucc_loss': 0.69213, 'train_ucc_acc': 0.53125, 'loss': 0.84618}
Step 60: {'train_ae_loss': 1.00014, 'train_ucc_loss': 0.69431, 'train_ucc_acc': 0.46875, 'loss': 0.84723}
Step 80: {'train_ae_loss': 1.00006, 'train_ucc_loss': 0.69602, 'train_ucc_acc': 0.375, 'loss': 0.84804}
Step 100: {'train_ae_loss': 1.00006, 'train_ucc_loss': 0.69478, 'train_ucc_acc': 0.40625, 'loss': 0.84742}
Step 120: {'train_ae_loss': 1.00006, 'train_ucc_loss': 0.69215, 'train_ucc_acc': 0.5625, 'loss': 0.84611}
Step 140: {'train_ae_loss': 1.00001, 'train_ucc_loss': 0.69411, 'train_ucc_acc': 0.4375, 'loss': 0.84706}
Step 160: {'train_ae_loss': 1.00001, 'train_ucc_loss': 0.69256, 'train_ucc_acc': 0.625, 'loss': 0.84629}
Step 180: {'train_ae_loss': 1.00006, 'train_ucc_loss': 0.69337, 'train_ucc_acc': 0.25, 'loss': 0.84671}
Step 200: {'train_ae_loss': 1.00002, 'train_ucc_loss



training




Step 20: {'train_ae_loss': 1.00047, 'train_ucc_loss': 0.69559, 'train_ucc_acc': 0.5, 'loss': 0.84803}
Step 40: {'train_ae_loss': 1.00019, 'train_ucc_loss': 0.68728, 'train_ucc_acc': 0.5625, 'loss': 0.84374}
Step 60: {'train_ae_loss': 1.00007, 'train_ucc_loss': 0.70881, 'train_ucc_acc': 0.375, 'loss': 0.85444}
Step 80: {'train_ae_loss': 1.00005, 'train_ucc_loss': 0.70086, 'train_ucc_acc': 0.4375, 'loss': 0.85046}
Step 100: {'train_ae_loss': 0.99999, 'train_ucc_loss': 0.70219, 'train_ucc_acc': 0.40625, 'loss': 0.85109}
Step 120: {'train_ae_loss': 0.99999, 'train_ucc_loss': 0.69612, 'train_ucc_acc': 0.46875, 'loss': 0.84805}
Step 140: {'train_ae_loss': 1.00008, 'train_ucc_loss': 0.69146, 'train_ucc_acc': 0.53125, 'loss': 0.84577}
Step 160: {'train_ae_loss': 1.00005, 'train_ucc_loss': 0.68733, 'train_ucc_acc': 0.59375, 'loss': 0.84369}
Step 180: {'train_ae_loss': 1.00003, 'train_ucc_loss': 0.6994, 'train_ucc_acc': 0.40625, 'loss': 0.84972}
Step 200: {'train_ae_loss': 1.0, 'train_ucc_loss':



training




Step 20: {'train_ae_loss': 1.00041, 'train_ucc_loss': 0.69725, 'train_ucc_acc': 0.40625, 'loss': 0.84883}
Step 40: {'train_ae_loss': 1.00015, 'train_ucc_loss': 0.69038, 'train_ucc_acc': 0.59375, 'loss': 0.84527}
Step 60: {'train_ae_loss': 1.00007, 'train_ucc_loss': 0.6923, 'train_ucc_acc': 0.53125, 'loss': 0.84618}
Step 80: {'train_ae_loss': 1.00003, 'train_ucc_loss': 0.69757, 'train_ucc_acc': 0.375, 'loss': 0.8488}
Step 100: {'train_ae_loss': 1.00003, 'train_ucc_loss': 0.69628, 'train_ucc_acc': 0.375, 'loss': 0.84816}
Step 120: {'train_ae_loss': 1.00001, 'train_ucc_loss': 0.69157, 'train_ucc_acc': 0.5625, 'loss': 0.84579}
Step 140: {'train_ae_loss': 0.99999, 'train_ucc_loss': 0.69232, 'train_ucc_acc': 0.53125, 'loss': 0.84616}
Step 160: {'train_ae_loss': 1.00006, 'train_ucc_loss': 0.69067, 'train_ucc_acc': 0.59375, 'loss': 0.84536}
Step 180: {'train_ae_loss': 1.00001, 'train_ucc_loss': 0.69324, 'train_ucc_acc': 0.5, 'loss': 0.84663}
Step 200: {'train_ae_loss': 1.00002, 'train_ucc_loss



training




Step 20: {'train_ae_loss': 1.00058, 'train_ucc_loss': 0.69515, 'train_ucc_acc': 0.5, 'loss': 0.84787}
Step 40: {'train_ae_loss': 1.0002, 'train_ucc_loss': 0.70293, 'train_ucc_acc': 0.4375, 'loss': 0.85156}
Step 60: {'train_ae_loss': 1.00007, 'train_ucc_loss': 0.68741, 'train_ucc_acc': 0.5625, 'loss': 0.84374}
Step 80: {'train_ae_loss': 1.00011, 'train_ucc_loss': 0.68785, 'train_ucc_acc': 0.5625, 'loss': 0.84398}
Step 100: {'train_ae_loss': 1.00006, 'train_ucc_loss': 0.68817, 'train_ucc_acc': 0.5625, 'loss': 0.84411}
Step 120: {'train_ae_loss': 1.00006, 'train_ucc_loss': 0.70312, 'train_ucc_acc': 0.40625, 'loss': 0.85159}
Step 140: {'train_ae_loss': 1.00003, 'train_ucc_loss': 0.69132, 'train_ucc_acc': 0.53125, 'loss': 0.84567}
Step 160: {'train_ae_loss': 1.00001, 'train_ucc_loss': 0.70145, 'train_ucc_acc': 0.40625, 'loss': 0.85073}
Step 180: {'train_ae_loss': 1.00002, 'train_ucc_loss': 0.69151, 'train_ucc_acc': 0.53125, 'loss': 0.84576}
Step 200: {'train_ae_loss': 1.00001, 'train_ucc_lo



training




Step 20: {'train_ae_loss': 1.00075, 'train_ucc_loss': 0.69482, 'train_ucc_acc': 0.5, 'loss': 0.84778}
Step 40: {'train_ae_loss': 1.00014, 'train_ucc_loss': 0.69122, 'train_ucc_acc': 0.53125, 'loss': 0.84568}
Step 60: {'train_ae_loss': 1.00012, 'train_ucc_loss': 0.67846, 'train_ucc_acc': 0.65625, 'loss': 0.83929}
Step 80: {'train_ae_loss': 1.00007, 'train_ucc_loss': 0.70664, 'train_ucc_acc': 0.375, 'loss': 0.85335}
Step 100: {'train_ae_loss': 1.00005, 'train_ucc_loss': 0.69132, 'train_ucc_acc': 0.53125, 'loss': 0.84568}
Step 120: {'train_ae_loss': 1.00005, 'train_ucc_loss': 0.69137, 'train_ucc_acc': 0.53125, 'loss': 0.84571}
Step 140: {'train_ae_loss': 1.00002, 'train_ucc_loss': 0.69395, 'train_ucc_acc': 0.5, 'loss': 0.84698}
Step 160: {'train_ae_loss': 1.00004, 'train_ucc_loss': 0.69157, 'train_ucc_acc': 0.53125, 'loss': 0.84581}
Step 180: {'train_ae_loss': 1.00002, 'train_ucc_loss': 0.69555, 'train_ucc_acc': 0.46875, 'loss': 0.84778}
Step 200: {'train_ae_loss': 1.00001, 'train_ucc_los



training




Step 20: {'train_ae_loss': 1.00048, 'train_ucc_loss': 0.69134, 'train_ucc_acc': 0.53125, 'loss': 0.84591}
Step 40: {'train_ae_loss': 1.00022, 'train_ucc_loss': 0.68189, 'train_ucc_acc': 0.59375, 'loss': 0.84106}
Step 60: {'train_ae_loss': 1.00004, 'train_ucc_loss': 0.70018, 'train_ucc_acc': 0.46875, 'loss': 0.85011}
Step 80: {'train_ae_loss': 1.00005, 'train_ucc_loss': 0.69121, 'train_ucc_acc': 0.53125, 'loss': 0.84563}
Step 100: {'train_ae_loss': 0.99999, 'train_ucc_loss': 0.68712, 'train_ucc_acc': 0.5625, 'loss': 0.84356}
Step 120: {'train_ae_loss': 1.00004, 'train_ucc_loss': 0.70708, 'train_ucc_acc': 0.40625, 'loss': 0.85356}
Step 140: {'train_ae_loss': 1.0, 'train_ucc_loss': 0.68756, 'train_ucc_acc': 0.5625, 'loss': 0.84378}
Step 160: {'train_ae_loss': 1.00002, 'train_ucc_loss': 0.69455, 'train_ucc_acc': 0.5, 'loss': 0.84729}
Step 180: {'train_ae_loss': 1.00001, 'train_ucc_loss': 0.70626, 'train_ucc_acc': 0.375, 'loss': 0.85313}
Step 200: {'train_ae_loss': 1.00001, 'train_ucc_loss'



training




Step 20: {'train_ae_loss': 1.00051, 'train_ucc_loss': 0.67505, 'train_ucc_acc': 0.65625, 'loss': 0.83778}
Step 40: {'train_ae_loss': 1.00022, 'train_ucc_loss': 0.7104, 'train_ucc_acc': 0.375, 'loss': 0.85531}
Step 60: {'train_ae_loss': 1.00012, 'train_ucc_loss': 0.69462, 'train_ucc_acc': 0.5, 'loss': 0.84737}
Step 80: {'train_ae_loss': 1.00005, 'train_ucc_loss': 0.70068, 'train_ucc_acc': 0.4375, 'loss': 0.85037}
Step 100: {'train_ae_loss': 1.0001, 'train_ucc_loss': 0.70512, 'train_ucc_acc': 0.375, 'loss': 0.85261}
Step 120: {'train_ae_loss': 1.00004, 'train_ucc_loss': 0.69392, 'train_ucc_acc': 0.5, 'loss': 0.84698}
Step 140: {'train_ae_loss': 1.00003, 'train_ucc_loss': 0.68969, 'train_ucc_acc': 0.5625, 'loss': 0.84486}
Step 160: {'train_ae_loss': 1.00007, 'train_ucc_loss': 0.69539, 'train_ucc_acc': 0.46875, 'loss': 0.84773}
Step 180: {'train_ae_loss': 1.00004, 'train_ucc_loss': 0.6809, 'train_ucc_acc': 0.71875, 'loss': 0.84047}
Step 200: {'train_ae_loss': 1.0, 'train_ucc_loss': 0.69715



training




Step 20: {'train_ae_loss': 1.0008, 'train_ucc_loss': 0.69126, 'train_ucc_acc': 0.53125, 'loss': 0.84603}
Step 40: {'train_ae_loss': 1.00037, 'train_ucc_loss': 0.6953, 'train_ucc_acc': 0.5, 'loss': 0.84784}
Step 60: {'train_ae_loss': 1.0002, 'train_ucc_loss': 0.68358, 'train_ucc_acc': 0.59375, 'loss': 0.84189}
Step 80: {'train_ae_loss': 1.00011, 'train_ucc_loss': 0.69819, 'train_ucc_acc': 0.46875, 'loss': 0.84915}
Step 100: {'train_ae_loss': 1.00011, 'train_ucc_loss': 0.68155, 'train_ucc_acc': 0.625, 'loss': 0.84083}
Step 120: {'train_ae_loss': 1.00006, 'train_ucc_loss': 0.71166, 'train_ucc_acc': 0.34375, 'loss': 0.85586}
Step 140: {'train_ae_loss': 1.00009, 'train_ucc_loss': 0.68179, 'train_ucc_acc': 0.625, 'loss': 0.84094}
Step 160: {'train_ae_loss': 1.00002, 'train_ucc_loss': 0.69748, 'train_ucc_acc': 0.46875, 'loss': 0.84875}
Step 180: {'train_ae_loss': 1.0, 'train_ucc_loss': 0.69426, 'train_ucc_acc': 0.5, 'loss': 0.84713}
Step 200: {'train_ae_loss': 1.00003, 'train_ucc_loss': 0.696



training




Step 20: {'train_ae_loss': 1.00066, 'train_ucc_loss': 0.68574, 'train_ucc_acc': 0.5625, 'loss': 0.8432}
Step 40: {'train_ae_loss': 1.00025, 'train_ucc_loss': 0.73125, 'train_ucc_acc': 0.3125, 'loss': 0.86575}
Step 60: {'train_ae_loss': 1.00018, 'train_ucc_loss': 0.70198, 'train_ucc_acc': 0.46875, 'loss': 0.85108}
Step 80: {'train_ae_loss': 1.00011, 'train_ucc_loss': 0.68638, 'train_ucc_acc': 0.5625, 'loss': 0.84325}
Step 100: {'train_ae_loss': 1.00009, 'train_ucc_loss': 0.7048, 'train_ucc_acc': 0.4375, 'loss': 0.85244}
Step 120: {'train_ae_loss': 1.00004, 'train_ucc_loss': 0.6912, 'train_ucc_acc': 0.53125, 'loss': 0.84562}
Step 140: {'train_ae_loss': 1.00005, 'train_ucc_loss': 0.69518, 'train_ucc_acc': 0.5, 'loss': 0.84762}
Step 160: {'train_ae_loss': 1.00003, 'train_ucc_loss': 0.69466, 'train_ucc_acc': 0.5, 'loss': 0.84735}
Step 180: {'train_ae_loss': 1.00006, 'train_ucc_loss': 0.69446, 'train_ucc_acc': 0.5, 'loss': 0.84726}
Step 200: {'train_ae_loss': 1.00003, 'train_ucc_loss': 0.697



training




Step 20: {'train_ae_loss': 1.00064, 'train_ucc_loss': 0.68542, 'train_ucc_acc': 0.5625, 'loss': 0.84303}
Step 40: {'train_ae_loss': 1.00023, 'train_ucc_loss': 0.67966, 'train_ucc_acc': 0.59375, 'loss': 0.83995}
Step 60: {'train_ae_loss': 1.00013, 'train_ucc_loss': 0.69152, 'train_ucc_acc': 0.53125, 'loss': 0.84582}
Step 80: {'train_ae_loss': 1.00005, 'train_ucc_loss': 0.69634, 'train_ucc_acc': 0.5, 'loss': 0.84819}
Step 100: {'train_ae_loss': 0.99999, 'train_ucc_loss': 0.6822, 'train_ucc_acc': 0.59375, 'loss': 0.8411}
Step 120: {'train_ae_loss': 1.00006, 'train_ucc_loss': 0.69546, 'train_ucc_acc': 0.5, 'loss': 0.84776}
Step 140: {'train_ae_loss': 1.00002, 'train_ucc_loss': 0.69904, 'train_ucc_acc': 0.46875, 'loss': 0.84953}
Step 160: {'train_ae_loss': 1.00005, 'train_ucc_loss': 0.69123, 'train_ucc_acc': 0.53125, 'loss': 0.84564}
Step 180: {'train_ae_loss': 1.0, 'train_ucc_loss': 0.69739, 'train_ucc_acc': 0.46875, 'loss': 0.8487}
Step 200: {'train_ae_loss': 1.0, 'train_ucc_loss': 0.6941



training




Step 20: {'train_ae_loss': 1.00082, 'train_ucc_loss': 0.69221, 'train_ucc_acc': 0.53125, 'loss': 0.84651}
Step 40: {'train_ae_loss': 1.00028, 'train_ucc_loss': 0.69797, 'train_ucc_acc': 0.5, 'loss': 0.84913}
Step 60: {'train_ae_loss': 1.00023, 'train_ucc_loss': 0.69713, 'train_ucc_acc': 0.5, 'loss': 0.84868}
Step 80: {'train_ae_loss': 1.00017, 'train_ucc_loss': 0.70192, 'train_ucc_acc': 0.46875, 'loss': 0.85105}
Step 100: {'train_ae_loss': 1.00011, 'train_ucc_loss': 0.70619, 'train_ucc_acc': 0.4375, 'loss': 0.85315}
Step 120: {'train_ae_loss': 1.00006, 'train_ucc_loss': 0.7099, 'train_ucc_acc': 0.40625, 'loss': 0.85498}
Step 140: {'train_ae_loss': 1.00004, 'train_ucc_loss': 0.70864, 'train_ucc_acc': 0.40625, 'loss': 0.85434}
Step 160: {'train_ae_loss': 1.00005, 'train_ucc_loss': 0.67466, 'train_ucc_acc': 0.65625, 'loss': 0.83736}
Step 180: {'train_ae_loss': 1.00002, 'train_ucc_loss': 0.68355, 'train_ucc_acc': 0.59375, 'loss': 0.84179}
Step 200: {'train_ae_loss': 1.00002, 'train_ucc_los



training




Step 20: {'train_ae_loss': 1.00116, 'train_ucc_loss': 0.71942, 'train_ucc_acc': 0.40625, 'loss': 0.86029}
Step 40: {'train_ae_loss': 1.00034, 'train_ucc_loss': 0.71122, 'train_ucc_acc': 0.4375, 'loss': 0.85578}
Step 60: {'train_ae_loss': 1.00011, 'train_ucc_loss': 0.69169, 'train_ucc_acc': 0.53125, 'loss': 0.8459}
Step 80: {'train_ae_loss': 1.00006, 'train_ucc_loss': 0.69702, 'train_ucc_acc': 0.5, 'loss': 0.84854}
Step 100: {'train_ae_loss': 1.00012, 'train_ucc_loss': 0.69633, 'train_ucc_acc': 0.5, 'loss': 0.84823}
Step 120: {'train_ae_loss': 1.00005, 'train_ucc_loss': 0.7003, 'train_ucc_acc': 0.46875, 'loss': 0.85018}
Step 140: {'train_ae_loss': 1.00005, 'train_ucc_loss': 0.712, 'train_ucc_acc': 0.375, 'loss': 0.85603}
Step 160: {'train_ae_loss': 1.00007, 'train_ucc_loss': 0.69502, 'train_ucc_acc': 0.5, 'loss': 0.84754}
Step 180: {'train_ae_loss': 1.00003, 'train_ucc_loss': 0.69487, 'train_ucc_acc': 0.5, 'loss': 0.84745}
Step 200: {'train_ae_loss': 1.00003, 'train_ucc_loss': 0.69122, 



training




Step 20: {'train_ae_loss': 1.00041, 'train_ucc_loss': 0.6858, 'train_ucc_acc': 0.5625, 'loss': 0.8431}
Step 40: {'train_ae_loss': 1.00018, 'train_ucc_loss': 0.71348, 'train_ucc_acc': 0.40625, 'loss': 0.85683}
Step 60: {'train_ae_loss': 1.00015, 'train_ucc_loss': 0.68631, 'train_ucc_acc': 0.5625, 'loss': 0.84323}
Step 80: {'train_ae_loss': 1.00006, 'train_ucc_loss': 0.69125, 'train_ucc_acc': 0.53125, 'loss': 0.84566}
Step 100: {'train_ae_loss': 1.00007, 'train_ucc_loss': 0.69939, 'train_ucc_acc': 0.46875, 'loss': 0.84973}
Step 120: {'train_ae_loss': 1.00004, 'train_ucc_loss': 0.7021, 'train_ucc_acc': 0.4375, 'loss': 0.85107}
Step 140: {'train_ae_loss': 1.00005, 'train_ucc_loss': 0.69445, 'train_ucc_acc': 0.5, 'loss': 0.84725}
Step 160: {'train_ae_loss': 1.00005, 'train_ucc_loss': 0.68839, 'train_ucc_acc': 0.5625, 'loss': 0.84422}
Step 180: {'train_ae_loss': 0.99999, 'train_ucc_loss': 0.69711, 'train_ucc_acc': 0.46875, 'loss': 0.84855}
Step 200: {'train_ae_loss': 1.00001, 'train_ucc_loss



training




Step 20: {'train_ae_loss': 1.0007, 'train_ucc_loss': 0.70339, 'train_ucc_acc': 0.46875, 'loss': 0.85204}
Step 40: {'train_ae_loss': 1.0002, 'train_ucc_loss': 0.6862, 'train_ucc_acc': 0.5625, 'loss': 0.8432}
Step 60: {'train_ae_loss': 1.0001, 'train_ucc_loss': 0.69134, 'train_ucc_acc': 0.53125, 'loss': 0.84572}
Step 80: {'train_ae_loss': 1.00009, 'train_ucc_loss': 0.70548, 'train_ucc_acc': 0.4375, 'loss': 0.85279}
Step 100: {'train_ae_loss': 1.00008, 'train_ucc_loss': 0.72173, 'train_ucc_acc': 0.3125, 'loss': 0.86091}
Step 120: {'train_ae_loss': 1.00004, 'train_ucc_loss': 0.70327, 'train_ucc_acc': 0.4375, 'loss': 0.85166}
Step 140: {'train_ae_loss': 1.00003, 'train_ucc_loss': 0.67251, 'train_ucc_acc': 0.6875, 'loss': 0.83627}
Step 160: {'train_ae_loss': 1.00003, 'train_ucc_loss': 0.70195, 'train_ucc_acc': 0.4375, 'loss': 0.85099}
Step 180: {'train_ae_loss': 1.00002, 'train_ucc_loss': 0.7154, 'train_ucc_acc': 0.3125, 'loss': 0.85771}
Step 200: {'train_ae_loss': 1.00005, 'train_ucc_loss':



training




Step 20: {'train_ae_loss': 1.00096, 'train_ucc_loss': 0.68605, 'train_ucc_acc': 0.5625, 'loss': 0.8435}
Step 40: {'train_ae_loss': 1.00036, 'train_ucc_loss': 0.67572, 'train_ucc_acc': 0.625, 'loss': 0.83804}
Step 60: {'train_ae_loss': 1.00019, 'train_ucc_loss': 0.71452, 'train_ucc_acc': 0.375, 'loss': 0.85735}
Step 80: {'train_ae_loss': 1.00011, 'train_ucc_loss': 0.71081, 'train_ucc_acc': 0.375, 'loss': 0.85546}
Step 100: {'train_ae_loss': 1.00007, 'train_ucc_loss': 0.69443, 'train_ucc_acc': 0.5, 'loss': 0.84725}
Step 120: {'train_ae_loss': 1.00008, 'train_ucc_loss': 0.69405, 'train_ucc_acc': 0.5, 'loss': 0.84707}
Step 140: {'train_ae_loss': 1.00007, 'train_ucc_loss': 0.68339, 'train_ucc_acc': 0.625, 'loss': 0.84173}
Step 160: {'train_ae_loss': 1.00007, 'train_ucc_loss': 0.69402, 'train_ucc_acc': 0.5, 'loss': 0.84704}
Step 180: {'train_ae_loss': 1.00007, 'train_ucc_loss': 0.68673, 'train_ucc_acc': 0.59375, 'loss': 0.8434}
Step 200: {'train_ae_loss': 1.00003, 'train_ucc_loss': 0.69164, 

In [5]:
# NOTE(review): machine-specific absolute tracking URI; it differs from the URI set
# in the sweep cell above, so adjust it to the machine the notebook runs on.
mlflow.set_tracking_uri("file:///Users/tanguanyu/UCC-DRN-Pytorch/camelyon/mlruns")

{'args': {'dataset': 'camelyon', 'model_dir': 'saved_models/', 'model_name': 'camelyon_ucc_drn', 'num_instances': 32, 'ucc_start': 1, 'ucc_end': 4, 'batch_size': 5, 'num_samples_per_class': 5, 'num_workers': 4, 'learning_rate': 0.0001, 'num_bins': 11, 'num_features': 10, 'train_num_steps': 100000, 'val_num_steps': 200, 'save_interval': 1000, 'patch_size': 32, 'seed': 22}, 'model': {'kde_model': {'num_bins': 11, 'sigma': 0.1}, 'num_channels': 3, 'encoder': {'conv_input_channel': 3, 'conv_output_channel': 16, 'block1_output_channel': 32, 'block1_num_layer': 1, 'block2_output_channel': 64, 'block2_num_layer': 1, 'block3_output_channel': 128, 'block3_num_layer': 1, 'flatten_size': 8192, 'num_features': 16}, 'decoder': {'linear_size': 8192, 'reshape_size': [128, 8, 8], 'block1_output_channel': 128, 'block1_num_layer': 1, 'block2_output_channel': 64, 'block2_num_layer': 1, 'block3_output_channel': 32, 'block3_num_layer': 1, 'output_channel': 3}, 'drn': {'num_bins': 11, 'hidden_q': 100, 'num_layers': 2, 'num_nodes': 9, 'init_method': 'uniform', 'init_upper_bound': 0.5, 'init_lower_bound': -0.5, 'output_bins': 4}, 'ucc_classifier': 'None', 'loss': {'alpha': 0.5}}}

In [6]:
def load_model_and_optimizer(experiment_id, run_id, map_location=None):
    """Load model, optimizer and last evaluated step of a run from ``mlruns/``.

    Args:
        experiment_id: MLflow experiment ID (directory name under ``mlruns/``).
        run_id: MLflow run ID (directory name under the experiment).
        map_location: Device the checkpoints are mapped to. ``None`` (default)
            selects ``"cuda"`` when available, then ``"mps"``, then ``"cpu"``;
            previously ``"cuda"`` was hard-coded, which fails without CUDA.

    Returns:
        Tuple ``(model, optimizer, step)``. ``optimizer`` is a new Adam instance
        over the loaded model's parameters carrying the saved optimizer state;
        ``step`` is the last field of the last record in the run's
        ``metrics/eval_ucc_acc`` file.

    Raises:
        ValueError: If the metrics file contains no records.
    """
    if map_location is None:
        if torch.cuda.is_available():
            map_location = "cuda"
        elif torch.backends.mps.is_available():
            map_location = "mps"
        else:
            map_location = "cpu"

    run_dir = f"mlruns/{experiment_id}/{run_id}"
    # weights_only=False unpickles arbitrary Python objects, so only point this
    # at run directories you trust.
    model = torch.load(f"{run_dir}/artifacts/best_model/data/model.pth", weights_only=False, map_location=map_location)
    # lr here is only a placeholder: load_state_dict() below restores the saved
    # param-group settings (the resumed run prints lr: 0.001).
    optimizer = torch.optim.Adam(lr=0.0012, params=model.parameters())
    saved_optimizer = torch.load(f"{run_dir}/artifacts/optimizer.pt", weights_only=False, map_location=map_location)
    optimizer.load_state_dict(saved_optimizer.state_dict())

    metrics_path = f"{run_dir}/metrics/eval_ucc_acc"
    with open(metrics_path) as file:
        # Skip blank lines so a trailing newline does not break the parse.
        records = [line for line in file if line.strip()]
    if not records:
        raise ValueError(f"no eval_ucc_acc records found in {metrics_path}")
    # Presumably MLflow's file-store layout "timestamp value step" -- the step is
    # taken from the last whitespace-separated field of the last record.
    step = int(records[-1].split()[-1])
    return model, optimizer, step

def resume_training(run_id):
    """Continue training an existing MLflow run from its last evaluated step.

    The run is looked up in the "camelyon-ucc-drn-search-init" experiment of
    the local "mlruns" store. Its model, optimizer and step counter come from
    ``load_model_and_optimizer``; the data loaders are rebuilt from the
    "train_camelyon_ucc_drn" Hydra config. The device, optimizer and step are
    printed before ``train`` is invoked inside the re-opened run.

    Args:
        run_id: id of the MLflow run to re-open and keep training.

    Returns:
        None. The value returned by ``train`` is not propagated.
    """
    mlflow.set_tracking_uri("mlruns")
    experiment_id = mlflow.set_experiment("camelyon-ucc-drn-search-init").experiment_id

    with initialize(version_base=None, config_path="../configs"):
        train_cfg = compose(config_name="train_camelyon_ucc_drn")
    args = train_cfg.args

    model, optimizer, step = load_model_and_optimizer(experiment_id, run_id)
    train_loader, val_loader = init_dataloader(args)

    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("mps")

    # Echo the resumed state so the cell output records where training picks up.
    for shown in (device, optimizer, step):
        print(shown)

    with mlflow.start_run(run_id=run_id, nested=True):
        mlflow.set_experiment(experiment_id=experiment_id)
        # The fourth positional argument is passed as None here; its meaning is
        # defined by train() - TODO confirm against train's signature.
        train(
            args, model, optimizer, None,
            train_loader, val_loader, device,
            step=step,
        )

# Continue training the run identified by this MLflow run id.
resume_training("91929ff20e614b84ae2855d3ee3f565a")

cuda
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    decoupled_weight_decay: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 0
)
39000
training
Step 39020: {'train_ae_loss': 0.6916, 'train_ucc_loss': 0.416, 'train_ucc_acc': 0.90625, 'loss': 0.5538}
Step 39040: {'train_ae_loss': 0.67035, 'train_ucc_loss': 0.51921, 'train_ucc_acc': 0.8125, 'loss': 0.59478}
Step 39060: {'train_ae_loss': 0.68061, 'train_ucc_loss': 0.41042, 'train_ucc_acc': 0.9375, 'loss': 0.54552}
Step 39080: {'train_ae_loss': 0.67488, 'train_ucc_loss': 0.60488, 'train_ucc_acc': 0.6875, 'loss': 0.63988}
Step 39100: {'train_ae_loss': 0.68034, 'train_ucc_loss': 0.57203, 'train_ucc_acc': 0.71875, 'loss': 0.62618}
Step 39120: {'train_ae_loss': 0.68149, 'train_ucc_loss': 0.50473, 'train_ucc_acc': 0.8125, 'loss': 0.59311}
Step 39140: {'train_ae_loss': 0.67443, 'train_ucc_loss': 0.44293, 'train_ucc_ac



step: 40000,eval_ae_loss: 0.67765,eval_ucc_loss: 0.51457,eval_ucc_acc: 0.7959




Step 40020: {'train_ae_loss': 0.68062, 'train_ucc_loss': 0.58808, 'train_ucc_acc': 0.75, 'loss': 0.63435}
Step 40040: {'train_ae_loss': 0.69193, 'train_ucc_loss': 0.43797, 'train_ucc_acc': 0.875, 'loss': 0.56495}
Step 40060: {'train_ae_loss': 0.66665, 'train_ucc_loss': 0.54457, 'train_ucc_acc': 0.8125, 'loss': 0.60561}
Step 40080: {'train_ae_loss': 0.68385, 'train_ucc_loss': 0.52384, 'train_ucc_acc': 0.75, 'loss': 0.60384}
Step 40100: {'train_ae_loss': 0.67599, 'train_ucc_loss': 0.59935, 'train_ucc_acc': 0.71875, 'loss': 0.63767}
Step 40120: {'train_ae_loss': 0.6861, 'train_ucc_loss': 0.5231, 'train_ucc_acc': 0.75, 'loss': 0.6046}
Step 40140: {'train_ae_loss': 0.67646, 'train_ucc_loss': 0.58163, 'train_ucc_acc': 0.71875, 'loss': 0.62904}
Step 40160: {'train_ae_loss': 0.66883, 'train_ucc_loss': 0.52022, 'train_ucc_acc': 0.8125, 'loss': 0.59452}
Step 40180: {'train_ae_loss': 0.6748, 'train_ucc_loss': 0.57497, 'train_ucc_acc': 0.6875, 'loss': 0.62488}
Step 40200: {'train_ae_loss': 0.68534



step: 47000,eval_ae_loss: 0.67212,eval_ucc_loss: 0.48501,eval_ucc_acc: 0.81836




Step 47020: {'train_ae_loss': 0.6738, 'train_ucc_loss': 0.43359, 'train_ucc_acc': 0.90625, 'loss': 0.55369}
Step 47040: {'train_ae_loss': 0.66665, 'train_ucc_loss': 0.55604, 'train_ucc_acc': 0.71875, 'loss': 0.61135}
Step 47060: {'train_ae_loss': 0.6795, 'train_ucc_loss': 0.47694, 'train_ucc_acc': 0.8125, 'loss': 0.57822}
Step 47080: {'train_ae_loss': 0.68019, 'train_ucc_loss': 0.49749, 'train_ucc_acc': 0.8125, 'loss': 0.58884}
Step 47100: {'train_ae_loss': 0.66919, 'train_ucc_loss': 0.51343, 'train_ucc_acc': 0.8125, 'loss': 0.59131}
Step 47120: {'train_ae_loss': 0.69231, 'train_ucc_loss': 0.52063, 'train_ucc_acc': 0.78125, 'loss': 0.60647}
Step 47140: {'train_ae_loss': 0.67332, 'train_ucc_loss': 0.53661, 'train_ucc_acc': 0.75, 'loss': 0.60496}
Step 47160: {'train_ae_loss': 0.67374, 'train_ucc_loss': 0.47203, 'train_ucc_acc': 0.84375, 'loss': 0.57288}
Step 47180: {'train_ae_loss': 0.67887, 'train_ucc_loss': 0.40317, 'train_ucc_acc': 0.90625, 'loss': 0.54102}
Step 47200: {'train_ae_loss



step: 57000,eval_ae_loss: 0.65756,eval_ucc_loss: 0.48668,eval_ucc_acc: 0.82129




Step 57020: {'train_ae_loss': 0.67498, 'train_ucc_loss': 0.5025, 'train_ucc_acc': 0.8125, 'loss': 0.58874}
Step 57040: {'train_ae_loss': 0.66216, 'train_ucc_loss': 0.4637, 'train_ucc_acc': 0.84375, 'loss': 0.56293}
Step 57060: {'train_ae_loss': 0.67636, 'train_ucc_loss': 0.35434, 'train_ucc_acc': 0.96875, 'loss': 0.51535}
Step 57080: {'train_ae_loss': 0.65444, 'train_ucc_loss': 0.53257, 'train_ucc_acc': 0.78125, 'loss': 0.59351}
Step 57100: {'train_ae_loss': 0.66404, 'train_ucc_loss': 0.52725, 'train_ucc_acc': 0.8125, 'loss': 0.59565}
Step 57120: {'train_ae_loss': 0.66024, 'train_ucc_loss': 0.41045, 'train_ucc_acc': 0.875, 'loss': 0.53535}
Step 57140: {'train_ae_loss': 0.65892, 'train_ucc_loss': 0.42048, 'train_ucc_acc': 0.90625, 'loss': 0.5397}
Step 57160: {'train_ae_loss': 0.66403, 'train_ucc_loss': 0.47228, 'train_ucc_acc': 0.84375, 'loss': 0.56815}
Step 57180: {'train_ae_loss': 0.65695, 'train_ucc_loss': 0.45466, 'train_ucc_acc': 0.84375, 'loss': 0.55581}
Step 57200: {'train_ae_los



step: 58000,eval_ae_loss: 0.65993,eval_ucc_loss: 0.47575,eval_ucc_acc: 0.8291




Step 58020: {'train_ae_loss': 0.6619, 'train_ucc_loss': 0.47368, 'train_ucc_acc': 0.8125, 'loss': 0.56779}
Step 58040: {'train_ae_loss': 0.67721, 'train_ucc_loss': 0.41553, 'train_ucc_acc': 0.90625, 'loss': 0.54637}
Step 58060: {'train_ae_loss': 0.67979, 'train_ucc_loss': 0.40768, 'train_ucc_acc': 0.90625, 'loss': 0.54374}
Step 58080: {'train_ae_loss': 0.67458, 'train_ucc_loss': 0.51544, 'train_ucc_acc': 0.78125, 'loss': 0.59501}
Step 58100: {'train_ae_loss': 0.67934, 'train_ucc_loss': 0.53396, 'train_ucc_acc': 0.78125, 'loss': 0.60665}
Step 58120: {'train_ae_loss': 0.6563, 'train_ucc_loss': 0.39593, 'train_ucc_acc': 0.9375, 'loss': 0.52611}
Step 58140: {'train_ae_loss': 0.66173, 'train_ucc_loss': 0.43314, 'train_ucc_acc': 0.875, 'loss': 0.54744}
Step 58160: {'train_ae_loss': 0.65617, 'train_ucc_loss': 0.43561, 'train_ucc_acc': 0.875, 'loss': 0.54589}
Step 58180: {'train_ae_loss': 0.66719, 'train_ucc_loss': 0.41501, 'train_ucc_acc': 0.90625, 'loss': 0.5411}
Step 58200: {'train_ae_loss'



step: 60000,eval_ae_loss: 0.6566,eval_ucc_loss: 0.47432,eval_ucc_acc: 0.83301




Step 60020: {'train_ae_loss': 0.66913, 'train_ucc_loss': 0.51844, 'train_ucc_acc': 0.78125, 'loss': 0.59379}
Step 60040: {'train_ae_loss': 0.67265, 'train_ucc_loss': 0.43698, 'train_ucc_acc': 0.875, 'loss': 0.55481}
Step 60060: {'train_ae_loss': 0.65742, 'train_ucc_loss': 0.66087, 'train_ucc_acc': 0.625, 'loss': 0.65915}
Step 60080: {'train_ae_loss': 0.67394, 'train_ucc_loss': 0.50116, 'train_ucc_acc': 0.78125, 'loss': 0.58755}
Step 60100: {'train_ae_loss': 0.68681, 'train_ucc_loss': 0.46515, 'train_ucc_acc': 0.84375, 'loss': 0.57598}
Step 60120: {'train_ae_loss': 0.64501, 'train_ucc_loss': 0.56014, 'train_ucc_acc': 0.78125, 'loss': 0.60257}
Step 60140: {'train_ae_loss': 0.66805, 'train_ucc_loss': 0.59564, 'train_ucc_acc': 0.6875, 'loss': 0.63184}
Step 60160: {'train_ae_loss': 0.6702, 'train_ucc_loss': 0.45997, 'train_ucc_acc': 0.84375, 'loss': 0.56509}
Step 60180: {'train_ae_loss': 0.6676, 'train_ucc_loss': 0.47314, 'train_ucc_acc': 0.8125, 'loss': 0.57037}
Step 60200: {'train_ae_loss



step: 68000,eval_ae_loss: 0.65732,eval_ucc_loss: 0.47339,eval_ucc_acc: 0.83789




Step 68020: {'train_ae_loss': 0.6634, 'train_ucc_loss': 0.45149, 'train_ucc_acc': 0.84375, 'loss': 0.55744}
Step 68040: {'train_ae_loss': 0.66634, 'train_ucc_loss': 0.39876, 'train_ucc_acc': 0.90625, 'loss': 0.53255}
Step 68060: {'train_ae_loss': 0.65485, 'train_ucc_loss': 0.40363, 'train_ucc_acc': 0.90625, 'loss': 0.52924}
Step 68080: {'train_ae_loss': 0.66539, 'train_ucc_loss': 0.53398, 'train_ucc_acc': 0.78125, 'loss': 0.59968}
Step 68100: {'train_ae_loss': 0.65838, 'train_ucc_loss': 0.38926, 'train_ucc_acc': 0.9375, 'loss': 0.52382}
Step 68120: {'train_ae_loss': 0.68082, 'train_ucc_loss': 0.38577, 'train_ucc_acc': 0.9375, 'loss': 0.5333}
Step 68140: {'train_ae_loss': 0.67123, 'train_ucc_loss': 0.4772, 'train_ucc_acc': 0.8125, 'loss': 0.57422}
Step 68160: {'train_ae_loss': 0.65248, 'train_ucc_loss': 0.38759, 'train_ucc_acc': 0.9375, 'loss': 0.52003}
Step 68180: {'train_ae_loss': 0.66873, 'train_ucc_loss': 0.57032, 'train_ucc_acc': 0.71875, 'loss': 0.61953}
Step 68200: {'train_ae_los



step: 87000,eval_ae_loss: 0.65655,eval_ucc_loss: 0.46288,eval_ucc_acc: 0.84277




Step 87020: {'train_ae_loss': 0.676, 'train_ucc_loss': 0.47611, 'train_ucc_acc': 0.84375, 'loss': 0.57605}
Step 87040: {'train_ae_loss': 0.68384, 'train_ucc_loss': 0.48313, 'train_ucc_acc': 0.84375, 'loss': 0.58349}
Step 87060: {'train_ae_loss': 0.66874, 'train_ucc_loss': 0.50561, 'train_ucc_acc': 0.8125, 'loss': 0.58718}
Step 87080: {'train_ae_loss': 0.66309, 'train_ucc_loss': 0.48191, 'train_ucc_acc': 0.84375, 'loss': 0.5725}
Step 87100: {'train_ae_loss': 0.65558, 'train_ucc_loss': 0.45097, 'train_ucc_acc': 0.84375, 'loss': 0.55328}
Step 87120: {'train_ae_loss': 0.66603, 'train_ucc_loss': 0.39145, 'train_ucc_acc': 0.875, 'loss': 0.52874}
Step 87140: {'train_ae_loss': 0.66944, 'train_ucc_loss': 0.53067, 'train_ucc_acc': 0.78125, 'loss': 0.60005}
Step 87160: {'train_ae_loss': 0.64425, 'train_ucc_loss': 0.42032, 'train_ucc_acc': 0.90625, 'loss': 0.53228}
Step 87180: {'train_ae_loss': 0.67006, 'train_ucc_loss': 0.36067, 'train_ucc_acc': 0.9375, 'loss': 0.51536}
Step 87200: {'train_ae_los



step: 95000,eval_ae_loss: 0.65277,eval_ucc_loss: 0.46138,eval_ucc_acc: 0.85352




Step 95020: {'train_ae_loss': 0.63796, 'train_ucc_loss': 0.50584, 'train_ucc_acc': 0.8125, 'loss': 0.5719}
Step 95040: {'train_ae_loss': 0.66951, 'train_ucc_loss': 0.38477, 'train_ucc_acc': 0.9375, 'loss': 0.52714}
Step 95060: {'train_ae_loss': 0.65812, 'train_ucc_loss': 0.55013, 'train_ucc_acc': 0.75, 'loss': 0.60413}
Step 95080: {'train_ae_loss': 0.66565, 'train_ucc_loss': 0.52144, 'train_ucc_acc': 0.78125, 'loss': 0.59354}
Step 95100: {'train_ae_loss': 0.6808, 'train_ucc_loss': 0.4281, 'train_ucc_acc': 0.875, 'loss': 0.55445}
Step 95120: {'train_ae_loss': 0.67075, 'train_ucc_loss': 0.37404, 'train_ucc_acc': 0.9375, 'loss': 0.5224}
Step 95140: {'train_ae_loss': 0.67029, 'train_ucc_loss': 0.49072, 'train_ucc_acc': 0.84375, 'loss': 0.58051}
Step 95160: {'train_ae_loss': 0.66482, 'train_ucc_loss': 0.453, 'train_ucc_acc': 0.84375, 'loss': 0.55891}
Step 95180: {'train_ae_loss': 0.67193, 'train_ucc_loss': 0.56056, 'train_ucc_acc': 0.6875, 'loss': 0.61624}
Step 95200: {'train_ae_loss': 0.65



step: 124000,eval_ae_loss: 0.65004,eval_ucc_loss: 0.442,eval_ucc_acc: 0.86719




Step 124020: {'train_ae_loss': 0.65593, 'train_ucc_loss': 0.4889, 'train_ucc_acc': 0.8125, 'loss': 0.57242}
Step 124040: {'train_ae_loss': 0.65736, 'train_ucc_loss': 0.40068, 'train_ucc_acc': 0.90625, 'loss': 0.52902}
Step 124060: {'train_ae_loss': 0.65386, 'train_ucc_loss': 0.46182, 'train_ucc_acc': 0.84375, 'loss': 0.55784}
Step 124080: {'train_ae_loss': 0.66573, 'train_ucc_loss': 0.42561, 'train_ucc_acc': 0.90625, 'loss': 0.54567}
Step 124100: {'train_ae_loss': 0.65924, 'train_ucc_loss': 0.42172, 'train_ucc_acc': 0.90625, 'loss': 0.54048}
Step 124120: {'train_ae_loss': 0.66568, 'train_ucc_loss': 0.48173, 'train_ucc_acc': 0.84375, 'loss': 0.57371}
Step 124140: {'train_ae_loss': 0.64711, 'train_ucc_loss': 0.47957, 'train_ucc_acc': 0.84375, 'loss': 0.56334}
Step 124160: {'train_ae_loss': 0.6753, 'train_ucc_loss': 0.45368, 'train_ucc_acc': 0.875, 'loss': 0.56449}
Step 124180: {'train_ae_loss': 0.64438, 'train_ucc_loss': 0.4428, 'train_ucc_acc': 0.84375, 'loss': 0.54359}
Step 124200: {'t

In [None]:
# Point MLflow at the local "mlruns" store and select an experiment by name;
# the object returned by set_experiment supplies the experiment id.
mlflow.set_tracking_uri("mlruns")
run_name = "camelyon-ucc-drn"  # passed to mlflow.set_experiment, i.e. used as the experiment name
experiment = mlflow.set_experiment(run_name)
experiment_id = experiment.experiment_id
cfg_name = "train_camelyon_ucc_drn"  # same config name the first cell composes

In [None]:
# Display the experiment id resolved by the previous cell.
experiment_id

In [7]:
import os

# Path prefix searched for in the MLflow meta.yaml files. It looks like a
# Google Drive mount path - presumably where the runs were originally
# recorded; verify.
prefix_to_replace = "/content/gdrive/MyDrive/UCCDRNPytorch/"
# Prefix substituted in its place.
prefix_replacement = "/Users/tanguanyu/UCC-DRN-Pytorch/"

In [20]:
# Rewrite the path prefix stored in the meta.yaml files of one experiment:
# every immediate sub-directory of the experiment, and every immediate
# sub-directory of its "models" folder, is expected to own a meta.yaml in which
# `prefix_to_replace` is swapped for `prefix_replacement`.
experiment_dir = "mlruns/152105657986962541"

# Step 1: collect the directories to visit. Only the first item yielded by
# os.walk is used at each level (hence the `break`s), so nothing deeper is
# touched.
meta_dirs = []
for root, dirs, files in os.walk(experiment_dir):
    for d in dirs:
        if d == "models":
            for rt, ds, fs in os.walk(f"{root}/models"):
                meta_dirs.extend(f"{rt}/{d_}" for d_ in ds)
                break
        else:
            meta_dirs.append(f"{root}/{d}")
    break

# Step 2: rewrite each meta.yaml. `string` is deliberately left at module level
# because later scratch cells inspect it.
for meta_dir in meta_dirs:
    meta_path = f"{meta_dir}/meta.yaml"
    if not os.path.isfile(meta_path):
        # A sub-directory without metadata must not abort the cell half-way
        # through and leave the tree partially rewritten.
        continue
    with open(meta_path, "r") as file:
        original = file.read()
    string = original.replace(prefix_to_replace, prefix_replacement)
    if string != original:
        # Only touch files whose content actually changes.
        with open(meta_path, "w") as file:
            file.write(string)

In [14]:
# Scratch check: preview `string` (left over from another cell) with the prefix
# replaced. Nothing is written to disk here.
string.replace(prefix_to_replace, prefix_replacement)

"artifact_uri: /Users/tanguanyu/UCC-DRN-Pytorch/camelyon/mlruns/152105657986962541/2601d759316f40c78dc0aa8a8f21b5ad/artifacts\nend_time: 1749875771310\nentry_point_name: ''\nexperiment_id: '152105657986962541'\nlifecycle_stage: active\nrun_id: 2601d759316f40c78dc0aa8a8f21b5ad\nrun_name: funny-stork-278\nsource_name: ''\nsource_type: 4\nsource_version: ''\nstart_time: 1749875771065\nstatus: 4\ntags: []\nuser_id: root\n"

In [15]:
# Scratch check: dump the current `string` to test.yaml in the working
# directory. "w" mode overwrites any existing test.yaml.
with open("test.yaml", "w") as file:
    file.write(string)

In [16]:
# Show the working directory that the relative "mlruns/..." paths resolve against.
print(os.getcwd())

/Users/tanguanyu/UCC-DRN-Pytorch/camelyon


In [30]:
import yaml
# Backfill a `run_uuid` key (copied from `run_id`) into the meta.yaml of every
# immediate sub-directory of the experiment except "models". `obj` is left at
# module level because the following scratch cell inspects it.
for root, dirs, files in os.walk("mlruns/152105657986962541"):
    for d in dirs:
        if d == "models":
            continue
        meta_path = f"{root}/{d}/meta.yaml"
        if not os.path.isfile(meta_path):
            # Not every sub-directory necessarily carries metadata; skip it
            # rather than aborting with a partially updated experiment.
            continue
        with open(meta_path, "r") as file:
            obj = yaml.safe_load(file)
        if not isinstance(obj, dict) or "run_id" not in obj:
            # Empty or unexpected YAML: there is nothing to copy from.
            continue
        if "run_uuid" not in obj:
            obj["run_uuid"] = obj["run_id"]
            # Serialise before opening for writing so a dump failure cannot
            # leave a truncated meta.yaml behind.
            dumped = yaml.safe_dump(obj)
            with open(meta_path, "w") as file:
                file.write(dumped)
    # Only the first os.walk item (the experiment's top level) is processed.
    break

In [29]:
# Scratch check: show the YAML text produced for the last `obj` loaded by
# another cell.
yaml.safe_dump(obj)

"artifact_uri: /Users/tanguanyu/UCC-DRN-Pytorch/camelyon/mlruns/152105657986962541/2601d759316f40c78dc0aa8a8f21b5ad/artifacts\nend_time: 1749875771310\nentry_point_name: ''\nexperiment_id: '152105657986962541'\nlifecycle_stage: active\nrun_id: 2601d759316f40c78dc0aa8a8f21b5ad\nrun_name: funny-stork-278\nsource_name: ''\nsource_type: 4\nsource_version: ''\nstart_time: 1749875771065\nstatus: 4\ntags: []\nuser_id: root\n"

In [36]:
# Collect every immediate sub-directory of the experiment (except "models")
# whose metrics/loss file is missing or empty. The resulting list `filess` is
# consumed by the cell that follows.
filess = []
for root, dirs, files in os.walk("mlruns/152105657986962541"):
    for d in dirs:
        if d == "models":
            continue
        candidate = f"{root}/{d}"
        loss_file_path = f"{candidate}/metrics/loss"
        if not os.path.exists(loss_file_path):
            # No loss metric was ever written for this directory.
            filess.append(candidate)
            continue
        with open(loss_file_path, "r") as file:
            loss_text = file.read()
        if len(loss_text) == 0:
            # The metric file exists but holds no entries.
            filess.append(candidate)
    # Only the experiment's top level is inspected.
    break

In [39]:
import shutil
# DESTRUCTIVE: permanently delete every directory collected in `filess`.
# The isdir guard keeps the cell idempotent: re-running it after the
# directories are gone (while `filess` still lists them) no longer raises
# FileNotFoundError.
for f in filess:
    if os.path.isdir(f):
        shutil.rmtree(f)