In [1]:
# Grid search over the lower/upper bounds of the uniform weight initialization of the DRN layers
# (cfg.model.drn.init_lower_bound / cfg.model.drn.init_upper_bound).
from copy import deepcopy
from hydra import compose, initialize
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import mlflow
import numpy as np
from typing import Tuple
from omegaconf.omegaconf import OmegaConf
from torch.utils.data import DataLoader

from model import UCCDRNModel
from dataset import Cifar10Dataset
from utils import get_or_create_experiment, parse_experiment_runs_to_optuna_study
# Candidate values for both init bounds: nominally -0.25, -0.20, ..., 0.30.
# NOTE(review): np.arange with a float step can gain or lose the last element
# through rounding -- confirm the grid if the end points matter.
x = np.arange(-0.25, 0.35, 0.05)

# Autograd anomaly detection is switched on for the whole notebook.
# NOTE(review): PyTorch documents this as a debugging aid that slows training;
# consider disabling it for long sweeps.
torch.autograd.set_detect_anomaly(True)

# Compose the Hydra config named below from the ../configs directory
# (path is relative to this notebook).
cfg_name = "train_cifar10_ucc_drn"
with initialize(config_path="../configs", version_base=None):
    cfg = compose(config_name=cfg_name)

In [2]:
def set_random_seed(seed):
    """Seed every random number generator this notebook may draw from.

    Seeds Python's built-in ``random`` module, NumPy's global generator and
    PyTorch (CPU generator plus the current CUDA device), and requests
    deterministic cuDNN algorithms.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Local import: `random` is not part of the notebook's import cell.
    import random

    # Previously left unseeded, so any code relying on `random` was not reproducible.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Safe to call without a GPU; PyTorch ignores it when CUDA is unavailable.
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

def init_model_and_optimizer(args, model_cfg, device):
    """Create a fresh ``UCCDRNModel`` on ``device`` together with its Adam optimizer.

    Args:
        args: Run arguments; only ``args.learning_rate`` is read here.
        model_cfg: Configuration object handed unchanged to ``UCCDRNModel``.
        device: Device the model is moved to.

    Returns:
        Tuple ``(model, optimizer)``.
    """
    net = UCCDRNModel(model_cfg)
    net = net.to(device)
    adam = torch.optim.Adam(net.parameters(), lr=args.learning_rate)
    return net, adam

def load_model_and_optimizer(experiment_id, run_id):
    """Load the pickled best model and optimizer stored for an MLflow run.

    Files are read from ``mlruns/<experiment_id>/<run_id>/artifacts`` relative
    to the current working directory.

    Args:
        experiment_id: MLflow experiment id (used as a directory name).
        run_id: MLflow run id (used as a directory name).

    Returns:
        Tuple ``(model, optimizer)`` exactly as they were pickled.
    """
    artifact_dir = f"mlruns/{experiment_id}/{run_id}/artifacts"
    # The keyword is `weights_only`; the former `weights=False` was forwarded
    # to the unpickler as an unknown argument and made the load fail.
    # SECURITY: weights_only=False performs full unpickling, which can execute
    # arbitrary code -- only load artifacts produced by trusted runs.
    model = torch.load(f"{artifact_dir}/best_model/data/model.pth", weights_only=False)
    optimizer = torch.load(f"{artifact_dir}/optimizer.pt", weights_only=False)
    return model, optimizer

def init_dataloader(args):
    """Build the training and validation ``DataLoader`` objects.

    Each split wraps a ``Cifar10Dataset`` whose length is
    ``<split>_num_steps * batch_size``. Only the training loader shuffles.

    Args:
        args: Run arguments providing ``train_num_steps``, ``val_num_steps``,
            ``batch_size``, ``num_workers``, ``num_instances``,
            ``num_samples_per_class``, ``ucc_start`` and ``ucc_end``.

    Returns:
        Tuple ``(train_loader, val_loader)``.
    """
    # Settings that are identical for both splits.
    shared_dataset_kwargs = dict(
        num_instances=args.num_instances,
        num_samples_per_class=args.num_samples_per_class,
        ucc_start=args.ucc_start,
        ucc_end=args.ucc_end,
    )

    # Datasets are created first (train, then val), loaders afterwards.
    datasets = {}
    for split, num_steps in (("train", args.train_num_steps), ("val", args.val_num_steps)):
        datasets[split] = Cifar10Dataset(
            mode=split,
            # presumably the ten CIFAR-10 class ids -- verify against Cifar10Dataset
            object_arr=list(range(10)),
            length=num_steps * args.batch_size,
            **shared_dataset_kwargs,
        )

    loaders = [
        DataLoader(
            datasets[split],
            batch_size=args.batch_size,
            num_workers=args.num_workers,
            shuffle=(split == "train"),
        )
        for split in ("train", "val")
    ]
    return loaders[0], loaders[1]

def evaluate(model, val_loader, device):
    """Run one gradient-free pass over ``val_loader`` and average the batch metrics.

    The model is switched to eval mode and left in that mode; callers that
    continue training must call ``model.train()`` themselves.

    Args:
        model: Callable returning ``(ucc_logits, reconstruction)`` when invoked
            with ``return_reconstruction=True``.
        val_loader: Iterable of ``(batch_samples, batch_labels)`` pairs.
        device: Device each batch is moved to.

    Returns:
        Dict with ``eval_ae_loss``, ``eval_ucc_loss`` and ``eval_ucc_acc``,
        each the mean over batches rounded to 5 decimals.
    """
    model.eval()
    ae_losses, ucc_losses, accuracies = [], [], []
    with torch.no_grad():
        for samples, labels in val_loader:
            samples = samples.to(device)
            labels = labels.to(device)

            logits, recon = model(samples, return_reconstruction=True)

            ucc_losses.append(F.cross_entropy(logits, labels).item())
            ae_losses.append(F.mse_loss(samples, recon).item())

            # Accuracy: fraction of samples whose highest-scoring class equals the label.
            predictions = torch.max(logits, dim=1).indices
            num_correct = torch.sum(predictions == labels).item()
            accuracies.append(num_correct / len(labels))

    per_batch = {
        "eval_ae_loss": ae_losses,
        "eval_ucc_loss": ucc_losses,
        "eval_ucc_acc": accuracies,
    }
    return {key: np.round(np.mean(values), 5) for key, values in per_batch.items()}

def train(args, model, optimizer, lr_scheduler, train_loader, val_loader, device, step=0):
    """Train ``model`` for one pass over ``train_loader`` while logging to MLflow.

    The loss is ``(1 - model.alpha) * ucc_loss + model.alpha * ae_loss`` where
    ``ucc_loss`` is the cross-entropy of the UCC logits and ``ae_loss`` the MSE
    between input and reconstruction. Every 20 steps the mean gradient of each
    parameter and the training metrics are logged. Every ``args.save_interval``
    steps the model is evaluated; when validation accuracy improves (or equals
    1.0) the model and the optimizer are stored as MLflow artifacts.

    Args:
        args: Run arguments; only ``args.save_interval`` is read here.
        model: Model returning ``(ucc_logits, reconstruction)`` and exposing ``alpha``.
        optimizer: Optimizer updating ``model``'s parameters.
        lr_scheduler: Accepted for interface compatibility; currently unused.
        train_loader: Iterable of ``(batch_samples, batch_labels)`` training batches.
        val_loader: Validation batches forwarded to ``evaluate``.
        device: Device each batch is moved to.
        step: Initial step counter; when 0 the untrained model is logged once
            under the ``best_model`` artifact path.

    Returns:
        The best validation accuracy observed (0 if no evaluation improved on it).
    """
    log_interval = 20          # steps between training-metric logs
    vanish_check_step = 1000   # the single step at which vanished gradients are checked
    vanish_ae_loss = 0.99      # only check when the reconstruction loss is still this high
    vanish_grad_eps = 1e-9     # mean-gradient magnitude treated as "vanished"
    max_steps = 200000         # hard stop, tested only at evaluation steps

    print("training")

    model.train()
    best_eval_acc = 0
    if step == 0:
        mlflow.pytorch.log_model(
            model,
            artifact_path="best_model"
        )
    for batch_samples, batch_labels in tqdm(train_loader):
        batch_samples = batch_samples.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        ucc_logits, reconstruction = model(batch_samples, return_reconstruction=True)
        ucc_loss = F.cross_entropy(ucc_logits, batch_labels)
        ae_loss = F.mse_loss(batch_samples, reconstruction)
        loss = (1-model.alpha)*ucc_loss + model.alpha*ae_loss

        loss.backward()

        optimizer.step()

        step += 1

        if step % log_interval == 0:
            with torch.no_grad():
                metric_dict = {}
                # Mean gradient per parameter; parameters without a gradient are skipped.
                grad_log = {name: torch.mean(param.grad).cpu().item(
                ) for name, param in model.named_parameters() if isinstance(param.grad, torch.Tensor)}
                if step == vanish_check_step and ae_loss.detach().item() > vanish_ae_loss:
                    # Abort runs whose encoder stopped learning. Magnitudes are
                    # compared so that a negative mean gradient is not mistaken
                    # for a vanished one, and an empty list no longer raises.
                    encoder_grad_magnitudes = [
                        abs(grad) for name, grad in grad_log.items()
                        if "encoder" in name and "weight" in name
                    ]
                    if encoder_grad_magnitudes and max(encoder_grad_magnitudes) < vanish_grad_eps:
                        break
                mlflow.log_metrics(grad_log, step=step)
                metric_dict["train_ae_loss"] = np.round(ae_loss.detach().item(), 5)
                _, pred = torch.max(ucc_logits, dim=1)
                accuracy = torch.sum(pred.flatten() == batch_labels.flatten())/len(batch_labels)
                metric_dict["train_ucc_loss"] = np.round(ucc_loss.detach().item(), 5)
                metric_dict["train_ucc_acc"] = np.round(float(accuracy), 5)
                metric_dict["loss"] = np.round(float(loss), 5)
                print(f"Step {step}:", metric_dict)
            mlflow.log_metrics(metric_dict, step=step)

        if step % args.save_interval == 0:
            eval_metric_dict = evaluate(
                model,
                val_loader,
                device)
            print(f"step: {step}," + ",".join([f"{key}: {value}"for key, value in eval_metric_dict.items()]))
            mlflow.log_metrics(eval_metric_dict, step=step)
            eval_acc = eval_metric_dict["eval_ucc_acc"]
            # A perfect score also re-saves the model even without strict improvement.
            if eval_acc > best_eval_acc or eval_acc == 1.0:
                best_eval_acc = eval_acc
                mlflow.log_metric("best_eval_acc", best_eval_acc)
                mlflow.pytorch.log_model(model, artifact_path="best_model")
                # The whole optimizer object is pickled into the working
                # directory and then attached to the run as an artifact.
                torch.save(optimizer, "optimizer.pt")
                mlflow.log_artifact("optimizer.pt")
            if step == max_steps:
                break
            # evaluate() leaves the model in eval mode; switch back before continuing.
            model.train()

    print("Training finished!!!")
    return best_eval_acc

In [5]:

# Sweep over (init_lower_bound, init_upper_bound) pairs taken from the grid `x`
# and train one model per valid pair, each inside its own MLflow run.

# NOTE(review): absolute, machine-specific tracking location -- adjust when
# running on another machine.
mlflow.set_tracking_uri("file:\\D:\\UCC-DRN-Pytorch\\cifar10\\mlruns")

run_name = "cifar10-ucc-drn-search-init"
experiment_id = get_or_create_experiment(experiment_name=run_name)
mlflow.set_experiment(experiment_id=experiment_id)

# Resume point: grid pairs ordered before (last_lower_bound, last_upper_bound)
# are skipped so an interrupted sweep can continue where it stopped.
last_lower_bound = -0.3
last_upper_bound = 0

# Pick the best available device. The former fallback selected "mps" whenever
# CUDA was missing, which fails on machines that have neither backend.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

for lower_bound in x:
    lower_bound = np.round(lower_bound, 5)
    for upper_bound in x:
        upper_bound = np.round(upper_bound, 5)
        if lower_bound<last_lower_bound:
            continue
        if lower_bound==last_lower_bound and upper_bound<last_upper_bound:
            continue
        # Only proper intervals (lower < upper) are trained.
        if lower_bound >= upper_bound:
            continue
        with mlflow.start_run(nested=True) as run:
            cfg.model.drn.init_lower_bound = float(lower_bound)
            cfg.model.drn.init_upper_bound = float(upper_bound)
            mlflow.log_params({
                "init_W_lower_bound": float(lower_bound),
                "init_W_upper_bound": float(upper_bound)
            })
            print(cfg.model.drn)
            print(experiment_id)
            cfg.args.learning_rate = 0.001
            mlflow.log_dict(dict(OmegaConf.to_object(cfg)), "config.yaml")
            args = cfg.args
            # Re-seed before every run so runs differ by their init bounds, not
            # by random state; the seed helper was defined but never called.
            set_random_seed(args.seed)
            model, optimizer = init_model_and_optimizer(args, cfg, device)
            train_loader, val_loader = init_dataloader(args)
            artifact_path = run.info.artifact_uri
            # Keep the untrained weights so the initialization can be inspected later.
            mlflow.pytorch.log_model(
                    model,
                    artifact_path = "init_model")
            best_acc = train(args, model, optimizer, None,
                            train_loader, val_loader, device)

{'num_bins': 11, 'hidden_q': 100, 'num_layers': 2, 'num_nodes': 9, 'init_method': 'uniform', 'init_upper_bound': -0.2, 'init_lower_bound': -0.25, 'output_bins': 4}
259546648097171860
(100, 11)
(100, 100)
(4, 100)
10000 val samples
10000 val samples




training




  0%|          | 0/100000 [00:00<?, ?it/s]

Step 20: {'train_ae_loss': 0.96709, 'train_ucc_loss': 1.41244, 'train_ucc_acc': 0.06667, 'loss': 1.18977}
Step 40: {'train_ae_loss': 0.98496, 'train_ucc_loss': 1.40366, 'train_ucc_acc': 0.13333, 'loss': 1.19431}
Step 60: {'train_ae_loss': 0.98913, 'train_ucc_loss': 1.38701, 'train_ucc_acc': 0.26667, 'loss': 1.18807}
Step 80: {'train_ae_loss': 1.00313, 'train_ucc_loss': 1.41504, 'train_ucc_acc': 0.06667, 'loss': 1.20909}
Step 100: {'train_ae_loss': 0.97731, 'train_ucc_loss': 1.387, 'train_ucc_acc': 0.26667, 'loss': 1.18215}
Step 120: {'train_ae_loss': 0.99633, 'train_ucc_loss': 1.3952, 'train_ucc_acc': 0.13333, 'loss': 1.19577}
Step 140: {'train_ae_loss': 0.96004, 'train_ucc_loss': 1.3965, 'train_ucc_acc': 0.06667, 'loss': 1.17827}
Step 160: {'train_ae_loss': 0.99367, 'train_ucc_loss': 1.40761, 'train_ucc_acc': 0.13333, 'loss': 1.20064}
Step 180: {'train_ae_loss': 0.97402, 'train_ucc_loss': 1.38374, 'train_ucc_acc': 0.26667, 'loss': 1.17888}
Step 200: {'train_ae_loss': 1.07271, 'train_u



training




  0%|          | 0/100000 [00:00<?, ?it/s]

Step 20: {'train_ae_loss': 1.0268, 'train_ucc_loss': 1.3927, 'train_ucc_acc': 0.2, 'loss': 1.20975}
Step 40: {'train_ae_loss': 1.02644, 'train_ucc_loss': 1.38617, 'train_ucc_acc': 0.26667, 'loss': 1.20631}
Step 60: {'train_ae_loss': 1.0034, 'train_ucc_loss': 1.39084, 'train_ucc_acc': 0.2, 'loss': 1.19712}
Step 80: {'train_ae_loss': 0.99395, 'train_ucc_loss': 1.38791, 'train_ucc_acc': 0.26667, 'loss': 1.19093}
Step 100: {'train_ae_loss': 0.94712, 'train_ucc_loss': 1.38177, 'train_ucc_acc': 0.33333, 'loss': 1.16444}
Step 120: {'train_ae_loss': 0.96795, 'train_ucc_loss': 1.37255, 'train_ucc_acc': 0.4, 'loss': 1.17025}
Step 140: {'train_ae_loss': 0.91093, 'train_ucc_loss': 1.37986, 'train_ucc_acc': 0.33333, 'loss': 1.1454}
Step 160: {'train_ae_loss': 1.04361, 'train_ucc_loss': 1.37841, 'train_ucc_acc': 0.33333, 'loss': 1.21101}
Step 180: {'train_ae_loss': 0.97229, 'train_ucc_loss': 1.3947, 'train_ucc_acc': 0.2, 'loss': 1.18349}
Step 200: {'train_ae_loss': 0.97939, 'train_ucc_loss': 1.35701



step: 1000,eval_ae_loss: 0.98881,eval_ucc_loss: 1.38645,eval_ucc_acc: 0.25




Step 1020: {'train_ae_loss': 0.96148, 'train_ucc_loss': 1.39512, 'train_ucc_acc': 0.13333, 'loss': 1.1783}
Step 1040: {'train_ae_loss': 1.01159, 'train_ucc_loss': 1.38542, 'train_ucc_acc': 0.2, 'loss': 1.1985}
Step 1060: {'train_ae_loss': 0.96836, 'train_ucc_loss': 1.38468, 'train_ucc_acc': 0.33333, 'loss': 1.17652}
Step 1080: {'train_ae_loss': 0.94212, 'train_ucc_loss': 1.39223, 'train_ucc_acc': 0.13333, 'loss': 1.16718}
Step 1100: {'train_ae_loss': 0.95178, 'train_ucc_loss': 1.38628, 'train_ucc_acc': 0.2, 'loss': 1.16903}
Step 1120: {'train_ae_loss': 0.9981, 'train_ucc_loss': 1.38611, 'train_ucc_acc': 0.2, 'loss': 1.1921}
Step 1140: {'train_ae_loss': 0.96227, 'train_ucc_loss': 1.38626, 'train_ucc_acc': 0.33333, 'loss': 1.17426}
Step 1160: {'train_ae_loss': 0.97367, 'train_ucc_loss': 1.38796, 'train_ucc_acc': 0.2, 'loss': 1.18081}
Step 1180: {'train_ae_loss': 1.02378, 'train_ucc_loss': 1.38247, 'train_ucc_acc': 0.33333, 'loss': 1.20312}
Step 1200: {'train_ae_loss': 0.96142, 'train_ucc

KeyboardInterrupt: 

In [None]:
# Display the composed Hydra config. The search cell overwrites
# cfg.model.drn.init_lower_bound / init_upper_bound and cfg.args.learning_rate
# in place, so the values shown depend on when this cell is run.
cfg

{'args': {'dataset': 'cifar10', 'model_dir': 'saved_models/', 'model_name': 'cifar10_ucc', 'num_instances': 32, 'ucc_start': 1, 'ucc_end': 4, 'batch_size': 15, 'num_samples_per_class': 5, 'num_workers': 4, 'learning_rate': 0.001, 'num_bins': 11, 'num_features': 10, 'train_num_steps': 100000, 'val_num_steps': 200, 'save_interval': 1000, 'seed': 22}, 'model': {'num_channels': 3, 'input_shape': [28, 28, 1], 'kde_model': {'num_bins': 11, 'sigma': 0.1}, 'encoder': {'conv_input_channel': 3, 'conv_output_channel': 16, 'block1_output_channel': 321, 'block1_num_layer': 1, 'block2_output_channel': 64, 'block2_num_layer': 1, 'block3_output_channel': 128, 'block3_num_layer': 1, 'flatten_size': 8192, 'num_features': 10}, 'decoder': {'linear_size': 8192, 'reshape_size': [7, 7, 128], 'block1_output_channel': 64, 'block1_num_layer': 1, 'block2_output_channel': 32, 'block2_num_layer': 1, 'block3_output_channel': 16, 'block3_num_layer': 1, 'output_channel': 3}, 'drn': {'num_bins': 11, 'hidden_q': 100, '