## Load libraries

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import math
import os,sys
import re
import shlex
import time
from datetime import datetime
sys.dont_write_bytecode = True

In [3]:
import pandas as pd
import joblib
import numpy as np
import matplotlib.pyplot as plt

from pathlib import Path
from typing import List, Set, Dict, Tuple, Optional, Iterable, Mapping, Union, Callable, TypeVar

from pprint import pprint
from ipdb import set_trace as brpt

In [4]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from  torch.linalg import norm as tnorm
from torch.utils.data import Dataset, DataLoader, random_split
import torchvision
from torchvision import datasets, transforms

import pytorch_lightning as pl
from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning import loggers as pl_loggers
from pytorch_lightning.tuner.tuning import Tuner


# Select the visible GPU: order devices by PCI bus id and expose only device "1".
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "1",
})

## Set Path 
1. Add this notebook's folder and the project root to `sys.path` (the `src` package is then importable from the project root)
2. Set `DATA_ROOT` to the `maptiles_v2` folder

In [5]:
# Folders used by this notebook, all derived from the kernel's working directory.
this_nb_path = Path.cwd()
ROOT = this_nb_path.parent
SRC = ROOT / 'src'
DATA_ROOT = Path("/data/hayley-old/maptiles_v2/")
paths2add = [this_nb_path, ROOT]

for label, folder in (("Project root: ", ROOT),
                      ('Src folder: ', SRC),
                      ("This nb path: ", this_nb_path)):
    print(label, str(folder))


# Prepend each folder that is not yet importable; membership is re-checked on every
# iteration, so a folder is never inserted twice.
for candidate in map(str, paths2add):
    if candidate in sys.path:
        continue
    sys.path.insert(0, candidate)
    print(f"\n{candidate} added to the path.")
        
# print(sys.path)

Project root:  /data/hayley-old/Tenanbaum2000
Src folder:  /data/hayley-old/Tenanbaum2000/src
This nb path:  /data/hayley-old/Tenanbaum2000/nbs

/data/hayley-old/Tenanbaum2000 added to the path.


In [6]:
# Data transforms
from src.data.transforms.transforms import Identity, Unnormalizer, LinearRescaler
from src.data.transforms.functional import unnormalize

# Utils
from src.visualize.utils import show_timg, show_timgs, show_batch, make_grid_from_tensors
from src.utils.misc import info, get_next_version_path
from collections import OrderedDict

In [7]:
# DataModules
from src.data.datamodules import MNISTDataModule, MNISTMDataModule, MonoMNISTDataModule
from src.data.datamodules import MultiMonoMNISTDataModule
from src.data.datamodules.multisource_rotated_mnist_datamodule import MultiRotatedMNISTDataModule
from src.data.datamodules.multisource_maptiles_datamodule import MultiMaptilesDataModule


# plModules
from src.models.plmodules.vanilla_vae import VanillaVAE
from src.models.plmodules.iwae import IWAE
from src.models.plmodules.bilatent_vae import BiVAE

# Evaluations
from torch.utils.tensorboard import SummaryWriter
from pytorch_lightning.utilities.cloud_io import load as pl_load
from src.evaluator.qualitative import save_content_transfers, save_style_transfers, run_both_transfers

In [8]:
"""
Find the best hparam setting for a specific BiVAE model trained on a specific datamodule.

Required args:
    --model_name: eg. "vae", "iwae", "bivae"
    --data_name: eg. "maptiles", "mnist", "multi_mono_mnist"
    --latent_dim: int, eg. 10

Optional args: (partial)
    --hidden_dims: eg. --hidden_dims 32 64 128 256 (which is default)

Hyperparameter space:
- latent_dim = [16, 32, 63, 128]
- is_contrasive =  [False, True]
- kld_weight = [
- adv_loss_weight = [5, 15, 45, 135, 405, 1215]
- batch_size = [32, 64, 128, 256, 514, 1028]
- learning_rate =

To run (at the root of the project, i.e. /data/hayley-old/Tenanbaum2000):
# Values for adv_weight, latent_dim, batch_size, lr, is_contrasive will be overwritten
# by the searched hyperparameter values

 nohup python tune_hparams_bivae.py --model_name="bivae" \
--latent_dim=10 --hidden_dims 32 64 128 256 --adv_dim 32 32 32 --adv_weight 15.0 \
--data_name="multi_mono_mnist" --colors red green blue --n_styles=3 \
--gpu_id=2 --max_epochs=300 --batch_size=128 -lr 1e-3  --terminate_on_nan=True  \
--log_root="/data/hayley-old/Tenanbaum2000/lightning_logs/2021-01-13-ray/" &

 nohup python tune_hparams_bivae.py --model_name="bivae" \
--latent_dim=10 --hidden_dims 32 64 128 256 --adv_dim 32 32 32 --adv_weight 15.0 \
--use_beta_scheduler \
--data_name="multi_mono_mnist" --colors red green blue --n_styles=3 \
--gpu_id=2 --max_epochs=300 --batch_size=128 -lr 1e-3  --terminate_on_nan=True  \
--log_root="/data/hayley-old/Tenanbaum2000/lightning_logs/2021-01-14-ray/" &

# View the Ray dashboard at http://127.0.0.1:8265
# Run this at  local terminal:
# ssh -NfL 8265:localhost:8265 arya

"""

import os
import time
from argparse import ArgumentParser, Namespace
from collections import OrderedDict
from pathlib import Path
from typing import List, Set,Any, Dict, Tuple, Optional, Iterable, Mapping, Union, Callable, TypeVar
import warnings
from pprint import pprint

import numpy as np
import torch
import torch.nn as nn

import torchvision
import pytorch_lightning as pl
from pytorch_lightning import loggers as pl_loggers
from pytorch_lightning.callbacks import LearningRateMonitor

# Ray
import ray
from ray import tune
from ray.tune import CLIReporter
from ray.tune.integration.pytorch_lightning import TuneReportCallback

from src.callbacks.recon_logger import ReconLogger
from src.callbacks.hist_logger import  HistogramLogger
from src.callbacks.beta_scheduler import BetaScheduler

# src helpers
from src.utils.misc import info, n_iter_per_epoch
from src.models.model_wrapper import ModelWrapper

# utils for instantiating a selected datamodule and a selected model
from utils import get_model_class, get_dm_class
from utils import instantiate_model, instantiate_dm
from utils import add_base_arguments

In [9]:
def train_tune(args: Union[Dict, Namespace]):
    """Train one model/datamodule configuration and report validation metrics to Ray Tune.

    Steps: instantiate the datamodule and the model from `args`, create a TensorBoard
    logger under `args.log_root`, attach the callbacks (learning-rate monitor, Ray Tune
    reporter and, optionally, the beta scheduler), fit the model, and finally log the
    hyperparameters together with the best checkpoint score.

    Parameters
    ----------
    args : Union[Dict, Namespace]
        Experiment settings. A plain dict is converted to a `Namespace` first, because
        every setting below is read by attribute.
        Attributes read directly here: `log_root`, `use_beta_scheduler`, `max_epochs`
        and, when the beta scheduler is enabled, `beta_start`, `beta_stop`,
        `beta_n_cycle`, `beta_ratio` and `beta_log_tag`. The whole object is also
        handed to `instantiate_dm`, `instantiate_model` and
        `pl.Trainer.from_argparse_args`.

    Returns
    -------
    None. Results are written to the TensorBoard log directory and reported to Ray Tune
    through `TuneReportCallback`.
    """
    # The annotation allows a dict, but the code below relies on attribute access.
    if isinstance(args, dict):
        args = Namespace(**args)

    # Init. datamodule and model
    dm = instantiate_dm(args)
    dm.setup('fit')
    model = instantiate_model(args)

    # Specify logger
    exp_name = f'{model.name}_{dm.name}'
    print('Exp name: ', exp_name)
    tb_logger = pl_loggers.TensorBoardLogger(save_dir=args.log_root,
                                             name=exp_name,
                                             default_hp_metric=False,
                                             )
    log_dir = Path(tb_logger.log_dir)
    print("Log Dir: ", log_dir)
    if not log_dir.exists():
        # exist_ok: another process may create the folder between the check and the call
        log_dir.mkdir(parents=True, exist_ok=True)
        print("Created: ", log_dir)

    # Specify callbacks
    callbacks = [
        LearningRateMonitor(logging_interval='epoch'),
        # Maps Ray Tune metric names (keys) to the names logged by the pl.Module (values)
        TuneReportCallback(
            {
            'loss': 'val_loss',
            'mean_accuracy': 'val/style_acc', # use the string after pl.Module's "self.log("
            },
            on="validation_end"
        ),
        # HistogramLogger(hist_epoch_interval=args.hist_epoch_interval),
        # ReconLogger(recon_epoch_interval=args.recon_epoch_interval),
        #         EarlyStopping('val_loss', patience=10),
    ]
    if args.use_beta_scheduler:
        # The scheduler is sized by the total number of training iterations
        max_iters = n_iter_per_epoch(dm.train_dataloader()) * args.max_epochs
        callbacks.append(BetaScheduler(max_iters,
                                       start=args.beta_start,
                                       stop=args.beta_stop,
                                       n_cycle=args.beta_n_cycle,
                                       ratio=args.beta_ratio,
                                       log_tag=args.beta_log_tag))

    # These values take precedence over whatever `args` carries for the same Trainer options
    trainer_overwrites = {
        'gpus':1, #use a single gpu
        'progress_bar_refresh_rate':0, # don't print out progress bar
        'terminate_on_nan':True,
        'check_val_every_n_epoch':10,
        'logger': tb_logger,
        'callbacks': callbacks
    }

    # Init. trainer
    trainer = pl.Trainer.from_argparse_args(args, **trainer_overwrites)

    # Log model's computational graph
    model_wrapper = ModelWrapper(model)
    # tb_logger.experiment.add_graph(model_wrapper, model.)
    tb_logger.log_graph(model_wrapper)


    # ------------------------------------------------------------------------
    # Run the experiment
    # ------------------------------------------------------------------------
    start_time = time.time()
    print(f"{exp_name} started...")
    print(f"Logging to {Path(tb_logger.log_dir).absolute()}")
    trainer.fit(model, dm)
    print(f"Finished at ep {trainer.current_epoch, trainer.batch_idx}")


    # ------------------------------------------------------------------------
    # Log the best score and current experiment's hyperparameters
    # ------------------------------------------------------------------------
    hparams = model.hparams.copy()
    hparams.update(dm.hparams)

    # The checkpoint callback (or its best score) can be missing, e.g. when no
    # checkpoint was ever saved; do not lose the finished run to an AttributeError.
    best_model_score = getattr(trainer.checkpoint_callback, 'best_model_score', None)
    if best_model_score is None:
        warnings.warn("No best_model_score is available from the checkpoint callback; "
                      "logging NaN as 'hparam/best_score'.")
        best_score = float('nan')
    else:
        best_score = best_model_score.item()
    metrics = {'hparam/best_score': best_score}  # todo: define a metric and use it here
    trainer.logger.log_hyperparams(hparams, metrics)

    print("Logged hparams and metrics...")
    print("\t hparams: ")
    pprint(hparams)
    print("=====")
    print("\t metrics: ", metrics)
    print(f"Training Done: took {time.time() - start_time}")

    # ------------------------------------------------------------------------
    # Evaluation
    #   1. Reconstructions:
    #     x --> model.encoder(x) --> theta_z
    #     --> sample N latent codes from the Pr(z; theta_z)
    #     --> model.decoder(z) for each sampled z's
    #   2. Embedding:
    #       a mini-batch input -> mu_z, logvar_z
    #       -> rsample
    #       -> project to 2D -> visualize
    #   3. Inspect the topology/landscape of the learned latent space
    #     Latent traversal: Pick a dimension of the latent space.
    #     - Keep all other dimensions' values constant.
    #     - Vary the chosen dimenion's values (eg. linearly, spherically)
    #     - and decode the latent codes. Show the outputs of the decoder.
    #   4. Marginal Loglikelihood of train/val/test dataset
    # ------------------------------------------------------------------------
    # print("Evaluations...")
    # model.eval()

In [10]:
def main(user_args: List[str]):
    """Run a Ray Tune hyperparameter search from a CLI-style argument list.

    The arguments are parsed in two passes: the base arguments are read first to learn
    `model_name` and `data_name`; the matching model and datamodule classes, the
    Lightning `Trainer`, the Ray options and `BetaScheduler` then register their own
    arguments before the final parse. Every sampled configuration is trained with
    `train_tune`.

    Note: a later cell of this notebook defines `main` again (with an explicit Ray
    restart before the search); once that cell has run, it shadows this definition.

    Parameters
    ----------
    user_args : List[str]
        Argument tokens, e.g. `['--model_name', 'bivae', '--gpu_ids', '0', ...]`.
        `--gpu_ids` is required and needs at least one id.

    Returns
    -------
    The analysis object returned by `tune.run`, so the caller can inspect the trials
    (e.g. `analysis.best_config`, `analysis.fetch_trial_dataframes()`).
    """
    parser = ArgumentParser()

    # ------------------------------------------------------------------------
    # Add general arguments for this CLI script for training/testing
    # ------------------------------------------------------------------------
    parser = add_base_arguments(parser)
    # First pass: only the base arguments are known yet, the rest is ignored for now
    args, _ = parser.parse_known_args(user_args)
    print("Base CLI args: ")
    pprint(args)

    # ------------------------------------------------------------------------
    # Add model/datamodule/trainer specific args
    # ------------------------------------------------------------------------
    model_class = get_model_class(args.model_name)
    dm_class = get_dm_class(args.data_name)
    parser = model_class.add_model_specific_args(parser)
    parser = dm_class.add_model_specific_args(parser)
    parser = pl.Trainer.add_argparse_args(parser)

    # RayTune args
    parser.add_argument('--n_cpus',  type=int, default=8, help='Num of CPUs per trial')
    # nargs='+': an empty id list would hide every GPU while each trial still asks for one
    parser.add_argument("--gpu_ids", type=str, required=True, nargs='+',
                        help="GPU ID(s) to use")
    parser.add_argument("--n_ray_samples", type=int, default=1,
                         help="Num of Ray Tune's run argument, num_samples")
    parser.add_argument("--ray_log_dir", type=str, default="/data/log/ray",
                        help="dir to save training results from Ray")
    # Callback switch args
    parser = BetaScheduler.add_argparse_args(parser)
    # parser.add_argument("--hist_epoch_interval", type=int, default=10, help="Epoch interval to plot histogram of q's parameter")
    # parser.add_argument("--recon_epoch_interval", type=int, default=10, help="Epoch interval to plot reconstructions of train and val samples")
    args = parser.parse_args(user_args)
    print("Final args: ")
    pprint(args)

    # Select Visible GPU
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(args.gpu_ids)
    print("===GPUs===")
    print(os.environ["CUDA_VISIBLE_DEVICES"])

    def set_hparam_and_train_closure(config: Dict[str, Any]):
        """Train once with `args` overridden by the (k, v) pairs in `config`.

        Parameters
        ----------
        config: one sample point from Ray's hyperparameter space, as a
            Dict[hparam-name, value of the hparam]. Its entries overwrite the entries
            of `args` that have the same name.

        Returns
        -------
        None. Trains the model with the resulting settings.
        """
        print("Inside the clousure===")
        pprint(args)
        print("===")
        pprint(config)

        # Override a per-trial copy: the parsed `args` stay untouched, so values sampled
        # for one call never leak into another call that shares this closure.
        trial_args = Namespace(**vars(args))
        for k, v in config.items():
            setattr(trial_args, k, v)
            print("Overwrote args: ", k)

        # Start experiment with this overwritten hyperparams
        train_tune(trial_args)

    # ------------------------------------------------------------------------
    # Specify hyperparameter search space
    # ------------------------------------------------------------------------
    # Candidate values are given as plain Python lists so that the sampled values are
    # built-in floats rather than numpy scalars.
    search_space = {
        # "latent_dim": tune.grid_search([10, 20, 60, 100]),
        'enc_type': tune.choice(['conv', 'resnet']),
        'dec_type': tune.choice(['conv', 'resnet']),
        'is_contrasive': tune.choice([False, True]),
        'kld_weight': tune.choice([0.5*(2**i) for i in range(12)]), # 0.5, 1.0, 2.0, ..., 1024.0
        'use_beta_scheduler': False, #tune.grid_search([False,True]),
        'adv_loss_weight': tune.choice(np.logspace(0.0, 7.0, num=8, base=3.0).tolist()), # 3**0 ... 3**7
        'learning_rate': tune.loguniform(1e-4, 1e-1), #tune.grid_search(list(np.logspace(-4., -1, num=10))),
        'batch_size': tune.choice([32, 64, 128,]),
    }

    # ------------------------------------------------------------------------
    # Start hyperparameter search using Ray
    # ------------------------------------------------------------------------
    # (An explicit restart -- ray.shutdown(); ray.init(log_to_driver=False) -- is left
    # disabled in this definition.)
    # search_alg =

    reporter = CLIReporter(
        parameter_columns=list(search_space.keys()),
        metric_columns=["loss", "mean_accuracy", "training_iteration"])

    analysis = tune.run(
        set_hparam_and_train_closure,
        config=search_space,
        metric='loss', #set to val_loss
        mode='min',
        # search_alg=search_alg,
        num_samples=args.n_ray_samples,
        verbose=1,
        progress_reporter=reporter,
        name="Tune-BiVAE", # name of experiment
        local_dir= args.ray_log_dir,
        # Each trial reserves `n_cpus` CPUs and every GPU listed in `--gpu_ids`.
        # NOTE(review): `train_tune` trains with `gpus=1`, so with several GPU ids a
        # trial reserves GPUs it does not use -- confirm whether "gpu": 1 was intended.
        resources_per_trial={"cpu":args.n_cpus, "gpu": len(args.gpu_ids)},
    )
    print("Best hyperparameters found were: ", analysis.best_config)

    # Hand the results back instead of dropping them
    return analysis
    
    
    
    
#         # Debug
#     config = {
#         # "latent_dim": tune.grid_search([10, 20, 60, 100]),
#         'enc_type': 'conv',
#         'dec_type': 'resnet',
#         'is_contrasive': False,
#         'kld_weight': 1.0, 
#         'use_beta_scheduler': False, #tune.grid_search([False,True]),
#         'adv_loss_weight': 1.0, 
#         'learning_rate': 1e-4, 
#         'batch_size': 32,
#     }
#     set_hparam_and_train_closure(config)

In [15]:
def main(user_args: List[str]):
    """Run a Ray Tune hyperparameter search from a CLI-style argument list.

    The arguments are parsed in two passes: the base arguments are read first to learn
    `model_name` and `data_name`; the matching model and datamodule classes, the
    Lightning `Trainer`, the Ray options and `BetaScheduler` then register their own
    arguments before the final parse. Ray is shut down and re-initialised before the
    search, and every sampled configuration is trained with `train_tune`.

    Note: an earlier cell of this notebook also defines `main` (without the Ray
    restart); this definition replaces it once this cell has run.

    Parameters
    ----------
    user_args : List[str]
        Argument tokens, e.g. `['--model_name', 'bivae', '--gpu_ids', '0', ...]`.
        `--gpu_ids` is required and needs at least one id.

    Returns
    -------
    The analysis object returned by `tune.run`, so the caller can inspect the trials
    (e.g. `analysis.best_config`, `analysis.fetch_trial_dataframes()`).
    """
    parser = ArgumentParser()

    # ------------------------------------------------------------------------
    # Add general arguments for this CLI script for training/testing
    # ------------------------------------------------------------------------
    parser = add_base_arguments(parser)
    # First pass: only the base arguments are known yet, the rest is ignored for now
    args, _ = parser.parse_known_args(user_args)
    print("Base CLI args: ")
    pprint(args)

    # ------------------------------------------------------------------------
    # Add model/datamodule/trainer specific args
    # ------------------------------------------------------------------------
    model_class = get_model_class(args.model_name)
    dm_class = get_dm_class(args.data_name)
    parser = model_class.add_model_specific_args(parser)
    parser = dm_class.add_model_specific_args(parser)
    parser = pl.Trainer.add_argparse_args(parser)

    # RayTune args
    parser.add_argument('--n_cpus',  type=int, default=8, help='Num of CPUs per trial')
    # nargs='+': an empty id list would hide every GPU while each trial still asks for one
    parser.add_argument("--gpu_ids", type=str, required=True, nargs='+',
                        help="GPU ID(s) to use")
    parser.add_argument("--n_ray_samples", type=int, default=1,
                         help="Num of Ray Tune's run argument, num_samples")
    parser.add_argument("--ray_log_dir", type=str, default="/data/log/ray",
                        help="dir to save training results from Ray")
    # Callback switch args
    parser = BetaScheduler.add_argparse_args(parser)
    # parser.add_argument("--hist_epoch_interval", type=int, default=10, help="Epoch interval to plot histogram of q's parameter")
    # parser.add_argument("--recon_epoch_interval", type=int, default=10, help="Epoch interval to plot reconstructions of train and val samples")
    args = parser.parse_args(user_args)
    print("Final args: ")
    pprint(args)

    # Select Visible GPU
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(args.gpu_ids)
    print("===GPUs===")
    print(os.environ["CUDA_VISIBLE_DEVICES"])

    def set_hparam_and_train_closure(config: Dict[str, Any]):
        """Train once with `args` overridden by the (k, v) pairs in `config`.

        Parameters
        ----------
        config: one sample point from Ray's hyperparameter space, as a
            Dict[hparam-name, value of the hparam]. Its entries overwrite the entries
            of `args` that have the same name.

        Returns
        -------
        None. Trains the model with the resulting settings.
        """
        print("Inside the clousure===")
        pprint(args)
        print("===")
        pprint(config)

        # Override a per-trial copy: the parsed `args` stay untouched, so values sampled
        # for one call never leak into another call that shares this closure.
        trial_args = Namespace(**vars(args))
        for k, v in config.items():
            setattr(trial_args, k, v)
            print("Overwrote args: ", k)

        # Start experiment with this overwritten hyperparams
        train_tune(trial_args)

    # ------------------------------------------------------------------------
    # Specify hyperparameter search space
    # ------------------------------------------------------------------------
    # Candidate values are given as plain Python lists so that the sampled values are
    # built-in floats rather than numpy scalars.
    search_space = {
        # "latent_dim": tune.grid_search([10, 20, 60, 100]),
        'enc_type': tune.choice(['conv', 'resnet']),
        'dec_type': tune.choice(['conv', 'resnet']),
        'is_contrasive': tune.choice([False, True]),
        'kld_weight': tune.choice([0.5*(2**i) for i in range(12)]), # 0.5, 1.0, 2.0, ..., 1024.0
        'use_beta_scheduler': False, #tune.grid_search([False,True]),
        'adv_loss_weight': tune.choice(np.logspace(0.0, 7.0, num=8, base=3.0).tolist()), # 3**0 ... 3**7
        'learning_rate': tune.loguniform(1e-4, 1e-1), #tune.grid_search(list(np.logspace(-4., -1, num=10))),
        'batch_size': tune.choice([32, 64, 128,]),
    }

    # ------------------------------------------------------------------------
    # Start hyperparameter search using Ray
    # ------------------------------------------------------------------------
    # Restart Ray so that repeated calls from the same kernel begin from a fresh session
    ray.shutdown()
    ray.init(log_to_driver=False)
    # search_alg =

    reporter = CLIReporter(
        parameter_columns=list(search_space.keys()),
        metric_columns=["loss", "mean_accuracy", "training_iteration"])

    analysis = tune.run(
        set_hparam_and_train_closure,
        config=search_space,
        metric='loss', #set to val_loss
        mode='min',
        # search_alg=search_alg,
        num_samples=args.n_ray_samples,
        verbose=1,
        progress_reporter=reporter,
        name="Tune-BiVAE", # name of experiment
        local_dir= args.ray_log_dir,
        # Each trial reserves `n_cpus` CPUs and every GPU listed in `--gpu_ids`.
        # NOTE(review): `train_tune` trains with `gpus=1`, so with several GPU ids a
        # trial reserves GPUs it does not use -- confirm whether "gpu": 1 was intended.
        resources_per_trial={"cpu":args.n_cpus, "gpu": len(args.gpu_ids)},
    )
    print("Best hyperparameters found were: ", analysis.best_config)

    # Hand the results back instead of dropping them
    return analysis
    
    
    
    
#         # Debug
#     config = {
#         # "latent_dim": tune.grid_search([10, 20, 60, 100]),
#         'enc_type': 'conv',
#         'dec_type': 'resnet',
#         'is_contrasive': False,
#         'kld_weight': 1.0, 
#         'use_beta_scheduler': False, #tune.grid_search([False,True]),
#         'adv_loss_weight': 1.0, 
#         'learning_rate': 1e-4, 
#         'batch_size': 32,
#     }
#     set_hparam_and_train_closure(config)

In [16]:
# CLI-style argument string for `main`; a later cell turns it into an argv-like list.
# It names the model and the datamodule, the data subsets (cities, styles, zooms),
# the GPU id, the epoch budget, the number of Ray samples and the log root.
cmd = """--model_name bivae 
--latent_dim 20 
--hidden_dims 32 64 128 256 512 
--adv_dim 32 32 32 
--data_name "multi_maptiles" 
--cities 'la' 'charlotte' 'vegas' 'boston' 'paris' 'amsterdam' 'shanghai' 'seoul' 'chicago' 'manhattan' 'berlin' 'montreal' 'rome' 
--styles StamenTonerBackground --n_styles 1 
--zooms 14 
--gpu_ids 0 --max_epochs 150 --terminate_on_nan True 
--n_ray_samples 1 
--log_root "/data/hayley-old/Tenanbaum2000/lightning_logs/2021-03-08-ray/"
"""

In [17]:
# Tokenize the argument string with shell rules: quoted values are unquoted and any
# whitespace (newlines included) separates tokens, so a line without a trailing space
# can no longer fuse with the next one.
user_args = shlex.split(cmd)

In [18]:
print(user_args)

['--model_name', 'bivae', '--latent_dim', '20', '--hidden_dims', '32', '64', '128', '256', '512', '--adv_dim', '32', '32', '32', '--data_name', 'multi_maptiles', '--cities', 'la', 'charlotte', 'vegas', 'boston', 'paris', 'amsterdam', 'shanghai', 'seoul', 'chicago', 'manhattan', 'berlin', 'montreal', 'rome', '--styles', 'StamenTonerBackground', '--n_styles', '1', '--zooms', '14', '--gpu_ids', '0', '--max_epochs', '150', '--terminate_on_nan', 'True', '--n_ray_samples', '1', '--log_root', '/data/hayley-old/Tenanbaum2000/lightning_logs/2021-03-08-ray/']


In [19]:
main(user_args)

Base CLI args: 
Namespace(data_name='multi_maptiles', log_root='/data/hayley-old/Tenanbaum2000/lightning_logs/2021-03-08-ray/', mode='fit', model_name='bivae', verbose=False)
Final args: 
Namespace(accelerator=None, accumulate_grad_batches=1, act_fn='leaky_relu', adv_loss_weight=1.0, adversary_dims=[32, 32, 32], amp_backend='native', amp_level='O2', auto_lr_find=False, auto_scale_batch_size=False, auto_select_gpus=False, automatic_optimization=None, batch_size=32, benchmark=False, beta_log_tag='train/beta', beta_n_cycle=4, beta_ratio=0.5, beta_start=0.0, beta_stop=1.0, check_val_every_n_epoch=1, checkpoint_callback=True, cities=['la', 'charlotte', 'vegas', 'boston', 'paris', 'amsterdam', 'shanghai', 'seoul', 'chicago', 'manhattan', 'berlin', 'montreal', 'rome'], data_name='multi_maptiles', data_root='/data/hayley-old/maptiles_v2/', dec_type='conv', default_root_dir=None, deterministic=False, distributed_backend=None, enable_pl_optimizer=True, enc_type='conv', fast_dev_run=False, flush_

2021-03-08 15:33:03,392	INFO services.py:1172 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8267[39m[22m


== Status ==
Memory usage on this node: 18.4/62.6 GiB
Using FIFO scheduling algorithm.
Resources requested: 8/16 CPUs, 1/1 GPUs, 0.0/27.78 GiB heap, 0.0/9.57 GiB objects (0/1.0 accelerator_type:X)
Result logdir: /data/log/ray/Tune-BiVAE
Number of trials: 1/1 (1 RUNNING)




2021-03-08 15:33:07,023	ERROR trial_runner.py:616 -- Trial set_hparam_and_train_closure_a2338_00000: Error processing event.
Traceback (most recent call last):
  File "/home/hayley/miniconda3/envs/test/lib/python3.8/site-packages/ray/tune/trial_runner.py", line 586, in _process_trial
    results = self.trial_executor.fetch_result(trial)
  File "/home/hayley/miniconda3/envs/test/lib/python3.8/site-packages/ray/tune/ray_trial_executor.py", line 609, in fetch_result
    result = ray.get(trial_future[0], timeout=DEFAULT_GET_TIMEOUT)
  File "/home/hayley/miniconda3/envs/test/lib/python3.8/site-packages/ray/_private/client_mode_hook.py", line 47, in wrapper
    return func(*args, **kwargs)
  File "/home/hayley/miniconda3/envs/test/lib/python3.8/site-packages/ray/worker.py", line 1456, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(RuntimeError): [36mray::ImplicitFunc.train_buffered()[39m (pid=27123, ip=68.181.30.41)
  File "python/ray/_raylet.pyx", line 439, in ra

== Status ==
Memory usage on this node: 18.5/62.6 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/16 CPUs, 0/1 GPUs, 0.0/27.78 GiB heap, 0.0/9.57 GiB objects (0/1.0 accelerator_type:X)
Result logdir: /data/log/ray/Tune-BiVAE
Number of trials: 1/1 (1 ERROR)
Number of errored trials: 1
+------------------------------------------+--------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                               |   # failures | error file                                                                                                                                                                                |
|------------------------------------------+--------------+----------------------------------------------------------------------------------------------------------------------------------------------------

TuneError: ('Trials did not complete', [set_hparam_and_train_closure_a2338_00000])

In [66]:
# Tokenize the argument string with shell rules: quoted values are unquoted and any
# whitespace (newlines included) separates tokens, so a line without a trailing space
# can no longer fuse with the next one.
user_args = shlex.split(cmd)

In [67]:
print(user_args)

['--model_name', 'bivae', '--latent_dim', '20', '--hidden_dims', '32', '64', '128', '256', '512', '--adv_dim', '32', '32', '32', '--data_name', 'multi_maptiles', '--cities', 'la', 'charlotte', 'vegas', 'boston', 'paris', 'amsterdam', 'shanghai', 'seoul', 'chicago', 'manhattan', 'berlin', 'montreal', 'rome', '--styles', 'StamenTonerBackground', '--n_styles', '1', '--zooms', '14', '--gpu_ids', '0', '--max_epochs', '150', '--terminate_on_nan', 'True', '--n_ray_samples', '1', '--log_root', '/data/hayley-old/Tenanbaum2000/lightning_logs/2021-03-08-ray/']


In [68]:
main(user_args)

Base CLI args: 
Namespace(data_name='multi_maptiles', log_root='/data/hayley-old/Tenanbaum2000/lightning_logs/2021-03-08-ray/', mode='fit', model_name='bivae', verbose=False)
Final args: 
Namespace(accelerator=None, accumulate_grad_batches=1, act_fn='leaky_relu', adv_loss_weight=1.0, adversary_dims=[32, 32, 32], amp_backend='native', amp_level='O2', auto_lr_find=False, auto_scale_batch_size=False, auto_select_gpus=False, automatic_optimization=None, batch_size=32, benchmark=False, beta_log_tag='train/beta', beta_n_cycle=4, beta_ratio=0.5, beta_start=0.0, beta_stop=1.0, check_val_every_n_epoch=1, checkpoint_callback=True, cities=['la', 'charlotte', 'vegas', 'boston', 'paris', 'amsterdam', 'shanghai', 'seoul', 'chicago', 'manhattan', 'berlin', 'montreal', 'rome'], data_name='multi_maptiles', data_root='/data/hayley-old/maptiles_v2/', dec_type='conv', default_root_dir=None, deterministic=False, distributed_backend=None, enable_pl_optimizer=True, enc_type='conv', fast_dev_run=False, flush_

Missing logger folder: /data/hayley-old/Tenanbaum2000/lightning_logs/2021-03-08-ray/BiVAE-conv-resnet-1.0-1.0_Maptiles_la-charlotte-vegas-boston-paris-amsterdam-shanghai-seoul-chicago-manhattan-berlin-montreal-rome_StamenTonerBackground_14


n_train, n_val:  3941 1688
train channelwise_mean,std:  [0.86187316 0.86187316 0.86187316] [0.33095462 0.33095462 0.33095462]
Exp name:  BiVAE-conv-resnet-1.0-1.0_Maptiles_la-charlotte-vegas-boston-paris-amsterdam-shanghai-seoul-chicago-manhattan-berlin-montreal-rome_StamenTonerBackground_14
Log Dir:  /data/hayley-old/Tenanbaum2000/lightning_logs/2021-03-08-ray/BiVAE-conv-resnet-1.0-1.0_Maptiles_la-charlotte-vegas-boston-paris-amsterdam-shanghai-seoul-chicago-manhattan-berlin-montreal-rome_StamenTonerBackground_14/version_0
Created:  /data/hayley-old/Tenanbaum2000/lightning_logs/2021-03-08-ray/BiVAE-conv-resnet-1.0-1.0_Maptiles_la-charlotte-vegas-boston-paris-amsterdam-shanghai-seoul-chicago-manhattan-berlin-montreal-rome_StamenTonerBackground_14/version_0


GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


BiVAE-conv-resnet-1.0-1.0_Maptiles_la-charlotte-vegas-boston-paris-amsterdam-shanghai-seoul-chicago-manhattan-berlin-montreal-rome_StamenTonerBackground_14 started...
Logging to /data/hayley-old/Tenanbaum2000/lightning_logs/2021-03-08-ray/BiVAE-conv-resnet-1.0-1.0_Maptiles_la-charlotte-vegas-boston-paris-amsterdam-shanghai-seoul-chicago-manhattan-berlin-montreal-rome_StamenTonerBackground_14/version_0



   | Name               | Type          | Params
------------------------------------------------------
0  | act_fn             | LeakyReLU     | 0     
1  | out_fn             | Tanh          | 0     
2  | encoder            | Sequential    | 1.6 M 
3  | fc_flatten2qparams | Linear        | 20.5 K
4  | fc_latent2flatten  | Linear        | 10.8 K
5  | decoder            | ResNetDecoder | 6.3 M 
6  | out_layer          | Sequential    | 84    
7  | adversary          | Sequential    | 2.5 K 
8  | train_style_acc    | Accuracy      | 0     
9  | val_style_acc      | Accuracy      | 0     
10 | test_style_acc     | Accuracy      | 0     
------------------------------------------------------
7.9 M     Trainable params
0         Non-trainable params
7.9 M     Total params


Ep: 0, batch: 0
Ep: 0, batch: 0




Ep: 0, batch: 0


    return _run_code(code, main_globals, None,
  File "/home/hayley/miniconda3/envs/test/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/home/hayley/miniconda3/envs/test/lib/python3.8/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/hayley/miniconda3/envs/test/lib/python3.8/site-packages/traitlets/config/application.py", line 845, in launch_instance
    app.start()
  File "/home/hayley/miniconda3/envs/test/lib/python3.8/site-packages/ipykernel/kernelapp.py", line 612, in start
    self.io_loop.start()
  File "/home/hayley/miniconda3/envs/test/lib/python3.8/site-packages/tornado/platform/asyncio.py", line 199, in start
    self.asyncio_loop.run_forever()
  File "/home/hayley/miniconda3/envs/test/lib/python3.8/asyncio/base_events.py", line 570, in run_forever
    self._run_once()
  File "/home/hayley/miniconda3/envs/test/lib/python3.8/asyncio/base_events.py", line 1859, in _run_once
    handle._run(

Ep: 10, batch: 0
Ep: 20, batch: 0
Ep: 30, batch: 0
Ep: 40, batch: 0
Ep: 50, batch: 0
Epoch    51: reducing learning rate of group 0 to 1.0000e-05.
Ep: 60, batch: 0
Ep: 70, batch: 0
Epoch    71: reducing learning rate of group 0 to 1.0000e-06.
Ep: 80, batch: 0
Ep: 90, batch: 0
Epoch    91: reducing learning rate of group 0 to 1.0000e-07.
Ep: 100, batch: 0
Epoch   102: reducing learning rate of group 0 to 1.0000e-08.
Ep: 110, batch: 0
Ep: 120, batch: 0
Ep: 130, batch: 0
Ep: 140, batch: 0
Finished at ep (149, 122)
Logged hparams and metrics...
	 hparams: 
{'act_fn': LeakyReLU(negative_slope=0.01),
 'adv_loss_weight': 1.0,
 'adversary_dims': [32, 32, 32],
 'batch_size': 32,
 'cities': ['la',
            'charlotte',
            'vegas',
            'boston',
            'paris',
            'amsterdam',
            'shanghai',
            'seoul',
            'chicago',
            'manhattan',
            'berlin',
            'montreal',
            'rome'],
 'dec_type': 'resnet',
 'enc_

In [None]:
from argparse import Namespace


In [61]:
# args=Namespace(
#   act_fn='leaky_relu', 
#   adv_loss_weight=1.0, 
#   adversary_dims=[32, 32, 32], 
#   batch_size=32, 
#    cities=['la', 'charlotte', 'vegas', 'boston', 'paris', 'amsterdam', 'shanghai', 'seoul', 'chicago', 'manhattan', 'berlin', 'montreal', 'rome'],
#     data_name='multi_maptiles', 
#     data_root='/data/hayley-old/maptiles_v2/', 
#     dec_type='conv', default_root_dir=None, 
#     enc_type='conv', 
#     gpu_ids=['0'], 
#     hidden_dims=[32, 64, 128, 256, 512], 
#     in_shape=[3, 32, 32], 
#     is_contrasive=True, 
#     kld_weight=1.0, 
#     latent_dim=20, learning_rate=0.001,  
#     log_root='/data/hayley-old/Tenanbaum2000/lightning_logs/2021-03-08-ray/', 
#     logger=True, 
#     max_epochs=150,
#     mode='fit', 
#     model_name='bivae',
#     n_cpus=8, n_ray_samples=1, n_styles=1, 
#     ray_log_dir='/data/log/ray',
#     styles=['StamenTonerBackground'], 
#     terminate_on_nan=True, 
#     use_beta_scheduler=True,
#     pin_memory=True,
#     num_workers=16,
#     verbose=False,
#     zooms=['14'])

In [None]:
# debug
# train_tune(args)