# Hyperparameter selection with Ray Tune

In [1]:
import os
from datetime import datetime
import time
import random
import warnings
import joblib
import warnings
import gc
# My modules
from config import Config
from logger import init_logger
from common_utils import set_seeds, read_csvs, stratify_split, setup, get_data_dfs, get_loaders
from model import Model
from train_loop_functions import train_epoch, valid_epoch

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score

import torch
from torch import nn
from torch.cuda.amp import GradScaler

# hyperparameter tuning
from functools import partial
import ray
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
from ray.tune.suggest.ax import AxSearch
from ray.tune.suggest.bayesopt import BayesOptSearch

In [2]:
%load_ext autoreload
%autoreload 2

# Setup

In [3]:
set_seeds(Config.seed)
LOGGER = init_logger() # uses Python's logging framework

# Tuning

In [4]:
# all params are passed in by Tune. this is the "objective" function.
def train_main(config, checkpoint_dir=None, data_dir=None):
    assert config is not None
    # -------- DATASETS AND LOADERS --------
    data_df, _, _ = read_csvs(data_dir, Config.debug, num_samples=7000)
    train_folds = stratify_split(data_df, Config.fold_num, Config.seed, Config.target_col)
    
    # select only one of the folds (fold 0)
    train_df, valid_df = get_data_dfs(train_folds, 0)
    train_dataloader, valid_dataloader = get_loaders(train_df, valid_df,
                                                     config["batch_size"], 
                                                     data_dir+'/train_images')
    
    # -------- MODEL --------
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model, optimizer, scheduler, criterion = setup(Config.model_arch, 
                                                   config["lr"], 
                                                   config["is_amsgrad"], 
                                                   num_labels=data_df.label.nunique(), 
                                                   fc_layer={
                                                       'middle_fc': config["middle_fc"], 
                                                        'middle_fc_size': config["middle_fc_size"]
                                                    }, 
                                                   weight_decay=config["weight_decay"],
                                                   device=device,
                                                   checkpoint=None)
    
    # The `checkpoint_dir` parameter gets passed by Ray Tune when a checkpoint should be restored.
    if checkpoint_dir:
        checkpoint = os.path.join(checkpoint_dir, "checkpoint")
        model_state, optimizer_state = torch.load(checkpoint)
        model.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)
        
    # EPOCHS TRAIN
    for e in range(10):
        # TRAIN
        avg_training_loss = train_epoch(train_dataloader, model, 
                                      criterion, optimizer, 
                                      scheduler, GradScaler(), 
                                      config["accum_iter"], LOGGER,
                                      device)

        # VALIDATE
        avg_validation_loss, preds = valid_epoch(valid_dataloader, model, criterion, LOGGER, device)
  
        validation_labels = valid_df[Config.target_col].values
        accuracy = accuracy_score(y_true=validation_labels, y_pred=preds)
        
        # SAVE CHECKPOINT.
        # It is automatically registered with Ray Tune and will potentially
        # be passed as the `checkpoint_dir` parameter in future iterations.
        with tune.checkpoint_dir(step=e) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint")
            torch.save((model.state_dict(), optimizer.state_dict()), path)

        tune.report(loss=avg_validation_loss, accuracy=accuracy)
        gc.collect()

    print("Finished Training")

In [5]:
def main(num_samples=50, max_num_epochs=10, gpus_per_trial=1):
    data_dir = os.path.abspath('./data')
    
    hyperconfig = {
        "is_amsgrad": False,
        "accum_iter": tune.choice([4,8]),
        "lr": tune.loguniform(1e-3, 1e-2),
        "batch_size": tune.choice([8, 16]),
        "weight_decay": tune.choice([1/8 * 0.001, 1/2 * 0.001, 0.]),
        # try adding a FC layer to the classifier portion of the model
        "middle_fc": False,
        "middle_fc_size": tune.choice([512, 256])
    }
    
    scheduler = ASHAScheduler(
        #metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=2,
        reduction_factor=2)
    reporter = CLIReporter(metric_columns=["loss", "accuracy", "training_iteration"])
    
    search_alg = AxSearch(metric="loss", mode="min")
    #search_alg = BayesOptSearch(metric="loss", mode="min")
    result = tune.run(
        partial(train_main, data_dir=data_dir),
        name="ax2",
        metric="loss",
        resources_per_trial={"cpu": os.cpu_count(), "gpu": gpus_per_trial},
        config=hyperconfig,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        search_alg=search_alg,
        local_dir='./ray-results',
        checkpoint_score_attr='min-loss'
    )

    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))
    print("Best trial final validation accuracy: {}".format(
        best_trial.last_result["accuracy"]))

In [None]:
main()

2020-12-26 22:04:11,463	INFO services.py:1092 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m
2020-12-26 22:04:12,129	INFO registry.py:65 -- Detected unknown callable for trainable. Converting to class.
[INFO 12-26 22:04:12] ax.service.ax_client: Starting optimization with verbose logging. To disable logging, set the `verbose_logging` argument to `False`. Note that float values in the logs are rounded to 2 decimal points.
[INFO 12-26 22:04:12] ax.service.utils.instantiation: Inferred value type of ParameterType.INT for parameter accum_iter. If that is not the expected value type, you can explicity specify 'value_type' ('int', 'float', 'bool' or 'str') in parameter dict.
[INFO 12-26 22:04:12] ax.service.utils.instantiation: Inferred value type of ParameterType.INT for parameter batch_size. If that is not the expected value type, you can explicity specify 'value_type' ('int', 'float', 'bool' or 'str') in parameter dict.
[INFO 12-26 22:04:12] ax.service.utils.instant

== Status ==
Memory usage on this node: 5.3/31.3 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/AI/Felix/kaggle-cassava/ray-results/ax2
Number of trials: 1/50 (1 RUNNING)
+------------------+----------+-------+--------------+--------------+--------------+------------+-------------+------------------+----------------+
| Trial name       | status   | loc   |   accum_iter |   batch_size | is_amsgrad   |         lr | middle_fc   |   middle_fc_size |   weight_decay |
|------------------+----------+-------+--------------+--------------+--------------+------------+-------------+------------------+----------------|
| DEFAULT_492b1aee | RUNNING  |       |            8 |           16 | False        | 0.00143511 | False       |              256 |              0 |
+------------------+----------+-------+

Result for DEFAULT_492b1aee:
  accuracy: 0.696
  date: 2020-12-26_22-20-53
  done: false
  experiment_id: 026f4fb8ae0f433b879dee9624236d14
  experiment_tag: 1_accum_iter=8,batch_size=16,is_amsgrad=False,lr=0.0014351,middle_fc=False,middle_fc_size=256,weight_decay=0.0
  hostname: Nevsky
  iterations_since_restore: 5
  loss: 1.2652046593752775
  node_ip: 10.0.0.200
  pid: 14605
  should_checkpoint: true
  time_since_restore: 999.8098523616791
  time_this_iter_s: 199.54348397254944
  time_total_s: 999.8098523616791
  timestamp: 1609021253
  timesteps_since_restore: 0
  training_iteration: 5
  trial_id: 492b1aee
  
== Status ==
Memory usage on this node: 7.6/31.3 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: -1.2600499976765025 | Iter 2.000: -1.252114293792031
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/AI/Felix/kaggle-cassava/ray-results/ax2
Number of trials: 

Result for DEFAULT_492b1aee:
  accuracy: 0.6937142857142857
  date: 2020-12-26_22-34-11
  done: false
  experiment_id: 026f4fb8ae0f433b879dee9624236d14
  experiment_tag: 1_accum_iter=8,batch_size=16,is_amsgrad=False,lr=0.0014351,middle_fc=False,middle_fc_size=256,weight_decay=0.0
  hostname: Nevsky
  iterations_since_restore: 9
  loss: 1.2720156517895784
  node_ip: 10.0.0.200
  pid: 14605
  should_checkpoint: true
  time_since_restore: 1797.850442647934
  time_this_iter_s: 199.55387783050537
  time_total_s: 1797.850442647934
  timestamp: 1609022051
  timesteps_since_restore: 0
  training_iteration: 9
  trial_id: 492b1aee
  
== Status ==
Memory usage on this node: 7.6/31.3 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: -1.25368201082403 | Iter 4.000: -1.2600499976765025 | Iter 2.000: -1.252114293792031
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/AI/Felix/kaggle-cassava/ray-resu

[INFO 12-26 22:37:30] ax.service.ax_client: Completed trial 0 with data: {'loss': (1.27, 0.0)}.
[INFO 12-26 22:37:30] ax.service.ax_client: Generated new trial 2 with parameters {'lr': 0.0, 'accum_iter': 4, 'batch_size': 8, 'weight_decay': 0.0, 'middle_fc_size': 512, 'is_amsgrad': False, 'middle_fc': False}.


Result for DEFAULT_492b1aee:
  accuracy: 0.6931428571428572
  date: 2020-12-26_22-37-30
  done: true
  experiment_id: 026f4fb8ae0f433b879dee9624236d14
  experiment_tag: 1_accum_iter=8,batch_size=16,is_amsgrad=False,lr=0.0014351,middle_fc=False,middle_fc_size=256,weight_decay=0.0
  hostname: Nevsky
  iterations_since_restore: 10
  loss: 1.2658602736212992
  node_ip: 10.0.0.200
  pid: 14605
  should_checkpoint: true
  time_since_restore: 1997.5598957538605
  time_this_iter_s: 199.7094531059265
  time_total_s: 1997.5598957538605
  timestamp: 1609022250
  timesteps_since_restore: 0
  training_iteration: 10
  trial_id: 492b1aee
  
== Status ==
Memory usage on this node: 7.6/31.3 GiB
Using AsyncHyperBand: num_stopped=1
Bracket: Iter 8.000: -1.25368201082403 | Iter 4.000: -1.2600499976765025 | Iter 2.000: -1.252114293792031
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/AI/Felix/kaggle-cassava/ray-re

Result for DEFAULT_49321484:
  accuracy: 0.6422857142857142
  date: 2020-12-26_22-51-59
  done: false
  experiment_id: a6e191224be644eeaae6a2dd76be1590
  experiment_tag: 2_accum_iter=4,batch_size=8,is_amsgrad=False,lr=0.0016245,middle_fc=False,middle_fc_size=256,weight_decay=0.0
  hostname: Nevsky
  iterations_since_restore: 4
  loss: 1.2295918096195568
  node_ip: 10.0.0.200
  pid: 14607
  should_checkpoint: true
  time_since_restore: 867.0532298088074
  time_this_iter_s: 215.55943727493286
  time_total_s: 867.0532298088074
  timestamp: 1609023119
  timesteps_since_restore: 0
  training_iteration: 4
  trial_id: '49321484'
  
== Status ==
Memory usage on this node: 7.6/31.3 GiB
Using AsyncHyperBand: num_stopped=1
Bracket: Iter 8.000: -1.25368201082403 | Iter 4.000: -1.2448209036480296 | Iter 2.000: -1.2408238205042754
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/AI/Felix/kaggle-cassava/ray-re

Result for DEFAULT_49321484:
  accuracy: 0.6457142857142857
  date: 2020-12-26_23-06-21
  done: false
  experiment_id: a6e191224be644eeaae6a2dd76be1590
  experiment_tag: 2_accum_iter=4,batch_size=8,is_amsgrad=False,lr=0.0016245,middle_fc=False,middle_fc_size=256,weight_decay=0.0
  hostname: Nevsky
  iterations_since_restore: 8
  loss: 1.2363299889998003
  node_ip: 10.0.0.200
  pid: 14607
  should_checkpoint: true
  time_since_restore: 1728.7878551483154
  time_this_iter_s: 215.5197880268097
  time_total_s: 1728.7878551483154
  timestamp: 1609023981
  timesteps_since_restore: 0
  training_iteration: 8
  trial_id: '49321484'
  
== Status ==
Memory usage on this node: 7.6/31.3 GiB
Using AsyncHyperBand: num_stopped=1
Bracket: Iter 8.000: -1.2450059999119152 | Iter 4.000: -1.2448209036480296 | Iter 2.000: -1.2408238205042754
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/AI/Felix/kaggle-cassava/ray

[INFO 12-26 23:13:32] ax.service.ax_client: Completed trial 1 with data: {'loss': (1.22, 0.0)}.
[INFO 12-26 23:13:32] ax.service.ax_client: Generated new trial 3 with parameters {'lr': 0.0, 'accum_iter': 4, 'batch_size': 16, 'weight_decay': 0.0, 'middle_fc_size': 512, 'is_amsgrad': False, 'middle_fc': False}.


Result for DEFAULT_49321484:
  accuracy: 0.6508571428571429
  date: 2020-12-26_23-13-32
  done: true
  experiment_id: a6e191224be644eeaae6a2dd76be1590
  experiment_tag: 2_accum_iter=4,batch_size=8,is_amsgrad=False,lr=0.0016245,middle_fc=False,middle_fc_size=256,weight_decay=0.0
  hostname: Nevsky
  iterations_since_restore: 10
  loss: 1.2187263597141613
  node_ip: 10.0.0.200
  pid: 14607
  should_checkpoint: true
  time_since_restore: 2159.54705286026
  time_this_iter_s: 215.54514503479004
  time_total_s: 2159.54705286026
  timestamp: 1609024412
  timesteps_since_restore: 0
  training_iteration: 10
  trial_id: '49321484'
  
== Status ==
Memory usage on this node: 7.6/31.3 GiB
Using AsyncHyperBand: num_stopped=2
Bracket: Iter 8.000: -1.2450059999119152 | Iter 4.000: -1.2448209036480296 | Iter 2.000: -1.2408238205042754
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/AI/Felix/kaggle-cassava/ray-r

[INFO 12-26 23:20:47] ax.service.ax_client: Completed trial 2 with data: {'loss': (1.58, 0.0)}.
[INFO 12-26 23:20:47] ax.service.ax_client: Generated new trial 4 with parameters {'lr': 0.01, 'accum_iter': 4, 'batch_size': 16, 'weight_decay': 0.0, 'middle_fc_size': 256, 'is_amsgrad': False, 'middle_fc': False}.


Result for DEFAULT_f0842426:
  accuracy: 0.4045714285714286
  date: 2020-12-26_23-20-47
  done: true
  experiment_id: 39265be00951494cacac25d808190abc
  experiment_tag: 3_accum_iter=4,batch_size=8,is_amsgrad=False,lr=0.0016618,middle_fc=False,middle_fc_size=512,weight_decay=0.0
  hostname: Nevsky
  iterations_since_restore: 2
  loss: 1.5774462288076228
  node_ip: 10.0.0.200
  pid: 14603
  should_checkpoint: true
  time_since_restore: 433.7202892303467
  time_this_iter_s: 215.04089975357056
  time_total_s: 433.7202892303467
  timestamp: 1609024847
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: f0842426
  
== Status ==
Memory usage on this node: 7.5/31.3 GiB
Using AsyncHyperBand: num_stopped=3
Bracket: Iter 8.000: -1.2450059999119152 | Iter 4.000: -1.2448209036480296 | Iter 2.000: -1.252114293792031
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/AI/Felix/kaggle-cassava/ray-resu

[INFO 12-26 23:27:30] ax.service.ax_client: Completed trial 3 with data: {'loss': (1.62, 0.0)}.
[INFO 12-26 23:27:30] ax.service.ax_client: Generated new trial 5 with parameters {'lr': 0.0, 'accum_iter': 8, 'batch_size': 8, 'weight_decay': 0.0, 'middle_fc_size': 256, 'is_amsgrad': False, 'middle_fc': False}.


Result for DEFAULT_f8b77576:
  accuracy: 0.184
  date: 2020-12-26_23-27-30
  done: true
  experiment_id: 29b77c99c990449589ddee4ec5a75bb6
  experiment_tag: 4_accum_iter=4,batch_size=16,is_amsgrad=False,lr=0.0012076,middle_fc=False,middle_fc_size=512,weight_decay=0.0005
  hostname: Nevsky
  iterations_since_restore: 2
  loss: 1.6152193849736993
  node_ip: 10.0.0.200
  pid: 14604
  should_checkpoint: true
  time_since_restore: 401.0507321357727
  time_this_iter_s: 198.935072183609
  time_total_s: 401.0507321357727
  timestamp: 1609025250
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: f8b77576
  
== Status ==
Memory usage on this node: 7.5/31.3 GiB
Using AsyncHyperBand: num_stopped=4
Bracket: Iter 8.000: -1.2450059999119152 | Iter 4.000: -1.2448209036480296 | Iter 2.000: -1.414780261299827
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/AI/Felix/kaggle-cassava/ray-results/ax2
Num

[INFO 12-26 23:34:13] ax.service.ax_client: Completed trial 4 with data: {'loss': (3.25, 0.0)}.
[INFO 12-26 23:34:13] ax.service.ax_client: Generated new trial 6 with parameters {'lr': 0.01, 'accum_iter': 4, 'batch_size': 16, 'weight_decay': 0.0, 'middle_fc_size': 256, 'is_amsgrad': False, 'middle_fc': False}.


Result for DEFAULT_fc481014:
  accuracy: 0.6194285714285714
  date: 2020-12-26_23-34-13
  done: true
  experiment_id: 70ad3dad08e446788ad4acc37e1d1911
  experiment_tag: 5_accum_iter=4,batch_size=16,is_amsgrad=False,lr=0.0088018,middle_fc=False,middle_fc_size=256,weight_decay=0.0005
  hostname: Nevsky
  iterations_since_restore: 2
  loss: 3.247355008125305
  node_ip: 10.0.0.200
  pid: 14606
  should_checkpoint: true
  time_since_restore: 401.4571373462677
  time_this_iter_s: 199.04457211494446
  time_total_s: 401.4571373462677
  timestamp: 1609025653
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: fc481014
  
== Status ==
Memory usage on this node: 7.5/31.3 GiB
Using AsyncHyperBand: num_stopped=5
Bracket: Iter 8.000: -1.2450059999119152 | Iter 4.000: -1.2448209036480296 | Iter 2.000: -1.5774462288076228
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/AI/Felix/kaggle-cassava/ray-

[INFO 12-26 23:41:28] ax.service.ax_client: Completed trial 5 with data: {'loss': (1.61, 0.0)}.
[INFO 12-26 23:41:28] ax.service.ax_client: Generated new trial 7 with parameters {'lr': 0.0, 'accum_iter': 8, 'batch_size': 8, 'weight_decay': 0.0, 'middle_fc_size': 512, 'is_amsgrad': False, 'middle_fc': False}.


Result for DEFAULT_ec6ce196:
  accuracy: 0.1742857142857143
  date: 2020-12-26_23-41-28
  done: true
  experiment_id: e6c2b450db46479f89dcbd19e6252fe7
  experiment_tag: 6_accum_iter=8,batch_size=8,is_amsgrad=False,lr=0.0013398,middle_fc=False,middle_fc_size=256,weight_decay=0.0
  hostname: Nevsky
  iterations_since_restore: 2
  loss: 1.6059652783653953
  node_ip: 10.0.0.200
  pid: 14600
  should_checkpoint: true
  time_since_restore: 432.7575466632843
  time_this_iter_s: 215.05083870887756
  time_total_s: 432.7575466632843
  timestamp: 1609026088
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: ec6ce196
  
== Status ==
Memory usage on this node: 7.4/31.3 GiB
Using AsyncHyperBand: num_stopped=6
Bracket: Iter 8.000: -1.2450059999119152 | Iter 4.000: -1.2448209036480296 | Iter 2.000: -1.591705753586509
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/AI/Felix/kaggle-cassava/ray-resu

[INFO 12-26 23:48:11] ax.service.ax_client: Completed trial 6 with data: {'loss': (3.1, 0.0)}.
[INFO 12-26 23:48:11] ax.service.ax_client: Generated new trial 8 with parameters {'lr': 0.0, 'accum_iter': 4, 'batch_size': 8, 'weight_decay': 0.0, 'middle_fc_size': 512, 'is_amsgrad': False, 'middle_fc': False}.


Result for DEFAULT_dcb9fe72:
  accuracy: 0.6182857142857143
  date: 2020-12-26_23-48-11
  done: true
  experiment_id: 74aede046435405b94d335ab351c652c
  experiment_tag: 7_accum_iter=4,batch_size=16,is_amsgrad=False,lr=0.0070571,middle_fc=False,middle_fc_size=256,weight_decay=0.000125
  hostname: Nevsky
  iterations_since_restore: 2
  loss: 3.099164702675559
  node_ip: 10.0.0.200
  pid: 14602
  should_checkpoint: true
  time_since_restore: 401.58901810646057
  time_this_iter_s: 199.183096408844
  time_total_s: 401.58901810646057
  timestamp: 1609026491
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: dcb9fe72
  
== Status ==
Memory usage on this node: 7.4/31.3 GiB
Using AsyncHyperBand: num_stopped=7
Bracket: Iter 8.000: -1.2450059999119152 | Iter 4.000: -1.2448209036480296 | Iter 2.000: -1.6059652783653953
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/AI/Felix/kaggle-cassava/ra

[INFO 12-26 23:55:25] ax.service.ax_client: Completed trial 7 with data: {'loss': (1.61, 0.0)}.
[INFO 12-26 23:55:25] ax.service.ax_client: Generated new trial 9 with parameters {'lr': 0.0, 'accum_iter': 4, 'batch_size': 8, 'weight_decay': 0.0, 'middle_fc_size': 512, 'is_amsgrad': False, 'middle_fc': False}.


Result for DEFAULT_dfb8125c:
  accuracy: 0.228
  date: 2020-12-26_23-55-25
  done: true
  experiment_id: f45d53be015d41c29102ab0037618267
  experiment_tag: 8_accum_iter=8,batch_size=8,is_amsgrad=False,lr=0.0033217,middle_fc=False,middle_fc_size=512,weight_decay=0.000125
  hostname: Nevsky
  iterations_since_restore: 2
  loss: 1.607922200723128
  node_ip: 10.0.0.200
  pid: 14601
  should_checkpoint: true
  time_since_restore: 432.5370411872864
  time_this_iter_s: 214.65771102905273
  time_total_s: 432.5370411872864
  timestamp: 1609026925
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: dfb8125c
  
== Status ==
Memory usage on this node: 7.4/31.3 GiB
Using AsyncHyperBand: num_stopped=8
Bracket: Iter 8.000: -1.2450059999119152 | Iter 4.000: -1.2448209036480296 | Iter 2.000: -1.6069437395442616
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/AI/Felix/kaggle-cassava/ray-results/ax2


Result for DEFAULT_d0132516:
  accuracy: 0.04742857142857143
  date: 2020-12-27_00-06-12
  done: false
  experiment_id: 088bcf3d721d430ebef2a2eed68a0dcc
  experiment_tag: 9_accum_iter=4,batch_size=8,is_amsgrad=False,lr=0.0015419,middle_fc=False,middle_fc_size=512,weight_decay=0.000125
  hostname: Nevsky
  iterations_since_restore: 3
  loss: .nan
  node_ip: 10.0.0.200
  pid: 20803
  should_checkpoint: true
  time_since_restore: 644.1903223991394
  time_this_iter_s: 213.3810920715332
  time_total_s: 644.1903223991394
  timestamp: 1609027572
  timesteps_since_restore: 0
  training_iteration: 3
  trial_id: d0132516
  
== Status ==
Memory usage on this node: 7.4/31.3 GiB
Using AsyncHyperBand: num_stopped=8
Bracket: Iter 8.000: -1.2450059999119152 | Iter 4.000: -1.2448209036480296 | Iter 2.000: -1.6069437395442616
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/AI/Felix/kaggle-cassava/ray-results/ax2

Result for DEFAULT_d0132516:
  accuracy: 0.04742857142857143
  date: 2020-12-27_00-16-53
  done: false
  experiment_id: 088bcf3d721d430ebef2a2eed68a0dcc
  experiment_tag: 9_accum_iter=4,batch_size=8,is_amsgrad=False,lr=0.0015419,middle_fc=False,middle_fc_size=512,weight_decay=0.000125
  hostname: Nevsky
  iterations_since_restore: 6
  loss: .nan
  node_ip: 10.0.0.200
  pid: 20803
  should_checkpoint: true
  time_since_restore: 1285.8716614246368
  time_this_iter_s: 213.75410342216492
  time_total_s: 1285.8716614246368
  timestamp: 1609028213
  timesteps_since_restore: 0
  training_iteration: 6
  trial_id: d0132516
  
== Status ==
Memory usage on this node: 7.4/31.3 GiB
Using AsyncHyperBand: num_stopped=8
Bracket: Iter 8.000: -1.2450059999119152 | Iter 4.000: -1.2448209036480296 | Iter 2.000: -1.6069437395442616
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/AI/Felix/kaggle-cassava/ray-results/

Result for DEFAULT_d0132516:
  accuracy: 0.04742857142857143
  date: 2020-12-27_00-27-34
  done: false
  experiment_id: 088bcf3d721d430ebef2a2eed68a0dcc
  experiment_tag: 9_accum_iter=4,batch_size=8,is_amsgrad=False,lr=0.0015419,middle_fc=False,middle_fc_size=512,weight_decay=0.000125
  hostname: Nevsky
  iterations_since_restore: 9
  loss: .nan
  node_ip: 10.0.0.200
  pid: 20803
  should_checkpoint: true
  time_since_restore: 1926.7791485786438
  time_this_iter_s: 214.0234944820404
  time_total_s: 1926.7791485786438
  timestamp: 1609028854
  timesteps_since_restore: 0
  training_iteration: 9
  trial_id: d0132516
  
== Status ==
Memory usage on this node: 7.4/31.3 GiB
Using AsyncHyperBand: num_stopped=8
Bracket: Iter 8.000: -1.2450059999119152 | Iter 4.000: -1.2448209036480296 | Iter 2.000: -1.6069437395442616
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/AI/Felix/kaggle-cassava/ray-results/a

[INFO 12-27 00:31:07] ax.service.ax_client: Completed trial 8 with data: {'loss': (nan, 0.0)}.
[INFO 12-27 00:31:08] ax.service.ax_client: Generated new trial 10 with parameters {'lr': 0.01, 'accum_iter': 8, 'batch_size': 16, 'weight_decay': 0.0, 'middle_fc_size': 256, 'is_amsgrad': False, 'middle_fc': False}.


Result for DEFAULT_d0132516:
  accuracy: 0.04742857142857143
  date: 2020-12-27_00-31-07
  done: true
  experiment_id: 088bcf3d721d430ebef2a2eed68a0dcc
  experiment_tag: 9_accum_iter=4,batch_size=8,is_amsgrad=False,lr=0.0015419,middle_fc=False,middle_fc_size=512,weight_decay=0.000125
  hostname: Nevsky
  iterations_since_restore: 10
  loss: .nan
  node_ip: 10.0.0.200
  pid: 20803
  should_checkpoint: true
  time_since_restore: 2140.0762481689453
  time_this_iter_s: 213.2970995903015
  time_total_s: 2140.0762481689453
  timestamp: 1609029067
  timesteps_since_restore: 0
  training_iteration: 10
  trial_id: d0132516
  
== Status ==
Memory usage on this node: 7.4/31.3 GiB
Using AsyncHyperBand: num_stopped=9
Bracket: Iter 8.000: -1.2450059999119152 | Iter 4.000: -1.2448209036480296 | Iter 2.000: -1.6069437395442616
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/AI/Felix/kaggle-cassava/ray-results/

Result for DEFAULT_d2fdf87c:
  accuracy: 0.3942857142857143
  date: 2020-12-27_00-38-22
  done: false
  experiment_id: 46ebcf0118d04ab587aa22dfc98c08a8
  experiment_tag: 10_accum_iter=4,batch_size=8,is_amsgrad=False,lr=0.0011551,middle_fc=False,middle_fc_size=512,weight_decay=0.0
  hostname: Nevsky
  iterations_since_restore: 2
  loss: 1.5682909293608231
  node_ip: 10.0.0.200
  pid: 30358
  should_checkpoint: true
  time_since_restore: 432.59260988235474
  time_this_iter_s: 215.19338035583496
  time_total_s: 432.59260988235474
  timestamp: 1609029502
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: d2fdf87c
  
== Status ==
Memory usage on this node: 7.4/31.3 GiB
Using AsyncHyperBand: num_stopped=9
Bracket: Iter 8.000: -1.2450059999119152 | Iter 4.000: -1.2448209036480296 | Iter 2.000: -1.6059652783653953
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/AI/Felix/kaggle-cassava/ray

[INFO 12-27 00:45:31] ax.service.ax_client: Completed trial 9 with data: {'loss': (1.57, 0.0)}.


Result for DEFAULT_d2fdf87c:
  accuracy: 0.34914285714285714
  date: 2020-12-27_00-45-31
  done: true
  experiment_id: 46ebcf0118d04ab587aa22dfc98c08a8
  experiment_tag: 10_accum_iter=4,batch_size=8,is_amsgrad=False,lr=0.0011551,middle_fc=False,middle_fc_size=512,weight_decay=0.0
  hostname: Nevsky
  iterations_since_restore: 4
  loss: 1.574831813031977
  node_ip: 10.0.0.200
  pid: 30358
  should_checkpoint: true
  time_since_restore: 861.883510351181
  time_this_iter_s: 214.21553540229797
  time_total_s: 861.883510351181
  timestamp: 1609029931
  timesteps_since_restore: 0
  training_iteration: 4
  trial_id: d2fdf87c
  
== Status ==
Memory usage on this node: 7.4/31.3 GiB
Using AsyncHyperBand: num_stopped=10
Bracket: Iter 8.000: -1.2450059999119152 | Iter 4.000: -1.2600499976765025 | Iter 2.000: -1.6059652783653953
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/AI/Felix/kaggle-cassava/ray-res

[INFO 12-27 00:45:32] ax.service.ax_client: Generated new trial 11 with parameters {'lr': 0.0, 'accum_iter': 8, 'batch_size': 8, 'weight_decay': 0.0, 'middle_fc_size': 512, 'is_amsgrad': False, 'middle_fc': False}.


[2m[36m(pid=1595)[0m Epoch     0: adjusting learning rate of group 0 to 6.2826e-03.
Result for DEFAULT_cfcdb9ee:
  accuracy: 0.6182857142857143
  date: 2020-12-27_00-48-54
  done: false
  experiment_id: 92a314ab85df48cab35f6038c63fdeb2
  experiment_tag: 11_accum_iter=8,batch_size=16,is_amsgrad=False,lr=0.0062826,middle_fc=False,middle_fc_size=256,weight_decay=0.0
  hostname: Nevsky
  iterations_since_restore: 1
  loss: 6.3002673886039045
  node_ip: 10.0.0.200
  pid: 1595
  should_checkpoint: true
  time_since_restore: 200.81376671791077
  time_this_iter_s: 200.81376671791077
  time_total_s: 200.81376671791077
  timestamp: 1609030134
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: cfcdb9ee
  
== Status ==
Memory usage on this node: 7.4/31.3 GiB
Using AsyncHyperBand: num_stopped=10
Bracket: Iter 8.000: -1.2450059999119152 | Iter 4.000: -1.2600499976765025 | Iter 2.000: -1.6059652783653953
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB object

Result for DEFAULT_cfcdb9ee:
  accuracy: 0.568
  date: 2020-12-27_00-55-30
  done: false
  experiment_id: 92a314ab85df48cab35f6038c63fdeb2
  experiment_tag: 11_accum_iter=8,batch_size=16,is_amsgrad=False,lr=0.0062826,middle_fc=False,middle_fc_size=256,weight_decay=0.0
  hostname: Nevsky
  iterations_since_restore: 3
  loss: 1.2891770427877254
  node_ip: 10.0.0.200
  pid: 1595
  should_checkpoint: true
  time_since_restore: 596.4155578613281
  time_this_iter_s: 197.39278268814087
  time_total_s: 596.4155578613281
  timestamp: 1609030530
  timesteps_since_restore: 0
  training_iteration: 3
  trial_id: cfcdb9ee
  
== Status ==
Memory usage on this node: 7.4/31.3 GiB
Using AsyncHyperBand: num_stopped=10
Bracket: Iter 8.000: -1.2450059999119152 | Iter 4.000: -1.2600499976765025 | Iter 2.000: -1.591705753586509
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/AI/Felix/kaggle-cassava/ray-results/ax2
Nu

[INFO 12-27 00:58:48] ax.service.ax_client: Completed trial 10 with data: {'loss': (1.33, 0.0)}.
[INFO 12-27 00:58:48] ax.service.ax_client: Generated new trial 12 with parameters {'lr': 0.0, 'accum_iter': 8, 'batch_size': 8, 'weight_decay': 0.0, 'middle_fc_size': 512, 'is_amsgrad': False, 'middle_fc': False}.


Result for DEFAULT_cfcdb9ee:
  accuracy: 0.548
  date: 2020-12-27_00-58-48
  done: true
  experiment_id: 92a314ab85df48cab35f6038c63fdeb2
  experiment_tag: 11_accum_iter=8,batch_size=16,is_amsgrad=False,lr=0.0062826,middle_fc=False,middle_fc_size=256,weight_decay=0.0
  hostname: Nevsky
  iterations_since_restore: 4
  loss: 1.325357857617465
  node_ip: 10.0.0.200
  pid: 1595
  should_checkpoint: true
  time_since_restore: 794.195544719696
  time_this_iter_s: 197.77998685836792
  time_total_s: 794.195544719696
  timestamp: 1609030728
  timesteps_since_restore: 0
  training_iteration: 4
  trial_id: cfcdb9ee
  
== Status ==
Memory usage on this node: 7.4/31.3 GiB
Using AsyncHyperBand: num_stopped=11
Bracket: Iter 8.000: -1.2450059999119152 | Iter 4.000: -1.2927039276469836 | Iter 2.000: -1.591705753586509
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/AI/Felix/kaggle-cassava/ray-results/ax2
Number

Result for DEFAULT_d2baca28:
  accuracy: 0.04742857142857143
  date: 2020-12-27_01-06-03
  done: false
  experiment_id: def4833f516e4b298cd418f618401803
  experiment_tag: 12_accum_iter=8,batch_size=8,is_amsgrad=False,lr=0.002974,middle_fc=False,middle_fc_size=512,weight_decay=0.0005
  hostname: Nevsky
  iterations_since_restore: 2
  loss: .nan
  node_ip: 10.0.0.200
  pid: 5883
  should_checkpoint: true
  time_since_restore: 432.95861053466797
  time_this_iter_s: 214.88736391067505
  time_total_s: 432.95861053466797
  timestamp: 1609031163
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: d2baca28
  
== Status ==
Memory usage on this node: 7.4/31.3 GiB
Using AsyncHyperBand: num_stopped=11
Bracket: Iter 8.000: -1.2450059999119152 | Iter 4.000: -1.2927039276469836 | Iter 2.000: -1.591705753586509
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/AI/Felix/kaggle-cassava/ray-results/ax2

Result for DEFAULT_d2baca28:
  accuracy: 0.04742857142857143
  date: 2020-12-27_01-13-12
  done: false
  experiment_id: def4833f516e4b298cd418f618401803
  experiment_tag: 12_accum_iter=8,batch_size=8,is_amsgrad=False,lr=0.002974,middle_fc=False,middle_fc_size=512,weight_decay=0.0005
  hostname: Nevsky
  iterations_since_restore: 4
  loss: .nan
  node_ip: 10.0.0.200
  pid: 5883
  should_checkpoint: true
  time_since_restore: 862.7623097896576
  time_this_iter_s: 214.68617177009583
  time_total_s: 862.7623097896576
  timestamp: 1609031592
  timesteps_since_restore: 0
  training_iteration: 4
  trial_id: d2baca28
  
== Status ==
Memory usage on this node: 7.4/31.3 GiB
Using AsyncHyperBand: num_stopped=11
Bracket: Iter 8.000: -1.2450059999119152 | Iter 4.000: -1.2927039276469836 | Iter 2.000: -1.591705753586509
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/AI/Felix/kaggle-cassava/ray-results/ax2
N

Result for DEFAULT_d2baca28:
  accuracy: 0.04742857142857143
  date: 2020-12-27_01-20-22
  done: false
  experiment_id: def4833f516e4b298cd418f618401803
  experiment_tag: 12_accum_iter=8,batch_size=8,is_amsgrad=False,lr=0.002974,middle_fc=False,middle_fc_size=512,weight_decay=0.0005
  hostname: Nevsky
  iterations_since_restore: 6
  loss: .nan
  node_ip: 10.0.0.200
  pid: 5883
  should_checkpoint: true
  time_since_restore: 1292.4050891399384
  time_this_iter_s: 215.07006740570068
  time_total_s: 1292.4050891399384
  timestamp: 1609032022
  timesteps_since_restore: 0
  training_iteration: 6
  trial_id: d2baca28
  
== Status ==
Memory usage on this node: 7.4/31.3 GiB
Using AsyncHyperBand: num_stopped=11
Bracket: Iter 8.000: -1.2450059999119152 | Iter 4.000: -1.2927039276469836 | Iter 2.000: -1.591705753586509
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/AI/Felix/kaggle-cassava/ray-results/ax2

Result for DEFAULT_d2baca28:
  accuracy: 0.04742857142857143
  date: 2020-12-27_01-27-32
  done: false
  experiment_id: def4833f516e4b298cd418f618401803
  experiment_tag: 12_accum_iter=8,batch_size=8,is_amsgrad=False,lr=0.002974,middle_fc=False,middle_fc_size=512,weight_decay=0.0005
  hostname: Nevsky
  iterations_since_restore: 8
  loss: .nan
  node_ip: 10.0.0.200
  pid: 5883
  should_checkpoint: true
  time_since_restore: 1722.4905843734741
  time_this_iter_s: 214.92282438278198
  time_total_s: 1722.4905843734741
  timestamp: 1609032452
  timesteps_since_restore: 0
  training_iteration: 8
  trial_id: d2baca28
  
== Status ==
Memory usage on this node: 7.4/31.3 GiB
Using AsyncHyperBand: num_stopped=11
Bracket: Iter 8.000: -1.2450059999119152 | Iter 4.000: -1.2927039276469836 | Iter 2.000: -1.591705753586509
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/AI/Felix/kaggle-cassava/ray-results/ax2

[INFO 12-27 01:34:42] ax.service.ax_client: Completed trial 11 with data: {'loss': (nan, 0.0)}.


Result for DEFAULT_d2baca28:
  accuracy: 0.04742857142857143
  date: 2020-12-27_01-34-42
  done: true
  experiment_id: def4833f516e4b298cd418f618401803
  experiment_tag: 12_accum_iter=8,batch_size=8,is_amsgrad=False,lr=0.002974,middle_fc=False,middle_fc_size=512,weight_decay=0.0005
  hostname: Nevsky
  iterations_since_restore: 10
  loss: .nan
  node_ip: 10.0.0.200
  pid: 5883
  should_checkpoint: true
  time_since_restore: 2152.4852311611176
  time_this_iter_s: 214.83894395828247
  time_total_s: 2152.4852311611176
  timestamp: 1609032882
  timesteps_since_restore: 0
  training_iteration: 10
  trial_id: d2baca28
  
== Status ==
Memory usage on this node: 7.4/31.3 GiB
Using AsyncHyperBand: num_stopped=12
Bracket: Iter 8.000: -1.2450059999119152 | Iter 4.000: -1.2927039276469836 | Iter 2.000: -1.591705753586509
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/AI/Felix/kaggle-cassava/ray-results/ax

[INFO 12-27 01:34:42] ax.service.ax_client: Generated new trial 13 with parameters {'lr': 0.0, 'accum_iter': 4, 'batch_size': 16, 'weight_decay': 0.0, 'middle_fc_size': 256, 'is_amsgrad': False, 'middle_fc': False}.


[2m[36m(pid=17180)[0m Epoch     0: adjusting learning rate of group 0 to 1.3472e-03.
Result for DEFAULT_ad56dd74:
  accuracy: 0.24742857142857144
  date: 2020-12-27_01-38-22
  done: false
  experiment_id: c5fc8d49c8f840a69254b017e5a57b08
  experiment_tag: 13_accum_iter=8,batch_size=8,is_amsgrad=False,lr=0.0013472,middle_fc=False,middle_fc_size=512,weight_decay=0.000125
  hostname: Nevsky
  iterations_since_restore: 1
  loss: 1.5948340372605758
  node_ip: 10.0.0.200
  pid: 17180
  should_checkpoint: true
  time_since_restore: 217.64151883125305
  time_this_iter_s: 217.64151883125305
  time_total_s: 217.64151883125305
  timestamp: 1609033102
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: ad56dd74
  
== Status ==
Memory usage on this node: 7.4/31.3 GiB
Using AsyncHyperBand: num_stopped=12
Bracket: Iter 8.000: -1.2450059999119152 | Iter 4.000: -1.2927039276469836 | Iter 2.000: -1.591705753586509
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB 

[INFO 12-27 01:41:57] ax.service.ax_client: Completed trial 12 with data: {'loss': (1.6, 0.0)}.


Result for DEFAULT_ad56dd74:
  accuracy: 0.25142857142857145
  date: 2020-12-27_01-41-57
  done: true
  experiment_id: c5fc8d49c8f840a69254b017e5a57b08
  experiment_tag: 13_accum_iter=8,batch_size=8,is_amsgrad=False,lr=0.0013472,middle_fc=False,middle_fc_size=512,weight_decay=0.000125
  hostname: Nevsky
  iterations_since_restore: 2
  loss: 1.595612144470215
  node_ip: 10.0.0.200
  pid: 17180
  should_checkpoint: true
  time_since_restore: 432.4841847419739
  time_this_iter_s: 214.84266591072083
  time_total_s: 432.4841847419739
  timestamp: 1609033317
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: ad56dd74
  
== Status ==
Memory usage on this node: 7.4/31.3 GiB
Using AsyncHyperBand: num_stopped=13
Bracket: Iter 8.000: -1.2450059999119152 | Iter 4.000: -1.2927039276469836 | Iter 2.000: -1.595612144470215
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/AI/Felix/kaggle-cassava/r

[INFO 12-27 01:41:57] ax.service.ax_client: Generated new trial 14 with parameters {'lr': 0.0, 'accum_iter': 8, 'batch_size': 16, 'weight_decay': 0.0, 'middle_fc_size': 512, 'is_amsgrad': False, 'middle_fc': False}.


[2m[36m(pid=19374)[0m Epoch     0: adjusting learning rate of group 0 to 1.9924e-03.
Result for DEFAULT_b1812c74:
  accuracy: 0.12285714285714286
  date: 2020-12-27_01-45-19
  done: false
  experiment_id: 62f6ab044cbc42f68b4c347ac471dd8a
  experiment_tag: 14_accum_iter=4,batch_size=16,is_amsgrad=False,lr=0.0019924,middle_fc=False,middle_fc_size=256,weight_decay=0.000125
  hostname: Nevsky
  iterations_since_restore: 1
  loss: 1.6470605243336072
  node_ip: 10.0.0.200
  pid: 19374
  should_checkpoint: true
  time_since_restore: 200.2153296470642
  time_this_iter_s: 200.2153296470642
  time_total_s: 200.2153296470642
  timestamp: 1609033519
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: b1812c74
  
== Status ==
Memory usage on this node: 7.4/31.3 GiB
Using AsyncHyperBand: num_stopped=13
Bracket: Iter 8.000: -1.2450059999119152 | Iter 4.000: -1.2927039276469836 | Iter 2.000: -1.595612144470215
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB ob

Result for DEFAULT_b1812c74:
  accuracy: 0.6548571428571428
  date: 2020-12-27_01-51-54
  done: false
  experiment_id: 62f6ab044cbc42f68b4c347ac471dd8a
  experiment_tag: 14_accum_iter=4,batch_size=16,is_amsgrad=False,lr=0.0019924,middle_fc=False,middle_fc_size=256,weight_decay=0.000125
  hostname: Nevsky
  iterations_since_restore: 3
  loss: 1.3104863860390403
  node_ip: 10.0.0.200
  pid: 19374
  should_checkpoint: true
  time_since_restore: 595.51123213768
  time_this_iter_s: 197.7282247543335
  time_total_s: 595.51123213768
  timestamp: 1609033914
  timesteps_since_restore: 0
  training_iteration: 3
  trial_id: b1812c74
  
== Status ==
Memory usage on this node: 7.4/31.3 GiB
Using AsyncHyperBand: num_stopped=13
Bracket: Iter 8.000: -1.2450059999119152 | Iter 4.000: -1.2927039276469836 | Iter 2.000: -1.5865291866389188
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/AI/Felix/kaggle-cassava/ray

Result for DEFAULT_b1812c74:
  accuracy: 0.6422857142857142
  date: 2020-12-27_01-58-29
  done: false
  experiment_id: 62f6ab044cbc42f68b4c347ac471dd8a
  experiment_tag: 14_accum_iter=4,batch_size=16,is_amsgrad=False,lr=0.0019924,middle_fc=False,middle_fc_size=256,weight_decay=0.000125
  hostname: Nevsky
  iterations_since_restore: 5
  loss: 1.0018747774037449
  node_ip: 10.0.0.200
  pid: 19374
  should_checkpoint: true
  time_since_restore: 990.8592267036438
  time_this_iter_s: 197.66626143455505
  time_total_s: 990.8592267036438
  timestamp: 1609034309
  timesteps_since_restore: 0
  training_iteration: 5
  trial_id: b1812c74
  
== Status ==
Memory usage on this node: 7.4/31.3 GiB
Using AsyncHyperBand: num_stopped=13
Bracket: Iter 8.000: -1.2450059999119152 | Iter 4.000: -1.2600499976765025 | Iter 2.000: -1.5865291866389188
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/AI/Felix/kaggle-cassav

Result for DEFAULT_b1812c74:
  accuracy: 0.6417142857142857
  date: 2020-12-27_02-05-05
  done: false
  experiment_id: 62f6ab044cbc42f68b4c347ac471dd8a
  experiment_tag: 14_accum_iter=4,batch_size=16,is_amsgrad=False,lr=0.0019924,middle_fc=False,middle_fc_size=256,weight_decay=0.000125
  hostname: Nevsky
  iterations_since_restore: 7
  loss: 1.003383907404813
  node_ip: 10.0.0.200
  pid: 19374
  should_checkpoint: true
  time_since_restore: 1386.184466600418
  time_this_iter_s: 197.47348165512085
  time_total_s: 1386.184466600418
  timestamp: 1609034705
  timesteps_since_restore: 0
  training_iteration: 7
  trial_id: b1812c74
  
== Status ==
Memory usage on this node: 7.4/31.3 GiB
Using AsyncHyperBand: num_stopped=13
Bracket: Iter 8.000: -1.2450059999119152 | Iter 4.000: -1.2600499976765025 | Iter 2.000: -1.5865291866389188
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/AI/Felix/kaggle-cassava

Result for DEFAULT_b1812c74:
  accuracy: 0.6365714285714286
  date: 2020-12-27_02-11-40
  done: false
  experiment_id: 62f6ab044cbc42f68b4c347ac471dd8a
  experiment_tag: 14_accum_iter=4,batch_size=16,is_amsgrad=False,lr=0.0019924,middle_fc=False,middle_fc_size=256,weight_decay=0.000125
  hostname: Nevsky
  iterations_since_restore: 9
  loss: 1.0187214071100408
  node_ip: 10.0.0.200
  pid: 19374
  should_checkpoint: true
  time_since_restore: 1781.527489900589
  time_this_iter_s: 197.47379970550537
  time_total_s: 1781.527489900589
  timestamp: 1609035100
  timesteps_since_restore: 0
  training_iteration: 9
  trial_id: b1812c74
  
== Status ==
Memory usage on this node: 7.4/31.3 GiB
Using AsyncHyperBand: num_stopped=13
Bracket: Iter 8.000: -1.2363299889998003 | Iter 4.000: -1.2600499976765025 | Iter 2.000: -1.5865291866389188
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/AI/Felix/kaggle-cassav

[INFO 12-27 02:14:57] ax.service.ax_client: Completed trial 13 with data: {'loss': (1.02, 0.0)}.


Result for DEFAULT_b1812c74:
  accuracy: 0.6411428571428571
  date: 2020-12-27_02-14-57
  done: true
  experiment_id: 62f6ab044cbc42f68b4c347ac471dd8a
  experiment_tag: 14_accum_iter=4,batch_size=16,is_amsgrad=False,lr=0.0019924,middle_fc=False,middle_fc_size=256,weight_decay=0.000125
  hostname: Nevsky
  iterations_since_restore: 10
  loss: 1.0153339396823535
  node_ip: 10.0.0.200
  pid: 19374
  should_checkpoint: true
  time_since_restore: 1978.9380266666412
  time_this_iter_s: 197.41053676605225
  time_total_s: 1978.9380266666412
  timestamp: 1609035297
  timesteps_since_restore: 0
  training_iteration: 10
  trial_id: b1812c74
  
== Status ==
Memory usage on this node: 7.4/31.3 GiB
Using AsyncHyperBand: num_stopped=14
Bracket: Iter 8.000: -1.2363299889998003 | Iter 4.000: -1.2600499976765025 | Iter 2.000: -1.5865291866389188
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/AI/Felix/kaggle-cas

[INFO 12-27 02:14:58] ax.service.ax_client: Generated new trial 15 with parameters {'lr': 0.01, 'accum_iter': 4, 'batch_size': 16, 'weight_decay': 0.0, 'middle_fc_size': 256, 'is_amsgrad': False, 'middle_fc': False}.


[2m[36m(pid=30843)[0m Epoch     0: adjusting learning rate of group 0 to 2.5102e-03.
Result for DEFAULT_b47f0044:
  accuracy: 0.20342857142857143
  date: 2020-12-27_02-18-20
  done: false
  experiment_id: 4f182e6930504d39a206914479561ab5
  experiment_tag: 15_accum_iter=8,batch_size=16,is_amsgrad=False,lr=0.0025102,middle_fc=False,middle_fc_size=512,weight_decay=0.0
  hostname: Nevsky
  iterations_since_restore: 1
  loss: 1.6029738946394487
  node_ip: 10.0.0.200
  pid: 30843
  should_checkpoint: true
  time_since_restore: 200.662348985672
  time_this_iter_s: 200.662348985672
  time_total_s: 200.662348985672
  timestamp: 1609035500
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: b47f0044
  
== Status ==
Memory usage on this node: 7.4/31.3 GiB
Using AsyncHyperBand: num_stopped=14
Bracket: Iter 8.000: -1.2363299889998003 | Iter 4.000: -1.2600499976765025 | Iter 2.000: -1.5865291866389188
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (

Result for DEFAULT_b47f0044:
  accuracy: 0.6851428571428572
  date: 2020-12-27_02-24-55
  done: false
  experiment_id: 4f182e6930504d39a206914479561ab5
  experiment_tag: 15_accum_iter=8,batch_size=16,is_amsgrad=False,lr=0.0025102,middle_fc=False,middle_fc_size=512,weight_decay=0.0
  hostname: Nevsky
  iterations_since_restore: 3
  loss: 1.0828923561356285
  node_ip: 10.0.0.200
  pid: 30843
  should_checkpoint: true
  time_since_restore: 596.0161716938019
  time_this_iter_s: 197.7636251449585
  time_total_s: 596.0161716938019
  timestamp: 1609035895
  timesteps_since_restore: 0
  training_iteration: 3
  trial_id: b47f0044
  
== Status ==
Memory usage on this node: 7.4/31.3 GiB
Using AsyncHyperBand: num_stopped=14
Bracket: Iter 8.000: -1.2363299889998003 | Iter 4.000: -1.2600499976765025 | Iter 2.000: -1.5774462288076228
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/AI/Felix/kaggle-cassava/ray-

Result for DEFAULT_b47f0044:
  accuracy: 0.6468571428571429
  date: 2020-12-27_02-31-31
  done: false
  experiment_id: 4f182e6930504d39a206914479561ab5
  experiment_tag: 15_accum_iter=8,batch_size=16,is_amsgrad=False,lr=0.0025102,middle_fc=False,middle_fc_size=512,weight_decay=0.0
  hostname: Nevsky
  iterations_since_restore: 5
  loss: 1.138025782325051
  node_ip: 10.0.0.200
  pid: 30843
  should_checkpoint: true
  time_since_restore: 991.0964517593384
  time_this_iter_s: 197.43246126174927
  time_total_s: 991.0964517593384
  timestamp: 1609036291
  timesteps_since_restore: 0
  training_iteration: 5
  trial_id: b47f0044
  
== Status ==
Memory usage on this node: 7.4/31.3 GiB
Using AsyncHyperBand: num_stopped=14
Bracket: Iter 8.000: -1.2363299889998003 | Iter 4.000: -1.2448209036480296 | Iter 2.000: -1.5774462288076228
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/AI/Felix/kaggle-cassava/ray-

Result for DEFAULT_b47f0044:
  accuracy: 0.6771428571428572
  date: 2020-12-27_02-38-06
  done: false
  experiment_id: 4f182e6930504d39a206914479561ab5
  experiment_tag: 15_accum_iter=8,batch_size=16,is_amsgrad=False,lr=0.0025102,middle_fc=False,middle_fc_size=512,weight_decay=0.0
  hostname: Nevsky
  iterations_since_restore: 7
  loss: 1.094308497688987
  node_ip: 10.0.0.200
  pid: 30843
  should_checkpoint: true
  time_since_restore: 1386.3219509124756
  time_this_iter_s: 197.62376737594604
  time_total_s: 1386.3219509124756
  timestamp: 1609036686
  timesteps_since_restore: 0
  training_iteration: 7
  trial_id: b47f0044
  
== Status ==
Memory usage on this node: 7.4/31.3 GiB
Using AsyncHyperBand: num_stopped=14
Bracket: Iter 8.000: -1.2363299889998003 | Iter 4.000: -1.2448209036480296 | Iter 2.000: -1.5774462288076228
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/AI/Felix/kaggle-cassava/ra

Result for DEFAULT_b47f0044:
  accuracy: 0.6834285714285714
  date: 2020-12-27_02-44-41
  done: false
  experiment_id: 4f182e6930504d39a206914479561ab5
  experiment_tag: 15_accum_iter=8,batch_size=16,is_amsgrad=False,lr=0.0025102,middle_fc=False,middle_fc_size=512,weight_decay=0.0
  hostname: Nevsky
  iterations_since_restore: 9
  loss: 1.0787187814712524
  node_ip: 10.0.0.200
  pid: 30843
  should_checkpoint: true
  time_since_restore: 1781.2443425655365
  time_this_iter_s: 197.48790740966797
  time_total_s: 1781.2443425655365
  timestamp: 1609037081
  timesteps_since_restore: 0
  training_iteration: 9
  trial_id: b47f0044
  
== Status ==
Memory usage on this node: 7.4/31.3 GiB
Using AsyncHyperBand: num_stopped=14
Bracket: Iter 8.000: -1.1740320362827994 | Iter 4.000: -1.2448209036480296 | Iter 2.000: -1.5774462288076228
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/AI/Felix/kaggle-cassava/r

[INFO 12-27 02:47:58] ax.service.ax_client: Completed trial 14 with data: {'loss': (nan, 0.0)}.


Result for DEFAULT_b47f0044:
  accuracy: 0.04742857142857143
  date: 2020-12-27_02-47-58
  done: true
  experiment_id: 4f182e6930504d39a206914479561ab5
  experiment_tag: 15_accum_iter=8,batch_size=16,is_amsgrad=False,lr=0.0025102,middle_fc=False,middle_fc_size=512,weight_decay=0.0
  hostname: Nevsky
  iterations_since_restore: 10
  loss: .nan
  node_ip: 10.0.0.200
  pid: 30843
  should_checkpoint: true
  time_since_restore: 1978.749592781067
  time_this_iter_s: 197.5052502155304
  time_total_s: 1978.749592781067
  timestamp: 1609037278
  timesteps_since_restore: 0
  training_iteration: 10
  trial_id: b47f0044
  
== Status ==
Memory usage on this node: 7.4/31.3 GiB
Using AsyncHyperBand: num_stopped=15
Bracket: Iter 8.000: -1.1740320362827994 | Iter 4.000: -1.2448209036480296 | Iter 2.000: -1.5774462288076228
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/AI/Felix/kaggle-cassava/ray-results/ax2


[INFO 12-27 02:47:59] ax.service.ax_client: Generated new trial 16 with parameters {'lr': 0.0, 'accum_iter': 8, 'batch_size': 8, 'weight_decay': 0.0, 'middle_fc_size': 256, 'is_amsgrad': False, 'middle_fc': False}.


[2m[36m(pid=10348)[0m Epoch     0: adjusting learning rate of group 0 to 8.0861e-03.
Result for DEFAULT_5135a650:
  accuracy: 0.11028571428571429
  date: 2020-12-27_02-51-22
  done: false
  experiment_id: 116aa590ae1a418ab7886c7c3df07ef8
  experiment_tag: 16_accum_iter=4,batch_size=16,is_amsgrad=False,lr=0.0080861,middle_fc=False,middle_fc_size=256,weight_decay=0.0005
  hostname: Nevsky
  iterations_since_restore: 1
  loss: 1.6356067332354458
  node_ip: 10.0.0.200
  pid: 10348
  should_checkpoint: true
  time_since_restore: 201.60954880714417
  time_this_iter_s: 201.60954880714417
  time_total_s: 201.60954880714417
  timestamp: 1609037482
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 5135a650
  
== Status ==
Memory usage on this node: 7.4/31.3 GiB
Using AsyncHyperBand: num_stopped=15
Bracket: Iter 8.000: -1.1740320362827994 | Iter 4.000: -1.2448209036480296 | Iter 2.000: -1.5774462288076228
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB 

[INFO 12-27 02:54:41] ax.service.ax_client: Completed trial 15 with data: {'loss': (1.63, 0.0)}.


Result for DEFAULT_5135a650:
  accuracy: 0.11314285714285714
  date: 2020-12-27_02-54-41
  done: true
  experiment_id: 116aa590ae1a418ab7886c7c3df07ef8
  experiment_tag: 16_accum_iter=4,batch_size=16,is_amsgrad=False,lr=0.0080861,middle_fc=False,middle_fc_size=256,weight_decay=0.0005
  hostname: Nevsky
  iterations_since_restore: 2
  loss: 1.6320970340208574
  node_ip: 10.0.0.200
  pid: 10348
  should_checkpoint: true
  time_since_restore: 400.20594358444214
  time_this_iter_s: 198.59639477729797
  time_total_s: 400.20594358444214
  timestamp: 1609037681
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: 5135a650
  
== Status ==
Memory usage on this node: 7.4/31.3 GiB
Using AsyncHyperBand: num_stopped=16
Bracket: Iter 8.000: -1.1740320362827994 | Iter 4.000: -1.2448209036480296 | Iter 2.000: -1.5865291866389188
Resources requested: 8/8 CPUs, 1/1 GPUs, 0.0/17.53 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/AI/Felix/kaggle-cassav

[INFO 12-27 02:54:41] ax.service.ax_client: Generated new trial 17 with parameters {'lr': 0.01, 'accum_iter': 8, 'batch_size': 8, 'weight_decay': 0.0, 'middle_fc_size': 512, 'is_amsgrad': False, 'middle_fc': False}.


[2m[36m(pid=12749)[0m Epoch     0: adjusting learning rate of group 0 to 2.5192e-03.
