In [1]:
import errno
import glob
import json
import os
import re
import shutil
from types import SimpleNamespace
import cv2
import torch
import warnings
from lightning_objects import LightningModel
warnings.filterwarnings('ignore')
from config import Configuration
import pandas as pd
from common_utils import stratify_split, make_holdout_df, set_seeds
from train_manager import TrainManager

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
def main(experiment_name: str, debug, resume=False,
         finetune=False, freeze_bn=True, freeze_feature_extractor=False):
    """Prepare (or restore) an experiment and run it through ``TrainManager``.

    The ``resume`` / ``finetune`` flags select one of three setups:

    * neither set -- a brand-new experiment: the experiment directory is
      created, ``train.csv`` is split into a holdout frame and stratified
      folds, and both frames plus the configuration are written to disk.
    * ``finetune`` only -- the saved config and frames are reloaded, existing
      fold checkpoints are required, and the whole experiment directory is
      cloned into ``<experiment_dir>_tune``, which becomes the working
      directory.
    * ``resume`` set (with or without ``finetune``) -- the saved config and
      frames are reloaded and the checkpoint to restart from is looked up.

    Args:
        experiment_name: Folder name under ``trained-models/``.
        debug: Stored on ``config.debug`` before training starts.
        resume: Continue a previously started experiment.
        finetune: Run a tuning pass on top of existing fold checkpoints.
        freeze_bn: Forwarded unchanged to ``TrainManager``.
        freeze_feature_extractor: Forwarded unchanged to ``TrainManager``.

    Raises:
        FileExistsError: If the directory for a new experiment (or for the
            ``_tune`` copy) already exists.
        AssertionError: If finetuning is requested but no fold checkpoints
            are found, or if either data frame failed to load.
    """
    experiment_dir = os.path.abspath(f'trained-models/{experiment_name}')
    print('Experiment directory', experiment_dir)

    try:
        # -------- SETUP --------
        checkpoint_params = None
        finetune_model_fnames = None
        folds_df, holdout_df = None, None

        if not resume and not finetune:  # totally new experiment
            make_experiment_directory(experiment_dir)
            config = Configuration()
            config.debug = debug
            set_seeds(config.seed)

            # -------- LOAD DATA FROM TRAIN FILE --------
            data_df = pd.read_csv(config.data_dir + '/train.csv', engine='python')
            data_df, holdout_df = make_holdout_df(data_df, seed=config.seed)
            folds_df = stratify_split(data_df, config.fold_num, config.seed, config.target_col)

            # -------- SAVE FILES (for experiment state) --------
            folds_df.to_csv(experiment_dir + '/folds.csv', index=False)
            # save holdout to a csv file for final inference (so we don't run inference on training examples)
            holdout_df.to_csv(experiment_dir + '/holdout.csv', index=False)
            with open(experiment_dir + '/experiment_config.json', 'w') as f:
                json.dump(config.__dict__, f)
        else:
            # At least one of resume / finetune is set: LOAD DATA FROM SAVED FILES
            with open(experiment_dir + '/experiment_config.json', 'r') as f:
                # object_hook turns every JSON object into a SimpleNamespace so
                # the config is read with attribute access, like Configuration.
                config = json.load(f, object_hook=lambda d: SimpleNamespace(**d))
                set_seeds(config.seed)
                config.debug = debug

            folds_df = pd.read_csv(experiment_dir + '/folds.csv', engine='python')
            holdout_df = pd.read_csv(experiment_dir + '/holdout.csv', engine='python')

            if finetune and not resume:
                print('finetuning...')
                # verify there are checkpoints to fine tune
                finetune_model_fnames = glob.glob(experiment_dir + '/*fold*.ckpt')
                assert len(finetune_model_fnames) > 0
                finetune_model_fnames.sort()

                # make new directory for tuning experiment with files from training run 1
                tune_dir = experiment_dir + '_tune'
                make_experiment_directory(tune_dir)
                for entry in os.listdir(experiment_dir):
                    print(f"copying {entry} to {tune_dir}")
                    source_path = os.path.join(experiment_dir, entry)
                    if os.path.isdir(source_path):
                        # copy2 only handles regular files; a sub-directory
                        # would abort the copy half-way, so copy it as a tree.
                        shutil.copytree(source_path, os.path.join(tune_dir, entry))
                    else:
                        shutil.copy2(source_path, tune_dir)
                experiment_dir = tune_dir
                experiment_name += '_tune'
            else:
                print('resuming from last checkpoint...')
                checkpoint_params = get_checkpoint_params(experiment_dir, resume, config.model_arch)

        assert holdout_df is not None, 'holdout_df is None'
        assert folds_df is not None, 'folds_df is None'

        # cv2 multithreading seems to go into deadlock with PyTorch data loaders
        if config.num_workers > 0:
            cv2.setNumThreads(0)

        trainer = TrainManager(experiment_name=experiment_name, experiment_dir=experiment_dir,
                               folds_df=folds_df, holdout_df=holdout_df,
                               checkpoint_params=checkpoint_params, config=config,
                               finetune=finetune, freeze_bn=freeze_bn,
                               freeze_feature_extractor=freeze_feature_extractor,
                               finetune_model_fnames=finetune_model_fnames)
        trainer.run()
    finally:
        # Runs on success, failure and interruption alike.
        torch.cuda.empty_cache()

def make_experiment_directory(name):
    """Create the directory ``name`` (including missing parents).

    If the directory is already present a hint is printed and the
    ``FileExistsError`` is propagated to the caller when its errno is
    ``EEXIST``; nothing is returned on success.
    """
    try:
        os.makedirs(name)
    except FileExistsError as exists_error:
        print('Experiment already exists. Be sure to resume training appropriately or start a new experiment.')
        is_eexist = exists_error.errno == errno.EEXIST
        if not is_eexist:
            return
        # Re-raise the active exception unchanged so callers see the original error.
        raise


def get_checkpoint_params(basename, resume, model_arch):
    """Find the checkpoint an interrupted experiment should restart from.

    Args:
        basename: Experiment directory that is searched for ``*fold*.ckpt``
            files.
        resume: When falsy no lookup is done and ``None`` is returned.
        model_arch: Accepted for interface compatibility; not used here.

    Returns:
        ``None`` when ``resume`` is falsy. Otherwise a dict with

        * ``'restart_from'`` -- the highest fold number found in the
          checkpoint file names (``0`` when there are none), and
        * ``'checkpoint_file_path'`` -- a checkpoint belonging to that fold,
          or ``None`` when the directory holds no fold checkpoints. If the
          fold has several checkpoints, the lexicographically last path is
          used.
    """
    if not resume:
        return None

    fold_pattern = re.compile(r'fold(\d+)')
    fold_checkpoints = []
    for path in glob.glob(basename + '/*fold*.ckpt'):
        # Inspect the file name only, so a "foldN" fragment in a parent
        # directory name cannot be mistaken for the fold number.
        found = fold_pattern.search(os.path.basename(path))
        if found is None:
            # Matches the glob but carries no fold number; nothing to resume from.
            continue
        # Store the fold as an int: comparing the digit strings would rank
        # fold "9" above fold "10".
        fold_checkpoints.append((int(found.group(1)), path))

    checkpoint_params = {}
    if fold_checkpoints:
        # Tuple comparison picks the highest fold first, then the last path
        # within that fold, so the path always belongs to `restart_from`.
        most_recent_fold, checkpoint_path = max(fold_checkpoints)
    else:
        most_recent_fold, checkpoint_path = 0, None

    checkpoint_params['restart_from'] = most_recent_fold
    checkpoint_params['checkpoint_file_path'] = checkpoint_path
    return checkpoint_params

In [4]:
if __name__ == '__main__':
    # Launch the training run; a keyboard/kernel interrupt ends it quietly.
    try:
        debug = False
        print('Running in debug mode:', debug)
        run_options = dict(
            experiment_name='adabound_onecycle_bnf_bitemp_smooth_weighted_t1=0.3_t2=1.0',
            debug=debug,
            resume=True,
            finetune=False,
            freeze_bn=True,
            freeze_feature_extractor=False,
        )
        main(**run_options)
    except KeyboardInterrupt:
        pass

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Using native 16bit precision.


Running in debug mode: False
Experiment directory /opt/favordata/AI/Felix/kaggle-cassava/trained-models/adabound_onecycle_bnf_bitemp_smooth_weighted_t1=0.3_t2=1.0
resuming from last checkpoint...
folds_df len 18187, holdout_df len 3210
Training fold 4
Class sample counts [ 758 1470 1623 8934 1765]
After class sample counts [2274 2940 3732 8934 4765]



  | Name           | Type           | Params
--------------------------------------------------
0 | valid_accuracy | Accuracy       | 0     
1 | test_accuracy  | Accuracy       | 0     
2 | criterion      | BiTemperedLoss | 0     
3 | model          | EfficientNet   | 17.6 M
--------------------------------------------------
17.4 M    Trainable params
125 K     Non-trainable params
17.6 M    Total params


Epoch 0:  94%|█████████▍| 910/967 [04:40<00:17,  3.24it/s, loss=0.438, v_num=2, val_loss=0.487, val_acc=0.25, train_loss=0.443]
Validating: 0it [00:00, ?it/s][A
Epoch 0:  94%|█████████▍| 912/967 [04:42<00:17,  3.23it/s, loss=0.438, v_num=2, val_loss=0.487, val_acc=0.25, train_loss=0.443]
Validating:   4%|▎         | 2/57 [00:01<00:44,  1.23it/s][A
Epoch 0:  95%|█████████▍| 914/967 [04:42<00:16,  3.23it/s, loss=0.438, v_num=2, val_loss=0.487, val_acc=0.25, train_loss=0.443]
Validating:   7%|▋         | 4/57 [00:02<00:24,  2.20it/s][A
Epoch 0:  95%|█████████▍| 916/967 [04:43<00:15,  3.23it/s, loss=0.438, v_num=2, val_loss=0.487, val_acc=0.25, train_loss=0.443]
Validating:  11%|█         | 6/57 [00:03<00:18,  2.77it/s][A
Epoch 0:  95%|█████████▍| 918/967 [04:44<00:15,  3.23it/s, loss=0.438, v_num=2, val_loss=0.487, val_acc=0.25, train_loss=0.443]
Validating:  14%|█▍        | 8/57 [00:03<00:15,  3.07it/s][A
Epoch 0:  95%|█████████▌| 920/967 [04:44<00:14,  3.23it/s, loss=0.438, v_num=2

Epoch 0, global step 227: val_loss reached 0.35271 (best 0.35271), saving model to "/opt/favordata/AI/Felix/kaggle-cassava/trained-models/adabound_onecycle_bnf_bitemp_smooth_weighted_t1=0.3_t2=1.0/tf_efficientnet_b4_ns_bitempered_smooth=0.05_val_loss=0.353_val_acc=0.640_fold4.ckpt" as top 1


Epoch 0: 100%|██████████| 967/967 [05:01<00:00,  3.21it/s, loss=0.438, v_num=2, val_loss=0.353, val_acc=0.64, train_loss=0.494]
Epoch 1:  94%|█████████▍| 910/967 [04:36<00:17,  3.29it/s, loss=0.4, v_num=2, val_loss=0.353, val_acc=0.64, train_loss=0.36]   
Validating: 0it [00:00, ?it/s][A
Epoch 1:  94%|█████████▍| 912/967 [04:38<00:16,  3.28it/s, loss=0.4, v_num=2, val_loss=0.353, val_acc=0.64, train_loss=0.36]
Validating:   4%|▎         | 2/57 [00:02<00:56,  1.03s/it][A
Epoch 1:  95%|█████████▍| 914/967 [04:38<00:16,  3.28it/s, loss=0.4, v_num=2, val_loss=0.353, val_acc=0.64, train_loss=0.36]
Validating:   7%|▋         | 4/57 [00:02<00:28,  1.86it/s][A
Epoch 1:  95%|█████████▍| 916/967 [04:39<00:15,  3.28it/s, loss=0.4, v_num=2, val_loss=0.353, val_acc=0.64, train_loss=0.36]
Validating:  11%|█         | 6/57 [00:03<00:19,  2.56it/s][A
Epoch 1:  95%|█████████▍| 918/967 [04:40<00:14,  3.28it/s, loss=0.4, v_num=2, val_loss=0.353, val_acc=0.64, train_loss=0.36]
Validating:  14%|█▍     

Epoch 1, global step 455: val_loss reached 0.30827 (best 0.30827), saving model to "/opt/favordata/AI/Felix/kaggle-cassava/trained-models/adabound_onecycle_bnf_bitemp_smooth_weighted_t1=0.3_t2=1.0/tf_efficientnet_b4_ns_bitempered_smooth=0.05_val_loss=0.308_val_acc=0.674_fold4.ckpt" as top 1


Epoch 1: 100%|██████████| 967/967 [04:54<00:00,  3.28it/s, loss=0.4, v_num=2, val_loss=0.308, val_acc=0.674, train_loss=0.303]
Epoch 2:  94%|█████████▍| 910/967 [04:36<00:17,  3.29it/s, loss=0.374, v_num=2, val_loss=0.308, val_acc=0.674, train_loss=0.311]
Validating: 0it [00:00, ?it/s][A
Epoch 2:  94%|█████████▍| 912/967 [04:37<00:16,  3.28it/s, loss=0.374, v_num=2, val_loss=0.308, val_acc=0.674, train_loss=0.311]
Validating:   4%|▎         | 2/57 [00:01<00:46,  1.19it/s][A
Epoch 2:  95%|█████████▍| 914/967 [04:38<00:16,  3.28it/s, loss=0.374, v_num=2, val_loss=0.308, val_acc=0.674, train_loss=0.311]
Validating:   7%|▋         | 4/57 [00:02<00:25,  2.07it/s][A
Epoch 2:  95%|█████████▍| 916/967 [04:39<00:15,  3.28it/s, loss=0.374, v_num=2, val_loss=0.308, val_acc=0.674, train_loss=0.311]
Validating:  11%|█         | 6/57 [00:03<00:19,  2.56it/s][A
Epoch 2:  95%|█████████▍| 918/967 [04:39<00:14,  3.28it/s, loss=0.374, v_num=2, val_loss=0.308, val_acc=0.674, train_loss=0.311]
Validati

Epoch 2, global step 683: val_loss reached 0.28655 (best 0.28655), saving model to "/opt/favordata/AI/Felix/kaggle-cassava/trained-models/adabound_onecycle_bnf_bitemp_smooth_weighted_t1=0.3_t2=1.0/tf_efficientnet_b4_ns_bitempered_smooth=0.05_val_loss=0.287_val_acc=0.698_fold4.ckpt" as top 1


Epoch 2: 100%|██████████| 967/967 [04:54<00:00,  3.29it/s, loss=0.374, v_num=2, val_loss=0.287, val_acc=0.698, train_loss=0.371]
Epoch 3:  94%|█████████▍| 910/967 [04:36<00:17,  3.30it/s, loss=0.334, v_num=2, val_loss=0.287, val_acc=0.698, train_loss=0.289]
Validating: 0it [00:00, ?it/s][A
Epoch 3:  94%|█████████▍| 912/967 [04:38<00:16,  3.28it/s, loss=0.334, v_num=2, val_loss=0.287, val_acc=0.698, train_loss=0.289]
Validating:   4%|▎         | 2/57 [00:02<00:52,  1.04it/s][A
Epoch 3:  95%|█████████▍| 914/967 [04:38<00:16,  3.28it/s, loss=0.334, v_num=2, val_loss=0.287, val_acc=0.698, train_loss=0.289]
Validating:   7%|▋         | 4/57 [00:02<00:27,  1.90it/s][A
Epoch 3:  95%|█████████▍| 916/967 [04:39<00:15,  3.28it/s, loss=0.334, v_num=2, val_loss=0.287, val_acc=0.698, train_loss=0.289]
Validating:  11%|█         | 6/57 [00:03<00:21,  2.36it/s][A
Epoch 3:  95%|█████████▍| 918/967 [04:39<00:14,  3.28it/s, loss=0.334, v_num=2, val_loss=0.287, val_acc=0.698, train_loss=0.289]
Valida

Epoch 3, global step 911: val_loss reached 0.24331 (best 0.24331), saving model to "/opt/favordata/AI/Felix/kaggle-cassava/trained-models/adabound_onecycle_bnf_bitemp_smooth_weighted_t1=0.3_t2=1.0/tf_efficientnet_b4_ns_bitempered_smooth=0.05_val_loss=0.243_val_acc=0.743_fold4.ckpt" as top 1


Epoch 3: 100%|██████████| 967/967 [04:54<00:00,  3.28it/s, loss=0.334, v_num=2, val_loss=0.243, val_acc=0.743, train_loss=0.461]
Epoch 4:  94%|█████████▍| 910/967 [04:36<00:17,  3.29it/s, loss=0.313, v_num=2, val_loss=0.243, val_acc=0.743, train_loss=0.37] 
Validating: 0it [00:00, ?it/s][A
Epoch 4:  94%|█████████▍| 912/967 [04:38<00:16,  3.27it/s, loss=0.313, v_num=2, val_loss=0.243, val_acc=0.743, train_loss=0.37]
Validating:   4%|▎         | 2/57 [00:02<00:54,  1.01it/s][A
Epoch 4:  95%|█████████▍| 914/967 [04:39<00:16,  3.28it/s, loss=0.313, v_num=2, val_loss=0.243, val_acc=0.743, train_loss=0.37]
Validating:   7%|▋         | 4/57 [00:02<00:27,  1.93it/s][A
Epoch 4:  95%|█████████▍| 916/967 [04:39<00:15,  3.28it/s, loss=0.313, v_num=2, val_loss=0.243, val_acc=0.743, train_loss=0.37]
Validating:  11%|█         | 6/57 [00:03<00:19,  2.63it/s][A
Epoch 4:  95%|█████████▍| 918/967 [04:40<00:14,  3.28it/s, loss=0.313, v_num=2, val_loss=0.243, val_acc=0.743, train_loss=0.37]
Validating

Epoch 4, global step 1139: val_loss reached 0.21029 (best 0.21029), saving model to "/opt/favordata/AI/Felix/kaggle-cassava/trained-models/adabound_onecycle_bnf_bitemp_smooth_weighted_t1=0.3_t2=1.0/tf_efficientnet_b4_ns_bitempered_smooth=0.05_val_loss=0.210_val_acc=0.765_fold4.ckpt" as top 1


Epoch 4: 100%|██████████| 967/967 [04:54<00:00,  3.28it/s, loss=0.313, v_num=2, val_loss=0.21, val_acc=0.765, train_loss=0.478]
Epoch 5:  94%|█████████▍| 910/967 [04:36<00:17,  3.30it/s, loss=0.264, v_num=2, val_loss=0.21, val_acc=0.765, train_loss=0.313]
Validating: 0it [00:00, ?it/s][A
Epoch 5:  94%|█████████▍| 912/967 [04:37<00:16,  3.29it/s, loss=0.264, v_num=2, val_loss=0.21, val_acc=0.765, train_loss=0.313]
Validating:   4%|▎         | 2/57 [00:02<00:51,  1.07it/s][A
Epoch 5:  95%|█████████▍| 914/967 [04:38<00:16,  3.28it/s, loss=0.264, v_num=2, val_loss=0.21, val_acc=0.765, train_loss=0.313]
Validating:   7%|▋         | 4/57 [00:02<00:26,  2.03it/s][A
Epoch 5:  95%|█████████▍| 916/967 [04:38<00:15,  3.28it/s, loss=0.264, v_num=2, val_loss=0.21, val_acc=0.765, train_loss=0.313]
Validating:  11%|█         | 6/57 [00:03<00:19,  2.63it/s][A
Epoch 5:  95%|█████████▍| 918/967 [04:39<00:14,  3.28it/s, loss=0.264, v_num=2, val_loss=0.21, val_acc=0.765, train_loss=0.313]
Validating: 

Epoch 5, global step 1367: val_loss reached 0.20574 (best 0.20574), saving model to "/opt/favordata/AI/Felix/kaggle-cassava/trained-models/adabound_onecycle_bnf_bitemp_smooth_weighted_t1=0.3_t2=1.0/tf_efficientnet_b4_ns_bitempered_smooth=0.05_val_loss=0.206_val_acc=0.767_fold4.ckpt" as top 1


Epoch 5: 100%|██████████| 967/967 [04:54<00:00,  3.29it/s, loss=0.264, v_num=2, val_loss=0.206, val_acc=0.767, train_loss=0.121]
Epoch 6:  94%|█████████▍| 910/967 [04:36<00:17,  3.29it/s, loss=0.228, v_num=2, val_loss=0.206, val_acc=0.767, train_loss=0.309] 
Validating: 0it [00:00, ?it/s][A
Epoch 6:  94%|█████████▍| 912/967 [04:37<00:16,  3.28it/s, loss=0.228, v_num=2, val_loss=0.206, val_acc=0.767, train_loss=0.309]
Validating:   4%|▎         | 2/57 [00:01<00:47,  1.16it/s][A
Epoch 6:  95%|█████████▍| 914/967 [04:38<00:16,  3.28it/s, loss=0.228, v_num=2, val_loss=0.206, val_acc=0.767, train_loss=0.309]
Validating:   7%|▋         | 4/57 [00:02<00:25,  2.10it/s][A
Epoch 6:  95%|█████████▍| 916/967 [04:39<00:15,  3.28it/s, loss=0.228, v_num=2, val_loss=0.206, val_acc=0.767, train_loss=0.309]
Validating:  11%|█         | 6/57 [00:03<00:19,  2.64it/s][A
Epoch 6:  95%|█████████▍| 918/967 [04:39<00:14,  3.28it/s, loss=0.228, v_num=2, val_loss=0.206, val_acc=0.767, train_loss=0.309]
Valid

Epoch 6, global step 1595: val_loss reached 0.18003 (best 0.18003), saving model to "/opt/favordata/AI/Felix/kaggle-cassava/trained-models/adabound_onecycle_bnf_bitemp_smooth_weighted_t1=0.3_t2=1.0/tf_efficientnet_b4_ns_bitempered_smooth=0.05_val_loss=0.180_val_acc=0.795_fold4.ckpt" as top 1


Epoch 6: 100%|██████████| 967/967 [04:54<00:00,  3.28it/s, loss=0.228, v_num=2, val_loss=0.18, val_acc=0.795, train_loss=0.328] 
Epoch 7:  94%|█████████▍| 910/967 [04:35<00:17,  3.30it/s, loss=0.231, v_num=2, val_loss=0.18, val_acc=0.795, train_loss=0.307] 
Validating: 0it [00:00, ?it/s][A
Epoch 7:  94%|█████████▍| 912/967 [04:37<00:16,  3.28it/s, loss=0.231, v_num=2, val_loss=0.18, val_acc=0.795, train_loss=0.307]
Validating:   4%|▎         | 2/57 [00:02<00:53,  1.03it/s][A
Epoch 7:  95%|█████████▍| 914/967 [04:38<00:16,  3.28it/s, loss=0.231, v_num=2, val_loss=0.18, val_acc=0.795, train_loss=0.307]
Validating:   7%|▋         | 4/57 [00:02<00:28,  1.84it/s][A
Epoch 7:  95%|█████████▍| 916/967 [04:38<00:15,  3.28it/s, loss=0.231, v_num=2, val_loss=0.18, val_acc=0.795, train_loss=0.307]
Validating:  11%|█         | 6/57 [00:03<00:20,  2.55it/s][A
Epoch 7:  95%|█████████▍| 918/967 [04:39<00:14,  3.29it/s, loss=0.231, v_num=2, val_loss=0.18, val_acc=0.795, train_loss=0.307]
Validating

Epoch 7, global step 1823: val_loss reached 0.14638 (best 0.14638), saving model to "/opt/favordata/AI/Felix/kaggle-cassava/trained-models/adabound_onecycle_bnf_bitemp_smooth_weighted_t1=0.3_t2=1.0/tf_efficientnet_b4_ns_bitempered_smooth=0.05_val_loss=0.146_val_acc=0.834_fold4.ckpt" as top 1


Epoch 7: 100%|██████████| 967/967 [04:54<00:00,  3.29it/s, loss=0.231, v_num=2, val_loss=0.146, val_acc=0.834, train_loss=0.389]
Epoch 8:  94%|█████████▍| 910/967 [04:35<00:17,  3.30it/s, loss=0.201, v_num=2, val_loss=0.146, val_acc=0.834, train_loss=0.276] 
Validating: 0it [00:00, ?it/s][A
Epoch 8:  94%|█████████▍| 912/967 [04:37<00:16,  3.28it/s, loss=0.201, v_num=2, val_loss=0.146, val_acc=0.834, train_loss=0.276]
Validating:   4%|▎         | 2/57 [00:02<00:51,  1.06it/s][A
Epoch 8:  95%|█████████▍| 914/967 [04:38<00:16,  3.28it/s, loss=0.201, v_num=2, val_loss=0.146, val_acc=0.834, train_loss=0.276]
Validating:   7%|▋         | 4/57 [00:02<00:26,  1.97it/s][A
Epoch 8:  95%|█████████▍| 916/967 [04:38<00:15,  3.28it/s, loss=0.201, v_num=2, val_loss=0.146, val_acc=0.834, train_loss=0.276]
Validating:  11%|█         | 6/57 [00:03<00:19,  2.65it/s][A
Epoch 8:  95%|█████████▍| 918/967 [04:39<00:14,  3.28it/s, loss=0.201, v_num=2, val_loss=0.146, val_acc=0.834, train_loss=0.276]
Valid

Epoch 8, global step 2051: val_loss reached 0.14062 (best 0.14062), saving model to "/opt/favordata/AI/Felix/kaggle-cassava/trained-models/adabound_onecycle_bnf_bitemp_smooth_weighted_t1=0.3_t2=1.0/tf_efficientnet_b4_ns_bitempered_smooth=0.05_val_loss=0.141_val_acc=0.842_fold4.ckpt" as top 1


Epoch 8: 100%|██████████| 967/967 [04:54<00:00,  3.29it/s, loss=0.201, v_num=2, val_loss=0.141, val_acc=0.842, train_loss=0.0346]
Epoch 9:  94%|█████████▍| 910/967 [04:35<00:17,  3.30it/s, loss=0.178, v_num=2, val_loss=0.141, val_acc=0.842, train_loss=0.212] 
Validating: 0it [00:00, ?it/s][A
Epoch 9:  94%|█████████▍| 912/967 [04:37<00:16,  3.29it/s, loss=0.178, v_num=2, val_loss=0.141, val_acc=0.842, train_loss=0.212]
Validating:   4%|▎         | 2/57 [00:01<00:44,  1.25it/s][A
Epoch 9:  95%|█████████▍| 914/967 [04:37<00:16,  3.29it/s, loss=0.178, v_num=2, val_loss=0.141, val_acc=0.842, train_loss=0.212]
Validating:   7%|▋         | 4/57 [00:02<00:23,  2.22it/s][A
Epoch 9:  95%|█████████▍| 916/967 [04:38<00:15,  3.29it/s, loss=0.178, v_num=2, val_loss=0.141, val_acc=0.842, train_loss=0.212]
Validating:  11%|█         | 6/57 [00:02<00:18,  2.77it/s][A
Epoch 9:  95%|█████████▍| 918/967 [04:39<00:14,  3.29it/s, loss=0.178, v_num=2, val_loss=0.141, val_acc=0.842, train_loss=0.212]
Vali

Epoch 9, global step 2279: val_loss reached 0.13272 (best 0.13272), saving model to "/opt/favordata/AI/Felix/kaggle-cassava/trained-models/adabound_onecycle_bnf_bitemp_smooth_weighted_t1=0.3_t2=1.0/tf_efficientnet_b4_ns_bitempered_smooth=0.05_val_loss=0.133_val_acc=0.846_fold4.ckpt" as top 1


Epoch 9: 100%|██████████| 967/967 [04:53<00:00,  3.29it/s, loss=0.178, v_num=2, val_loss=0.133, val_acc=0.846, train_loss=0.341]
Epoch 10:  94%|█████████▍| 910/967 [04:35<00:17,  3.30it/s, loss=0.177, v_num=2, val_loss=0.133, val_acc=0.846, train_loss=0.229] 
Validating: 0it [00:00, ?it/s][A
Epoch 10:  94%|█████████▍| 912/967 [04:37<00:16,  3.28it/s, loss=0.177, v_num=2, val_loss=0.133, val_acc=0.846, train_loss=0.229]
Validating:   4%|▎         | 2/57 [00:02<00:52,  1.06it/s][A
Epoch 10:  95%|█████████▍| 914/967 [04:38<00:16,  3.29it/s, loss=0.177, v_num=2, val_loss=0.133, val_acc=0.846, train_loss=0.229]
Validating:   7%|▋         | 4/57 [00:02<00:26,  1.99it/s][A
Epoch 10:  95%|█████████▍| 916/967 [04:38<00:15,  3.29it/s, loss=0.177, v_num=2, val_loss=0.133, val_acc=0.846, train_loss=0.229]
Validating:  11%|█         | 6/57 [00:03<00:19,  2.64it/s][A
Epoch 10:  95%|█████████▍| 918/967 [04:39<00:14,  3.29it/s, loss=0.177, v_num=2, val_loss=0.133, val_acc=0.846, train_loss=0.229]


Epoch 10, global step 2507: val_loss reached 0.11765 (best 0.11765), saving model to "/opt/favordata/AI/Felix/kaggle-cassava/trained-models/adabound_onecycle_bnf_bitemp_smooth_weighted_t1=0.3_t2=1.0/tf_efficientnet_b4_ns_bitempered_smooth=0.05_val_loss=0.118_val_acc=0.868_fold4.ckpt" as top 1


Epoch 10: 100%|██████████| 967/967 [04:53<00:00,  3.29it/s, loss=0.177, v_num=2, val_loss=0.118, val_acc=0.868, train_loss=0.229]
Epoch 11:  94%|█████████▍| 910/967 [04:36<00:17,  3.29it/s, loss=0.176, v_num=2, val_loss=0.118, val_acc=0.868, train_loss=0.4]    
Validating: 0it [00:00, ?it/s][A
Epoch 11:  94%|█████████▍| 912/967 [04:37<00:16,  3.28it/s, loss=0.176, v_num=2, val_loss=0.118, val_acc=0.868, train_loss=0.4]
Validating:   4%|▎         | 2/57 [00:01<00:48,  1.13it/s][A
Epoch 11:  95%|█████████▍| 914/967 [04:38<00:16,  3.28it/s, loss=0.176, v_num=2, val_loss=0.118, val_acc=0.868, train_loss=0.4]
Validating:   7%|▋         | 4/57 [00:02<00:25,  2.09it/s][A
Epoch 11:  95%|█████████▍| 916/967 [04:39<00:15,  3.28it/s, loss=0.176, v_num=2, val_loss=0.118, val_acc=0.868, train_loss=0.4]
Validating:  11%|█         | 6/57 [00:03<00:18,  2.70it/s][A
Epoch 11:  95%|█████████▍| 918/967 [04:39<00:14,  3.28it/s, loss=0.176, v_num=2, val_loss=0.118, val_acc=0.868, train_loss=0.4]
Valida

Epoch 11, global step 2735: val_loss reached 0.11669 (best 0.11669), saving model to "/opt/favordata/AI/Felix/kaggle-cassava/trained-models/adabound_onecycle_bnf_bitemp_smooth_weighted_t1=0.3_t2=1.0/tf_efficientnet_b4_ns_bitempered_smooth=0.05_val_loss=0.117_val_acc=0.864_fold4.ckpt" as top 1


Epoch 11: 100%|██████████| 967/967 [04:54<00:00,  3.28it/s, loss=0.176, v_num=2, val_loss=0.117, val_acc=0.864, train_loss=0.118]
Epoch 12:  94%|█████████▍| 910/967 [04:36<00:17,  3.29it/s, loss=0.162, v_num=2, val_loss=0.117, val_acc=0.864, train_loss=0.147] 
Validating: 0it [00:00, ?it/s][A
Epoch 12:  94%|█████████▍| 912/967 [04:38<00:16,  3.28it/s, loss=0.162, v_num=2, val_loss=0.117, val_acc=0.864, train_loss=0.147]
Validating:   4%|▎         | 2/57 [00:02<00:50,  1.09it/s][A
Epoch 12:  95%|█████████▍| 914/967 [04:38<00:16,  3.28it/s, loss=0.162, v_num=2, val_loss=0.117, val_acc=0.864, train_loss=0.147]
Validating:   7%|▋         | 4/57 [00:02<00:26,  2.03it/s][A
Epoch 12:  95%|█████████▍| 916/967 [04:39<00:15,  3.28it/s, loss=0.162, v_num=2, val_loss=0.117, val_acc=0.864, train_loss=0.147]
Validating:  11%|█         | 6/57 [00:03<00:18,  2.69it/s][A
Epoch 12:  95%|█████████▍| 918/967 [04:39<00:14,  3.28it/s, loss=0.162, v_num=2, val_loss=0.117, val_acc=0.864, train_loss=0.147]

Epoch 12, global step 2963: val_loss reached 0.11525 (best 0.11525), saving model to "/opt/favordata/AI/Felix/kaggle-cassava/trained-models/adabound_onecycle_bnf_bitemp_smooth_weighted_t1=0.3_t2=1.0/tf_efficientnet_b4_ns_bitempered_smooth=0.05_val_loss=0.115_val_acc=0.873_fold4.ckpt" as top 1


Epoch 12: 100%|██████████| 967/967 [04:54<00:00,  3.28it/s, loss=0.162, v_num=2, val_loss=0.115, val_acc=0.873, train_loss=0.585]
Epoch 13:  94%|█████████▍| 910/967 [04:36<00:17,  3.29it/s, loss=0.171, v_num=2, val_loss=0.115, val_acc=0.873, train_loss=0.213] 
Validating: 0it [00:00, ?it/s][A
Epoch 13:  94%|█████████▍| 912/967 [04:38<00:16,  3.28it/s, loss=0.171, v_num=2, val_loss=0.115, val_acc=0.873, train_loss=0.213]
Validating:   4%|▎         | 2/57 [00:02<00:48,  1.14it/s][A
Epoch 13:  95%|█████████▍| 914/967 [04:38<00:16,  3.28it/s, loss=0.171, v_num=2, val_loss=0.115, val_acc=0.873, train_loss=0.213]
Validating:   7%|▋         | 4/57 [00:02<00:26,  2.03it/s][A
Epoch 13:  95%|█████████▍| 916/967 [04:39<00:15,  3.28it/s, loss=0.171, v_num=2, val_loss=0.115, val_acc=0.873, train_loss=0.213]
Validating:  11%|█         | 6/57 [00:03<00:19,  2.66it/s][A
Epoch 13:  95%|█████████▍| 918/967 [04:39<00:14,  3.28it/s, loss=0.171, v_num=2, val_loss=0.115, val_acc=0.873, train_loss=0.213]

Epoch 13, global step 3191: val_loss reached 0.10947 (best 0.10947), saving model to "/opt/favordata/AI/Felix/kaggle-cassava/trained-models/adabound_onecycle_bnf_bitemp_smooth_weighted_t1=0.3_t2=1.0/tf_efficientnet_b4_ns_bitempered_smooth=0.05_val_loss=0.109_val_acc=0.883_fold4.ckpt" as top 1


Epoch 13: 100%|██████████| 967/967 [04:54<00:00,  3.28it/s, loss=0.171, v_num=2, val_loss=0.109, val_acc=0.883, train_loss=0.079]
Epoch 14:  94%|█████████▍| 910/967 [04:36<00:17,  3.29it/s, loss=0.159, v_num=2, val_loss=0.109, val_acc=0.883, train_loss=0.162] 
Validating: 0it [00:00, ?it/s][A
Epoch 14:  94%|█████████▍| 912/967 [04:38<00:16,  3.28it/s, loss=0.159, v_num=2, val_loss=0.109, val_acc=0.883, train_loss=0.162]
Validating:   4%|▎         | 2/57 [00:01<00:46,  1.18it/s][A
Epoch 14:  95%|█████████▍| 914/967 [04:38<00:16,  3.28it/s, loss=0.159, v_num=2, val_loss=0.109, val_acc=0.883, train_loss=0.162]
Validating:   7%|▋         | 4/57 [00:02<00:25,  2.11it/s][A
Epoch 14:  95%|█████████▍| 916/967 [04:39<00:15,  3.28it/s, loss=0.159, v_num=2, val_loss=0.109, val_acc=0.883, train_loss=0.162]
Validating:  11%|█         | 6/57 [00:03<00:19,  2.63it/s][A
Epoch 14:  95%|█████████▍| 918/967 [04:39<00:14,  3.28it/s, loss=0.159, v_num=2, val_loss=0.109, val_acc=0.883, train_loss=0.162]

Epoch 14, step 3419: val_loss was not in top 1


Epoch 14: 100%|██████████| 967/967 [04:54<00:00,  3.28it/s, loss=0.159, v_num=2, val_loss=0.117, val_acc=0.869, train_loss=0.176]
Epoch 15:  94%|█████████▍| 910/967 [04:36<00:17,  3.29it/s, loss=0.155, v_num=2, val_loss=0.117, val_acc=0.869, train_loss=0.144] 
Validating: 0it [00:00, ?it/s][A
Epoch 15:  94%|█████████▍| 912/967 [04:38<00:16,  3.28it/s, loss=0.155, v_num=2, val_loss=0.117, val_acc=0.869, train_loss=0.144]
Validating:   4%|▎         | 2/57 [00:01<00:46,  1.18it/s][A
Epoch 15:  95%|█████████▍| 914/967 [04:38<00:16,  3.28it/s, loss=0.155, v_num=2, val_loss=0.117, val_acc=0.869, train_loss=0.144]
Validating:   7%|▋         | 4/57 [00:02<00:24,  2.13it/s][A
Epoch 15:  95%|█████████▍| 916/967 [04:39<00:15,  3.28it/s, loss=0.155, v_num=2, val_loss=0.117, val_acc=0.869, train_loss=0.144]
Validating:  11%|█         | 6/57 [00:03<00:19,  2.65it/s][A
Epoch 15:  95%|█████████▍| 918/967 [04:39<00:14,  3.28it/s, loss=0.155, v_num=2, val_loss=0.117, val_acc=0.869, train_loss=0.144]

Epoch 15, step 3647: val_loss was not in top 1


Epoch 15: 100%|██████████| 967/967 [04:54<00:00,  3.29it/s, loss=0.155, v_num=2, val_loss=0.122, val_acc=0.863, train_loss=0.139]
Epoch 16:  94%|█████████▍| 910/967 [04:36<00:17,  3.29it/s, loss=0.159, v_num=2, val_loss=0.122, val_acc=0.863, train_loss=0.144]  
Validating: 0it [00:00, ?it/s][A
Epoch 16:  94%|█████████▍| 912/967 [04:37<00:16,  3.28it/s, loss=0.159, v_num=2, val_loss=0.122, val_acc=0.863, train_loss=0.144]
Validating:   4%|▎         | 2/57 [00:01<00:45,  1.22it/s][A
Epoch 16:  95%|█████████▍| 914/967 [04:38<00:16,  3.28it/s, loss=0.159, v_num=2, val_loss=0.122, val_acc=0.863, train_loss=0.144]
Validating:   7%|▋         | 4/57 [00:02<00:24,  2.15it/s][A
Epoch 16:  95%|█████████▍| 916/967 [04:39<00:15,  3.28it/s, loss=0.159, v_num=2, val_loss=0.122, val_acc=0.863, train_loss=0.144]
Validating:  11%|█         | 6/57 [00:03<00:19,  2.68it/s][A
Epoch 16:  95%|█████████▍| 918/967 [04:39<00:14,  3.28it/s, loss=0.159, v_num=2, val_loss=0.122, val_acc=0.863, train_loss=0.144

Epoch 16, step 3875: val_loss was not in top 1


Epoch 16: 100%|██████████| 967/967 [04:54<00:00,  3.29it/s, loss=0.159, v_num=2, val_loss=0.123, val_acc=0.869, train_loss=0.195]
Epoch 17:  94%|█████████▍| 910/967 [04:36<00:17,  3.29it/s, loss=0.14, v_num=2, val_loss=0.123, val_acc=0.869, train_loss=0.136]   
Validating: 0it [00:00, ?it/s][A
Epoch 17:  94%|█████████▍| 912/967 [04:38<00:16,  3.27it/s, loss=0.14, v_num=2, val_loss=0.123, val_acc=0.869, train_loss=0.136]
Validating:   4%|▎         | 2/57 [00:02<00:54,  1.00it/s][A
Epoch 17:  95%|█████████▍| 914/967 [04:39<00:16,  3.27it/s, loss=0.14, v_num=2, val_loss=0.123, val_acc=0.869, train_loss=0.136]
Validating:   7%|▋         | 4/57 [00:02<00:27,  1.91it/s][A
Epoch 17:  95%|█████████▍| 916/967 [04:39<00:15,  3.27it/s, loss=0.14, v_num=2, val_loss=0.123, val_acc=0.869, train_loss=0.136]
Validating:  11%|█         | 6/57 [00:03<00:19,  2.63it/s][A
Epoch 17:  95%|█████████▍| 918/967 [04:40<00:14,  3.28it/s, loss=0.14, v_num=2, val_loss=0.123, val_acc=0.869, train_loss=0.136]
Va

Epoch 17, step 4103: val_loss was not in top 1


Epoch 17: 100%|██████████| 967/967 [04:54<00:00,  3.28it/s, loss=0.14, v_num=2, val_loss=0.115, val_acc=0.873, train_loss=0.082]
Epoch 18:  94%|█████████▍| 910/967 [04:36<00:17,  3.29it/s, loss=0.146, v_num=2, val_loss=0.115, val_acc=0.873, train_loss=0.11]   
Validating: 0it [00:00, ?it/s][A
Epoch 18:  94%|█████████▍| 912/967 [04:38<00:16,  3.27it/s, loss=0.146, v_num=2, val_loss=0.115, val_acc=0.873, train_loss=0.11]
Validating:   4%|▎         | 2/57 [00:02<00:52,  1.04it/s][A
Epoch 18:  95%|█████████▍| 914/967 [04:39<00:16,  3.27it/s, loss=0.146, v_num=2, val_loss=0.115, val_acc=0.873, train_loss=0.11]
Validating:   7%|▋         | 4/57 [00:02<00:27,  1.95it/s][A
Epoch 18:  95%|█████████▍| 916/967 [04:39<00:15,  3.27it/s, loss=0.146, v_num=2, val_loss=0.115, val_acc=0.873, train_loss=0.11]
Validating:  11%|█         | 6/57 [00:03<00:19,  2.63it/s][A
Epoch 18:  95%|█████████▍| 918/967 [04:40<00:14,  3.27it/s, loss=0.146, v_num=2, val_loss=0.115, val_acc=0.873, train_loss=0.11]
Val

Epoch 18, step 4331: val_loss was not in top 1


Epoch 18: 100%|██████████| 967/967 [04:54<00:00,  3.28it/s, loss=0.146, v_num=2, val_loss=0.115, val_acc=0.874, train_loss=0.298]
Epoch 19:  94%|█████████▍| 910/967 [04:36<00:17,  3.29it/s, loss=0.142, v_num=2, val_loss=0.115, val_acc=0.874, train_loss=0.157]  
Validating: 0it [00:00, ?it/s][A
Epoch 19:  94%|█████████▍| 912/967 [04:38<00:16,  3.28it/s, loss=0.142, v_num=2, val_loss=0.115, val_acc=0.874, train_loss=0.157]
Validating:   4%|▎         | 2/57 [00:02<00:48,  1.13it/s][A
Epoch 19:  95%|█████████▍| 914/967 [04:39<00:16,  3.28it/s, loss=0.142, v_num=2, val_loss=0.115, val_acc=0.874, train_loss=0.157]
Validating:   7%|▋         | 4/57 [00:02<00:25,  2.06it/s][A
Epoch 19:  95%|█████████▍| 916/967 [04:39<00:15,  3.28it/s, loss=0.142, v_num=2, val_loss=0.115, val_acc=0.874, train_loss=0.157]
Validating:  11%|█         | 6/57 [00:03<00:19,  2.67it/s][A
Epoch 19:  95%|█████████▍| 918/967 [04:40<00:14,  3.28it/s, loss=0.142, v_num=2, val_loss=0.115, val_acc=0.874, train_loss=0.157

Epoch 19, step 4559: val_loss was not in top 1


Epoch 19: 100%|██████████| 967/967 [04:54<00:00,  3.28it/s, loss=0.142, v_num=2, val_loss=0.113, val_acc=0.875, train_loss=0.225]
Epoch 20:  94%|█████████▍| 910/967 [04:36<00:17,  3.29it/s, loss=0.136, v_num=2, val_loss=0.113, val_acc=0.875, train_loss=0.216]  
Validating: 0it [00:00, ?it/s][A
Epoch 20:  94%|█████████▍| 912/967 [04:38<00:16,  3.28it/s, loss=0.136, v_num=2, val_loss=0.113, val_acc=0.875, train_loss=0.216]
Validating:   4%|▎         | 2/57 [00:02<00:49,  1.10it/s][A
Epoch 20:  95%|█████████▍| 914/967 [04:38<00:16,  3.28it/s, loss=0.136, v_num=2, val_loss=0.113, val_acc=0.875, train_loss=0.216]
Validating:   7%|▋         | 4/57 [00:02<00:25,  2.05it/s][A
Epoch 20:  95%|█████████▍| 916/967 [04:39<00:15,  3.28it/s, loss=0.136, v_num=2, val_loss=0.113, val_acc=0.875, train_loss=0.216]
Validating:  11%|█         | 6/57 [00:03<00:19,  2.68it/s][A
Epoch 20:  95%|█████████▍| 918/967 [04:39<00:14,  3.28it/s, loss=0.136, v_num=2, val_loss=0.113, val_acc=0.875, train_loss=0.216

Epoch 20, step 4787: val_loss was not in top 1


Epoch 20: 100%|██████████| 967/967 [04:53<00:00,  3.29it/s, loss=0.136, v_num=2, val_loss=0.113, val_acc=0.879, train_loss=0.14] 
Epoch 21:  94%|█████████▍| 910/967 [04:36<00:17,  3.29it/s, loss=0.127, v_num=2, val_loss=0.113, val_acc=0.879, train_loss=0.125]  
Validating: 0it [00:00, ?it/s][A
Epoch 21:  94%|█████████▍| 912/967 [04:38<00:16,  3.28it/s, loss=0.127, v_num=2, val_loss=0.113, val_acc=0.879, train_loss=0.125]
Validating:   4%|▎         | 2/57 [00:02<00:53,  1.03it/s][A
Epoch 21:  95%|█████████▍| 914/967 [04:39<00:16,  3.28it/s, loss=0.127, v_num=2, val_loss=0.113, val_acc=0.879, train_loss=0.125]
Validating:   7%|▋         | 4/57 [00:02<00:28,  1.86it/s][A
Epoch 21:  95%|█████████▍| 916/967 [04:39<00:15,  3.28it/s, loss=0.127, v_num=2, val_loss=0.113, val_acc=0.879, train_loss=0.125]
Validating:  11%|█         | 6/57 [00:03<00:19,  2.59it/s][A
Epoch 21:  95%|█████████▍| 918/967 [04:40<00:14,  3.28it/s, loss=0.127, v_num=2, val_loss=0.113, val_acc=0.879, train_loss=0.125

Epoch 21, step 5015: val_loss was not in top 1


Epoch 21: 100%|██████████| 967/967 [04:54<00:00,  3.29it/s, loss=0.127, v_num=2, val_loss=0.109, val_acc=0.882, train_loss=0.146]
Epoch 21: 100%|██████████| 967/967 [04:54<00:00,  3.28it/s, loss=0.127, v_num=2, val_loss=0.109, val_acc=0.882, train_loss=0.146]
