In [1]:
import errno
import glob
import json
import os
import re
import shutil
from types import SimpleNamespace
import cv2
import torch
import warnings
from lightning_objects import LightningModel
warnings.filterwarnings('ignore')
from config import Configuration
import pandas as pd
from utils import stratify_split, make_holdout_df, set_seeds
from train_manager import TrainManager

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
def main(experiment_name: str, debug, resume=False,
         finetune=False, freeze_bn=True, freeze_feature_extractor=False):
    """Prepare the experiment directory and data splits, then run ``TrainManager``.

    The mode is selected by ``resume`` and ``finetune``:

    * neither set -- new experiment: the directory is created (an existing one
      raises ``FileExistsError``), ``train.csv`` is split into a holdout set
      and folds, and ``folds.csv``, ``holdout.csv`` and
      ``experiment_config.json`` are written into the experiment directory.
    * ``finetune`` without ``resume`` -- the saved config and splits are
      loaded, the whole experiment directory is copied to
      ``<experiment_dir>_tune`` and training runs there; the ``*fold*.ckpt``
      files of the original run are passed on as ``finetune_model_fnames``.
    * ``resume`` (with or without ``finetune``) -- the saved config and splits
      are loaded and ``get_checkpoint_params`` picks the checkpoint to continue
      from. Note that with both flags set, ``finetune=True`` is still forwarded
      to ``TrainManager`` while ``finetune_model_fnames`` stays ``None``.

    Args:
        experiment_name: Sub-directory name under ``trained-models`` (resolved
            against the current working directory).
        debug: Stored on the configuration as ``config.debug``.
        resume: Continue an existing experiment from its checkpoints.
        finetune: Fine-tune the checkpoints of an existing experiment.
        freeze_bn: Forwarded unchanged to ``TrainManager``.
        freeze_feature_extractor: Forwarded unchanged to ``TrainManager``.

    Raises:
        FileExistsError: If a directory that has to be created already exists.
        AssertionError: If fine-tuning is requested but no ``*fold*.ckpt`` file
            is found, or if the fold/holdout frames could not be prepared.
    """
    experiment_dir = os.path.abspath(f'trained-models/{experiment_name}')
    print('Experiment directory', experiment_dir)

    try:
        # -------- SETUP --------
        checkpoint_params = None
        finetune_model_fnames = None
        folds_df, holdout_df = None, None

        if not resume and not finetune:  # totally new experiment
            make_experiment_directory(experiment_dir)
            config = Configuration()
            config.debug = debug
            set_seeds(config.seed)

            # -------- LOAD DATA FROM TRAIN FILE --------
            data_df = pd.read_csv(config.data_dir + '/train.csv', engine='python')
            data_df, holdout_df = make_holdout_df(data_df, seed=config.seed)
            folds_df = stratify_split(data_df, config.fold_num, config.seed, config.target_col)

            # -------- SAVE FILES (for experiment state) --------
            folds_df.to_csv(experiment_dir + '/folds.csv', index=False)
            # save holdout to a csv file for final inference (so we don't run inference on training examples)
            holdout_df.to_csv(experiment_dir + '/holdout.csv', index=False)
            with open(experiment_dir + '/experiment_config.json', 'w') as f:
                json.dump(config.__dict__, f)
        else:  # resume and/or finetune
            # LOAD DATA FROM SAVED FILES
            # NOTE(review): the object_hook turns every JSON object (including
            # nested ones) into a SimpleNamespace -- confirm no config value is
            # expected to stay a dict.
            with open(experiment_dir + '/experiment_config.json', 'r') as f:
                config = json.load(f, object_hook=lambda d: SimpleNamespace(**d))
                set_seeds(config.seed)
                config.debug = debug

            folds_df = pd.read_csv(experiment_dir + '/folds.csv', engine='python')
            holdout_df = pd.read_csv(experiment_dir + '/holdout.csv', engine='python')

            if finetune and not resume:
                print('finetuning...')
                # verify there are checkpoints to fine tune
                finetune_model_fnames = glob.glob(experiment_dir + '/*fold*.ckpt')
                assert len(finetune_model_fnames) > 0, \
                    f'no *fold*.ckpt checkpoints found in {experiment_dir}'
                finetune_model_fnames.sort()

                # make new directory for tuning experiment with files from training run 1
                tune_dir = experiment_dir + '_tune'
                make_experiment_directory(tune_dir)
                for entry in os.listdir(experiment_dir):
                    print(f"copying {entry} to {tune_dir}")
                    source_path = os.path.join(experiment_dir, entry)
                    if os.path.isdir(source_path):
                        # shutil.copy2 only handles files; sub-directories
                        # (e.g. logger output) need a recursive copy.
                        shutil.copytree(source_path, os.path.join(tune_dir, entry))
                    else:
                        shutil.copy2(source_path, tune_dir)
                experiment_dir = tune_dir
                experiment_name += '_tune'
            else:
                print('resuming from last checkpoint...')
                checkpoint_params = get_checkpoint_params(experiment_dir, resume, config.model_arch)

        assert holdout_df is not None, 'holdout_df is None'
        assert folds_df is not None, 'folds_df is None'

        # cv2 multithreading seems to go into deadlock with PyTorch data loaders
        if config.num_workers > 0:
            cv2.setNumThreads(0)

        trainer = TrainManager(experiment_name=experiment_name, experiment_dir=experiment_dir,
                               folds_df=folds_df, holdout_df=holdout_df,
                               checkpoint_params=checkpoint_params, config=config,
                               finetune=finetune, freeze_bn=freeze_bn,
                               freeze_feature_extractor=freeze_feature_extractor,
                               finetune_model_fnames=finetune_model_fnames)
        trainer.run()
    finally:
        # Always release cached GPU memory, even when setup or training fails.
        torch.cuda.empty_cache()

def make_experiment_directory(name):
    """Create the directory ``name`` (and any missing parents) for an experiment.

    Args:
        name: Path of the directory to create.

    Raises:
        FileExistsError: If ``name`` already exists. A hint is printed before
            the error is re-raised so an existing experiment is never reused
            by accident.
    """
    try:
        os.makedirs(name)
    except FileExistsError:
        print('Experiment already exists. Be sure to resume training appropriately or start a new experiment.')
        # Always propagate: the previous errno guard had an implicit branch
        # that would have swallowed the error after printing the warning.
        raise


def get_checkpoint_params(basename, resume, model_arch):
    """Locate the checkpoint of the most recently trained fold.

    Checkpoints are the ``*fold*.ckpt`` files directly inside ``basename``;
    the fold number is the integer following ``fold`` in the file name.

    Args:
        basename: Experiment directory that holds the checkpoint files.
        resume: When falsy, nothing is looked up and ``None`` is returned.
        model_arch: Unused; kept so existing callers keep working.

    Returns:
        ``None`` when ``resume`` is falsy. Otherwise a dict with

        * ``'restart_from'``: the highest fold number found (``0`` if no
          checkpoint exists), and
        * ``'checkpoint_file_path'``: the path of a checkpoint belonging to
          that fold (``None`` if no checkpoint exists). If several files share
          the highest fold number, the lexicographically last path is used.
    """
    if not resume:
        return None

    # Escape the directory so characters such as '[' in an experiment name are
    # not interpreted as glob wildcards.
    pattern = glob.escape(basename) + '/*fold*.ckpt'

    fold_checkpoints = []
    for path in glob.glob(pattern):
        # Parse the file name only: a directory called e.g. 'fold3_runs' must
        # not be mistaken for the fold tag.
        match = re.search(r'fold(\d+)', os.path.basename(path))
        if match:
            # Compare folds numerically; as strings '9' would sort above '10'.
            fold_checkpoints.append((int(match.group(1)), path))

    if not fold_checkpoints:
        # NOTE(review): confirm the consumer of these params accepts a None
        # checkpoint path when there is nothing to resume from.
        return {'restart_from': 0, 'checkpoint_file_path': None}

    # Select fold and path together so the returned file really belongs to the
    # returned fold (file names embed metrics before the fold tag, so a plain
    # filename sort does not order by fold).
    most_recent_fold, checkpoint_file_path = max(fold_checkpoints)
    return {'restart_from': most_recent_fold,
            'checkpoint_file_path': checkpoint_file_path}

In [None]:
if __name__ == '__main__':
    # Passed to main() as its ``debug`` argument.
    debug = False
    try:
        print('Running in debug mode:', debug)
        main(
            experiment_name='seresnet50_sgd_coswarm_bnf_bitemp_smooth_weighted_t1=0.3_t2=1.0',
            debug=debug,
            resume=False,
            finetune=False,
            freeze_bn=True,
            freeze_feature_extractor=False,
        )
    except KeyboardInterrupt:
        # A manual interrupt is swallowed so the cell ends without a traceback.
        pass

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Using native 16bit precision.


Running in debug mode: False
Experiment directory /opt/favordata/AI/Felix/kaggle-cassava/trained-models/seresnet50_sgd_coswarm_bnf_bitemp_smooth_weighted_t1=0.3_t2=1.0
folds_df len 18187, holdout_df len 3210
Training fold 1
Class sample counts [ 758 1471 1622 8933 1765]
After class sample counts [2274 2942 3730 8933 4765]
conv1.weight True
bn1.weight False
bn1.bias False
layer1.0.conv1.weight True
layer1.0.bn1.weight False
layer1.0.bn1.bias False
layer1.0.conv2.weight True
layer1.0.bn2.weight False
layer1.0.bn2.bias False
layer1.0.conv3.weight True
layer1.0.bn3.weight False
layer1.0.bn3.bias False
layer1.0.se.fc1.weight True
layer1.0.se.fc1.bias True
layer1.0.se.fc2.weight True
layer1.0.se.fc2.bias True
layer1.0.downsample.0.weight True
layer1.0.downsample.1.weight False
layer1.0.downsample.1.bias False
layer1.1.conv1.weight True
layer1.1.bn1.weight False
layer1.1.bn1.bias False
layer1.1.conv2.weight True
layer1.1.bn2.weight False
layer1.1.bn2.bias False
layer1.1.conv3.weight True
laye


  | Name           | Type           | Params
--------------------------------------------------
0 | valid_accuracy | Accuracy       | 0     
1 | test_accuracy  | Accuracy       | 0     
2 | criterion      | BiTemperedLoss | 0     
3 | model          | ResNet         | 26.0 M
--------------------------------------------------
26.0 M    Trainable params
53.1 K    Non-trainable params
26.0 M    Total params
Finding best initial lr: 100%|██████████| 100/100 [01:16<00:00,  1.29it/s]Restored states from the checkpoint file at /opt/favordata/AI/Felix/kaggle-cassava/trained-models/seresnet50_sgd_coswarm_bnf_bitemp_smooth_weighted_t1=0.3_t2=1.0/lr_find_temp_model.ckpt
Learning rate set to 0.2290867652767775

  | Name           | Type           | Params
--------------------------------------------------
0 | valid_accuracy | Accuracy       | 0     
1 | test_accuracy  | Accuracy       | 0     
2 | criterion      | BiTemperedLoss | 0     
3 | model          | ResNet         | 26.0 M
--------------

Epoch 1:  94%|█████████▍| 910/967 [03:00<00:20,  2.83it/s, loss=0.2, v_num=0, val_loss=0.49, val_acc=0.195, train_loss=0.186]      
Validating: 0it [00:00, ?it/s][A
Epoch 1:  94%|█████████▍| 912/967 [03:02<00:19,  2.82it/s, loss=0.2, v_num=0, val_loss=0.49, val_acc=0.195, train_loss=0.186]
Validating:   4%|▎         | 2/57 [00:01<00:48,  1.14it/s][A
Epoch 1:  95%|█████████▍| 914/967 [03:02<00:18,  2.81it/s, loss=0.2, v_num=0, val_loss=0.49, val_acc=0.195, train_loss=0.186]
Validating:   7%|▋         | 4/57 [00:02<00:23,  2.27it/s][A
Epoch 1:  95%|█████████▍| 916/967 [03:03<00:18,  2.82it/s, loss=0.2, v_num=0, val_loss=0.49, val_acc=0.195, train_loss=0.186]
Validating:  11%|█         | 6/57 [00:02<00:15,  3.21it/s][A
Epoch 1:  95%|█████████▍| 918/967 [03:03<00:17,  2.82it/s, loss=0.2, v_num=0, val_loss=0.49, val_acc=0.195, train_loss=0.186]
Validating:  14%|█▍        | 8/57 [00:03<00:12,  3.80it/s][A
Epoch 1:  95%|█████████▌| 920/967 [03:04<00:16,  2.83it/s, loss=0.2, v_num=0, val_

Epoch 1, global step 228: val_loss reached 0.14008 (best 0.14008), saving model to "/opt/favordata/AI/Felix/kaggle-cassava/trained-models/seresnet50_sgd_coswarm_bnf_bitemp_smooth_weighted_t1=0.3_t2=1.0/seresnet50_bitempered_smooth=0.05_val_loss=0.140_val_acc=0.840_fold1.ckpt" as top 1


Epoch 1: 100%|██████████| 967/967 [03:18<00:00,  2.87it/s, loss=0.2, v_num=0, val_loss=0.14, val_acc=0.84, train_loss=0.391] 
                                                           [A

Finding best initial lr: 100%|██████████| 100/100 [04:38<00:00,  2.78s/it]

Epoch 2:   0%|          | 0/967 [00:00<00:00, -750797.35it/s, loss=0.2, v_num=0, val_loss=0.14, val_acc=0.84, train_loss=0.391]  




Epoch 2:  94%|█████████▍| 910/967 [02:58<00:19,  2.86it/s, loss=0.151, v_num=0, val_loss=0.14, val_acc=0.84, train_loss=0.174]     
Validating: 0it [00:00, ?it/s][A
Epoch 2:  94%|█████████▍| 912/967 [02:59<00:19,  2.85it/s, loss=0.151, v_num=0, val_loss=0.14, val_acc=0.84, train_loss=0.174]
Validating:   4%|▎         | 2/57 [00:01<00:48,  1.14it/s][A
Epoch 2:  95%|█████████▍| 914/967 [03:00<00:18,  2.85it/s, loss=0.151, v_num=0, val_loss=0.14, val_acc=0.84, train_loss=0.174]
Validating:   7%|▋         | 4/57 [00:02<00:23,  2.23it/s][A
Epoch 2:  95%|█████████▍| 916/967 [03:01<00:17,  2.85it/s, loss=0.151, v_num=0, val_loss=0.14, val_acc=0.84, train_loss=0.174]
Validating:  11%|█         | 6/57 [00:02<00:16,  3.12it/s][A
Epoch 2:  95%|█████████▍| 918/967 [03:01<00:17,  2.86it/s, loss=0.151, v_num=0, val_loss=0.14, val_acc=0.84, train_loss=0.174]
Validating:  14%|█▍        | 8/57 [00:03<00:12,  3.77it/s][A
Epoch 2:  95%|█████████▌| 920/967 [03:01<00:16,  2.86it/s, loss=0.151, v_num=0

Epoch 2, step 456: val_loss was not in top 1


Epoch 2: 100%|██████████| 967/967 [03:14<00:00,  2.92it/s, loss=0.151, v_num=0, val_loss=0.152, val_acc=0.83, train_loss=0.267]
Epoch 3:  94%|█████████▍| 910/967 [02:58<00:19,  2.86it/s, loss=0.149, v_num=0, val_loss=0.152, val_acc=0.83, train_loss=0.174]     
Validating: 0it [00:00, ?it/s][A
Epoch 3:  94%|█████████▍| 912/967 [03:00<00:19,  2.85it/s, loss=0.149, v_num=0, val_loss=0.152, val_acc=0.83, train_loss=0.174]
Validating:   4%|▎         | 2/57 [00:01<00:45,  1.20it/s][A
Epoch 3:  95%|█████████▍| 914/967 [03:00<00:18,  2.85it/s, loss=0.149, v_num=0, val_loss=0.152, val_acc=0.83, train_loss=0.174]
Validating:   7%|▋         | 4/57 [00:02<00:22,  2.40it/s][A
Epoch 3:  95%|█████████▍| 916/967 [03:01<00:17,  2.86it/s, loss=0.149, v_num=0, val_loss=0.152, val_acc=0.83, train_loss=0.174]
Validating:  11%|█         | 6/57 [00:02<00:16,  3.13it/s][A
Epoch 3:  95%|█████████▍| 918/967 [03:01<00:17,  2.86it/s, loss=0.149, v_num=0, val_loss=0.152, val_acc=0.83, train_loss=0.174]
Validat

Epoch 3, global step 684: val_loss reached 0.10524 (best 0.10524), saving model to "/opt/favordata/AI/Felix/kaggle-cassava/trained-models/seresnet50_sgd_coswarm_bnf_bitemp_smooth_weighted_t1=0.3_t2=1.0/seresnet50_bitempered_smooth=0.05_val_loss=0.105_val_acc=0.887_fold1.ckpt" as top 1


Epoch 3: 100%|██████████| 967/967 [03:14<00:00,  2.92it/s, loss=0.149, v_num=0, val_loss=0.105, val_acc=0.887, train_loss=0.194]
Epoch 4:  94%|█████████▍| 910/967 [02:58<00:19,  2.86it/s, loss=0.142, v_num=0, val_loss=0.105, val_acc=0.887, train_loss=0.131]     
Validating: 0it [00:00, ?it/s][A
Epoch 4:  94%|█████████▍| 912/967 [03:00<00:19,  2.84it/s, loss=0.142, v_num=0, val_loss=0.105, val_acc=0.887, train_loss=0.131]
Validating:   4%|▎         | 2/57 [00:02<00:53,  1.03it/s][A
Epoch 4:  95%|█████████▍| 914/967 [03:01<00:18,  2.84it/s, loss=0.142, v_num=0, val_loss=0.105, val_acc=0.887, train_loss=0.131]
Validating:   7%|▋         | 4/57 [00:02<00:24,  2.16it/s][A
Epoch 4:  95%|█████████▍| 916/967 [03:01<00:17,  2.85it/s, loss=0.142, v_num=0, val_loss=0.105, val_acc=0.887, train_loss=0.131]
Validating:  11%|█         | 6/57 [00:03<00:18,  2.74it/s][A
Epoch 4:  95%|█████████▍| 918/967 [03:02<00:17,  2.85it/s, loss=0.142, v_num=0, val_loss=0.105, val_acc=0.887, train_loss=0.131]
V

Epoch 4, step 912: val_loss was not in top 1


Epoch 4: 100%|██████████| 967/967 [03:14<00:00,  2.92it/s, loss=0.142, v_num=0, val_loss=0.127, val_acc=0.861, train_loss=0.295]
Epoch 5:  94%|█████████▍| 910/967 [02:58<00:19,  2.86it/s, loss=0.158, v_num=0, val_loss=0.127, val_acc=0.861, train_loss=0.144]     
Validating: 0it [00:00, ?it/s][A
Epoch 5:  94%|█████████▍| 912/967 [03:00<00:19,  2.85it/s, loss=0.158, v_num=0, val_loss=0.127, val_acc=0.861, train_loss=0.144]
Validating:   4%|▎         | 2/57 [00:01<00:47,  1.16it/s][A
Epoch 5:  95%|█████████▍| 914/967 [03:00<00:18,  2.85it/s, loss=0.158, v_num=0, val_loss=0.127, val_acc=0.861, train_loss=0.144]
Validating:   7%|▋         | 4/57 [00:02<00:22,  2.35it/s][A
Epoch 5:  95%|█████████▍| 916/967 [03:01<00:17,  2.86it/s, loss=0.158, v_num=0, val_loss=0.127, val_acc=0.861, train_loss=0.144]
Validating:  11%|█         | 6/57 [00:02<00:16,  3.01it/s][A
Epoch 5:  95%|█████████▍| 918/967 [03:01<00:17,  2.86it/s, loss=0.158, v_num=0, val_loss=0.127, val_acc=0.861, train_loss=0.144]
V

Epoch 5, step 1140: val_loss was not in top 1


Epoch 5: 100%|██████████| 967/967 [03:14<00:00,  2.93it/s, loss=0.158, v_num=0, val_loss=0.114, val_acc=0.871, train_loss=0.427]
Epoch 6:  94%|█████████▍| 910/967 [02:58<00:19,  2.86it/s, loss=0.15, v_num=0, val_loss=0.114, val_acc=0.871, train_loss=0.301]      
Validating: 0it [00:00, ?it/s][A
Epoch 6:  94%|█████████▍| 912/967 [02:59<00:19,  2.85it/s, loss=0.15, v_num=0, val_loss=0.114, val_acc=0.871, train_loss=0.301]
Validating:   4%|▎         | 2/57 [00:01<00:41,  1.34it/s][A
Epoch 6:  95%|█████████▍| 914/967 [03:00<00:18,  2.86it/s, loss=0.15, v_num=0, val_loss=0.114, val_acc=0.871, train_loss=0.301]
Validating:   7%|▋         | 4/57 [00:02<00:20,  2.56it/s][A
Epoch 6:  95%|█████████▍| 916/967 [03:00<00:17,  2.86it/s, loss=0.15, v_num=0, val_loss=0.114, val_acc=0.871, train_loss=0.301]
Validating:  11%|█         | 6/57 [00:02<00:18,  2.80it/s][A
Epoch 6:  95%|█████████▍| 918/967 [03:01<00:17,  2.86it/s, loss=0.15, v_num=0, val_loss=0.114, val_acc=0.871, train_loss=0.301]
Valid

Epoch 6, step 1368: val_loss was not in top 1


Epoch 6: 100%|██████████| 967/967 [03:13<00:00,  2.93it/s, loss=0.15, v_num=0, val_loss=0.148, val_acc=0.832, train_loss=0.0325]
Epoch 7:  94%|█████████▍| 910/967 [02:58<00:19,  2.86it/s, loss=0.145, v_num=0, val_loss=0.148, val_acc=0.832, train_loss=0.363]     
Validating: 0it [00:00, ?it/s][A
Epoch 7:  94%|█████████▍| 912/967 [03:00<00:19,  2.84it/s, loss=0.145, v_num=0, val_loss=0.148, val_acc=0.832, train_loss=0.363]
Validating:   4%|▎         | 2/57 [00:01<00:46,  1.18it/s][A
Epoch 7:  95%|█████████▍| 914/967 [03:00<00:18,  2.85it/s, loss=0.145, v_num=0, val_loss=0.148, val_acc=0.832, train_loss=0.363]
Validating:   7%|▋         | 4/57 [00:02<00:22,  2.37it/s][A
Epoch 7:  95%|█████████▍| 916/967 [03:01<00:17,  2.85it/s, loss=0.145, v_num=0, val_loss=0.148, val_acc=0.832, train_loss=0.363]
Validating:  11%|█         | 6/57 [00:02<00:17,  2.86it/s][A
Epoch 7:  95%|█████████▍| 918/967 [03:01<00:17,  2.85it/s, loss=0.145, v_num=0, val_loss=0.148, val_acc=0.832, train_loss=0.363]
V

Epoch 7, step 1596: val_loss was not in top 1


Epoch 7: 100%|██████████| 967/967 [03:14<00:00,  2.92it/s, loss=0.145, v_num=0, val_loss=0.112, val_acc=0.878, train_loss=0.06] 
Epoch 8:  94%|█████████▍| 910/967 [02:58<00:19,  2.86it/s, loss=0.137, v_num=0, val_loss=0.112, val_acc=0.878, train_loss=0.145]      
Validating: 0it [00:00, ?it/s][A
Epoch 8:  94%|█████████▍| 912/967 [03:00<00:19,  2.84it/s, loss=0.137, v_num=0, val_loss=0.112, val_acc=0.878, train_loss=0.145]
Validating:   4%|▎         | 2/57 [00:02<00:50,  1.09it/s][A
Epoch 8:  95%|█████████▍| 914/967 [03:00<00:18,  2.85it/s, loss=0.137, v_num=0, val_loss=0.112, val_acc=0.878, train_loss=0.145]
Validating:   7%|▋         | 4/57 [00:02<00:23,  2.22it/s][A
Epoch 8:  95%|█████████▍| 916/967 [03:01<00:17,  2.85it/s, loss=0.137, v_num=0, val_loss=0.112, val_acc=0.878, train_loss=0.145]
Validating:  11%|█         | 6/57 [00:03<00:18,  2.75it/s][A
Epoch 8:  95%|█████████▍| 918/967 [03:02<00:17,  2.85it/s, loss=0.137, v_num=0, val_loss=0.112, val_acc=0.878, train_loss=0.145]


Epoch 8, step 1824: val_loss was not in top 1


Epoch 8: 100%|██████████| 967/967 [03:14<00:00,  2.92it/s, loss=0.137, v_num=0, val_loss=0.112, val_acc=0.88, train_loss=0.222] 
Epoch 9:  94%|█████████▍| 910/967 [02:58<00:19,  2.86it/s, loss=0.111, v_num=0, val_loss=0.112, val_acc=0.88, train_loss=0.0661]    
Validating: 0it [00:00, ?it/s][A
Epoch 9:  94%|█████████▍| 912/967 [03:00<00:19,  2.85it/s, loss=0.111, v_num=0, val_loss=0.112, val_acc=0.88, train_loss=0.0661]
Validating:   4%|▎         | 2/57 [00:01<00:41,  1.33it/s][A
Epoch 9:  95%|█████████▍| 914/967 [03:00<00:18,  2.85it/s, loss=0.111, v_num=0, val_loss=0.112, val_acc=0.88, train_loss=0.0661]
Validating:   7%|▋         | 4/57 [00:02<00:20,  2.57it/s][A
Epoch 9:  95%|█████████▍| 916/967 [03:00<00:17,  2.86it/s, loss=0.111, v_num=0, val_loss=0.112, val_acc=0.88, train_loss=0.0661]
Validating:  11%|█         | 6/57 [00:02<00:14,  3.47it/s][A
Epoch 9:  95%|█████████▍| 918/967 [03:01<00:17,  2.86it/s, loss=0.111, v_num=0, val_loss=0.112, val_acc=0.88, train_loss=0.0661]
Va

Epoch 9, step 2052: val_loss was not in top 1


Epoch 9: 100%|██████████| 967/967 [03:14<00:00,  2.93it/s, loss=0.111, v_num=0, val_loss=0.116, val_acc=0.877, train_loss=0.203]
Epoch 10:  94%|█████████▍| 910/967 [02:58<00:19,  2.87it/s, loss=0.117, v_num=0, val_loss=0.116, val_acc=0.877, train_loss=0.134]     
Validating: 0it [00:00, ?it/s][A
Epoch 10:  94%|█████████▍| 912/967 [03:00<00:19,  2.85it/s, loss=0.117, v_num=0, val_loss=0.116, val_acc=0.877, train_loss=0.134]
Validating:   4%|▎         | 2/57 [00:02<00:49,  1.11it/s][A
Epoch 10:  95%|█████████▍| 914/967 [03:00<00:18,  2.85it/s, loss=0.117, v_num=0, val_loss=0.116, val_acc=0.877, train_loss=0.134]
Validating:   7%|▋         | 4/57 [00:02<00:23,  2.29it/s][A
Epoch 10:  95%|█████████▍| 916/967 [03:01<00:17,  2.85it/s, loss=0.117, v_num=0, val_loss=0.116, val_acc=0.877, train_loss=0.134]
Validating:  11%|█         | 6/57 [00:03<00:19,  2.60it/s][A
Epoch 10:  95%|█████████▍| 918/967 [03:01<00:17,  2.86it/s, loss=0.117, v_num=0, val_loss=0.116, val_acc=0.877, train_loss=0.1

Epoch 10, step 2280: val_loss was not in top 1


Epoch 10: 100%|██████████| 967/967 [03:14<00:00,  2.92it/s, loss=0.117, v_num=0, val_loss=nan, val_acc=0.883, train_loss=0.061]  
Epoch 11:  94%|█████████▍| 910/967 [02:58<00:19,  2.86it/s, loss=0.106, v_num=0, val_loss=nan, val_acc=0.883, train_loss=0.0952]     
Validating: 0it [00:00, ?it/s][A
Epoch 11:  94%|█████████▍| 912/967 [03:00<00:19,  2.85it/s, loss=0.106, v_num=0, val_loss=nan, val_acc=0.883, train_loss=0.0952]
Validating:   4%|▎         | 2/57 [00:01<00:45,  1.21it/s][A
Epoch 11:  95%|█████████▍| 914/967 [03:00<00:18,  2.85it/s, loss=0.106, v_num=0, val_loss=nan, val_acc=0.883, train_loss=0.0952]
Validating:   7%|▋         | 4/57 [00:02<00:22,  2.38it/s][A
Epoch 11:  95%|█████████▍| 916/967 [03:01<00:17,  2.86it/s, loss=0.106, v_num=0, val_loss=nan, val_acc=0.883, train_loss=0.0952]
Validating:  11%|█         | 6/57 [00:02<00:17,  2.96it/s][A
Epoch 11:  95%|█████████▍| 918/967 [03:01<00:17,  2.86it/s, loss=0.106, v_num=0, val_loss=nan, val_acc=0.883, train_loss=0.0952]


Epoch 11, global step 2508: val_loss reached 0.10084 (best 0.10084), saving model to "/opt/favordata/AI/Felix/kaggle-cassava/trained-models/seresnet50_sgd_coswarm_bnf_bitemp_smooth_weighted_t1=0.3_t2=1.0/seresnet50_bitempered_smooth=0.05_val_loss=0.101_val_acc=0.889_fold1.ckpt" as top 1


Epoch 11: 100%|██████████| 967/967 [03:14<00:00,  2.93it/s, loss=0.106, v_num=0, val_loss=0.101, val_acc=0.889, train_loss=0.083]
Epoch 12:  94%|█████████▍| 910/967 [02:58<00:19,  2.86it/s, loss=0.0961, v_num=0, val_loss=0.101, val_acc=0.889, train_loss=0.0808]    
Validating: 0it [00:00, ?it/s][A
Epoch 12:  94%|█████████▍| 912/967 [03:00<00:19,  2.84it/s, loss=0.0961, v_num=0, val_loss=0.101, val_acc=0.889, train_loss=0.0808]
Validating:   4%|▎         | 2/57 [00:01<00:46,  1.18it/s][A
Epoch 12:  95%|█████████▍| 914/967 [03:01<00:18,  2.84it/s, loss=0.0961, v_num=0, val_loss=0.101, val_acc=0.889, train_loss=0.0808]
Validating:   7%|▋         | 4/57 [00:02<00:22,  2.38it/s][A
Epoch 12:  95%|█████████▍| 916/967 [03:01<00:17,  2.85it/s, loss=0.0961, v_num=0, val_loss=0.101, val_acc=0.889, train_loss=0.0808]
Validating:  11%|█         | 6/57 [00:03<00:20,  2.51it/s][A
Epoch 12:  95%|█████████▍| 918/967 [03:02<00:17,  2.85it/s, loss=0.0961, v_num=0, val_loss=0.101, val_acc=0.889, train

Epoch 12, step 2736: val_loss was not in top 1


Epoch 12: 100%|██████████| 967/967 [03:14<00:00,  2.92it/s, loss=0.0961, v_num=0, val_loss=0.118, val_acc=0.868, train_loss=0.0191]
Epoch 13:  94%|█████████▍| 910/967 [02:58<00:19,  2.86it/s, loss=0.135, v_num=0, val_loss=0.118, val_acc=0.868, train_loss=0.187]      
Validating: 0it [00:00, ?it/s][A
Epoch 13:  94%|█████████▍| 912/967 [03:00<00:19,  2.84it/s, loss=0.135, v_num=0, val_loss=0.118, val_acc=0.868, train_loss=0.187]
Validating:   4%|▎         | 2/57 [00:02<00:56,  1.02s/it][A
Epoch 13:  95%|█████████▍| 914/967 [03:00<00:18,  2.85it/s, loss=0.135, v_num=0, val_loss=0.118, val_acc=0.868, train_loss=0.187]
Validating:   7%|▋         | 4/57 [00:02<00:25,  2.05it/s][A
Epoch 13:  95%|█████████▍| 916/967 [03:01<00:17,  2.85it/s, loss=0.135, v_num=0, val_loss=0.118, val_acc=0.868, train_loss=0.187]
Validating:  11%|█         | 6/57 [00:03<00:18,  2.75it/s][A
Epoch 13:  95%|█████████▍| 918/967 [03:01<00:17,  2.85it/s, loss=0.135, v_num=0, val_loss=0.118, val_acc=0.868, train_loss

Epoch 13, step 2964: val_loss was not in top 1


Epoch 13: 100%|██████████| 967/967 [03:14<00:00,  2.92it/s, loss=0.135, v_num=0, val_loss=0.107, val_acc=0.885, train_loss=0.215]
Epoch 14:  94%|█████████▍| 910/967 [02:58<00:19,  2.86it/s, loss=0.131, v_num=0, val_loss=0.107, val_acc=0.885, train_loss=0.0871]    
Validating: 0it [00:00, ?it/s][A
Epoch 14:  94%|█████████▍| 912/967 [03:00<00:19,  2.85it/s, loss=0.131, v_num=0, val_loss=0.107, val_acc=0.885, train_loss=0.0871]
Validating:   4%|▎         | 2/57 [00:01<00:49,  1.12it/s][A
Epoch 14:  95%|█████████▍| 914/967 [03:00<00:18,  2.85it/s, loss=0.131, v_num=0, val_loss=0.107, val_acc=0.885, train_loss=0.0871]
Validating:   7%|▋         | 4/57 [00:02<00:23,  2.30it/s][A
Epoch 14:  95%|█████████▍| 916/967 [03:01<00:17,  2.86it/s, loss=0.131, v_num=0, val_loss=0.107, val_acc=0.885, train_loss=0.0871]
Validating:  11%|█         | 6/57 [00:02<00:15,  3.19it/s][A
Epoch 14:  95%|█████████▍| 918/967 [03:01<00:17,  2.85it/s, loss=0.131, v_num=0, val_loss=0.107, val_acc=0.885, train_loss

Epoch 14, step 3192: val_loss was not in top 1


Epoch 14: 100%|██████████| 967/967 [03:14<00:00,  2.93it/s, loss=0.131, v_num=0, val_loss=0.114, val_acc=0.879, train_loss=0.177] 
Epoch 15:  94%|█████████▍| 910/967 [02:58<00:19,  2.86it/s, loss=0.131, v_num=0, val_loss=0.114, val_acc=0.879, train_loss=0.101]     
Validating: 0it [00:00, ?it/s][A
Epoch 15:  94%|█████████▍| 912/967 [03:00<00:19,  2.85it/s, loss=0.131, v_num=0, val_loss=0.114, val_acc=0.879, train_loss=0.101]
Validating:   4%|▎         | 2/57 [00:01<00:46,  1.19it/s][A
Epoch 15:  95%|█████████▍| 914/967 [03:00<00:18,  2.85it/s, loss=0.131, v_num=0, val_loss=0.114, val_acc=0.879, train_loss=0.101]
Validating:   7%|▋         | 4/57 [00:02<00:22,  2.39it/s][A
Epoch 15:  95%|█████████▍| 916/967 [03:01<00:17,  2.85it/s, loss=0.131, v_num=0, val_loss=0.114, val_acc=0.879, train_loss=0.101]
Validating:  11%|█         | 6/57 [00:03<00:21,  2.42it/s][A
Epoch 15:  95%|█████████▍| 918/967 [03:01<00:17,  2.85it/s, loss=0.131, v_num=0, val_loss=0.114, val_acc=0.879, train_loss=0

Epoch 15, step 3420: val_loss was not in top 1


Epoch 15: 100%|██████████| 967/967 [03:14<00:00,  2.92it/s, loss=0.131, v_num=0, val_loss=0.134, val_acc=0.851, train_loss=0.208]
Epoch 16:  94%|█████████▍| 910/967 [02:58<00:19,  2.86it/s, loss=0.11, v_num=0, val_loss=0.134, val_acc=0.851, train_loss=0.0959]     
Validating: 0it [00:00, ?it/s][A
Epoch 16:  94%|█████████▍| 912/967 [03:00<00:19,  2.84it/s, loss=0.11, v_num=0, val_loss=0.134, val_acc=0.851, train_loss=0.0959]
Validating:   4%|▎         | 2/57 [00:02<00:52,  1.05it/s][A
Epoch 16:  95%|█████████▍| 914/967 [03:01<00:18,  2.85it/s, loss=0.11, v_num=0, val_loss=0.134, val_acc=0.851, train_loss=0.0959]
Validating:   7%|▋         | 4/57 [00:02<00:24,  2.19it/s][A
Epoch 16:  95%|█████████▍| 916/967 [03:01<00:17,  2.85it/s, loss=0.11, v_num=0, val_loss=0.134, val_acc=0.851, train_loss=0.0959]
Validating:  11%|█         | 6/57 [00:03<00:20,  2.45it/s][A
Epoch 16:  95%|█████████▍| 918/967 [03:02<00:17,  2.85it/s, loss=0.11, v_num=0, val_loss=0.134, val_acc=0.851, train_loss=0.0

Epoch 16, step 3648: val_loss was not in top 1


Epoch 16: 100%|██████████| 967/967 [03:14<00:00,  2.92it/s, loss=0.11, v_num=0, val_loss=0.123, val_acc=0.865, train_loss=0.0883]
Epoch 17:  94%|█████████▍| 910/967 [02:58<00:19,  2.86it/s, loss=0.0973, v_num=0, val_loss=0.123, val_acc=0.865, train_loss=0.304]     
Validating: 0it [00:00, ?it/s][A
Epoch 17:  94%|█████████▍| 912/967 [03:00<00:19,  2.85it/s, loss=0.0973, v_num=0, val_loss=0.123, val_acc=0.865, train_loss=0.304]
Validating:   4%|▎         | 2/57 [00:01<00:44,  1.25it/s][A
Epoch 17:  95%|█████████▍| 914/967 [03:00<00:18,  2.85it/s, loss=0.0973, v_num=0, val_loss=0.123, val_acc=0.865, train_loss=0.304]
Validating:   7%|▋         | 4/57 [00:02<00:21,  2.46it/s][A
Epoch 17:  95%|█████████▍| 916/967 [03:00<00:17,  2.86it/s, loss=0.0973, v_num=0, val_loss=0.123, val_acc=0.865, train_loss=0.304]
Validating:  11%|█         | 6/57 [00:02<00:17,  2.92it/s][A
Epoch 17:  95%|█████████▍| 918/967 [03:01<00:17,  2.86it/s, loss=0.0973, v_num=0, val_loss=0.123, val_acc=0.865, train_lo

Epoch 17, step 3876: val_loss was not in top 1


Epoch 17: 100%|██████████| 967/967 [03:14<00:00,  2.93it/s, loss=0.0973, v_num=0, val_loss=0.114, val_acc=0.878, train_loss=0.0956]
Epoch 18:  94%|█████████▍| 910/967 [02:58<00:19,  2.86it/s, loss=0.115, v_num=0, val_loss=0.114, val_acc=0.878, train_loss=0.124]       
Validating: 0it [00:00, ?it/s][A
Epoch 18:  94%|█████████▍| 912/967 [03:00<00:19,  2.85it/s, loss=0.115, v_num=0, val_loss=0.114, val_acc=0.878, train_loss=0.124]
Validating:   4%|▎         | 2/57 [00:01<00:45,  1.22it/s][A
Epoch 18:  95%|█████████▍| 914/967 [03:00<00:18,  2.85it/s, loss=0.115, v_num=0, val_loss=0.114, val_acc=0.878, train_loss=0.124]
Validating:   7%|▋         | 4/57 [00:02<00:21,  2.42it/s][A
Epoch 18:  95%|█████████▍| 916/967 [03:01<00:17,  2.85it/s, loss=0.115, v_num=0, val_loss=0.114, val_acc=0.878, train_loss=0.124]
Validating:  11%|█         | 6/57 [00:02<00:18,  2.78it/s][A
Epoch 18:  95%|█████████▍| 918/967 [03:01<00:17,  2.86it/s, loss=0.115, v_num=0, val_loss=0.114, val_acc=0.878, train_los

Epoch 18, step 4104: val_loss was not in top 1


Epoch 18: 100%|██████████| 967/967 [03:14<00:00,  2.92it/s, loss=0.115, v_num=0, val_loss=0.132, val_acc=0.859, train_loss=0.252]
Epoch 19:  94%|█████████▍| 910/967 [02:58<00:19,  2.86it/s, loss=0.0925, v_num=0, val_loss=0.132, val_acc=0.859, train_loss=0.0397]     
Validating: 0it [00:00, ?it/s][A
Epoch 19:  94%|█████████▍| 912/967 [03:00<00:19,  2.85it/s, loss=0.0925, v_num=0, val_loss=0.132, val_acc=0.859, train_loss=0.0397]
Validating:   4%|▎         | 2/57 [00:02<00:50,  1.09it/s][A
Epoch 19:  95%|█████████▍| 914/967 [03:00<00:18,  2.85it/s, loss=0.0925, v_num=0, val_loss=0.132, val_acc=0.859, train_loss=0.0397]
Validating:   7%|▋         | 4/57 [00:02<00:23,  2.25it/s][A
Epoch 19:  95%|█████████▍| 916/967 [03:01<00:17,  2.85it/s, loss=0.0925, v_num=0, val_loss=0.132, val_acc=0.859, train_loss=0.0397]
Validating:  11%|█         | 6/57 [00:02<00:16,  3.14it/s][A
Epoch 19:  95%|█████████▍| 918/967 [03:01<00:17,  2.85it/s, loss=0.0925, v_num=0, val_loss=0.132, val_acc=0.859, trai

Epoch 19, step 4332: val_loss was not in top 1


Epoch 19: 100%|██████████| 967/967 [03:14<00:00,  2.93it/s, loss=0.0925, v_num=0, val_loss=0.112, val_acc=0.882, train_loss=0.0388]
                                                           [A

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Using native 16bit precision.


Training fold 2
Class sample counts [ 759 1471 1622 8934 1764]
After class sample counts [2277 2942 3730 8934 4762]
conv1.weight True
bn1.weight False
bn1.bias False
layer1.0.conv1.weight True
layer1.0.bn1.weight False
layer1.0.bn1.bias False
layer1.0.conv2.weight True
layer1.0.bn2.weight False
layer1.0.bn2.bias False
layer1.0.conv3.weight True
layer1.0.bn3.weight False
layer1.0.bn3.bias False
layer1.0.se.fc1.weight True
layer1.0.se.fc1.bias True
layer1.0.se.fc2.weight True
layer1.0.se.fc2.bias True
layer1.0.downsample.0.weight True
layer1.0.downsample.1.weight False
layer1.0.downsample.1.bias False
layer1.1.conv1.weight True
layer1.1.bn1.weight False
layer1.1.bn1.bias False
layer1.1.conv2.weight True
layer1.1.bn2.weight False
layer1.1.bn2.bias False
layer1.1.conv3.weight True
layer1.1.bn3.weight False
layer1.1.bn3.bias False
layer1.1.se.fc1.weight True
layer1.1.se.fc1.bias True
layer1.1.se.fc2.weight True
layer1.1.se.fc2.bias True
layer1.2.conv1.weight True
layer1.2.bn1.weight False
l


  | Name           | Type           | Params
--------------------------------------------------
0 | valid_accuracy | Accuracy       | 0     
1 | test_accuracy  | Accuracy       | 0     
2 | criterion      | BiTemperedLoss | 0     
3 | model          | ResNet         | 26.0 M
--------------------------------------------------
26.0 M    Trainable params
53.1 K    Non-trainable params
26.0 M    Total params
Finding best initial lr:   2%|▏         | 2/100 [00:00<00:52,  1.86it/s]