In [2]:
import errno
import glob
import json
import os
import re
import shutil
from types import SimpleNamespace
import cv2
import torch
import warnings
from lightning_objects import LightningModel
warnings.filterwarnings('ignore')
from config import Configuration
import pandas as pd
from utils import stratify_split, make_holdout_df, set_seeds
from train_manager import TrainManager

In [3]:
# Reload imported modules automatically so that edits to the project's .py files
# are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
def main(experiment_name: str, debug, resume=False,
         finetune=False, freeze_bn=True, freeze_feature_extractor=False):
    """Prepare (or restore) an experiment directory and run training via ``TrainManager``.

    The mode is selected by ``resume`` and ``finetune``:

    * neither flag set: a new experiment. The directory ``trained-models/<experiment_name>``
      is created, ``train.csv`` is split into a holdout frame and a folds frame, and both
      frames plus the configuration are written into the experiment directory.
    * ``finetune`` without ``resume``: the saved configuration and splits are loaded, the
      contents of the experiment directory are copied into ``<experiment_dir>_tune`` and the
      run continues there under the name ``<experiment_name>_tune``.
    * ``resume`` set (with or without ``finetune``): the saved configuration and splits are
      loaded and checkpoint parameters are obtained from ``get_checkpoint_params``.

    Args:
        experiment_name: name of the sub-directory of ``trained-models`` used by the run.
        debug: stored on the configuration object as ``config.debug``.
        resume: continue a previous run from its saved checkpoints.
        finetune: start a fine-tuning run from the checkpoints of a previous run.
        freeze_bn: forwarded unchanged to ``TrainManager``.
        freeze_feature_extractor: forwarded unchanged to ``TrainManager``.

    Raises:
        FileExistsError: when a directory that has to be created already exists.
        AssertionError: when fine-tuning is requested but the experiment directory
            contains no ``*fold*.ckpt`` file.
    """
    experiment_dir = os.path.abspath(f'trained-models/{experiment_name}')
    print('Experiment directory', experiment_dir)

    try:
        # -------- SETUP --------
        checkpoint_params = None
        finetune_model_fnames = None
        folds_df, holdout_df = None, None

        if not resume and not finetune:  # totally new experiment
            make_experiment_directory(experiment_dir)
            config = Configuration()
            config.debug = debug
            set_seeds(config.seed)

            # -------- LOAD DATA FROM TRAIN FILE --------
            data_df = pd.read_csv(os.path.join(config.data_dir, 'train.csv'), engine='python')
            data_df, holdout_df = make_holdout_df(data_df, seed=config.seed)
            folds_df = stratify_split(data_df, config.fold_num, config.seed, config.target_col)

            # -------- SAVE FILES (experiment state: things like resuming, fine tuning, and inference on holdout) --------
            folds_df.to_csv(os.path.join(experiment_dir, 'folds.csv'), index=False)
            holdout_df.to_csv(os.path.join(experiment_dir, 'holdout.csv'), index=False)
            with open(os.path.join(experiment_dir, 'experiment_config.json'), 'w') as config_file:
                json.dump(config.__dict__, config_file)
        else:  # resume and/or finetune
            # LOAD DATA FROM SAVED FILES
            with open(os.path.join(experiment_dir, 'experiment_config.json'), 'r') as config_file:
                # every JSON object becomes a SimpleNamespace so values are read as attributes
                config = json.load(config_file, object_hook=lambda d: SimpleNamespace(**d))
            set_seeds(config.seed)
            config.debug = debug

            folds_df = pd.read_csv(os.path.join(experiment_dir, 'folds.csv'), engine='python')
            holdout_df = pd.read_csv(os.path.join(experiment_dir, 'holdout.csv'), engine='python')

            if finetune and not resume:
                print('finetuning...')
                # verify there are checkpoints to fine tune (before any directory is created)
                finetune_model_fnames = glob.glob(experiment_dir + '/*fold*.ckpt')
                assert len(finetune_model_fnames) > 0, \
                    f'no *fold*.ckpt checkpoints to fine tune in {experiment_dir}'
                finetune_model_fnames.sort()

                # make new directory for tuning experiment with files from training run 1
                tune_dir = experiment_dir + '_tune'
                make_experiment_directory(tune_dir)
                for entry in os.listdir(experiment_dir):
                    print(f"copying {entry} to {tune_dir}")
                    source_path = os.path.join(experiment_dir, entry)
                    if os.path.isdir(source_path):
                        # shutil.copy2 only copies files; a sub-directory would abort the
                        # copy half-way and leave a partially filled tuning directory
                        shutil.copytree(source_path, os.path.join(tune_dir, entry))
                    else:
                        shutil.copy2(source_path, tune_dir)
                experiment_dir = tune_dir
                experiment_name += '_tune'
            else:
                print('resuming from last checkpoint...')
                checkpoint_params = get_checkpoint_params(experiment_dir, resume, config.model_arch)

        assert holdout_df is not None, 'holdout_df is None'
        assert folds_df is not None, 'folds_df is None'

        # cv2 multithreading seems to go into deadlock with PyTorch data loaders
        if config.num_workers > 0:
            cv2.setNumThreads(0)

        trainer = TrainManager(experiment_name=experiment_name, experiment_dir=experiment_dir,
                               folds_df=folds_df, holdout_df=holdout_df,
                               checkpoint_params=checkpoint_params, config=config,
                               finetune=finetune, freeze_bn=freeze_bn,
                               freeze_feature_extractor=freeze_feature_extractor,
                               finetune_model_fnames=finetune_model_fnames)
        trainer.run()
    finally:
        # runs on success, on error and on interruption
        torch.cuda.empty_cache()

def make_experiment_directory(name):
    """Create the directory ``name``, including any missing parent directories.

    When the path already exists a hint is printed and the ``FileExistsError``
    is re-raised if its ``errno`` is ``EEXIST``, so an existing experiment is
    not reused by accident.
    """
    try:
        os.makedirs(name)
    except FileExistsError as exists_error:
        print('Experiment already exists. Be sure to resume training appropriately or start a new experiment.')
        is_eexist = exists_error.errno == errno.EEXIST
        if is_eexist:
            raise


def get_checkpoint_params(basename, resume, model_arch):
    """Find the checkpoint a resumed run should continue from.

    Args:
        basename: directory that is searched (non-recursively) for ``*fold*.ckpt`` files.
        resume: when falsy nothing is searched and ``None`` is returned.
        model_arch: not used by this function; kept so existing callers keep working.

    Returns:
        ``None`` when ``resume`` is falsy, otherwise a dict with

        * ``'restart_from'``: the highest fold number (as ``int``) found in the checkpoint
          file names, or ``0`` when there is no checkpoint;
        * ``'checkpoint_file_path'``: path of the checkpoint belonging to that fold, or
          ``None`` when there is no checkpoint.
    """
    if not resume:
        return None

    # escape the directory part so glob metacharacters in its name are taken literally
    pattern = os.path.join(glob.escape(basename), '*fold*.ckpt')

    # (fold number, path) pairs; the fold number is parsed from the file name only, so a
    # directory that happens to contain "fold<digits>" cannot be mistaken for it
    fold_checkpoints = []
    for path in glob.glob(pattern):
        fold_match = re.search(r'fold(\d+)', os.path.basename(path))
        if fold_match:
            fold_checkpoints.append((int(fold_match.group(1)), path))

    if not fold_checkpoints:
        return {'restart_from': 0, 'checkpoint_file_path': None}

    # numeric comparison (fold 10 > fold 9); ties on the fold number fall back to the
    # lexicographically last path
    most_recent_fold, checkpoint_file_path = max(fold_checkpoints)
    return {'restart_from': most_recent_fold, 'checkpoint_file_path': checkpoint_file_path}

In [6]:
if __name__ == '__main__':
    # Launch a fresh run (no resume, no fine-tuning). A KeyboardInterrupt, e.g. from
    # interrupting the kernel, ends the cell quietly instead of showing a traceback.
    try:
        debug = False
        print('Running in debug mode:', debug)
        main(
            experiment_name='tf_efficientnet_b4_ns_sgd_onecycle_smoothing=0.05_weighted_bitempered_t1=0.8_t2=1',
            debug=debug,
            resume=False,
            finetune=False,
            freeze_bn=True,
            freeze_feature_extractor=False,
        )
    except KeyboardInterrupt:
        pass

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Using native 16bit precision.


Running in debug mode: False
Experiment directory /opt/favordata/AI/Felix/kaggle-cassava/trained-models/tf_efficientnet_b4_ns_sgd_onecycle_smoothing=0.05_weighted_bitempered_t1=0.8_t2=1
folds_df len 18187, holdout_df len 3210
Training fold 0
Class sample counts [ 758 1470 1623 8933 1765]
After class sample counts [2274 2940 3732 8933 4765]



  | Name           | Type           | Params
--------------------------------------------------
0 | valid_accuracy | Accuracy       | 0     
1 | test_accuracy  | Accuracy       | 0     
2 | criterion      | BiTemperedLoss | 0     
3 | model          | EfficientNet   | 17.6 M
--------------------------------------------------
17.4 M    Trainable params
125 K     Non-trainable params
17.6 M    Total params

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s][A
Finding best initial lr:   1%|          | 1/100 [00:00<00:30,  3.24it/s][A
Finding best initial lr:   2%|▏         | 2/100 [00:01<01:18,  1.24it/s][A
Finding best initial lr:   3%|▎         | 3/100 [00:02<01:33,  1.04it/s][A
Finding best initial lr:   4%|▍         | 4/100 [00:03<01:40,  1.04s/it][A
Finding best initial lr:   5%|▌         | 5/100 [00:04<01:42,  1.08s/it][A
Finding best initial lr:   6%|▌         | 6/100 [00:06<01:44,  1.11s/it][A
Finding best initial lr:   7%|▋         | 7/100 [00:07<01:44,  1.1

Learning rate set to 0.15848931924611143

  | Name           | Type           | Params
--------------------------------------------------
0 | valid_accuracy | Accuracy       | 0     
1 | test_accuracy  | Accuracy       | 0     
2 | criterion      | BiTemperedLoss | 0     
3 | model          | EfficientNet   | 17.6 M
--------------------------------------------------
17.4 M    Trainable params
125 K     Non-trainable params
17.6 M    Total params


Epoch 2:   3%|▎         | 33/967 [20:12<-1:08:27, -0.30it/s, loss=0.793, v_num=0, val_loss=nan, val_acc=0.852, train_loss=0.946]    

Finding best initial lr: 100%|██████████| 100/100 [06:11<00:00,  3.71s/it]


Epoch 1:  88%|████████▊ | 848/967 [04:11<01:06,  1.78it/s, loss=0.434, v_num=0, val_loss=0.957, val_acc=0.258, train_loss=0.302]




Epoch 1:  94%|█████████▍| 910/967 [04:29<00:30,  1.89it/s, loss=0.405, v_num=0, val_loss=0.957, val_acc=0.258, train_loss=0.219]
Validating: 0it [00:00, ?it/s][A
Epoch 1:  94%|█████████▍| 912/967 [04:31<00:29,  1.89it/s, loss=0.405, v_num=0, val_loss=0.957, val_acc=0.258, train_loss=0.219]
Validating:   4%|▎         | 2/57 [00:02<00:57,  1.05s/it][A
Epoch 1:  95%|█████████▍| 914/967 [04:32<00:28,  1.89it/s, loss=0.405, v_num=0, val_loss=0.957, val_acc=0.258, train_loss=0.219]
Validating:   7%|▋         | 4/57 [00:02<00:28,  1.88it/s][A
Epoch 1:  95%|█████████▍| 916/967 [04:33<00:26,  1.89it/s, loss=0.405, v_num=0, val_loss=0.957, val_acc=0.258, train_loss=0.219]
Validating:  11%|█         | 6/57 [00:03<00:19,  2.55it/s][A
Epoch 1:  95%|█████████▍| 918/967 [04:33<00:25,  1.90it/s, loss=0.405, v_num=0, val_loss=0.957, val_acc=0.258, train_loss=0.219]
Validating:  14%|█▍        | 8/57 [00:04<00:16,  3.00it/s][A
Epoch 1:  95%|█████████▌| 920/967 [04:34<00:24,  1.90it/s, loss=0.405, v_

Epoch 1, global step 228: val_loss reached 0.26035 (best 0.26035), saving model to "/opt/favordata/AI/Felix/kaggle-cassava/trained-models/tf_efficientnet_b4_ns_sgd_onecycle_smoothing=0.05_weighted_bitempered_t1=0.8_t2=1/tf_efficientnet_b4_ns_bitempered_smooth=0.05_val_loss=0.260_val_acc=0.851_fold0.ckpt" as top 1


Epoch 1: 100%|██████████| 967/967 [04:48<00:00,  1.97it/s, loss=0.405, v_num=0, val_loss=0.26, val_acc=0.851, train_loss=0.486] 
Epoch 2:  94%|█████████▍| 910/967 [04:30<00:30,  1.89it/s, loss=0.337, v_num=0, val_loss=0.26, val_acc=0.851, train_loss=0.362]    
Validating: 0it [00:00, ?it/s][A
Epoch 2:  94%|█████████▍| 912/967 [04:31<00:29,  1.89it/s, loss=0.337, v_num=0, val_loss=0.26, val_acc=0.851, train_loss=0.362]
Validating:   4%|▎         | 2/57 [00:01<00:46,  1.17it/s][A
Epoch 2:  95%|█████████▍| 914/967 [04:32<00:28,  1.89it/s, loss=0.337, v_num=0, val_loss=0.26, val_acc=0.851, train_loss=0.362]
Validating:   7%|▋         | 4/57 [00:02<00:24,  2.15it/s][A
Epoch 2:  95%|█████████▍| 916/967 [04:32<00:26,  1.89it/s, loss=0.337, v_num=0, val_loss=0.26, val_acc=0.851, train_loss=0.362]
Validating:  11%|█         | 6/57 [00:03<00:19,  2.65it/s][A
Epoch 2:  95%|█████████▍| 918/967 [04:33<00:25,  1.90it/s, loss=0.337, v_num=0, val_loss=0.26, val_acc=0.851, train_loss=0.362]
Validat

Epoch 2, global step 456: val_loss reached 0.22706 (best 0.22706), saving model to "/opt/favordata/AI/Felix/kaggle-cassava/trained-models/tf_efficientnet_b4_ns_sgd_onecycle_smoothing=0.05_weighted_bitempered_t1=0.8_t2=1/tf_efficientnet_b4_ns_bitempered_smooth=0.05_val_loss=0.227_val_acc=0.872_fold0.ckpt" as top 1


Epoch 2: 100%|██████████| 967/967 [04:48<00:00,  1.97it/s, loss=0.337, v_num=0, val_loss=0.227, val_acc=0.872, train_loss=0.379]
Epoch 3:  94%|█████████▍| 910/967 [04:29<00:30,  1.89it/s, loss=0.296, v_num=0, val_loss=0.227, val_acc=0.872, train_loss=0.429]     
Validating: 0it [00:00, ?it/s][A
Epoch 3:  94%|█████████▍| 912/967 [04:31<00:29,  1.89it/s, loss=0.296, v_num=0, val_loss=0.227, val_acc=0.872, train_loss=0.429]
Validating:   4%|▎         | 2/57 [00:01<00:44,  1.25it/s][A
Epoch 3:  95%|█████████▍| 914/967 [04:31<00:27,  1.89it/s, loss=0.296, v_num=0, val_loss=0.227, val_acc=0.872, train_loss=0.429]
Validating:   7%|▋         | 4/57 [00:02<00:23,  2.23it/s][A
Epoch 3:  95%|█████████▍| 916/967 [04:32<00:26,  1.90it/s, loss=0.296, v_num=0, val_loss=0.227, val_acc=0.872, train_loss=0.429]
Validating:  11%|█         | 6/57 [00:02<00:18,  2.83it/s][A
Epoch 3:  95%|█████████▍| 918/967 [04:33<00:25,  1.90it/s, loss=0.296, v_num=0, val_loss=0.227, val_acc=0.872, train_loss=0.429]
V

Epoch 3, global step 684: val_loss reached 0.21888 (best 0.21888), saving model to "/opt/favordata/AI/Felix/kaggle-cassava/trained-models/tf_efficientnet_b4_ns_sgd_onecycle_smoothing=0.05_weighted_bitempered_t1=0.8_t2=1/tf_efficientnet_b4_ns_bitempered_smooth=0.05_val_loss=0.219_val_acc=0.875_fold0.ckpt" as top 1


Epoch 3: 100%|██████████| 967/967 [04:47<00:00,  1.98it/s, loss=0.296, v_num=0, val_loss=0.219, val_acc=0.875, train_loss=0.193]
Epoch 4:  94%|█████████▍| 910/967 [04:30<00:30,  1.89it/s, loss=0.31, v_num=0, val_loss=0.219, val_acc=0.875, train_loss=0.194]     
Validating: 0it [00:00, ?it/s][A
Epoch 4:  94%|█████████▍| 912/967 [04:32<00:29,  1.89it/s, loss=0.31, v_num=0, val_loss=0.219, val_acc=0.875, train_loss=0.194]
Validating:   4%|▎         | 2/57 [00:02<00:53,  1.04it/s][A
Epoch 4:  95%|█████████▍| 914/967 [04:32<00:28,  1.89it/s, loss=0.31, v_num=0, val_loss=0.219, val_acc=0.875, train_loss=0.194]
Validating:   7%|▋         | 4/57 [00:02<00:27,  1.91it/s][A
Epoch 4:  95%|█████████▍| 916/967 [04:33<00:26,  1.89it/s, loss=0.31, v_num=0, val_loss=0.219, val_acc=0.875, train_loss=0.194]
Validating:  11%|█         | 6/57 [00:03<00:19,  2.60it/s][A
Epoch 4:  95%|█████████▍| 918/967 [04:33<00:25,  1.90it/s, loss=0.31, v_num=0, val_loss=0.219, val_acc=0.875, train_loss=0.194]
Valida

Epoch 4, global step 912: val_loss reached 0.21403 (best 0.21403), saving model to "/opt/favordata/AI/Felix/kaggle-cassava/trained-models/tf_efficientnet_b4_ns_sgd_onecycle_smoothing=0.05_weighted_bitempered_t1=0.8_t2=1/tf_efficientnet_b4_ns_bitempered_smooth=0.05_val_loss=0.214_val_acc=0.882_fold0.ckpt" as top 1


Epoch 4: 100%|██████████| 967/967 [04:48<00:00,  1.97it/s, loss=0.31, v_num=0, val_loss=0.214, val_acc=0.882, train_loss=0.154]
Epoch 5:  94%|█████████▍| 910/967 [04:30<00:30,  1.89it/s, loss=0.285, v_num=0, val_loss=0.214, val_acc=0.882, train_loss=0.37]     
Validating: 0it [00:00, ?it/s][A
Epoch 5:  94%|█████████▍| 912/967 [04:31<00:29,  1.89it/s, loss=0.285, v_num=0, val_loss=0.214, val_acc=0.882, train_loss=0.37]
Validating:   4%|▎         | 2/57 [00:01<00:47,  1.15it/s][A
Epoch 5:  95%|█████████▍| 914/967 [04:32<00:28,  1.89it/s, loss=0.285, v_num=0, val_loss=0.214, val_acc=0.882, train_loss=0.37]
Validating:   7%|▋         | 4/57 [00:02<00:25,  2.11it/s][A
Epoch 5:  95%|█████████▍| 916/967 [04:33<00:26,  1.89it/s, loss=0.285, v_num=0, val_loss=0.214, val_acc=0.882, train_loss=0.37]
Validating:  11%|█         | 6/57 [00:03<00:19,  2.68it/s][A
Epoch 5:  95%|█████████▍| 918/967 [04:33<00:25,  1.90it/s, loss=0.285, v_num=0, val_loss=0.214, val_acc=0.882, train_loss=0.37]
Validat

Epoch 5, global step 1140: val_loss reached 0.20602 (best 0.20602), saving model to "/opt/favordata/AI/Felix/kaggle-cassava/trained-models/tf_efficientnet_b4_ns_sgd_onecycle_smoothing=0.05_weighted_bitempered_t1=0.8_t2=1/tf_efficientnet_b4_ns_bitempered_smooth=0.05_val_loss=0.206_val_acc=0.889_fold0.ckpt" as top 1


Epoch 5: 100%|██████████| 967/967 [04:48<00:00,  1.97it/s, loss=0.285, v_num=0, val_loss=0.206, val_acc=0.889, train_loss=0.245]
Epoch 6:  94%|█████████▍| 910/967 [04:29<00:30,  1.89it/s, loss=0.279, v_num=0, val_loss=0.206, val_acc=0.889, train_loss=0.216]    
Validating: 0it [00:00, ?it/s][A
Epoch 6:  94%|█████████▍| 912/967 [04:32<00:29,  1.89it/s, loss=0.279, v_num=0, val_loss=0.206, val_acc=0.889, train_loss=0.216]
Validating:   4%|▎         | 2/57 [00:02<00:56,  1.03s/it][A
Epoch 6:  95%|█████████▍| 914/967 [04:32<00:28,  1.89it/s, loss=0.279, v_num=0, val_loss=0.206, val_acc=0.889, train_loss=0.216]
Validating:   7%|▋         | 4/57 [00:02<00:28,  1.87it/s][A
Epoch 6:  95%|█████████▍| 916/967 [04:33<00:26,  1.89it/s, loss=0.279, v_num=0, val_loss=0.206, val_acc=0.889, train_loss=0.216]
Validating:  11%|█         | 6/57 [00:03<00:19,  2.60it/s][A
Epoch 6:  95%|█████████▍| 918/967 [04:33<00:25,  1.90it/s, loss=0.279, v_num=0, val_loss=0.206, val_acc=0.889, train_loss=0.216]
Va

Epoch 6, step 1368: val_loss was not in top 1


Epoch 6: 100%|██████████| 967/967 [04:47<00:00,  1.97it/s, loss=0.279, v_num=0, val_loss=0.213, val_acc=0.876, train_loss=0.202]
Epoch 7:  94%|█████████▍| 910/967 [04:30<00:30,  1.89it/s, loss=0.278, v_num=0, val_loss=0.213, val_acc=0.876, train_loss=0.13]     
Validating: 0it [00:00, ?it/s][A
Epoch 7:  94%|█████████▍| 912/967 [04:31<00:29,  1.89it/s, loss=0.278, v_num=0, val_loss=0.213, val_acc=0.876, train_loss=0.13]
Validating:   4%|▎         | 2/57 [00:02<00:49,  1.11it/s][A
Epoch 7:  95%|█████████▍| 914/967 [04:32<00:28,  1.89it/s, loss=0.278, v_num=0, val_loss=0.213, val_acc=0.876, train_loss=0.13]
Validating:   7%|▋         | 4/57 [00:02<00:25,  2.06it/s][A
Epoch 7:  95%|█████████▍| 916/967 [04:32<00:26,  1.89it/s, loss=0.278, v_num=0, val_loss=0.213, val_acc=0.876, train_loss=0.13]
Validating:  11%|█         | 6/57 [00:03<00:19,  2.66it/s][A
Epoch 7:  95%|█████████▍| 918/967 [04:33<00:25,  1.90it/s, loss=0.278, v_num=0, val_loss=0.213, val_acc=0.876, train_loss=0.13]
Valida

Epoch 7, step 1596: val_loss was not in top 1


Epoch 7: 100%|██████████| 967/967 [04:47<00:00,  1.97it/s, loss=0.278, v_num=0, val_loss=nan, val_acc=0.0522, train_loss=0.209]
Epoch 8:  94%|█████████▍| 910/967 [04:29<00:30,  1.89it/s, loss=0.271, v_num=0, val_loss=nan, val_acc=0.0522, train_loss=0.266]    
Validating: 0it [00:00, ?it/s][A
Epoch 8:  94%|█████████▍| 912/967 [04:31<00:29,  1.89it/s, loss=0.271, v_num=0, val_loss=nan, val_acc=0.0522, train_loss=0.266]
Validating:   4%|▎         | 2/57 [00:02<00:53,  1.03it/s][A
Epoch 8:  95%|█████████▍| 914/967 [04:32<00:28,  1.89it/s, loss=0.271, v_num=0, val_loss=nan, val_acc=0.0522, train_loss=0.266]
Validating:   7%|▋         | 4/57 [00:02<00:26,  1.99it/s][A
Epoch 8:  95%|█████████▍| 916/967 [04:33<00:26,  1.89it/s, loss=0.271, v_num=0, val_loss=nan, val_acc=0.0522, train_loss=0.266]
Validating:  11%|█         | 6/57 [00:03<00:19,  2.68it/s][A
Epoch 8:  95%|█████████▍| 918/967 [04:33<00:25,  1.90it/s, loss=0.271, v_num=0, val_loss=nan, val_acc=0.0522, train_loss=0.266]
Validati

Epoch 8, step 1824: val_loss was not in top 1


Epoch 8: 100%|██████████| 967/967 [04:47<00:00,  1.97it/s, loss=0.271, v_num=0, val_loss=nan, val_acc=0.0522, train_loss=0.174]
Epoch 9:   7%|▋         | 68/967 [00:20<-1:59:04, -15.87it/s, loss=0.26, v_num=0, val_loss=nan, val_acc=0.0522, train_loss=0.162]  

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Using native 16bit precision.


Training fold 1
Class sample counts [ 758 1471 1622 8933 1765]
After class sample counts [2274 2942 3730 8933 4765]
