In [2]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.insert(0, "../")

from autogluon.vision import ImagePredictor, ImageDataset
import numpy as np
import pandas as pd
import pickle
import datetime
from pathlib import Path
from sklearn.model_selection import StratifiedKFold
from cross_validation_autogluon import cross_val_predict_autogluon_image_dataset, train_predict_autogluon

# Lift pandas' display truncation limits (None = no limit) so that rows,
# columns and long cell values such as image paths are shown in full.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Read data

In [4]:
# change label index based on list of classes
def process_autogluon_image_dataset(dataset, classes):
    """Attach class-folder names and re-indexed labels to an image dataset.

    The class name of every image is read from its parent directory (the
    second-to-last component of the path stored in the ``image`` column) and
    is then translated to a label index by looking it up in ``classes``.

    Args:
        dataset: table with an ``image`` column of file paths. It is modified
            in place: the ``label_name`` and ``label`` columns are added, or
            overwritten if they already exist.
        classes: mapping from class-folder name to label index.

    Returns:
        The same ``dataset`` object that was passed in.

    Raises:
        KeyError: if a folder name has no entry in ``classes`` (for a dict).
    """

    def parent_folder(image_path):
        # .../<class folder>/<file name>  ->  <class folder>
        return Path(image_path).parts[-2]

    def to_label_index(folder_name):
        # Plain item lookup so that an unknown folder name fails loudly
        # instead of silently producing a missing label.
        return classes[folder_name]

    folder_names = dataset.image.map(parent_folder)
    dataset["label_name"] = folder_names
    dataset["label"] = dataset["label_name"].map(to_label_index)
    return dataset

In [5]:
# Root folder of the original, de-duplicated competition data. Images are
# stored as <split>/<class name>/<file>.png below it.
DATA_PATH = "/Data/andrew-ng-dcai-comp-2021-data-deduped/andrew-ng-dcai-comp-2021-data/"

# read the train / val / test datasets from the root folder
train_dataset, val_dataset, test_dataset = ImageDataset.from_folders(root=DATA_PATH)

# NOTE! change the label index! AutoGluon assigns label index based on
# alphabetical order by default, so map each class name to its position in the
# list below instead.
classes = {
    label_name: idx
    for idx, label_name in enumerate(
        ["i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x"]
    )
}

# derive label_name from the folder name and overwrite label with the index from `classes`
train_dataset = process_autogluon_image_dataset(train_dataset, classes)
val_dataset = process_autogluon_image_dataset(val_dataset, classes)
test_dataset = process_autogluon_image_dataset(test_dataset, classes)

# combine train and val dataset for cross-validation.
# ignore_index=True gives the combined frame a fresh, unique 0..n-1 index;
# without it the rows of both splits keep their own row labels, which leaves
# duplicate index values in the combined frame.
train_val_dataset = pd.concat([train_dataset, val_dataset], ignore_index=True)

In [12]:
# (rows, columns) of the combined train + val dataset
train_val_dataset.shape

(2831, 3)

In [16]:
# number of images per (label index, label name) pair; also shows which index each class name was mapped to
train_val_dataset.groupby(["label", "label_name"])["image"].count()

label  label_name
0      i             335
1      ii            238
2      iii           256
3      iv            362
4      v             274
5      vi            263
6      vii           263
7      viii          282
8      ix            309
9      x             249
Name: image, dtype: int64

In [17]:
# preview the first rows: image path, label index and label name
train_val_dataset.head()

Unnamed: 0,image,label,label_name
0,/Data/andrew-ng-dcai-comp-2021-data-deduped/andrew-ng-dcai-comp-2021-data/train/i/ab9fb784-ce5d-11eb-b317-38f9d35ea60f.png,0,i
1,/Data/andrew-ng-dcai-comp-2021-data-deduped/andrew-ng-dcai-comp-2021-data/train/i/aba24486-ce5d-11eb-b317-38f9d35ea60f.png,0,i
2,/Data/andrew-ng-dcai-comp-2021-data-deduped/andrew-ng-dcai-comp-2021-data/train/i/aba35128-ce5d-11eb-b317-38f9d35ea60f.png,0,i
3,/Data/andrew-ng-dcai-comp-2021-data-deduped/andrew-ng-dcai-comp-2021-data/train/i/aba4001e-ce5d-11eb-b317-38f9d35ea60f.png,0,i
4,/Data/andrew-ng-dcai-comp-2021-data-deduped/andrew-ng-dcai-comp-2021-data/train/i/aba4a622-ce5d-11eb-b317-38f9d35ea60f.png,0,i


## Generate Out-of-Sample Predicted Probabilities via Cross-Validation

In [19]:
%%time

# generate cross-validated predicted probabilities for various models so we can use them for ensemble scoring methods
models = [
    "resnet18",
    "resnet50d",
    "efficientnet_b1",
    "twins_pcpvt_base",
    "swin_base_patch4_window7_224"
]

# training settings shared by every model
epochs = 100
holdout_frac = 0.2
n_splits = 5

# CV start timestamp: taken once, before the loop, so that all models of this
# run share the same suffix in their output folder names
ts = datetime.datetime.now().strftime("%Y%m%d%H%M%S")

# run cross-validation for each model
for model in models:

    print("----")
    print(f"Running cross-validation for model: {model}")

    MODEL_PARAMS = {
        "model": model,
        "epochs": epochs,
        "holdout_frac": holdout_frac
    }

    # One output folder per model and per run. The timestamp suffix keeps a
    # re-run from writing into (and overwriting) the results of an earlier run.
    out_folder = f"dcai_train_val_dataset_cv_{model}_{ts}/"

    # results of cross-validation will be saved to pickle files for each model/fold,
    # so the return value is not needed here
    _ = train_predict_autogluon(
        dataset=train_val_dataset,  # train with NOISY LABELS
        classes=classes,
        out_folder=out_folder,
        n_splits=n_splits,
        model_params=MODEL_PARAMS,
    )

modified configs(<old> != <new>): {
root.img_cls.model   resnet101 != resnet18
root.train.early_stop_baseline 0.0 != -inf
root.train.batch_size 32 != 16
root.train.epochs    200 != 100
root.train.early_stop_max_value 1.0 != inf
root.train.early_stop_patience -1 != 10
root.misc.seed       42 != 84
root.misc.num_workers 4 != 16
}
Saved config to /dcai/src/experiments/andrew-ng-dcai/1cc2f720/.trial_0/config.yaml


----
Running cross-validation for model: resnet18
----
Running Cross-Validation on Split: 0


Model resnet18 created, param count:                                         11181642
AMP not enabled. Training in float32.
Disable EMA as it is not supported for now.
Start training from [Epoch 0]
Epoch[0] Batch [49]	Speed: 239.285470 samples/sec	accuracy=0.092500	lr=0.000100
Epoch[0] Batch [99]	Speed: 523.931972 samples/sec	accuracy=0.133125	lr=0.000100
[Epoch 0] training: accuracy=0.132874
[Epoch 0] speed: 353 samples/sec	time cost: 5.698930
[Epoch 0] validation: top1=0.149780 top5=0.709251
[Epoch 0] Current best top-1: 0.149780 vs previous -inf, saved to /dcai/src/experiments/andrew-ng-dcai/1cc2f720/.trial_0/best_checkpoint.pkl
Epoch[1] Batch [49]	Speed: 451.582510 samples/sec	accuracy=0.185000	lr=0.000250
Epoch[1] Batch [99]	Speed: 504.656697 samples/sec	accuracy=0.232500	lr=0.000250
[Epoch 1] training: accuracy=0.242618
[Epoch 1] speed: 490 samples/sec	time cost: 4.113294
[Epoch 1] validation: top1=0.339207 top5=0.837004
[Epoch 1] Current best top-1: 0.339207 vs previous 0.149780

Saving dcai_train_val_dataset_cv_resnet18_20220329175851/split_0/_test_pred_probs_split_0
Saving dcai_train_val_dataset_cv_resnet18_20220329175851/split_0/_test_pred_features_split_0


modified configs(<old> != <new>): {
root.img_cls.model   resnet101 != resnet18
root.train.early_stop_baseline 0.0 != -inf
root.train.batch_size 32 != 16
root.train.epochs    200 != 100
root.train.early_stop_max_value 1.0 != inf
root.train.early_stop_patience -1 != 10
root.misc.seed       42 != 344
root.misc.num_workers 4 != 16
}
Saved config to /dcai/src/experiments/andrew-ng-dcai/6f2c7aa5/.trial_0/config.yaml


Saving dcai_train_val_dataset_cv_resnet18_20220329175851/split_0/_test_labels_split_0
Saving dcai_train_val_dataset_cv_resnet18_20220329175851/split_0/_test_image_files_split_0
Saving dcai_train_val_dataset_cv_resnet18_20220329175851/split_0/_test_indices_split_0
----
Running Cross-Validation on Split: 1


Model resnet18 created, param count:                                         11181642
AMP not enabled. Training in float32.
Disable EMA as it is not supported for now.
Start training from [Epoch 0]
Epoch[0] Batch [49]	Speed: 243.274296 samples/sec	accuracy=0.123750	lr=0.000100
Epoch[0] Batch [99]	Speed: 525.497794 samples/sec	accuracy=0.138750	lr=0.000100
[Epoch 0] training: accuracy=0.144685
[Epoch 0] speed: 361 samples/sec	time cost: 5.583660
[Epoch 0] validation: top1=0.158590 top5=0.634361
[Epoch 0] Current best top-1: 0.158590 vs previous -inf, saved to /dcai/src/experiments/andrew-ng-dcai/6f2c7aa5/.trial_0/best_checkpoint.pkl
Epoch[1] Batch [49]	Speed: 429.767985 samples/sec	accuracy=0.223750	lr=0.000250
Epoch[1] Batch [99]	Speed: 537.922917 samples/sec	accuracy=0.253125	lr=0.000250
[Epoch 1] training: accuracy=0.263287
[Epoch 1] speed: 487 samples/sec	time cost: 4.133948
[Epoch 1] validation: top1=0.378855 top5=0.872247
[Epoch 1] Current best top-1: 0.378855 vs previous 0.158590

Saving dcai_train_val_dataset_cv_resnet18_20220329175851/split_1/_test_pred_probs_split_1
Saving dcai_train_val_dataset_cv_resnet18_20220329175851/split_1/_test_pred_features_split_1


modified configs(<old> != <new>): {
root.img_cls.model   resnet101 != resnet18
root.train.early_stop_baseline 0.0 != -inf
root.train.batch_size 32 != 16
root.train.epochs    200 != 100
root.train.early_stop_max_value 1.0 != inf
root.train.early_stop_patience -1 != 10
root.misc.seed       42 != 661
root.misc.num_workers 4 != 16
}
Saved config to /dcai/src/experiments/andrew-ng-dcai/bd568d90/.trial_0/config.yaml


Saving dcai_train_val_dataset_cv_resnet18_20220329175851/split_1/_test_labels_split_1
Saving dcai_train_val_dataset_cv_resnet18_20220329175851/split_1/_test_image_files_split_1
Saving dcai_train_val_dataset_cv_resnet18_20220329175851/split_1/_test_indices_split_1
----
Running Cross-Validation on Split: 2


Model resnet18 created, param count:                                         11181642
AMP not enabled. Training in float32.
Disable EMA as it is not supported for now.
Start training from [Epoch 0]
Epoch[0] Batch [49]	Speed: 246.391770 samples/sec	accuracy=0.106250	lr=0.000100
Epoch[0] Batch [99]	Speed: 507.602549 samples/sec	accuracy=0.126875	lr=0.000100
[Epoch 0] training: accuracy=0.139272
[Epoch 0] speed: 360 samples/sec	time cost: 5.596146
[Epoch 0] validation: top1=0.158590 top5=0.647577
[Epoch 0] Current best top-1: 0.158590 vs previous -inf, saved to /dcai/src/experiments/andrew-ng-dcai/bd568d90/.trial_0/best_checkpoint.pkl
Epoch[1] Batch [49]	Speed: 429.238829 samples/sec	accuracy=0.191250	lr=0.000250
Epoch[1] Batch [99]	Speed: 511.735605 samples/sec	accuracy=0.226875	lr=0.000250
[Epoch 1] training: accuracy=0.240157
[Epoch 1] speed: 479 samples/sec	time cost: 4.205325
[Epoch 1] validation: top1=0.356828 top5=0.828194
[Epoch 1] Current best top-1: 0.356828 vs previous 0.158590

Saving dcai_train_val_dataset_cv_resnet18_20220329175851/split_2/_test_pred_probs_split_2
Saving dcai_train_val_dataset_cv_resnet18_20220329175851/split_2/_test_pred_features_split_2


modified configs(<old> != <new>): {
root.img_cls.model   resnet101 != resnet18
root.train.early_stop_baseline 0.0 != -inf
root.train.batch_size 32 != 16
root.train.epochs    200 != 100
root.train.early_stop_max_value 1.0 != inf
root.train.early_stop_patience -1 != 10
root.misc.seed       42 != 508
root.misc.num_workers 4 != 16
}
Saved config to /dcai/src/experiments/andrew-ng-dcai/68693ad6/.trial_0/config.yaml


Saving dcai_train_val_dataset_cv_resnet18_20220329175851/split_2/_test_labels_split_2
Saving dcai_train_val_dataset_cv_resnet18_20220329175851/split_2/_test_image_files_split_2
Saving dcai_train_val_dataset_cv_resnet18_20220329175851/split_2/_test_indices_split_2
----
Running Cross-Validation on Split: 3


Model resnet18 created, param count:                                         11181642
AMP not enabled. Training in float32.
Disable EMA as it is not supported for now.
Start training from [Epoch 0]
Epoch[0] Batch [49]	Speed: 238.721351 samples/sec	accuracy=0.118750	lr=0.000100
Epoch[0] Batch [99]	Speed: 512.975370 samples/sec	accuracy=0.144375	lr=0.000100
[Epoch 0] training: accuracy=0.153051
[Epoch 0] speed: 351 samples/sec	time cost: 5.742999
[Epoch 0] validation: top1=0.286344 top5=0.762115
[Epoch 0] Current best top-1: 0.286344 vs previous -inf, saved to /dcai/src/experiments/andrew-ng-dcai/68693ad6/.trial_0/best_checkpoint.pkl
Epoch[1] Batch [49]	Speed: 443.120509 samples/sec	accuracy=0.227500	lr=0.000250
Epoch[1] Batch [99]	Speed: 494.456074 samples/sec	accuracy=0.248750	lr=0.000250
[Epoch 1] training: accuracy=0.262795
[Epoch 1] speed: 473 samples/sec	time cost: 4.257391
[Epoch 1] validation: top1=0.330396 top5=0.854626
[Epoch 1] Current best top-1: 0.330396 vs previous 0.286344

Saving dcai_train_val_dataset_cv_resnet18_20220329175851/split_3/_test_pred_probs_split_3
Saving dcai_train_val_dataset_cv_resnet18_20220329175851/split_3/_test_pred_features_split_3


modified configs(<old> != <new>): {
root.img_cls.model   resnet101 != resnet18
root.train.early_stop_baseline 0.0 != -inf
root.train.batch_size 32 != 16
root.train.epochs    200 != 100
root.train.early_stop_max_value 1.0 != inf
root.train.early_stop_patience -1 != 10
root.misc.seed       42 != 709
root.misc.num_workers 4 != 16
}
Saved config to /dcai/src/experiments/andrew-ng-dcai/f9e76c5b/.trial_0/config.yaml


Saving dcai_train_val_dataset_cv_resnet18_20220329175851/split_3/_test_labels_split_3
Saving dcai_train_val_dataset_cv_resnet18_20220329175851/split_3/_test_image_files_split_3
Saving dcai_train_val_dataset_cv_resnet18_20220329175851/split_3/_test_indices_split_3
----
Running Cross-Validation on Split: 4


Model resnet18 created, param count:                                         11181642
AMP not enabled. Training in float32.
Disable EMA as it is not supported for now.
Start training from [Epoch 0]
Epoch[0] Batch [49]	Speed: 249.999326 samples/sec	accuracy=0.106250	lr=0.000100
Epoch[0] Batch [99]	Speed: 503.462358 samples/sec	accuracy=0.131875	lr=0.000100
[Epoch 0] training: accuracy=0.139764
[Epoch 0] speed: 363 samples/sec	time cost: 5.550025
[Epoch 0] validation: top1=0.281938 top5=0.713656
[Epoch 0] Current best top-1: 0.281938 vs previous -inf, saved to /dcai/src/experiments/andrew-ng-dcai/f9e76c5b/.trial_0/best_checkpoint.pkl
Epoch[1] Batch [49]	Speed: 436.465598 samples/sec	accuracy=0.255000	lr=0.000250
Epoch[1] Batch [99]	Speed: 518.315147 samples/sec	accuracy=0.288750	lr=0.000250
[Epoch 1] training: accuracy=0.280020
[Epoch 1] speed: 483 samples/sec	time cost: 4.172208
[Epoch 1] validation: top1=0.427313 top5=0.898678
[Epoch 1] Current best top-1: 0.427313 vs previous 0.281938

Saving dcai_train_val_dataset_cv_resnet18_20220329175851/split_4/_test_pred_probs_split_4
Saving dcai_train_val_dataset_cv_resnet18_20220329175851/split_4/_test_pred_features_split_4


modified configs(<old> != <new>): {
root.img_cls.model   resnet101 != resnet50d
root.train.early_stop_baseline 0.0 != -inf
root.train.batch_size 32 != 16
root.train.epochs    200 != 100
root.train.early_stop_max_value 1.0 != inf
root.train.early_stop_patience -1 != 10
root.misc.seed       42 != 535
root.misc.num_workers 4 != 16
}
Saved config to /dcai/src/experiments/andrew-ng-dcai/231a41eb/.trial_0/config.yaml


Saving dcai_train_val_dataset_cv_resnet18_20220329175851/split_4/_test_labels_split_4
Saving dcai_train_val_dataset_cv_resnet18_20220329175851/split_4/_test_image_files_split_4
Saving dcai_train_val_dataset_cv_resnet18_20220329175851/split_4/_test_indices_split_4
----
Running cross-validation for model: resnet50d
----
Running Cross-Validation on Split: 0


Downloading: "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet50d_ra2-464e36ba.pth" to /root/.cache/torch/hub/checkpoints/resnet50d_ra2-464e36ba.pth
Model resnet50d created, param count:                                         23547754
AMP not enabled. Training in float32.
Disable EMA as it is not supported for now.
Start training from [Epoch 0]
Epoch[0] Batch [49]	Speed: 167.131761 samples/sec	accuracy=0.103750	lr=0.000100
Epoch[0] Batch [99]	Speed: 247.215961 samples/sec	accuracy=0.116250	lr=0.000100
[Epoch 0] training: accuracy=0.117618
[Epoch 0] speed: 210 samples/sec	time cost: 9.558042
[Epoch 0] validation: top1=0.127753 top5=0.577093
[Epoch 0] Current best top-1: 0.127753 vs previous -inf, saved to /dcai/src/experiments/andrew-ng-dcai/231a41eb/.trial_0/best_checkpoint.pkl
Epoch[1] Batch [49]	Speed: 235.622514 samples/sec	accuracy=0.163750	lr=0.000250
Epoch[1] Batch [99]	Speed: 249.410873 samples/sec	accuracy=0.168125	lr=0.000250
[Epoch 1] t

Saving dcai_train_val_dataset_cv_resnet50d_20220329175851/split_0/_test_pred_probs_split_0
Saving dcai_train_val_dataset_cv_resnet50d_20220329175851/split_0/_test_pred_features_split_0


modified configs(<old> != <new>): {
root.img_cls.model   resnet101 != resnet50d
root.train.early_stop_baseline 0.0 != -inf
root.train.batch_size 32 != 16
root.train.epochs    200 != 100
root.train.early_stop_max_value 1.0 != inf
root.train.early_stop_patience -1 != 10
root.misc.seed       42 != 261
root.misc.num_workers 4 != 16
}
Saved config to /dcai/src/experiments/andrew-ng-dcai/7c3a2cc9/.trial_0/config.yaml


Saving dcai_train_val_dataset_cv_resnet50d_20220329175851/split_0/_test_labels_split_0
Saving dcai_train_val_dataset_cv_resnet50d_20220329175851/split_0/_test_image_files_split_0
Saving dcai_train_val_dataset_cv_resnet50d_20220329175851/split_0/_test_indices_split_0
----
Running Cross-Validation on Split: 1


Model resnet50d created, param count:                                         23547754
AMP not enabled. Training in float32.
Disable EMA as it is not supported for now.
Start training from [Epoch 0]
Epoch[0] Batch [49]	Speed: 163.363048 samples/sec	accuracy=0.090000	lr=0.000100
Epoch[0] Batch [99]	Speed: 255.294469 samples/sec	accuracy=0.096250	lr=0.000100
[Epoch 0] training: accuracy=0.100394
[Epoch 0] speed: 205 samples/sec	time cost: 9.790966
[Epoch 0] validation: top1=0.083700 top5=0.537445
[Epoch 0] Current best top-1: 0.083700 vs previous -inf, saved to /dcai/src/experiments/andrew-ng-dcai/7c3a2cc9/.trial_0/best_checkpoint.pkl
Epoch[1] Batch [49]	Speed: 238.166630 samples/sec	accuracy=0.125000	lr=0.000250
Epoch[1] Batch [99]	Speed: 239.214488 samples/sec	accuracy=0.146875	lr=0.000250
[Epoch 1] training: accuracy=0.150098
[Epoch 1] speed: 244 samples/sec	time cost: 8.243794
[Epoch 1] validation: top1=0.189427 top5=0.647577
[Epoch 1] Current best top-1: 0.189427 vs previous 0.08370

Saving dcai_train_val_dataset_cv_resnet50d_20220329175851/split_1/_test_pred_probs_split_1
Saving dcai_train_val_dataset_cv_resnet50d_20220329175851/split_1/_test_pred_features_split_1


modified configs(<old> != <new>): {
root.img_cls.model   resnet101 != resnet50d
root.train.early_stop_baseline 0.0 != -inf
root.train.batch_size 32 != 16
root.train.epochs    200 != 100
root.train.early_stop_max_value 1.0 != inf
root.train.early_stop_patience -1 != 10
root.misc.seed       42 != 351
root.misc.num_workers 4 != 16
}
Saved config to /dcai/src/experiments/andrew-ng-dcai/19094d68/.trial_0/config.yaml


Saving dcai_train_val_dataset_cv_resnet50d_20220329175851/split_1/_test_labels_split_1
Saving dcai_train_val_dataset_cv_resnet50d_20220329175851/split_1/_test_image_files_split_1
Saving dcai_train_val_dataset_cv_resnet50d_20220329175851/split_1/_test_indices_split_1
----
Running Cross-Validation on Split: 2


Model resnet50d created, param count:                                         23547754
AMP not enabled. Training in float32.
Disable EMA as it is not supported for now.
Start training from [Epoch 0]
Epoch[0] Batch [49]	Speed: 160.345456 samples/sec	accuracy=0.150000	lr=0.000100
Epoch[0] Batch [99]	Speed: 264.206982 samples/sec	accuracy=0.135000	lr=0.000100
[Epoch 0] training: accuracy=0.130906
[Epoch 0] speed: 211 samples/sec	time cost: 9.554395
[Epoch 0] validation: top1=0.154185 top5=0.621145
[Epoch 0] Current best top-1: 0.154185 vs previous -inf, saved to /dcai/src/experiments/andrew-ng-dcai/19094d68/.trial_0/best_checkpoint.pkl
Epoch[1] Batch [49]	Speed: 232.891144 samples/sec	accuracy=0.158750	lr=0.000250
Epoch[1] Batch [99]	Speed: 246.180358 samples/sec	accuracy=0.165625	lr=0.000250
[Epoch 1] training: accuracy=0.177165
[Epoch 1] speed: 235 samples/sec	time cost: 8.555349
[Epoch 1] validation: top1=0.277533 top5=0.722467
[Epoch 1] Current best top-1: 0.277533 vs previous 0.15418

Saving dcai_train_val_dataset_cv_resnet50d_20220329175851/split_2/_test_pred_probs_split_2
Saving dcai_train_val_dataset_cv_resnet50d_20220329175851/split_2/_test_pred_features_split_2


modified configs(<old> != <new>): {
root.img_cls.model   resnet101 != resnet50d
root.train.early_stop_baseline 0.0 != -inf
root.train.batch_size 32 != 16
root.train.epochs    200 != 100
root.train.early_stop_max_value 1.0 != inf
root.train.early_stop_patience -1 != 10
root.misc.seed       42 != 98
root.misc.num_workers 4 != 16
}
Saved config to /dcai/src/experiments/andrew-ng-dcai/91a58b61/.trial_0/config.yaml


Saving dcai_train_val_dataset_cv_resnet50d_20220329175851/split_2/_test_labels_split_2
Saving dcai_train_val_dataset_cv_resnet50d_20220329175851/split_2/_test_image_files_split_2
Saving dcai_train_val_dataset_cv_resnet50d_20220329175851/split_2/_test_indices_split_2
----
Running Cross-Validation on Split: 3


Model resnet50d created, param count:                                         23547754
AMP not enabled. Training in float32.
Disable EMA as it is not supported for now.
Start training from [Epoch 0]
Epoch[0] Batch [49]	Speed: 159.265178 samples/sec	accuracy=0.097500	lr=0.000100
Epoch[0] Batch [99]	Speed: 242.371468 samples/sec	accuracy=0.098750	lr=0.000100
[Epoch 0] training: accuracy=0.105807
[Epoch 0] speed: 201 samples/sec	time cost: 10.011939
[Epoch 0] validation: top1=0.101322 top5=0.585903
[Epoch 0] Current best top-1: 0.101322 vs previous -inf, saved to /dcai/src/experiments/andrew-ng-dcai/91a58b61/.trial_0/best_checkpoint.pkl
Epoch[1] Batch [49]	Speed: 235.940984 samples/sec	accuracy=0.131250	lr=0.000250
Epoch[1] Batch [99]	Speed: 259.771736 samples/sec	accuracy=0.153750	lr=0.000250
[Epoch 1] training: accuracy=0.162402
[Epoch 1] speed: 247 samples/sec	time cost: 8.142409
[Epoch 1] validation: top1=0.211454 top5=0.744493
[Epoch 1] Current best top-1: 0.211454 vs previous 0.1013

Saving dcai_train_val_dataset_cv_resnet50d_20220329175851/split_3/_test_pred_probs_split_3
Saving dcai_train_val_dataset_cv_resnet50d_20220329175851/split_3/_test_pred_features_split_3


modified configs(<old> != <new>): {
root.img_cls.model   resnet101 != resnet50d
root.train.early_stop_baseline 0.0 != -inf
root.train.batch_size 32 != 16
root.train.epochs    200 != 100
root.train.early_stop_max_value 1.0 != inf
root.train.early_stop_patience -1 != 10
root.misc.seed       42 != 506
root.misc.num_workers 4 != 16
}
Saved config to /dcai/src/experiments/andrew-ng-dcai/09585111/.trial_0/config.yaml


Saving dcai_train_val_dataset_cv_resnet50d_20220329175851/split_3/_test_labels_split_3
Saving dcai_train_val_dataset_cv_resnet50d_20220329175851/split_3/_test_image_files_split_3
Saving dcai_train_val_dataset_cv_resnet50d_20220329175851/split_3/_test_indices_split_3
----
Running Cross-Validation on Split: 4


Model resnet50d created, param count:                                         23547754
AMP not enabled. Training in float32.
Disable EMA as it is not supported for now.
Start training from [Epoch 0]
Epoch[0] Batch [49]	Speed: 156.994936 samples/sec	accuracy=0.096250	lr=0.000100
Epoch[0] Batch [99]	Speed: 237.112011 samples/sec	accuracy=0.105000	lr=0.000100
[Epoch 0] training: accuracy=0.115157
[Epoch 0] speed: 189 samples/sec	time cost: 10.668043
[Epoch 0] validation: top1=0.185022 top5=0.572687
[Epoch 0] Current best top-1: 0.185022 vs previous -inf, saved to /dcai/src/experiments/andrew-ng-dcai/09585111/.trial_0/best_checkpoint.pkl
Epoch[1] Batch [49]	Speed: 234.993609 samples/sec	accuracy=0.158750	lr=0.000250
Epoch[1] Batch [99]	Speed: 255.553943 samples/sec	accuracy=0.175000	lr=0.000250
[Epoch 1] training: accuracy=0.180610
[Epoch 1] speed: 241 samples/sec	time cost: 8.356817
[Epoch 1] validation: top1=0.251101 top5=0.766520
[Epoch 1] Current best top-1: 0.251101 vs previous 0.1850

Saving dcai_train_val_dataset_cv_resnet50d_20220329175851/split_4/_test_pred_probs_split_4
Saving dcai_train_val_dataset_cv_resnet50d_20220329175851/split_4/_test_pred_features_split_4


modified configs(<old> != <new>): {
root.img_cls.model   resnet101 != efficientnet_b1
root.train.early_stop_baseline 0.0 != -inf
root.train.batch_size 32 != 16
root.train.epochs    200 != 100
root.train.early_stop_max_value 1.0 != inf
root.train.early_stop_patience -1 != 10
root.misc.seed       42 != 84
root.misc.num_workers 4 != 16
}
Saved config to /dcai/src/experiments/andrew-ng-dcai/3206e423/.trial_0/config.yaml


Saving dcai_train_val_dataset_cv_resnet50d_20220329175851/split_4/_test_labels_split_4
Saving dcai_train_val_dataset_cv_resnet50d_20220329175851/split_4/_test_image_files_split_4
Saving dcai_train_val_dataset_cv_resnet50d_20220329175851/split_4/_test_indices_split_4
----
Running cross-validation for model: efficientnet_b1
----
Running Cross-Validation on Split: 0


Downloading: "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b1-533bc792.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b1-533bc792.pth
Model efficientnet_b1 created, param count:                                         6525994
AMP not enabled. Training in float32.
Disable EMA as it is not supported for now.
Start training from [Epoch 0]
Epoch[0] Batch [49]	Speed: 127.783711 samples/sec	accuracy=0.120000	lr=0.000100
Epoch[0] Batch [99]	Speed: 164.980679 samples/sec	accuracy=0.131250	lr=0.000100
[Epoch 0] training: accuracy=0.137795
[Epoch 0] speed: 148 samples/sec	time cost: 13.557907
[Epoch 0] validation: top1=0.136564 top5=0.488987
[Epoch 0] Current best top-1: 0.136564 vs previous -inf, saved to /dcai/src/experiments/andrew-ng-dcai/3206e423/.trial_0/best_checkpoint.pkl
Epoch[1] Batch [49]	Speed: 167.158296 samples/sec	accuracy=0.168750	lr=0.000250
Epoch[1] Batch [99]	Speed: 165.850320 samples/sec	accuracy=0.180000	lr=0.000250
[

Saving dcai_train_val_dataset_cv_efficientnet_b1_20220329175851/split_0/_test_pred_probs_split_0
Saving dcai_train_val_dataset_cv_efficientnet_b1_20220329175851/split_0/_test_pred_features_split_0


modified configs(<old> != <new>): {
root.img_cls.model   resnet101 != efficientnet_b1
root.train.early_stop_baseline 0.0 != -inf
root.train.batch_size 32 != 16
root.train.epochs    200 != 100
root.train.early_stop_max_value 1.0 != inf
root.train.early_stop_patience -1 != 10
root.misc.seed       42 != 344
root.misc.num_workers 4 != 16
}
Saved config to /dcai/src/experiments/andrew-ng-dcai/9c5cb4f4/.trial_0/config.yaml


Saving dcai_train_val_dataset_cv_efficientnet_b1_20220329175851/split_0/_test_labels_split_0
Saving dcai_train_val_dataset_cv_efficientnet_b1_20220329175851/split_0/_test_image_files_split_0
Saving dcai_train_val_dataset_cv_efficientnet_b1_20220329175851/split_0/_test_indices_split_0
----
Running Cross-Validation on Split: 1


Model efficientnet_b1 created, param count:                                         6525994
AMP not enabled. Training in float32.
Disable EMA as it is not supported for now.
Start training from [Epoch 0]
Epoch[0] Batch [49]	Speed: 125.180862 samples/sec	accuracy=0.111250	lr=0.000100
Epoch[0] Batch [99]	Speed: 171.646558 samples/sec	accuracy=0.117500	lr=0.000100
[Epoch 0] training: accuracy=0.116634
[Epoch 0] speed: 146 samples/sec	time cost: 13.796324
[Epoch 0] validation: top1=0.251101 top5=0.718062
[Epoch 0] Current best top-1: 0.251101 vs previous -inf, saved to /dcai/src/experiments/andrew-ng-dcai/9c5cb4f4/.trial_0/best_checkpoint.pkl
Epoch[1] Batch [49]	Speed: 171.083760 samples/sec	accuracy=0.168750	lr=0.000250
Epoch[1] Batch [99]	Speed: 176.847813 samples/sec	accuracy=0.189375	lr=0.000250
[Epoch 1] training: accuracy=0.204232
[Epoch 1] speed: 165 samples/sec	time cost: 12.172245
[Epoch 1] validation: top1=0.339207 top5=0.792952
[Epoch 1] Current best top-1: 0.339207 vs previous 

Saving dcai_train_val_dataset_cv_efficientnet_b1_20220329175851/split_1/_test_pred_probs_split_1
Saving dcai_train_val_dataset_cv_efficientnet_b1_20220329175851/split_1/_test_pred_features_split_1


modified configs(<old> != <new>): {
root.img_cls.model   resnet101 != efficientnet_b1
root.train.early_stop_baseline 0.0 != -inf
root.train.batch_size 32 != 16
root.train.epochs    200 != 100
root.train.early_stop_max_value 1.0 != inf
root.train.early_stop_patience -1 != 10
root.misc.seed       42 != 661
root.misc.num_workers 4 != 16
}
Saved config to /dcai/src/experiments/andrew-ng-dcai/9b62c3d9/.trial_0/config.yaml


Saving dcai_train_val_dataset_cv_efficientnet_b1_20220329175851/split_1/_test_labels_split_1
Saving dcai_train_val_dataset_cv_efficientnet_b1_20220329175851/split_1/_test_image_files_split_1
Saving dcai_train_val_dataset_cv_efficientnet_b1_20220329175851/split_1/_test_indices_split_1
----
Running Cross-Validation on Split: 2


Model efficientnet_b1 created, param count:                                         6525994
AMP not enabled. Training in float32.
Disable EMA as it is not supported for now.
Start training from [Epoch 0]
Epoch[0] Batch [49]	Speed: 127.206903 samples/sec	accuracy=0.097500	lr=0.000100
Epoch[0] Batch [99]	Speed: 166.865787 samples/sec	accuracy=0.115625	lr=0.000100
[Epoch 0] training: accuracy=0.126969
[Epoch 0] speed: 148 samples/sec	time cost: 13.590369
[Epoch 0] validation: top1=0.149780 top5=0.581498
[Epoch 0] Current best top-1: 0.149780 vs previous -inf, saved to /dcai/src/experiments/andrew-ng-dcai/9b62c3d9/.trial_0/best_checkpoint.pkl
Epoch[1] Batch [49]	Speed: 158.850008 samples/sec	accuracy=0.173750	lr=0.000250
Epoch[1] Batch [99]	Speed: 174.480644 samples/sec	accuracy=0.199375	lr=0.000250
[Epoch 1] training: accuracy=0.201280
[Epoch 1] speed: 165 samples/sec	time cost: 12.194802
[Epoch 1] validation: top1=0.339207 top5=0.713656
[Epoch 1] Current best top-1: 0.339207 vs previous 

Saving dcai_train_val_dataset_cv_efficientnet_b1_20220329175851/split_2/_test_pred_probs_split_2
Saving dcai_train_val_dataset_cv_efficientnet_b1_20220329175851/split_2/_test_pred_features_split_2


modified configs(<old> != <new>): {
root.img_cls.model   resnet101 != efficientnet_b1
root.train.early_stop_baseline 0.0 != -inf
root.train.batch_size 32 != 16
root.train.epochs    200 != 100
root.train.early_stop_max_value 1.0 != inf
root.train.early_stop_patience -1 != 10
root.misc.seed       42 != 508
root.misc.num_workers 4 != 16
}
Saved config to /dcai/src/experiments/andrew-ng-dcai/334ea4e2/.trial_0/config.yaml


Saving dcai_train_val_dataset_cv_efficientnet_b1_20220329175851/split_2/_test_labels_split_2
Saving dcai_train_val_dataset_cv_efficientnet_b1_20220329175851/split_2/_test_image_files_split_2
Saving dcai_train_val_dataset_cv_efficientnet_b1_20220329175851/split_2/_test_indices_split_2
----
Running Cross-Validation on Split: 3


Model efficientnet_b1 created, param count:                                         6525994
AMP not enabled. Training in float32.
Disable EMA as it is not supported for now.
Start training from [Epoch 0]
Epoch[0] Batch [49]	Speed: 126.606108 samples/sec	accuracy=0.108750	lr=0.000100
Epoch[0] Batch [99]	Speed: 175.133027 samples/sec	accuracy=0.130625	lr=0.000100
[Epoch 0] training: accuracy=0.131890
[Epoch 0] speed: 153 samples/sec	time cost: 13.139937
[Epoch 0] validation: top1=0.207048 top5=0.704846
[Epoch 0] Current best top-1: 0.207048 vs previous -inf, saved to /dcai/src/experiments/andrew-ng-dcai/334ea4e2/.trial_0/best_checkpoint.pkl
Epoch[1] Batch [49]	Speed: 167.314246 samples/sec	accuracy=0.220000	lr=0.000250
Epoch[1] Batch [99]	Speed: 176.991833 samples/sec	accuracy=0.216875	lr=0.000250
[Epoch 1] training: accuracy=0.220472
[Epoch 1] speed: 174 samples/sec	time cost: 11.536484
[Epoch 1] validation: top1=0.180617 top5=0.634361
Epoch[2] Batch [49]	Speed: 164.664682 samples/sec	a

Saving dcai_train_val_dataset_cv_efficientnet_b1_20220329175851/split_3/_test_pred_probs_split_3
Saving dcai_train_val_dataset_cv_efficientnet_b1_20220329175851/split_3/_test_pred_features_split_3


modified configs(<old> != <new>): {
root.img_cls.model   resnet101 != efficientnet_b1
root.train.early_stop_baseline 0.0 != -inf
root.train.batch_size 32 != 16
root.train.epochs    200 != 100
root.train.early_stop_max_value 1.0 != inf
root.train.early_stop_patience -1 != 10
root.misc.seed       42 != 709
root.misc.num_workers 4 != 16
}
Saved config to /dcai/src/experiments/andrew-ng-dcai/cab7c6da/.trial_0/config.yaml


Saving dcai_train_val_dataset_cv_efficientnet_b1_20220329175851/split_3/_test_labels_split_3
Saving dcai_train_val_dataset_cv_efficientnet_b1_20220329175851/split_3/_test_image_files_split_3
Saving dcai_train_val_dataset_cv_efficientnet_b1_20220329175851/split_3/_test_indices_split_3
----
Running Cross-Validation on Split: 4


Model efficientnet_b1 created, param count:                                         6525994
AMP not enabled. Training in float32.
Disable EMA as it is not supported for now.
Start training from [Epoch 0]
Epoch[0] Batch [49]	Speed: 118.721823 samples/sec	accuracy=0.116250	lr=0.000100
Epoch[0] Batch [99]	Speed: 160.763044 samples/sec	accuracy=0.125625	lr=0.000100
[Epoch 0] training: accuracy=0.132874
[Epoch 0] speed: 138 samples/sec	time cost: 14.588555
[Epoch 0] validation: top1=0.162996 top5=0.629956
[Epoch 0] Current best top-1: 0.162996 vs previous -inf, saved to /dcai/src/experiments/andrew-ng-dcai/cab7c6da/.trial_0/best_checkpoint.pkl
Epoch[1] Batch [49]	Speed: 160.202733 samples/sec	accuracy=0.195000	lr=0.000250
Epoch[1] Batch [99]	Speed: 166.890851 samples/sec	accuracy=0.202500	lr=0.000250
[Epoch 1] training: accuracy=0.211614
[Epoch 1] speed: 167 samples/sec	time cost: 12.069118
[Epoch 1] validation: top1=0.264317 top5=0.797357
[Epoch 1] Current best top-1: 0.264317 vs previous 

Saving dcai_train_val_dataset_cv_efficientnet_b1_20220329175851/split_4/_test_pred_probs_split_4
Saving dcai_train_val_dataset_cv_efficientnet_b1_20220329175851/split_4/_test_pred_features_split_4


modified configs(<old> != <new>): {
root.img_cls.model   resnet101 != twins_pcpvt_base
root.train.early_stop_baseline 0.0 != -inf
root.train.batch_size 32 != 16
root.train.epochs    200 != 100
root.train.early_stop_max_value 1.0 != inf
root.train.early_stop_patience -1 != 10
root.misc.seed       42 != 535
root.misc.num_workers 4 != 16
}
Saved config to /dcai/src/experiments/andrew-ng-dcai/058257ce/.trial_0/config.yaml


Saving dcai_train_val_dataset_cv_efficientnet_b1_20220329175851/split_4/_test_labels_split_4
Saving dcai_train_val_dataset_cv_efficientnet_b1_20220329175851/split_4/_test_image_files_split_4
Saving dcai_train_val_dataset_cv_efficientnet_b1_20220329175851/split_4/_test_indices_split_4
----
Running cross-validation for model: twins_pcpvt_base
----
Running Cross-Validation on Split: 0


Downloading: "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vt3p-weights/twins_pcpvt_base-e5ecb09b.pth" to /root/.cache/torch/hub/checkpoints/twins_pcpvt_base-e5ecb09b.pth
Model twins_pcpvt_base created, param count:                                         43320586
AMP not enabled. Training in float32.
Disable EMA as it is not supported for now.
Start training from [Epoch 0]
Epoch[0] Batch [49]	Speed: 86.504819 samples/sec	accuracy=0.146250	lr=0.000100
Epoch[0] Batch [99]	Speed: 105.635543 samples/sec	accuracy=0.176875	lr=0.000100
[Epoch 0] training: accuracy=0.196850
[Epoch 0] speed: 97 samples/sec	time cost: 20.601321
[Epoch 0] validation: top1=0.290749 top5=0.766520
[Epoch 0] Current best top-1: 0.290749 vs previous -inf, saved to /dcai/src/experiments/andrew-ng-dcai/058257ce/.trial_0/best_checkpoint.pkl
Epoch[1] Batch [49]	Speed: 106.335790 samples/sec	accuracy=0.300000	lr=0.000250
Epoch[1] Batch [99]	Speed: 110.243744 samples/sec	accuracy=0.327500	lr=0.0

Saving dcai_train_val_dataset_cv_twins_pcpvt_base_20220329175851/split_0/_test_pred_probs_split_0
Saving dcai_train_val_dataset_cv_twins_pcpvt_base_20220329175851/split_0/_test_pred_features_split_0
Saving dcai_train_val_dataset_cv_twins_pcpvt_base_20220329175851/split_0/_test_labels_split_0
Saving dcai_train_val_dataset_cv_twins_pcpvt_base_20220329175851/split_0/_test_image_files_split_0
Saving dcai_train_val_dataset_cv_twins_pcpvt_base_20220329175851/split_0/_test_indices_split_0
----
Running Cross-Validation on Split: 1


Model twins_pcpvt_base created, param count:                                         43320586
AMP not enabled. Training in float32.
Disable EMA as it is not supported for now.
Start training from [Epoch 0]
Epoch[0] Batch [49]	Speed: 86.389826 samples/sec	accuracy=0.185000	lr=0.000100
Epoch[0] Batch [99]	Speed: 102.277166 samples/sec	accuracy=0.231875	lr=0.000100
[Epoch 0] training: accuracy=0.250492
[Epoch 0] speed: 96 samples/sec	time cost: 20.969023
[Epoch 0] validation: top1=0.343612 top5=0.814978
[Epoch 0] Current best top-1: 0.343612 vs previous -inf, saved to /dcai/src/experiments/andrew-ng-dcai/2cb3347d/.trial_0/best_checkpoint.pkl
Epoch[1] Batch [49]	Speed: 102.994349 samples/sec	accuracy=0.337500	lr=0.000250
Epoch[1] Batch [99]	Speed: 102.346793 samples/sec	accuracy=0.373750	lr=0.000250
[Epoch 1] training: accuracy=0.390256
[Epoch 1] speed: 103 samples/sec	time cost: 19.494329
[Epoch 1] validation: top1=0.528634 top5=0.947137
[Epoch 1] Current best top-1: 0.528634 vs previous 

Saving dcai_train_val_dataset_cv_twins_pcpvt_base_20220329175851/split_1/_test_pred_probs_split_1
Saving dcai_train_val_dataset_cv_twins_pcpvt_base_20220329175851/split_1/_test_pred_features_split_1
Saving dcai_train_val_dataset_cv_twins_pcpvt_base_20220329175851/split_1/_test_labels_split_1
Saving dcai_train_val_dataset_cv_twins_pcpvt_base_20220329175851/split_1/_test_image_files_split_1
Saving dcai_train_val_dataset_cv_twins_pcpvt_base_20220329175851/split_1/_test_indices_split_1
----
Running Cross-Validation on Split: 2


Model twins_pcpvt_base created, param count:                                         43320586
AMP not enabled. Training in float32.
Disable EMA as it is not supported for now.
Start training from [Epoch 0]
Epoch[0] Batch [49]	Speed: 84.563478 samples/sec	accuracy=0.140000	lr=0.000100
Epoch[0] Batch [99]	Speed: 99.684231 samples/sec	accuracy=0.174375	lr=0.000100
[Epoch 0] training: accuracy=0.190453
[Epoch 0] speed: 91 samples/sec	time cost: 21.927084
[Epoch 0] validation: top1=0.308370 top5=0.779736
[Epoch 0] Current best top-1: 0.308370 vs previous -inf, saved to /dcai/src/experiments/andrew-ng-dcai/177f4de2/.trial_0/best_checkpoint.pkl
Epoch[1] Batch [49]	Speed: 105.569129 samples/sec	accuracy=0.328750	lr=0.000250
Epoch[1] Batch [99]	Speed: 100.900779 samples/sec	accuracy=0.377500	lr=0.000250
[Epoch 1] training: accuracy=0.392224
[Epoch 1] speed: 103 samples/sec	time cost: 19.472081
[Epoch 1] validation: top1=0.519824 top5=0.933921
[Epoch 1] Current best top-1: 0.519824 vs previous 0

Saving dcai_train_val_dataset_cv_twins_pcpvt_base_20220329175851/split_2/_test_pred_probs_split_2
Saving dcai_train_val_dataset_cv_twins_pcpvt_base_20220329175851/split_2/_test_pred_features_split_2
Saving dcai_train_val_dataset_cv_twins_pcpvt_base_20220329175851/split_2/_test_labels_split_2
Saving dcai_train_val_dataset_cv_twins_pcpvt_base_20220329175851/split_2/_test_image_files_split_2
Saving dcai_train_val_dataset_cv_twins_pcpvt_base_20220329175851/split_2/_test_indices_split_2
----
Running Cross-Validation on Split: 3


Model twins_pcpvt_base created, param count:                                         43320586
AMP not enabled. Training in float32.
Disable EMA as it is not supported for now.
Start training from [Epoch 0]
Epoch[0] Batch [49]	Speed: 82.829366 samples/sec	accuracy=0.206250	lr=0.000100
Epoch[0] Batch [99]	Speed: 99.752086 samples/sec	accuracy=0.233125	lr=0.000100
[Epoch 0] training: accuracy=0.243602
[Epoch 0] speed: 93 samples/sec	time cost: 21.565423
[Epoch 0] validation: top1=0.348018 top5=0.770925
[Epoch 0] Current best top-1: 0.348018 vs previous -inf, saved to /dcai/src/experiments/andrew-ng-dcai/a081ea63/.trial_0/best_checkpoint.pkl
Epoch[1] Batch [49]	Speed: 99.907969 samples/sec	accuracy=0.338750	lr=0.000250
Epoch[1] Batch [99]	Speed: 102.330881 samples/sec	accuracy=0.369375	lr=0.000250
[Epoch 1] training: accuracy=0.384843
[Epoch 1] speed: 101 samples/sec	time cost: 19.834410
[Epoch 1] validation: top1=0.537445 top5=0.938326
[Epoch 1] Current best top-1: 0.537445 vs previous 0.

Saving dcai_train_val_dataset_cv_twins_pcpvt_base_20220329175851/split_3/_test_pred_probs_split_3
Saving dcai_train_val_dataset_cv_twins_pcpvt_base_20220329175851/split_3/_test_pred_features_split_3
Saving dcai_train_val_dataset_cv_twins_pcpvt_base_20220329175851/split_3/_test_labels_split_3
Saving dcai_train_val_dataset_cv_twins_pcpvt_base_20220329175851/split_3/_test_image_files_split_3
Saving dcai_train_val_dataset_cv_twins_pcpvt_base_20220329175851/split_3/_test_indices_split_3
----
Running Cross-Validation on Split: 4


Model twins_pcpvt_base created, param count:                                         43320586
AMP not enabled. Training in float32.
Disable EMA as it is not supported for now.
Start training from [Epoch 0]
Epoch[0] Batch [49]	Speed: 82.647447 samples/sec	accuracy=0.150000	lr=0.000100
Epoch[0] Batch [99]	Speed: 102.899086 samples/sec	accuracy=0.181250	lr=0.000100
[Epoch 0] training: accuracy=0.195374
[Epoch 0] speed: 94 samples/sec	time cost: 21.368766
[Epoch 0] validation: top1=0.303965 top5=0.814978
[Epoch 0] Current best top-1: 0.303965 vs previous -inf, saved to /dcai/src/experiments/andrew-ng-dcai/d99f76b1/.trial_0/best_checkpoint.pkl
Epoch[1] Batch [49]	Speed: 103.173039 samples/sec	accuracy=0.315000	lr=0.000250
Epoch[1] Batch [99]	Speed: 104.984172 samples/sec	accuracy=0.348125	lr=0.000250
[Epoch 1] training: accuracy=0.368110
[Epoch 1] speed: 104 samples/sec	time cost: 19.239305
[Epoch 1] validation: top1=0.533040 top5=0.955947
[Epoch 1] Current best top-1: 0.533040 vs previous 

Saving dcai_train_val_dataset_cv_twins_pcpvt_base_20220329175851/split_4/_test_pred_probs_split_4
Saving dcai_train_val_dataset_cv_twins_pcpvt_base_20220329175851/split_4/_test_pred_features_split_4
Saving dcai_train_val_dataset_cv_twins_pcpvt_base_20220329175851/split_4/_test_labels_split_4
Saving dcai_train_val_dataset_cv_twins_pcpvt_base_20220329175851/split_4/_test_image_files_split_4
Saving dcai_train_val_dataset_cv_twins_pcpvt_base_20220329175851/split_4/_test_indices_split_4
----
Running cross-validation for model: swin_base_patch4_window7_224
----
Running Cross-Validation on Split: 0


Downloading: "https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window7_224_22kto1k.pth" to /root/.cache/torch/hub/checkpoints/swin_base_patch4_window7_224_22kto1k.pth
Model swin_base_patch4_window7_224 created, param count:                                         86753474
AMP not enabled. Training in float32.
Disable EMA as it is not supported for now.
Start training from [Epoch 0]
Epoch[0] Batch [49]	Speed: 83.060751 samples/sec	accuracy=0.156250	lr=0.000100
Epoch[0] Batch [99]	Speed: 101.763639 samples/sec	accuracy=0.168125	lr=0.000100
[Epoch 0] training: accuracy=0.173228
[Epoch 0] speed: 92 samples/sec	time cost: 21.716587
[Epoch 0] validation: top1=0.229075 top5=0.757709
[Epoch 0] Current best top-1: 0.229075 vs previous -inf, saved to /dcai/src/experiments/andrew-ng-dcai/b7213540/.trial_0/best_checkpoint.pkl
Epoch[1] Batch [49]	Speed: 98.061397 samples/sec	accuracy=0.296250	lr=0.000250
Epoch[1] Batch [99]	Speed: 102.052670 samples/sec	accuracy=

Saving dcai_train_val_dataset_cv_swin_base_patch4_window7_224_20220329175851/split_0/_test_pred_probs_split_0
Saving dcai_train_val_dataset_cv_swin_base_patch4_window7_224_20220329175851/split_0/_test_pred_features_split_0
Saving dcai_train_val_dataset_cv_swin_base_patch4_window7_224_20220329175851/split_0/_test_labels_split_0
Saving dcai_train_val_dataset_cv_swin_base_patch4_window7_224_20220329175851/split_0/_test_image_files_split_0
Saving dcai_train_val_dataset_cv_swin_base_patch4_window7_224_20220329175851/split_0/_test_indices_split_0
----
Running Cross-Validation on Split: 1


Model swin_base_patch4_window7_224 created, param count:                                         86753474
AMP not enabled. Training in float32.
Disable EMA as it is not supported for now.
Start training from [Epoch 0]
Epoch[0] Batch [49]	Speed: 80.828003 samples/sec	accuracy=0.123750	lr=0.000100
Epoch[0] Batch [99]	Speed: 99.656174 samples/sec	accuracy=0.164375	lr=0.000100
[Epoch 0] training: accuracy=0.190453
[Epoch 0] speed: 90 samples/sec	time cost: 22.243049
[Epoch 0] validation: top1=0.348018 top5=0.784141
[Epoch 0] Current best top-1: 0.348018 vs previous -inf, saved to /dcai/src/experiments/andrew-ng-dcai/3d3f1508/.trial_0/best_checkpoint.pkl
Epoch[1] Batch [49]	Speed: 97.192242 samples/sec	accuracy=0.343750	lr=0.000250
Epoch[1] Batch [99]	Speed: 99.555223 samples/sec	accuracy=0.380625	lr=0.000250
[Epoch 1] training: accuracy=0.396161
[Epoch 1] speed: 88 samples/sec	time cost: 22.836701
[Epoch 1] validation: top1=0.581498 top5=0.929515
[Epoch 1] Current best top-1: 0.581498 vs p

Saving dcai_train_val_dataset_cv_swin_base_patch4_window7_224_20220329175851/split_1/_test_pred_probs_split_1
Saving dcai_train_val_dataset_cv_swin_base_patch4_window7_224_20220329175851/split_1/_test_pred_features_split_1
Saving dcai_train_val_dataset_cv_swin_base_patch4_window7_224_20220329175851/split_1/_test_labels_split_1
Saving dcai_train_val_dataset_cv_swin_base_patch4_window7_224_20220329175851/split_1/_test_image_files_split_1
Saving dcai_train_val_dataset_cv_swin_base_patch4_window7_224_20220329175851/split_1/_test_indices_split_1
----
Running Cross-Validation on Split: 2


root.train.early_stop_max_value 1.0 != inf
root.train.early_stop_patience -1 != 10
root.misc.seed       42 != 661
root.misc.num_workers 4 != 16
}
Saved config to /dcai/src/experiments/andrew-ng-dcai/0a3415c7/.trial_0/config.yaml
Model swin_base_patch4_window7_224 created, param count:                                         86753474
AMP not enabled. Training in float32.
Disable EMA as it is not supported for now.
Start training from [Epoch 0]
Epoch[0] Batch [49]	Speed: 82.037189 samples/sec	accuracy=0.133750	lr=0.000100
Epoch[0] Batch [99]	Speed: 97.839408 samples/sec	accuracy=0.175000	lr=0.000100
[Epoch 0] training: accuracy=0.192421
[Epoch 0] speed: 90 samples/sec	time cost: 22.287003
[Epoch 0] validation: top1=0.383260 top5=0.876652
[Epoch 0] Current best top-1: 0.383260 vs previous -inf, saved to /dcai/src/experiments/andrew-ng-dcai/0a3415c7/.trial_0/best_checkpoint.pkl
Epoch[1] Batch [49]	Speed: 98.700996 samples/sec	accuracy=0.332500	lr=0.000250
Epoch[1] Batch [99]	Speed: 101.190

Saving dcai_train_val_dataset_cv_swin_base_patch4_window7_224_20220329175851/split_2/_test_pred_probs_split_2
Saving dcai_train_val_dataset_cv_swin_base_patch4_window7_224_20220329175851/split_2/_test_pred_features_split_2
Saving dcai_train_val_dataset_cv_swin_base_patch4_window7_224_20220329175851/split_2/_test_labels_split_2
Saving dcai_train_val_dataset_cv_swin_base_patch4_window7_224_20220329175851/split_2/_test_image_files_split_2
Saving dcai_train_val_dataset_cv_swin_base_patch4_window7_224_20220329175851/split_2/_test_indices_split_2
----
Running Cross-Validation on Split: 3


Model swin_base_patch4_window7_224 created, param count:                                         86753474
AMP not enabled. Training in float32.
Disable EMA as it is not supported for now.
Start training from [Epoch 0]
Epoch[0] Batch [49]	Speed: 80.955239 samples/sec	accuracy=0.115000	lr=0.000100
Epoch[0] Batch [99]	Speed: 99.818224 samples/sec	accuracy=0.150000	lr=0.000100
[Epoch 0] training: accuracy=0.173720
[Epoch 0] speed: 90 samples/sec	time cost: 22.167973
[Epoch 0] validation: top1=0.317181 top5=0.757709
[Epoch 0] Current best top-1: 0.317181 vs previous -inf, saved to /dcai/src/experiments/andrew-ng-dcai/8d755c56/.trial_0/best_checkpoint.pkl
Epoch[1] Batch [49]	Speed: 98.761910 samples/sec	accuracy=0.275000	lr=0.000250
Epoch[1] Batch [99]	Speed: 100.769779 samples/sec	accuracy=0.329375	lr=0.000250
[Epoch 1] training: accuracy=0.353346
[Epoch 1] speed: 99 samples/sec	time cost: 20.296773
[Epoch 1] validation: top1=0.546256 top5=0.929515
[Epoch 1] Current best top-1: 0.546256 vs 

Saving dcai_train_val_dataset_cv_swin_base_patch4_window7_224_20220329175851/split_3/_test_pred_probs_split_3
Saving dcai_train_val_dataset_cv_swin_base_patch4_window7_224_20220329175851/split_3/_test_pred_features_split_3
Saving dcai_train_val_dataset_cv_swin_base_patch4_window7_224_20220329175851/split_3/_test_labels_split_3
Saving dcai_train_val_dataset_cv_swin_base_patch4_window7_224_20220329175851/split_3/_test_image_files_split_3
Saving dcai_train_val_dataset_cv_swin_base_patch4_window7_224_20220329175851/split_3/_test_indices_split_3
----
Running Cross-Validation on Split: 4


Model swin_base_patch4_window7_224 created, param count:                                         86753474
AMP not enabled. Training in float32.
Disable EMA as it is not supported for now.
Start training from [Epoch 0]
Epoch[0] Batch [49]	Speed: 82.739762 samples/sec	accuracy=0.150000	lr=0.000100
Epoch[0] Batch [99]	Speed: 99.684895 samples/sec	accuracy=0.171875	lr=0.000100
[Epoch 0] training: accuracy=0.189961
[Epoch 0] speed: 91 samples/sec	time cost: 21.959800
[Epoch 0] validation: top1=0.339207 top5=0.748899
[Epoch 0] Current best top-1: 0.339207 vs previous -inf, saved to /dcai/src/experiments/andrew-ng-dcai/1eb70707/.trial_0/best_checkpoint.pkl
Epoch[1] Batch [49]	Speed: 98.747726 samples/sec	accuracy=0.293750	lr=0.000250
Epoch[1] Batch [99]	Speed: 101.095218 samples/sec	accuracy=0.332500	lr=0.000250
[Epoch 1] training: accuracy=0.356299
[Epoch 1] speed: 99 samples/sec	time cost: 20.285553
[Epoch 1] validation: top1=0.511013 top5=0.938326
[Epoch 1] Current best top-1: 0.511013 vs 

Saving dcai_train_val_dataset_cv_swin_base_patch4_window7_224_20220329175851/split_4/_test_pred_probs_split_4
Saving dcai_train_val_dataset_cv_swin_base_patch4_window7_224_20220329175851/split_4/_test_pred_features_split_4
Saving dcai_train_val_dataset_cv_swin_base_patch4_window7_224_20220329175851/split_4/_test_labels_split_4
Saving dcai_train_val_dataset_cv_swin_base_patch4_window7_224_20220329175851/split_4/_test_image_files_split_4
Saving dcai_train_val_dataset_cv_swin_base_patch4_window7_224_20220329175851/split_4/_test_indices_split_4
CPU times: user 3h 45min 27s, sys: 17min 53s, total: 4h 3min 20s
Wall time: 4h 27min 38s


## Display GPU used for training

In [20]:
# Shell escape: print the nvidia-smi status table (driver/CUDA version, GPU models,
# memory usage, utilization) for the machine this notebook runs on.
# NOTE(review): this is a snapshot taken when this cell executes, so utilization and
# memory figures presumably reflect the idle state after training, not the load
# during the cross-validation run -- confirm before quoting them.
!nvidia-smi

Tue Mar 29 22:33:38 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.86       Driver Version: 470.86       CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:05:00.0 Off |                  N/A |
| 41%   35C    P8     7W / 260W |    115MiB / 11011MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA RTX A6000    Off  | 00000000:09:00.0 Off |                  Off |
| 30%   43C    P8    21W / 300W |   7669MiB / 48685MiB |      0%      Default |
|       