In [1]:
# %load_ext autoreload
# %autoreload 2

import sys

sys.path.insert(0, "../")

from autogluon.vision import ImagePredictor, ImageDataset
import numpy as np
import pandas as pd
import pickle
import datetime
from pathlib import Path
import cleanlab
from cross_validation_autogluon import cross_val_predict_autogluon_image_dataset, train_predict_autogluon

# Lift every pandas display limit (rows, columns, cell width) so frames are shown untruncated.
for _display_option in ('display.max_rows', 'display.max_columns', 'display.max_colwidth'):
    pd.set_option(_display_option, None)

## Read original data

In [2]:
# Root folder of the CIFAR-10 PNG images.
CIFAR_10_DATA_PATH = "/datasets/uly/ood-data/cifar10_png/"

# from_folders yields three values here; the first and last are kept as the
# train and test datasets, the middle one is discarded.
# NOTE(review): the middle value is presumably the validation split — confirm.
train_dataset, _, test_dataset = ImageDataset.from_folders(root=CIFAR_10_DATA_PATH)

## Read data with noisy labels

In case the cross-validation procedure fails for a model, we want to be able to rerun it with the same noisy dataset, so the noisy labels are loaded from a previously saved CSV file.

In [3]:
# These two values are interpolated into the CSV file name below (and into
# output folder names elsewhere in the notebook).
# NOTE(review): presumably the settings used when the noisy labels were generated — confirm.
noise_amount = 0.2
frac_zero_noise_rates = 0.4

# Alternative file with a different timestamp suffix, kept for reference:
# DATA_NOISY_LABELS_OUT_FILE = f"cifar10_train_dataset_noise_amount_{noise_amount}_sparsity_{frac_zero_noise_rates}_20220326055753.csv"
DATA_NOISY_LABELS_OUT_FILE = f"cifar10_train_dataset_noise_amount_{noise_amount}_sparsity_{frac_zero_noise_rates}_20220926183148.csv"

# Load the saved noisy-label table from the CSV file.
train_dataset_noisy_labels_loaded = pd.read_csv(filepath_or_buffer=DATA_NOISY_LABELS_OUT_FILE)

# Disabled: rewrite the 'image' column by prefixing CIFAR_10_DATA_PATH to each
# path with its first 18 characters stripped.
# train_dataset_noisy_labels_loaded['image'] = train_dataset_noisy_labels_loaded.apply(
#     lambda x: CIFAR_10_DATA_PATH + x['image'][18:], axis=1)

In [4]:
# Preview the first five rows of the noisy-label table.
train_dataset_noisy_labels_loaded.head(n=5)

Unnamed: 0,image,label
0,/datasets/uly/ood-data/cifar10_png/train/airplane/data_batch_1_index_0029.png,0
1,/datasets/uly/ood-data/cifar10_png/train/airplane/data_batch_1_index_0030.png,5
2,/datasets/uly/ood-data/cifar10_png/train/airplane/data_batch_1_index_0035.png,9
3,/datasets/uly/ood-data/cifar10_png/train/airplane/data_batch_1_index_0049.png,7
4,/datasets/uly/ood-data/cifar10_png/train/airplane/data_batch_1_index_0077.png,8


In [5]:
# Dimensions of the loaded noisy-label table as (n_rows, n_columns).
train_dataset_noisy_labels_loaded.shape

(50000, 2)

In [6]:
# Number of images per (noisy) label value, returned as a flat two-column frame.
(
    train_dataset_noisy_labels_loaded
    .groupby("label")["image"]
    .count()
    .reset_index()
)

Unnamed: 0,label,image
0,0,3690
1,1,5282
2,2,3693
3,3,5883
4,4,5962
5,5,4411
6,6,5275
7,7,5790
8,8,6030
9,9,3984


## Generate In-Sample Predicted Probabilities

In [None]:
%%time

models = [
    "resnet18",
    "resnet50d",
    "efficientnet_b1",
    "twins_pcpvt_base",
    "swin_base_patch4_window7_224"
]

epochs = 100
holdout_frac = 0.2
n_splits = 5

# run cross-validation for each model
for model in models:
    
    print("----")
    print(f"Running cross-validation for model: {model}")

    MODEL_PARAMS = {
        "model": model,
        "epochs": epochs,
        "holdout_frac": holdout_frac
    }
   
    _ = \
    train_predict_autogluon(
        dataset=train_dataset_noisy_labels_loaded, # train with NOISY LABELS,
        classes=train_dataset.classes,
        out_folder=f"./cifar10_train_dataset_noise_amount_{noise_amount}_sparsity_{frac_zero_noise_rates}_cv_{model}/", # save results of cross-validation in pickle files for each fold
        n_splits=n_splits,
        model_params=MODEL_PARAMS
    )

modified configs(<old> != <new>): {
root.train.early_stop_patience -1 != 10
root.train.early_stop_max_value 1.0 != inf
root.train.epochs    200 != 100
root.train.batch_size 32 != 16
root.train.early_stop_baseline 0.0 != -inf
root.img_cls.model   resnet101 != twins_pcpvt_base
root.misc.num_workers 4 != 64
root.misc.seed       42 != 146
}
Saved config to /datasets/uly/label-error-detection-benchmarks/src/experiments/cifar-10/b9e13592/.trial_0/config.yaml


----
Running cross-validation for model: twins_pcpvt_base


Model twins_pcpvt_base created, param count:                                         43320586
AMP not enabled. Training in float32.
Disable EMA as it is not supported for now.
Start training from [Epoch 0]
Epoch[0] Batch [49]	Speed: 55.107458 samples/sec	accuracy=0.168750	lr=0.000100
Epoch[0] Batch [99]	Speed: 83.373585 samples/sec	accuracy=0.224375	lr=0.000100
Epoch[0] Batch [149]	Speed: 88.382134 samples/sec	accuracy=0.266667	lr=0.000100
Epoch[0] Batch [199]	Speed: 82.885791 samples/sec	accuracy=0.296250	lr=0.000100
Epoch[0] Batch [249]	Speed: 86.489971 samples/sec	accuracy=0.326500	lr=0.000100
Epoch[0] Batch [299]	Speed: 87.162945 samples/sec	accuracy=0.356875	lr=0.000100
`time_limit=59.999918937683105` reached, exit early...


Saving to numpy files in this folder: ./noxval_delete_cifar10_train_dataset_noise_amount_0.2_sparsity_0.4_cv_twins_pcpvt_base/


modified configs(<old> != <new>): {
root.train.early_stop_patience -1 != 10
root.train.early_stop_max_value 1.0 != inf
root.train.epochs    200 != 100
root.train.batch_size 32 != 16
root.train.early_stop_baseline 0.0 != -inf
root.img_cls.model   resnet101 != swin_base_patch4_window7_224
root.misc.num_workers 4 != 64
root.misc.seed       42 != 443
}
Saved config to /datasets/uly/label-error-detection-benchmarks/src/experiments/cifar-10/e3f4a1db/.trial_0/config.yaml


----
Running cross-validation for model: swin_base_patch4_window7_224


Model swin_base_patch4_window7_224 created, param count:                                         86753474
AMP not enabled. Training in float32.
Disable EMA as it is not supported for now.
Start training from [Epoch 0]
Epoch[0] Batch [49]	Speed: 43.282026 samples/sec	accuracy=0.128750	lr=0.000100
Epoch[0] Batch [99]	Speed: 63.044492 samples/sec	accuracy=0.175000	lr=0.000100
Epoch[0] Batch [149]	Speed: 60.689322 samples/sec	accuracy=0.220417	lr=0.000100
Epoch[0] Batch [199]	Speed: 60.251935 samples/sec	accuracy=0.273750	lr=0.000100
`time_limit=59.99990963935852` reached, exit early...


In [None]:
# Distinct values found in the 'label' column of the noisy-label table.
train_dataset_noisy_labels_loaded.loc[:, "label"].unique()

## Display GPU used for training

In [None]:
# Run the nvidia-smi shell command so its GPU report is captured in the cell output.
!nvidia-smi