In [None]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Put the parent directory at the front of the import search path.
# NOTE(review): presumably this is where the local `cross_validation_autogluon`
# module imported below lives -- confirm against the repository layout.
import sys
sys.path.insert(0, "../")

from autogluon.vision import ImagePredictor, ImageDataset
import numpy as np
import pandas as pd
import pickle
import datetime
from pathlib import Path
from sklearn.model_selection import StratifiedKFold
from cross_validation_autogluon import cross_val_predict_autogluon_image_dataset

# Lift pandas' display limits (a value of None means "no limit") so that
# displayed frames are not truncated in rows, columns, or cell text width.
for display_option in (
    "display.max_rows",
    "display.max_columns",
    "display.max_colwidth",
):
    pd.set_option(display_option, None)

In [None]:
def process_autogluon_image_dataset(dataset, classes):
    """Re-label an image dataset from an explicit class-name -> index mapping.

    The class name of each row is the name of the folder that directly
    contains the image file, i.e. the second-to-last component of the path
    stored in the ``image`` column. That name is written to ``label_name``
    and its index, looked up in ``classes``, is written to ``label``
    (replacing any existing ``label`` column).

    Parameters
    ----------
    dataset
        DataFrame-like object with an ``image`` column of file paths.
        It is modified IN PLACE.
    classes
        Mapping from class (folder) name to label index.

    Returns
    -------
    The same ``dataset`` object that was passed in, with the ``label_name``
    and ``label`` columns set.

    Raises
    ------
    KeyError
        If an image's folder name is not a key of ``classes``. Both columns
        are computed before either is written, so ``dataset`` is left
        untouched when this happens.
    """
    label_names = dataset["image"].map(
        lambda image_path: Path(image_path).parts[-2]
    )

    try:
        labels = label_names.map(lambda label_name: classes[label_name])
    except KeyError as err:
        # Re-raise with context; a bare KeyError from inside ``map`` does not
        # say that the offending key came from an image's folder name.
        raise KeyError(
            f"image folder name {err} is not a key of `classes`; "
            "dataset left unchanged"
        ) from err

    # Write only after both columns were computed successfully, so a failed
    # lookup never leaves the frame half-updated.
    dataset["label_name"] = label_names
    dataset["label"] = labels
    return dataset

In [None]:
# Root folder of the original, de-duplicated data.
DATA_PATH = "/Data/andrew-ng-dcai-comp-2021-data-deduped/andrew-ng-dcai-comp-2021-data/"

# Read the train / val / test datasets from the root folder.
train_dataset, val_dataset, test_dataset = ImageDataset.from_folders(root=DATA_PATH)

# NOTE: the label indices are re-assigned below. By default AutoGluon assigns
# label indices in alphabetical order of the class names, so each class name
# is mapped to its position in this explicit ordering instead.
classes = {
    class_name: position
    for position, class_name in enumerate(
        ["i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x"]
    )
}

# Apply the new label indices to every split (train, then val, then test).
train_dataset, val_dataset, test_dataset = (
    process_autogluon_image_dataset(split, classes)
    for split in (train_dataset, val_dataset, test_dataset)
)

# Cross-validation runs on train and val combined.
train_val_dataset = pd.concat(objs=[train_dataset, val_dataset])

In [None]:
# Preview the first three rows of the combined train + val dataset.
train_val_dataset.head(n=3)

## Train the feature-extractor model

- The features it extracts will be used for KNN-based out-of-distribution (OOD) scoring.

In [None]:
%%time

# Models to cross-validate. The cross-validated predicted probabilities
# produced for each of them can then be used by ensemble scoring methods.
models = ["swin_base_patch4_window7_224"]

epochs = 100
holdout_frac = 0.2
n_splits = 5

# Timestamp taken at the start of the cross-validation run.
ts = datetime.datetime.now().strftime("%Y%m%d%H%M%S")

# One full cross-validation per model.
for model in models:

    print("----")
    print(f"Running cross-validation for model: {model}")

    MODEL_PARAMS = dict(model=model, epochs=epochs, holdout_frac=holdout_frac)

    # The return value is discarded: the cross-validation results are saved
    # to pickle files, one per model/fold, inside `out_folder`.
    _ = cross_val_predict_autogluon_image_dataset(
        dataset=train_val_dataset,  # trained on the NOISY labels
        out_folder=f"./dcai_train_val_dataset_cv_{model}/",
        n_splits=n_splits,
        model_params=MODEL_PARAMS,
    )