In [1]:
from datasets import load_dataset, load_from_disk, Dataset, Features, Array3D,Array2D

# Load the local image folder through the `imagefolder` builder, requesting
# only the 'train' split so a single dataset object is returned.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
dataset = load_dataset(
    "imagefolder",
    data_dir='C:\\research\\data\\fashion_images_structured',
    split='train',
)

Resolving data files:   0%|          | 0/49173 [00:00<?, ?it/s]

Using custom data configuration default-5684e19ace993ce9
Found cached dataset imagefolder (C:/Users/Владимир/.cache/huggingface/datasets/imagefolder/default-5684e19ace993ce9/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f)


In [2]:
from typing import Optional, Tuple
def split_dataset(
    dataset: Dataset,
    val_size: float=0.2,
    test_size: float=0.1,
    seed: Optional[int]=None
) -> Tuple[Dataset, Dataset, Dataset]:
    """
    Returns a tuple with three random train, validation and test subsets by splitting the passed dataset.
    Size of the validation and test sets defined as a fraction of 1 with the `val_size` and `test_size` arguments.

    Parameters
    ----------
    dataset
        The dataset to split; it must provide `train_test_split`.
    val_size
        Fraction of the whole dataset assigned to the validation subset.
    test_size
        Fraction of the whole dataset assigned to the test subset.
    seed
        Optional seed forwarded to both `train_test_split` calls. Pass an
        integer to get the same partition on every run; the default `None`
        keeps the previous unseeded behaviour.

    Returns
    -------
    (train_dataset, val_dataset, test_dataset)

    Raises
    ------
    ValueError
        If either fraction is not positive, or the fractions (after rounding
        to three decimals) do not leave a non-empty share for every subset.
    """
    # Validate up front so a bad configuration fails with a clear message
    # rather than a division by zero or an error raised deep inside the split.
    if val_size <= 0 or test_size <= 0:
        raise ValueError(
            f"val_size and test_size must be positive fractions, got {val_size} and {test_size}"
        )

    # Rounding hides float noise such as 0.2 + 0.1 == 0.30000000000000004.
    holdout_fraction = round(val_size + test_size, 3)
    if not 0 < holdout_fraction < 1:
        raise ValueError(
            f"val_size + test_size must be strictly between 0 and 1, got {holdout_fraction}"
        )

    # Share of the held-out part (val + test) that becomes the test subset.
    test_share = round(test_size / (test_size + val_size), 3)
    if not 0 < test_share < 1:
        raise ValueError(
            f"test_size / (val_size + test_size) must be strictly between 0 and 1, got {test_share}"
        )

    print("Splitting dataset into train, validation and test sets...")

    # Split dataset into train and (val + test) sets
    train_holdout = dataset.train_test_split(shuffle=True, test_size=holdout_fraction, seed=seed)

    # Split (val + test) into val and test sets
    val_test_sets = train_holdout['test'].train_test_split(shuffle=True, test_size=test_share, seed=seed)

    return train_holdout["train"], val_test_sets["train"], val_test_sets["test"]

In [3]:
# Run configuration: held-out fractions of the full dataset, and the
# checkpoint name used to load the image processor.
val_size, test_size = 0.2, 0.1
model_name = "google/vit-base-patch16-224-in21k"

In [4]:
# Partition the loaded dataset using the fractions from the config cell.
train_dataset, val_dataset, test_dataset = split_dataset(
    dataset,
    val_size=val_size,
    test_size=test_size,
)

Splitting dataset into train, validation and test sets...


In [7]:
from transformers import AutoProcessor, ViTFeatureExtractor, ViTForImageClassification, Trainer, TrainingArguments, default_data_collator


def process_examples(examples, image_processor):
    """Processor helper function. Used to process batches of images using the
    passed image_processor.

    Images whose `mode` is not "RGB" are converted to RGB first, so that every
    image handed to the processor has three channels.

    Parameters
    ----------
    examples
        A batch of image examples; the images are read from its 'image' key.

    image_processor
        A HuggingFace image processor for the selected model. It is called
        as `image_processor(images=...)` and its result must expose a
        'pixel_values' entry.

    Returns
    -------
    examples
        The same batch with an added 'pixel_values' entry. The 'image'
        entry is left untouched.
    """
    # Normalise the colour mode: objects without a `mode` attribute are
    # passed through unchanged, everything else that is not RGB is converted.
    images = [
        image if getattr(image, "mode", "RGB") == "RGB" else image.convert("RGB")
        for image in examples['image']
    ]

    # Preprocess
    inputs = image_processor(images=images)
    # Add pixel_values
    examples['pixel_values'] = inputs['pixel_values']

    return examples


def apply_processing(
    model_name: str,
    train_dataset: Dataset,
    val_dataset: Dataset,
    test_dataset: Dataset,
    batch_size: int = 2500
) -> Tuple[Dataset, Dataset, Dataset]:
    """
    Apply model's image AutoProcessor to transform train, validation and test subsets.
    Returns train, validation and test datasets with `pixel_values` in torch tensor type.

    Parameters
    ----------
    model_name
        Checkpoint name passed to `AutoProcessor.from_pretrained`.
    train_dataset, val_dataset, test_dataset
        The three subsets to process. Each must have 'image' and 'label' columns.
    batch_size
        Number of examples handed to `process_examples` per `map` batch.
        Defaults to 2500, the value previously hard-coded. At 3x224x224
        float32 per example that is roughly 1.5 GB of pixel data per batch,
        so lower it on memory-constrained machines.

    Returns
    -------
    (train_dataset, val_dataset, test_dataset)
        Processed subsets in 'torch' format restricted to the 'pixel_values'
        and 'label' columns, with the 'image' column removed.
    """

    # Extend the features. The schema of the train subset is reused for all
    # three subsets.
    # NOTE(review): the (3, 224, 224) shape assumes the processor emits
    # 3-channel 224x224 arrays — confirm when changing `model_name`.
    features = Features({
        **train_dataset.features,
        'pixel_values': Array3D(dtype="float32", shape=(3, 224, 224)),
    })

    # Instantiate image_processor once and share it across the subsets
    image_processor = AutoProcessor.from_pretrained(model_name)

    def _process_split(split: Dataset) -> Dataset:
        """Run the processor over one subset and prepare it for training."""
        # Preprocess images
        processed = split.map(
            process_examples,
            batched=True,
            features=features,
            fn_kwargs={"image_processor": image_processor},
            batch_size=batch_size,
        )
        # Set to torch format for training
        processed.set_format('torch', columns=['pixel_values', 'label'])
        # Remove unused column
        return processed.remove_columns("image")

    return (
        _process_split(train_dataset),
        _process_split(val_dataset),
        _process_split(test_dataset),
    )

In [11]:
# Stand-alone processor instance for the configured checkpoint, kept only so
# its settings can be inspected in the next cell.
imp = AutoProcessor.from_pretrained(
    model_name,
)

In [12]:
# Show the processor's instance attributes (resize / rescale / normalise settings).
vars(imp)

{'_processor_class': None,
 'do_resize': True,
 'do_rescale': True,
 'do_normalize': True,
 'size': {'height': 224, 'width': 224},
 'resample': <Resampling.BILINEAR: 2>,
 'rescale_factor': 0.00392156862745098,
 'image_mean': [0.5, 0.5, 0.5],
 'image_std': [0.5, 0.5, 0.5]}

In [8]:
# Run the checkpoint's processor over all three subsets, replacing each
# subset with its processed counterpart.
train_dataset, val_dataset, test_dataset = apply_processing(
    model_name=model_name,
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    test_dataset=test_dataset,
)

  0%|          | 0/14 [00:00<?, ?ba/s]

KeyboardInterrupt: 

In [11]:
# Output directories for the three subsets.
# NOTE(review): the directory name says 32x32 while the processing cell
# declares 3x224x224 pixel_values — confirm the name is still accurate.
train_save_path, val_save_path, test_save_path = (
    './data/processed_dataset_32x32/train/',
    './data/processed_dataset_32x32/val/',
    './data/processed_dataset_32x32/test/',
)

In [12]:
# Persist each subset to its own directory, in train / val / test order.
for subset, target_dir in (
    (train_dataset, train_save_path),
    (val_dataset, val_save_path),
    (test_dataset, test_save_path),
):
    subset.save_to_disk(target_dir)

Saving the dataset (0/42 shards):   0%|          | 0/34421 [00:00<?, ? examples/s]

Saving the dataset (0/12 shards):   0%|          | 0/9839 [00:00<?, ? examples/s]

Saving the dataset (0/6 shards):   0%|          | 0/4913 [00:00<?, ? examples/s]