In [1]:
from datasets import load_dataset, DatasetDict
from loguru import logger
from pathlib import Path

dataset = load_dataset("frgfm/imagenette", "full_size")

active_learning_dataset = DatasetDict()


In [2]:
dataset['validation'].features['label'].names

['tench',
 'English springer',
 'cassette player',
 'chain saw',
 'church',
 'French horn',
 'garbage truck',
 'gas pump',
 'golf ball',
 'parachute']

## Save image to disk and add filepath to dataframe

In [3]:
# Constants
SAVE_DIR = Path("data/imagenette")
NUM_PROC = 8  # Adjust based on your CPU
CLASS_NAMES = dataset['validation'].features['label'].names

def save_image(example, idx):
    """Save a dataset image to disk with error handling and add label name.
    
    Args:
        example (dict): Dataset example containing 'image' and 'label'
        idx (int): Index of the example
        
    Returns:
        dict: Dictionary containing the saved filepath and label_name
    """
    try:
        image = example['image']
        label_id = example.get('label')  # Get label if it exists (won't exist for unlabeled)
        
        # Get label name if label exists
        label_name = CLASS_NAMES[label_id] if label_id is not None else None
        
        # Create directory structure
        label_dir = SAVE_DIR / str(label_id if label_id is not None else 'unlabeled')
        label_dir.mkdir(parents=True, exist_ok=True)
        
        # Create filename with label subdirectory
        filepath = label_dir / f"{idx:05d}.jpg"
        
        # Save with quality optimization
        image.save(filepath, "JPEG", quality=95, optimize=True)
        
        return {
            "filepath": str(filepath),
            "label_name": label_name
        }
    
    except Exception as e:
        logger.error(f"Error saving image {idx}: {str(e)}")
        return {
            "filepath": None,
            "label_name": None
        }


active_learning_dataset['evaluation'] = dataset['validation'].map(
    save_image,
    with_indices=True,
    num_proc=NUM_PROC,
    desc="Saving evaluation images",
    remove_columns="label"
)

active_learning_dataset['unlabeled'] = dataset['train'].map(
    save_image,
    with_indices=True,
    num_proc=NUM_PROC,
    desc="Saving unlabeled images",
    remove_columns="label"
)



Saving evaluation images (num_proc=8):   0%|          | 0/3925 [00:00<?, ? examples/s]

Saving unlabeled images (num_proc=8):   0%|          | 0/9469 [00:00<?, ? examples/s]

In [4]:
active_learning_dataset['evaluation'][0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=500x487>,
 'filepath': 'data/imagenette/2/00000.jpg',
 'label_name': 'cassette player'}

In [5]:
active_learning_dataset['unlabeled'][0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=500x281>,
 'filepath': 'data/imagenette/2/00000.jpg',
 'label_name': 'cassette player'}

In [6]:
# active_learning_dataset['unlabeled'] = active_learning_dataset['unlabeled'].remove_columns('label_name')


In [7]:
active_learning_dataset

DatasetDict({
    evaluation: Dataset({
        features: ['image', 'filepath', 'label_name'],
        num_rows: 3925
    })
    unlabeled: Dataset({
        features: ['image', 'filepath', 'label_name'],
        num_rows: 9469
    })
})

In [9]:
active_learning_dataset.push_to_hub("dnth/active-learning-imagenette")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/3925 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/40 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/4735 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/48 [00:00<?, ?ba/s]

Map:   0%|          | 0/4734 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/48 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/dnth/active-learning-imagenette/commit/3075d59771dc6d8f44419f3d5b57fb102f7ebd25', commit_message='Upload dataset', commit_description='', oid='3075d59771dc6d8f44419f3d5b57fb102f7ebd25', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/dnth/active-learning-imagenette', endpoint='https://huggingface.co', repo_type='dataset', repo_id='dnth/active-learning-imagenette'), pr_revision=None, pr_num=None)