In [1]:
from datasets import load_dataset

dataset = load_dataset("timm/oxford-iiit-pet")


In [2]:
dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'label', 'image_id', 'label_cat_dog'],
        num_rows: 3680
    })
    test: Dataset({
        features: ['image', 'label', 'image_id', 'label_cat_dog'],
        num_rows: 3669
    })
})

## Save the dataset locally

In [3]:
from pathlib import Path
from loguru import logger
from datasets import DatasetDict
from PIL import Image

SAVE_DIR = Path("data/oxford-iiit-pet")
NUM_PROC = 8  
CLASS_NAMES = dataset['train'].features['label'].names

def save_image(example, idx):
    """Save a dataset image to disk with error handling and add label name.
    
    Args:
        example (dict): Dataset example containing 'image' and 'label'
        idx (int): Index of the example
        
    Returns:
        dict: Dictionary containing the saved filepath and label_name
    """
    try:
        image = example['image']
        label_id = example.get('label')  # Get label if it exists (won't exist for unlabeled)
        
        # Convert RGBA to RGB if necessary
        if image.mode == 'RGBA':
            # Create a white background
            background = Image.new('RGB', image.size, (255, 255, 255))
            # Paste the image using alpha channel as mask
            background.paste(image, mask=image.split()[3])
            image = background
        
        # Get label name if label exists
        label_name = CLASS_NAMES[label_id] if label_id is not None else None
        
        # Create directory structure
        label_dir = SAVE_DIR / str(label_id if label_id is not None else 'unlabeled')
        label_dir.mkdir(parents=True, exist_ok=True)
        
        # Create filename with label subdirectory
        filepath = label_dir / f"{idx:05d}.jpg"
        
        # Save with quality optimization
        image.save(filepath, "JPEG", quality=95, optimize=True)
        
        return {
            "filepath": str(filepath),
            "label_name": label_name
        }
    
    except Exception as e:
        logger.error(f"Error saving image {idx}: {str(e)}")
        return {
            "filepath": None,
            "label_name": None
        }

active_learning_dataset = DatasetDict()

active_learning_dataset['evaluation'] = dataset['test'].map(
    save_image,
    with_indices=True,
    num_proc=NUM_PROC,
    desc="Saving evaluation images",
    remove_columns="label"
)

active_learning_dataset['unlabeled'] = dataset['train'].map(
    save_image,
    with_indices=True,
    num_proc=NUM_PROC,
    desc="Saving unlabeled images",
    remove_columns="label"
)



Saving evaluation images (num_proc=8):   0%|          | 0/3669 [00:00<?, ? examples/s]

Saving unlabeled images (num_proc=8):   0%|          | 0/3680 [00:00<?, ? examples/s]

In [4]:
active_learning_dataset

DatasetDict({
    evaluation: Dataset({
        features: ['image', 'image_id', 'label_cat_dog', 'filepath', 'label_name'],
        num_rows: 3669
    })
    unlabeled: Dataset({
        features: ['image', 'image_id', 'label_cat_dog', 'filepath', 'label_name'],
        num_rows: 3680
    })
})

## Make Initial Samples

In [5]:
import numpy as np

unique_labels = active_learning_dataset["unlabeled"].unique("label_name")
samples = []
n_samples_per_class = 10

for label in unique_labels:
    label_indices = np.where(np.array(active_learning_dataset["unlabeled"]["label_name"]) == label)[0]
    # Sample 10 random indices without replacement
    random_indices = np.random.choice(
        label_indices, size=n_samples_per_class, replace=False
    )
    samples.extend(random_indices)

initial_samples = active_learning_dataset["unlabeled"].select(samples)

# Verify the result (should show 100 rows total, 10 per class)
print(f"Total samples: {len(initial_samples)}")
print("\nSamples per class:")
print(initial_samples.select_columns(["label_name"]).to_pandas().value_counts())

Total samples: 370

Samples per class:
label_name                
abyssinian                    10
american_bulldog              10
american_pit_bull_terrier     10
basset_hound                  10
beagle                        10
bengal                        10
birman                        10
bombay                        10
boxer                         10
british_shorthair             10
chihuahua                     10
egyptian_mau                  10
english_cocker_spaniel        10
english_setter                10
german_shorthaired            10
great_pyrenees                10
havanese                      10
japanese_chin                 10
keeshond                      10
leonberger                    10
maine_coon                    10
miniature_pinscher            10
newfoundland                  10
persian                       10
pomeranian                    10
pug                           10
ragdoll                       10
russian_blue                  10
saint_bern

In [6]:
initial_samples

Dataset({
    features: ['image', 'image_id', 'label_cat_dog', 'filepath', 'label_name'],
    num_rows: 370
})

In [7]:
initial_samples[1]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=500x375>,
 'image_id': 'Maine_Coon_104',
 'label_cat_dog': 0,
 'filepath': 'data/oxford-iiit-pet/20/00546.jpg',
 'label_name': 'maine_coon'}

In [8]:
initial_samples = initial_samples.remove_columns(["image", "image_id", "label_name"])
df = initial_samples.to_pandas()
df = df.rename(columns={"label_cat_dog": "label"})
df["label"] = df["label"].apply(lambda x: "cat" if x == 0 else "dog")
df

Unnamed: 0,label,filepath
0,cat,data/oxford-iiit-pet/20/00624.jpg
1,cat,data/oxford-iiit-pet/20/00546.jpg
2,cat,data/oxford-iiit-pet/20/02051.jpg
3,cat,data/oxford-iiit-pet/20/00816.jpg
4,cat,data/oxford-iiit-pet/20/02624.jpg
...,...,...
365,cat,data/oxford-iiit-pet/7/02456.jpg
366,cat,data/oxford-iiit-pet/7/01600.jpg
367,cat,data/oxford-iiit-pet/7/01758.jpg
368,cat,data/oxford-iiit-pet/7/01961.jpg


In [9]:
df.to_parquet("data/oxford-iiit-pet/initial_samples.parquet")

## Make Evaluation Split

In [10]:
active_learning_dataset["evaluation"][0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=500x334>,
 'image_id': 'newfoundland_31',
 'label_cat_dog': 1,
 'filepath': 'data/oxford-iiit-pet/22/00000.jpg',
 'label_name': 'newfoundland'}

In [11]:
eval_samples = active_learning_dataset["evaluation"].remove_columns(["image", "image_id", "label_name"])
eval_samples = eval_samples.rename_column("label_cat_dog", "label")
eval_samples

Dataset({
    features: ['label', 'filepath'],
    num_rows: 3669
})

In [12]:
df = eval_samples.to_pandas()
df["label"] = df["label"].apply(lambda x: "cat" if x == 0 else "dog")
df

Unnamed: 0,label,filepath
0,dog,data/oxford-iiit-pet/22/00000.jpg
1,dog,data/oxford-iiit-pet/25/00001.jpg
2,dog,data/oxford-iiit-pet/1/00002.jpg
3,dog,data/oxford-iiit-pet/15/00003.jpg
4,dog,data/oxford-iiit-pet/16/00004.jpg
...,...,...
3664,dog,data/oxford-iiit-pet/10/03664.jpg
3665,dog,data/oxford-iiit-pet/17/03665.jpg
3666,dog,data/oxford-iiit-pet/28/03666.jpg
3667,dog,data/oxford-iiit-pet/21/03667.jpg


In [13]:
df.to_parquet("data/oxford-iiit-pet/evaluation_samples.parquet")

## Make Unlabeled Split

In [14]:
unlabeled_samples = active_learning_dataset["unlabeled"].remove_columns(["image", "image_id", "label_name"])
unlabeled_samples = unlabeled_samples.rename_column("label_cat_dog", "label")
df = unlabeled_samples.to_pandas()
df["label"] = df["label"].apply(lambda x: "cat" if x == 0 else "dog")
df


Unnamed: 0,label,filepath
0,cat,data/oxford-iiit-pet/20/00000.jpg
1,dog,data/oxford-iiit-pet/1/00001.jpg
2,dog,data/oxford-iiit-pet/18/00002.jpg
3,dog,data/oxford-iiit-pet/16/00003.jpg
4,dog,data/oxford-iiit-pet/14/00004.jpg
...,...,...
3675,dog,data/oxford-iiit-pet/14/03675.jpg
3676,cat,data/oxford-iiit-pet/26/03676.jpg
3677,dog,data/oxford-iiit-pet/1/03677.jpg
3678,dog,data/oxford-iiit-pet/35/03678.jpg


Remove the initial samples from the unlabeled samples

In [15]:
initial_filepaths = set(initial_samples['filepath'])
initial_filepaths

{'data/oxford-iiit-pet/0/00292.jpg',
 'data/oxford-iiit-pet/0/01161.jpg',
 'data/oxford-iiit-pet/0/01284.jpg',
 'data/oxford-iiit-pet/0/01630.jpg',
 'data/oxford-iiit-pet/0/01790.jpg',
 'data/oxford-iiit-pet/0/02063.jpg',
 'data/oxford-iiit-pet/0/02176.jpg',
 'data/oxford-iiit-pet/0/02488.jpg',
 'data/oxford-iiit-pet/0/02490.jpg',
 'data/oxford-iiit-pet/0/03205.jpg',
 'data/oxford-iiit-pet/1/00160.jpg',
 'data/oxford-iiit-pet/1/00630.jpg',
 'data/oxford-iiit-pet/1/01141.jpg',
 'data/oxford-iiit-pet/1/01311.jpg',
 'data/oxford-iiit-pet/1/01451.jpg',
 'data/oxford-iiit-pet/1/02323.jpg',
 'data/oxford-iiit-pet/1/02403.jpg',
 'data/oxford-iiit-pet/1/02622.jpg',
 'data/oxford-iiit-pet/1/02627.jpg',
 'data/oxford-iiit-pet/1/02847.jpg',
 'data/oxford-iiit-pet/10/00028.jpg',
 'data/oxford-iiit-pet/10/00106.jpg',
 'data/oxford-iiit-pet/10/00174.jpg',
 'data/oxford-iiit-pet/10/01066.jpg',
 'data/oxford-iiit-pet/10/01407.jpg',
 'data/oxford-iiit-pet/10/01500.jpg',
 'data/oxford-iiit-pet/10/02239.

In [16]:
# Filter out rows that are in initial_samples
unlabeled_samples = unlabeled_samples.filter(
    lambda x: x['filepath'] not in initial_filepaths
)

unlabeled_samples

Filter:   0%|          | 0/3680 [00:00<?, ? examples/s]

Dataset({
    features: ['label', 'filepath'],
    num_rows: 3310
})

In [17]:
df = unlabeled_samples.to_pandas()
df["label"] = df["label"].apply(lambda x: "cat" if x == 0 else "dog")
df
df.to_parquet("data/oxford-iiit-pet/unlabeled_samples.parquet")
