In [None]:
# Download the Caltech256 dataset with: wget "https://data.caltech.edu/records/nyy15-4j048/files/256_ObjectCategories.tar?download=1" -O caltech256.tar
# Unpack it with: tar -xvf caltech256.tar (creates many subfolders in the working dir!)

In [1]:
import os
import shutil
import random
import pandas as pd
from tqdm import tqdm

# Step up one directory so the relative 'data/...' paths used below resolve
# (presumably this notebook lives in a subfolder of the project root -- TODO confirm).
# NOTE: the path is relative, so re-running this cell in the same kernel moves up
# one more level each time; restart the kernel before re-running.
os.chdir('../')

In [2]:
# 197 stairs images --> sample 65 images per class (3 classes x 65 = 195 images)

In [2]:
# Number of images sampled from each selected category.
n_subset = 65

# Input: directory containing one subfolder per Caltech256 category.
caltech_dirpath = 'data/caltech256'

# Output: directory that receives the copied and renamed subset images.
caltech_subset_dirpath = 'data/caltech256_subset/caltech256_subset'

In [3]:
# Category folders (named '<number>.<class name>') to sample from.
# Alternative selection, kept for reference:
# categories = ['023.bulldozer', '028.camel', '256.toad']
categories = [
    '049.cormorant',
    '158.penguin',
    '207.swan',
]

In [4]:
# Start every run from a clean, empty output directory.
# Remove leftovers from a previous run first, so stale images cannot linger ...
if os.path.exists(caltech_subset_dirpath):
    shutil.rmtree(caltech_subset_dirpath)

# ... then create the directory unconditionally. Previously this call was nested
# inside the `if`, so on a first run (directory absent) nothing was created and
# the later copy step failed with a missing-directory error.
os.makedirs(caltech_subset_dirpath)

In [5]:
# Parallel lists: the source path of every sampled image and its class name.
caltech256_subset_filepaths = []
caltech256_subset_gts = []

for category in sorted(os.listdir(caltech_dirpath)):
    # Only the selected categories take part in the subset.
    if category not in categories:
        continue

    print(category)
    category_dirpath = os.path.join(caltech_dirpath, category)
    candidate_paths = sorted(
        os.path.join(category_dirpath, name) for name in os.listdir(category_dirpath)
    )

    # Re-seed before each draw so every category's sample is reproducible
    # independently of the others.
    random.seed(42)
    sampled_paths = sorted(random.sample(candidate_paths, n_subset))

    # Folder names look like '<number>.<class name>'; the class name is the label.
    class_name = category.split('.')[1]
    for sampled_path in tqdm(sampled_paths):
        caltech256_subset_filepaths.append(sampled_path)
        caltech256_subset_gts.append(class_name)

# Sanity check: both lists must have the same length.
len(caltech256_subset_filepaths), len(caltech256_subset_gts)

049.cormorant


100%|██████████| 65/65 [00:00<00:00, 354064.62it/s]


158.penguin


100%|██████████| 65/65 [00:00<00:00, 425319.44it/s]


207.swan


100%|██████████| 65/65 [00:00<00:00, 577605.42it/s]


(195, 195)

In [6]:
# Shuffle paths and labels together so each path keeps its own label.
# A single seeded shuffle of the (path, label) pairs produces the same permutation
# as shuffling both equal-length lists separately after re-seeding with 42.
paired_samples = list(zip(caltech256_subset_filepaths, caltech256_subset_gts))

random.seed(42)
random.shuffle(paired_samples)

caltech256_subset_filepaths = [path for path, _ in paired_samples]
caltech256_subset_gts = [label for _, label in paired_samples]

In [7]:
# Destination path for every sampled image, in shuffled order:
# 'caltech_001.jpg', 'caltech_002.jpg', ... inside the subset directory.
caltech256_subset_renamed_filepaths = [
    os.path.join(caltech_subset_dirpath, f'caltech_{position:03d}.jpg')
    for position in range(1, len(caltech256_subset_filepaths) + 1)
]

In [8]:
# Copy each sampled image to its new, renamed location.
# The originals are copied, not moved, so the source dataset stays untouched.
for source_path, target_path in zip(
    caltech256_subset_filepaths, caltech256_subset_renamed_filepaths
):
    shutil.copy(source_path, target_path)

In [9]:
# Annotation table: one row per copied image, with its new path and its class name.
df = pd.DataFrame(
    data={
        'filepath': caltech256_subset_renamed_filepaths,
        'GT': caltech256_subset_gts,
    }
)

In [10]:
# Map each class name (the part after the dot in the category folder name)
# onto the target label set.
# NOTE(review): 'angular' / 'bent' / 'straight' are presumably the labels of the
# stairs dataset this subset is sized against -- confirm.
label_by_class_name = {
    categories[0].split('.')[1]: 'angular',
    categories[1].split('.')[1]: 'bent',
    categories[2].split('.')[1]: 'straight',
}
df['GT'] = df['GT'].replace(label_by_class_name)

In [11]:
# Write the annotation table next to the subset image folder.
# The row index is written too (pandas default), as the first, unnamed column.
annotation_csv_filepath = 'data/caltech256_subset/caltech256_subset_annotation.csv'
df.to_csv(annotation_csv_filepath)