In [58]:
import os
from os.path import join
import numpy as np
import pandas as pd
from tqdm import tqdm
import cv2
from torchvision.datasets import CIFAR10
from coreml.utils.io import save_yml, read_yml

In [3]:
# Fetch (if needed) and load the CIFAR-10 training split.
# NOTE(review): the recorded output below reports root `/data/CIFAR10`, so it appears
# to predate the `/raw` suffix used here -- re-run the cell to refresh it.
train = CIFAR10(root='/data/CIFAR10/raw', train=True, download=True)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /data/CIFAR10/cifar-10-python.tar.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting /data/CIFAR10/cifar-10-python.tar.gz to /data/CIFAR10


In [5]:
# Show the dataset summary (datapoint count, root location, split) as a quick load check.
train

Dataset CIFAR10
    Number of datapoints: 50000
    Root location: /data/CIFAR10
    Split: Train

In [6]:
# Load the held-out CIFAR-10 split from the same root as the training split.
test = CIFAR10(root='/data/CIFAR10/raw', train=False, download=True)

Files already downloaded and verified


In [10]:
# Root folder for everything this notebook writes (images, annotation CSV, version files).
processed_dir = '/data/CIFAR10/processed'

In [11]:
# Create the output root; exist_ok makes the cell safe to re-run.
os.makedirs(name=processed_dir, exist_ok=True)

In [13]:
# Inspect the raw training image array: (num_images, height, width, channels).
train.data.shape

(50000, 32, 32, 3)

In [15]:
# The number of targets should match the number of training images shown above.
len(train.targets)

50000

In [21]:
# Stack train followed by test along the sample axis, so that indices
# [0, len(train.data)) are training samples and the remainder are test samples.
all_images = np.concatenate((train.data, test.data), axis=0)
all_targets = np.concatenate((train.targets, test.targets), axis=0)

In [22]:
# Confirm images and targets were merged to the same number of samples.
all_images.shape, all_targets.shape

((60000, 32, 32, 3), (60000,))

In [30]:
# Output layout under processed_dir:
#   images/               one PNG per sample
#   annotation.csv        path / label / split table
#   versions/default.yml  train/test split definition
image_dir = os.path.join(processed_dir, 'images')
version_dir = os.path.join(processed_dir, 'versions')
annotation_path = os.path.join(processed_dir, 'annotation.csv')
version_path = os.path.join(version_dir, 'default.yml')

# Create both sub-folders up front; exist_ok keeps the cell re-runnable.
os.makedirs(version_dir, exist_ok=True)
os.makedirs(image_dir, exist_ok=True)

In [33]:
image_paths = []
for index in tqdm(range(len(all_images))):
    image = all_images[index]
    image_path = join(image_dir, f'{index}.png')
    image_paths.append(image_path)
    cv2.imwrite(image_path, image[:, :, ::-1])

100%|██████████| 60000/60000 [00:10<00:00, 5735.53it/s]


In [34]:
# One split name per sample, in the same order the images were stacked:
# all training samples first, then all test samples.
splits = (
    ['train'] * train.data.shape[0]
    + ['test'] * test.data.shape[0]
)

In [71]:
# Wrap each integer class id as {'classification': [class_id]}.
# tolist() converts the numpy integers to plain Python ints.
labels = [{'classification': [target]} for target in all_targets.tolist()]

In [72]:
# One row per sample: image path on disk, label dict, and split name.
annotation = pd.DataFrame(dict(path=image_paths, label=labels, split=splits))

In [73]:
# Preview the first rows to eyeball the path / label / split columns.
annotation.head()

Unnamed: 0,path,label,split
0,/data/CIFAR10/processed/images/0.png,{'classification': [6]},train
1,/data/CIFAR10/processed/images/1.png,{'classification': [9]},train
2,/data/CIFAR10/processed/images/2.png,{'classification': [9]},train
3,/data/CIFAR10/processed/images/3.png,{'classification': [4]},train
4,/data/CIFAR10/processed/images/4.png,{'classification': [1]},train


In [74]:
# Persist the annotation table; index=False keeps the row index out of the file.
# NOTE(review): the dict-valued `label` column is presumably written in its string
# form, so a reader would need to parse it back -- confirm against the consumer.
annotation.to_csv(path_or_buf=annotation_path, index=False)

In [75]:
# Split definition to be saved; filled in per split by the following cells.
version = dict()

In [77]:
# Training split: the first len(train.data) entries of the stacked paths and labels.
version['train'] = dict(
    file=image_paths[:len(train.data)],
    label=labels[:len(train.data)],
)

In [78]:
# Test split: everything after the first len(train.data) stacked entries.
version['test'] = dict(
    file=image_paths[len(train.data):],
    label=labels[len(train.data):],
)

In [79]:
# Sanity-check the split sizes against the expected counts (50000 train / 10000 test).
assert len(version['train']['file']) == 50000
assert len(version['test']['file']) == 10000
# Every file must have exactly one label, otherwise files and labels are misaligned.
assert len(version['train']['label']) == len(version['train']['file'])
assert len(version['test']['label']) == len(version['test']['file'])

In [80]:
# For each split, the labels must be a plain list whose entries are dicts
# (only the first entry is spot-checked).
assert all(
    isinstance(version[split_name]['label'], list)
    and isinstance(version[split_name]['label'][0], dict)
    for split_name in ('train', 'test')
)

In [81]:
# Persist the split definition to version_path with the project's coreml helper
# (presumably serialised as YAML, given the .yml target -- confirm).
save_yml(version_path, version)

In [82]:
# Read the version file back from disk.
# NOTE(review): `load` is not used by any later cell visible here -- presumably a manual
# round-trip check; consider comparing it against `version`.
load = read_yml(version_path)

In [83]:
# Spot-check the row right after the 50000 training samples: it should be the first
# entry of the test split.
annotation.loc[50000]

path     /data/CIFAR10/processed/images/50000.png
label                     {'classification': [3]}
split                                        test
Name: 50000, dtype: object