#### Download and extract the SKU110K dataset

In [17]:
# Download the SKU110K archive (~11 GB) into the current working directory.
# -c resumes a partial download and does nothing when the file is already
# complete, so re-running this cell does not fetch a second copy
# (which plain wget would save as SKU110K_fixed.tar.gz.1).
!wget -c http://trax-geometry.s3.amazonaws.com/cvpr_challenge/SKU110K_fixed.tar.gz

--2022-01-29 03:52:37--  http://trax-geometry.s3.amazonaws.com/cvpr_challenge/SKU110K_fixed.tar.gz
Resolving trax-geometry.s3.amazonaws.com (trax-geometry.s3.amazonaws.com)... 52.217.136.217
Connecting to trax-geometry.s3.amazonaws.com (trax-geometry.s3.amazonaws.com)|52.217.136.217|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12196152718 (11G) [application/x-gzip]
Saving to: ‘cache/SKU110K_fixed.tar.gz’


2022-01-29 04:29:40 (5.23 MB/s) - ‘cache/SKU110K_fixed.tar.gz’ saved [12196152718/12196152718]

tar: Error opening archive: Failed to open 'SKU110K_fixed.tar.gz'


In [22]:
# Extract the archive quietly into the current working directory (creates
# SKU110K_fixed/). The previous "-v ... > /dev/null" asked for a per-file
# listing only to discard it; NOTE(review): some tar implementations print
# that listing on stderr, where the redirect does not help - confirm if needed.
# The archive must be in the current working directory for this to succeed.
!tar -xf SKU110K_fixed.tar.gz

tar: Error opening archive: Failed to open 'SKU110K_fixed.tar.gz'


#### Set up the dataset's local path

In [18]:
import os
import pandas as pd
from pathlib import Path
# The extracted dataset is expected directly under the current working
# directory: <cwd>/SKU110K_fixed/images
local_folder = os.getcwd()
sku_dataset_dirname = 'SKU110K_fixed'
path_images = Path(local_folder).joinpath(sku_dataset_dirname, "images")

#### Re-organize files into test, train, and validation 

In [11]:
# Maps the filename prefix of an image to the name of the destination
# folder ("channel") created next to the images directory.
prefix_to_channel = {
    "train": "train",
    "val": "validation",
    "test": "test",
}

assert path_images.exists(), f"{path_images} not found"

# One sibling directory per channel; exist_ok keeps the cell re-runnable.
for channel_name in prefix_to_channel.values():
    (path_images.parent / channel_name).mkdir(exist_ok=True)

# Snapshot the listing before moving anything: files are moved out of the
# directory inside the loop, and iterdir() leaves it unspecified which
# entries are yielded once the directory changes during iteration.
for path_img in list(path_images.iterdir()):
    for prefix in prefix_to_channel:
        if path_img.name.startswith(prefix):
            path_img.replace(
                path_images.parent / prefix_to_channel[prefix] / path_img.name
            )
            # The file has been moved; no other prefix needs checking.
            break

#### Remove corrupted files

In [12]:
# Image files to delete, keyed by channel folder name.
# A channel with nothing to delete maps to an empty tuple.
CORRUPTED_IMAGES = dict(
    train=(
        "train_4222.jpg",
        "train_5822.jpg",
        "train_882.jpg",
        "train_924.jpg",
    ),
    validation=(),
    test=(
        "test_274.jpg",
        "test_2924.jpg",
    ),
)

In [13]:
# Delete every listed corrupted image from its channel folder and report
# whether the file was actually present.
for channel_name in prefix_to_channel.values():
    channel_dir = path_images.parent / channel_name
    for img_name in CORRUPTED_IMAGES[channel_name]:
        corrupted_path = channel_dir / img_name
        try:
            corrupted_path.unlink()
        except FileNotFoundError:
            # Already gone (e.g. the cell was run before): report and move on.
            print(f"{img_name} not in channel {channel_name}")
        else:
            print(f"{img_name} removed from channel {channel_name} ")

train_4222.jpg removed from channel train 
train_5822.jpg removed from channel train 
train_882.jpg removed from channel train 
train_924.jpg removed from channel train 
test_274.jpg removed from channel test 
test_2924.jpg removed from channel test 


In [14]:
# Sanity check: count the .jpg files left in each channel folder.
# Expected output:
# Number of train images = 8215
# Number of validation images = 588
# Number of test images = 2934
for channel_name in prefix_to_channel.values():
    n_images = len(list((path_images.parent / channel_name).glob("*.jpg")))
    print(f"Number of {channel_name} images = {n_images}")

Number of train images = 8215
Number of validation images = 588
Number of test images = 2934


In [20]:
# Remove the original images directory; this raises OSError if anything
# is still inside it, so leftover files are never deleted silently.
path_images.rmdir()