## Prepare the SKU110K Dataset

#### Download and extract the SKU110K dataset

This step will vary by user preference. If nothing else, it records the URL where the dataset can be found.

In [10]:
# !wget http://trax-geometry.s3.amazonaws.com/cvpr_challenge/SKU110K_fixed.tar.gz

In [11]:
# !tar -xvf SKU110K_fixed.tar.gz > /dev/null

#### Import necessary libraries

In [110]:
import os
import glob
import pandas as pd
import shutil

from pathlib import Path

#### Set up the dataset's local path

In [34]:
# Root folder that contains the extracted `SKU110K_fixed` directory.
# Set the SKU110K_DATASET_FOLDER environment variable to point at your own copy;
# the default below is the original author's local path (os.getcwd() is another option).
sku_dataset_folder  = os.environ.get('SKU110K_DATASET_FOLDER', '/Users/f0z01ld/Work/datasets')
sku_dataset_dirname = 'SKU110K_fixed'
# Raw images and CSV annotations as laid out inside the extracted dataset directory.
path_images         = Path(sku_dataset_folder) / sku_dataset_dirname / 'images'
path_annotations    = Path(sku_dataset_folder) / sku_dataset_dirname / 'annotations'

In [16]:
# !ls $path_images

#### Re-organize files into test, train, and validation 

In [17]:
# Maps the filename prefix used by the dataset (e.g. `val_12.jpg`) to the name of
# the sibling directory ("channel") the image is moved into.
prefix_to_channel = {
    "train": "train",
    "val": "validation",
    "test": "test",
}

assert path_images.exists(), f"{path_images} not found"

# Create one directory per channel next to the original `images` directory.
for channel in prefix_to_channel.values():
    (path_images.parent / channel).mkdir(exist_ok=True)

# Snapshot the listing before moving anything: files are moved out of the very
# directory being listed, and what a live directory iterator yields after such
# changes is unspecified.
for path_img in sorted(path_images.iterdir()):
    for prefix, channel in prefix_to_channel.items():
        if path_img.name.startswith(prefix):
            path_img.replace(path_images.parent / channel / path_img.name)
            break  # the file has been moved; no other prefix can apply

#### Remove corrupted files

In [18]:
# File names of the corrupted images to delete, keyed by channel name.
CORRUPTED_IMAGES = dict(
    train=(
        "train_4222.jpg",
        "train_5822.jpg",
        "train_882.jpg",
        "train_924.jpg",
    ),
    validation=(),
    test=(
        "test_274.jpg",
        "test_2924.jpg",
    ),
)

In [19]:
# Delete every listed corrupted image from its channel directory and report,
# per file, whether it was removed or was already absent.
for channel_name in prefix_to_channel.values():
    channel_dir = path_images.parent / channel_name
    for img_name in CORRUPTED_IMAGES[channel_name]:
        try:
            channel_dir.joinpath(img_name).unlink()
        except FileNotFoundError:
            print(f"{img_name} not in channel {channel_name}")
        else:
            print(f"{img_name} removed from channel {channel_name} ")

train_4222.jpg removed from channel train 
train_5822.jpg removed from channel train 
train_882.jpg removed from channel train 
train_924.jpg removed from channel train 
test_274.jpg removed from channel test 
test_2924.jpg removed from channel test 


In [20]:
# Sanity check: count the .jpg files left in each channel directory.
# Expected output:
# Number of train images = 8215
# Number of validation images = 588
# Number of test images = 2934
for channel_name in prefix_to_channel.values():
    n_images = len(list((path_images.parent / channel_name).glob('*.jpg')))
    print(f"Number of {channel_name} images = {n_images}")

Number of train images = 8215
Number of validation images = 588
Number of test images = 2934


In [21]:
# Remove the original `images` directory; this raises if it is not empty.
path_images.rmdir()

#### Reformat label (annotations) data

Taking the snippet of code from `./yolov5/data/SKU-110K.yaml` and modifying it for our use case

In [94]:
# The converted dataset is written under the current working directory:
# <cwd>/SKU110K_fixed/labels and <cwd>/SKU110K_fixed/images.
yolov5_dataset_folder = os.getcwd()
yolov5_sku_dataset_dirname = 'SKU110K_fixed'
local_path_annotations = Path(yolov5_dataset_folder).joinpath(yolov5_sku_dataset_dirname, 'labels')
local_path_images = Path(yolov5_dataset_folder).joinpath(yolov5_sku_dataset_dirname, 'images')

In [95]:
# Display the resolved labels directory, i.e. where the converted annotation files are written.
local_path_annotations

PosixPath('/Users/f0z01ld/Work/object-detection-yolov5/datasets/SKU110K_fixed/labels')

In [96]:
# Create the output directories (and any missing parents) in pure Python, so the
# step also works on paths containing spaces and on systems without `mkdir -p`.
local_path_annotations.mkdir(parents=True, exist_ok=True)
local_path_images.mkdir(parents=True, exist_ok=True)

The original format of the `SKU110K` dataset is:

`'image', 'x1', 'y1', 'x2', 'y2', 'class', 'image_width', 'image_height'`

We need to convert it to `YOLO` format, which is:

`'class', 'x_center', 'y_center', 'width', 'height'` (one line per bounding box, one `.txt` file per image)

We also need to normalize the bounding box coordinates by the image width and height (so that every value lies between 0 and 1), as expected by the model

In [112]:
# Convert the SKU110K CSV annotations into YOLO label files.
#
# Each CSV row describes ONE bounding box; the `image` column names the image the
# box belongs to. For every image that is present on disk we
#   * copy the image to <local_path_images>/<prefix>/<image name>, and
#   * write <local_path_annotations>/<prefix>/<image stem>.txt containing one line
#     per box: `class x_center y_center width height`, normalized by image size.
names = 'image', 'x1', 'y1', 'x2', 'y2', 'class', 'image_width', 'image_height'
object_class = 0  # only one class for this dataset

annotation_files = sorted(path_annotations.glob('*.csv'))
for file in annotation_files:
    data = pd.read_csv(file, names=names)  # annotations
    prefix = file.stem.split('_')[-1]  # e.g. annotations_val.csv -> val

    # The images were moved into channel directories earlier in the notebook, where
    # the `val` prefix lives in `validation`; fall back to the prefix itself.
    source_images_dir = path_images.parent / prefix_to_channel.get(prefix, prefix)

    out_labels_dir = local_path_annotations / prefix
    out_images_dir = local_path_images / prefix
    out_labels_dir.mkdir(parents=True, exist_ok=True)
    out_images_dir.mkdir(parents=True, exist_ok=True)

    n_written = 0
    n_missing = 0
    # Group rows by image so that every box of an image ends up in the same label file.
    for image_name, boxes in data.groupby('image', sort=False):
        images_file = source_images_dir / image_name
        out_images_file = out_images_dir / image_name
        try:
            shutil.copy2(images_file, out_images_file)
        except FileNotFoundError:
            # Labels without an image on disk (e.g. the removed corrupted files) are skipped.
            n_missing += 1
            continue

        w = boxes['image_width']
        h = boxes['image_height']
        # Corner format (x1, y1, x2, y2) -> normalized center/size format.
        x_center = (boxes['x1'] + boxes['x2']) / 2 / w
        y_center = (boxes['y1'] + boxes['y2']) / 2 / h
        box_width = (boxes['x2'] - boxes['x1']) / w
        box_height = (boxes['y2'] - boxes['y1']) / h

        out_labels_file = (out_labels_dir / image_name).with_suffix('.txt')
        with open(out_labels_file, 'w') as f:
            f.writelines(
                f'{object_class} {xc:.6f} {yc:.6f} {bw:.6f} {bh:.6f}\n'
                for xc, yc, bw, bh in zip(x_center, y_center, box_width, box_height)
            )
        n_written += 1

    print(f'{file.name}: wrote labels for {n_written} images, skipped {n_missing} without an image file')