## SKU110K Dataset to YOLOv5 Format


> `Date: 09/28/2021`
<br/>
> `Author: @datasith`
<br/>
> `Last Modified:`
<br/>
> `/* name - date */`
<br/>
> `License: https://github.com/datasith/datasith/blob/main/licenses/LICENSE.nsfw`


#### Download and extract the SKU110K dataset

This step will vary with each user's preferences. If nothing else, it records the URL where the dataset can be found.

In [None]:
# !wget http://trax-geometry.s3.amazonaws.com/cvpr_challenge/SKU110K_fixed.tar.gz -P Path_to_directory_where_the_images_should_be_downloaded

In [None]:
# !tar -xvf SKU110K_fixed.tar.gz > /dev/null

#### Import necessary libraries

In [None]:
import os
import glob
import pandas as pd
import shutil

from pathlib import Path

#### Set up the dataset's local path

In [None]:
# Directory into which the dataset archive was downloaded and extracted, e.g.
# sku_dataset_folder = 'Path to directory where the images were downloaded'
sku_dataset_folder = '/Users/f0z01ld/Work/datasets/'
sku_dataset_dirname = 'SKU110K_fixed'

# Locations of the image files and of the CSV annotation files inside the dataset.
path_images = Path(sku_dataset_folder, sku_dataset_dirname, 'images')
path_annotations = Path(sku_dataset_folder, sku_dataset_dirname, 'annotations')

In [None]:
# !ls $path_images

#### Re-organize files into test, train, and validation 

In [None]:
# Maps the filename prefix of an image (and the suffix of an annotation CSV)
# to the name of the split ("channel") directory the image is moved into.
prefix_to_channel = {
    "train": "train",
    "val": "validation",
    "test": "test",
}

# Fail early with a clear message if the dataset has not been extracted yet.
# An explicit raise is used because `assert` is stripped under `python -O`.
if not path_images.exists():
    raise FileNotFoundError(f"{path_images} not found")

# Create one sibling directory per channel next to the images directory.
for channel_name in prefix_to_channel.values():
    (path_images.parent / channel_name).mkdir(exist_ok=True)

# Snapshot the listing before moving anything: when entries are removed from a
# directory while it is being iterated, it is unspecified which entries the
# iterator will still yield.
for path_img in list(path_images.iterdir()):
    for prefix, channel_name in prefix_to_channel.items():
        if path_img.name.startswith(prefix):
            path_img.replace(path_images.parent / channel_name / path_img.name)
            break  # the file has been moved; do not test the remaining prefixes

#### Remove corrupted files

In [None]:
# File names of the images to delete, keyed by the channel directory that
# holds them. The validation channel has none.
CORRUPTED_IMAGES = dict(
    train=("train_4222.jpg", "train_5822.jpg", "train_882.jpg", "train_924.jpg"),
    validation=(),
    test=("test_274.jpg", "test_2924.jpg"),
)

In [None]:
# Delete every listed corrupted image from its channel directory and report,
# per file, whether it was removed or was not there to begin with.
for channel_name in prefix_to_channel.values():
    channel_dir = path_images.parent / channel_name
    for img_name in CORRUPTED_IMAGES[channel_name]:
        corrupted_path = channel_dir / img_name
        try:
            corrupted_path.unlink()
        except FileNotFoundError:
            print(f"{img_name} not in channel {channel_name}")
        else:
            print(f"{img_name} removed from channel {channel_name} ")

In [None]:
# Report how many .jpg files each channel directory contains.
# Expected output:
# Number of train images = 8215
# Number of validation images = 588
# Number of test images = 2934
for channel_name in prefix_to_channel.values():
    jpg_files = (path_images.parent / channel_name).glob('*.jpg')
    n_images = len(list(jpg_files))
    print(f"Number of {channel_name} images = {n_images}")

In [None]:
# Remove the original images directory. This raises if the directory still
# contains files, i.e. if some entries were not moved into a channel directory.
path_images.rmdir()

#### Reformat label (annotations) data

Taking the snippet of code from `./yolov5/data/SKU-110K.yaml` and modifying it for our use case

In [None]:
# The converted dataset is written under the current working directory, with
# separate 'labels' and 'images' trees inside the dataset directory.
yolov5_dataset_folder = os.getcwd()
yolov5_sku_dataset_dirname = 'SKU110K_fixed'
local_path_annotations = Path(yolov5_dataset_folder, yolov5_sku_dataset_dirname, 'labels')
local_path_images = Path(yolov5_dataset_folder, yolov5_sku_dataset_dirname, 'images')

In [None]:
# Echo the labels output directory so the resolved path can be checked.
local_path_annotations

In [None]:
# Create the output 'labels' and 'images' directories (and any missing
# parents). pathlib is used instead of a shell `mkdir -p` so that paths
# containing spaces are handled correctly and no POSIX shell is required.
for output_dir in (local_path_annotations, local_path_images):
    output_dir.mkdir(parents=True, exist_ok=True)

The original format of the `SKU110K` dataset is:

`'image', 'x1', 'y1', 'x2', 'y2', 'class', 'image_width', 'image_height'`

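We need to convert it to `YOLO` format, which is:

`'class', 'x_center', 'y_center', 'width', 'height'`

We also need to normalize the bounding box values to the range [0, 1] (x values divided by the image width, y values by the image height), as expected by the model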

In [None]:
# Convert every SKU110K annotation CSV into per-image YOLO label files and
# copy the matching images next to them:
#   <labels>/<prefix>/<image stem>.txt   one "class x_center y_center width height" row per box
#   <images>/<prefix>/<image name>       copy of the source image
# where <prefix> is the last '_'-separated token of the CSV name
# (e.g. 'annotations_train.csv' -> 'train').

# Column layout of the header-less annotation CSV files.
names = 'image', 'x1', 'y1', 'x2', 'y2', 'class', 'image_width', 'image_height'
annotation_files = path_annotations.glob('*.csv')
for file in annotation_files:
    print(file)
    data = pd.read_csv(file, names=names)  # annotations
    prefix = file.stem.split('_')[-1]
    out_labels_dir = local_path_annotations / prefix
    out_images_dir = local_path_images / prefix
    in_images_dir = path_images.parent / prefix_to_channel[prefix]

    # Create the per-split output directories, including missing parents.
    out_labels_dir.mkdir(parents=True, exist_ok=True)
    out_images_dir.mkdir(parents=True, exist_ok=True)

    # groupby hands over the rows of each image in a single pass instead of
    # re-scanning the whole table once per image; sort=False keeps the images
    # in order of first appearance.
    for filename_img, data_img in data.groupby('image', sort=False):
        # Normalize the corner coordinates so that
        # (img_width, img_height) = (1, 1).
        width = data_img['image_width'].iloc[0]
        height = data_img['image_height'].iloc[0]
        x1 = data_img['x1'] / width
        y1 = data_img['y1'] / height
        x2 = data_img['x2'] / width
        y2 = data_img['y2'] / height

        # YOLO labels describe a box by its center and size, not by its
        # corners. The dataset is treated as single-class, so class is 0.
        labels = pd.DataFrame({
            'class': 0,
            'x_center': (x1 + x2) / 2,
            'y_center': (y1 + y2) / 2,
            'width': x2 - x1,
            'height': y2 - y1,
        })

        # Set up the necessary paths; only the extension is swapped, so a
        # 'jpg' substring elsewhere in the name is left alone.
        out_labels_file = out_labels_dir / Path(filename_img).with_suffix('.txt').name
        in_images_file = in_images_dir / filename_img
        out_images_file = out_images_dir / filename_img

        try:
            shutil.copy2(in_images_file, out_images_file)
        except FileNotFoundError:
            # No image file exists for this label (e.g. it was removed as
            # corrupted); skip it so no orphan label file is written.
            print(f'check the image file name {filename_img}')
            continue

        # If the image file is found and copied, it's safe to generate the corresponding label file
        labels.to_csv(out_labels_file, sep=' ', header=False, index=False)