# OpenImagesV4 Data Preparation Utility For YOLOV3

This module extracts images from the Open Images Dataset V4 that contain selected classes, generating a corresponding .txt or .xml file for each image as per the YOLO and PASCAL VOC annotation formats respectively and the necessary metadata files.

You will need these libraries:
    1. pandas
    2. pascal_voc_writer
    3. pillow
    4. seaborn
    5. tqdm
    
You require the following meta files, located in a single directory:
    1. class_descriptions.csv
    2. train_annotations.csv
    3. train_images.csv
    4. valid_annotations.csv
    5. valid_images.csv
    
Subsequently, specify the source and destination paths accordingly, in the dictionaries:
    1. src
    2. dst

Finally, indicate the output annotation format, as either:
    1. "YOLO" or 
    2. "PASCALVOC"

Your selected classes are stored as a list of class descriptions (case-sensitive) - add to or remove from it as needed.

Optionally, you can limit the number of training and validation images copied by setting:
    1. limit
    2. n
    
Both training and validation images will be copied into a folder named 'custom', and the generated metadata files are:
    1. custom.names - contains class descriptions
    2. train.txt    - contains relative paths from darknet.exe to every trainin image
    3. valid.txt    - contains relative paths from darknet.exe to every validation image
    
    i.e. data/custom/XXX.jpg
    
Your directory tree should look like this:

```
+-- darknet
|   +-- data
|   |   +-- custom
|   |   |   +-- XXX.jpg
|   |   |   +-- XXX.txt
|   |   |   +-- YYY.jpg
|   |   |   +-- YYY.txt
|   |   |   ...
|   |   +-- custom.names
|   |   +-- train.txt
|   |   +-- validation.txt
|   ...
...
```

In [1]:
from pascal_voc_writer import Writer
from PIL import Image
from tqdm import tqdm
from shutil import copyfile
import matplotlib.pyplot as plt
import os
import pandas as pd
import seaborn as sns
sns.set_style("darkgrid")
import subprocess

In [9]:
classes = [
    # "Person",
    # "Human face",
    # "Hat",
    # "Sunglasses",
    "Knife",
    # "Handgun",
    # "Rifle",
    # "Weapon",
    # "Jacket",
    # "Shorts",
    # "Jeans",
    # "Skirt",
]

src = {"train": "/media/tingyu/WD BLUE/open_images/all/train_0", 
       "valid": "/media/tingyu/WD BLUE/open_images/all/validation", 
       "meta": "data/meta"}
dst = "data"

ann_format = "PASCALVOC"

limit = {"train": False, 
         "valid": False}
n = {"train": 0, 
     "valid": 0}

s3 = "aws s3 --no-sign-request cp s3://open-images-dataset"

['aws', 's3', '--no-sign-request', 'cp', 's3://open-images-dataset/train/0908260ac3dccd9b.jpg', 'data/custom']


'Completed 250.5 KiB/250.5 KiB (15.5 KiB/s) with 1 file(s) remaining\ndownload: s3://open-images-dataset/train/0908260ac3dccd9b.jpg to data\\custom\\0908260ac3dccd9b.jpg\n'

In [3]:
class OpenImagesUtility():
    def __init__(self, classes, src, dst, ann_format, limit, n):
        self.src = src
        self.dst = dst
        self.ann_format = ann_format
        self.limit = limit
        self.n = n
        
        self.class_desc = pd.read_csv("{}/class_descriptions.csv".format(self.src["meta"]), header=None)
        self.class_desc.columns = ["LabelName", "Description"]
        label_names = [self.class_desc[self.class_desc["Description"] == c]["LabelName"].values[0]
                       for c in classes]
        """
        select LabelName, Description
        from   class_desc
        where  LabelName in label_names
        """
        self.class_desc = (self.class_desc[self.class_desc["LabelName"].isin(label_names)][["LabelName", "Description"]]
                           .sort_values(["Description"]))
        self.class_desc = self.class_desc.reset_index(drop=True)
        print("Total Classes: {}".format(self.class_desc.shape[0]))
        
        train_anns = pd.read_csv("{}/train_annotations.csv".format(self.src["meta"]))
        """
        select *
        from   train_anns join class_desc
               on LabelName
        """
        train_anns = (pd.merge(train_anns, self.class_desc, on="LabelName")
                      .sort_values(["ImageID"]))
        print("Total Training Annotations: {}".format(train_anns.shape[0]))

        train_imgs = pd.read_csv("{}/train_images.csv".format(self.src["meta"]))
        """
        select *
        from   train_imgs join (select distinct ImageID from train_anns) 
               on ImageID
        """
        train_imgs = (pd.merge(train_imgs, pd.DataFrame(train_anns["ImageID"].unique(), columns=["ImageID"]), on="ImageID")
                      .sort_values(["ImageID"]))
        print("Total Training Images: {}".format(train_imgs.shape[0]))
        
        valid_anns = pd.read_csv("{}/valid_annotations.csv".format(self.src["meta"]))
        """
        select *
        from   valid_anns join class_desc
               on LabelName
        """
        valid_anns = (pd.merge(valid_anns, self.class_desc, on="LabelName")
                      .sort_values(["ImageID"]))
        print("Total Validation Annotations: {}".format(valid_anns.shape[0]))
        
        valid_imgs = pd.read_csv("{}/valid_images.csv".format(self.src["meta"]))
        valid_imgs["ImageID"] = valid_imgs["ImageID"].apply(lambda s: s[0:len(s) - 4])
        """
        select *
        from   valid_imgs join (select distinct ImageID from valid_anns) 
               on ImageID
        """
        valid_imgs = (pd.merge(valid_imgs, pd.DataFrame(valid_anns["ImageID"].unique(), columns=["ImageID"]), on="ImageID")
                      .sort_values(["ImageID"]))
        print("Total Validation Images: {}".format(valid_imgs.shape[0]))
        
        self.anns = {"train": train_anns, 
                     "valid": valid_anns}
        self.imgs = {"train": train_imgs, 
                     "valid": valid_imgs}
        
    def _to_yolo_format(self, label_name, x_min, x_max, y_min, y_max):
        """
        Converts annotations from PASCALVOC to YOLO format.
        """
        label_idx = self.get_label_idx(label_name)
        x_mid = (x_min + x_max) / 2
        y_mid = (y_min + y_max) / 2
        width = x_max - x_min
        height = y_max - y_min
        return "{} {} {} {} {}".format(label_idx, x_mid, y_mid, width, height)

    def _mk_anns_txt(self, anns, folder_path, img_id):
        """
        Create annotations .txt file in YOLO format.
        """
        with open("{}/{}.txt".format(folder_path, img_id), "w+") as file:    
            file.write("\n".join([self._to_yolo_format(r["LabelName"], r["XMin"], r["XMax"], r["YMin"], r["YMax"]) 
                                  for _, r in anns.iterrows()]))
            file.close()

    def _mk_anns_xml(self, anns, folder_path, img_id):
        """
        Create annotations .xml file in PASCALVOC format.
        """
        img = Image.open("{}/{}.jpg".format(folder_path, img_id))
        width, height = img.size
        writer = Writer(folder_path, width, height)
        for _, r in anns.iterrows():
            label_desc = self.get_label_desc(r["LabelName"])
            x_min = round(r["XMin"] * width)
            x_max = round(r["XMax"] * width)
            y_min = round(r["YMin"] * height)
            y_max = round(r["YMax"] * height)
            writer.addObject(label_desc, x_min, y_min, x_max, y_max)
        writer.save("{}/{}.xml".format(folder_path, img_id))

    def _mk_anns_file(self, anns, folder_path, img_id):
        """
        Create annotations file in specified format.
        """
        if self.ann_format == "YOLO":
            self._mk_anns_txt(anns, folder_path, img_id)
        elif self.ann_format == "PASCALVOC":
            self._mk_anns_xml(anns, folder_path, img_id)
        else:
            raise ValueError("Invalid annotation format")
    
    def _cp_with_anns(self, imgs, anns, mode, limit, n):
        """
        Copy image from source to destination and create corresponding annotations file in destination directory.
        
        Input(s):
        1. imgs (dataframe) - ImageIDs to copy
        2. anns (dataframe) - annotations for ImageIDs in imgs
        3. src (str) - source folder path
        4. limit (boolean) - copy n images if true, all otherwise
        5. n (int) - no. of images to copy, relevant when limit=True
        
        Output(s):
        1. paths (list) - paths to each copied image
        2. logs (list) - caught error messages
        """
        if not limit: 
            n = imgs.shape[0] 
        paths = []
        logs = []
        with tqdm(total=n) as pbar:
            for _, r in imgs.iterrows():
                if limit and len(paths) == n: 
                    break
                try:
                    cmd = "{}/{}/{}.jpg custom".format(s3, mode, r["ImageID"])
                    cmd = subprocess.run(cmd.split(), check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
                    # copyfile("{}/{}.jpg".format(src, r["ImageID"]), 
                    #          "{}/custom/{}.jpg".format(self.dst, r["ImageID"]))
                    self._mk_anns_file(anns[anns["ImageID"] == r["ImageID"]],
                                            "{}/custom".format(self.dst), 
                                            r["ImageID"])
                    paths.append("data/custom/{}.jpg".format(r["ImageID"]))
                except FileNotFoundError:
                    logs.append("{}.jpg not found".format(r["ImageID"]))
                pbar.update()
        return paths, logs
    
    def get_label_desc(self, label_name):
        return self.class_desc[self.class_desc["LabelName"] == label_name]["Description"].values[0]

    def get_label_idx(self, label_name):
        return self.class_desc[self.class_desc["LabelName"] == label_name].index[0]
    
    def plt_anns_dist(self, mode):
        """
        Displays a horizontal bar graph of the number of annotations per selected class.
        
        Input(s):
        1. mode (str) - either "train" or "valid" 
        """
        dist = (self.anns[mode]
                .groupby(["LabelName"])
                .size()
                .reset_index(name="Count")
                .sort_values(["Count"]))
        dist["LabelName"] = dist["LabelName"].apply(self.get_label_desc)
        plt.barh(y=dist["LabelName"], width=dist["Count"])
        plt.show()
        
    def mk_dst_dirs(self):
        """
        Create destination directories if not exists.
        """
        if not os.path.exists(self.dst): os.mkdir(self.dst)
        if not os.path.exists("{}/custom".format(self.dst)):
            os.mkdir("{}/custom".format(self.dst))
            
    def mk_yolo_metas(self):
        """
        Create necessary YOLO metadata files:
        1. .names - contains all class descriptions, 1 per line
        2. .data  - ?
        """
        with open("{}/custom.names".format(self.dst), "w+") as file:
            file.write("\n".join(self.class_desc["Description"].tolist()))
            file.close()
        with open("{}/custom.data".format(self.dst), "w+") as file:
            file.write("classes={}" \
                       "\ntrain=data/train.txt" \
                       "\nvalid=data/valid.txt" \
                       "\nnames=data/custom.names" \
                       "\nbackup=backup"
                       .format(self.class_desc.shape[0]))
            file.close()
        
    def prep(self, mode):
        paths, logs = self._cp_with_anns(self.imgs[mode], 
                                         self.anns[mode], 
                                         mode, 
                                         self.limit[mode], 
                                         self.n[mode])
        with open("{}/{}.txt".format(self.dst, mode), "w+") as txt:
            txt.write("\n".join(paths))
            txt.close()
        return logs
        
    def start(self):
        self.mk_dst_dirs()
        self.mk_yolo_metas()
        self.prep("train")
        self.prep("valid")

In [5]:
utility = OpenImagesUtility(classes, src, dst, ann_format, limit, n)

Total Classes: 1


MemoryError: 

In [None]:
utility.plt_anns_dist("train")

In [None]:
utility.plt_anns_dist("valid")

In [30]:
utility.start()

 45%|████▍     | 4457/10000 [14:33<16:02,  5.76it/s]


KeyboardInterrupt: 