### INSTRUCTIONS

This module extracts images from the Open Images Dataset V4 that contain selected classes and generates a corresponding .txt or .xml file for each image as per the YOLO and PASCAL VOC formats respectively.

You will need these packages:
    1. pandas via "pip install pandas"
    2. pascal_voc_writer via "pip install pascal-voc-writer"
    3. pillow via "pip install pillow"
    
You will require the following meta files, all located together in the directory given by `meta_source`:
    1. class_descriptions.csv
    2. train_annotations.csv
    3. train_images_with_rotation.csv
    
Subsequently, specify the source and output paths accordingly, in the variables:
    1. imgs_source
    2. imgs_out
    3. meta_source
    4. meta_out

Finally, indicate the output annotation format in the variable `annotation_format`, as either:
    1. "YOLO" or 
    2. "PASCAL VOC"

Your selected classes are stored in the variable `classes` as a list of class descriptions (case-sensitive; each must match a description in class_descriptions.csv exactly) - add to or remove from it as needed.

### CHANGE THESE VARIABLES

In [None]:
# Directory holding the downloaded Open Images .jpg files to pick from.
imgs_source = "/media/tingyu/WD BLUE/open_images/all/train_1"
# Directory that receives the copied images and their annotation files.
imgs_out = "/media/tingyu/WD BLUE/open_images/subset/train"

# Directory containing the three meta .csv files (class descriptions,
# annotations, image list).
meta_source = "/media/tingyu/WD BLUE/open_images/all"
# Directory that receives the generated subset.names file.
meta_out = "/media/tingyu/WD BLUE/open_images/subset"

# Either "YOLO" (one .txt per image) or "PASCAL VOC" (one .xml per image);
# any other value raises ValueError when annotations are written.
annotation_format = "PASCAL VOC"

# Class descriptions to extract (case-sensitive). Entries that are commented
# out are excluded from the subset.
classes = [
    "Person",
    # "Man",
    # "Woman",
    "Human face",
    "Hat",
    # "Glasses",
    "Sunglasses",
    "Knife",
    "Handgun",
    "Rifle",
    "Weapon",
    # "Shirt",
    "Jacket",
    "Shorts",
    "Jeans",
    "Skirt",
]

### AVOID EDITING CELLS BELOW

In [None]:
from pascal_voc_writer import Writer
from PIL import Image
from shutil import copyfile
import os
import pandas as pd

In [None]:
# Load the LabelName <-> Description lookup table; the CSV has no header row,
# so the column names are assigned by hand.
class_descriptions = pd.read_csv("{}/class_descriptions.csv".format(meta_source), header=None)
class_descriptions.columns = ["LabelName", "Description"]

# Fail early with a readable message that names every requested class without
# an exact (case-sensitive) match, instead of a bare IndexError on the first.
known_descriptions = set(class_descriptions["Description"])
missing_classes = [c for c in classes if c not in known_descriptions]
if missing_classes:
    raise ValueError(
        "Unknown class description(s) (names are case-sensitive): {}".format(
            ", ".join(missing_classes)
        )
    )

# Translate each selected description into its LabelName (first match wins).
label_names = [
    class_descriptions.loc[class_descriptions["Description"] == c, "LabelName"].values[0]
    for c in classes
]

# SQL equivalent of the filter below:
#   select LabelName, Description
#   from   class_descriptions
#   where  LabelName in label_names
class_descriptions = (
    class_descriptions[class_descriptions["LabelName"].isin(label_names)][["LabelName", "Description"]]
    .sort_values(["Description"])
)
# After sorting, renumber rows 0..n-1 so the row index can serve as the class
# index of each label.
class_descriptions = class_descriptions.reset_index(drop=True)

In [None]:
# Load every training bounding box, then keep only the rows whose LabelName
# appears in class_descriptions (inner join), ordered by image.
# SQL equivalent:
#   select *
#   from   train_annotations join class_descriptions
#          on LabelName
train_annotations = pd.read_csv("{}/train_annotations.csv".format(meta_source))
train_annotations = train_annotations.merge(class_descriptions, on="LabelName")
train_annotations = train_annotations.sort_values("ImageID")
print(train_annotations.shape)

In [None]:
# Load the image list and keep only images that have at least one annotation
# left in train_annotations, ordered by image id.
# SQL equivalent:
#   select *
#   from   train_images join (select distinct ImageID from train_annotations)
#          on ImageID
train_images = pd.read_csv("{}/train_images_with_rotation.csv".format(meta_source))
annotated_ids = train_annotations[["ImageID"]].drop_duplicates()
train_images = train_images.merge(annotated_ids, on="ImageID").sort_values("ImageID")
print(train_images.shape)

In [None]:
def to_yolo_format(label_name, x_min, x_max, y_min, y_max):
    """Build one YOLO annotation line for a bounding box.

    The class index is the row index of ``label_name`` in the module-level
    ``class_descriptions`` frame. The box is expressed as its centre point
    followed by its width and height, computed directly from the given
    min/max coordinates.

    Returns:
        The string ``"<index> <x_centre> <y_centre> <width> <height>"``.

    Raises:
        IndexError: if ``label_name`` is not present in ``class_descriptions``.
    """
    matching_rows = class_descriptions.loc[class_descriptions["LabelName"] == label_name]
    class_index = matching_rows.index[0]

    box_width = x_max - x_min
    box_height = y_max - y_min
    centre_x = (x_min + x_max) / 2
    centre_y = (y_min + y_max) / 2

    fields = (class_index, centre_x, centre_y, box_width, box_height)
    return " ".join("{}".format(value) for value in fields)

In [None]:
def create_annotations_file(folder_path, image_id):
    """Write the annotation file for one image into ``folder_path``.

    The boxes are the rows of the module-level ``train_annotations`` frame
    whose ``ImageID`` equals ``image_id``. The output depends on the
    module-level ``annotation_format``:

    * ``"YOLO"``: ``<image_id>.txt`` with one ``to_yolo_format`` line per box.
    * ``"PASCAL VOC"``: ``<image_id>.xml`` written with ``pascal_voc_writer``;
      ``<image_id>.jpg`` must already exist in ``folder_path`` because its
      pixel size is read from the file.

    Args:
        folder_path: Directory that holds the image and receives the annotation.
        image_id: Open Images image id (file name without extension).

    Raises:
        ValueError: if ``annotation_format`` is neither of the supported values.
        FileNotFoundError: if the .jpg is missing in PASCAL VOC mode.
    """
    annotations = train_annotations[train_annotations["ImageID"] == image_id]
    if annotation_format == "YOLO":
        lines = [
            to_yolo_format(r["LabelName"], r["XMin"], r["XMax"], r["YMin"], r["YMax"])
            for _, r in annotations.iterrows()
        ]
        # The with-block closes the file; no explicit close() is needed.
        with open("{}/{}.txt".format(folder_path, image_id), "w") as txt_file:
            txt_file.write("\n".join(lines))
    elif annotation_format == "PASCAL VOC":
        img_path = "{}/{}.jpg".format(folder_path, image_id)
        # Only the dimensions are needed, so release the file handle right away.
        with Image.open(img_path) as img:
            width, height = img.size
        writer = Writer(img_path, width, height)
        for _, r in annotations.iterrows():
            # Open Images stores box coordinates normalised to [0, 1], whereas
            # PASCAL VOC expects pixels, so scale by the image size. Note that
            # addObject takes (name, xmin, ymin, xmax, ymax) in that order.
            writer.addObject(
                r["LabelName"],
                int(round(r["XMin"] * width)),
                int(round(r["YMin"] * height)),
                int(round(r["XMax"] * width)),
                int(round(r["YMax"] * height)),
            )
        writer.save("{}/{}.xml".format(folder_path, image_id))
    else:
        raise ValueError("Invalid annotation format")

In [None]:
# Create the output directories, including any missing parents; directories
# that already exist are left untouched.
os.makedirs(meta_out, exist_ok=True)
os.makedirs(imgs_out, exist_ok=True)

# One class description per line, in class_descriptions row order.
with open("{}/subset.names".format(meta_out), "w") as names_file:
    names_file.write("\n".join(class_descriptions["Description"].tolist()))

# Copy each selected image and write its annotation file next to the copy.
for image_id in train_images["ImageID"]:
    try:
        copyfile("{}/{}.jpg".format(imgs_source, image_id), "{}/{}.jpg".format(imgs_out, image_id))
    except FileNotFoundError:
        # imgs_source may hold only part of the listed images; skip the rest.
        print("Image {}.jpg not found, skipping".format(image_id))
        continue
    # Kept outside the try so an annotation failure is never misreported as a
    # missing source image.
    create_annotations_file(imgs_out, image_id)