In [1]:
import os
import json
import shutil
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import warnings
warnings.filterwarnings("ignore")

In [2]:
def extract_categories(annot_fpath, categories):
    f = open(annot_fpath)
    body = json.load(f)

    # iterate over all categories in the dataset and get
    # detailed info of the those listed in cat_names
    cat_id = []
    cat_subset = []
    for c in body["categories"]:
        if c["name"] in categories:
            cat_id.append(c["id"])
            cat_subset.append(c)

    # iterate over all annotations in the dataset 
    # and get images and annotations for the categories
    # of interest
    annot_subset = []
    img_id_list = []
    for annot in tqdm(body["annotations"]):
        if annot['category_id'] in cat_id:
            annot_subset.append(annot)
            img_id_list.append(annot["image_id"])

    img_id_list = list(np.unique(img_id_list))

    # iterate over all images in the dataset 
    # and get images labeled with categories
    # of interest
    img_subset = []
    for img in body["images"]:
        if img["id"] in img_id_list:
            img_subset.append(img)
    return cat_subset, annot_subset, img_subset

In [3]:
# consistent categories
cat_dict = {
    'helmet': 'helmet',
    'Helmet': 'helmet',
    'head_helmet': 'helmet',
    'Hardhat': 'helmet',
    
    'head_nohelmet': 'head_nohelmet',
    'head': 'head_nohelmet',
    'NO-Hardhat': 'head_nohelmet',
    'Head': 'head_nohelmet',

    'vest': 'vest',
    'Vest': 'vest',
    'Safety Vest': 'vest',
    
    # 'NO-Safety Vest': 'no_vest',
    # 'chest': 'no_vest',
    
    'Person': 'person',
    'person': 'person',
}

id_dict = {
    'head_nohelmet': 1,
    'helmet': 2,
    'person': 3,
    'vest': 4,
    # 'no_vest': 5
}

In [4]:
# ppe_v3
in_fpath1 = "ppe_v3/ppe_v3.v5i.coco/train/_annotations.coco.json"

# ppe8-0mhax
in_fpath2 = "ppe8-0mhax/ppe8.v1-raw.coco/train/_annotations.coco.json"
in_fpath3 = "ppe8-0mhax/ppe8.v1-raw.coco/valid/_annotations.coco.json"

# personal-protective-equipment-combined-model
in_fpath4 = "personal-protective-equipment-combined-model/Personal Protective Equipment - Combined Model.v4-resize640_allclasses_noaugs.coco/train/_annotations.coco.json"
in_fpath5 = "personal-protective-equipment-combined-model/Personal Protective Equipment - Combined Model.v4-resize640_allclasses_noaugs.coco/valid/_annotations.coco.json"
in_fpath6 = "personal-protective-equipment-combined-model/Personal Protective Equipment - Combined Model.v4-resize640_allclasses_noaugs.coco/test/_annotations.coco.json"

# Construction PPEs Computer Vision Project
in_fpath7 = "construction-ppes/Construction PPEs.v6i.coco/train/_annotations.coco.json"
in_fpath8 = "construction-ppes/Construction PPEs.v6i.coco/valid/_annotations.coco.json"
in_fpath9 = "construction-ppes/Construction PPEs.v6i.coco/test/_annotations.coco.json"

# Hard Hat Workers
hhw_fpath1 = "hard-hat-workers/Hard Hat Workers.v2-raw.coco/train/_annotations.coco.json"
hhw_fpath2 = "hard-hat-workers/Hard Hat Workers.v2-raw.coco/test/_annotations.coco.json"

In [5]:
# ppe_v3, ppe8-0mhax
cat_names1 = [
    'head_helmet',
    'head_nohelmet',
    'person',
    'vest'
]


# personal-protective-equipment-combined-model
cat_names2 = [
    "Hardhat",
    "NO-Hardhat",
    # "NO-Safety Vest",
    "Person",
    "Safety Vest"
]

# Construction PPEs Computer Vision Project
cat_names3 = [
    "Helmet",
    "Person",
    "Vest",
    # "chest",
    "head"
]

# Hard Hat Workers
cat_names5 = ['head', 'helmet', 'person']

## Execution section

In [6]:
# extract original annotations and images from the categories listed in cat_names1, cat_names2 or cat_names3
ppev3_cat, ppev3_annot, ppev3_img = extract_categories(annot_fpath=in_fpath1, categories=cat_names1)

ppe8_cat, ppe8_annot1, ppe8_img1 = extract_categories(annot_fpath=in_fpath2, categories=cat_names1)
_, ppe8_annot2, ppe8_img2 = extract_categories(annot_fpath=in_fpath3, categories=cat_names1)

ppecm_cat, ppecm_annot1, ppecm_img1 = extract_categories(annot_fpath=in_fpath4, categories=cat_names2)
_, ppecm_annot2, ppecm_img2 = extract_categories(annot_fpath=in_fpath5, categories=cat_names2)
_, ppecm_annot3, ppecm_img3 = extract_categories(annot_fpath=in_fpath6, categories=cat_names2)

cppecvp_cat, cppecvp_annot1, cppecvp_img1 = extract_categories(annot_fpath=in_fpath7, categories=cat_names3)
_, cppecvp_annot2, cppecvp_img2 = extract_categories(annot_fpath=in_fpath8, categories=cat_names3)
_, cppecvp_annot3, cppecvp_img3 = extract_categories(annot_fpath=in_fpath9, categories=cat_names3)

hhw_cat, hhw_annot1, hhw_img1 = extract_categories(annot_fpath=hhw_fpath1, categories=cat_names5)
_, hhw_annot2, hhw_img2 = extract_categories(annot_fpath=hhw_fpath2, categories=cat_names5)

  0%|          | 0/19660 [00:00<?, ?it/s]

  0%|          | 0/869 [00:00<?, ?it/s]

  0%|          | 0/426 [00:00<?, ?it/s]

  0%|          | 0/76398 [00:00<?, ?it/s]

  0%|          | 0/22077 [00:00<?, ?it/s]

  0%|          | 0/11086 [00:00<?, ?it/s]

  0%|          | 0/92136 [00:00<?, ?it/s]

  0%|          | 0/25727 [00:00<?, ?it/s]

  0%|          | 0/13647 [00:00<?, ?it/s]

  0%|          | 0/20231 [00:00<?, ?it/s]

  0%|          | 0/6808 [00:00<?, ?it/s]

In [7]:
# assigning new IDs to photos

ppev3_img_df = pd.DataFrame(ppev3_img)

ppe8_img = ppe8_img1 + ppe8_img2
ppe8_img_df = pd.DataFrame(ppe8_img)

ppecm_img = ppecm_img1 + ppecm_img2 + ppecm_img3
ppecm_img_df = pd.DataFrame(ppecm_img)

cppecvp_img = cppecvp_img1 + cppecvp_img2 + cppecvp_img3
cppecvp_img_df = pd.DataFrame(cppecvp_img)

hhw_img = hhw_img1 + hhw_img2
hhw_img_df = pd.DataFrame(hhw_img)

ppev3_img_df["dataset"] = "ppe_v3"
ppe8_img_df["dataset"] = "ppe8-0mhax"
ppecm_img_df["dataset"] = "personal-protective-equipment-combined-model"
cppecvp_img_df["dataset"] = "construction-ppes"
hhw_img_df["dataset"] = "hard-hat-workers"

orig_img_df = pd.concat([
    ppev3_img_df, ppe8_img_df, ppecm_img_df, cppecvp_img_df, hhw_img_df
]).reset_index(drop=True).rename(columns={"id": "orig_id"})


new_id = 0
img_df = pd.DataFrame()
for dataset in tqdm(orig_img_df["dataset"].unique()):
    for orig_id in tqdm(orig_img_df.loc[orig_img_df["dataset"]==dataset]["orig_id"].unique()):
        tmp = orig_img_df.loc[(orig_img_df["dataset"]==dataset) & (orig_img_df["orig_id"]==orig_id)]
        tmp["id"] = new_id
        img_df = pd.concat([img_df, tmp])
        
        new_id += 1

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/2064 [00:00<?, ?it/s]

  0%|          | 0/73 [00:00<?, ?it/s]

  0%|          | 0/19078 [00:00<?, ?it/s]

  0%|          | 0/5404 [00:00<?, ?it/s]

  0%|          | 0/5269 [00:00<?, ?it/s]

In [8]:
# assigning new IDs to annotations

ppev3_annot_df = pd.DataFrame(ppev3_annot)

ppe8_annot = ppe8_annot1 + ppe8_annot2
ppe8_annot_df = pd.DataFrame(ppe8_annot)

ppecm_annot = ppecm_annot1 + ppecm_annot2 + ppecm_annot3
ppecm_annot_df = pd.DataFrame(ppecm_annot)

cppecvp_annot = cppecvp_annot1 + cppecvp_annot2 + cppecvp_annot3
cppecvp_annot_df = pd.DataFrame(cppecvp_annot)

hhw_annot = hhw_annot1 + hhw_annot2
hhw_annot_df = pd.DataFrame(hhw_annot)

ppev3_annot_df["dataset"] = "ppe_v3"
ppe8_annot_df["dataset"] = "ppe8-0mhax"
ppecm_annot_df["dataset"] = "personal-protective-equipment-combined-model"
cppecvp_annot_df["dataset"] = "construction-ppes"
hhw_annot_df["dataset"] = "hard-hat-workers"

orig_annot_df = pd.concat([
    ppev3_annot_df, ppe8_annot_df, ppecm_annot_df, cppecvp_annot_df, hhw_annot_df
]).reset_index(drop=True).rename(columns={"id": "orig_id"})


new_id = 0
annot_df = pd.DataFrame()
for dataset in tqdm(orig_annot_df["dataset"].unique()):
    for orig_id in tqdm(orig_annot_df.loc[orig_annot_df["dataset"]==dataset]["orig_id"].unique()):
        tmp = orig_annot_df.loc[(orig_annot_df["dataset"]==dataset) & (orig_annot_df["orig_id"]==orig_id)]
        tmp["id"] = new_id
        annot_df = pd.concat([annot_df, tmp])
        
        new_id += 1

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/11869 [00:00<?, ?it/s]

  0%|          | 0/517 [00:00<?, ?it/s]

  0%|          | 0/50739 [00:00<?, ?it/s]

  0%|          | 0/62809 [00:00<?, ?it/s]

  0%|          | 0/20231 [00:00<?, ?it/s]

In [9]:
# mapping category and photo ids in annotations

ppev3_cat_df = pd.DataFrame(ppev3_cat)
ppe8_cat_df = pd.DataFrame(ppe8_cat)
ppecm_cat_df = pd.DataFrame(ppecm_cat)
cppecvp_cat_df = pd.DataFrame(cppecvp_cat)
hhw_cat_df = pd.DataFrame(hhw_cat)

ppev3_cat_df["dataset"] = "ppe_v3"
ppe8_cat_df["dataset"] = "ppe8-0mhax"
ppecm_cat_df["dataset"] = "personal-protective-equipment-combined-model"
cppecvp_cat_df["dataset"] = "construction-ppes"
hhw_cat_df["dataset"] = "hard-hat-workers"

orig_cat_df = pd.concat([
    ppev3_cat_df, ppe8_cat_df, ppecm_cat_df, cppecvp_cat_df, hhw_cat_df
]).reset_index(drop=True).rename(columns={"id": "orig_id"})

annot_df = pd.DataFrame(annot_df).rename(columns={"image_id": "orig_image_id", "category_id": "orig_category_id"})
result = pd.DataFrame()

for dataset in tqdm(orig_cat_df["dataset"].unique()):
    tmp = orig_cat_df.loc[orig_cat_df["dataset"] == dataset]
    orig_dict = pd.Series(tmp["name"].values, index=tmp["orig_id"]).to_dict()
    print(orig_dict)
    
    tmp_annot_df = annot_df.loc[annot_df["dataset"] == dataset]
    tmp_annot_df["orig_category_name"] = tmp_annot_df["orig_category_id"].replace(orig_dict)

    # create and join column with new category name
    tmp_annot_df["category_name"] = tmp_annot_df["orig_category_name"].replace(cat_dict)
    # create and join column with new category id
    tmp_annot_df["category_id"] = tmp_annot_df["category_name"].replace(id_dict)
    
    res = pd.merge(
        tmp_annot_df,
        img_df[["orig_id", "id", "dataset"]].rename(columns={"orig_id": "orig_image_id", "id": "image_id"}).drop_duplicates(),
        how="inner",
        left_on=["dataset", "orig_image_id"],
        right_on=["dataset", "orig_image_id"]
    )
    
    assert tmp_annot_df.shape[0] == res.shape[0]
    result = pd.concat([result, res])
    
assert annot_df.shape[0] == result.shape[0]
final_annot_df = result[['id', 'image_id', 'category_id', 'bbox', 'area', 'segmentation', 'iscrowd']]

  0%|          | 0/5 [00:00<?, ?it/s]

{8: 'head_helmet', 9: 'head_nohelmet', 10: 'person', 12: 'vest'}
{8: 'head_helmet', 9: 'head_nohelmet', 10: 'person', 12: 'vest'}
{4: 'Hardhat', 9: 'NO-Hardhat', 12: 'Person', 14: 'Safety Vest'}
{3: 'Helmet', 5: 'Person', 8: 'Vest', 12: 'head'}
{1: 'head', 2: 'helmet', 3: 'person'}


In [11]:
final_annot_df.to_csv("annotations_all_melt.csv", index=False)

In [13]:
# copy *.jpg files to custom folder
for dataset in tqdm(result["dataset"].unique()):
    for fname in tqdm(img_df.loc[img_df["dataset"] == dataset]["file_name"].unique()):
        src = os.path.join("all-images", dataset, fname)
        dst = os.path.join("ppes-custom-dataset", "all", fname)
        shutil.copy(src, dst)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/2064 [00:00<?, ?it/s]

  0%|          | 0/131 [00:00<?, ?it/s]

  0%|          | 0/23455 [00:00<?, ?it/s]

  0%|          | 0/7133 [00:00<?, ?it/s]

  0%|          | 0/7035 [00:00<?, ?it/s]

In [16]:
# create annotations.json

final_annot = [dict(v) for _, v in final_annot_df.iterrows()]

final_img_df = img_df[['id', 'license', 'file_name', 'height', 'width', 'date_captured']]
final_img = [dict(v) for _, v in final_img_df.iterrows()]

final_cat = []
for name, _id in id_dict.items():
    final_cat.append({
        "id": _id,
        "name": name,
        "supercategory": "Workers"
    })
    
body = {}

body['info'] = {
    'year': '2020',
    'version': '2',
    'description': 'Exported from roboflow.ai',
    'contributor': 'Northeastern University - China',
    'url': 'https://public.roboflow.ai/object-detection/hard-hat-workers',
    'date_created': '2020-04-30T03:26:22+00:00'
}
body['licenses'] = [{
    'id': 1,
    'url': 'https://creativecommons.org/publicdomain/zero/1.0/',
    'name': 'Public Domain'
}]
body['categories'] = final_cat
body['images'] = final_img
body['annotations'] = final_annot

with open(os.path.join("ppes-custom-dataset", "all", '_annotations.coco.json'), 'w') as f:
    json.dump(body, f)