# Extract specific annotation data from a larger (superset) original VIA JSON file, loaded below as COCO-format annotations

In [1]:
import json
import tqdm
from pycocotools.coco import COCO

## Load original dataset

In [None]:
# Open the full annotation file through the pycocotools COCO wrapper.
annotation_path = "annotations.json"
data = COCO(annotation_path)

# Create containers to hold the filtered data

In [None]:
# Containers filled in by the filtering loop.
anns, cats = [], []  # final annotations / unique categories
imgs = dict()        # image id -> index into the annotation list

# Loop through each image

In [None]:
# Keep only the images whose annotations consist of exactly one object of the
# target category, then write the filtered dataset to disk once the scan is
# complete.
#
# Results are accumulated in the containers created earlier:
#   anns - annotations of the retained images only
#   cats - one entry per retained category, keeping the dataset's own id so
#          it still matches the "category_id" stored in the annotations
#   imgs - image id -> index in `anns` of that image's first annotation
TARGET_CAT_ID = 2  # category id an image must carry to be retained

image_info = {}                        # image id -> image info record
seen_cat_ids = {c["id"] for c in cats}  # category ids already listed in `cats`

for iid in tqdm.tqdm(data.getImgIds()):
  # Get image info and annotations
  iminfo = data.loadImgs([iid])[0]
  img_anns = data.imgToAnns[iid]

  # Filter annotations by category - assume there is just one!
  # Comparing against a one-element list also rejects images with zero or
  # several annotations.
  catids = [a["category_id"] for a in img_anns]
  if catids != [TARGET_CAT_ID]:
    continue

  # Map current image ID to the index of its first annotation in `anns`;
  # only annotations of retained images are appended.
  imgs[iid] = len(anns)
  anns += img_anns
  image_info[iid] = iminfo

  # Record each category once instead of once per retained image.
  for cat in data.loadCats(catids):
    if cat["id"] not in seen_cat_ids:
      seen_cat_ids.add(cat["id"])
      cats.append({"supercategory": "none", "id": cat["id"], "name": cat["name"]})

# Write filtered results to new files (once, after all images were scanned).
final_anns = [a for a in anns if a['iscrowd'] == 0]
ordered_ids = sorted(image_info)

# NOTE(review): as in the original code, the train file holds every retained
# image and the val file holds the same list minus the first five (by image
# id), so the two splits overlap - confirm the intended split.
splits = {
  'cars_train.json': ordered_ids,
  'cars_val.json': ordered_ids[5:],
}

for out_path, split_ids in splits.items():
  split_id_set = set(split_ids)
  with open(out_path, 'w') as f:
    json.dump(
      {
        "categories": cats,
        # Only annotations whose image is part of this file.
        "annotations": [a for a in final_anns if a['image_id'] in split_id_set],
        "images": [image_info[i] for i in split_ids],
      },
      f,
    )

# Copy the images selected by the filtered annotations from one folder to another

In [None]:
import os
import shutil

os.makedirs("cars", exist_ok=True)   # change the 'cars' folder name as per your requirement

# Copy relevant images to subfolder.
# `tqdm` is imported as a module, so the progress-bar wrapper is `tqdm.tqdm`;
# calling the module itself raises TypeError.
for iid in tqdm.tqdm(imgs):
  fname = "%d.jpg" % iid  # assumes images are stored as "<image id>.jpg" - TODO confirm
  srcfile = os.path.join("/path/to/original/images/", fname)
  destfile = os.path.join("cars", fname)
  shutil.copyfile(srcfile, destfile)