This notebook is the COCO-dataset version of the previously uploaded PASCAL VOC pipeline.

# Dataset Loading 

In [11]:
import os, json, random
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot

from pycocotools.coco import COCO

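The dataset must be downloaded manually before running this notebook (another notebook in the repository should provide the download function/cell for this step). The cells below expect the annotations at `COCO_DATASET/annotations/instances_train2017.json` and the images under `COCO_DATASET/images/train2017`.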

In [12]:
# --- Configuration ---------------------------------------------------------
# Category names retained in the subset.
TARGET_CLASSES = ["person", "car", "bicycle", "dog", "chair"]

# Upper bound on the number of annotated instances sampled per category;
# every target class gets the same budget.
MAX_INSTANCES_PER_CLASS = {c_name: 2500 for c_name in TARGET_CLASSES}

# Locations inside the dataset folder.
DATASET = 'COCO_DATASET'
ANN_FILE = os.path.join(DATASET, 'annotations/instances_train2017.json')
IMG_FILE = os.path.join(DATASET, 'images/train2017')

# Seed the stdlib RNG so random.shuffle gives the same order on every run.
random.seed(42)

In [13]:
# Load the annotation file at ANN_FILE and let pycocotools build its index.
coco = COCO(annotation_file=ANN_FILE)

loading annotations into memory...
Done (t=16.73s)
creating index...
index created!


## Subsample The Dataset

In [14]:
# Accumulators filled by the sampling loop.
class_instance_counter = dict.fromkeys(TARGET_CLASSES, 0)
filtered_img_ids = set()
filtered_anns = []

# Resolve the target category names to their COCO category records.
cat_ids = coco.getCatIds(catNms=TARGET_CLASSES)
cats = coco.loadCats(cat_ids)

# Lookup tables in both directions: name -> id and id -> name.
c_name_to_c_id = {entry["name"]: entry["id"] for entry in cats}
c_id_to_c_name = {entry["id"]: entry["name"] for entry in cats}

In [15]:
# For every target class, walk its images in shuffled order and collect
# non-crowd annotations of that class until the per-class budget is spent.
for c_name in TARGET_CLASSES:
    c_id = c_name_to_c_id[c_name]
    budget = MAX_INSTANCES_PER_CLASS[c_name]
    img_ids = coco.getImgIds(catIds=[c_id])
    random.shuffle(img_ids)

    for img_id in img_ids:
        remaining = budget - class_instance_counter[c_name]
        if remaining <= 0:
            break

        ann_ids = coco.getAnnIds(imgIds=[img_id], catIds=[c_id], iscrowd=0)
        # Take only as many annotations from this image as the budget still
        # allows, instead of testing the cap once per annotation.
        anns = coco.loadAnns(ann_ids)[:remaining]
        if not anns:
            continue

        for ann in anns:
            box = ann["bbox"]
            filtered_anns.append({
                "id": ann["id"],
                "image_id": ann["image_id"],
                "category_id": ann["category_id"],
                "bbox": box,
                # Keep "area" and "iscrowd": pycocotools reads them when
                # getAnnIds() is called with areaRng= / iscrowd=, so without
                # them such a query on the written subset raises KeyError.
                "area": ann.get("area", box[2] * box[3]),
                "iscrowd": ann.get("iscrowd", 0),
            })
        filtered_img_ids.add(img_id)
        class_instance_counter[c_name] += len(anns)

# Image records for every image that contributed at least one annotation.
filtered_imgs = coco.loadImgs(list(filtered_img_ids))
filtered_cat = cats

print(f"Total instances collected: {len(filtered_anns)}")
print(f"Total unique images used: {len(filtered_imgs)}")

Total instances collected: 12500
Total unique images used: 5134


In [16]:
# Write the sampled subset to disk as a COCO-style annotation file.
# The path is built with os.path.join so it resolves to the same file on
# every OS; a hard-coded backslash path is only a directory path on Windows.
ANN_OUT = os.path.join(DATASET, 'annotations', 'instances_subset_mini_train2017.json')

subset_ann = {
    "info": coco.dataset.get("info", {}),
    # The COCO format names this top-level section "licenses" (plural).
    "licenses": coco.dataset.get("licenses", []),
    "images": filtered_imgs,
    "annotations": filtered_anns,
    "categories": filtered_cat
}

with open(ANN_OUT, "w") as f:
    json.dump(subset_ann, f)

## Basic EDA

In [33]:
from pycocotools.coco import COCO
import pandas as pd
import matplotlib.pyplot as plt
# Point `coco` at the subset annotation file; from here on it no longer
# refers to the annotation file loaded earlier from ANN_FILE.
ANN_SUB = r'COCO_DATASET/annotations/instances_subset_mini_train2017.json'
coco = COCO(annotation_file=ANN_SUB)

loading annotations into memory...
Done (t=0.03s)
creating index...
index created!


In [34]:
# One record per annotation: box corners, box size, and size relative to the
# image the box belongs to.
rows = []
for ann_rec in coco.loadAnns(coco.getAnnIds()):
    img_rec = coco.loadImgs(ann_rec["image_id"])[0]
    cat_rec = coco.loadCats([ann_rec["category_id"]])[0]

    # The bbox is unpacked as (x, y, width, height).
    left, top, bw, bh = ann_rec["bbox"]
    area = bw * bh
    image_area = img_rec["width"] * img_rec["height"]

    # Boxes with non-positive height get aspect ratio 0 rather than dividing by zero.
    if bh > 0:
        ratio = bw / bh
    else:
        ratio = 0

    record = {
        "image_id": ann_rec["image_id"],
        "class": cat_rec["name"],
        "xmin": left,
        "ymin": top,
        "xmax": left + bw,
        "ymax": top + bh,
        "box_width": bw,
        "box_height": bh,
        "box_area": area,
        "relative_area": area / image_area,
        "aspect_ratio": ratio,
    }
    rows.append(record)

In [35]:
# One DataFrame row per annotation record collected in `rows`.
df = pd.DataFrame(data=rows)

In [36]:
# Peek at the first ten annotation rows.
df.head(n=10)

Unnamed: 0,image_id,class,xmin,ymin,xmax,ymax,box_width,box_height,box_area,relative_area,aspect_ratio
0,271560,person,96.57,264.61,222.33,409.05,125.76,144.44,18164.7744,0.101197,0.870673
1,237745,person,60.86,114.94,378.13,514.97,317.27,400.03,126917.5181,0.432988,0.793116
2,219488,person,237.83,70.3,403.08,151.49,165.25,81.19,13416.6475,0.04921,2.035349
3,437732,person,134.01,151.46,299.55,455.52,165.54,304.06,50334.0924,0.302307,0.544432
4,270799,person,385.13,218.25,403.77,264.95,18.64,46.7,870.488,0.003231,0.399143
5,270799,person,351.69,193.8,371.09,235.04,19.4,41.24,800.056,0.002969,0.470417
6,373249,person,203.68,2.27,296.43,221.51,92.75,219.24,20334.51,0.200585,0.423052
7,373249,person,280.11,55.89,352.0,284.71,71.89,228.82,16449.8698,0.162266,0.314177
8,373249,person,0.79,7.21,76.06,222.47,75.27,215.26,16202.6202,0.159827,0.34967
9,373249,person,293.1,31.58,332.76,97.83,39.66,66.25,2627.475,0.025918,0.598642


In [37]:
# Number of annotation rows in the frame.
print(df.shape[0])

12500


In [38]:
# Dataset-wide counts; the per-image object tally is computed once and reused.
per_image_counts = df.groupby("image_id").size()
n_images = df["image_id"].nunique()
n_objects = len(df)

global_stats = pd.DataFrame([{
    "num_images": n_images,
    "num_objects": n_objects,
    "avg_objects_per_image": round(n_objects / n_images, 2),
    "min_objects_per_image": per_image_counts.min(),
    "max_objects_per_image": per_image_counts.max()
}])

global_stats

Unnamed: 0,num_images,num_objects,avg_objects_per_image,min_objects_per_image,max_objects_per_image
0,5134,12500,2.43,1,26


In [39]:
# Number of annotated objects per class, largest first.
class_sizes = df.groupby("class").size()
object_per_class = class_sizes.reset_index(name="num_objects")
object_per_class = object_per_class.sort_values("num_objects", ascending=False)

object_per_class

Unnamed: 0,class,num_objects
0,bicycle,2500
1,car,2500
2,chair,2500
3,dog,2500
4,person,2500


In [40]:
# Number of distinct images in which each class appears, largest first.
distinct_images = df.groupby("class")["image_id"].nunique()
images_per_class = distinct_images.reset_index(name="num_images")
images_per_class = images_per_class.sort_values("num_images", ascending=False)

images_per_class

Unnamed: 0,class,num_images
3,dog,1962
0,bicycle,1168
2,chair,829
1,car,706
4,person,583


In [41]:
# Join the per-class object and image counts, then derive the mean number of
# objects of a class per image containing that class.
class_summary = object_per_class.merge(images_per_class, on="class")

objects_to_images = class_summary["num_objects"] / class_summary["num_images"]
class_summary["avg_objects_per_image"] = objects_to_images.round(2)

class_summary

Unnamed: 0,class,num_objects,num_images,avg_objects_per_image
0,bicycle,2500,1168,2.14
1,car,2500,706,3.54
2,chair,2500,829,3.02
3,dog,2500,1962,1.27
4,person,2500,583,4.29


In [42]:
# Per-class summary of the box area (box_width * box_height).
area_by_class = df.groupby("class")["box_area"]
area_summary = area_by_class.agg(["min", "mean", "median", "max"])
bbox_area_stats = area_summary.reset_index().round(2)

bbox_area_stats

Unnamed: 0,class,min,mean,median,max
0,bicycle,15.62,17132.2,3987.76,337100.8
1,car,8.59,7353.24,1079.9,307200.0
2,chair,7.42,11137.74,3074.18,322296.01
3,dog,14.39,48498.85,23548.56,352803.77
4,person,3.37,19428.01,3363.4,333642.99


In [43]:
# Per-class summary of the box area expressed as a fraction of the image area.
rel_area_by_class = df.groupby("class")["relative_area"]
rel_area_summary = rel_area_by_class.agg(["min", "mean", "median", "max"])
relative_area_stats = rel_area_summary.reset_index().round(4)

relative_area_stats

Unnamed: 0,class,min,mean,median,max
0,bicycle,0.0001,0.0616,0.015,1.0
1,car,0.0001,0.0263,0.004,1.0
2,chair,0.0,0.04,0.0112,0.9922
3,dog,0.0001,0.1773,0.0899,0.998
4,person,0.0,0.0719,0.0127,0.9797


In [49]:
# Object count per image, followed by descriptive statistics of that table.
image_sizes = df.groupby("image_id").size()
objects_per_image = image_sizes.reset_index(name="num_objects")
objects_per_image.describe().round(2)

Unnamed: 0,image_id,num_objects
count,5134.0,5134.0
mean,290789.7,2.43
std,167888.7,2.76
min,74.0,1.0
25%,144718.5,1.0
50%,290852.5,1.0
75%,437115.0,3.0
max,581906.0,26.0


In [45]:
# Bucket boxes by area with thresholds at 32**2 and 96**2 pixels.
# pd.cut bins are right-closed, so without include_lowest the first bin is
# (0, 1024] and a zero-area box would get NaN and drop out of the size
# distribution; include_lowest=True closes the first bin on the left too.
df["size_category"] = pd.cut(
    df["box_area"],
    bins=[0, 32**2, 96**2, float("inf")],
    labels=["small", "medium", "large"],
    include_lowest=True
)

In [46]:
# Count boxes per (class, size bucket).
# "size_category" is categorical; pandas emits a FutureWarning when grouping
# by a categorical without an explicit `observed=`. observed=False pins the
# current behaviour: every class/size combination is listed, even if empty.
size_distribution = (
    df.groupby(["class", "size_category"], observed=False)
        .size()
        .reset_index(name="count")
)
size_distribution

  df.groupby(["class", "size_category"])


Unnamed: 0,class,size_category,count
0,bicycle,small,654
1,bicycle,medium,1027
2,bicycle,large,819
3,car,small,1229
4,car,medium,939
5,car,large,332
6,chair,small,712
7,chair,medium,1112
8,chair,large,676
9,dog,small,218


In [47]:
# Size of the image-id set built during subsampling.
n_filtered_images = len(filtered_img_ids)
print(n_filtered_images)

5134
