# COCO Dataset Statistics

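This notebook analyzes a COCO annotation file to report statistics on classes, image counts, and annotations. It also summarizes label counts for a FiftyOne dataset and checks which annotated images are present on disk.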

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import os
from collections import Counter

# Location of the COCO annotation file (default taken from classification.sh).
json_path = "Verified.json"

# Only warn here: the notebook keeps going so the path can be corrected
# before the loading cell is run.
annotation_file_found = os.path.exists(json_path)
if not annotation_file_found:
    print(f"Warning: File not found at {json_path}. Please update 'json_path' variable.")


In [None]:
# Load the whole COCO annotation file into memory as `data`.
print(f"Loading JSON from {json_path}...")
# JSON files are UTF-8 by specification; without an explicit encoding, open()
# falls back to the platform locale and can mis-decode non-ASCII names.
with open(json_path, 'r', encoding='utf-8') as f:
    data = json.load(f)
print("Done.")

In [None]:
# Parse Categories
# Build two lookups keyed by category id: id -> name and id -> supercategory.
category_records = data.get('categories', [])
categories = {cat['id']: cat['name'] for cat in category_records}
# Categories without a 'supercategory' key are left out of this lookup instead
# of raising KeyError; the counting cell already checks membership before use.
supercategories = {
    cat['id']: cat['supercategory']
    for cat in category_records
    if 'supercategory' in cat
}

print(f"Total Images: {len(data.get('images', []))}")
print(f"Total Annotations: {len(data.get('annotations', []))}")
print(f"Total Categories: {len(categories)}")

In [None]:
# Tally annotations by category name and, where one is known, by supercategory.
cat_counts = Counter()
supercat_counts = Counter()

for annotation in data.get('annotations', []):
    category_id = annotation.get('category_id')
    # Annotations pointing at an unknown category are ignored entirely.
    if category_id not in categories:
        continue
    cat_counts[categories[category_id]] += 1
    if category_id in supercategories:
        supercat_counts[supercategories[category_id]] += 1

# One row per category name, most frequent first.
df_cat = (
    pd.DataFrame.from_dict(cat_counts, orient='index', columns=['count'])
    .sort_values(by='count', ascending=False)
    .rename_axis('Category')
)

print("Top 20 Categories by Annotation Count:")
print(df_cat.head(20))

# Optional: Display full table
# df_cat

In [None]:
# Annotation totals per supercategory (if applicable, typically used for class names in Fishial)
df_super = (
    pd.DataFrame.from_dict(supercat_counts, orient='index', columns=['count'])
    .sort_values(by='count', ascending=False)
    .rename_axis('Supercategory')
)

# Lift the row limit so the complete table is printed (this setting is global
# and stays in effect for the rest of the session).
pd.set_option('display.max_rows', None)
print("Supercategories by Annotation Count:")
# Printable copy with a 1-based rank in place of the supercategory index.
df_print = df_super.reset_index()
df_print.index += 1
print(df_print)


In [None]:
# Rich display of the supercategory counts. `df` is never defined in this
# notebook, so the bare name raised NameError on a fresh kernel; `df_super`
# is the frame built in the preceding cell.
df_super

In [None]:
# Bar chart of the first 20 rows of df_super (sorted by count, descending);
# skipped entirely when there is nothing to plot.
if not df_super.empty:
    top_supercategories = df_super.head(20)['count']
    fig, ax = plt.subplots(figsize=(12, 6))
    top_supercategories.plot(kind='bar', ax=ax)
    ax.set_title('Top 20 Classes (Supercategory)')
    ax.set_ylabel('Count')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.show()

In [None]:
# Destructive: deletes the FiftyOne dataset currently bound to `dataset`.
# `dataset` is only assigned by the loading cell further down, so on a fresh
# kernel (Restart & Run All) the bare call failed with NameError. Guard it so
# the cell is a no-op until a dataset has actually been loaded.
if "dataset" in globals():
    dataset.delete()
else:
    print("No FiftyOne dataset loaded in this kernel; nothing to delete.")

In [None]:

import fiftyone as fo
from collections import Counter
import sys

# Name of the FiftyOne dataset to summarize.
dataset_name = "classification_v0.10"

print(f"Loading dataset: {dataset_name}")
dataset_is_registered = dataset_name in fo.list_datasets()
if dataset_is_registered:
    dataset = fo.load_dataset(dataset_name)
    print(f"Dataset loaded. Total samples: {len(dataset)}")
else:
    # Unknown name: list what is available, then stop this cell.
    print(f"Error: Dataset '{dataset_name}' not found in FiftyOne.")
    print("Available datasets:", fo.list_datasets())
    sys.exit(1)

print("Computing statistics...")
label_counts = Counter()
# Samples whose `polyline` value is of a type this cell does not understand.
unhandled_polyline_samples = 0

# Iterate over samples to count labels
# Based on classification_dataset_creator.py, the label is in sample.polyline.label
for sample in dataset.iter_samples(progress=True):
    if "polyline" not in sample:
        continue
    poly = sample.polyline
    if poly is None:
        continue

    # Normalize the field value to a list of individual polylines.
    if isinstance(poly, fo.Polyline):
        shapes = [poly]
    elif isinstance(poly, fo.Polylines):
        # fo.Polylines is a container label, not a Python list: its members
        # live in `.polylines`. The plain-list check alone never matched it,
        # so these samples were silently left out of the counts.
        shapes = poly.polylines or []
    elif isinstance(poly, list):
        shapes = poly
    else:
        unhandled_polyline_samples += 1
        continue

    for p in shapes:
        label_counts[p.label] += 1

# Surface skipped samples instead of dropping them silently.
if unhandled_polyline_samples:
    print(f"Skipped {unhandled_polyline_samples} sample(s) with an unsupported 'polyline' type.")

# Print a fixed-width summary table of label counts, most frequent first.
print("\n" + "="*50)
print(f"Statistics for '{dataset_name}'")
print("="*50)
print(f"Total Unique Classes: {len(label_counts)}")
print(f"Total Annotated Items: {sum(label_counts.values())}")
print("-" * 50)
print(f"{'Class Name':<35} | {'Count':<10}")
print("-" * 50)

for label, count in label_counts.most_common():
    # `!s` converts the label to text first: a polyline without a label is
    # counted under None, and None rejects the '<35' format spec (TypeError).
    print(f"{label!s:<35} | {count:<10}")

print("-" * 50)

In [None]:
# Inputs for the image-validation step: the COCO annotation file and the
# folder that holds the image files.
path_to_src_coco_json = "Verified.json"
path_full_images = "data"


print(f"Scanning images directory: {path_full_images} ...")
# os.walk yields nothing for a missing or unreadable directory, so a bare
# next() would raise an uninformative StopIteration; fail with a clear error.
top_level_listing = next(os.walk(path_full_images), None)
if top_level_listing is None:
    raise FileNotFoundError(f"Images directory not found or unreadable: {path_full_images}")
# Only the file names directly inside the folder (no sub-directories).
list_of_files = set(top_level_listing[2])
print(f"Total files in folder: {len(list_of_files)}")

print(f"Reading JSON: {path_to_src_coco_json} ...")
# `read_json` is not defined or imported anywhere in this notebook, so the
# file is parsed with the already-imported json module instead.
with open(path_to_src_coco_json, 'r', encoding='utf-8') as f:
    data_full = json.load(f)


In [None]:
# Filter valid images (exist on disk and not flagged as invalid)
# Result: `valid_images` maps each kept image's 'id' to its full image record.
valid_images = {}

for img in data_full.get('images', []):
    # Skip records with a truthy 'is_invalid' flag.
    if img.get('is_invalid'):
        continue
    # Match on the bare file name against the names collected in list_of_files.
    if img.get('file_name') in list_of_files:
        valid_images[img.get('id')] = img
    else:
        # Report the missing file together with the full record for inspection.
        print('file does not exist: ', img.get('file_name'))
        print(img)