[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1lP354J_WD0mwzT1lxBv2zZkZ1LjyXZJM?usp=colab)

# Visualise the Angling Freshwater Fish Netherlands dataset
This notebook was used to download, analyze and process the data from the Angling Freshwater Fish Netherlands dataset.

In [1]:
!pip install supervision



In [2]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import supervision as sv
from pathlib import Path
import random

import xml.etree.ElementTree as ET
import json
import os

## Download the Data

In [3]:
dataset_shortname = "affine"
data_url = "https://www.kaggle.com/api/v1/datasets/download/jorritvenema/affine"
data_path = dataset_shortname + ".zip"

In [4]:
!wget -nc -O {data_path} {data_url}
!if [ ! -d {dataset_shortname} ]; then unzip -q {data_path} -d {dataset_shortname}; fi
!rm {data_path}

--2025-02-13 20:44:53--  https://www.kaggle.com/api/v1/datasets/download/jorritvenema/affine
Resolving www.kaggle.com (www.kaggle.com)... 35.244.233.98
Connecting to www.kaggle.com (www.kaggle.com)|35.244.233.98|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://storage.googleapis.com:443/kaggle-data-sets/2166873/3620603/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20250213%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20250213T204453Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=a55115f01e8625f0d65d90893bccd40205915b7a67fae5f99130f7bbe0224f9cc83dc1f77b0be50d6aea098f59fb549876514bdff779afe48c3a5fce2a13291eb9b9248d5912ae7c4d33cc213d1558a299403518178c89c4f36e41c31157c0dd58931136f3af6d965a07bf52a4f11fcde297d3e12540fd3e01d400f8a188b73343d8a971fe7de58bd0dd500bdef0ea6d1b5a56da907e661893b81ad33a5ca6c07960fe5cca38d8af1e2b1fdbcf6ed1d97305707d226ba29cc6c9fec9a

In [5]:
images_path = "affine/dataset"
annotations_path = "affine/annotations.json"

### Clean the annotations
Turn into COCO format readable by `supervision` library, for easy visualization and conversion to other formats.
- annotations.json only contains annotations for images with at least one bounding box

In [6]:
def xml_to_coco_json(images_path, output_json):
    """
    Converts XML annotations to COCO JSON format.

    Args:
        images_path: Path to the images folder containing subdirectories with XML files.
        output_json: Path to the output JSON file.
    """

    images = []
    annotations = []
    categories = {}
    image_id = 1
    annotation_id = 1

    for subdir in os.listdir(images_path):
        subdir_path = os.path.join(images_path, subdir)
        if os.path.isdir(subdir_path):
            category_name = subdir.lower().replace(" ", "_")

            if category_name not in categories:
                categories[category_name] = len(categories) + 1

            for filename in os.listdir(subdir_path):
                if filename.endswith(".xml"):
                    xml_path = os.path.join(subdir_path, filename)
                    tree = ET.parse(xml_path)
                    root = tree.getroot()

                    image_filename = root.find("filename").text
                    width = int(root.find("size/width").text)
                    height = int(root.find("size/height").text)

                    images.append({
                        "id": image_id,
                        "file_name": os.path.join(subdir, image_filename),
                        "width": width,
                        "height": height
                    })

                    for obj in root.findall("object"):
                        category_id = categories[category_name]
                        xmin = int(obj.find("bndbox/xmin").text)
                        ymin = int(obj.find("bndbox/ymin").text)
                        xmax = int(obj.find("bndbox/xmax").text)
                        ymax = int(obj.find("bndbox/ymax").text)
                        w = xmax - xmin
                        h = ymax - ymin

                        annotations.append({
                            "id": annotation_id,
                            "image_id": image_id,
                            "category_id": category_id,
                            "bbox": [xmin, ymin, w, h],
                            "area": w * h,
                            "iscrowd": 0
                        })
                        annotation_id += 1

                    image_id += 1

    # Convert categories dictionary to COCO format
    categories_list = [{"id": v, "name": k} for k, v in categories.items()]

    coco_data = {
        "images": images,
        "annotations": annotations,
        "categories": categories_list
    }

    with open(output_json, "w") as f:
        json.dump(coco_data, f, indent=4)

In [7]:
xml_to_coco_json(images_path, annotations_path)

In [8]:
with open(annotations_path, "r") as f:
    annotations = json.load(f)

cleaned_annotations = []

print(f"Number of annotations: {len(annotations['annotations'])}")

for i, annotation in enumerate(annotations["annotations"]):
    if "bbox" not in annotation or len(annotation["bbox"]) == 0:
        print(f"No bbox found for {annotation['image_id']}")
    else:
        cleaned_annotations.append(annotation)

annotations["annotations"] = cleaned_annotations

with open(annotations_path, "w") as f:
    print(f"Number of annotations: {len(annotations['annotations'])}")
    json.dump(annotations, f)

Number of annotations: 7483
Number of annotations: 7483


## Visualise


In [9]:
# get number images
def count_images(root_folder):
    """
    Counts the total number of images in all subdirectories.

    Args:
        root_folder: Path to the root folder containing the subdirectories.

    Returns:
        The total number of images.
    """

    total_images = 0
    for subdir in os.listdir(root_folder):
        subdir_path = os.path.join(root_folder, subdir)
        if os.path.isdir(subdir_path):
            image_files = [f for f in os.listdir(subdir_path) if f.endswith((".jpg", ".png", ".jpeg"))]
            total_images += len(image_files)
    return total_images

In [10]:
total_image_count = count_images(images_path)
print(f"Total number of images: {total_image_count}")

Total number of images: 7482


In [11]:
dataset = sv.DetectionDataset.from_coco(
    images_directory_path=str(images_path),
    annotations_path=str(annotations_path),
)

print(f"Dataset length: {len(dataset)}")
print(f"Dataset classes: {dataset.classes}")

Dataset length: 7482
Dataset classes: ['scardinius_erythrophthalmus', 'carassius_carassius', 'vimba_vimba', 'leuciscus_leuciscus', 'sander_lucioperca', 'neogobius_melanostomus', 'leuciscus_idus', 'neogobius_kessleri', 'barbus_barbus', 'gasterosteus_aculeatus', 'esox_lucius', 'acipenseridae', 'perca_fluviatilis', 'gymnocephalus_cernuus', 'abramis_brama', 'rhodeus_amarus', 'salmo_trutta_subsp._fario', 'cyprinus_carpio', 'lepomis_gibbosus', 'ctenopharyngodon_idella', 'rutilus_rutilus', 'blicca_bjoerkna', 'silurus_glanis', 'gobio_gobio', 'carassius_gibelio', 'anguilla_anguilla', 'leuciscus_cephalus', 'tinca_tinca', 'neogobius_fluviatilis', 'aspius_aspius']


In [12]:
box_annotator = sv.BoxAnnotator()
label_annotator = sv.LabelAnnotator()

image_example = None

annotated_images = []
for _ in range(16):
    i = random.randint(0, len(dataset))

    _, image, annotations = dataset[i]

    labels = [dataset.classes[class_id] for class_id in annotations.class_id]

    annotated_image = image.copy()
    annotated_image = box_annotator.annotate(annotated_image, annotations)
    annotated_image = label_annotator.annotate(annotated_image, annotations, labels)
    annotated_images.append(annotated_image)

    if len(annotations) > 0:
        image_example = annotated_image

sv.plot_images_grid(
    annotated_images,
    grid_size=(4, 4),
    titles=None,
    size=(20, 12),
    cmap="gray"
)

plt.imsave(f"{dataset_shortname}/{dataset_shortname}_sample_image.png", image_example)

Output hidden; open in https://colab.research.google.com to view.

## Save Output
- Save example image
- Save notebook to visualize the image