# Data Processing Template
This notebook was used to download, analyze and process the data from the NOAA Puget Sound dataset.

You can use this template to process your own dataset.

In [None]:
import os
import csv
import json
import matplotlib.pyplot as plt
import supervision as sv
from pathlib import Path
import random

## Download the Data
- If you want to use the `unzip` command, you might need to install it. On Linux, run `sudo apt-get install unzip`.

In [None]:
# Short identifier for this dataset; it names the working directory below.
dataset_shortname = "deep_vision"

# Working directory that holds the downloaded archive and everything derived from it.
data_dir = Path("/tmp/data/").joinpath(dataset_shortname)
data_dir.mkdir(parents=True, exist_ok=True)

In [None]:
data = "https://ftp.nmdc.no/nmdc/IMR/MachineLearning/fishDatasetSimulationAlgorithm.zip"

data_path = data_dir / "images.zip"

!wget -O {data_path} {data}

!unzip {data_path} -d {data_dir}

!rm {data_path}


In [None]:
# Root folder of the extracted dataset; relative image paths are resolved against it.
images_path = data_dir.joinpath("fish_dataset")

### Clean the annotations
Turn into COCO format readable by `supervision` library, for easy visualization and conversion to other formats.
- The dataset has annotations grouped by year and divided into train and test splits
- Read all the annotations across 2017 and 2018, and across both train and test splits.
- Create a `supervision` dataset with all annotations

In [None]:
def csvs_to_coco(csv_files, output_json, images_root=None):
    """
    Converts multiple CSV files with annotations in the format:
      relative_image_path,xmin,ymin,xmax,ymax,label
    to a single COCO-format JSON file with "images", "annotations", and "categories".

    Rows that do not have exactly six fields, or whose coordinates cannot be
    parsed as integers (for example a header row), are skipped; the number of
    skipped non-empty rows is printed at the end.

    Image, category and annotation ids are assigned sequentially, starting at 1,
    in order of first appearance across all CSV files. Image "width" and
    "height" are written as None because the image files are never opened here.

    Args:
        csv_files (list of str or Path): List of paths to the CSV files.
        output_json (str or Path): Path to save the output COCO JSON.
        images_root (str or Path, optional): Directory prepended to every
            relative image path. When omitted, the notebook-level
            `images_path` variable is used.
    """
    if images_root is None:
        # Backward-compatible default: fall back to the notebook-level variable.
        images_root = images_path

    # Dictionaries for images and categories (keyed by file name / label); list for annotations
    images = {}
    annotations = []
    categories = {}

    skipped_rows = 0  # Non-empty rows dropped because they could not be parsed

    # Process each CSV file
    for csv_file in csv_files:
        # Path(...) lets callers pass plain strings as well as Path objects
        with Path(csv_file).open('r', newline='') as f:
            for row in csv.reader(f):
                if len(row) != 6:
                    if row:  # blank lines are not worth reporting
                        skipped_rows += 1
                    continue
                rel_file_name, xmin, ymin, xmax, ymax, label = row

                # Remove any leading "/" so the path is joined below the images root
                rel_file_name = rel_file_name.lstrip('/')
                file_name = os.path.join(images_root, rel_file_name)

                try:
                    xmin, ymin, xmax, ymax = int(xmin), int(ymin), int(xmax), int(ymax)
                except ValueError:
                    # Skip rows with invalid coordinate data
                    skipped_rows += 1
                    continue

                width = xmax - xmin
                height = ymax - ymin

                # Add image entry if not already added
                if file_name not in images:
                    images[file_name] = {
                        "id": len(images) + 1,  # unique image id
                        "file_name": file_name,
                        "width": None,   # Optionally, set the width if known
                        "height": None   # Optionally, set the height if known
                    }

                # Add category entry if not already added
                if label not in categories:
                    categories[label] = {
                        "id": len(categories) + 1,  # unique category id
                        "name": label,
                        "supercategory": label  # or assign a default supercategory
                    }

                # Create annotation entry
                annotations.append({
                    "id": len(annotations) + 1,  # unique annotation id
                    "image_id": images[file_name]["id"],
                    "category_id": categories[label]["id"],
                    "bbox": [xmin, ymin, width, height],  # COCO format: [x, y, width, height]
                    "area": width * height,
                    "iscrowd": 0
                })

    # Convert dictionaries to lists
    coco_images = list(images.values())
    coco_categories = list(categories.values())

    coco_dict = {
        "images": coco_images,
        "annotations": annotations,
        "categories": coco_categories
    }

    with open(output_json, "w") as f:
        json.dump(coco_dict, f, indent=4)

    print(f"COCO annotations saved to {output_json}")
    print(f"Number of images: {len(coco_images)}")
    print(f"Number of annotations: {len(annotations)}")
    if skipped_rows:
        print(f"Number of skipped rows: {skipped_rows}")


In [None]:
# Annotation CSVs for both years and both splits, given relative to the data directory.
csv_files = [
    data_dir / relative_csv
    for relative_csv in (
        "fish_dataset/2017/train/source-train2017-annotations.csv",
        "fish_dataset/2017/test/test_2017_annotations.csv",
        "fish_dataset/2018/train/source-train2018-annotations.csv",
        "fish_dataset/2018/test/test_2018_annotations.csv",
    )
]

# Merge every CSV into a single COCO file stored next to the extracted data.
json_annotations_path = data_dir.joinpath("combined_coco_annotations.json")
csvs_to_coco(csv_files, json_annotations_path)

## Visualise
To visualise we need to extract 16 images randomly


In [None]:
# Load the combined COCO annotations together with the images they reference.
dataset = sv.DetectionDataset.from_coco(
    annotations_path=str(json_annotations_path),
    images_directory_path=str(images_path),
)

# Quick sanity check of what was loaded.
print(f"Dataset length: {len(dataset)}")
print(f"Dataset classes: {dataset.classes}")

In [None]:
box_annotator = sv.BoxAnnotator()
label_annotator = sv.LabelAnnotator()

# Last sampled image that contains at least one annotation; saved as the example image below.
image_example = None

annotated_images = []
for _ in range(16):
    # randrange excludes the upper bound. random.randint(0, len(dataset)) is
    # inclusive on both ends and could return len(dataset), which raises IndexError.
    i = random.randrange(len(dataset))

    _, image, annotations = dataset[i]

    # Map numeric class ids to their class names for the label overlay.
    labels = [dataset.classes[class_id] for class_id in annotations.class_id]

    # Draw on a copy of the loaded image.
    annotated_image = image.copy()
    annotated_image = box_annotator.annotate(annotated_image, annotations)
    annotated_image = label_annotator.annotate(annotated_image, annotations, labels)
    annotated_images.append(annotated_image)

    if len(annotations) > 0:
        image_example = annotated_image

sv.plot_images_grid(
    annotated_images,
    grid_size=(4, 4),
    titles=None,
    size=(20, 12),
    cmap="gray"
)

# None of the sampled images may contain annotations, in which case there is
# nothing meaningful to save (and plt.imsave would fail on None).
if image_example is not None:
    # NOTE(review): supervision presumably loads images in OpenCV's BGR channel
    # order while plt.imsave expects RGB - confirm, and reverse the channels if so.
    plt.imsave(f"{dataset_shortname}_sample_image.png", image_example)
else:
    print("No sampled image contained annotations; example image not saved.")


## Save Output
- Save example image
- Save notebook to visualize the image