# Data Processing Template
This notebook was used to download, analyze, and process the data from the NOAA Puget Sound dataset.

You can use this template to process your own dataset.

In [1]:
import json
import random
from pathlib import Path

import matplotlib.pyplot as plt
import supervision as sv

## Download the Data
- The `unzip` command used below may need to be installed first. On Linux, run `sudo apt-get install unzip`.

In [4]:
# Short identifier for the dataset; reused for the data folder and output file names.
dataset_shortname = "brackish_dataset"

# Working directory for the downloaded data; created up front (with parents) if missing.
data_dir = Path("/tmp/data/", dataset_shortname)
data_dir.mkdir(parents=True, exist_ok=True)

In [None]:
data = "https://public.roboflow.com/ds/J1rFmOSvu0\?key=0adJYkLeGN" # 960x540
# data = "https://public.roboflow.com/ds/XNcGlJKpVd\?key=ErlvzPXTvT" # 1920x1080

data_path = data_dir/"data.zip"

!wget -O {data_path} {data}

!unzip {data_path} -d {data_dir}

!rm {data_path}


### Clean the annotations
Clean the COCO annotations so that they can be read by the `supervision` library, for easy visualization and conversion to other formats.
- After cleaning, each `_annotations.coco.json` only contains annotations that have a bounding box.

In [None]:
# Dataset splits to process; each one is read from a sub-directory of `data_dir`.
# Later cells iterate over this list as well.
splits = ["train", "valid", "test"]

for split in splits:
    annotations_path = data_dir / f"{split}/_annotations.coco.json"
    print(annotations_path)
    with open(annotations_path, "r") as f:
        annotations = json.load(f)

    # Count before cleaning.
    print(f"Number of annotations: {len(annotations['annotations'])}")

    # Keep only annotations that carry a non-empty bounding box.
    # `.get` treats a missing key, an empty list and a JSON `null` alike;
    # calling `len()` on a `null` bbox would raise a TypeError.
    cleaned_annotations = []
    for annotation in annotations["annotations"]:
        if annotation.get("bbox"):
            cleaned_annotations.append(annotation)
        else:
            print(f"No bbox found for {annotation['image_id']}")

    annotations["annotations"] = cleaned_annotations

    # Count after cleaning, then overwrite the annotation file in place.
    print(f"Number of annotations: {len(annotations['annotations'])}")
    with open(annotations_path, "w") as f:
        json.dump(annotations, f)


## Visualise
To visualise, we need to extract the frames from the video; therefore, pick only one video to analyse.


In [None]:
# Maps split name -> dataset loaded from that split's COCO annotation file.
datasets = {}

for split in splits:
    # Images and their annotation file live side by side in the split folder.
    images_path = data_dir / split
    annotations_path = images_path / "_annotations.coco.json"

    dataset = sv.DetectionDataset.from_coco(
        images_directory_path=str(images_path),
        annotations_path=str(annotations_path),
    )
    datasets[split] = dataset

    # Quick sanity summary for the split.
    print(f"Split: {split}; Dataset length: {len(dataset)}")
    print(f"Split: {split}; Dataset classes: {dataset.classes}")

In [None]:
# The annotators do not depend on the split, so build them once.
box_annotator = sv.BoxAnnotator()
label_annotator = sv.LabelAnnotator()

# Most recent sampled image that contains at least one detection; saved below.
# Initialised once so that an example found in an earlier split is not lost
# when a later split yields none.
image_example = None

for split in splits:
    dataset = datasets[split]

    # Nothing to sample from; random.randrange(0) would raise ValueError.
    if len(dataset) == 0:
        print(f"Split: {split}; no images to visualise")
        continue

    # Draw 16 random samples (with replacement) for a 4x4 grid.
    annotated_images = []
    for _ in range(16):
        # randrange excludes the upper bound. randint(0, len(dataset)) includes
        # it and could therefore index one past the end of the dataset.
        i = random.randrange(len(dataset))

        _, image, annotations = dataset[i]

        labels = [dataset.classes[class_id] for class_id in annotations.class_id]

        # Annotate a copy so the image held by the dataset stays untouched.
        annotated_image = image.copy()
        annotated_image = box_annotator.annotate(annotated_image, annotations)
        annotated_image = label_annotator.annotate(annotated_image, annotations, labels)
        annotated_images.append(annotated_image)

        if len(annotations) > 0:
            image_example = annotated_image

    sv.plot_images_grid(
        annotated_images,
        grid_size=(4, 4),
        titles=None,
        size=(20, 12),
        cmap="gray"
    )

# NOTE(review): plt.imsave treats 3-channel arrays as RGB. If supervision
# returns images in OpenCV's BGR order, the saved colours will be swapped --
# confirm and convert if needed.
if image_example is None:
    # Every sampled image was empty; avoid passing None to plt.imsave.
    print("No sampled image contained a detection; sample image not saved")
else:
    plt.imsave(f"{dataset_shortname}_sample_image.png", image_example)


## Save Output
- Save example image
- Save notebook to visualize the image