# Process and visualise the [DeepFish](https://github.com/alzayats/DeepFish) dataset

This dataset includes count, classification, and segmentation labels; we are only using the segmentation labels in this notebook, and because we are training a detector, we are reducing them to boxes.  Segmentation labels are stored as images, not as text, so we need to parse the connected components from the images.

## Imports and constants

In [None]:
import os
import random
import json

import cv2
import numpy as np
import matplotlib.pyplot as plt
import supervision as sv

from skimage import measure
from tqdm import tqdm

# Location of the DeepFish tarball
source_url = 'http://data.qld.edu.au/public/Q5842/2020-AlzayatSaleh-00e364223a600e83bd9c3f5bcd91045-DeepFish/DeepFish.tar'

dataset_shortname = "deepfish"

# Everything for this dataset lives under ~/data/<dataset_shortname>
download_base = os.path.join(os.path.expanduser('~/data'),dataset_shortname)

# Folder the tarball is extracted into.  The download cells refer to this name, so it
# has to be defined here.  It is the same folder as download_base, because
# segmentation_base below looks for a "DeepFish" folder directly under download_base.
data_dir = download_base

segmentation_base = os.path.join(download_base,'DeepFish','Segmentation')

# Masks and images are both read from the "valid" subfolders
segmentation_mask_base = os.path.join(segmentation_base,'masks/valid')
segmentation_image_base = os.path.join(segmentation_base,'images/valid')

## Download the data

In [None]:
# Make sure the extraction folder exists before anything is downloaded
os.makedirs(data_dir, exist_ok=True)

# The tarball keeps the filename it has in the source URL
local_tar_file = os.path.join(download_base, os.path.basename(source_url))
print('Downloading data to {}'.format(local_tar_file))

In [None]:
# Download the tarball to local_tar_file, then unpack it into data_dir
!wget -O {local_tar_file} {source_url}
!tar -xvf {local_tar_file} -C {data_dir}

## Enumerate mask files

The annotations are stored as image masks; we want to find the connected components in those masks and convert them to boxes.

In [None]:
assert os.path.isdir(segmentation_mask_base), 'Folder {} does not exist'.format(segmentation_mask_base)

# os.listdir() returns entries in arbitrary order; sort so the mask list (and any
# debugging subset taken from the front of it) is the same on every run.
valid_masks = [os.path.join(segmentation_mask_base,fn) for fn in \
               sorted(os.listdir(segmentation_mask_base))]
print('Found {} mask files'.format(len(valid_masks)))

## Enumerate all files as a consistency check

In [None]:
# find_images is presumably provided by the megadetector-utils package; uncomment the
# next line to install it.
#!pip install megadetector-utils
from megadetector.utils.path_utils import find_images

# Count every image under the download folder (recursively), not just the
# segmentation folders, and report the total.
image_files_relative = find_images(download_base,recursive=True,return_relative_paths=True)
print('Dataset contains a total of {} images'.format(len(image_files_relative)))

## Function to convert segmentation mask images to bounding boxes

In [None]:
def get_boxes_from_mask_image(mask_file):
    """
    Load a binary mask image, find its connected components, and return one
    COCO-formatted bounding box annotation per component.

    Args:
        mask_file (str): Path to the binary mask image.  The image ID is this path
            relative to [segmentation_mask_base], with forward slashes and without
            the file extension.

    Returns:
        list: list of dicts, one per connected component, each with the fields 'id',
        'image_id', 'category_id' (always 1), and 'bbox' ([x, y, width, height]).
        Empty if no connected components are found.

    Raises:
        ValueError: if the mask image cannot be read
    """

    # cv2.imread() reports failure by returning None rather than raising, so check
    # explicitly instead of failing later with an obscure error.
    mask = cv2.imread(mask_file, cv2.IMREAD_GRAYSCALE)
    if mask is None:
        raise ValueError('Could not read mask image {}'.format(mask_file))

    image_id = os.path.relpath(mask_file,segmentation_mask_base).replace('\\','/')
    image_id = os.path.splitext(image_id)[0]

    # Ensure binary image (threshold if not already binary)
    _, binary = cv2.threshold(mask, 127, 255, cv2.THRESH_BINARY)

    # Find connected components
    labels = measure.label(binary, connectivity=2)
    regions = measure.regionprops(labels)

    # Prepare COCO-formatted annotations
    annotations = []
    for idx, region in enumerate(regions):

        # region.bbox is (y1, x1, y2, x2); the differences below are used directly
        # as the box width and height.
        y1, x1, y2, x2 = region.bbox

        # Convert to COCO format [x, y, width, height]; cast to plain ints so the
        # values can always be serialized to JSON.
        coco_bbox = [
            int(x1),
            int(y1),
            int(x2 - x1),
            int(y2 - y1),
        ]

        # Annotation IDs are the image ID plus a zero-padded per-image index
        annotation = {
            'id': image_id + '_' + str(idx).zfill(3),
            'image_id': image_id,
            'category_id': 1,
            'bbox': coco_bbox,
        }
        annotations.append(annotation)

    return annotations

## Convert mask images to bounding boxes

In [None]:
# Set to an integer to stop early while debugging: masks at indices 0 through
# debug_max_file (inclusive) are processed.  None processes every mask.
debug_max_file = None

# Flat list of COCO-formatted annotation dicts across all masks
annotation_records = []

for i_mask,mask_file in tqdm(enumerate(valid_masks),total=len(valid_masks)):

    if debug_max_file is not None and i_mask > debug_max_file:
        break

    # Each mask can contribute zero or more box annotations
    coco_formatted_annotations = get_boxes_from_mask_image(mask_file)
    annotation_records.extend(coco_formatted_annotations)

print('Created {} annotations'.format(len(annotation_records)))

## Create a complete COCO dataset

In [None]:
# Enumerate images
assert os.path.isdir(segmentation_image_base)
valid_images = [os.path.join(segmentation_image_base,fn) for fn in \
               os.listdir(segmentation_image_base)]
print('Found {} image files'.format(len(valid_images)))

assert len(valid_images) == len(valid_masks)

In [None]:
# Assemble the COCO dictionary: a single "fish" category, the box annotations
# created above, and one image record per file in valid_images.
coco_data = {}
coco_data['info'] = {}
coco_data['categories'] = [{'name':'fish','id':1}]
coco_data['annotations'] = annotation_records
coco_data['images'] = []

for image_file_abs in tqdm(valid_images):

    # Read the image itself to get its dimensions.  cv2.imread() returns None on
    # failure rather than raising, so check explicitly.
    im_cv = cv2.imread(image_file_abs)
    assert im_cv is not None, 'Could not read image {}'.format(image_file_abs)

    # The image ID is the filename without its extension; this should match the
    # 'image_id' used in the annotations, assuming each image and its mask share a
    # filename stem.
    image_id = os.path.splitext(os.path.basename(image_file_abs))[0]

    im = {}
    im['id'] = image_id
    im['file_name'] = image_file_abs.replace('\\','/')
    im['height'] = im_cv.shape[0]
    im['width'] = im_cv.shape[1]

    coco_data['images'].append(im)

coco_dataset_file = os.path.join(download_base,'deepfish_coco.json')
with open(coco_dataset_file,'w') as f:
    json.dump(coco_data,f,indent=1)

## Visualize

In [None]:
# Load the COCO file written above back in through supervision, pointing it at the
# folder that holds the segmentation images.
dataset = sv.DetectionDataset.from_coco(
    images_directory_path=segmentation_image_base,
    annotations_path=coco_dataset_file,
)

# Quick sanity check of what was loaded
print(f"Dataset length: {len(dataset)}")
print(f"Dataset classes: {dataset.classes}")

### Visualize an image grid

In [None]:
# Annotators used to draw boxes and class labels on sample images
box_annotator = sv.BoxAnnotator()
label_annotator = sv.LabelAnnotator()

image_example = None

# random.seed(0)

# Draw boxes and labels on 16 randomly-chosen images
annotated_images = []
for _ in range(16):

    # randrange() excludes its upper bound, so the index is always valid;
    # randint(0, len(dataset)) could return len(dataset) and raise an IndexError.
    i = random.randrange(len(dataset))
    _, image, annotations = dataset[i]
    labels = [dataset.classes[class_id] for class_id in annotations.class_id]

    annotated_image = image.copy()
    annotated_image = box_annotator.annotate(annotated_image, annotations)
    annotated_image = label_annotator.annotate(annotated_image, annotations, labels)
    annotated_images.append(annotated_image)

# Show the 16 annotated images as a 4x4 grid
sv.plot_images_grid(
    annotated_images,
    grid_size=(4, 4),
    titles=None,
    size=(20, 12),
    cmap="gray"
)

### Write one visualized image to file

In [None]:
# Index of the image to render and save as a sample
i_image = 100
assert i_image < len(dataset), \
    'Image index {} is out of range for a dataset of {} images'.format(i_image,len(dataset))

_, image, annotations = dataset[i_image]
labels = [dataset.classes[class_id] for class_id in annotations.class_id]

# Draw on a copy of the loaded image
annotated_image = image.copy()
annotated_image = box_annotator.annotate(annotated_image, annotations)
annotated_image = label_annotator.annotate(annotated_image, annotations, labels)

sv.plot_image(annotated_image)

sample_image_output_file = os.path.join(download_base,'deepfish_sample.jpg')

# cv2.imwrite() reports failure by returning False rather than raising
if not cv2.imwrite(sample_image_output_file,annotated_image):
    raise OSError('Could not write sample image to {}'.format(sample_image_output_file))