In [None]:
#!/usr/bin/env python
# -*-coding:utf-8 -*-
'''
  ████
██    ██   Datature
  ██  ██   Powering Breakthrough AI
    ██

@File    :   nifti_data_onboarding.ipynb
@Author  :   Trevor Carrell
@Version :   2.0
@Contact :   hello@datature.io
@License :   Apache License 2.0
@Desc    :   Demo for uploading NIfTI files using Datature Python SDK
             and converting bitmask annotations to COCO polygon annotation
             files.
'''

# Creating a COCO formatted file using an RLE Mask for object annotation via `.nii` file uploading.
In this Python notebook, we will use `.nii` files and their corresponding label files to create a COCO formatted annotation file. Datature users can then upload this file to Nexus to annotate the `.nii` files they have uploaded.

## Uploading testing files to Nexus using Datature SDK:
Before we start creating our COCO formatted file, we first need to upload our `.nii` files onto Nexus. To do so, we will utilize Datature's SDK, which converts our `.nii` files to `.mp4` files. Note that this conversion is necessary to allow for interpolation when using the annotation tool in Nexus.

### Install / Import necessary libraries:
In this python notebook, we will be using a few libraries to perform the task at hand. We do this below:

In [None]:
# Handle installation of packages.
! pip3 install -U pip       # Upgrade pip.
! pip3 install alive-progress  # Install alive-progress package.
! pip3 install -U datature  # Install and update datature package.
! pip3 install -U matplotlib       # Install matplotlib package.
! pip3 install nibabel             # Install nibabel package.
! pip3 install numpy               # Install numpy package.
! pip3 install pycocotools         # Install pycocotools package.

In [27]:
# Handle imports.
import glob
import json
import os
from datetime import datetime, timezone

import nibabel as nib
import numpy as np
from alive_progress import alive_bar  # For uploading progress bar.
from datature import nexus
from pycocotools import mask

Looking in indexes: https://pypi.org/simple, https://asia-python.pkg.dev/datature-puppeteer/python/simple/
Looking in indexes: https://pypi.org/simple, https://asia-python.pkg.dev/datature-puppeteer/python/simple/


### Define data paths, secret key, and project key:
Now that we've imported the necessary libraries, we should define the path which contains our `.nii` files and labels, the path to which we want to output our COCO formatted file, and our project's secret key (which is necessary to use Datature's SDK).

When running this on your machine, replace `DATA_PATH` with the path to your `.nii` files and labels, and `OUTPUT_PATH` with the directory where you want the output to go.

We then need to define our secret key and project ID to connect to our project.

In [25]:
# Locations of the dataset and of the generated COCO file. Each value can be overridden with an environment
# variable, so the notebook can run on another machine without editing this cell; the literals are the defaults.
DATA_PATH = os.environ.get(
    "NIFTI_DATA_PATH",
    "/Users/trevorcarrell/Documents/Datature.nosync/nii to COCO Format Code/Medical AI Series/Dataset/",
)  # Path to data.
OUTPUT_PATH = os.environ.get(
    "NIFTI_OUTPUT_PATH",
    "/Users/trevorcarrell/Documents/Datature.nosync/nii to COCO Format Code/Medical AI Series/output",
)  # Path to output.

# To see how to get our secret key and project key, please read the "Generating your project's secret key"
# section further down in this notebook. Prefer setting the environment variables over pasting the real
# values here, so that the credentials are not saved together with the notebook.
SECRET_KEY = os.environ.get("DATATURE_SECRET_KEY", "YOUR_SECRET_KEY")
PROJECT_KEY = os.environ.get("DATATURE_PROJECT_KEY", "YOUR_PROJECT_KEY")

# Don't modify PROJECT_NAME: it is always the project key prefixed with "proj_".
PROJECT_NAME = f"proj_{PROJECT_KEY}"

### Data sanity checks:
Now, before we continue, it's always good practice to ensure our `.nii` files and their labels have the same dimensions, and that we have the same number of each (for one `.nii` file, we need a label file).

In [26]:
data_files = glob.glob("*.nii", root_dir=f'{DATA_PATH}/t1gd')  # File names (not paths) of all data files.
label_files = glob.glob("*.nii", root_dir=f'{DATA_PATH}/labels/original')  # File names of all label files.

# Both directories must contain files, and they must contain the same number of them.
assert len(data_files) > 0 and len(label_files) > 0, "Data and labels directories must not be empty."
assert len(data_files) == len(label_files), "Number of data files and label files must be equal."

# Ensure that each data file has a label file. Our naming schema is that the label file has the same name
# as the data file, except that the first character is an "l".
for file in data_files:

    # Build the paths of the data file and of the label file we expect to accompany it.
    label_filepath = os.path.join(f'{DATA_PATH}/labels/original', f'l{file[1:]}')
    data_filepath = os.path.join(f'{DATA_PATH}/t1gd', file)

    # Ensure that the label file exists.
    assert os.path.exists(label_filepath), f"Label file {label_filepath} does not exist."

    # Load the label file and data file.
    label_file = nib.load(label_filepath)
    data_file = nib.load(data_filepath)

    # Ensure that the label file and data file have the same shape. The message names the two paths and
    # shapes (rather than the loaded image objects) so that the offending pair can be identified directly.
    assert label_file.shape == data_file.shape, (
        f"Label file {label_filepath} (shape {label_file.shape}) does not match "
        f"data file {data_filepath} (shape {data_file.shape})."
    )


### Generating your project's secret key:

The steps are as follows:

1. Sign up for a free Datature account at https://www.datature.io
2. Create a new project on Nexus
3. Go to the Integrations page
4. Choose `Generate New Secret` to get your Secret Key
5. Follow this script to use the Datature SDK to upload the NIfTI files to Nexus

For more information about Datature's Python SDK, see https://developers.datature.io/docs/python-sdk.

### Uploading `.nii` files to Datature Nexus:
Note that for `.nii` files, each file is a separate 3D volume.

* If the axis of orientation is provided, the SDK will upload a series of 2D slices corresponding to the specified orientation. Thus, you will only see one asset on Nexus which contains the 2D slices.

* If the axis of orientation is **not** provided, the SDK will upload a series of 2D slices for each orientation (x, y, and z). Thus, you will see three assets on Nexus which contain the 2D slices using the axial (z orientation), coronal (y orientation), and sagittal (x orientation) planes.

With that, we can begin.

### Uploading our dataset using Datature's SDK
To use Datature's SDK, we first need to create an upload session with `project.assets.create_upload_session(...)`. Then we use `.add_path(path, nifti_orientation='z')` to specify the path we want to upload, as well as the orientation of our uploads (z in this case).

**Note that we only want to upload our `.nii` files and not the labels.**

***!!! Uploading may take a while !!!*** – we have waited at most three and a half minutes.

In [22]:
# Connect to Nexus with our secret key and fetch the project we are uploading to.
project = nexus.Client(SECRET_KEY).get_project(PROJECT_NAME)

# Open an upload session through the Datature SDK.
upload_session = project.assets.create_upload_session(groups=["main"], background=True)

# Collect the full paths of the .nii data files (the label files are not uploaded).
files = glob.glob(f"{DATA_PATH}/t1gd/*.nii")

# Add every data file to the upload session, advancing the progress bar once per file.
with alive_bar(len(files), title='Preparing upload assets', title_length=25) as tick:
    with upload_session as session:
        for nii_path in files:
            session.add_path(nii_path, nifti_orientation='z')
            tick()

# Fetch the operation ids of the session and wait for each operation to finish on the server.
operations = upload_session.get_operation_ids()
with alive_bar(len(operations), title='Waiting server processing',title_length=25) as tick:
    for operation_id in operations:
        project.operations.wait_until_done(operation_id)
        tick()

Preparing upload assets   |████████████████████████████████████████| 10/10 [100%] in 44.0s (0.23/s) 
Waiting server processing |████████████████████████████████████████| 1/1 [100%] in 3:10.7 (0.01/s) 


## Creating COCO formatted file from `.nii` files and labels:
Now that we have uploaded our files to Nexus, we want to create annotations for those files using a run-length encoding (RLE) binary mask. To do so, we use the names of the files we created to create our COCO formatted file. ***Ultimately, we do not need to process each 2D slice to get the names of the files, so we can simply make a list or structure format for our files that we uploaded to Nexus***

### More about our `.nii` files and their processing:
Note that each of our `.nii` files has shape `(240, 240, 155)`, where the orientation is along the z-axis, meaning that we have 155 slices and each slice is `(240, 240)`. When converting to `.mp4`, this means that we create a video which runs through 155 frames of size `240px, 240px`.

For a given `.nii` file, e.g. `d_0001.nii`, that file was uploaded to Nexus as a `.mp4` file named `d_0001-z.mp4`. We then represent each frame of the video file as its own `.jpg` file, which is referred to as `d_0001-z#frame=[x].jpg`, where `[x]` is replaced with a frame number from 0 to 154, inclusive.

Also note that the label files are `.nii` files themselves, but only contain the `category_id`s of our identifiable classes.

### Creating our COCO formatted file:
Since our `.nii` files are stored in the Nexus, we can start building our COCO formatted file using the names of the files we uploaded with naming scheme above.

There are a few considerations taken into account when making the COCO formatted file:
 
* We assume the file format is similar to the format described here: https://developers.datature.io/docs/uploading-annotations#coco-annotator-polygons--masks

* We use run-length encoding (RLE) binary masks for each `.nii` slice to create `n` separate masks for each slice, where `n` is the number of classes. In our case, we have `n = 3` classes: `non-enhancing tumor`, `enhancing tumor`, `edema`. The absence of a class indicates the brain region is unafflicted.

Now, we create the COCO formatted file as a dictionary, then convert it to a `.json` file:

In [23]:
# NOTE(review): this constant is not referenced by the code visible in this notebook; the slice count is
# read from each label volume's shape instead. Kept for reference.
NUM_IMAGES = 155  # Number of images per .nii file.


def create_rle_mask(mask_data: np.ndarray[int]) -> dict:
    """
    Encode a binary mask as a run-length encoding (RLE) whose 'counts' field is text.

    Input:
        mask_data (ndarray): binary mask

    Output:
        rle_mask (dict): RLE of the mask with the keys 'counts' (the encoder's bytes decoded as ASCII)
                         and 'size' (passed through from the encoder unchanged)
    """

    # The encoder is given a Fortran-ordered uint8 array, so lay the mask out that way first.
    fortran_mask = np.asfortranarray(mask_data).astype(np.uint8)
    encoded = mask.encode(fortran_mask)

    # 'counts' comes back as bytes; decode it so the mask can be stored as plain text.
    counts_text = encoded['counts'].decode('ascii')

    return {'counts': counts_text, 'size': encoded['size']}


def create_licenses_entry() -> list[dict]:
    """
    Build the 'licenses' section of the COCO file.

    Output:
        licenses (list): a list holding one placeholder license with id 0, name "Unknown" and an empty url
    """
    placeholder_license = dict(id=0, name="Unknown", url="")
    return [placeholder_license]


def create_info_entry() -> dict:
    """
    Build the 'info' section of the COCO file.

    Output:
        info (dict): dataset description, url, version, year, contributor and creation timestamp; 'year'
                     and 'date_created' are both strings derived from the current UTC time
    """
    # Read the clock once so that 'year' and 'date_created' always describe the same instant.
    created_at = datetime.now(timezone.utc)

    return {'description': 'Datature Created COCO Format Dataset',
            'url': '',
            'version': 1,
            'year': created_at.strftime('%Y'),
            'contributor': 'Datature',
            'date_created': created_at.strftime('%Y-%m-%dT%H:%M:%S.%f%z')}


def create_annotation_entry(curr_image_id: int, curr_annotation_id: int, category_id: int, rle_mask: dict) -> dict:
    """
    Assemble one COCO annotation entry that attaches an RLE mask to an image.

    Input:
        curr_image_id (int): id of the image the annotation belongs to
        curr_annotation_id (int): id to give this annotation
        category_id (int): id of the category the mask represents
        rle_mask (dict): RLE binary mask where the keys are 'counts' and 'size'.

    Output:
        annotation_entry (dict): annotation entry; 'area' and 'bbox' are filled with zeros and
                                 'iscrowd' is set to 1
    """
    annotation_entry = dict(
        id=curr_annotation_id,
        image_id=curr_image_id,
        category_id=category_id,
        segmentation=rle_mask,
        area=0,
        bbox=[0, 0, 0, 0],
        iscrowd=1,
    )
    return annotation_entry


def create_image_entry(filename: str, img_shape: tuple[int, int], curr_image_id: int) -> dict:
    """
    Assemble one COCO image entry.

    Input:
        filename (string): filename of the image
        img_shape (tuple): image shape; element 0 is stored as the width and element 1 as the height
        curr_image_id (int): id to give this image

    Output:
        image_entry (dict): image entry, with license 0 and the current UTC time as 'date_captured'
    """
    image_entry = dict(id=curr_image_id)
    image_entry['width'] = img_shape[0]
    image_entry['height'] = img_shape[1]
    image_entry['file_name'] = filename
    image_entry['license'] = 0

    # The entry is stamped with the moment it is created, in UTC.
    image_entry['date_captured'] = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%S.%f%z')
    return image_entry


def create_coco_json(nii_labels_path: str, classes: dict[str, int]) -> dict:
    """
    Given a directory of .nii label files, we create the contents of a COCO format json file.

    Every slice along the third axis of every label file becomes one image entry, and for each slice one
    RLE annotation is created per class.

    Input:
        nii_labels_path (string): path to directory containing the .nii label files
        classes (dict): maps each class name to the value that marks it in the label files; the same value
                        is used as the category id

    Output:
        coco_dict (dict): dictionary with the keys 'info', 'images', 'annotations', 'categories'
                          and 'licenses'
    """

    # Initialize the items in the COCO json file.
    info = create_info_entry()
    images = []
    categories = [{'supercategory': key, 'id': val, 'name': key} for key, val in classes.items()]
    annotations = []
    licenses = create_licenses_entry()

    # Running ids for the image and annotation entries.
    curr_image_id = 0
    curr_annotation_id = 0

    # glob returns the file names in arbitrary order, so we sort them to assign the same ids on every run.
    # We specify root_dir so that we get bare file names, which are used to build the image names below.
    for file in sorted(glob.glob('*.nii', root_dir=nii_labels_path)):

        # Load in the .nii file, create ndarray of the data.
        nii_mask = nib.load(os.path.join(nii_labels_path, file))
        nii_mask_data = nii_mask.get_fdata()

        # Go through each frame in the .nii label file and create its RLE binary masks.
        for i in range(nii_mask_data.shape[2]):
            frame = nii_mask_data[:, :, i]

            # The image name is the label file name with its first character replaced by "d" and its
            # last four characters (the ".nii" extension) dropped, followed by the orientation and frame.
            # NOTE(review): create_image_entry stores shape[0] as the width and shape[1] as the height;
            # this makes no difference for square slices, but confirm the axis order for non-square volumes.
            images.append(create_image_entry(f'd{file[1:-4]}-z#frame={i}.jpg', frame.shape, curr_image_id))

            # Now create the RLE binary mask for each of the classes (not background) in the current frame.
            # NOTE(review): an annotation is added for every class, even when the class does not occur in
            # the frame (an all-zero mask) — confirm that the importer handles empty masks as intended.
            for val in classes.values():
                rle_mask_data = create_rle_mask(np.where(frame == val, 1, 0))
                annotations.append(create_annotation_entry(curr_image_id, curr_annotation_id, val, rle_mask_data))
                curr_annotation_id += 1

            curr_image_id += 1

    return {'info': info,
            'images': images,
            'annotations': annotations,
            'categories': categories,
            'licenses': licenses}


# Now we begin creating our COCO format json file using the labels and classes for each image. Note that create_coco_json
# is specific to the dataset we are working with, so it will need to be modified for different datasets. To see this, note
# that it names each image entry by assuming a specific naming convention for our .nii files:
#
#             f'd{file[1:-4]}-z#frame={i}.jpg'
#
# The dictionary passed in maps each class name to the value that marks it in the label files.
coco_dict = create_coco_json(f'{DATA_PATH}/labels/original', {'edema': 1, 'non-enhancing tumor': 2,'enhancing tumor': 3})

# Make sure the output directory exists (open() does not create it), then save the coco_dict as a json file.
os.makedirs(OUTPUT_PATH, exist_ok=True)
with open(os.path.join(OUTPUT_PATH, 'coco.json'), 'w') as f:
    json.dump(coco_dict, f)

## Culmination
Ultimately, after running the last cell, we should have a COCO formatted `.json` file named `coco.json` in our `OUTPUT_PATH` directory, which can be uploaded using Nexus' Upload / Export Annotations feature. 

Here, when uploading, remember to specify that we are importing a file of format `[Polygon / Mask] COCO Mask` since we used an RLE binary mask.