In [1]:
import glob
import os
import albumentations as albu
import cv2
from PIL import Image
from skimage import measure
from shapely.geometry import Polygon, MultiPolygon  
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as img
import numpy as np
from tqdm import tqdm
import json

In [3]:
# COCO category table: one entry per annotated cell compartment.
# Ids are assigned in listing order, starting at 1.
categories = [
    dict(supercategory=supercategory, id=category_id, name=label)
    for category_id, (supercategory, label) in enumerate(
        [("cell_nuc", "nucleus"), ("cell_cyt", "cytoplasm")], start=1
    )
]

In [4]:
# Define the data paths (relative to the notebook's working directory).
# The trailing slashes matter: later cells build paths by plain string
# concatenation with these roots.
img_root = "../TCIA_SegPC_dataset/train/x/"
mask_root = "../TCIA_SegPC_dataset/train/y/"
dest_root = "../TCIA_SegPC_dataset/coco_two_train/"

# os.listdir returns entries in arbitrary order; sort them so the images are
# processed (and recorded in the COCO file) in a reproducible order.
names = sorted(os.listdir(img_root))

In [5]:
# Make sure the output root and its three sub-folders exist
# ('' stands for the root itself).
for subfolder in ('', 'x', 'instance_y', 'semantic_y'):
    os.makedirs(dest_root+subfolder, exist_ok=True)

# Accumulators for the COCO "images" and "annotations" sections
images, annos = [], []

# Target size as (height, width); it is reversed to (width, height)
# wherever it is handed to cv2.resize.
res_size = (1080, 1440)

In [6]:
# Convert every image and its per-cell masks into COCO image/annotation records.

# Pixel values that select each class inside a raw cell mask.
NUCLEUS_PIXEL_VALUE = 40
CYTOPLASM_PIXEL_VALUE = 20


def build_instance_annotation(binary_mask, ann_id, image_id, category_id):
    """Build one COCO-style annotation dict from a binary instance mask.

    Only the external contour with the most points is used; any other
    contour found in the mask is ignored.

    Args:
        binary_mask: 2-D uint8 array that is non-zero where the instance is.
        ann_id: Value stored under the annotation's 'id' key.
        image_id: Value stored under 'image_id'.
        category_id: Value stored under 'category_id'.

    Returns:
        The annotation dict, or None when the mask contains no contour
        (i.e. the class is absent from this cell mask).
    """
    # findContours returns (contours, hierarchy) in OpenCV 4 but
    # (image, contours, hierarchy) in OpenCV 3; [-2] is the contour list in both.
    contours = cv2.findContours(binary_mask.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[-2]
    if len(contours) == 0:
        return None

    # max() returns the first contour among ties.
    largest = max(contours, key=lambda contour: contour.shape[0])

    # reshape (rather than squeeze) keeps an (n_points, 2) array even when the
    # contour consists of a single point. Degenerate contours are emitted as-is.
    points = largest.reshape(-1, 2)
    min_x, min_y = points.min(axis=0)
    max_x, max_y = points.max(axis=0)

    return {
        'id': ann_id,
        'image_id': image_id,
        # Flat [x0, y0, x1, y1, ...] polygon
        'segmentation': [points.ravel().astype('float64').tolist()],
        'area': cv2.contourArea(largest),
        # [x, y, width, height]; the origin is shifted by half a pixel
        'bbox': [float(min_x-0.5), float(min_y-0.5), float(max_x-min_x+1), float(max_y-min_y+1)],
        'iscrowd': 0,
        'category_id': category_id,
    }


# Start from empty lists so that re-running this cell does not append duplicates.
images = []
annos = []

# (id prefix, pixel value in the raw mask, COCO category id)
instance_classes = (
    ("999", NUCLEUS_PIXEL_VALUE, 1),
    ("888", CYTOPLASM_PIXEL_VALUE, 2),
)

for name in tqdm(names):

    # Read the image, resize it to res_size and save the resized copy
    image = np.array(Image.open(img_root+name))
    image = cv2.resize(image, res_size[::-1], interpolation=cv2.INTER_NEAREST)
    Image.fromarray(image).save(dest_root+'x/'+name)

    # The file stem doubles as the numeric image id
    h, w, _ = image.shape
    index = os.path.splitext(name)[0]
    image_id = int(index)

    images.append({
        'file_name': name,
        'height': h,
        'width': w,
        'id': image_id,
    })

    # Union of all cell masks of this image
    semantic_mask = np.zeros(res_size, dtype=np.uint8)

    # One mask file per cell, named "<image stem>_<suffix>"; sorted for a
    # reproducible annotation order.
    for mask_name in sorted(glob.glob(mask_root+index+"_*")):

        # Read the mask as grayscale and resize it like the image
        mask = cv2.imread(mask_name, 0)
        mask = cv2.resize(mask, res_size[::-1], interpolation=cv2.INTER_NEAREST)
        semantic_mask = np.maximum(semantic_mask, mask)

        # basename/splitext handle any path separator and extension length
        mask_stem = os.path.splitext(os.path.basename(mask_name))[0]

        for id_prefix, pixel_value, category_id in instance_classes:
            instance_mask = (mask == pixel_value).astype(np.uint8)

            # NOTE(review): ids are strings such as "999<mask stem>"; the COCO
            # format describes annotation ids as integers -- confirm that the
            # downstream tooling accepts string ids.
            mask_id = f"{id_prefix}{mask_stem}"

            # NOTE(review): matplotlib's imsave applies its default colormap to
            # this 0/1 array -- confirm that is the intended on-disk format.
            img.imsave(dest_root+'instance_y/'+mask_id+'.bmp', instance_mask)

            annotation = build_instance_annotation(instance_mask, mask_id, image_id, category_id)
            # Skip classes that are absent from this mask instead of crashing
            if annotation is not None:
                annos.append(annotation)

    # Save the semantic mask as an explicit 8-bit 0/255 image
    semantic_mask = (semantic_mask > 0).astype(np.uint8) * 255
    cv2.imwrite(dest_root+'semantic_y/'+name, semantic_mask)

100%|██████████| 298/298 [04:19<00:00,  1.15it/s]


In [7]:
# Assemble the top-level COCO structure from the records collected above
dataset = {
    "licenses": [],
    "images": images,
    "annotations": annos,
    "categories": categories,
}

# Serialize the structure and write it next to the converted images
coco_json_path = dest_root+'COCO.json'
with open(coco_json_path, 'w') as fp:
    fp.write(json.dumps(dataset))

In [8]:
# Report how many files ended up in the image and instance-mask folders
for label, subfolder in (
    ("number of images saved: ", 'x'),
    ("number of instances saved: ", 'instance_y'),
):
    print(label, len(os.listdir(dest_root+subfolder)))

number of images saved:  298
number of instances saved:  3286
