# Import Libraries

In [None]:
# Install the COCO Python API (from the PythonAPI subdirectory of cocoapi) and
# pycococreator straight from their GitHub repositories.
!pip install 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'
!pip install git+https://github.com/waspinator/pycococreator.git

In [None]:
# Import the libraries used for preprocessing
import os
import sys
import random
import math
import re
import time
import numpy as np
import json
import cv2
import matplotlib
import matplotlib.pyplot as plt
import scipy
import warnings
import shutil
import random

from PIL import Image
from pycococreatortools import pycococreatortools
from tqdm import tqdm

warnings.filterwarnings('ignore')

# Data Preparation
##### code from https://github.com/nolancardozo13/pathology_maskrcnn.git

In [None]:
def organize_data(image_path, mask_path, test_split, output_dir='Pannuke_dataset', seed=None):
    '''
    Organizes the images and masks and splits them into train and val folders.

    Every sample is written below ``output_dir`` as

        <output_dir>/<phase>/image_<i>/images/image_<i>.jpg
        <output_dir>/<phase>/image_<i>/masks/<cell_name>/masks_<n>.jpg

    where ``<phase>`` is ``train`` or ``val`` and ``<i>`` is the position of the
    sample after the hard-coded list of samples without cells has been removed.

    Args:
        image_path: path of the ``.npy`` file holding the images.
        mask_path: path of the ``.npy`` file holding the masks. ``masks[i]`` is
            read as ``[:, :, k]``; channel ``k`` is stored under
            ``cell_names[k]`` and the last channel is never exported
            (presumably the background channel -- confirm against the dataset).
        test_split: NOTE: despite its name, this is used as the *train*
            fraction. The first ``floor(test_split * n)`` shuffled samples go
            to ``train`` and the rest go to ``val``.
        output_dir: root folder of the generated tree. The default matches the
            folder name that used to be hard-coded.
        seed: optional seed for the shuffle. ``None`` (default) keeps using the
            global ``random`` state; any other value makes the split
            reproducible.

    Returns:
        None. The result is the folder tree written to disk.
    '''
    cell_names = ["neoplastic","inflammatory","softtissue","dead","epithelial"]
    index = []

    # Loading the data. Read-only mapping is enough: nothing is written back to
    # the .npy files, so they do not need to be writable.
    images = np.load(image_path, mmap_mode='r')
    masks = np.load(mask_path, mmap_mode='r')

    print('----------Loaded data----------')

    # Changing the datatype to reduce the size
    images = images.astype(np.int16)
    masks = masks.astype(np.int16)

    print('----------Reduced size----------')

    # Selecting the list of indexes of images with no cells. The list is only
    # valid for the array that holds exactly 2656 samples.
    if images.shape[0] == 2656:
        index = [584, 586, 604, 748, 750, 780, 811, 812, 813, 828, 830, 832, 833,
                 996, 998, 1147, 1148, 1149, 1152, 1155, 1158, 1160, 1161, 1164,
                 1166, 1432, 1433, 1512, 1578, 1614, 1615, 1616, 1617, 1618, 1619,
                 1620, 1629, 1632, 1704, 1705, 1707, 1708, 1709, 1723, 1724, 1725,
                 1748, 1749, 1750, 1751, 1752, 1753, 1859, 1864, 1870, 1880, 1923,
                 1939, 1940, 1945, 1946, 1966, 1967, 1968, 1969, 1970, 1971, 1972,
                 1973, 1974, 1975, 1976, 1977, 1978, 1979, 2007, 2009, 2019, 2020,
                 2022, 2098, 2108, 2109, 2110, 2111, 2115, 2131, 2132, 2133, 2134,
                 2135, 2137, 2163, 2164, 2165, 2174, 2176, 2202, 2263, 2264, 2265,
                 2267, 2406, 2407, 2462, 2463, 2464, 2465, 2515, 2550, 2551, 2552,
                 2626, 2636, 2639, 2640]

    # Deleting indexes with images which contain no cells. np.delete copies the
    # whole array, so it is skipped when there is nothing to remove.
    if index:
        images = np.delete(images, index, 0)
        masks = np.delete(masks, index, 0)

    print('----------Removed images with no cells----------')

    indices = list(range(len(images)))
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(indices)
    # A set keeps the per-image membership test below O(1).
    train_indices = set(indices[:math.floor(test_split*len(indices))])

    print('----------Splitting indices----------')

    # Organising folders
    os.makedirs(output_dir, exist_ok=True)

    for i, img in enumerate(images):
        phase = "train" if i in train_indices else "val"
        sample_name = 'image_' + str(i)
        sample_dir = os.path.join(output_dir, phase, sample_name)

        image_dir = os.path.join(sample_dir, 'images')
        os.makedirs(image_dir, exist_ok=True)
        im = Image.fromarray(img.astype(np.uint8))
        im.save(os.path.join(image_dir, sample_name + '.jpg'))

        sample_masks = masks[i]
        # The last channel is deliberately left out (see docstring).
        for k in range(sample_masks.shape[2] - 1):
            cell_mask = sample_masks[:, :, k]
            unique_colors = np.unique(cell_mask)
            if len(unique_colors) <= 1:
                # A single value means there is nothing to separate here.
                continue

            class_dir = os.path.join(sample_dir, 'masks', cell_names[k])
            os.makedirs(class_dir, exist_ok=True)

            # np.unique returns sorted values; the smallest one is skipped
            # (presumably the background value 0 -- confirm). Every other
            # value becomes its own binary mask file.
            for mask_no, color in enumerate(unique_colors[1:]):
                ms = Image.fromarray(cell_mask == color)
                ms.save(os.path.join(class_dir, 'masks_' + str(mask_no) + '.jpg'))
    print('----------Finished organising----------')

In [None]:
# Copy the image and mask arrays from the input dataset folder into the
# current working directory.
!cp -r ../input/cancer-inst-segmentation-and-classification/Images/images.npy ./
!cp -r ../input/cancer-inst-segmentation-and-classification/Masks/masks.npy ./

In [None]:
# Locations of the image and mask arrays in the working directory.
image_path = './images.npy'
mask_path = './masks.npy'

# 0.8 is passed as `test_split`, which organize_data uses as the share of
# samples written to the train folder; the remainder goes to val.
organize_data(image_path, mask_path, 0.8)

In [None]:
# Collect the per-sample images in one flat folder, the layout this notebook
# prepares for Detectron2.
def final_directory_structure(input_path, output_path):
    '''
    Copies the image of every sample folder into a single output folder.

    For each entry ``<name>`` found in ``input_path`` the file
    ``<input_path>/<name>/images/<name>.jpg`` is copied to
    ``<output_path>/<name>.jpg``.

    Args:
        input_path: folder containing one sub-folder per sample. A trailing
            slash is optional.
        output_path: destination folder; created if it does not exist. A
            trailing slash is optional.

    Returns:
        None.
    '''
    print(input_path)
    images = os.listdir(input_path)

    os.makedirs(output_path, exist_ok=True)

    for img in tqdm(images):
        source = os.path.join(input_path, img, 'images', img + '.jpg')
        target = os.path.join(output_path, img + '.jpg')
        shutil.copyfile(source, target)
        

In [None]:
# Flatten the train split: per-sample folders in, one folder of .jpg files out.
train_input_path = "./Pannuke_dataset/train/"
train_output_path = './dataset/train/'

final_directory_structure(train_input_path, train_output_path)

In [None]:
# Flatten the val split: per-sample folders in, one folder of .jpg files out.
val_input_path = "./Pannuke_dataset/val/"
val_output_path = './dataset/val/'

final_directory_structure(val_input_path, val_output_path)

In [None]:
def pannuke_to_coco_format(image_path, 
                           output_path, 
                           categories = ("neoplastic","inflammatory","softtissue","dead","epithelial"), 
                           dataset_name = "pannuke"):
    '''
    Converts a split stored in the per-sample folder layout into one COCO-style
    JSON file, which makes it easier to apply Detectron2 algorithms to it.

    Expected input layout, per entry ``<name>`` of ``image_path``:

        <image_path>/<name>/images/<name>.jpg
        <image_path>/<name>/masks/<category>/<mask file>

    Args:
        image_path: folder holding one sub-folder per sample. A trailing slash
            is optional.
        output_path: path of the JSON file to write. Its parent folder is
            created when missing.
        categories: category names. A category's COCO id is its position in
            this sequence plus one. Every folder below ``masks/`` must be named
            after one of them, otherwise ``ValueError`` is raised.
        dataset_name: stored as the ``supercategory`` of every category.

    Returns:
        None. The COCO dictionary is written to ``output_path``.

    Notes:
        * Samples and masks are processed in sorted name order so that image
          and annotation ids are the same on every run.
        * A sample without a ``masks/`` folder is kept as an image with no
          annotations.
        * Annotation ids advance for every mask file, including those for which
          ``create_annotation_info`` returns ``None``, so ids may have gaps.
    '''
    categories = list(categories)
    images_name = sorted(os.listdir(image_path))
    cocoformat = {"licenses":[], "info":[], "images":[], "annotations":[], "categories":[]}

    for category_id, category_name in enumerate(categories, start=1):
        cocoformat["categories"].append({"id": category_id, "name": category_name, "supercategory": dataset_name})

    m_id = 1

    # tqdm wraps the list itself (not the enumerate object) so the progress bar
    # knows the total number of samples.
    for image_id, img in enumerate(tqdm(images_name), start=1):
        sample_dir = os.path.join(image_path, img)

        # Only the size is needed; close the file handle right away.
        with Image.open(os.path.join(sample_dir, "images", img + ".jpg")) as image:
            image_size = image.size

        image_info = pycococreatortools.create_image_info(image_id,
                                                          img + ".jpg",
                                                          image_size)
        cocoformat["images"].append(image_info)

        masks_root = os.path.join(sample_dir, "masks")
        if not os.path.isdir(masks_root):
            # No mask folder was written for this sample: nothing to annotate.
            continue

        for c in sorted(os.listdir(masks_root)):
            category_info = {'id': int(categories.index(c)+1), 'is_crowd': False}
            class_dir = os.path.join(masks_root, c)

            for msk in sorted(os.listdir(class_dir)):
                with Image.open(os.path.join(class_dir, msk)) as mask_image:
                    # Binarise with a plain threshold. convert('1') would apply
                    # Floyd-Steinberg dithering by default, which turns the
                    # compression noise of a .jpg mask into speckles.
                    m_image = (np.asarray(mask_image.convert('L')) > 127).astype(np.uint8)

                annotation_info = pycococreatortools.create_annotation_info(
                    m_id, image_id, category_info, m_image,
                    image_size, tolerance=2)
                m_id = m_id + 1

                if annotation_info is not None:
                    cocoformat["annotations"].append(annotation_info)

    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, "w") as f:
        json.dump(cocoformat, f)

In [None]:
# Write the COCO annotation file for the train split.
# NOTE: train_output_path is rebound here; earlier in the notebook it held the
# flat train image folder, from this cell on it is the JSON file path.
train_path = './Pannuke_dataset/train/'
train_output_path = './dataset/train.json'

pannuke_to_coco_format(image_path = train_path, 
                       output_path = train_output_path, 
                       categories = ["neoplastic","inflammatory","softtissue","dead","epithelial"])

In [None]:
# Write the COCO annotation file for the val split.
# NOTE: val_output_path is rebound here; earlier in the notebook it held the
# flat val image folder, from this cell on it is the JSON file path.
val_path = './Pannuke_dataset/val/'
val_output_path = './dataset/val.json'

pannuke_to_coco_format(image_path = val_path, 
                       output_path = val_output_path, 
                       categories = ["neoplastic","inflammatory","softtissue","dead","epithelial"])

# Zip the converted dataset

In [None]:
# Pack the ./dataset folder recursively into dataset.zip.
!zip -r ./dataset.zip ./dataset