# Extract patches

In [34]:
import numpy as np 
#import SlideRunner.general.dependencies
from SlideRunner.dataAccess.database import Database
from SlideRunner.dataAccess.annotations import *
import os
import openslide
import sqlite3
import cv2
import sys

# Get annotation images for all three databases and train/test splits

## Setup

Create directories for all three databases
* MEL, ODAEL, CODAEL
* Each database has three different train/test splits
* So there will be 9 total directories created

In [35]:
# One (database, split) tuple per dataset directory: 3 databases x 3 splits.
databases = ['MEL','ODAEL','CODAEL']
splits = ['1','2','3']

dir_names = [(database, split) for database in databases for split in splits]

# Create Data/Data_CMC_<database>_<split>/{train,test}/{Mitosis,Nonmitosis}.
# os.makedirs is used instead of shelling out to `mkdir -p`: it does not depend
# on a POSIX shell, it creates all missing parent directories, and it raises an
# exception on failure instead of returning an exit status that nobody checks.
dirs = ['Mitosis', 'Nonmitosis']
for database, split in dir_names:
    for subset in ('train', 'test'):
        for k in dirs:
            os.makedirs(
                os.path.join('Data', 'Data_CMC_%s_%s' % (database, split), subset, k),
                exist_ok=True,  # re-running the cell must not fail on existing folders
            )


Define database file paths

In [36]:
# path to the main directory; prefix for every database path below
path = '../../'
# SQLite database files, in the same order as the labels in `databases`
DBs = [
    path + 'prework/databases/MITOS_WSI_CMC_MEL.sqlite',
    path + 'prework/databases/MITOS_WSI_CMC_ODAEL_TR.sqlite',
    path + 'prework/databases/MITOS_WSI_CMC_CODAEL_TR_ROI.sqlite',
]
# short label of each database
databases = ['MEL','ODAEL','CODAEL']
# pair each database file with its label: (file path, label)
DB_names = []
for i, db_file in enumerate(DBs):
    DB_names.append((db_file, databases[i]))


Define the test sets

In [37]:
# slide ids (as strings) that form the test set of each split
slidelist_test_1 = ['18', '3', '22', '10', '15', '21', '14']
slidelist_test_2 = ['1', '20', '17', '5', '2', '11', '16']
slidelist_test_3 = ['12', '13', '7', '19', '8', '6', '9']

# lookup: split label -> list of test slide ids
test_slides = {}
test_slides['1'] = slidelist_test_1
test_slides['2'] = slidelist_test_2
test_slides['3'] = slidelist_test_3

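**Important:** Set the image patch size with modeling in mind. Most pre-trained networks, such as ResNet, expect (224, 224) inputs, so patches of any other size must be resized or cropped before they are fed to such a network.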

In [38]:
# Side length of the square patch extracted around each annotation; it is
# passed to read_region as size=(patchSize, patchSize).
patchSize = 250

Set parameters
* Ensure `WSIpath` and `basepath` are set correctly for your environment

In [39]:
# both values start at 0
disagreedclass = agreedclass = 0
# path to WSI folder (depends on your setup)
WSIpath = '../../WSI/'
# absolute path to the working directory + Data folder
basepath = ''.join([os.getcwd(), '/Data/'])

List of slides function

In [40]:
def listOfSlides(DB):
    """Return the (uid, filename) rows of the Slides table.

    DB must provide execute(query) and fetchall(); the query is executed
    first and whatever fetchall() yields is returned unchanged.
    """
    query = 'SELECT uid,filename from Slides'
    DB.execute(query)
    rows = DB.fetchall()
    return rows

**WARNING:** This cell takes a long time to run and will consume on the order of tens of GB of local disk space, depending on the patch size

Get image patches for every annotation in each database and assign it to the correct directory

In [202]:
# Export one PNG patch per labelled spot annotation, for every database,
# slide and train/test split.

# agreed class id -> file name pattern inside the train/ or test/ folder;
# annotations with any other agreed class are not exported
patch_patterns = {2: 'Mitosis/%d.png', 1: 'Nonmitosis/%d.png'}
# offset from the annotation centre to the upper-left corner of the patch
half_patch = patchSize // 2

# outer for loop to go through each database
for database, name in DB_names:
    # instantiate database object and open database
    DB = Database().open(database)

    # first inner for loop to go through slides of DB
    for slideid, filename in listOfSlides(DB):
        # load the annotations of this slide into memory
        DB.loadIntoMemory(slideid)

        # train/test membership depends only on the slide, so it is resolved
        # once per slide instead of once per annotation
        subsets = [(split, 'test/' if str(slideid) in test_slides[split] else 'train/')
                   for split in splits]

        # open the whole-slide image with openslide
        slide = openslide.open_slide(WSIpath + filename)
        try:
            # second inner for loop to go through each annotation on the current slide
            for k in DB.annotations.keys():
                anno = DB.annotations[k]

                if anno.deleted or anno.annotationType != AnnotationType.SPOT:
                    continue

                # skip unlabelled annotations before the (expensive) region read:
                # nothing would be written for them anyway
                pattern = patch_patterns.get(anno.agreedClass)
                if pattern is None:
                    continue

                # upper-left corner of a patchSize x patchSize window centred on the annotation
                lu_x = int(anno.x1 - half_patch)
                lu_y = int(anno.y1 - half_patch)
                img = np.array(slide.read_region(location=(lu_x, lu_y), level=0, size=(patchSize, patchSize)))
                # drop the alpha channel and switch to the BGR order used by cv2.imwrite
                img = cv2.cvtColor(img, cv2.COLOR_RGBA2BGR)

                # third inner for loop to write the patch into the proper directory of every split
                for split, istest in subsets:
                    cv2.imwrite(basepath + 'Data_CMC_%s_%s/' % (name, split) + istest + pattern % k, img)
        finally:
            # release the slide handle even if extraction fails part-way through
            slide.close()
        print('Done with slide {} in database {}'.format(slideid, name))


Done with slide 1 in database CODAEL
Done with slide 2 in database CODAEL
Done with slide 3 in database CODAEL
Done with slide 5 in database CODAEL
Done with slide 6 in database CODAEL
Done with slide 7 in database CODAEL
Done with slide 8 in database CODAEL
Done with slide 9 in database CODAEL
Done with slide 10 in database CODAEL
Done with slide 11 in database CODAEL
Done with slide 12 in database CODAEL
Done with slide 13 in database CODAEL
Done with slide 14 in database CODAEL
Done with slide 15 in database CODAEL
Done with slide 16 in database CODAEL
Done with slide 17 in database CODAEL
Done with slide 18 in database CODAEL
Done with slide 19 in database CODAEL
Done with slide 20 in database CODAEL
Done with slide 21 in database CODAEL
Done with slide 22 in database CODAEL
