# Extract patches

In [2]:
import numpy as np
import pandas as pd
#import SlideRunner.general.dependencies
from SlideRunner.dataAccess.database import Database
from SlideRunner.dataAccess.annotations import *
import os
import openslide
import sqlite3
import cv2
import sys
import random

# Get centered image patches for cell/image patch classifier

Generate centered image patches (i.e., annotation in center of patch) of a specified size for every database (total 3) and test slide list combination (total 3).

## Setup

Create directories for all three databases
* MEL, ODAEL, CODAEL
* Each database has three different train/test splits
* So there will be 9 total directories created

In [101]:
# Build the output directory tree: one Data/Data_CMC_<database>_<split> folder per
# (database, split) combination, each holding train/ and test/ sub-trees with one
# folder per class.
databases = ['MEL','ODAEL','CODAEL']
splits = ['1','2','3']

# every (database, split) pair, in database-major order (9 tuples in total)
dir_names = [(database, split) for database in databases for split in splits]

# class sub-folders created under both train/ and test/
dirs = ['Mitosis', 'Nonmitosis']

# create the directories in a folder named Data (relative to the working directory)
for database, split in dir_names:
    root = os.path.join('Data', 'Data_CMC_%s_%s' % (database, split))
    for subset in ('train', 'test'):
        for k in dirs:
            # os.makedirs(..., exist_ok=True) is the portable equivalent of `mkdir -p`:
            # it creates missing parents, tolerates existing folders, and raises on a
            # real failure instead of returning an ignored shell exit status.
            os.makedirs(os.path.join(root, subset, k), exist_ok=True)


Define database file paths

In [15]:
# root of the project, relative to this notebook
path = '../'

# SQLite annotation databases, listed in the same order as their labels below
DBs = [
    path+'prework/databases/MITOS_WSI_CMC_MEL.sqlite',
    path+'prework/databases/MITOS_WSI_CMC_ODAEL_TR.sqlite',
    path+'prework/databases/MITOS_WSI_CMC_CODAEL_TR_ROI.sqlite',
]

# short label for each database (position i labels DBs[i])
databases = ['MEL','ODAEL','CODAEL']

# pair every database file with its label: [(db_path, label), ...]
DB_names = list(zip(DBs, databases))


Define the test sets

In [16]:
# Slide ids (as strings) held out for testing, keyed by split label.
test_slides = {
    '1': ['18','3', '22','10','15','21','14'],
    '2': ['1', '20','17','5', '2', '11','16'],
    '3': ['12','13','7', '19','8', '6', '9'],
}

# individual names for each split's list (same list objects as the dict values)
slidelist_test_1 = test_slides['1']
slidelist_test_2 = test_slides['2']
slidelist_test_3 = test_slides['3']


**Important:** Set image patch size with modeling in mind. Most pre-trained networks like ResNet require (224,224) inputs

In [17]:
# side length, in pixels, of the square patches read from the slides
# (passed as size=(patchSize, patchSize) when reading a region)
patchSize = 512

Set parameters
* Ensure `WSIpath` and `basepath` are set correctly for your environment

In [18]:
# NOTE(review): these two values are not referenced by any other cell shown in this
# notebook — confirm whether they are still needed
disagreedclass = 0
agreedclass = 0
# path to WSI folder (depends on your setup); slide filenames are appended directly,
# so keep the trailing slash
WSIpath='../WSI/'
# absolute path to the working directory + Data folder (trailing slash required,
# output paths are built by string concatenation)
basepath = os.getcwd()+'/Data/'

List of slides function

In [9]:
def listOfSlides(DB):
    """Return every (uid, filename) row of the ``Slides`` table.

    DB must provide ``execute(sql)`` and ``fetchall()``; the result of
    ``fetchall()`` is returned unchanged.
    """
    query = 'SELECT uid,filename from Slides'
    DB.execute(query)
    rows = DB.fetchall()
    return rows

**WARNING:** Takes a long time to run and will consume on the order of tens of GB of local disk space, depending on the image size

Get image patches for every annotation in each database and assign it to the correct directory

In [10]:
# Export one centered patch per SPOT annotation, for every database and every
# train/test split. Relies on names defined in earlier cells: DB_names, splits,
# test_slides (dict: split label -> list of test slide ids), patchSize, WSIpath,
# basepath and listOfSlides.

# agreedClass value -> output sub-folder; annotations of any other class are not exported
class_dirs = {2: 'Mitosis', 1: 'Nonmitosis'}

# half the patch side, used to place the annotation in the patch center
half_patch = int(patchSize/2)

# outer for loop to go through each database
for database, name in DB_names:
    # instantiate database object and open database
    DB = Database().open(database)

    # first inner for loop to go through slides of DB
    for slideid,filename in listOfSlides(DB):
        # load the annotations of this slide and open the image w/openslide
        DB.loadIntoMemory(slideid)

        slide=openslide.open_slide(WSIpath+filename)
        try:
            # second inner for loop to go through each annotation on the current slide
            for k in DB.annotations.keys():

                anno = DB.annotations[k]

                if anno.deleted or anno.annotationType != AnnotationType.SPOT:
                    continue

                # decide the target folder before touching the image: annotations whose
                # class is never written must not pay for a region read
                label_dir = class_dirs.get(anno.agreedClass)
                if label_dir is None:
                    continue

                # left upper (lu) corner of the patch so that the annotation sits in its center
                lu_x = int(anno.x1 - half_patch)
                lu_y = int(anno.y1 - half_patch)
                img = np.array(slide.read_region(location=(lu_x, lu_y), level=0, size=(patchSize, patchSize)))
                # read_region yields RGBA; cv2.imwrite expects BGR channel order
                img = cv2.cvtColor(img, cv2.COLOR_RGBA2BGR)

                # third inner for loop to assign annotation/image to proper directory of current database
                for split in splits:
                    istest = 'train/' if str(slideid) not in test_slides[split] else 'test/'
                    out_file = '%sData_CMC_%s_%s/%s%s/%d.png' % (basepath, name, split, istest, label_dir, k)
                    # cv2.imwrite reports failure (e.g. missing directory) only through its
                    # return value; surface it instead of silently producing no dataset
                    if not cv2.imwrite(out_file, img):
                        raise OSError('Could not write image patch to %s' % out_file)
        finally:
            # release the slide handle before moving on to the next slide
            slide.close()

        print('Done with slide {} in database {}'.format(slideid, name))


Done with slide 1 in database MEL
Done with slide 2 in database MEL
Done with slide 3 in database MEL
Done with slide 5 in database MEL
Done with slide 6 in database MEL
Done with slide 7 in database MEL
Done with slide 8 in database MEL
Done with slide 9 in database MEL
Done with slide 10 in database MEL
Done with slide 11 in database MEL
Done with slide 12 in database MEL
Done with slide 13 in database MEL
Done with slide 14 in database MEL
Done with slide 15 in database MEL
Done with slide 16 in database MEL
Done with slide 17 in database MEL
Done with slide 18 in database MEL
Done with slide 19 in database MEL
Done with slide 20 in database MEL
Done with slide 21 in database MEL
Done with slide 22 in database MEL
Done with slide 1 in database CODAEL
Done with slide 2 in database CODAEL
Done with slide 3 in database CODAEL
Done with slide 5 in database CODAEL
Done with slide 6 in database CODAEL
Done with slide 7 in database CODAEL
Done with slide 8 in database CODAEL
Done with slid

## Non-centered image patches and bounding box info for object detection

Generates random (not-centered) 512x512 crops of mitosis annotations and stores bounding box info. Database and test slide list defined explicitly instead of looping through every combination.

In [103]:
# Column names of the bounding-box tables; the order here is the key order of both
# dictionaries below.
_bbox_columns = ('image_id', 'xmin', 'ymin', 'xmax', 'ymax', 'labels', 'WSI_coord', 'slideid')

# one empty list per column, kept separately for the train and the test set
lbl_bbox_train = {column: [] for column in _bbox_columns}
lbl_bbox_test = {column: [] for column in _bbox_columns}

List of slides function

In [104]:
def listOfSlides(DB):
    """Fetch all (uid, filename) rows from the ``Slides`` table of DB.

    Same definition as in the other sections of this notebook. DB only needs
    ``execute(sql)`` and ``fetchall()``; whatever ``fetchall()`` returns is
    handed back as-is.
    """
    DB.execute('SELECT uid,filename from Slides')
    slide_rows = DB.fetchall()
    return slide_rows

Define database and test slide set explicitly instead of looping through every combo

In [105]:
# downsampling factor applied to the bounding box coordinates (the original authors used 1)
down_factor = 1

# bounding box 'radius': half the side length of the square box, in pixels
r = 25

# path to WSI folder (depends on your setup)
WSIpath='../WSI/'
# absolute path to the working directory + Data folder
basepath = os.getcwd()+'/Data/'
# path to database
database = '../prework/databases/MITOS_WSI_CMC_CODAEL_TR_ROI.sqlite'
# label of the database, used in the output directory name
name = 'CODAEL'
# NOTE: from this cell on `test_slides` is a plain list of slide ids; earlier in the
# notebook it is a dict keyed by split label, so cells that index it by split must not
# be re-run after this one without redefining it
test_slides = ['18','3', '22','10','15','21','14']
# label of the train/test split, used in the output directory name
split = '1'

**WARNING:** Takes a long time to run and will consume on the order of tens of GB of local disk space, depending on the image size

Get non-centered image patches and bounding boxes for every annotation in the specified database and assign it to the correct directory

Exports .csv file of train and test set bounding boxes (and other info) to current working directory

In [106]:
# Export one randomly offset (non-centered) patch per mitosis annotation, record the
# bounding box of the annotation inside that patch, and write the train/test box tables
# to CSV. Relies on names defined in earlier cells: database, name, split, test_slides
# (list of test slide ids), patchSize, r, down_factor, WSIpath, basepath,
# lbl_bbox_train, lbl_bbox_test and listOfSlides.
DB = Database().open(database)

# private, seeded generator: re-running the cell reproduces the same crops and the
# global `random` state is left untouched
rng = random.Random(42)

# define how close to edge of image annotation can be for random crops (don't want them cutoff by edge)
pad = 75 # this puts the edge of the 50 px bounding box at least 50 px from any edge of the image

# side length of the square bounding box (loop invariant)
# NOTE(review): the patch itself is read at full resolution below, so the scaled box
# coordinates only match the image when down_factor == 1 or the image is resized elsewhere
d = 2 * r / down_factor

# first inner for loop to go through slides of DB
for slideid,filename in listOfSlides(DB):
    # load the annotations of this slide and open the image w/openslide
    DB.loadIntoMemory(slideid)

    # train/test membership depends on the slide only, so decide it once per slide
    in_test = str(slideid) in test_slides
    istest = 'test/' if in_test else 'train/'
    dic = lbl_bbox_test if in_test else lbl_bbox_train

    slide=openslide.open_slide(WSIpath+filename)
    try:
        # second inner for loop to go through each annotation on the current slide
        for k in DB.annotations.keys():

            anno = DB.annotations[k]

            # only mitosis annotations (class = 2) are exported; testing `!= 2` rather than
            # `== 1` keeps any other class out of the Mitosis folder and the box tables
            if anno.deleted or anno.annotationType != AnnotationType.SPOT or anno.agreedClass != 2:
                continue

            coord_x = anno.x1
            coord_y = anno.y1
            WSI_coord = [int(coord_x), int(coord_y)]

            # random offsets of the annotation from the left upper (lu) corner of the patch;
            # these are the coordinates (x, y) of the annotation relative to the patch
            x = rng.randint(pad, int(patchSize - pad))
            y = rng.randint(pad, int(patchSize - pad))

            # bounding box coordinates relative to the patch
            x_min = (x - r) / down_factor
            y_min = (y - r) / down_factor
            x_max = x_min + d
            y_max = y_min + d

            # left upper corner of the patch in slide coordinates
            lu_x = int(coord_x - x)
            lu_y = int(coord_y - y)

            # generate image (read_region yields RGBA; cv2.imwrite expects BGR)
            img = np.array(slide.read_region(location=(lu_x, lu_y), level=0, size=(patchSize, patchSize)))
            img = cv2.cvtColor(img, cv2.COLOR_RGBA2BGR)

            # write image to directory; cv2.imwrite reports failure only via its return value
            out_file = '%sData_CMC_%s_%s/%sMitosis/%d.png' % (basepath, name, split, istest, k)
            if not cv2.imwrite(out_file, img):
                raise OSError('Could not write image patch to %s' % out_file)

            # update dictionary values
            dic['slideid'].append(slideid)
            dic['image_id'].append(str(k))
            dic['WSI_coord'].append(WSI_coord)
            dic['xmin'].append(int(x_min))
            dic['ymin'].append(int(y_min))
            dic['xmax'].append(int(x_max))
            dic['ymax'].append(int(y_max))
    finally:
        # release the slide handle before moving on to the next slide
        slide.close()

    print('Done with slide {} in database {}'.format(slideid, name))

# every exported box is a mitosis, so all labels are 1
lbl_bbox_train['labels'] = np.ones(len(lbl_bbox_train['image_id']), dtype=int)
lbl_bbox_test['labels'] = np.ones(len(lbl_bbox_test['image_id']), dtype=int)

# format dataframes: sort by image id and renumber the rows
# (image_id holds strings, so the sort order is lexicographic)
train_df = pd.DataFrame(lbl_bbox_train).sort_values(by=['image_id']).reset_index(drop=True)
test_df = pd.DataFrame(lbl_bbox_test).sort_values(by=['image_id']).reset_index(drop=True)

# export dfs as csvs
train_df.to_csv(os.getcwd()+'/train_annotations.csv')
test_df.to_csv(os.getcwd()+'/test_annotations.csv')

Done with slide 1 in database CODAEL
Done with slide 2 in database CODAEL
Done with slide 3 in database CODAEL
Done with slide 5 in database CODAEL
Done with slide 6 in database CODAEL
Done with slide 7 in database CODAEL
Done with slide 8 in database CODAEL
Done with slide 9 in database CODAEL
Done with slide 10 in database CODAEL
Done with slide 11 in database CODAEL
Done with slide 12 in database CODAEL
Done with slide 13 in database CODAEL
Done with slide 14 in database CODAEL
Done with slide 15 in database CODAEL
Done with slide 16 in database CODAEL
Done with slide 17 in database CODAEL
Done with slide 18 in database CODAEL
Done with slide 19 in database CODAEL
Done with slide 20 in database CODAEL
Done with slide 21 in database CODAEL
Done with slide 22 in database CODAEL


## Generate WSI sections for deployment demonstration

This will take the defined database and slide # and generate an image patch of a specified size from a specified section of the WSI. The point of this is to generate a large patch to pass through the ML pipeline.

List of slides function

In [3]:
def listOfSlides(DB):
    """List the slides stored in DB as (uid, filename) rows.

    Same definition as in the earlier sections of this notebook. Runs the
    query through ``DB.execute`` and returns the result of ``DB.fetchall()``
    unchanged.
    """
    sql = 'SELECT uid,filename from Slides'
    DB.execute(sql)
    return DB.fetchall()

Define database, slideid, and paths

In [18]:
# uid of the slide to cut the section from (see the slide listing below for valid ids)
my_slide = 21
# path to database (CODAEL variant); the former `database = 'CODAEL'` assignment was
# dead code because it was overwritten by this line before ever being read
database = '../prework/databases/MITOS_WSI_CMC_CODAEL_TR_ROI.sqlite'
# path to WSI folder (depends on your setup)
WSIpath='../WSI/'
# path to save image to: an `img` folder inside the working directory, instead of an
# absolute path that only exists on one machine
savepath = os.path.join(os.getcwd(), 'img')

Define upper left corner of image patch and patch size

In [20]:
# upper left corner of the image patch: (x, y)
lu_x, lu_y = 84944, 33792
# extent of the patch: (width, height)
width, height = 7008, 5248
# leave at level 0
level = 0


List of slides and associated filenames for reference

In [19]:
DB = Database().open(database)

# gather the two columns of the Slides table into parallel lists
slide_ids, slide_files = [], []
for slideid,filename in listOfSlides(DB):
    slide_ids.append(slideid)
    slide_files.append(filename)

# tabulate slide id against file name for reference
df = pd.DataFrame({'slideid': slide_ids, 'filename': slide_files})
print(df)

    slideid                  filename
0         1  a8773be388e12df89edd.svs
1         2  460906c0b1fe17ea5354.svs
2         3  2d56d1902ca533a5b509.svs
3         5  d0423ef9a648bb66a763.svs
4         6  50cf88e9a33df0c0c8f9.svs
5         7  da18e7b9846e9d38034c.svs
6         8  d7a8af121d7d4f3fbf01.svs
7         9  2191a7aa287ce1d5dbc0.svs
8        10  69a02453620ade0edefd.svs
9        11  c4b95da36e32993289cb.svs
10       12  fa4959e484beec77543b.svs
11       13  72c93e042d0171a61012.svs
12       14  4eee7b944ad5e46c60ce.svs
13       15  b1bdee8e5e3372174619.svs
14       16  3d3d04eca056556b0b26.svs
15       17  084383c18b9060880e82.svs
16       18  e09512d530d933e436d5.svs
17       19  d37ab62158945f22deed.svs
18       20  deb768e5efb9d1dcbc13.svs
19       21  022857018aa597374b6c.svs
20       22  13528f1921d4f1f15511.svs


In [26]:
# Cut one large region out of slide `my_slide` and save it as a PNG in `savepath`.
# Relies on names defined in earlier cells: database, my_slide, WSIpath, savepath,
# lu_x, lu_y, width, height and listOfSlides.
DB = Database().open(database)

# cv2.imwrite does not create missing folders, so make sure the target exists
os.makedirs(savepath, exist_ok=True)

found_slide = False
# go through the slides of DB and pick the requested one
for slideid,filename in listOfSlides(DB):
    if slideid != my_slide:
        continue
    found_slide = True

    # load slide into memory and open w/openslide
    DB.loadIntoMemory(slideid)

    slide=openslide.open_slide(WSIpath+filename)
    try:
        # read the requested region at full resolution (RGBA)
        img = np.array(slide.read_region(location=(lu_x, lu_y), level=0, size=(width, height)))
    finally:
        # release the slide handle as soon as the pixels are in memory
        slide.close()

    # cv2.imwrite expects BGR channel order
    img = cv2.cvtColor(img, cv2.COLOR_RGBA2BGR)

    out_file = os.path.join(savepath, 'test_slide_%d.png' % slideid)
    # cv2.imwrite reports failure only through its return value
    if not cv2.imwrite(out_file, img):
        raise OSError('Could not write image to %s' % out_file)

if not found_slide:
    print('Slide {} not found in database {}'.format(my_slide, database))