## setup

In [None]:
# Mount Google Drive at /content/drive so the notebook can reach files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Switch the working directory to the thesis folder on the mounted Drive; the
# relative paths used further down ('./Images/...', './patch_logs/') are
# resolved against it.
#%cd "/content/drive/MyDrive/shared folder/Master_Thesis"
%cd "/content/drive/MyDrive/shared_folder/Master_Thesis"

/content/drive/MyDrive/shared_folder/Master_Thesis


In [None]:
# Install the OpenSlide command-line tools via apt, then the `openslide-python`
# binding via pip.
# NOTE(review): neither package is version-pinned, so a fresh run may pick up a
# different release than the one this notebook was developed against.
!apt update && apt install -y openslide-tools
!pip install openslide-python

In [None]:
import openslide
import openslide as opsl

In [None]:
import numpy as np
import shutil
import os
from skimage import io
import glob
import datetime
import logging
from pathlib import Path
from PIL import Image
import numpy as np

## configuration

In [None]:
# configuration

## xml_to_region
import lxml.etree as ET

def xml_to_region(xml_file):
    """
    Read an XML annotation file and collect the vertices of every region.

    Every ``Annotation`` element is visited (no filtering on ``LineColor``),
    and every ``Regions/Region`` below it contributes one entry to each of
    the two returned lists, in document order.

    :param xml_file: path (or file object) accepted by ``ET.parse``
    :return: ``(region_list, region_class)`` where ``region_list[i]`` is the
             list of ``Vertex`` attribute mappings (carrying e.g. ``'X'`` and
             ``'Y'`` as strings) of the i-th region and ``region_class[i]`` is
             that region's ``Type`` attribute (``None`` when absent)
    """
    annotation_tree = ET.parse(xml_file)

    polygons = []
    polygon_types = []
    for annotation in annotation_tree.findall('.//Annotation'):
        for region in annotation.findall('./Regions/Region'):
            polygon_types.append(region.attrib.get('Type'))
            # Keep the raw attribute mapping of each vertex; callers pick out
            # 'X' and 'Y' themselves.
            polygons.append(
                [vertex.attrib for vertex in region.findall('.//Vertices/Vertex')]
            )

    return polygons, polygon_types



## region_handler
from PIL import ImageDraw, Image

def region_handler(im, region_list,region_class, level_downsample):
    """
    Draw annotated regions onto ``im`` in black and return the same image.

    Only regions whose class string is ``'0'`` or ``'3'`` are drawn; all
    others are skipped. Vertex coordinates are divided by
    ``level_downsample`` and truncated to ints before drawing.

    :param im: PIL image to draw on (modified in place)
    :param region_list: list of regions, each a list of mappings with 'X'/'Y'
    :param region_class: class string for each region, indexed like region_list
    :param level_downsample: factor dividing the vertex coordinates
    :return: ``im`` with the outlines drawn (12 px wide)
    """
    canvas = ImageDraw.Draw(im)

    for idx, vertices in enumerate(region_list):
        kind = region_class[idx]
        if kind not in ('0', '3'):
            continue

        scaled_points = [
            (int(float(vertex['X']) / level_downsample),
             int(float(vertex['Y']) / level_downsample))
            for vertex in vertices
        ]

        if kind == '3':
            # NOTE(review): ImageDraw.arc treats its first argument as a
            # bounding box - confirm class-'3' regions always carry exactly
            # the two corner vertices that call expects.
            canvas.arc(scaled_points, 0, 360, fill='#000000', width=12)
        else:
            canvas.line(scaled_points, fill="#000000", width=12)

    return im

## Slide



class Slide(openslide.OpenSlide):
    """
    Convenience wrapper around ``openslide.OpenSlide`` for one slide file.

    Adds accessors for the file path / base name, per-level geometry, a
    level-sized thumbnail and the pixel size.

    The file is opened exactly once (by the base class). ``self.slide`` is kept
    as a read-only alias of the instance itself so that existing code written
    against ``obj.slide.<openslide attribute>`` keeps working without a second
    file handle.

    :param svs_file: path of the slide file to open
    :param level: pyramid level remembered by the instance (see ``get_level``);
                  note that the ``get_*`` helpers below take their own
                  ``level`` argument and do not read this value
    """

    def __init__(self, svs_file, level=2):

        super().__init__(svs_file)
        self._filepath = svs_file
        # Everything before the first '.' of the file name.
        self._basename = os.path.basename(svs_file).split('.')[0]
        self._level = level

    @property
    def slide(self):
        """Backward-compatible alias: the underlying OpenSlide object is ``self``."""
        return self

    def get_basename(self):
        """Return the file name up to (not including) its first '.'."""
        return self._basename

    def get_filepath(self):
        """Return the path this slide was opened from."""
        return self._filepath

    def get_level(self):
        """Return the level passed to the constructor."""
        return self._level

    def get_level_count(self):
        """Return the number of pyramid levels in the slide."""
        return len(self.level_downsamples)

    def get_level_downsample(self, level=2):
        """Return the downsample factor of ``level`` relative to level 0."""
        return self.level_downsamples[level]

    def get_level_dimension(self, level=2):
        """Return the ``(width, height)`` of ``level``."""
        return self.level_dimensions[level]

    def get_thumb(self, level=2):
        """Return a thumbnail bounded by the dimensions of ``level``."""
        level_dimension = self.get_level_dimension(level)
        tile = self.get_thumbnail(level_dimension)

        return tile

    def svs_to_png(self,save_dir):
        """
        Save the default thumbnail (``get_thumb()``) to ``save_dir``.

        Despite its name, ``save_dir`` is handed straight to the image's
        ``save`` method, so it must be the output *file* path.
        """
        self.get_thumb().save(save_dir)

    def expand_img(self, im, size, value=(0, 0, 0)):
        """
        Paste ``im`` into the top-left corner of a new RGB canvas.

        :param im: image to paste
        :param size: ``(width, height)`` of the new canvas
        :param value: fill colour of the canvas
        :return: the new image
        """
        im_new = Image.new("RGB", size, value)
        im_new.paste(im, (0, 0))

        return im_new

    def get_mpp(self):
        """
        Return the ``openslide.mpp-x`` property divided by 1000, as a float.

        OpenSlide documents ``openslide.mpp-x`` as microns per pixel, so the
        result is presumably millimetres per pixel. Raises ``KeyError`` when
        the slide does not carry that property.
        """
        # Built-in float: the ``np.float`` alias was removed in NumPy 1.24.
        return float(self.properties['openslide.mpp-x']) / 1000

    def __del__(self):
        # Best-effort release of the native handle. A finalizer must never
        # raise: __init__ may have failed before the handle existed, or the
        # slide may already have been closed explicitly.
        try:
            self.close()
        except Exception:
            pass


## Cut Patch

In [None]:
def openslide_cut_patch(filename,label_result_dir,livel,patch_size,save_dir, subname):
    """
    Cut one whole-slide image into level-0 patches and save the kept ones as JPEG.

    The slide is tiled into non-overlapping ``patch_size`` x ``patch_size``
    windows at level 0 (partial windows at the right/bottom edge are dropped).
    A window is kept when

    1. the matching window of the level-``livel`` thumbnail (with the XML
       annotations drawn on it, if an XML file exists) passes the non-zero
       coverage test, and
    2. at most 50 % of its pixels are "extreme", i.e. have every RGB channel
       at least 93 away from 107 (<= 14 or >= 200).

    :param filename: path of the slide file
    :param label_result_dir: directory searched for ``<name>.<second part>.xml``
    :param livel: pyramid level used for the thumbnail / annotation overlay
    :param patch_size: patch edge length in level-0 pixels
    :param save_dir: parent directory; patches go into a per-slide sub-folder
    :param subname: prefix (e.g. cohort tag) prepended to the output names
    :return: number of patches written
    :raises IndexError: if ``livel`` is not a valid level, the file name has no
                        '.', or the tagged name has fewer than seven
                        '-'-separated parts
    """
    slide = opsl.open_slide(filename)
    slide_xml = None
    try:
        # First two '.'-separated parts of the file name, e.g. "<barcode>.<uuid>".
        name_parts = os.path.basename(filename).split('.')
        file_name = name_parts[0] + "." + name_parts[1]

        level_downsamples = slide.level_downsamples[livel]
        full_width, full_height = slide.level_dimensions[0]
        w_count = int(full_width // patch_size)
        h_count = int(full_height // patch_size)

        # Thumbnail and annotation scaling both follow `livel`, so they stay
        # consistent with the window arithmetic below (previously they were
        # fixed at the helpers' default level 2 regardless of `livel`).
        xml_file = os.path.join(label_result_dir, file_name + '.xml')
        slide_xml = Slide(filename)
        tile = slide_xml.get_thumb(livel)
        if os.path.exists(xml_file):
            region_list, region_class = xml_to_region(xml_file)
            tile = region_handler(tile, region_list, region_class, level_downsamples)
        label_result = np.array(tile.convert('RGBA'))

        # "<subname>-<barcode>" with the second '-'-separated field removed.
        tagged_parts = (subname + "-" + file_name.split('.')[0]).split('-')
        file_name_new = "-".join(tagged_parts[i] for i in (0, 2, 3, 4, 5, 6))
        # os.path.join instead of string concatenation, so `save_dir` works
        # with or without a trailing separator.
        save_subfolder = os.path.join(save_dir, file_name_new)

        # Loop-invariant geometry of one patch at thumbnail scale.
        window = int(patch_size / level_downsamples)
        coverage_threshold = 0.75 * (patch_size / level_downsamples) ** 2
        max_extreme_pixels = patch_size ** 2 * 0.50

        get_cut = 0
        folder_ready = False
        for w in range(w_count):
            left = int(w * patch_size / level_downsamples)
            right = left + window - 1
            for h in range(h_count):
                bottom = int(h * patch_size / level_downsamples)
                top = bottom + window - 1

                # NOTE(review): label_result is RGBA, so every non-zero channel
                # value is counted, alpha included. If alpha is 255 throughout
                # (as expected for an opaque thumbnail) this test passes for any
                # in-bounds window - confirm whether a single-channel mask was
                # intended here.
                if np.sum(label_result[bottom:top, left:right] > 0) <= coverage_threshold:
                    continue

                subHIC = np.array(
                    slide.read_region((w * patch_size, h * patch_size), 0, (patch_size, patch_size))
                )[:, :, :3]

                # Widen before subtracting: on uint8 data `x - 107` wraps around
                # for x < 107, which made every dark channel value look "extreme".
                deviation = np.abs(subHIC.astype(np.int16) - 107)
                extreme = np.all(deviation >= 93, axis=2)
                if np.sum(extreme) > max_extreme_pixels:
                    continue

                if not folder_ready:
                    os.makedirs(save_subfolder, exist_ok=True)
                    folder_ready = True
                io.imsave(os.path.join(save_subfolder, f"{file_name_new}-{w}_{h}_.jpeg"), subHIC)
                get_cut += 1

        return get_cut
    finally:
        # Release the native handles even when a slide fails half-way.
        if slide_xml is not None:
            slide_xml.close()
        slide.close()

In [None]:
def _cut_cohort(svs_files, label_dir, level, patch_size, save_dir, subname, log_dir="./patch_logs/"):
    """
    Run ``openslide_cut_patch`` over every slide of one cohort and log the outcome.

    A failing slide is reported (stdout and log file) and skipped, so one bad
    file does not abort the whole cohort.

    :param svs_files: iterable of slide paths
    :param label_dir: directory holding the XML annotations
    :param level: pyramid level forwarded to ``openslide_cut_patch``
    :param patch_size: patch edge length forwarded to ``openslide_cut_patch``
    :param save_dir: output directory for the patches
    :param subname: cohort tag; also the prefix of the log file name
    :param log_dir: directory for the log file (created if missing)
    :return: path of the log file written
    """
    os.makedirs(log_dir, exist_ok=True)
    stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    logfile = os.path.join(log_dir, subname + "_" + stamp + ".txt")

    # The `with` block closes the file; no explicit close() is needed.
    with open(logfile, 'w') as f:
        for svs in svs_files:
            try:
                get_cut = openslide_cut_patch(svs, label_dir, level, patch_size, save_dir, subname)
                print("Finished cutting %d pictures from %s" % (get_cut, os.path.basename(svs)), file=f)
            except Exception as error:
                print(str(svs),'is not cutted')
                print('error: ' + repr(error))
                # Record the failure in the log too, so it is not lost with
                # the cell output.
                print("Failed to cut %s: %r" % (os.path.basename(svs), error), file=f)

    return logfile


if __name__ == '__main__':

    # for LUSC
    INPUT_IMAGE_DIR_scc = './Images/TCGA_upload2/LUSC'
    svs_file_scc = glob.glob(os.path.join(INPUT_IMAGE_DIR_scc, "*.svs"))
    LABEL_RESULT_DIR_scc = './Images/TCGA_upload1/LUSC/'
    save_dir_scc = './Images/TCGA_patch/patch/scc/'
    os.makedirs(save_dir_scc, exist_ok=True)

    # for LUAD
    INPUT_IMAGE_DIR_aca = './Images/TCGA_upload2/LUAD'
    svs_file_aca = glob.glob(os.path.join(INPUT_IMAGE_DIR_aca, "*.svs"))
    LABEL_RESULT_DIR_aca = './Images/TCGA_upload1/LUAD/'
    save_dir_aca = './Images/TCGA_patch/patch/aca/'
    os.makedirs(save_dir_aca, exist_ok=True)

    patch_size = 512
    livel = 2

    # Both cohorts go through the same per-slide error handling.
    logfile_scc = _cut_cohort(svs_file_scc, LABEL_RESULT_DIR_scc, livel, patch_size, save_dir_scc, "scc")
    logfile_aca = _cut_cohort(svs_file_aca, LABEL_RESULT_DIR_aca, livel, patch_size, save_dir_aca, "aca")