In [15]:
import os
from tqdm.notebook import tqdm
from pathlib import Path
import sys

from src.utils import get_cropped_images, get_file_path
from PIL import Image
import pandas as pd
import numpy as np
import cv2

# Resolve the current working directory as the project root and make sure it
# is importable.
project_path = str(Path.cwd())
if sys.path.count(project_path) == 0:
    sys.path.append(project_path)
os.listdir(project_path)

# Raw and compressed dataset roots, each with its own train_thumbnails folder.
database_dir_raw, database_dir_compress = (
    project_path + suffix for suffix in ('/Database_raw', '/Database_compressed')
)
train_dir_raw, train_dir_compress = (
    base_dir + '/train_thumbnails'
    for base_dir in (database_dir_raw, database_dir_compress)
)

In [2]:
# Read the raw training table and attach, for every image_id, the path that
# get_file_path resolves inside the raw thumbnail folder.
df_dataset = pd.read_csv(f"{database_dir_raw}/train.csv").assign(
    file_path=lambda frame: frame['image_id'].apply(
        lambda image_id: get_file_path(train_dir_raw, image_id)
    )
)

# Keep only the rows whose is_tma flag is not True.
not_tma = df_dataset['is_tma'].ne(True)
df_new = df_dataset.loc[not_tma]
df_new.head()

Unnamed: 0,image_id,label,image_width,image_height,is_tma,file_path
0,4,HGSC,23785,20008,False,/Users/hcc/Python_venv/Histographic_Image_MAE/...
1,66,LGSC,48871,48195,False,/Users/hcc/Python_venv/Histographic_Image_MAE/...
3,281,LGSC,42309,15545,False,/Users/hcc/Python_venv/Histographic_Image_MAE/...
4,286,EC,37204,30020,False,/Users/hcc/Python_venv/Histographic_Image_MAE/...
5,431,HGSC,39991,40943,False,/Users/hcc/Python_venv/Histographic_Image_MAE/...


In [18]:
def img_crop_compress(file_path, image_id, classname, target_resolution, save_folder, th_area = 1000):
    """Crop one image into full-height pieces, resize them, save them and describe them.

    Cropping strategy (``as_ratio`` = width / height):

    * ``as_ratio < 1.5``: the whole image is used as a single crop.
    * ``as_ratio >= 1.5``: non-zero pixels are grouped into connected components.
      - If the number of labels (background included) is at least ``as_ratio``,
        one ``height x height`` window is centred horizontally on every
        component whose area is at least ``th_area`` and clamped to the image.
        NOTE(review): if every component is smaller than ``th_area`` no crop is
        produced and the returned frame is empty - confirm this is intended.
      - Otherwise the image is tiled left to right into ``int(as_ratio)``
        ``height x height`` windows.

    Every crop is resized to ``target_resolution`` with LANCZOS resampling and
    written to ``save_folder`` as ``<source stem>_<crop index>.png``.

    Parameters
    ----------
    file_path : str
        Path of the source image.
    image_id : Any
        Identifier copied into the ``image_id`` column of the result.
    classname : str
        One of ``'HGSC', 'EC', 'CC', 'LGSC', 'MC'``.
    target_resolution : tuple
        ``(width, height)`` passed to ``Image.resize``.
    save_folder : str
        Output directory; created if it does not exist.
    th_area : int, optional
        Minimum component area (in pixels) for a component-centred crop.

    Returns
    -------
    pandas.DataFrame
        One row per saved crop with columns ``image_id``, ``crop_id``,
        ``file_path``, ``class``, ``label``, ``sx_in_raw``, ``ex_in_raw``,
        ``sy_in_raw``, ``ey_in_raw``. The four coordinates are inclusive pixel
        indices in the source image.

    Raises
    ------
    ValueError
        If ``classname`` is not one of the known classes.
    """
    class2label = ['HGSC', 'EC', 'CC', 'LGSC', 'MC']
    # Resolve the label first so an unknown class fails before any file is written.
    label_index = class2label.index(classname)

    os.makedirs(save_folder, exist_ok=True)
    image_name, _ = os.path.splitext(os.path.basename(file_path))

    # Inclusive crop bounds, one entry per crop.
    x_starts, x_ends, y_starts, y_ends = [], [], [], []
    file_path_list = []

    # The context manager releases the file handle even if cropping/saving fails.
    with Image.open(file_path) as image:
        width, height = image.size
        as_ratio = width / height

        if as_ratio >= 1.5:
            crop_size = height
            # Foreground = any channel non-zero; single-channel images have no
            # channel axis to reduce over.
            foreground = np.array(image) > 0
            if foreground.ndim == 3:
                foreground = foreground.any(axis=-1)
            mask = foreground.astype(np.uint8)
            # One pass yields area and bounding box of every component, instead
            # of rescanning the whole label image once per component.
            retval, _labels, stats, _centroids = cv2.connectedComponentsWithStats(mask)

            # retval counts the background label (0) as well.
            if retval >= as_ratio:
                for label in range(1, retval):
                    if stats[label, cv2.CC_STAT_AREA] < th_area:
                        continue
                    left = int(stats[label, cv2.CC_STAT_LEFT])
                    right = left + int(stats[label, cv2.CC_STAT_WIDTH]) - 1
                    cx = (left + right) // 2
                    # Centre the window on the component, then clamp it so it
                    # stays inside the image while keeping its full width.
                    sx = max(0, cx - crop_size // 2)
                    ex = min(sx + crop_size - 1, width - 1)
                    sx = ex - crop_size + 1
                    x_starts.append(sx)
                    x_ends.append(ex)
                    y_starts.append(0)
                    y_ends.append(height - 1)
            else:
                for i in range(int(as_ratio)):
                    x_starts.append(i * crop_size)
                    x_ends.append((i + 1) * crop_size - 1)
                    y_starts.append(0)
                    y_ends.append(crop_size - 1)
        else:
            # Not cropped: the entire image is the single crop.
            x_starts, x_ends, y_starts, y_ends = [0], [width - 1], [0], [height - 1]

        # Save crops while the source image is still open.
        for idx, (sx, sy, ex, ey) in enumerate(zip(x_starts, y_starts, x_ends, y_ends)):
            output_path = os.path.join(save_folder, f"{image_name}_{idx}.png")
            # PIL's crop box excludes its right/lower edge, whereas ex/ey are
            # inclusive, so add 1 to keep the last column and row.
            img_crop = image.crop((sx, sy, ex + 1, ey + 1))
            img_crop.resize(target_resolution, Image.LANCZOS).save(output_path)
            file_path_list.append(output_path)

    n_crops = len(x_starts)
    df_crop = pd.DataFrame({
        "image_id": [image_id] * n_crops,
        "crop_id": list(range(n_crops)),
        "file_path": file_path_list,
        "class": [classname] * n_crops,
        "label": [label_index] * n_crops,
        "sx_in_raw": x_starts,
        "ex_in_raw": x_ends,
        "sy_in_raw": y_starts,
        "ey_in_raw": y_ends,
    })

    return df_crop

In [19]:
# Crop, resize and save every image listed in df_new, collecting one metadata
# frame per source image.
target_resolution = (512, 512)
os.makedirs(train_dir_compress, exist_ok=True)

# zip() has no length, so give tqdm the total explicitly to get a real progress bar.
dfs = [
    img_crop_compress(file_path, image_id, classname, target_resolution,
                      save_folder = train_dir_compress)
    for (file_path, image_id, classname) in tqdm(
        zip(df_new["file_path"], df_new["image_id"], df_new["label"]),
        total=len(df_new),
    )
]

df_crop = pd.concat(dfs).reset_index(drop=True)
# Join with a separator so the CSV is written inside Database_compressed/ rather
# than to a sibling file named "Database_compressedtrain_crop_resize.csv".
crop_resize_csv_dir = os.path.join(database_dir_compress, 'train_crop_resize.csv')
df_crop.to_csv(crop_resize_csv_dir, index=False)
df_crop.head()

0it [00:00, ?it/s]

Unnamed: 0,image_id,crop_id,file_path,class,label,sx_in_raw,ex_in_raw,sy_in_raw,ey_in_raw
0,4,0,/Users/hcc/Python_venv/Histographic_Image_MAE/...,HGSC,0,0,2999,0,2522
1,66,0,/Users/hcc/Python_venv/Histographic_Image_MAE/...,LGSC,3,0,2999,0,2957
2,281,0,/Users/hcc/Python_venv/Histographic_Image_MAE/...,LGSC,3,0,1101,0,1101
3,281,1,/Users/hcc/Python_venv/Histographic_Image_MAE/...,LGSC,3,1898,2999,0,1101
4,286,0,/Users/hcc/Python_venv/Histographic_Image_MAE/...,EC,1,0,2999,0,2419
