In [1]:
from concurrent.futures import ProcessPoolExecutor
import cv2
import glob
import numpy as np
from tqdm import tqdm
import os
from PIL import Image
import matplotlib.pyplot as plt
import re
# When True, only a random sample of 100 input paths is processed (see the run cell below).
DEBUG = False

# Folder holding the input images; every entry directly inside it is processed.
data_dir = '../input/images_gpugen/vindr_1536_16bit_2' # data folder
# Folder receiving the cropped images, written as <output_dir>/<patient_id>/<im_id>.png.
output_dir = "../input/images_gpugen/vindr_1536896_16bit_2_cutoff" # output folder
# Create the output folder up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(output_dir, exist_ok=True)

In [2]:
def crop_image(fname, threshold=5140, border=5):
    """Crop an image to the bounding box of its largest bright region and save it.

    The image is read at its native bit depth, a thin outer frame is trimmed,
    the pixels brighter than ``threshold`` are grouped into 8-connected
    components, and the image is cropped to the bounding box of the component
    with the largest area. The result is written to
    ``{output_dir}/{patient_id}/{im_id}.png``, where ``patient_id`` and
    ``im_id`` are the first two ``_``-separated fields of the file name
    (extension removed).

    Args:
        fname: Path of the input image; its base name must look like
            ``<patient_id>_<im_id>[...].<ext>``.
        threshold: Pixels strictly greater than this value count as foreground.
            The default 5140 equals 20 * 257, presumably the 8-bit level 20
            rescaled to the 16-bit range.
        border: Number of pixels trimmed from every side before thresholding.

    Returns:
        None. The cropped single-channel image is written to disk.

    Raises:
        OSError: If the image cannot be read or the output cannot be written.
        ValueError: If nothing is left after trimming the frame, or the file
            name does not contain the two expected ``_``-separated fields.
    """
    image = cv2.imread(fname, cv2.IMREAD_ANYDEPTH)  # keep the 16-bit depth
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise OSError(f"Could not read image: {fname}")

    # Only one channel is ever thresholded and saved, so work on a 2-D view
    # instead of replicating a grayscale image into three channels.
    gray = image if image.ndim == 2 else image[:, :, 0]

    # Some images have narrow exterior "frames" that complicate selection of
    # the main data. Explicit end indices keep border=0 meaning "no trimming"
    # (a plain [0:-0] slice would be empty).
    n_rows, n_cols = gray.shape[:2]
    gray = gray[border:n_rows - border, border:n_cols - border]
    if gray.size == 0:
        raise ValueError(f"Nothing left of {fname} after trimming a {border}px frame")

    # Binary mask: non-zero pixels are foreground, zero is background.
    mask = (gray > threshold).astype(np.uint8)

    # connectivity (4 or 8) and ltype (cv2.CV_32S or cv2.CV_16U) are passed by
    # keyword so they cannot be mistaken for the optional output-array
    # positions that precede them in the Python signature.
    _, _, stats, _ = cv2.connectedComponentsWithStats(
        mask, connectivity=8, ltype=cv2.CV_32S
    )

    # stats.shape == (N, 5): left, top, width, height, area for each label.
    # Label 0 is the background, so real components start at row 1.
    if stats.shape[0] > 1:
        # The largest component is taken to be the region of interest.
        largest = int(stats[1:, cv2.CC_STAT_AREA].argmax()) + 1
        left = int(stats[largest, cv2.CC_STAT_LEFT])
        top = int(stats[largest, cv2.CC_STAT_TOP])
        width = int(stats[largest, cv2.CC_STAT_WIDTH])
        height = int(stats[largest, cv2.CC_STAT_HEIGHT])
        cropped = gray[top:top + height, left:left + width]
    else:
        # No pixel exceeds the threshold: keep the frame-trimmed image rather
        # than failing on argmax of an empty array.
        cropped = gray

    # Derive the output location from the file name in a separator-agnostic way.
    stem = os.path.splitext(os.path.basename(fname))[0]
    fields = stem.split("_")
    if len(fields) < 2:
        raise ValueError(f"Expected '<patient_id>_<im_id>' in file name, got: {fname}")
    patient_id, im_id = fields[0], fields[1]

    patient_dir = os.path.join(output_dir, patient_id)
    os.makedirs(patient_dir, exist_ok=True)
    out_path = os.path.join(patient_dir, f"{im_id}.png")
    # cv2.imwrite reports failure through its return value.
    if not cv2.imwrite(out_path, np.ascontiguousarray(cropped)):
        raise OSError(f"Could not write image: {out_path}")

def fit_all_images(all_images, max_workers=4):
    """Run ``crop_image`` over every path in ``all_images`` using worker processes.

    Args:
        all_images: Sized collection of input image paths (``len()`` is used
            for the progress-bar total).
        max_workers: Number of worker processes in the pool; defaults to 4.

    Returns:
        None. Any exception raised by ``crop_image`` in a worker is re-raised
        here when its result is reached.
    """
    with ProcessPoolExecutor(max_workers) as pool:
        results = pool.map(crop_image, all_images)
        # The results themselves are not needed; iterating drives the
        # progress bar and surfaces worker exceptions.
        for _ in tqdm(results, total=len(all_images)):
            pass

In [None]:
# Collect every entry of the input folder; sorted because glob's order is
# unspecified and the DEBUG sample below should be reproducible.
all_images = sorted(glob.glob(f'{data_dir}/*'))
print(f"all_images: {len(all_images)}")

if DEBUG:
    # Sample WITHOUT replacement: the default (with replacement) yields
    # duplicate paths, which wastes work and lets two workers write the same
    # output file at once. The size is capped so small folders still work,
    # and the generator is seeded so the subset is the same on every run.
    debug_rng = np.random.default_rng(0)
    all_images = debug_rng.choice(
        all_images, size=min(100, len(all_images)), replace=False
    ).tolist()  # plain Python strings rather than a NumPy string array
fit_all_images(all_images)

In [4]:
# Sanity check: one output file per processed input.
n_inputs = len(all_images)
n_outputs = len(glob.glob(f'{output_dir}/*/*'))
assert n_inputs == n_outputs, f"Something went wrong, {n_inputs} != {n_outputs} "

# The full dataset is expected to hold 20000 images. A DEBUG run processes
# only a sample, so the absolute count is checked for full runs only.
EXPECTED_FULL_COUNT = 20000
if not DEBUG:
    assert n_inputs == EXPECTED_FULL_COUNT, f"Expected {EXPECTED_FULL_COUNT} images, found {n_inputs}"