In [281]:
import numpy as np
import pandas as pd
from PIL import Image
import os
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import random
import sklearn as sk
import shutil

inbreast_path = '/mnt/d/datasets/INBREAST/inbreast'
croped_path = '/mnt/d/datasets/INBREAST/crops'
split_path = '/mnt/d/datasets/INBREAST/split'
file_dir = '/mnt/c/Users/Szelestey/Projects/laboratory-mammography'

crop_size = [224,224]

In [8]:
image_names = []
anon = []
side = []
view_pos = []
acr = []
birads = []

for image_name in os.listdir(f"{inbreast_path}{os.sep}images"):
    image_names.append(image_name)

    meta = image_name[0:-4].split('_')
    
    anon.append(meta[0])
    side.append(meta[1])
    view_pos.append(meta[2])
    acr.append(meta[4])
    birads.append(meta[6])

dict_imgs = {
    'anon': anon,
    'side': side,
    'view_pos': view_pos,
    'acr': acr,
    'birads': birads,
    'file': image_names
}

df_images = pd.DataFrame(dict_imgs)

df_images

Unnamed: 0,anon,side,view_pos,acr,birads,file
0,20586908,Right,CC,2,2,20586908_Right_CC_acr_2_birads_2.png
1,20586934,Left,CC,2,5,20586934_Left_CC_acr_2_birads_5.png
2,20586960,Right,MLO,2,2,20586960_Right_MLO_acr_2_birads_2.png
3,20586986,Left,MLO,2,5,20586986_Left_MLO_acr_2_birads_5.png
4,20587054,Right,CC,2,4c,20587054_Right_CC_acr_2_birads_4c.png
...,...,...,...,...,...,...
404,53587599,Left,CC,2,2,53587599_Left_CC_acr_2_birads_2.png
405,53587663,Right,CC,1,2,53587663_Right_CC_acr_1_birads_2.png
406,53587690,Left,MLO,1,1,53587690_Left_MLO_acr_1_birads_1.png
407,53587717,Right,MLO,1,2,53587717_Right_MLO_acr_1_birads_2.png


In [10]:
mask_names = []
anons = []
finding_types = []
coords_str = []
files = []

for image_name in os.listdir(f"{inbreast_path}{os.sep}masks"):
    mask_names.append(image_name)
    meta = image_name[0:-4].split('_')
    anons.append(meta[0])
    if '.png' in image_name:
        finding_types.append(meta[1])
        coords_str.append(str([]))
    elif '.csv' in image_name:
        coords = np.genfromtxt(f"{inbreast_path}{os.sep}masks{os.sep}{image_name}",delimiter=',')
        finding_types.append('-'.join(meta[1:3]))
        coords_str.append(str(coords))
        
dict_findings = {
    'anon': anons,
    'finding_type': finding_types,
    'coord': coords_str,
    'mask_file': mask_names
}

df_findings = pd.DataFrame(dict_findings)
df_findings


Unnamed: 0,anon,finding_type,coord,mask_file
0,20586908,calc,[],20586908_calc_0.png
1,20586908,calc,[],20586908_calc_1.png
2,20586908,calc,[],20586908_calc_10.png
3,20586908,calc,[],20586908_calc_2.png
4,20586908,calc,[],20586908_calc_4.png
...,...,...,...,...
3240,53587717,calc,[],53587717_calc_2.png
3241,53587717,calc,[],53587717_calc_3.png
3242,53587717,calc,[],53587717_calc_4.png
3243,53587717,calc,[],53587717_calc_5.png


In [44]:
df_images_and_findings = df_findings.merge(df_images,how='outer',on=['anon'])

df_images_and_findings

Unnamed: 0,anon,finding_type,coord,mask_file,side,view_pos,acr,birads,file
0,20586908,calc,[],20586908_calc_0.png,Right,CC,2,2,20586908_Right_CC_acr_2_birads_2.png
1,20586908,calc,[],20586908_calc_1.png,Right,CC,2,2,20586908_Right_CC_acr_2_birads_2.png
2,20586908,calc,[],20586908_calc_10.png,Right,CC,2,2,20586908_Right_CC_acr_2_birads_2.png
3,20586908,calc,[],20586908_calc_2.png,Right,CC,2,2,20586908_Right_CC_acr_2_birads_2.png
4,20586908,calc,[],20586908_calc_4.png,Right,CC,2,2,20586908_Right_CC_acr_2_birads_2.png
...,...,...,...,...,...,...,...,...,...
3306,53587717,calc,[],53587717_calc_3.png,Right,MLO,1,2,53587717_Right_MLO_acr_1_birads_2.png
3307,53587717,calc,[],53587717_calc_4.png,Right,MLO,1,2,53587717_Right_MLO_acr_1_birads_2.png
3308,53587717,calc,[],53587717_calc_5.png,Right,MLO,1,2,53587717_Right_MLO_acr_1_birads_2.png
3309,53587717,calc-grains,[[0.63565132 0.60847356]\n [0.35700294 0.82902...,53587717_calc_grains.csv,Right,MLO,1,2,53587717_Right_MLO_acr_1_birads_2.png


In [76]:
df_neg = df_images_and_findings[~df_images_and_findings['finding_type'].notna()]

df_neg

Unnamed: 0,anon,finding_type,coord,mask_file,side,view_pos,acr,birads,file
173,20588138,,,,Right,MLO,2,1,20588138_Right_MLO_acr_2_birads_1.png
174,20588164,,,,Right,CC,2,1,20588164_Right_CC_acr_2_birads_1.png
303,22580218,,,,Left,CC,2,1,22580218_Left_CC_acr_2_birads_1.png
305,22580270,,,,Left,MLO,2,1,22580270_Left_MLO_acr_2_birads_1.png
486,22613848,,,,Left,MLO,2,1,22613848_Left_MLO_acr_2_birads_1.png
...,...,...,...,...,...,...,...,...,...
3220,53586388,,,,Right,MLO,1,1,53586388_Right_MLO_acr_1_birads_1.png
3221,53586415,,,,Left,CC,1,1,53586415_Left_CC_acr_1_birads_1.png
3222,53586442,,,,Right,CC,1,1,53586442_Right_CC_acr_1_birads_1.png
3302,53587690,,,,Left,MLO,1,1,53587690_Left_MLO_acr_1_birads_1.png


In [215]:
df_pos = df_images_and_findings[df_images_and_findings['finding_type'].isin(['calc','calc-grains','cluster'])]

df_pos[df_pos['birads'] == '4a']

Unnamed: 0,anon,finding_type,coord,mask_file,side,view_pos,acr,birads,file
487,22613918,calc-grains,[[0.49699519 0.77773437]\n [0.40144231 0.74804...,22613918_calc_grains.csv,Right,CC,2,4a,22613918_Right_CC_acr_2_birads_4a.png
488,22613918,cluster,[],22613918_cluster_0.png,Right,CC,2,4a,22613918_Right_CC_acr_2_birads_4a.png
490,22613970,calc-grains,[[0.48317308 0.896875 ]\n [0.62860577 0.78789...,22613970_calc_grains.csv,Right,MLO,2,4a,22613970_Right_MLO_acr_2_birads_4a.png
491,22613970,cluster,[],22613970_cluster_0.png,Right,MLO,2,4a,22613970_Right_MLO_acr_2_birads_4a.png
527,22614522,calc-grains,[[0.57842548 0.05117187]\n [0.63942308 0.01835...,22614522_calc_grains.csv,Left,CC,3,4a,22614522_Left_CC_acr_3_birads_4a.png
534,22614568,calc-grains,[[0.5703125 0.39023438]\n [0.50991587 0.33437...,22614568_calc_grains.csv,Left,MLO,3,4a,22614568_Left_MLO_acr_3_birads_4a.png
634,22670809,calc,[],22670809_calc_1.png,Right,CC,1,4a,22670809_Right_CC_acr_1_birads_4a.png
635,22670809,calc,[],22670809_calc_2.png,Right,CC,1,4a,22670809_Right_CC_acr_1_birads_4a.png
636,22670809,calc,[],22670809_calc_3.png,Right,CC,1,4a,22670809_Right_CC_acr_1_birads_4a.png
637,22670809,calc-grains,[[0.60945152 0.65715144]\n [0.55754163 0.70132...,22670809_calc_grains.csv,Right,CC,1,4a,22670809_Right_CC_acr_1_birads_4a.png


In [187]:
def scale_to_pixels(relative_coords, resolution):
    ret = np.zeros([relative_coords.shape[0]])

    ret[0] = int(relative_coords[0] * resolution[0])
    ret[1] = int(relative_coords[1] * resolution[1])

    return ret

for i,calc_finding in df_pos.iterrows():
    image = Image.open(f"{inbreast_path}{os.sep}images{os.sep}{calc_finding['file']}")
    image_array = np.array(image)

    if calc_finding['finding_type'] in ['calc','cluster']:
        mask = Image.open(f"{inbreast_path}{os.sep}masks{os.sep}{calc_finding['mask_file']}")
        mask_array = np.array(mask)

        masked_columns = np.nonzero(np.sum(mask_array, axis=0))
        x_min = masked_columns[0][0]
        x_max = masked_columns[0][-1]

        masked_rows = np.nonzero(np.sum(mask_array, axis=1))
        y_min = masked_rows[0][0]
        y_max = masked_rows[0][-1]

        mask_width = x_max - x_min
        mask_height = y_max - y_min

        # Choose random offset so the calcifications can be placed anywhere on the image not just the middle
        x_offset = random.randint(-crop_size[0] + mask_width, crop_size[0] + mask_width)
        y_offset = random.randint(-crop_size[1] + mask_height, crop_size[1] + mask_height)

        x_crop = x_min + x_offset
        y_crop = y_min + y_offset
        
        image_width = image_array.shape[1]
        image_height = image_array.shape[0]
        
        if x_crop + crop_size[1] > image_width:
            x_crop = image_width - crop_size[1] - 1
        
        if y_crop + crop_size[0] > image_height:
            y_crop = image_height - crop_size[0] - 1
            
        if x_crop < 0:
            x_crop = 0
        
        if y_crop < 0:
            y_crop = 0
        
        # print(f"{image_height} *  {image_width}")
        # print(f"{y_crop}:{y_crop + crop_size[0]},{x_crop}:{x_crop + crop_size[1]}")
        crop_array = image_array[y_crop:y_crop + crop_size[0],x_crop:x_crop + crop_size[1]]
        # print(f"{np.sum(crop_array > 0)} > {int(crop_size[1] * crop_size[0] * 0.25)}")
        if np.sum(crop_array > 0) > int(crop_size[1] * crop_size[0] * 0.25):
                    crop = Image.fromarray(crop_array)
                    crop.save(f"{croped_path}/positive/{calc_finding['anon']}_{calc_finding['finding_type']}_{i}_{j}.png", mode='L')
    
    elif calc_finding['finding_type'] == 'calc-grains':
        grains_relative = np.genfromtxt(f"{inbreast_path}{os.sep}masks{os.sep}{calc_finding['mask_file']}",delimiter=',')
        
        size = image_array.shape
        
        # Change relative values to pixels
        if len(grains_relative.shape) > 1:
            grains_scaled = np.apply_along_axis(lambda array: scale_to_pixels(array, size), 1, grains_relative)
            grains_sorted = np.sort(grains_scaled, axis=0)
    
            crops = int(len(grains_sorted) / 6)
            chosen_rects = np.zeros([crops, 2])
            chosen_grains_indexes = random.sample(range(len(grains_sorted)), crops)
            offsets = np.zeros([crops, 2])
            
            for j in range(crops):
                chosen_grain = grains_sorted[chosen_grains_indexes[j]]            
                
                x_offset = random.randint(-crop_size[0] + 1, 0) + chosen_grain[1]
                y_offset = random.randint(-crop_size[1] + 1, 0) + chosen_grain[0]
                
                image_width = image_array.shape[1]
                image_height = image_array.shape[0]
                
                if x_offset + crop_size[1] > image_width:
                    x_offset = image_width - crop_size[1] - 1
                
                if y_offset + crop_size[0] > image_height:
                    y_offset = image_height - crop_size[0] - 1
                    
                if x_offset< 0:
                    x_offset = 0
                
                if y_offset < 0:
                    y_offset = 0
                
                offsets[j] = (y_offset, x_offset)
                
            for j, offset in enumerate(offsets):
                crop_array = image_array[int(offset[0]):int(crop_size[0] + offset[0]),int(offset[1]):int(crop_size[1] + offset[1])]
                if np.sum(crop_array > 0) > crop_size[1] * crop_size[0] * 25:
                    crop = Image.fromarray(crop_array)
                    crop.save(f"{croped_path}/positive/{calc_finding['anon']}_calc-grain_{i}_{j}.png", mode='L')


In [190]:
def crop_black(image_array):
    not_dark = np.where(image_array != 0, 1, 0)
    tissue_distrib_x = np.sum(not_dark, axis=0) / not_dark.shape[0]
    tissue_distri_y = np.sum(not_dark, axis=1) / not_dark.shape[1]
    is_tissue_x = np.where(tissue_distrib_x > 0.1, 1, 0)
    is_tissue_y = np.where(tissue_distri_y > 0.1, 1, 0)

    first_tissue_x = np.where(is_tissue_x == 1)[0][0]
    last_tissue_x = np.where(is_tissue_x == 1)[-1][-1]
    first_tissue_y = np.where(is_tissue_y == 1)[0][0]
    last_tissue_y = np.where(is_tissue_y == 1)[-1][-1]

    return image_array[first_tissue_y:last_tissue_y,first_tissue_x:last_tissue_x]

In [261]:
for i,neg in df_neg.iterrows(): # .loc[0:5,:]
    image = Image.open(f"{inbreast_path}/images/{neg['file']}")
    image_array = np.array(image)
    tissue = crop_black(image_array)
    
    tissue_width = tissue.shape[1]
    tissue_height = tissue.shape[0]
    
    width_crops = int(tissue_width / (crop_size[1] * 1.25))
    height_crops = int(tissue_height / (crop_size[0] * 1.25))
    
    for i in range(width_crops):
        for j in range(height_crops):
            x_offset = random.randint(0, int(crop_size[1] * 0.25))
            y_offset = random.randint(0, int(crop_size[0] * 0.25))
            
            x_crop = x_offset + int(i * (tissue_width / width_crops))
            y_crop = y_offset + int(j * (tissue_height / height_crops))
            
            tissue_array = tissue[y_crop:y_crop + crop_size[0],x_crop:x_crop + crop_size[1]]
            if np.sum(tissue_array > 0) > int(crop_size[0] * crop_size[1] * 0.25):
                path = f"{croped_path}/negative/{neg['anon']}_{i * height_crops + j}.png"
                
                im = Image.fromarray(tissue_array)
                im.save(path)

In [270]:
neg_imgs = []

for neg_img in os.listdir(f"{croped_path}/negative"):
    neg_imgs.append(neg_img)
    
pos_imgs = []

for pos_img in os.listdir(f"{croped_path}/positive"):
    pos_imgs.append(pos_img)
    
neg_imgs = sk.utils.shuffle(neg_imgs, random_state=5)
pos_imgs = sk.utils.shuffle(pos_imgs, random_state=5)

neg_train, neg_validation, neg_test = np.split(neg_imgs, [round(0.8 * len(neg_imgs)), round(0.9 * len(neg_imgs))])
pos_train, pos_validation, pos_test = np.split(pos_imgs, [round(0.8 * len(pos_imgs)), round(0.9 * len(pos_imgs))])

In [273]:
print(pos_train[0])

50993868_calc_1639_10.png


In [282]:
for path in pos_train:
    src_path = f"{croped_path}/positive/{path}"
    dst_path = f"{split_path}/train/positive/{path}"
    try:
        shutil.copyfile(src_path, dst_path)
    except FileNotFoundError:
        shutil.copyfile(src_path, dst_path)

In [283]:
for path in neg_train:
    src_path = f"{croped_path}/negative/{path}"
    dst_path = f"{split_path}/train/negative/{path}"
    try:
        shutil.copyfile(src_path, dst_path)
    except FileNotFoundError:
        shutil.copyfile(src_path, dst_path)
        
for path in pos_validation:
    src_path = f"{croped_path}/positive/{path}"
    dst_path = f"{split_path}/validation/positive/{path}"
    try:
        shutil.copyfile(src_path, dst_path)
    except FileNotFoundError:
        shutil.copyfile(src_path, dst_path)
        
for path in neg_validation:
    src_path = f"{croped_path}/negative/{path}"
    dst_path = f"{split_path}/validation/negative/{path}"
    try:
        shutil.copyfile(src_path, dst_path)
    except FileNotFoundError:
        shutil.copyfile(src_path, dst_path)
        
for path in pos_test:
    src_path = f"{croped_path}/positive/{path}"
    dst_path = f"{split_path}/test/positive/{path}"
    try:
        shutil.copyfile(src_path, dst_path)
    except FileNotFoundError:
        shutil.copyfile(src_path, dst_path)
        
for path in neg_test:
    src_path = f"{croped_path}/negative/{path}"
    dst_path = f"{split_path}/test/negative/{path}"
    try:
        shutil.copyfile(src_path, dst_path)
    except FileNotFoundError:
        shutil.copyfile(src_path, dst_path)