In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import math
from collections import Counter
from pathlib import Path
from IPython.display import clear_output
import random

data_path = '/home/szelesteya/projects/EMBED_Open_Data/'
image_root_path = '/media/szelesteya/F824D4D024D492CC/EMBED-images/'
image_training_path = image_root_path + 'training/'
image_negative_path = image_root_path + 'negative-full'
image_positive_path = image_root_path + 'positive-full'
tables_path = data_path + 'tables/'
image_croped_neg_path = image_training_path + 'negative/' 

df_neg = pd.read_csv(data_path + 'negative_empirical_png.csv')

fix_resolution = [3328,4096]
crop_size = [224, 224]

In [7]:
def get_size_of_image(image_array):
    height = image_array.shape[0]
    width = image_array.shape[1]
    full_resolution = height * width
    diameter = math.sqrt( height ** 2 + width ** 2)
    dark_area = np.sum(image_array == 0)
    return [f'{height} x {width}', [height, width], full_resolution, diameter, dark_area]

def crop_black_part(image_array):
    # Cropping every column that doesn't contain tissue
    not_dark = np.where(image_array != 0, 1, 0)
    tissue_distrib_x = np.sum(not_dark, axis=0) / not_dark.shape[0]
    tissue_distri_y = np.sum(not_dark, axis=1) / not_dark.shape[1]
    is_tissue_x = np.where(tissue_distrib_x > 0.1, 1, 0)
    is_tissue_y = np.where(tissue_distri_y > 0.1, 1, 0)

    first_tissue_x = np.where(is_tissue_x == 1)[0][0]
    last_tissue_x = np.where(is_tissue_x == 1)[-1][-1]
    first_tissue_y = np.where(is_tissue_y == 1)[0][0]
    last_tissue_y = np.where(is_tissue_y == 1)[-1][-1]

    return image_array[first_tissue_y:last_tissue_y,first_tissue_x:last_tissue_x]


def generate_crop_path(index, roi_num):
    return image_croped_pos_path + f"{index}_{roi_num}_cropped"

In [23]:
def crop_negative_array(image_array):
    tissue_array = np.array(crop_black_part(image_array))
    tissue_width = tissue_array.shape[1]
    tissue_height = tissue_array.shape[0]
    
    if crop_size[1] > tissue_width + 1:
        x_offset_1 = 0
        x_offset_2 = 0
    else:
        x_offset_1 = random.randint(0, tissue_width - crop_size[1])
        x_offset_2 = random.randint(0, tissue_width - crop_size[1])
    
    y_half_point = round((tissue_height - crop_size[0]) / 2)
    y_offset_1 = random.randint(0, y_half_point)
    y_offset_2 = random.randint(y_half_point, tissue_height - crop_size[0])
    
    crop_arrays = [tissue_array[y_offset_1:(y_offset_1 + crop_size[0]), x_offset_1:(x_offset_1 + crop_size[1])], tissue_array[y_offset_2:(y_offset_2 + crop_size[0]), x_offset_2:(x_offset_2 + crop_size[1])]]
    
    ret_arrays = []

    for crop_array in crop_arrays:
        tissue_dist = np.sum(np.where((crop_array / 255) > 0.05, 1, 0)) / (crop_size[0] * crop_size[1])
        too_bright = np.sum(np.where((crop_array / 255) > 0.9, 1, 0) / (crop_size[0] * crop_size[1]))
        print(too_bright)
        if too_bright > 0.5:
            print(f"Too much bright area on picture")  
        elif (tissue_dist < 0.5):
            print(f"Too much dark area on picture try to flip the roi horizontally")
        else:
            print(f"Image will be croped brightness {too_bright}")
            ret_arrays.append(crop_array)
        
    return ret_arrays

In [55]:
def crop_negative(idx):
    path = f"{image_croped_neg_path}{idx}"
    image = Image.open(df_neg.loc[idx,'png_path'])
    resize_image_array = np.array(image.resize(fix_resolution))
    tissue_image_array = crop_black_part(resize_image_array)
    crops = crop_negative_array(tissue_image_array)
    for i,crop in enumerate(crops):
        path_crop = f"{path}_{i}.png"
        df_neg_train.loc[len(df_neg_train),:] = df_neg.loc[idx,:]
        df_neg_train.loc[len(df_neg_train) - 1,'negative_index'] = idx
        #df_neg_train.loc[len(df_neg_train)-1,:] = df_neg.loc[idx,:][['empi_anon','acc_anon','side','asses','age_at_study','calc_find','calc_distrib','other_find','num_find','view_pos','eth_desc','study_date_anon','diag_study_date','relative_dcm_path','spot_mag','diag_date_diff','png_path']]
        df_neg_train.loc[len(df_neg_train)-1,'training_path'] = path_crop
        print(f"Saving image {path_crop}")
        im1 = Image.fromarray(crop, mode='L')
        im1.save(path_crop, mode='L')

In [14]:
crop_negative(df_neg.loc[50,:],50)

Saving image /media/szelesteya/F824D4D024D492CC/EMBED-images/training/negative/50_0.png


In [57]:
df_neg_train = pd.DataFrame(columns=df_neg.columns).rename(columns={'Unnamed: 0': 'negative_index'})
for i in range(0,250):
    print(f"Cropping image number {i} / {280}")
    crop_negative(i)

df_neg_train

Cropping image number 0 / 280
0.01865433673469388
Image will be croped brightness 0.01865433673469388
0.042709661989795915
Too much dark area on picture try to flip the roi horizontally
Saving image /media/szelesteya/F824D4D024D492CC/EMBED-images/training/negative/0_0.png
Cropping image number 1 / 280
0.0
Too much dark area on picture try to flip the roi horizontally
0.03501674107142857
Image will be croped brightness 0.03501674107142857
Saving image /media/szelesteya/F824D4D024D492CC/EMBED-images/training/negative/1_0.png
Cropping image number 2 / 280
0.07694913903061225
Image will be croped brightness 0.07694913903061225
0.09522480867346939
Image will be croped brightness 0.09522480867346939
Saving image /media/szelesteya/F824D4D024D492CC/EMBED-images/training/negative/2_0.png
Saving image /media/szelesteya/F824D4D024D492CC/EMBED-images/training/negative/2_1.png
Cropping image number 3 / 280
0.0001992984693877551
Image will be croped brightness 0.0001992984693877551
0.085917570153061

Unnamed: 0,empi_anon,acc_anon,side,age_at_study,relative_dcm_path,calc_find,calc_distrib,other_find,num_find,eth_desc,study_date_anon,png_path,negative_index,training_path
0,29011563,1572834723540517,R,82.651937,cohort_1/29011563/1.2.846.113979.3.61.1.622904...,,,,1,African American or Black,2021-02-17,/media/szelesteya/F824D4D024D492CC/EMBED-image...,0.0,/media/szelesteya/F824D4D024D492CC/EMBED-image...
1,11485524,6356263291812846,L,74.038481,cohort_1/11485524/1.2.844.113973.3.64.1.622065...,,,,1,African American or Black,2021-02-10,/media/szelesteya/F824D4D024D492CC/EMBED-image...,1.0,/media/szelesteya/F824D4D024D492CC/EMBED-image...
2,41042249,9346948292263998,R,67.763198,cohort_2/41042249/1.2.845.113978.3.60.1.621182...,,,,1,Caucasian or White,2021-01-31,/media/szelesteya/F824D4D024D492CC/EMBED-image...,2.0,/media/szelesteya/F824D4D024D492CC/EMBED-image...
3,41042249,9346948292263998,R,67.763198,cohort_2/41042249/1.2.845.113978.3.60.1.621182...,,,,1,Caucasian or White,2021-01-31,/media/szelesteya/F824D4D024D492CC/EMBED-image...,2.0,/media/szelesteya/F824D4D024D492CC/EMBED-image...
4,12927057,6114291372605113,R,68.045203,cohort_2/12927057/1.2.841.113975.3.61.1.622750...,,,,1,Caucasian or White,2021-01-24,/media/szelesteya/F824D4D024D492CC/EMBED-image...,3.0,/media/szelesteya/F824D4D024D492CC/EMBED-image...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,21750107,1405471234758682,L,54.851229,cohort_2/21750107/1.2.845.113970.3.62.1.608859...,,,,1,Caucasian or White,2019-07-12,/media/szelesteya/F824D4D024D492CC/EMBED-image...,247.0,/media/szelesteya/F824D4D024D492CC/EMBED-image...
391,64698570,6856182079652543,L,68.154719,cohort_2/64698570/1.2.843.113979.3.60.1.591744...,,,,1,Caucasian or White,2019-07-12,/media/szelesteya/F824D4D024D492CC/EMBED-image...,248.0,/media/szelesteya/F824D4D024D492CC/EMBED-image...
392,64698570,6856182079652543,L,68.154719,cohort_2/64698570/1.2.843.113979.3.60.1.591744...,,,,1,Caucasian or White,2019-07-12,/media/szelesteya/F824D4D024D492CC/EMBED-image...,248.0,/media/szelesteya/F824D4D024D492CC/EMBED-image...
393,72804936,1237281019577717,L,73.326625,cohort_2/72804936/1.2.844.113975.3.65.1.599358...,,,,1,Caucasian or White,2019-07-12,/media/szelesteya/F824D4D024D492CC/EMBED-image...,249.0,/media/szelesteya/F824D4D024D492CC/EMBED-image...


In [66]:
df_neg_train['label_0'] = 0
df_neg_train = df_neg_train.rename(columns={'training_path':'crop_path'})
df_neg_train['asses'] = 'N'

with open('neg_training.csv','w') as f:
    df_neg_train.to_csv(f)

df_neg_train

Unnamed: 0,empi_anon,acc_anon,side,age_at_study,relative_dcm_path,calc_find,calc_distrib,other_find,num_find,eth_desc,study_date_anon,png_path,negative_index,crop_path,label_0,asses
0,29011563,1572834723540517,R,82.651937,cohort_1/29011563/1.2.846.113979.3.61.1.622904...,,,,1,African American or Black,2021-02-17,/media/szelesteya/F824D4D024D492CC/EMBED-image...,0.0,/media/szelesteya/F824D4D024D492CC/EMBED-image...,0,N
1,11485524,6356263291812846,L,74.038481,cohort_1/11485524/1.2.844.113973.3.64.1.622065...,,,,1,African American or Black,2021-02-10,/media/szelesteya/F824D4D024D492CC/EMBED-image...,1.0,/media/szelesteya/F824D4D024D492CC/EMBED-image...,0,N
2,41042249,9346948292263998,R,67.763198,cohort_2/41042249/1.2.845.113978.3.60.1.621182...,,,,1,Caucasian or White,2021-01-31,/media/szelesteya/F824D4D024D492CC/EMBED-image...,2.0,/media/szelesteya/F824D4D024D492CC/EMBED-image...,0,N
3,41042249,9346948292263998,R,67.763198,cohort_2/41042249/1.2.845.113978.3.60.1.621182...,,,,1,Caucasian or White,2021-01-31,/media/szelesteya/F824D4D024D492CC/EMBED-image...,2.0,/media/szelesteya/F824D4D024D492CC/EMBED-image...,0,N
4,12927057,6114291372605113,R,68.045203,cohort_2/12927057/1.2.841.113975.3.61.1.622750...,,,,1,Caucasian or White,2021-01-24,/media/szelesteya/F824D4D024D492CC/EMBED-image...,3.0,/media/szelesteya/F824D4D024D492CC/EMBED-image...,0,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,21750107,1405471234758682,L,54.851229,cohort_2/21750107/1.2.845.113970.3.62.1.608859...,,,,1,Caucasian or White,2019-07-12,/media/szelesteya/F824D4D024D492CC/EMBED-image...,247.0,/media/szelesteya/F824D4D024D492CC/EMBED-image...,0,N
391,64698570,6856182079652543,L,68.154719,cohort_2/64698570/1.2.843.113979.3.60.1.591744...,,,,1,Caucasian or White,2019-07-12,/media/szelesteya/F824D4D024D492CC/EMBED-image...,248.0,/media/szelesteya/F824D4D024D492CC/EMBED-image...,0,N
392,64698570,6856182079652543,L,68.154719,cohort_2/64698570/1.2.843.113979.3.60.1.591744...,,,,1,Caucasian or White,2019-07-12,/media/szelesteya/F824D4D024D492CC/EMBED-image...,248.0,/media/szelesteya/F824D4D024D492CC/EMBED-image...,0,N
393,72804936,1237281019577717,L,73.326625,cohort_2/72804936/1.2.844.113975.3.65.1.599358...,,,,1,Caucasian or White,2019-07-12,/media/szelesteya/F824D4D024D492CC/EMBED-image...,249.0,/media/szelesteya/F824D4D024D492CC/EMBED-image...,0,N


In [64]:
df_neg_train

Unnamed: 0,empi_anon,acc_anon,side,age_at_study,relative_dcm_path,calc_find,calc_distrib,other_find,num_find,eth_desc,study_date_anon,png_path,negative_index,training_path,label_0
0,29011563,1572834723540517,R,82.651937,cohort_1/29011563/1.2.846.113979.3.61.1.622904...,,,,1,African American or Black,2021-02-17,/media/szelesteya/F824D4D024D492CC/EMBED-image...,0.0,/media/szelesteya/F824D4D024D492CC/EMBED-image...,0
1,11485524,6356263291812846,L,74.038481,cohort_1/11485524/1.2.844.113973.3.64.1.622065...,,,,1,African American or Black,2021-02-10,/media/szelesteya/F824D4D024D492CC/EMBED-image...,1.0,/media/szelesteya/F824D4D024D492CC/EMBED-image...,0
2,41042249,9346948292263998,R,67.763198,cohort_2/41042249/1.2.845.113978.3.60.1.621182...,,,,1,Caucasian or White,2021-01-31,/media/szelesteya/F824D4D024D492CC/EMBED-image...,2.0,/media/szelesteya/F824D4D024D492CC/EMBED-image...,0
3,41042249,9346948292263998,R,67.763198,cohort_2/41042249/1.2.845.113978.3.60.1.621182...,,,,1,Caucasian or White,2021-01-31,/media/szelesteya/F824D4D024D492CC/EMBED-image...,2.0,/media/szelesteya/F824D4D024D492CC/EMBED-image...,0
4,12927057,6114291372605113,R,68.045203,cohort_2/12927057/1.2.841.113975.3.61.1.622750...,,,,1,Caucasian or White,2021-01-24,/media/szelesteya/F824D4D024D492CC/EMBED-image...,3.0,/media/szelesteya/F824D4D024D492CC/EMBED-image...,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,21750107,1405471234758682,L,54.851229,cohort_2/21750107/1.2.845.113970.3.62.1.608859...,,,,1,Caucasian or White,2019-07-12,/media/szelesteya/F824D4D024D492CC/EMBED-image...,247.0,/media/szelesteya/F824D4D024D492CC/EMBED-image...,0
391,64698570,6856182079652543,L,68.154719,cohort_2/64698570/1.2.843.113979.3.60.1.591744...,,,,1,Caucasian or White,2019-07-12,/media/szelesteya/F824D4D024D492CC/EMBED-image...,248.0,/media/szelesteya/F824D4D024D492CC/EMBED-image...,0
392,64698570,6856182079652543,L,68.154719,cohort_2/64698570/1.2.843.113979.3.60.1.591744...,,,,1,Caucasian or White,2019-07-12,/media/szelesteya/F824D4D024D492CC/EMBED-image...,248.0,/media/szelesteya/F824D4D024D492CC/EMBED-image...,0
393,72804936,1237281019577717,L,73.326625,cohort_2/72804936/1.2.844.113975.3.65.1.599358...,,,,1,Caucasian or White,2019-07-12,/media/szelesteya/F824D4D024D492CC/EMBED-image...,249.0,/media/szelesteya/F824D4D024D492CC/EMBED-image...,0
