In [None]:
# Mount Google Drive at /content/drive (Colab-only helper); the dataset and
# output paths used further down all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install SimpleITK==2.1.1
import SimpleITK as sitk
import numpy as np
import pandas as pd
import cv2
from tqdm.notebook import tqdm
import copy
import os
import glob
import shutil
import random
from sklearn.utils import shuffle
import matplotlib.pyplot as plt

In [None]:
def select_largest_obj(img_bin, lab_val=255, fill_holes=False, 
                       smooth_boundary=False, kernel_size=15):
    '''Select the largest object from a binary image and optionally
    fill holes inside it and smooth its boundary.
    Args:
        img_bin(2D array): 2D numpy array of binary image.
        lab_val([int]): integer value used for the label of the largest 
                        object. Default is 255.
        fill_holes([boolean]): whether fill the holes inside the largest 
                               object or not. Default is false.
        smooth_boundary([boolean]): whether smooth the boundary of the 
                                    largest object using morphological 
                                    opening or not. Default is false.
        kernel_size([int]): the size of the kernel used for morphological 
                            operation.
    Returns:
        tuple: (x, y, w, h, largest_mask) where x, y, w, h is the bounding
               rectangle of the biggest external contour of the final mask
               and largest_mask is a uint8 array of img_bin's shape holding
               lab_val on the selected object and 0 elsewhere.
    Raises:
        ValueError: if img_bin holds no foreground object, or if the
                    morphological opening removes the object completely.
    '''
    n_labels, img_labeled, lab_stats, _ = cv2.connectedComponentsWithStats(
        img_bin, connectivity=8, ltype=cv2.CV_32S)
    # Label 0 is the background, so at least two labels are needed.
    if n_labels < 2:
        raise ValueError('img_bin contains no foreground object.')
    largest_obj_lab = np.argmax(lab_stats[1:, cv2.CC_STAT_AREA]) + 1
    largest_mask = np.zeros(img_bin.shape, dtype=np.uint8)
    largest_mask[img_labeled == largest_obj_lab] = lab_val

    if fill_holes:
        bkg_rows, bkg_cols = np.where(img_labeled == 0)
        # Without any background pixel there is nothing to flood from and
        # therefore no hole to fill.
        if bkg_rows.size > 0:
            # OpenCV points are (x, y), i.e. (column, row); np.where yields
            # (rows, columns), so the order has to be swapped.
            bkg_seed = (int(bkg_cols[0]), int(bkg_rows[0]))
            img_floodfill = largest_mask.copy()
            h_, w_ = largest_mask.shape
            mask_ = np.zeros((h_ + 2, w_ + 2), dtype=np.uint8)
            cv2.floodFill(img_floodfill, mask_, seedPoint=bkg_seed, newVal=lab_val)
            # After the fill, the object and the background reachable from
            # the seed both carry lab_val; whatever is still 0 is treated as
            # a hole. Writing lab_val directly keeps this correct for any
            # lab_val, not only 255.
            largest_mask[img_floodfill == 0] = lab_val

    if smooth_boundary:
        kernel_ = np.ones((kernel_size, kernel_size), dtype=np.uint8)
        largest_mask = cv2.morphologyEx(largest_mask, cv2.MORPH_OPEN, kernel_)

    # findContours returns 2 or 3 values depending on the OpenCV major
    # version; the contour list is always the second-to-last item.
    cnts = cv2.findContours(largest_mask.astype(np.uint8), cv2.RETR_EXTERNAL,
                            cv2.CHAIN_APPROX_SIMPLE)[-2]
    if len(cnts) == 0:
        raise ValueError('No contour left in the mask; the morphological '
                         'opening removed the whole object.')
    cnt = max(cnts, key=cv2.contourArea)
    x, y, w, h = cv2.boundingRect(cnt)
    return x, y, w, h, largest_mask

In [None]:
def cropp_image(img, threshold, truncate = False):
    '''Crop an image to the bounding box of its largest bright object.
    Args:
        img(2D array): grayscale image passed to cv2.threshold.
        threshold([number]): pixels strictly above this value count as
                             foreground (cv2.THRESH_BINARY).
        truncate([boolean]): when true, intensities are clipped to the 5th
                             and 99th percentile of the pixels inside the
                             object mask and everything outside the mask is
                             set to 0. Default is false.
    Returns:
        2D uint8 array: the cropped region rescaled to the 0..255 range.
        A region without any intensity variation comes back as all zeros.
    '''
    _, binary_img = cv2.threshold(img, threshold, 
                                    maxval=255, type=cv2.THRESH_BINARY)
    x, y, w, h, mask_img = select_largest_obj(binary_img, lab_val=255, 
                                       fill_holes=False, 
                                       smooth_boundary=True, kernel_size=2)
    croped_img, br_mask = img[y:y+h, x:x+w], mask_img[y:y+h, x:x+w]

    if truncate:
        # Select the masked pixels once and take both percentiles from them.
        inside = croped_img[br_mask != 0]
        Pmin, Pmax = np.percentile(inside, [5, 99])
        span = Pmax - Pmin
        if span > 0:
            normalized = (np.clip(croped_img, Pmin, Pmax) - Pmin) / span
        else:
            # Flat percentile window: dividing would produce NaN/inf.
            normalized = np.zeros(croped_img.shape, dtype=np.float64)
        normalized[br_mask == 0] = 0
        filtered_img = normalized
    else:
        filtered_img = croped_img

    lowest, highest = filtered_img.min(), filtered_img.max()
    if highest == lowest:
        # Constant image: avoid 0/0 and return a well-defined result.
        return np.zeros(filtered_img.shape, dtype=np.uint8)
    return ((filtered_img - lowest) / (highest - lowest) * 255).astype(np.uint8)

In [None]:
def synthesize(img):
    '''Stack an image with two CLAHE-enhanced versions of itself.
    Args:
        img(2D array): grayscale image; it is converted to uint8 before use.
    Returns:
        3D uint8 array (H, W, 3): channel 0 is the uint8 input, channel 1 is
        CLAHE with clipLimit=2 on a 5x5 tile grid, channel 2 is CLAHE with
        clipLimit=3 on OpenCV's default tile grid. Both CLAHE channels are
        min-max stretched to 0..255.
    '''
    def _stretch(channel):
        # Min-max stretch to 0..255; a flat channel would divide by zero,
        # so it is returned as all zeros instead.
        lowest, highest = channel.min(), channel.max()
        if highest == lowest:
            return np.zeros(channel.shape, dtype=np.uint8)
        return ((channel - lowest) / (highest - lowest) * 255).astype(np.uint8)

    # Convert once and reuse: cv2.merge needs all channels to share one
    # depth, so the same uint8 array feeds CLAHE and the merge.
    base = np.array(img, dtype=np.uint8)

    cl1 = _stretch(cv2.createCLAHE(clipLimit=2, tileGridSize=(5,5)).apply(base))
    cl2 = _stretch(cv2.createCLAHE(clipLimit=3).apply(base))

    synthetized = cv2.merge((base, cl1, cl2))
    return synthetized.astype(np.uint8)

In [None]:
import numpy as np
from scipy.ndimage import zoom


def clipped_zoom(img, zoom_factor, **kwargs):
    '''Zoom an image about its centre while keeping its height and width.
    Args:
        img(array): image of shape (H, W) or (H, W, C); only the first two
                    axes are zoomed.
        zoom_factor([float]): > 1 zooms in (a central window of size
                              H/zoom_factor x W/zoom_factor is enlarged),
                              < 1 zooms out (the shrunken image is centred
                              on a zero background).
        **kwargs: forwarded to scipy.ndimage.zoom (e.g. order, mode).
    Returns:
        array with the same height and width as img.
    Raises:
        ValueError: if zoom_factor is not positive.
    '''
    if zoom_factor <= 0:
        raise ValueError('zoom_factor must be positive.')

    h, w = img.shape[:2]
    zoom_tuple = (zoom_factor,) * 2 + (1,) * (img.ndim - 2)

    if zoom_factor >= 1:
        # Both axes must shrink by the zoom factor; cropping only the height
        # would return an image that is zoom_factor times wider than the
        # input.
        zh = max(int(np.round(h / zoom_factor)), 1)
        zw = max(int(np.round(w / zoom_factor)), 1)
        top = (h - zh) // 2
        left = (w - zw) // 2
        zoomed = zoom(img[top:top+zh, left:left+zw], zoom_tuple, **kwargs)
    else:
        zoomed = zoom(img, zoom_tuple, **kwargs)

    # Rounding can leave the result a pixel off (and zooming out leaves it
    # smaller), so centre-crop / zero-pad to exactly (h, w).
    out = np.zeros((h, w) + zoomed.shape[2:], dtype=zoomed.dtype)
    got_h, got_w = zoomed.shape[:2]
    copy_h, copy_w = min(h, got_h), min(w, got_w)
    src_top, src_left = (got_h - copy_h) // 2, (got_w - copy_w) // 2
    dst_top, dst_left = (h - copy_h) // 2, (w - copy_w) // 2
    out[dst_top:dst_top+copy_h, dst_left:dst_left+copy_w] = \
        zoomed[src_top:src_top+copy_h, src_left:src_left+copy_w]
    return out

In [None]:
# Base folder of the raw datasets on the mounted Drive. The cells below
# prepend it to the metadata CSV file names and to the CC/MLO paths read
# from those CSVs.
root_path = '/content/drive/MyDrive/Breast Cancer Datasets/'

# CMMD

In [None]:
def read_cmmd_dicom(path):
    '''Load one CMMD DICOM file and return its preprocessed breast crop.
    The first slice of the file is min-max scaled to 0..255 (uint8) and then
    handed to cropp_image with threshold 5 and truncation switched on.
    Args:
        path([str]): location of the DICOM file.
    Returns:
        whatever cropp_image returns for the scaled slice.
    '''
    pixels = sitk.GetArrayFromImage(sitk.ReadImage(path))[0]
    lowest, highest = pixels.min(), pixels.max()
    scaled = ((pixels - lowest) / (highest - lowest)) * 255
    return cropp_image(scaled.astype(np.uint8), 5, True)

In [None]:
# Load the CMMD metadata table and display it. The export loop below reads
# the PatientID, Side, CC, MLO, Pathology, Type, subtype and Age columns.
clinical = pd.read_csv(root_path+'cmmd-meta-data.csv')
clinical

In [None]:
# Output folder for the preprocessed CMMD JPEGs. exist_ok keeps the cell
# re-runnable instead of raising FileExistsError on the second execution.
os.makedirs('CMMD', exist_ok=True)

In [None]:
# Preprocess every CMMD row (one patient side with a CC and an MLO view),
# write both views as JPEG under CMMD/<PatientID>/<Side>/ and collect the
# matching metadata. CMMD has no BI-RADS / density columns, so those stay None.
metadict = {
    'PatientID': [],
    'Side': [],

    'CC': [],
    'MLO': [],

    'Pathology': [],
    'Type': [],
    'BI-RADS': [],
    'Density': [],
    'Subtype': [],
    'Age': []
}

# Pull the needed columns out once instead of re-slicing the DataFrame on
# every iteration.
cmmd_columns = ['PatientID', 'Side', 'CC', 'MLO', 'Pathology', 'Type', 'subtype', 'Age']
cmmd_rows = zip(*(clinical[col].values for col in cmmd_columns))
exported = set()  # (patient, side) pairs already written in this run

for p_id, side, cc_rel, mlo_rel, pathology, typee, subtype, age in tqdm(cmmd_rows, total=len(clinical)):
    # A repeated patient/side would silently overwrite the earlier JPEGs,
    # so reject it explicitly.
    if (p_id, side) in exported:
        raise ValueError('Duplicate row for patient {} side {}'.format(p_id, side))
    exported.add((p_id, side))

    dense = None
    birad = None

    # makedirs(exist_ok=True) creates patient and side folders in one call
    # and lets the cell be re-run after a partial export.
    s_folder = 'CMMD/{}/{}/'.format(p_id, side)
    os.makedirs(s_folder, exist_ok=True)
    cc_new = s_folder+'CC.jpg'
    mlo_new = s_folder+'MLO.jpg'

    for src, dst in ((root_path+cc_rel, cc_new), (root_path+mlo_rel, mlo_new)):
        # cv2.imwrite reports failure through its return value only.
        if not cv2.imwrite(dst, read_cmmd_dicom(src)):
            raise IOError('cv2.imwrite could not write {}'.format(dst))

    metadict['PatientID'].append(p_id)
    metadict['Side'].append(side)
    metadict['CC'].append(cc_new)
    metadict['MLO'].append(mlo_new)
    metadict['Pathology'].append(pathology)
    metadict['Type'].append(typee)
    metadict['Subtype'].append(subtype)
    metadict['Age'].append(age)
    metadict['BI-RADS'].append(birad)
    metadict['Density'].append(dense)

In [None]:
# Turn the collected column lists into a table and display it.
df = pd.DataFrame(metadict)
df

In [None]:
# Zip the preprocessed CMMD folder onto Drive; make_archive appends ".zip"
# to the base name given as first argument.
# NOTE(review): the export loop writes to the relative folder 'CMMD' while
# this call archives '/content/CMMD', so the two match only when the working
# directory is /content - confirm.
shutil.make_archive('/content/drive/MyDrive/Breast Cancer Preprocessed Datasets/CMMD',
                    'zip',
                    '/content/CMMD')

In [None]:
# Save the CMMD metadata (with the new JPEG paths) to Drive. With pandas'
# default settings the DataFrame index is written as the first column.
df.to_csv('/content/drive/MyDrive/Breast Cancer Preprocessed Datasets/cmmd-meta-data.csv')

# CBIS-DDSM

In [None]:
def read_cbis_dicom(path):
    '''Load one CBIS-DDSM DICOM file and return its preprocessed breast crop.
    The first slice of the file is min-max scaled to 0..255 (uint8), passed
    through clipped_zoom with a factor of 1.2 and then handed to cropp_image
    with threshold 5 and truncation switched on.
    Args:
        path([str]): location of the DICOM file.
    Returns:
        whatever cropp_image returns for the zoomed slice.
    '''
    pixels = sitk.GetArrayFromImage(sitk.ReadImage(path))[0]
    lowest, highest = pixels.min(), pixels.max()
    as_uint8 = (((pixels - lowest) / (highest - lowest)) * 255).astype(np.uint8)
    return cropp_image(clipped_zoom(as_uint8, 1.2), 5, True)

In [None]:
# Load the CBIS-DDSM metadata table and display it. The export loop below
# reads the PatientID, Side, CC, MLO, Pathology, Type, Density and
# Assessment columns.
clinical = pd.read_csv(root_path+'cbis-meta-data.csv')
clinical

In [None]:
# How many metadata rows does each patient have, and how often does each
# row count occur? p holds the unique patient ids, n their row counts; the
# displayed result is the histogram of n.
p, n = np.unique(clinical['PatientID'].values, return_counts=True)
np.unique(n, return_counts=True)

In [None]:
# Output folder for the preprocessed CBIS-DDSM JPEGs. exist_ok keeps the
# cell re-runnable instead of raising FileExistsError on the second execution.
os.makedirs('CBIS', exist_ok=True)

In [None]:
# Preprocess every CBIS-DDSM row (one patient side with a CC and an MLO
# view), write both views as JPEG under CBIS/<PatientID>/<Side>/ and collect
# the matching metadata. Subtype and age are not read from this CSV, so
# those stay None.
metadict = {
    'PatientID': [],
    'Side': [],

    'CC': [],
    'MLO': [],

    'Pathology': [],
    'Type': [],
    'BI-RADS': [],
    'Density': [],
    'Subtype': [],
    'Age': []
}

# Pull the needed columns out once instead of re-slicing the DataFrame on
# every iteration.
cbis_columns = ['PatientID', 'Side', 'CC', 'MLO', 'Pathology', 'Type', 'Density', 'Assessment']
cbis_rows = zip(*(clinical[col].values for col in cbis_columns))
exported = set()  # (patient, side) pairs already written in this run

for p_id, side_raw, cc_rel, mlo_rel, pathology, typee, dense, birad in tqdm(cbis_rows, total=len(clinical)):
    # Only the first character of the Side column is kept as side label.
    side = side_raw[0]

    # A repeated patient/side would silently overwrite the earlier JPEGs,
    # so reject it explicitly.
    if (p_id, side) in exported:
        raise ValueError('Duplicate row for patient {} side {}'.format(p_id, side))
    exported.add((p_id, side))

    subtype = None
    age = None

    # makedirs(exist_ok=True) creates patient and side folders in one call
    # and lets the cell be re-run after a partial export.
    s_folder = 'CBIS/{}/{}/'.format(p_id, side)
    os.makedirs(s_folder, exist_ok=True)
    cc_new = s_folder+'CC.jpg'
    mlo_new = s_folder+'MLO.jpg'

    for src, dst in ((root_path+cc_rel, cc_new), (root_path+mlo_rel, mlo_new)):
        # cv2.imwrite reports failure through its return value only.
        if not cv2.imwrite(dst, read_cbis_dicom(src)):
            raise IOError('cv2.imwrite could not write {}'.format(dst))

    metadict['PatientID'].append(p_id)
    metadict['Side'].append(side)
    metadict['CC'].append(cc_new)
    metadict['MLO'].append(mlo_new)
    metadict['Pathology'].append(pathology)
    metadict['Type'].append(typee)
    metadict['Subtype'].append(subtype)
    metadict['Age'].append(age)
    metadict['BI-RADS'].append(birad)
    metadict['Density'].append(dense)

In [None]:
# Turn the collected column lists into a table and display it.
df = pd.DataFrame(metadict)
df

In [None]:
# Zip the preprocessed CBIS folder onto Drive; make_archive appends ".zip"
# to the base name given as first argument.
# NOTE(review): the export loop writes to the relative folder 'CBIS' while
# this call archives '/content/CBIS', so the two match only when the working
# directory is /content - confirm.
shutil.make_archive('/content/drive/MyDrive/Breast Cancer Preprocessed Datasets/CBIS',
                    'zip',
                    '/content/CBIS'
                    )

In [None]:
# Save the CBIS metadata (with the new JPEG paths) to Drive. With pandas'
# default settings the DataFrame index is written as the first column.
df.to_csv('/content/drive/MyDrive/Breast Cancer Preprocessed Datasets/cbis-meta-data.csv')

# Local DICOM Data

In [None]:
# Load the metadata of the local DICOM collection and show its first rows.
# The next cell reads the CC and MLO path columns from it.
clinical = pd.read_csv(root_path+'local-meta-data.csv')
clinical.head()

In [None]:
# Pool the CC and MLO DICOM paths of the local collection, shuffle them and
# turn them into absolute paths for the preview cells below.
sample = list(clinical['CC'].values)
sample2 = list(clinical['MLO'].values)

sample.extend(sample2)
# Seeded generator: the same files are previewed on every Restart & Run All.
np.random.default_rng(42).shuffle(sample)
dicoms = [root_path+x for x in sample]
print(len(dicoms))

## Crop, Clahe, Synthesize

In [None]:
# Preview up to 20 local DICOMs (every 25th entry of the shuffled list):
# raw image, breast crop, CLAHE of the crop and the synthesized 3-channel
# image. Slicing instead of indexing dicoms[i*25] simply shows fewer figures
# when fewer than 20*25 files are available, rather than raising IndexError.
for dicom_path in dicoms[::25][:20]:
    img_hu = sitk.GetArrayFromImage(sitk.ReadImage(dicom_path))[0]
    img = (((img_hu - img_hu.min())/(img_hu.max() - img_hu.min()))*255).astype(np.uint8)

    cropped = cropp_image(img, 5)
    img_clipped = cv2.createCLAHE(2).apply(cropped)

    # (image, title, colormap) for each of the four panels, left to right.
    panels = [
        (img_hu, 'original', 'gray'),
        (cropped, 'Cropped', 'gray'),
        (img_clipped, 'Clahe: 2', 'gray'),
        (synthesize(cropped), 'Synthesized', None),
    ]

    fig, axes = plt.subplots(1, 4, figsize=(20, 4))
    for ax, (panel, title, cmap) in zip(axes, panels):
        ax.imshow(panel, cmap=cmap)
        ax.axis('off')
        ax.set_title(title)
    plt.show()

## Crop, Truncate, Clahe, Synthesize

In [None]:
# Same preview as the previous section, but cropp_image is called with
# truncation switched on. Up to 20 local DICOMs are shown (every 25th entry
# of the shuffled list); slicing instead of indexing dicoms[i*25] simply
# shows fewer figures when fewer than 20*25 files are available, rather than
# raising IndexError.
for dicom_path in dicoms[::25][:20]:
    img_hu = sitk.GetArrayFromImage(sitk.ReadImage(dicom_path))[0]
    img = (((img_hu - img_hu.min())/(img_hu.max() - img_hu.min()))*255).astype(np.uint8)

    cropped = cropp_image(img, 5, True)
    img_clipped = cv2.createCLAHE(2).apply(cropped)

    # (image, title, colormap) for each of the four panels, left to right.
    panels = [
        (img_hu, 'original', 'gray'),
        (cropped, 'Cropped', 'gray'),
        (img_clipped, 'Clahe: 2', 'gray'),
        (synthesize(cropped), 'Synthesized', None),
    ]

    fig, axes = plt.subplots(1, 4, figsize=(20, 4))
    for ax, (panel, title, cmap) in zip(axes, panels):
        ax.imshow(panel, cmap=cmap)
        ax.axis('off')
        ax.set_title(title)
    plt.show()

# KAA Data

In [None]:
# Load the KAA metadata table and display it. The next cell reads the CC
# and MLO path columns from it.
clinical = pd.read_csv(root_path+'kaa-meta-data.csv')
clinical

In [None]:
# Pool the CC and MLO DICOM paths of the KAA collection, shuffle them and
# turn them into absolute paths for the preview cells below.
sample = list(clinical['CC'].values)
sample2 = list(clinical['MLO'].values)

sample.extend(sample2)
# Seeded generator: the same files are previewed on every Restart & Run All.
np.random.default_rng(42).shuffle(sample)
dicoms = [root_path+x for x in sample]
print(len(dicoms))

## Crop, Clahe, Synthesize

In [None]:
# Preview up to 20 KAA DICOMs (every 210th entry of the shuffled list):
# raw image, breast crop (threshold 100), CLAHE of the crop and the
# synthesized 3-channel image. Slicing instead of indexing dicoms[i*210]
# simply shows fewer figures when fewer than 20*210 files are available,
# rather than raising IndexError.
for dicom_path in dicoms[::210][:20]:
    img_hu = sitk.GetArrayFromImage(sitk.ReadImage(dicom_path))[0]
    img = (((img_hu - img_hu.min())/(img_hu.max() - img_hu.min()))*255).astype(np.uint8)

    cropped = cropp_image(img, 100)
    img_clipped = cv2.createCLAHE(2).apply(cropped)

    # (image, title, colormap) for each of the four panels, left to right.
    panels = [
        (img_hu, 'original', 'gray'),
        (cropped, 'Cropped', 'gray'),
        (img_clipped, 'Clahe: 2', 'gray'),
        (synthesize(cropped), 'Synthesized', None),
    ]

    fig, axes = plt.subplots(1, 4, figsize=(20, 4))
    for ax, (panel, title, cmap) in zip(axes, panels):
        ax.imshow(panel, cmap=cmap)
        ax.axis('off')
        ax.set_title(title)
    plt.show()

## Crop, Truncate, Clahe, Synthesize

In [None]:
# Same preview as the previous section, but cropp_image is called with
# truncation switched on (threshold 100). Up to 20 KAA DICOMs are shown
# (every 210th entry of the shuffled list); slicing instead of indexing
# dicoms[i*210] simply shows fewer figures when fewer than 20*210 files are
# available, rather than raising IndexError.
for dicom_path in dicoms[::210][:20]:
    img_hu = sitk.GetArrayFromImage(sitk.ReadImage(dicom_path))[0]
    img = (((img_hu - img_hu.min())/(img_hu.max() - img_hu.min()))*255).astype(np.uint8)

    cropped = cropp_image(img, 100, True)
    img_clipped = cv2.createCLAHE(2).apply(cropped)

    # (image, title, colormap) for each of the four panels, left to right.
    panels = [
        (img_hu, 'original', 'gray'),
        (cropped, 'Cropped', 'gray'),
        (img_clipped, 'Clahe: 2', 'gray'),
        (synthesize(cropped), 'Synthesized', None),
    ]

    fig, axes = plt.subplots(1, 4, figsize=(20, 4))
    for ax, (panel, title, cmap) in zip(axes, panels):
        ax.imshow(panel, cmap=cmap)
        ax.axis('off')
        ax.set_title(title)
    plt.show()