In [None]:
# Mount Google Drive at /content/drive; root_path in a later cell points below this mount.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install SimpleITK==2.1.1
!pip install pydicom

import pydicom
import SimpleITK as sitk
import numpy as np
import pandas as pd
import cv2
from tqdm.notebook import tqdm
import copy
import os
import glob
import shutil
import random
from sklearn.utils import shuffle
import matplotlib.pyplot as plt

In [None]:
def select_largest_obj(img_bin, lab_val=255, fill_holes=False, 
                       smooth_boundary=False, kernel_size=15):
    '''Select the largest object from a binary image and optionally
    fill holes inside it and smooth its boundary.
    Args:
        img_bin(2D array): 2D numpy array of binary image.
        lab_val([int]): integer value used for the label of the largest 
                        object. Default is 255.
        fill_holes([boolean]): whether fill the holes inside the largest 
                               object or not. Default is false.
        smooth_boundary([boolean]): whether smooth the boundary of the 
                                    largest object using morphological 
                                    opening or not. Default is false.
        kernel_size([int]): the size of the kernel used for morphological 
                            operation.
    Returns:
        tuple: (x, y, w, h, largest_mask) where x, y, w, h is the bounding
        rectangle of the largest external contour of the final mask and
        largest_mask is a uint8 array shaped like img_bin holding lab_val
        on the selected object and 0 elsewhere.
    Raises:
        ValueError: if img_bin has no foreground component, or if no
        contour is left in the mask (e.g. the opening removed everything).
    '''
    n_labels, img_labeled, lab_stats, _ = cv2.connectedComponentsWithStats(
        img_bin, connectivity=8, ltype=cv2.CV_32S)
    # Label 0 is the background, so fewer than two labels means there is
    # nothing to select; fail with a clear message instead of an argmax error.
    if n_labels < 2:
        raise ValueError('img_bin contains no foreground object')
    largest_obj_lab = np.argmax(lab_stats[1:, cv2.CC_STAT_AREA]) + 1
    largest_mask = np.zeros(img_bin.shape, dtype=np.uint8)
    largest_mask[img_labeled == largest_obj_lab] = lab_val
    if fill_holes:
        bkg_locs = np.where(img_labeled == 0)
        # Without any background pixel there is nothing to flood from and
        # therefore no enclosed hole to fill.
        if bkg_locs[0].size > 0:
            # np.where yields (rows, cols) while OpenCV points are (x, y),
            # i.e. (col, row); plain ints keep the binding's parser happy.
            bkg_seed = (int(bkg_locs[1][0]), int(bkg_locs[0][0]))
            img_floodfill = largest_mask.copy()
            h_, w_ = largest_mask.shape
            mask_ = np.zeros((h_ + 2, w_ + 2), dtype=np.uint8)
            cv2.floodFill(img_floodfill, mask_, seedPoint=bkg_seed, newVal=lab_val)
            # Pixels still at 0 were not reachable from the background seed:
            # these are the holes. Writing lab_val directly matches the old
            # "mask + bitwise_not(flood)" result for lab_val=255 and stays
            # correct for any other label value.
            largest_mask[img_floodfill == 0] = lab_val
    if smooth_boundary:
        kernel_ = np.ones((kernel_size, kernel_size), dtype=np.uint8)
        largest_mask = cv2.morphologyEx(largest_mask, cv2.MORPH_OPEN, kernel_)
    
    cnts, _ = cv2.findContours(largest_mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if len(cnts) == 0:
        raise ValueError('no contour left in the largest-object mask')
    # The opening can split the object; the bounding box follows the piece
    # with the largest contour area.
    cnt = max(cnts, key = cv2.contourArea)
    x, y, w, h = cv2.boundingRect(cnt)
    return x, y, w, h, largest_mask

In [None]:
def process(img, threshold):
    '''Crop an image to its largest bright object and stretch the crop to 0-255.

    Args:
        img(2D array): grayscale image. The visible callers pass a uint8
                       array that was already min-max scaled to 0-255.
        threshold: pixels strictly above this value count as foreground
                   (cv2.THRESH_BINARY).
    Returns:
        2D uint8 array: the bounding-box crop of the largest foreground
        object, min-max rescaled to 0-255. A crop with a single constant
        value is returned as all zeros.
    '''
    _, binary_img = cv2.threshold(img, threshold, 
                                    maxval=255, type=cv2.THRESH_BINARY)
    # Only the bounding box is used; the mask itself is not needed here.
    x, y, w, h, _ = select_largest_obj(binary_img, lab_val=255, 
                                       fill_holes=False, 
                                       smooth_boundary=True, kernel_size=2)
    croped_img = img[y:y+h, x:x+w]
    lowest, highest = croped_img.min(), croped_img.max()
    # A flat crop would divide by zero and push NaN through the uint8 cast.
    if highest == lowest:
        return np.zeros(croped_img.shape, dtype=np.uint8)
    filtered_img = ((croped_img - lowest) / (highest - lowest)*255).astype(np.uint8)
    return filtered_img

In [None]:
import numpy as np
from scipy.ndimage import zoom


def clipped_zoom(img, zoom_factor, **kwargs):
    '''Zoom the centre rows of an image with scipy.ndimage.zoom.

    A vertically centred band of round(h / zoom_factor) rows is cut out and
    then scaled by zoom_factor along the first two axes; any further axes
    keep factor 1. Extra keyword arguments are passed on to zoom().

    NOTE(review): the band keeps the full width (the width is not divided
    by zoom_factor), so the result is about h rows by w * zoom_factor
    columns. Confirm that this asymmetry is intended.
    '''
    height, width = img.shape[:2]

    # Scale only the two leading axes.
    factors = (zoom_factor, zoom_factor) + (1,) * (img.ndim - 2)

    band_h = int(np.round(height / zoom_factor))
    band_w = int(np.round(width))
    first_row = (height - band_h) // 2
    first_col = (width - band_w) // 2

    band = img[first_row:first_row + band_h, first_col:first_col + band_w]
    return zoom(band, factors, **kwargs)

In [None]:
# Base folder on the mounted Drive; the cells below build dataset paths by appending to it.
root_path = '/content/drive/MyDrive/Breast Cancer/'

# CMMD

In [None]:
# Download metadata for the 'Chinese Mammography' folder; a later cell reads its
# 'Subject ID' and 'File Location' columns.
cmmd = pd.read_csv(root_path + 'Chinese Mammography/metadata.csv')
cmmd.head(2)

In [None]:
# Clinical spreadsheet for the same dataset. Later cells read its 'ID1', 'LeftRight',
# 'abnormality' and 'classification' columns.
clinical = pd.read_excel(root_path+'Chinese Mammography/CMMD_clinicaldata_revision.xlsx')
clinical

In [None]:
directory = []

for pid in tqdm(clinical['ID1'].values):
    # First 'File Location' recorded in the download metadata for this subject.
    direc = cmmd.loc[cmmd['Subject ID'] == pid, 'File Location'].values[0]
    # Switch to forward slashes, drop the first character, then prefix the dataset folder.
    directory.append('Chinese Mammography' + direc.replace('\\', '/')[1:])

clinical['File Location'] = directory
clinical

In [None]:
# For every clinical row, pick the two DICOM files used as CC and MLO views.
# NOTE(review): views are assigned purely by position in the folder listing
# (first file of the pair -> CC, second -> MLO); this is not checked against
# the DICOM headers here.
cc = []
mlo = []

not_downloaded = []
for i in tqdm(range(len(clinical))):
    sample = clinical.iloc[i]
    file_path = root_path+sample['File Location']
    try:
        # os.listdir returns names in arbitrary order; sorting makes the
        # positional choice below reproducible between runs.
        dicoms = sorted(os.listdir(file_path))
    except OSError:
        # Missing or unreadable folder: treated like an empty one below.
        dicoms = []

    # Folders with more than two files use the 3rd/4th file for right-breast
    # rows and the 1st/2nd file otherwise.
    if len(dicoms) > 2 and sample['LeftRight'] == 'R':
        pair = dicoms[2:4]
    else:
        pair = dicoms[0:2]

    if len(pair) == 2:
        cc_name = sample['File Location']+'/'+pair[0]
        mlo_name = sample['File Location']+'/'+pair[1]
    else:
        # Empty, missing or incomplete folder: record it and leave both views
        # empty rather than reusing values from the previous row.
        cc_name = None
        mlo_name = None
        not_downloaded.append(file_path)

    cc.append(cc_name)
    mlo.append(mlo_name)

clinical['CC'] = cc
clinical['MLO'] = mlo
clinical

In [None]:
# Keep only the rows for which a CC path was found. reset_index() without drop=True
# keeps the previous index as an extra 'index' column.
clinical = clinical[clinical['CC'].notna()].reset_index()
clinical

In [None]:
# Summary counts for the CMMD table.
# NOTE(review): apart from the first two figures, the numbers are row counts of
# `clinical`, although the captions say "Subjects".
subjects = clinical['ID1'].values
subs, counts = np.unique(subjects, return_counts=True)
print('Number of Subjects:', len(subs))

# Subjects that appear in more than one row.
n = int(np.sum(counts > 1))
print('Number of Subjects with Both-Side Mammography: ', n)

separator = '*********************************************************'

is_calc = clinical['abnormality']=='calcification'
is_mass = clinical['abnormality']=='mass'
is_both = clinical['abnormality']=='both'
is_benign = clinical['classification']=='Benign'
is_malignant = clinical['classification']=='Malignant'

print(separator)

for caption, selected in [
    ('Number of Subjects with Calcification: ', is_calc),
    ('Number of Subjects with Mass: ', is_mass),
    ('Number of Subjects with Mass and Calcification: ', is_both),
    ('Number of Subjects with Benign Abnormality: ', is_benign),
    ('Number of Subjects with Malignant Abnormality: ', is_malignant),
]:
    print(caption, len(clinical.loc[selected]))

print(separator)

for caption, selected in [
    ('Numbers of Subjects with Benign Calcification: ', is_calc & is_benign),
    ('Numbers of Subjects with Malignant Calcification: ', is_calc & is_malignant),
    ('Numbers of Subjects with Benign Mass: ', is_mass & is_benign),
    ('Numbers of Subjects with Malignant Mass: ', is_mass & is_malignant),
    ('Numbers of Subjects with Benign Mass and Calcifacation: ', is_both & is_benign),
    ('Numbers of Subjects with Malignant Mass and Calcifacation: ', is_both & is_malignant),
]:
    print(caption, len(clinical.loc[selected]))

In [None]:
# Visual check of one CMMD row: every file in its folder is shown as
# original / cropped / CLAHE-enhanced panels.
n = 1222  # positional row of `clinical` used for the preview
sample = clinical.iloc[n]
file_path = root_path+sample['File Location']
dicoms = os.listdir(file_path)
abnormality = sample['abnormality']
label = sample['classification']

print(sample['ID1'])
print(label, abnormality)
print('Images: ')

for i in range(len(dicoms)):
    img = sitk.ReadImage(file_path+'/'+dicoms[i])
    img = sitk.GetArrayFromImage(img)[0]  # first element along the leading axis
    maxx = img.max()  # NOTE(review): not used again in this cell
    plt.figure(figsize=(12, 4))
    plt.subplot(1, 3, 1)
    plt.imshow(img, cmap='gray')
    plt.axis('off')
    plt.title('original, name= {}'.format(dicoms[i]))

    # Min-max scale to 0-255 uint8, then crop to the largest object above threshold 5.
    img = (((img - img.min())/(img.max() - img.min()))*255).astype(np.uint8)
    img = process(img, 5)
    plt.subplot(1, 3, 2)
    plt.imshow(img, cmap='gray')
    plt.axis('off')
    plt.title('cropped')

    plt.subplot(1, 3, 3)
    cl = cv2.createCLAHE(2)  # the positional argument is the clip limit
    img = cl.apply(img)
    plt.imshow(img, cmap='gray')
    plt.axis('off')
    plt.title('clahe=2')
    plt.show()

In [None]:
# Persist the enriched CMMD table and the list of folders that could not be read.
clinical.to_csv(root_path+'Chinese Mammography/new-metadata.csv')

df = pd.DataFrame({'Not Downloaded': not_downloaded},
                  index=np.arange(len(not_downloaded)))
df.to_csv(root_path+'/notdownloaded.csv')

# CBIS

In [None]:
# Mass case descriptions: train and test sheets stacked into one table.
mass1 = pd.read_csv(root_path+'CBIS/CBIS-DDSM-All-doiJNLP-zzWs5zfZ/mass_case_description_train_set.csv')
mass2 = pd.read_csv(root_path+'CBIS/CBIS-DDSM-All-doiJNLP-zzWs5zfZ/mass_case_description_test_set.csv')
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result.
mass = pd.concat([mass1, mass2], ignore_index=True)
mass.head()

In [None]:
# Calcification case descriptions: train and test sheets stacked into one table.
calc1 = pd.read_csv(root_path+'CBIS/CBIS-DDSM-All-doiJNLP-zzWs5zfZ/calc_case_description_train_set.csv')
calc2 = pd.read_csv(root_path+'CBIS/CBIS-DDSM-All-doiJNLP-zzWs5zfZ/calc_case_description_test_set.csv')
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result.
calc = pd.concat([calc1, calc2], ignore_index=True)
calc.head()

In [None]:
# All CBIS case descriptions (mass rows first, then calcification rows).
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result.
cbis = pd.concat([mass, calc], ignore_index=True)
cbis.head()

In [None]:
# Download metadata, restricted to the series described as full mammogram images.
# A later cell reads its 'Subject ID' and 'File Location' columns.
meta = pd.read_csv(root_path+'CBIS/CBIS-DDSM-All-doiJNLP-zzWs5zfZ/metadata.csv')
meta = meta.loc[meta['Series Description']=='full mammogram images']
meta.head()

In [None]:
# Row count first, then the number of distinct patient ids.
print(len(cbis['patient_id'].values))
patients = np.unique(cbis['patient_id'].values)
print(len(patients))

In [None]:
# Build one record per (patient, breast side) that has both a CC and an MLO
# full-mammogram file on disk.
patient_id = []
side = []
cc_path = []
mlo_path = []
pathology = []
density = []
assessment = []
ab_type = []
# (patient id, side) pairs dropped because a view could not be resolved.
skipped = []

cbis_dir = root_path+'CBIS/CBIS-DDSM-All-doiJNLP-zzWs5zfZ/'


def _find_cbis_dicom(df_side, view):
    '''Return the path (relative to root_path) of the first .dcm file for one view.

    The first path component of the row's 'image file path' is looked up as
    'Subject ID' in `meta`; the matching 'File Location' (minus its first two
    characters) is the folder that is globbed for *.dcm files.
    Raises IndexError when the view, the metadata row or the file is missing.
    '''
    image_path = df_side.loc[df_side['image view']==view]['image file path'].values[0]
    location = meta.loc[meta['Subject ID']==image_path.split('/')[0]]['File Location'].values[0]
    found = glob.glob(cbis_dir+location[2:]+'/*.dcm')[0]
    return found.replace(root_path, '')


for i in tqdm(range(len(patients))):
    df_patient = cbis.loc[cbis['patient_id']==patients[i]]
    for s in ['LEFT', 'RIGHT']:
        df_side = df_patient.loc[df_patient['left or right breast'] == s]
        if len(np.unique(df_side['image view'].values)) >= 2:
            try:
                cc = _find_cbis_dicom(df_side, 'CC')
                mlo = _find_cbis_dicom(df_side, 'MLO')
            except (IndexError, AttributeError, TypeError):
                # Missing view/metadata/file (IndexError) or a non-string path
                # cell (AttributeError/TypeError): skip this breast but keep a
                # record. Other errors, such as a wrong column name, now surface.
                skipped.append((patients[i], s))
                continue
            cc_path.append(cc)
            mlo_path.append(mlo)
            patient_id.append(patients[i])
            side.append(s)
            # NOTE(review): the descriptive fields below come from the first row
            # of this breast only, even if it has several abnormality rows.
            density.append(df_side['breast_density'].values[0])
            pathh = df_side['pathology'].values[0]
            # Any pathology value containing '_' is collapsed to 'BENIGN'.
            if '_' in pathh:
                pathh = 'BENIGN'
            pathology.append(pathh.lower().capitalize())
            assessment.append(df_side['assessment'].values[0])
            ab_type.append(df_side['abnormality type'].values[0])

print('Skipped (patient, side) pairs:', len(skipped))

In [None]:
# Assemble the per-breast CBIS table from the lists collected above.
index = np.arange(len(patient_id))
columns = ['PatientID', 'Side', 'CC', 'MLO', 'Pathology', 'Density', 'Assessment', 'Type']
column_values = [patient_id, side, cc_path, mlo_path, pathology, density, assessment, ab_type]

df = pd.DataFrame(dict(zip(columns, column_values)), index=index)

df

In [None]:
# Summary counts for the CBIS table (`clinical` is rebound to it here).
# NOTE(review): apart from the first two figures, the numbers are row counts,
# although the captions say "Subjects".
clinical = df
subjects = clinical['PatientID'].values
subs, counts = np.unique(subjects, return_counts=True)
print('Number of Subjects:', len(subs))

# Patients that appear in more than one row.
n = int(np.sum(counts > 1))
print('Number of Subjects with Both-Side Mammography: ', n)

separator = '*********************************************************'

is_calc = clinical['Type']=='calcification'
is_mass = clinical['Type']=='mass'
is_benign = clinical['Pathology']=='Benign'
is_malignant = clinical['Pathology']=='Malignant'

print(separator)

for caption, selected in [
    ('Number of Subjects with Calcification: ', is_calc),
    ('Number of Subjects with Mass: ', is_mass),
    ('Number of Subjects with Benign Abnormality: ', is_benign),
    ('Number of Subjects with Malignant Abnormality: ', is_malignant),
]:
    print(caption, len(clinical.loc[selected]))

print(separator)

for caption, selected in [
    ('Numbers of Subjects with Benign Calcification: ', is_calc & is_benign),
    ('Numbers of Subjects with Malignant Calcification: ', is_calc & is_malignant),
    ('Numbers of Subjects with Benign Mass: ', is_mass & is_benign),
    ('Numbers of Subjects with Malignant Mass: ', is_mass & is_malignant),
]:
    print(caption, len(clinical.loc[selected]))


In [None]:
# Visual check of one CBIS row: its CC and MLO files are shown as
# original / zoomed-and-cropped / CLAHE-enhanced panels.
n = 1222  # positional row of `clinical` used for the preview
sample = clinical.iloc[n]
CC = sample['CC']
MLO = sample['MLO']
abnormality = sample['Type']
label = sample['Pathology']

print(sample['PatientID'])
print(label, abnormality)
print('Images: ')

dicoms = [CC, MLO]
for i in range(len(dicoms)):
    img = sitk.ReadImage(root_path+'/'+dicoms[i])
    img = sitk.GetArrayFromImage(img)[0]  # first element along the leading axis
    plt.figure(figsize=(12, 4))
    plt.subplot(1, 3, 1)
    plt.imshow(img, cmap='gray')
    plt.axis('off')
    plt.title('original')
    # Min-max scale to 0-255 uint8, zoom by 1.2, then crop to the largest
    # object above threshold 5.
    img = (((img - img.min())/(img.max() - img.min()))*255).astype(np.uint8)
    img = clipped_zoom(img, 1.2)
    img = process(img, 5)
    plt.subplot(1, 3, 2)
    plt.imshow(img, cmap='gray')
    plt.axis('off')
    plt.title('cropped')
    plt.subplot(1, 3, 3)
    cl = cv2.createCLAHE(2)  # the positional argument is the clip limit
    img = cl.apply(img)
    plt.imshow(img, cmap='gray')
    plt.axis('off')
    plt.title('clahe=2')
    plt.show()

In [None]:
# Save the per-breast CBIS table next to the dataset's own CSV files.
clinical.to_csv(root_path+'CBIS/CBIS-DDSM-All-doiJNLP-zzWs5zfZ/new-metadata.csv')

# Local Data

In [None]:
# Label sheet for the local data. Later cells read its 'Path', 'Subject',
# 'Normal', 'Benign', 'Malignant' and 'BI_RADS' columns.
labels = pd.read_csv(root_path+'label_with_path.csv')
labels.head()

In [None]:
# Strip the Drive mount prefix from every stored path.
path = [p.replace('/content/drive/MyDrive/', '') for p in labels['Path'].values]

labels['Path'] = path
labels.head()

In [None]:
# Read each file's SeriesDescription from the DICOM header (pixel data is not loaded).
des = []

for rel_path in tqdm(labels['Path'].values):
    dicom = root_path+'/'+rel_path
    ds = pydicom.dcmread(dicom, stop_before_pixels=True)
    des.append(ds.SeriesDescription)

labels['Description'] = des
labels

In [None]:
# Keep only files whose SeriesDescription is one of the four listed views.
keep = ['L CC', 'R CC', 'R MLO', 'L MLO']
labels = labels.loc[labels['Description'].isin(keep)]
labels

In [None]:
# Subjects with more than one remaining image, and how many images each has.
patients, number = np.unique(labels['Subject'].values, return_counts=True)
has_several = number > 1
patients = list(patients[has_several])
number = number[has_several]
print('Number of Subjects: ', len(patients))

In [None]:
# Drop the subjects that were left with a single image.
labels = labels.loc[labels['Subject'].isin(patients)]
labels

In [None]:
# Split the description into breast side and view. Anything without an 'R'
# is labelled 'L', and anything without 'CC' is labelled 'MLO'.
descriptions = labels['Description'].values

side = ['R' if 'R' in text else 'L' for text in descriptions]
view = ['CC' if 'CC' in text else 'MLO' for text in descriptions]

labels['Side'] = side
labels['View'] = view

labels

In [None]:
# Keep subjects whose image set can yield at least one CC/MLO pair.
# Each subject is evaluated once; the previous version re-filtered the whole
# table for every row and appended the same subject repeatedly.
keep = []

for p, pdf in labels.groupby('Subject', sort=False):
    side = np.unique(pdf['Side'].values)
    view = np.unique(pdf['View'].values)
    has_both_views = ('CC' in view) and ('MLO' in view)
    n_images = len(pdf)

    if n_images == 4:
        # Four images must cover both breasts.
        accepted = has_both_views and ('L' in side) and ('R' in side)
    elif n_images == 2:
        # Two images must belong to the same breast.
        accepted = has_both_views and len(side) == 1
    elif n_images in (3, 5):
        accepted = has_both_views
    else:
        # Any other image count is discarded, as before.
        accepted = False

    if accepted:
        keep.append(p)

labels = labels.loc[labels['Subject'].isin(keep)]
labels

In [None]:
# Recompute the subject list and per-subject image counts after filtering.
patients, number = np.unique(labels['Subject'].values, return_counts=True)
has_several = number > 1
patients = list(patients[has_several])
number = number[has_several]
print('Number of Subjects: ', len(patients))

In [None]:
# Images per subject, in the same order as `patients`.
number

In [None]:
# How many subjects have each image count.
np.unique(number, return_counts=True)

In [None]:
# One CC/MLO pair per breast for subjects with exactly four or two images.
PatientID = []
Side = []
CC = []
MLO = []

for i in tqdm(range(len(patients))):
    if number[i] not in (2, 4):
        continue

    p = patients[i]
    p_df = labels.loc[labels['Subject']==p]

    # Four images: both breasts. Two images: the side of the first row.
    if number[i] == 4:
        S = ['L', 'R']
    else:
        S = [p_df['Side'].values[0]]

    for s in S:
        s_df = p_df.loc[p_df['Side']==s]
        cc = s_df.loc[s_df['View']=='CC']['Path'].values[0]
        mlo = s_df.loc[s_df['View']=='MLO']['Path'].values[0]
        PatientID.append(p)
        Side.append(s)
        CC.append(cc)
        MLO.append(mlo)

print(len(PatientID))

In [None]:
# Subjects with exactly three images: add the pair(s) that can be formed.
for i in tqdm(range(len(patients))):
    if number[i] != 3:
        continue

    p = patients[i]
    p_df = labels.loc[labels['Subject']==p]
    present = np.unique(p_df['Side'].values)

    if len(present) == 1:
        # All three images are from one breast.
        chosen = [present[0]]
    else:
        # Both breasts present: only a side with exactly two images is used.
        chosen = [s for s in ['L', 'R'] if len(p_df.loc[p_df['Side']==s]) == 2]

    for side in chosen:
        s_df = p_df.loc[p_df['Side']==side]
        cc = s_df.loc[s_df['View']=='CC']['Path'].values[0]
        mlo = s_df.loc[s_df['View']=='MLO']['Path'].values[0]
        PatientID.append(p)
        Side.append(side)
        CC.append(cc)
        MLO.append(mlo)

print(len(PatientID))

In [None]:
# Label every collected pair from the first row of its subject: the class is
# the column (Normal/Benign/Malignant) holding the largest value.
Pathology = []
Type = []
birads = []

candids = ['Normal', 'Benign', 'Malignant']
for subject in tqdm(PatientID):
    pdf = labels.loc[labels['Subject']==subject]
    scores = pdf[candids].values[0]
    Pathology.append(candids[np.argmax(scores)])
    Type.append(None)
    birads.append(pdf['BI_RADS'].values[0])

In [None]:
# Assemble the per-breast table for the local data.
columns = ['PatientID', 'Side', 'CC', 'MLO', 'Pathology', 'Type', 'BI-RADS']
index = np.arange(len(PatientID))
column_values = [PatientID, Side, CC, MLO, Pathology, Type, birads]

df = pd.DataFrame(dict(zip(columns, column_values)), index=index)

df

In [None]:
# Remove the row labelled 250.
# NOTE(review): the reason for excluding this row is not recorded in the notebook,
# and running this cell twice raises KeyError because the label is then gone.
df = df.drop(index=250)
df

In [None]:
def person_names_callback(dataset, data_element):
    """Replace the value of a person-name element with "anonymous".

    Elements whose VR is not "PN" are left untouched. `dataset` is accepted
    to fit the (dataset, data_element) callback signature and is not used.
    """
    if data_element.VR != "PN":
        return
    data_element.value = "anonymous"

def anonymize_dicom(path):
    """Read the DICOM file at `path`, run person_names_callback over it via
    walk(), and return the in-memory dataset.

    Nothing is written to disk here, and only elements handled by
    person_names_callback are changed.
    """
    ds = pydicom.dcmread(path)
    ds.walk(person_names_callback)
    return ds

In [None]:
# Write anonymised copies of every CC/MLO pair to Local_Data/<n>/<side>/ and
# collect the new relative paths. <n> is the running index of the subject in
# `patients`, and it also replaces the original identifier in PatientID.
# NOTE(review): anonymize_dicom only rewrites person-name (PN) elements; every
# other header field is saved unchanged. Confirm that this is sufficient.
patients = np.unique(df['PatientID'].values)

PatientID = []
Side = []
CC = []
MLO = []
Pathology = []
Type = []
BI_Rads = []

for i in tqdm(range(len(patients))):
    pdf = df.loc[df['PatientID']==patients[i]]

    p_direc = 'Local_Data/'+'{}/'.format(i)
    # exist_ok tolerates re-runs while still raising on real failures
    # (permissions, quota); the old bare except reported them all as 'exists'.
    os.makedirs(root_path+p_direc, exist_ok=True)

    for s in ['R', 'L']:
        s_df = pdf.loc[pdf['Side']==s]
        # The table holds at most one row per subject and side, so the former
        # "two rows" and "otherwise" branches reduce to this single check.
        if len(s_df) != 1:
            continue

        s_direc = '{}/'.format(s)
        os.makedirs(root_path+p_direc+s_direc, exist_ok=True)

        cc = s_df['CC'].values[0]
        mlo = s_df['MLO'].values[0]

        ccdicom = anonymize_dicom(root_path+'/'+cc)
        mlodicom = anonymize_dicom(root_path+'/'+mlo)

        ccdicom.save_as(root_path+p_direc+s_direc+'CC.dcm')
        mlodicom.save_as(root_path+p_direc+s_direc+'MLO.dcm')

        PatientID.append(i)
        Side.append(s)
        CC.append(p_direc+s_direc+'CC.dcm')
        MLO.append(p_direc+s_direc+'MLO.dcm')
        Pathology.append(s_df['Pathology'].values[0])
        Type.append(s_df['Type'].values[0])
        BI_Rads.append(s_df['BI-RADS'].values[0])

In [None]:
from pandas.core.arrays.numeric import T
# Rebuild the table so that it points at the anonymised copies.
columns = ['PatientID', 'Side', 'CC', 'MLO', 'Pathology', 'Type', 'BI-RADS']
index = np.arange(len(PatientID))
column_values = [PatientID, Side, CC, MLO, Pathology, Type, BI_Rads]

df = pd.DataFrame(dict(zip(columns, column_values)), index=index)

df

In [None]:
# Visual check of every exported pair: cropped CC on the left, cropped MLO on the right.
for i in range(len(df)):
    cc = root_path+df['CC'].values[i]
    mlo = root_path+df['MLO'].values[i]

    s = df['Side'].values[i]

    img = sitk.ReadImage(cc)
    img = sitk.GetArrayFromImage(img)[0]  # first element along the leading axis
    # Min-max scale to 0-255 uint8, then crop to the largest object above threshold 5.
    img = (((img - img.min())/(img.max() - img.min()))*255).astype(np.uint8)
    img = process(img, 5)

    print('*********************{}**********************'.format(i))
    plt.figure()
    plt.subplot(1, 2, 1)
    plt.imshow(img, cmap='gray')
    plt.title('{}-{}'.format(s, 'CC'))
    plt.axis('off')

    # Same steps for the MLO file of this row.
    img = sitk.ReadImage(mlo)
    img = sitk.GetArrayFromImage(img)[0]
    img = (((img - img.min())/(img.max() - img.min()))*255).astype(np.uint8)
    img = process(img, 5)

    plt.subplot(1, 2, 2)
    plt.imshow(img, cmap='gray')
    plt.title('{}-{}'.format(s, 'MLO'))
    plt.axis('off')
    plt.show()

In [None]:
# Save the table of anonymised local pairs under root_path.
df.to_csv(root_path+'/local-meta-data.csv')

In [None]:
# Rows per pathology class, printed in the order Normal, Benign, Malignant.
for pathology_name in ('Normal', 'Benign', 'Malignant'):
    print(len(df.loc[df['Pathology']==pathology_name]))

# KAA Dataset

In [None]:
# Load the KAA sheet, keep the columns used below and drop rows assessed as 'BIRAD 0'.
# Several column names carry a trailing space and must be spelled exactly as shown.
data_dir = 'KAA_Dataset/mammogram dataset/'

metadata = pd.read_excel(root_path+data_dir+'correctSheetlast.xlsx', sheet_name='correctSheet')
cols_to_keep = [
                'patient id ', 'date of the study', 'brest type',
                'brest view', 'assesment ', 'image path ', 'patient age'
]
metadata = metadata[cols_to_keep]
metadata = metadata.loc[metadata['assesment ']!='BIRAD 0']

# Rewrite each image path relative to root_path: the substring 'mammograms' is
# removed and the DICOM folder is prefixed.
path = metadata['image path '].values
path = [data_dir+'KAU-MDS (DICOM format)/'+x.replace('mammograms', '') for x in path]
metadata['image path ']=path
metadata

In [None]:
# Keep patients whose number of images is 4, 8 or 12.
patients, number = np.unique(metadata['patient id '].values, return_counts=True)
is_complete = np.isin(number, [4, 8, 12])
patients = list(patients[is_complete])
number = number[is_complete]
print('Number of Subjects: ', len(patients))

In [None]:
# Restrict the sheet to the patients selected in the previous cell.
metadata = metadata.loc[metadata['patient id '].isin(patients)]
metadata

In [None]:
# One record per patient, study date and breast side that has exactly two images.
PatientID = []
Side = []
CC = []
MLO = []
Pathology = []
Type = []
birads = []

for patient in tqdm(patients):
    pdf = metadata.loc[metadata['patient id ']==patient]

    for date in np.unique(pdf['date of the study'].values):
        date_df = pdf.loc[pdf['date of the study']==date]

        for s in ('R', 'L'):
            sdf = date_df.loc[date_df['brest type']==s]
            if len(sdf) != 2:
                continue

            cc = sdf.loc[sdf['brest view']=='CC', 'image path '].values[0]
            mlo = sdf.loc[sdf['brest view']=='MLO', 'image path '].values[0]
            # The number after the space in the first row's assessment, e.g. 'BIRAD 3' -> 3.
            birad = int(sdf['assesment '].values[0].split(' ')[1])

            PatientID.append(patient)
            Side.append(s)
            CC.append(cc)
            MLO.append(mlo)
            Pathology.append(None)
            Type.append(None)
            birads.append(birad)

In [None]:
# Assemble the per-breast KAA table (Pathology and Type are empty for this dataset).
columns = ['PatientID', 'Side', 'CC', 'MLO', 'Pathology', 'Type', 'BI-RADS']
index = np.arange(len(PatientID))
column_values = [PatientID, Side, CC, MLO, Pathology, Type, birads]

df = pd.DataFrame(dict(zip(columns, column_values)), index=index)

df

In [None]:
# Positions of rows whose CC or MLO file is not present on disk.
indexes = [
    i for i in tqdm(range(len(df)))
    if not (os.path.isfile(root_path+df['CC'].values[i])
            and os.path.isfile(root_path+df['MLO'].values[i]))
]

print(len(indexes))

In [None]:
# Table without the rows whose files are missing (df itself is left unchanged).
df2 = df.drop(index=indexes)
df2

In [None]:
# Number of distinct patients left after removing rows with missing files.
p = np.unique(df2['PatientID'].values)
print(len(p))

In [None]:
# Rows per BI-RADS grade, printed for grades 1 through 5 in order.
for grade in range(1, 6):
    print(len(df2.loc[df2['BI-RADS']==grade]))

In [None]:
# Save the KAA table restricted to rows whose files exist.
df2.to_csv(root_path+'kaa-meta-data.csv')

In [None]:
# Visual check of a KAA pair: cropped CC on the left, cropped MLO on the right.
# Rows come from df2, i.e. only pairs whose files were found on disk; the
# unfiltered df can point at missing files. Threshold 100 is used here, where
# the other previews use 5.
for i in range(min(10, len(df2))):
    cc = root_path+df2['CC'].values[i]
    mlo = root_path+df2['MLO'].values[i]

    s = df2['Side'].values[i]

    img = sitk.ReadImage(cc)
    img = sitk.GetArrayFromImage(img)[0]  # first element along the leading axis
    # Min-max scale to 0-255 uint8, then crop to the largest object above the threshold.
    img = (((img - img.min())/(img.max() - img.min()))*255).astype(np.uint8)
    img = process(img, 100)

    print('*********************{}**********************'.format(i))
    plt.figure()
    plt.subplot(1, 2, 1)
    plt.imshow(img, cmap='gray')
    plt.title('{}-{}'.format(s, 'CC'))
    plt.axis('off')

    img = sitk.ReadImage(mlo)
    img = sitk.GetArrayFromImage(img)[0]
    img = (((img - img.min())/(img.max() - img.min()))*255).astype(np.uint8)
    img = process(img, 100)

    plt.subplot(1, 2, 2)
    plt.imshow(img, cmap='gray')
    plt.title('{}-{}'.format(s, 'MLO'))
    plt.axis('off')
    plt.show()
    # Only the first pair is shown, as before.
    break