In [1]:
import os

import numpy as np
import pandas as pd
import pydicom

In [6]:
# Padding divisor: every crop is padded on each side by
# (longest bounding-box side // divisor), so 4 adds about 25% of that side
divisor = 4
# Name of the CSV column used as the DataFrame index; its values are the
# mask file names (without the ".dcm" extension)
key = 'identifier'

In [7]:
# Crop the padded mass region out of every scan and export it as CSV and NPY
def _rescale_to_255(region):
    """Linearly rescale ``region`` to floats spanning [0, 255].

    The arithmetic is done in float64 so integer pixel data cannot wrap
    around, and a constant-valued region maps to all zeros instead of
    dividing by zero.
    """
    region = region.astype(np.float64)
    low, high = region.min(), region.max()
    if high == low:
        # Flat crop: there is no contrast to stretch, avoid 0/0 -> NaN
        return np.zeros_like(region)
    return (region - low) * 255.0 / (high - low)


def dicom_to_npy(csv_file, img_dir, export_dir):
    """Export the padded mask bounding box of each scan as CSV and NPY files.

    For every identifier in ``csv_file`` the mask DICOM ``<img_dir>/<id>.dcm``
    is read and the bounding box of its white (255) pixels is padded on each
    side by ``longest_side // divisor`` (clamped at 0). That window is cut out
    of the scan DICOM, whose file name is the identifier without its last
    ``_suffix``. The crop is rescaled to floats in [0, 255] and written to
    ``<export_dir>/csv/<id>.csv`` and ``<export_dir>/npy/<id>.npy``.

    Reads the module-level ``key`` (index column name) and ``divisor``
    (padding divisor). Progress is printed for every identifier.

    Args:
        csv_file: Path of the CSV holding the identifiers in column ``key``.
        img_dir: Directory containing both the mask and the scan DICOM files.
        export_dir: Output root; the ``csv`` and ``npy`` sub-directories are
            created if they do not exist.

    Raises:
        ValueError: If a mask has no 255-valued pixel, or if the padded
            window selects no pixels from the scan.
    """
    # Load csv of dicom metadata, indexed by identifier
    df = pd.read_csv(csv_file).set_index(key)

    # Make sure both export folders exist before the first write
    csv_dir = os.path.join(export_dir, 'csv')
    npy_dir = os.path.join(export_dir, 'npy')
    os.makedirs(csv_dir, exist_ok=True)
    os.makedirs(npy_dir, exist_ok=True)

    # Loop through dicom files in csv
    for sample_id in df.index:
        print(sample_id)
        print('--------------------')
        # Get dcm for mask
        mask_path = os.path.join(img_dir, f"{sample_id}.dcm")
        mask_array = pydicom.dcmread(mask_path).pixel_array
        # Get index of where mask is white
        mask_index = np.argwhere(mask_array == 255)
        if mask_index.size == 0:
            raise ValueError(f"Mask {mask_path} contains no white (255) pixels")
        # Get bounding box indices of mask: [row_min, row_max, col_min, col_max]
        rows, cols = mask_index[:, 0], mask_index[:, 1]
        bbox = [np.min(rows), np.max(rows), np.min(cols), np.max(cols)]
        longest_side = max(bbox[3]-bbox[2], bbox[1]-bbox[0])
        print(f"{bbox} {longest_side}")
        # Get padding by percentage
        pad = longest_side // divisor
        # Get padded region of mass; only negative starts need clamping,
        # ends past the image edge are clipped by the slice itself
        # NOTE(review): bbox holds inclusive max indices but the slice below
        # treats them as exclusive ends, so the crop stops one pixel short at
        # the bottom/right. Left unchanged to match already-exported crops.
        img_bbox = [max(x+y,0) for x,y in zip(bbox,[-pad,pad,-pad,pad])]
        print(f"{img_bbox} {pad}")
        # Get dcm for scan: split the identifier (not the whole path) so an
        # underscore in the directory name cannot be picked up by mistake
        scan_id = str(sample_id).rsplit('_', 1)[0]
        img_path = os.path.join(img_dir, f"{scan_id}.dcm")
        # Get pixel array of region of interest
        img16bit = pydicom.dcmread(img_path).pixel_array[img_bbox[0]:img_bbox[1], img_bbox[2]:img_bbox[3]]
        if img16bit.size == 0:
            raise ValueError(
                f"Region {img_bbox} selects no pixels from scan {img_path}"
            )
        # Normalize 16bit image into float between (0,255)
        img8bit = _rescale_to_255(img16bit)
        print(f"{img8bit.shape} {img8bit.min()}-{img8bit.max()}")
        # Export image as csv and npy
        np.savetxt(os.path.join(csv_dir, f'{sample_id}.csv'), img8bit, delimiter=',')
        np.save(os.path.join(npy_dir, f'{sample_id}.npy'), img8bit)
        print('--------------------')

In [4]:
# Export the cropped mass regions for the training split
dicom_to_npy(
    csv_file='data/df_train.csv',
    img_dir='CBIS-DDSM/Mass-Training/',
    export_dir='data/cbis-ddsm/',
)

P_00001_LEFT_CC_1
--------------------
[2422, 2896, 270, 660] 474
[2304, 3014, 152, 778] 118
(710, 626) 0.0-255.0
--------------------
P_00001_LEFT_MLO_1
--------------------
[2980, 3231, 271, 486] 251
[2918, 3293, 209, 548] 62
(375, 339) 0.0-255.0
--------------------
P_00004_LEFT_CC_1
--------------------
[3500, 3925, 1417, 1799] 425
[3394, 4031, 1311, 1905] 106
(637, 594) 0.0-255.0
--------------------
P_00004_LEFT_MLO_1
--------------------
[3055, 3441, 1653, 2034] 386
[2959, 3537, 1557, 2130] 96
(578, 573) 0.0-255.0
--------------------
P_00004_RIGHT_MLO_1
--------------------
[3804, 4162, 694, 1118] 424
[3698, 4268, 588, 1224] 106
(570, 636) 0.0-255.0
--------------------
P_00009_RIGHT_CC_1
--------------------
[2176, 2356, 485, 684] 199
[2127, 2405, 436, 733] 49
(278, 297) 0.0-255.0
--------------------
P_00009_RIGHT_MLO_1
--------------------
[2559, 2789, 584, 763] 230
[2502, 2846, 527, 820] 57
(344, 293) 0.0-255.0
--------------------
P_00015_LEFT_MLO_1
--------------------
[2

In [8]:
# Export the cropped mass regions for the test split
dicom_to_npy(
    csv_file='data/df_test.csv',
    img_dir='CBIS-DDSM/Mass-Test/',
    export_dir='data/cbis-ddsm/',
)

P_00016_LEFT_CC_1
--------------------
[2588, 2931, 1341, 1685] 344
[2502, 3017, 1255, 1771] 86
(515, 516) 0.0-255.0
--------------------
P_00016_LEFT_MLO_1
--------------------
[3659, 3987, 1624, 1911] 328
[3577, 4069, 1542, 1993] 82
(492, 451) 0.0-255.0
--------------------
P_00017_LEFT_CC_1
--------------------
[3767, 3940, 251, 405] 173
[3724, 3983, 208, 448] 43
(259, 240) 0.0-255.0
--------------------
P_00017_LEFT_MLO_1
--------------------
[3815, 3994, 207, 391] 184
[3769, 4040, 161, 437] 46
(271, 276) 0.0-255.0
--------------------
P_00032_RIGHT_CC_1
--------------------
[1786, 2150, 1868, 2203] 364
[1695, 2241, 1777, 2294] 91
(546, 517) 0.0-255.0
--------------------
P_00032_RIGHT_MLO_1
--------------------
[2143, 2512, 1369, 1789] 420
[2038, 2617, 1264, 1894] 105
(579, 630) 0.0-255.0
--------------------
P_00037_RIGHT_CC_1
--------------------
[1366, 1754, 1679, 2001] 388
[1269, 1851, 1582, 2098] 97
(582, 516) 0.0-255.0
--------------------
P_00037_RIGHT_MLO_1
---------------