In [1]:
import numpy as np
import pydicom
from models.preprocessing import SimplePreprocessor
import os

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Load data and convert to numpy

In [2]:
## Locate the image folders (one sub-folder of DICOM files per class).
# The default root is the original local path; set the PNEUMONIA_DATA_DIR
# environment variable to run the notebook elsewhere without editing this cell.
data_root = os.environ.get(
    'PNEUMONIA_DATA_DIR',
    '/Users/clidev/Desktop/Capstone_Project/pnumonia/all_train_dcm_data',
)
pneumonia_path = os.path.join(data_root, 'pneumonia')
normal_path = os.path.join(data_root, 'normal')

In [3]:
# File names found in each class folder, kept in the order os.listdir returns them.
# Hidden entries (names starting with '.', e.g. macOS '.DS_Store' or
# '.ipynb_checkpoints') are skipped: they are not images and would otherwise be
# handed to the DICOM reader in the conversion loop.
pneumonia_list = [name for name in os.listdir(pneumonia_path) if not name.startswith('.')]
normal_list = [name for name in os.listdir(normal_path) if not name.startswith('.')]

In [4]:
# One entry per category, each entry being that category's list of file names.
# The position in this list is used as the numeric label by the conversion loop:
# index 0 is the normal list, index 1 is the pneumonia list.
classes = [normal_list, pneumonia_list]

In [5]:
# Folder of each category, listed in the same order as `classes`
# (index 0 is the normal folder, index 1 is the pneumonia folder).
class_paths = [normal_path, pneumonia_path]


In [6]:
# Initialize the simple preprocessor with a 128 by 128 target size
target_side = 128
sp = SimplePreprocessor(width = target_side, height = target_side)

### Convert to numpy

In [7]:
# Convert every DICOM file into a numpy array.
# `data` collects the preprocessed images and `labels` the matching class index,
# so data[k] and labels[k] always describe the same file.
data = []
labels = []
PrintCount = 0
# Loop over the categories; the index of a category is its numeric label
for (i, file_names) in enumerate(classes):
    n_files = len(file_names)
    for (j, file_name) in enumerate(file_names):
        # Read the DICOM file and take its pixel data as an array.
        # dcmread is the supported reader; read_file was only a deprecated
        # alias and is no longer available in recent pydicom releases.
        dcm_data = pydicom.dcmread(os.path.join(class_paths[i], file_name))
        im = dcm_data.pixel_array

        # Run the image through the preprocessor (configured for 128 x 128)
        im = sp.preprocess(im)

        # Add the depth dimension as axis 2
        im = np.expand_dims(im, axis = 2)

        # Append label and image together so both lists stay aligned
        labels.append(i)
        data.append(im)

        # Report progress every 100 images; the counter runs across classes,
        # the percentage is relative to the class currently being read
        PrintCount += 1
        if (PrintCount % 100 == 0):
            print('class', i)
            print('percentage', (j/n_files)*100)

class 0
percentage 1.749425693585439
class 0
percentage 3.5165223537727512
class 0
percentage 5.283619013960063
class 0
percentage 7.0507156741473755
class 0
percentage 8.817812334334688
class 0
percentage 10.584908994522001
class 0
percentage 12.352005654709313
class 0
percentage 14.119102314896626
class 0
percentage 15.886198975083937
class 0
percentage 17.65329563527125
class 0
percentage 19.420392295458562
class 0
percentage 21.187488955645875
class 0
percentage 22.954585615833185
class 0
percentage 24.7216822760205
class 0
percentage 26.48877893620781
class 0
percentage 28.255875596395125
class 0
percentage 30.022972256582435
class 0
percentage 31.790068916769748
class 0
percentage 33.55716557695706
class 0
percentage 35.324262237144374
class 0
percentage 37.09135889733168
class 0
percentage 38.858455557518994
class 0
percentage 40.62555221770631
class 0
percentage 42.39264887789362
class 0
percentage 44.159745538080934
class 0
percentage 45.92684219826825
class 0
percentage 47.69

## Save as X and y

In [8]:
# Stack the list of per-image arrays into a single numpy array
X = np.asarray(data)

In [9]:
# Class indices as a numpy array, aligned element-for-element with X
y = np.asarray(labels)

In [10]:
# np.save does not create missing folders, so make sure the target exists first
os.makedirs('./image_as_numpy', exist_ok = True)
# Persist the full image array and its labels
np.save(file = './image_as_numpy/X.npy', arr = X)
np.save(file = './image_as_numpy/y.npy', arr = y)

### Check that the data was converted correctly

In [11]:
# Path of the first file the conversion loop read (class 0, file 0), i.e. the
# source image of X[0]. Deriving it from the same lists the loop iterated over
# keeps this check valid on any machine and for any os.listdir ordering,
# instead of relying on a hard-coded absolute file name.
test_image_path = os.path.join(class_paths[0], classes[0][0])

In [12]:
# Re-read the reference image straight from disk.
# dcmread is the supported reader; read_file was only a deprecated alias
# and is no longer available in recent pydicom releases.
dcm_test = pydicom.dcmread(test_image_path)
im_test = dcm_test.pixel_array

In [13]:
# Apply the same preprocessor used in the conversion loop so the result can be
# compared with the stored array.
# NOTE: this rebinds im_test, so re-running only this cell would preprocess the
# already-preprocessed image; re-run the cell that reads the file first.
im_test = sp.preprocess(im_test)

In [14]:
# Display the first stored image, selecting index 0 on the last (depth) axis
X[0,:, :,0]

array([[ 10,   0,   0, ...,   0,   0,   0],
       [  1,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       ...,
       [  1,   1,   1, ...,  51,  65, 102],
       [  1,   1,   1, ...,  54,  71, 110],
       [  1,   1,   1, ...,  60,  76, 115]], dtype=uint8)

In [15]:
# The displayed arrays are abbreviated ('...'), so comparing them by eye only
# checks the corners; compare every pixel explicitly before showing the image.
assert np.array_equal(X[0, :, :, 0], im_test), 'X[0] differs from the re-processed source image'
im_test

array([[ 10,   0,   0, ...,   0,   0,   0],
       [  1,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       ...,
       [  1,   1,   1, ...,  51,  65, 102],
       [  1,   1,   1, ...,  54,  71, 110],
       [  1,   1,   1, ...,  60,  76, 115]], dtype=uint8)

## Train/Validate/Test Split 70/20/10

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
# Keep 70% of the samples for training and hold out 30% for validation + test.
# stratify keeps the class proportions equal in both parts; the fixed
# random_state makes the split (and the files saved from it) reproducible.
X_train, X_vt, y_train, y_vt = train_test_split(X, y, test_size = 0.3, stratify = y, random_state = 42)

In [18]:
# Split the 30% hold-out into validation (about two thirds of it, ~20% overall)
# and test (about one third of it, ~10% overall).
# The fixed random_state makes the split reproducible across runs.
X_validate, X_test, y_validate, y_test = train_test_split(X_vt, y_vt, test_size = 0.3333, stratify = y_vt, random_state = 42)

In [19]:
# Persist every split as '<name>.npy' inside the output folder.
output_dir = './image_as_numpy'
# np.save does not create missing folders, so make sure the target exists first
os.makedirs(output_dir, exist_ok = True)

# (file stem, array) pairs, written in the same order as before
splits = [
    ('X_train', X_train),
    ('X_validate', X_validate),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_validate', y_validate),
    ('y_test', y_test),
]
for (split_name, split_array) in splits:
    np.save(file = os.path.join(output_dir, split_name + '.npy'), arr = split_array)

## Test loading the data

In [20]:
# Read the training images back from disk (this rebinds X_train to the loaded copy)
X_train = np.load(file = './image_as_numpy/X_train.npy')

In [21]:
# Display the training image at index 1, selecting index 0 on the last (depth) axis
X_train[1,:,:,0]

array([[117,  39,   7, ...,   5,   5,   4],
       [ 54,   7,   1, ...,   5,   5,   4],
       [ 51,   7,   1, ...,   5,   5,   4],
       ...,
       [ 27,   5,   4, ...,   5,   5,   4],
       [ 29,   5,   4, ...,   5,   5,   4],
       [ 31,   7,   4, ...,   5,   5,   4]], dtype=uint8)