In [20]:
import pandas as pd
import numpy as np
import pydicom as dicom
import os
import matplotlib.pyplot as plt
import cv2
import math
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
# Reduced Image pixel size and the depth to process data without cloud
IMG_PXL_SIZE = 50
HM_SLICES = 20

In [4]:
data_labels = pd.read_csv('./Lung1.clinical.csv',index_col=0)
data_dir = 'C:/Users/Daniel/Desktop/NSCLC-Radiomics/patients/'
patients = os.listdir(data_dir)


In [5]:
len(patients) # sample data of 20 patients

422

In [6]:

# Function to break a list of slices into chunks of lists
def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]

In [7]:
def mean(l):
    return sum(l)/len(l)

# Function to limit depth

This function will limit the depth of 3-D CT Scans to 20 slices. 

In [104]:
# Function to process data
def process_data(patient, data_labels, img_pxl_size=20, hm_slices=20, vizualize = False):

    label = data_labels.get_value(patient,'Overall.Stage')
    path = data_dir + patient
    _, __, files = os.walk(path)
    slices = [dicom.read_file(files[0] + '/' + s) for s in os.listdir(files[0])]
    slices.sort(key = lambda x: float(x.ImagePositionPatient[2]))
    #print(len(slices), label)
    #print(slices[0])
    #plt.imshow(slices[0].pixel_array)
    #plt.show()

    new_slices = []

    slices = [cv2.resize(np.array(each_slice.pixel_array),(IMG_PXL_SIZE,IMG_PXL_SIZE)) for each_slice in slices]

    chunk_sizes = math.ceil(len(slices) / HM_SLICES)



    for slice_chunk in chunks(slices,chunk_sizes):
        slice_chunk = list(map(mean, zip(*slice_chunk)))
        new_slices.append(slice_chunk)


    if len(new_slices) == HM_SLICES-1:
        new_slices.append(new_slices[-1])

    if len(new_slices) == HM_SLICES-2:
        new_slices.append(new_slices[-1])
        new_slices.append(new_slices[-1])

    if len(new_slices) == HM_SLICES+2:
        new_val = list(map(mean, zip(*[new_slices[HM_SLICES-1],new_slices[HM_SLICES]])))
        del new_slices[HM_SLICES]
        new_slices[HM_SLICES-1] = new_val

    if len(new_slices) == HM_SLICES+1:
        new_val = list(map(mean, zip(*[new_slices[HM_SLICES-1],new_slices[HM_SLICES]])))
        del new_slices[HM_SLICES]
        new_slices[HM_SLICES-1] = new_val


    if vizualize:
        fig = plt.figure()
        for num,each_slice in enumerate(slices[:12]):
            y = fig.add_subplot(4,5,num+1)
            #new_image = scipy.misc.imresize(np.array(each_slice.pixel_array),(IMG_PXL_SIZE,IMG_PXL_SIZE))
            #y.imshow(slices[0].pixel_array)
            #y.imshow(each_slice)

        plt.show()

    if label == 'I' or label == 'II': 
        label = np.array([0,1])
    else:
#     elif label == 'IIIa' or label == 'IIIb': 
        label = np.array([1,0])

    return np.array(new_slices), label

# Function to process CT Scans and save them as #-Dimensional Arrays forneural network



In [105]:
# Save processed data in a list

arrays2 = []
labels2 = []


for num, patient in enumerate(patients):
        if num%50 == 0:
            print(num)

        try:
            img_data, label = process_data(patient, data_labels, img_pxl_size=IMG_PXL_SIZE, hm_slices=HM_SLICES)
            if img_data.shape[0] == 20:
                arrays2.append(img_data)
                labels2.append(label)
                print(img_data.shape, label)
            else:
                print("num skipped")

        except KeyError as e:
            
            print(img_data.shape , '\tThis is unlabeled data')

np.save('arrays2-{}-{}-{}.npy'.format(IMG_PXL_SIZE,IMG_PXL_SIZE,HM_SLICES), arrays2)
np.save('labels2-{}-{}-{}.npy'.format(IMG_PXL_SIZE,IMG_PXL_SIZE,HM_SLICES), labels2)
print('Finished processing')

0


  after removing the cwd from sys.path.


(20, 50, 50) [1 0]
(20, 50, 50) [0 1]
(20, 50, 50) [1 0]
(20, 50, 50) [0 1]
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) [0 1]
(20, 50, 50) [1 0]
(20, 50, 50) [0 1]
num skipped
(20, 50, 50) [1 0]
(20, 50, 50) [0 1]
num skipped
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) [0 1]
(20, 50, 50) [1 0]
(20, 50, 50) [0 1]
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) [0 1]
num skipped
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) [0 1]
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) [0 1]
(20, 50, 50) [1 0]
(20, 50, 50) [0 1]
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) [0 1]
(20, 50, 50) [0 1]
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) [0 1]
(20, 50, 50) [1 0]
50
(20, 50, 50) [0 1]
(20, 50, 50) [1 0]
(20, 50, 50) [0 1]
(20, 50, 50

In [103]:
arrays_stack = np.stack(arrays2)
labels_stack = np.stack(labels2)

ValueError: all input arrays must have the same shape

In [98]:
arrays_stack

(411, 20, 50, 50)

In [106]:
X_train, X_test, y_train, y_test = train_test_split(arrays2, labels2, test_size=0.30, random_state=42)

In [77]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D, Conv3D, MaxPooling3D
from keras.utils import np_utils

In [118]:
# Instantiate a CNN.
cnn_model = Sequential()

# Add a convolutional layer.
cnn_model.add(Conv2D(filters = 16,         # number of filters
                     kernel_size = 3,        # height/width of filter
                     activation='relu',      # activation function 
                     input_shape=(20,50,50))) # shape of input (image)

# Add a pooling layer.
cnn_model.add(MaxPooling2D(pool_size=(2,2))) # dimensions of region of pooling

# Add another convolutional layer.
cnn_model.add(Conv2D(64,
                       kernel_size = 3,
                       activation='relu'))

# Add another pooling layer.
cnn_model.add(MaxPooling2D(pool_size=(2,2)))

# We have to remember to flatten to go from the "box" to the vertical line of nodes!
cnn_model.add(Flatten())

# Add a densely-connected layer with 64 neurons.
cnn_model.add(Dense(64, activation='relu'))

# Let's try to avoid overfitting!
cnn_model.add(Dropout(0.5))

# Add a densely-connected layer with 32 neurons.
cnn_model.add(Dense(32, activation='relu'))

# Let's try to avoid overfitting!
cnn_model.add(Dropout(0.5))

# Add a final layer with 2 neurons.
cnn_model.add(Dense(2, activation='sigmoid'))

# Compile model
cnn_model.compile(loss='categorical_crossentropy',
                    optimizer='adam',
                    metrics=['accuracy'])



In [115]:
cnn_model.summary()

Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_16 (Conv2D)           (None, 18, 48, 16)        7216      
_________________________________________________________________
max_pooling2d_15 (MaxPooling (None, 9, 24, 16)         0         
_________________________________________________________________
conv2d_17 (Conv2D)           (None, 7, 22, 64)         9280      
_________________________________________________________________
max_pooling2d_16 (MaxPooling (None, 3, 11, 64)         0         
_________________________________________________________________
flatten_9 (Flatten)          (None, 2112)              0         
_________________________________________________________________
dense_25 (Dense)             (None, 64)                135232    
_________________________________________________________________
dropout_17 (Dropout)         (None, 64)              

In [109]:
trial = np.stack(X_train)

In [110]:
trial.shape

(287, 20, 50, 50)

In [112]:
trial[0].shape

(20, 50, 50)

In [119]:
# Fit model on training data
history = cnn_model.fit(np.stack(X_train),
                          np.stack(y_train),
                          batch_size=32,
                          validation_data=(np.stack(X_test), np.stack(y_test)),
                          epochs=5,
                          verbose=1)

Instructions for updating:
Use tf.cast instead.
Train on 287 samples, validate on 124 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
