# Lung Cancer Detection Using CNNs

### Import libraries for data loading

In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import cv2
from sklearn.utils import shuffle
import tensorflow as tf

def show_img(image):
    """Draw ``image`` with ``plt.imshow`` and display the figure with ``plt.show``."""
    plt.imshow(image)
    plt.show()

### Import and Convert Data

First I import the images and convert them to numpy arrays for experimentation and development.

In [2]:
# Machine-specific mount point; the relative './aws_pickle_files/...' and
# './chopped_data*.p' paths used in later cells resolve against it.
external_drive_path = '/Volumes/WhiteElephant/'
# Directory the notebook was started from (not referenced again in the visible cells).
home_path = os.getcwd()
# Changes the process-wide working directory.
os.chdir(external_drive_path)

## Get Working Dataset

Read in the previously pickled data. The training and validation sets consist of a shuffled assortment of 4 images for each patient not diagnosed with cancer and 8 images for each patient diagnosed with cancer.

In [3]:
import pickle

def pickle2np(dicts):
    """Merge a sequence of dataset dicts into two arrays.

    Each element of ``dicts`` must provide a ``'features'`` entry and a
    ``'labels'`` entry. The entries are concatenated along axis 0 in the
    order the dicts are given.

    Returns:
        A ``(features, labels)`` tuple of concatenated numpy arrays.

    Raises:
        ValueError: from ``np.concatenate`` when ``dicts`` is empty.
    """
    # Single pass over ``dicts`` so one-shot iterables are handled as well.
    pairs = [(entry['features'], entry['labels']) for entry in dicts]
    feature_parts = [features for features, _ in pairs]
    label_parts = [labels for _, labels in pairs]
    merged_features = np.concatenate(feature_parts, axis=0)
    merged_labels = np.concatenate(label_parts, axis=0)
    return merged_features, merged_labels

In [4]:
# Load the seven training shards (train_set1.p .. train_set7.p) and merge them
# into one image array and one label array.
# NOTE(review): pickle.load can execute arbitrary code from the file -- only
# load shards from a trusted source.
train_dicts = []
for i in range(1,8):
    path = './aws_pickle_files/train_set'+str(i)+'.p'
    with open(path, mode='rb') as f:
        train_dicts.append(pickle.load(f))
train_images, train_labels = pickle2np(train_dicts)
# Drop the list of per-shard dicts; only the concatenated arrays are used from here on.
del train_dicts

In [5]:
# Load the two validation shards (valid_set1.p, valid_set2.p) and merge them.
# NOTE(review): pickle.load can execute arbitrary code from the file -- only
# load shards from a trusted source.
valid_dicts = []
for i in range(1,3):
    path = './aws_pickle_files/valid_set'+str(i)+'.p'
    with open(path, mode='rb') as f:
        valid_dicts.append(pickle.load(f))
valid_images, valid_labels = pickle2np(valid_dicts)
# Drop the list of per-shard dicts; only the concatenated arrays are used from here on.
del valid_dicts

In [None]:
# Load the single test pickle through the same helper used for the other sets.
# The dict's 'labels' entry is bound to `test_ids` here -- presumably identifiers
# rather than class labels; verify against the code that wrote test.p.
# NOTE(review): pickle.load can execute arbitrary code from the file -- only
# load it from a trusted source.
test_dicts = []
path = './aws_pickle_files/test.p'
with open(path, 'rb') as f:
    test_dicts.append(pickle.load(f))
test_images, test_ids = pickle2np(test_dicts)
del test_dicts



## Preprocessing



In [6]:
def center_and_normalize(data, mu, dev):
    """Shift ``data`` by ``mu`` and scale it by ``dev``.

    Returns ``(data - mu) / dev``; no guard is applied against ``dev == 0``.
    """
    centered = data - mu
    return centered / dev

# Scalar mean and standard deviation taken over every value of the training
# images (no axis argument); used below to normalize the validation images.
process_mu = np.mean(train_images)
process_dev = np.std(train_images)

In [7]:
def one_hot_encode(labels, n_labels):
    """Return a float32 one-hot matrix for ``labels``.

    The result has shape ``(labels.shape[0], n_labels)``; row ``r`` holds a 1
    in column ``int(labels[r])`` and zeros elsewhere.
    """
    one_hot = np.zeros((labels.shape[0], n_labels), dtype=np.float32)
    # Each ``row`` is a view into ``one_hot``, so the assignment writes through.
    for row, label in zip(one_hot, labels):
        row[int(label)] = 1
    return one_hot

In [8]:
# Shuffle images and labels in a single call so each image stays paired with its label.
train_images, train_labels = shuffle(train_images, train_labels)
valid_images,valid_labels = shuffle(valid_images,valid_labels)
# Keep the first 100 shuffled validation samples (same number as batch_size, defined later).
valid_images = valid_images[:100]
valid_labels = valid_labels[:100]
# NOTE(review): only the validation images are standardized with the training
# statistics; no visible cell applies center_and_normalize to train_images, so
# training and validation inputs may be on different scales -- confirm whether
# the training data is normalized elsewhere.
valid_images = center_and_normalize(valid_images, process_mu, process_dev)

In [9]:
# Append a trailing axis of size 1 to the first three axes: (d0, d1, d2) -> (d0, d1, d2, 1).
train_images = train_images.reshape([train_images.shape[i] for i in range(3)]+[1])
valid_images = valid_images.reshape([valid_images.shape[i] for i in range(3)]+[1])
# Convert the labels to two-column one-hot float32 rows.
train_labels = one_hot_encode(train_labels,2)
valid_labels = one_hot_encode(valid_labels,2)

In [10]:
def chop(image, img_size):
    """Cut ``image`` into non-overlapping ``img_size`` x ``img_size`` tiles.

    Tiles are taken in row-major order from the top-left corner. The four
    corner tiles of the tile grid are skipped. Any remainder of the image
    that does not fill a whole tile is ignored.

    The number of tile rows and tile columns is derived from the matching
    image axis, so non-square images are tiled correctly; for square images
    the output is unchanged.

    Args:
        image: array with at least two axes (rows, columns, ...).
        img_size: edge length of each square tile, in pixels.

    Returns:
        float32 array of the kept tiles, stacked along a new first axis
        (an empty array when no tile is kept).
    """
    n_rows = image.shape[0] // img_size
    n_cols = image.shape[1] // img_size
    chops = []
    for i in range(n_rows):
        for j in range(n_cols):
            # Skip the four corners of the tile grid.
            if i in (0, n_rows - 1) and j in (0, n_cols - 1):
                continue
            chops.append(image[img_size*i:img_size*(i+1), img_size*j:img_size*(j+1)])
    return np.array(chops, dtype=np.float32)

def chop_data(images, img_size):
    """Apply ``chop`` to every image and stack the results.

    Prints a start and an end marker around the work. Returns a float32
    array whose first axis indexes the images and whose second axis indexes
    the tiles produced by ``chop`` for that image.
    """
    print("Start Chopping")
    tiles_per_image = [chop(picture, img_size) for picture in images]
    print("End Chopping")
    return np.array(tiles_per_image, dtype=np.float32)



In [11]:
# Replace each image with its stack of 32x32 tiles (chop skips the four corner tiles).
train_images = chop_data(train_images, 32)
valid_images = chop_data(valid_images, 32)
# The arrays now carry an extra axis indexing the tiles of each image (see the printed shapes).
print(train_images.shape)
print(valid_images.shape)

Start Chopping
End Chopping
Start Chopping
End Chopping
(5628, 252, 32, 32, 1)
(100, 252, 32, 32, 1)


In [13]:
# NOTE(review): this cell looks like leftover scratch work. `vtest_images` is not
# defined in any visible cell, and `test_images` only exists if the unexecuted
# test-loading cell was run. Running it also replaces the chopped training and
# validation arrays -- confirm intent before executing.
train_images = test_images
del test_images
valid_images = vtest_images
del vtest_images


## Tensorflow Approach

#### Parameter Initializations

In [12]:
## Chopped Images Parameter Creation

# Mean and standard deviation of the truncated-normal weight initializer.
mu = 0
dev = 0.1

# Filter shapes handed to tf.nn.conv2d (height, width, in_channels, out_channels).
# The trailing comments track the spatial size of one tile through the network.
chopped_convweight_shapes = [
    # 32x32
    (5,5,1,64), # 28x28
    # Pool: 14x14
    (5,5,64,16), # 10x10
    # Pool: 5x5
]
# Fully connected layer shapes (inputs, outputs); the last layer emits one value per tile.
chopped_fcweight_shapes = [
    (5*5*16,50),
    (50,2),
    (2,1)
]

# One weight variable and one zero-initialized bias per layer; the last entry of
# each shape (the output size) sets the bias length and the variable-name suffix.
chopped_convweights = [tf.Variable(tf.truncated_normal(shape=x,mean=mu,stddev=dev),name="conv"+str(x[-1])) for x in chopped_convweight_shapes]
chopped_convbiases = [tf.Variable(tf.zeros([x[-1]]),name="convbias"+str(x[-1])) for x in chopped_convweight_shapes]

chopped_fcweights = [tf.Variable(tf.truncated_normal(shape=x,mean=mu,stddev=dev),name="fc"+str(x[-1])) for x in chopped_fcweight_shapes]
chopped_fcbiases = [tf.Variable(tf.zeros([x[-1]]),name="fcbias"+str(x[-1])) for x in chopped_fcweight_shapes]

# Final linear layer mapping the per-tile outputs of one image to 2 class logits.
# ((512//32)**2)-4 == 252: a 16x16 tile grid minus the four corner tiles, matching
# the literal 252 used in chopped_net.
combine_chops_weight = tf.Variable(tf.truncated_normal(shape=(((512//32)**2)-4, 2), mean=mu, stddev=dev),name="combine")
combine_chops_bias = tf.Variable(tf.zeros([2]),name="combinebias")





#### Neural Net Helper Functions

In [20]:
def conv2d(data, weight, bias, stride=1, padding="VALID"):
    """Convolve ``data`` with ``weight``, add ``bias`` and apply ELU.

    ``stride`` is used for both inner entries of the strides list
    (``[1, stride, stride, 1]``); ``padding`` is forwarded to
    ``tf.nn.conv2d`` unchanged.
    """
    step = [1, stride, stride, 1]
    convolved = tf.nn.conv2d(data, weight, strides=step, padding=padding)
    biased = tf.nn.bias_add(convolved, bias)
    return tf.nn.elu(biased)

def max_pool(data,k=2):
    """Max-pool ``data`` with a ``k`` x ``k`` window, stride ``k`` and VALID padding."""
    window = [1, k, k, 1]
    stride = [1, k, k, 1]
    return tf.nn.max_pool(data, ksize=window, strides=stride, padding="VALID")

def conv_net(data, weights, biases, dropout_prob, strides=[]):
    """Run ``data`` through a stack of conv -> max-pool -> dropout stages.

    One stage is built per entry of ``weights``, using the bias and stride
    at the same index. When ``strides`` is empty every stage uses stride 1.
    ``dropout_prob`` is passed as the second positional argument of
    ``tf.nn.dropout`` (the keep probability in the TF 1.x API).

    Returns the output tensor of the last stage.
    """
    stage_strides = strides if len(strides) != 0 else [1] * len(weights)
    activations = data
    for idx, kernel in enumerate(weights):
        convolved = conv2d(activations, kernel, biases[idx], stride=stage_strides[idx])
        pooled = max_pool(convolved)
        activations = tf.nn.dropout(pooled, dropout_prob)
    return activations

def fc_net(data, weights, biases, dropout_prob):
    """Apply a stack of fully connected layers to ``data``.

    Every layer except the last is ``matmul + bias`` followed by ELU; the
    last layer is linear (no activation). ``dropout_prob`` is accepted for
    signature compatibility but is not used: no dropout is applied here.
    """
    hidden = data
    for idx in range(len(weights) - 1):
        pre_activation = tf.matmul(hidden, weights[idx]) + biases[idx]
        hidden = tf.nn.elu(pre_activation)
    return tf.matmul(hidden, weights[-1]) + biases[-1]

In [15]:
# Input placeholder shaped like train_images with a free batch axis.
# Not referenced by any other visible cell (the graph below uses chopped_features).
data_features = tf.placeholder(tf.float32, [None]+[train_images.shape[i] for i in range(1,len(train_images.shape))], name="data_features")
# One-hot labels, two classes.
data_labels = tf.placeholder(tf.float32, [None, 2], name='data_labels')

# Scalars fed at run time. Only convdropout is wired into the visible graph;
# fcdropout appears in the feed dicts but is not consumed by any visible op.
convdropout = tf.placeholder(tf.float32, name="convdropout")
fcdropout = tf.placeholder(tf.float32, name="fcdropout")
momentum = tf.placeholder(tf.float32, name="momentum")
learning_rate = tf.placeholder(tf.float32, name="learning_rate")


In [21]:
## Chopped Datas

# Number of images per batch. chopped_net builds one sub-graph per image in
# range(batch_size), and the session cells slice their batches with this value.
batch_size = 100

def chopped_net(datas, conv_weights, conv_biases, fc_weights, fc_biases, batch_size, dropout_prob, strides=[]):
    """Score every tile of every image in a batch.

    For each index in ``range(batch_size)`` the tiles ``datas[i]`` are run
    through ``conv_net``, flattened to ``[252, fc_inputs]`` (``fc_inputs`` is
    the input size of the first fully connected weight), passed through one
    more dropout and then ``fc_net``. The literal 252 is the number of tiles
    per image that this function assumes.

    Returns a tensor stacking one 252-long vector per image.
    """
    def score_image(image_tiles):
        # Static input width of the first fully connected layer.
        fc_inputs = fc_weights[0].get_shape().as_list()[0]
        features = conv_net(image_tiles, conv_weights, conv_biases, dropout_prob, strides=strides)
        flat = tf.reshape(features, [252, fc_inputs])
        flat = tf.nn.dropout(flat, dropout_prob)
        tile_scores = fc_net(flat, fc_weights, fc_biases, dropout_prob)
        return tf.reshape(tile_scores, [252])

    per_image_scores = [score_image(datas[idx]) for idx in range(batch_size)]
    return tf.stack(per_image_scores)

def combine_chopped_logits(combined_logits, weight, bias):
    """Apply one linear layer: ``combined_logits @ weight + bias``."""
    projected = tf.matmul(combined_logits, weight)
    return projected + bias

In [22]:
# Placeholder for batches of chopped images: free batch axis followed by the
# per-image shape of train_images.
shapes = [shape for shape in train_images.shape]
chopped_features = tf.placeholder(tf.float32, [None]+[shapes[i] for i in range(1,len(shapes))], name="chopped_features")

# Per-tile scores for each image, then a linear layer that turns them into 2 class logits.
# convdropout is the only dropout placeholder passed into the network.
chopped_logits = chopped_net(chopped_features, chopped_convweights, chopped_convbiases, chopped_fcweights, chopped_fcbiases, batch_size, convdropout)
combined_logits = combine_chopped_logits(chopped_logits, combine_chops_weight, combine_chops_bias)

# Mean softmax cross-entropy over the batch, minimized with momentum SGD.
chopped_cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=combined_logits,labels=data_labels))
chopped_optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,momentum=momentum).minimize(chopped_cost)

# Fraction of samples whose arg-max logit matches the arg-max of the one-hot label.
chopped_equals_list = tf.equal(tf.argmax(combined_logits,1), tf.argmax(data_labels,1))
chopped_accuracy = tf.reduce_mean(tf.cast(chopped_equals_list,tf.float32))

# Checkpoint path (relative to the current working directory) and saver.
save_file = './net.ckpt'
saver = tf.train.Saver()

init = tf.global_variables_initializer()



## Session

#### Training

In [None]:
# Hyperparameters. `rate`, `moment` and `epochs` are also read by the evaluation cell.
rate = .05
moment = .75
conv_dropout = .5
fc_dropout = .5
epochs = 6

with tf.Session() as sess:
    sess.run(init)
    print("Session Start")
    # Only full batches are used: the graph is built for exactly `batch_size` images.
    n_batches = train_images.shape[0] // batch_size
    for epoch in range(epochs):
        print("Epoch: " + str(epoch+1))
        train_images, train_labels = shuffle(train_images, train_labels)
        # `batch` is 1-based; the upper bound is inclusive so the last full batch
        # is trained on as well (range(1, n_batches) silently skipped it).
        for batch in range(1, n_batches + 1):
            batch_x = train_images[batch*batch_size-batch_size:batch*batch_size]
            batch_y = train_labels[batch*batch_size-batch_size:batch*batch_size]
            optcost = sess.run([chopped_optimizer, chopped_cost, chopped_accuracy],
                               feed_dict={learning_rate: rate, momentum: moment,
                                          convdropout: conv_dropout, fcdropout:fc_dropout,
                                          chopped_features: batch_x,
                                          data_labels: batch_y})
            if batch % 10 == 0:
                # Fraction of the batch whose one-hot label has arg-max 0.
                print("Non-Cancer Percentage: " + str(1-np.sum(np.argmax(batch_y,1))/batch_size))
                print("Cost (Batch " + str(batch) + "): " + str(optcost[1]) + ", Acc: " + str(optcost[2]))
        # Validate on one batch with dropout disabled (placeholders fed 1.0).
        valid_images, valid_labels = shuffle(valid_images, valid_labels)
        accost = sess.run([chopped_accuracy,chopped_cost], feed_dict={convdropout: 1.,
                                                                      fcdropout:1.,
                                                                      chopped_features: valid_images[:batch_size],
                                                                      data_labels: valid_labels[:batch_size]})
        print("\nActual Cancer Percentage: " + str(np.sum(np.argmax(valid_labels[:batch_size],1))/batch_size))
        print("Cost: " + str(accost[1]) + ", Accuracy: " + str(accost[0]))
        print("\n")
        # Checkpoint after every epoch so the evaluation cell can restore the weights.
        saver.save(sess,save_file)

In [None]:
# Number of './chopped_data<i>.p' files to evaluate (i = 1 .. n_datasets).
n_datasets = 10

with tf.Session() as sess:
    print("Session Start\n")
    # Restore the weights saved by the training cell.
    saver.restore(sess, save_file)
    for epoch in range(epochs):
        print("Epoch: " + str(epoch+1))
        for i in range(1,n_datasets+1):
            file_path = './chopped_data' + str(i) + '.p'
            # NOTE(review): pickle.load can execute arbitrary code from the file --
            # only load datasets from a trusted source.
            with open(file_path, 'rb') as f:
                dataset_dict = pickle.load(f)
            chops = dataset_dict['features']
            labels = dataset_dict['labels']
            chops,labels = shuffle(chops,labels)
            # Only full batches are scored: the graph is built for exactly `batch_size` images.
            n_batches = chops.shape[0] // batch_size
            running_acc = 0
            # `batch` is 1-based with an inclusive upper bound, so exactly
            # `n_batches` accuracies are accumulated and the mean below is
            # correct (range(1, n_batches) summed n_batches-1 values but still
            # divided by n_batches).
            for batch in range(1, n_batches + 1):
                batch_x = chops[batch*batch_size-batch_size:batch*batch_size]
                batch_y = labels[batch*batch_size-batch_size:batch*batch_size]
                # Dropout placeholders are fed 1.0 for evaluation.
                feed = {learning_rate: rate, momentum: moment,
                        convdropout: 1., fcdropout: 1.,
                        chopped_features: batch_x, data_labels: batch_y}
                acc = sess.run(chopped_accuracy, feed_dict=feed)
                running_acc += acc
            print("Test Accuracy: " + str(running_acc/n_batches))

## Large Scale Training

This training run uses the pickled data sets.


## Session



## Results
