# Lung Cancer Detection Using CNNs

### Import libraries for data loading and modeling

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.utils import shuffle
import tensorflow as tf

## Get Working Dataset

Read in the previously pickled data. The training and validation sets consist of a shuffled assortment of 4 images for each patient not diagnosed with cancer and 8 images for each patient diagnosed with cancer.

In [2]:
import pickle

def pickle2np(data_dict, img_size=32):
    """Split an unpickled dataset dict into chopped images and labels.

    Args:
        data_dict: Mapping with a 'features' entry (the images, handed to
            ``chop_data``) and a 'labels' entry.
        img_size: Side length of the square tiles each image is cut into.
            Defaults to 32, the value that used to be hard-coded here.
            NOTE(review): the network cells further down size their weights
            for 32-pixel tiles of 512-pixel images, so change this only
            together with them.

    Returns:
        Tuple ``(images, labels)``: the output of ``chop_data`` and the
        'labels' entry, which is passed through unchanged.
    """
    images = chop_data(data_dict['features'], img_size)
    labels = data_dict['labels']
    return images, labels

In [3]:
def chop(image, img_size):
    """Cut an image into square tiles, leaving out the four corner tiles.

    The tile grid is ``limit x limit`` with ``limit = image.shape[0] // img_size``
    (the first axis length is used for both directions). Tiles are emitted in
    row-major order.

    Args:
        image: Array indexable as ``image[rows, cols]``.
        img_size: Side length of each tile.

    Returns:
        float32 array stacking every non-corner tile.
    """
    limit = image.shape[0] // img_size
    last = limit - 1
    # Grid positions of the four corner tiles, which are dropped.
    corners = {(0, 0), (0, last), (last, 0), (last, last)}
    tiles = [
        image[row * img_size:(row + 1) * img_size,
              col * img_size:(col + 1) * img_size]
        for row in range(limit)
        for col in range(limit)
        if (row, col) not in corners
    ]
    return np.array(tiles, dtype=np.float32)

def chop_data(images, img_size):
    """Apply ``chop`` to every image and stack the results.

    Args:
        images: Iterable of images accepted by ``chop``.
        img_size: Tile side length forwarded to ``chop``.

    Returns:
        float32 array whose first axis indexes the input images and whose
        remaining axes are the per-image output of ``chop``.
    """
    per_image_tiles = [chop(picture, img_size) for picture in images]
    return np.array(per_image_tiles, dtype=np.float32)



In [4]:
# Read the pickled training shards train_set1.p .. train_set7.p and collect
# each shard's chopped images and labels in two parallel lists.
train_images, train_labels = [], []
for i in range(1, 8):
    path = '../downloads/train_set' + str(i) + '.p'
    with open(path, mode='rb') as f:
        shard_images, shard_labels = pickle2np(pickle.load(f))
    train_images.append(shard_images)
    train_labels.append(shard_labels)
# Drop the per-shard names; the data stays referenced by the lists.
del shard_images, shard_labels

In [5]:
# Merge the per-shard arrays into one array each, joined along the first axis.
train_images, train_labels = (
    np.concatenate(train_images, axis=0),
    np.concatenate(train_labels, axis=0),
)

In [6]:
# Read the pickled validation shards valid_set1.p and valid_set2.p and collect
# each shard's chopped images and labels in two parallel lists.
valid_images, valid_labels = [], []
for i in range(1, 3):
    path = '../downloads/valid_set' + str(i) + '.p'
    with open(path, mode='rb') as f:
        shard_images, shard_labels = pickle2np(pickle.load(f))
    valid_images.append(shard_images)
    valid_labels.append(shard_labels)
# Drop the per-shard names; the data stays referenced by the lists.
del shard_images, shard_labels

In [7]:
# Merge the per-shard arrays into one array each, joined along the first axis.
valid_images, valid_labels = (
    np.concatenate(valid_images, axis=0),
    np.concatenate(valid_labels, axis=0),
)

In [None]:
# test_dicts = []
# path = '../downloads/test.p'
# with open(path, 'rb') as f:
#     test_dicts.append(pickle.load(f))
# test_images, test_ids = pickle2np(test_dicts)
# del test_dicts



## Preprocessing



In [8]:
def center_and_normalize(data, mu, dev):
    """Shift ``data`` by ``mu`` and scale it by ``dev``.

    Args:
        data: Value or array to standardise.
        mu: Offset subtracted from ``data``.
        dev: Divisor applied after the subtraction.

    Returns:
        ``(data - mu) / dev``.
    """
    centered = data - mu
    return centered / dev

# Global mean and standard deviation over every training pixel; the same two
# statistics are reused to normalise the validation images.
process_mu = train_images.mean()
process_dev = train_images.std()

In [9]:
def one_hot_encode(labels, n_labels):
    """Turn integer-valued class labels into one-hot rows.

    Args:
        labels: Array of class indices; each entry is converted with ``int``.
        n_labels: Number of classes, i.e. the width of every output row.

    Returns:
        float32 array of shape ``(labels.shape[0], n_labels)`` holding a 1
        in the column named by each label and 0 elsewhere.
    """
    encoded_labels = np.zeros((labels.shape[0], n_labels), dtype=np.float32)
    # Each `row` is a view into `encoded_labels`, so the write lands in place.
    for row, label in zip(encoded_labels, labels):
        row[int(label)] = 1
    return encoded_labels

In [10]:
# Standardise the training set in place, one sample at a time (presumably so
# that only a single-sample temporary is allocated per step).
for i in range(len(train_images)):
    train_images[i] = center_and_normalize(train_images[i], process_mu, process_dev)

In [11]:
# Shuffle images and labels together so the pairs stay aligned.
train_images, train_labels = shuffle(train_images, train_labels)
valid_images, valid_labels = shuffle(valid_images, valid_labels)
# Keep only the first 100 shuffled validation samples, then normalise them
# with the statistics computed from the training set.
valid_images, valid_labels = valid_images[:100], valid_labels[:100]
valid_images = center_and_normalize(valid_images, process_mu, process_dev)

In [12]:
# Append a trailing axis of length 1 to both image arrays.
train_images = train_images.reshape(train_images.shape + (1,))
valid_images = valid_images.reshape(valid_images.shape + (1,))
# Two-class one-hot targets.
train_labels = one_hot_encode(train_labels, 2)
valid_labels = one_hot_encode(valid_labels, 2)


## Tensorflow Approach

#### Parameter Initializations

In [13]:
## Parameter creation for the chopped-image network

# Convolution kernel shapes; the trailing comments track the spatial size of
# a tile through the VALID convolutions and the pooling steps.
chopped_convweight_shapes = [
    # input tile: 32x32
    (5, 5, 1, 36),   # conv -> 28x28
    # pool -> 14x14
    (5, 5, 36, 16),  # conv -> 10x10
    # pool -> 5x5
]
# Fully connected layer shapes as (inputs, outputs); the last layer emits a
# single value per tile.
chopped_fcweight_shapes = [
    (5 * 5 * 16, 50),
    (50, 2),
    (2, 1),
]

# Weights start from a truncated normal (stddev 0.1), biases from zero. Each
# variable is named after its output width, i.e. the last entry of its shape.
chopped_convweights = [
    tf.Variable(tf.truncated_normal(shape=shape, mean=0, stddev=0.1),
                name="conv" + str(shape[-1]))
    for shape in chopped_convweight_shapes
]
chopped_convbiases = [
    tf.Variable(tf.zeros([shape[-1]]), name="convbias" + str(shape[-1]))
    for shape in chopped_convweight_shapes
]

chopped_fcweights = [
    tf.Variable(tf.truncated_normal(shape=shape, mean=0, stddev=0.1),
                name="fc" + str(shape[-1]))
    for shape in chopped_fcweight_shapes
]
chopped_fcbiases = [
    tf.Variable(tf.zeros([shape[-1]]), name="fcbias" + str(shape[-1]))
    for shape in chopped_fcweight_shapes
]

# Final layer mapping the per-tile outputs of one sample to two class logits:
# a 512/32 = 16 x 16 tile grid minus the four corner tiles gives 252 inputs.
combine_chops_weight = tf.Variable(
    tf.truncated_normal(shape=(((512 // 32) ** 2) - 4, 2), mean=0, stddev=0.1),
    name="combine")
combine_chops_bias = tf.Variable(tf.zeros([2]), name="combinebias")


# The shape lists are only needed while the variables are being built.
del chopped_convweight_shapes
del chopped_fcweight_shapes


#### Neural Net Helper Functions

In [14]:
def conv2d(data, weight, bias, stride=1, padding="VALID"):
    """Convolve ``data`` with ``weight``, add ``bias`` and apply ELU.

    Args:
        data: Input tensor for ``tf.nn.conv2d``.
        weight: Convolution kernel.
        bias: Bias added to the convolution output via ``tf.nn.bias_add``.
        stride: Step used for both middle entries of the stride vector.
        padding: Padding mode forwarded to ``tf.nn.conv2d``.

    Returns:
        The ELU-activated convolution output.
    """
    stride_spec = [1, stride, stride, 1]
    convolved = tf.nn.conv2d(data, weight, strides=stride_spec, padding=padding)
    return tf.nn.elu(tf.nn.bias_add(convolved, bias))

def max_pool(data, k=2):
    """Max-pool ``data`` with a ``k x k`` window and a matching stride.

    Args:
        data: Input tensor for ``tf.nn.max_pool``.
        k: Window size, also used as the stride, so windows do not overlap.

    Returns:
        The pooled tensor (VALID padding).
    """
    window = [1, k, k, 1]
    return tf.nn.max_pool(data, ksize=window, strides=window, padding="VALID")

def conv_net(data, weights, biases, dropout_prob, strides=None):
    """Run ``data`` through a stack of conv -> max-pool -> dropout stages.

    Args:
        data: Input tensor handed to the first ``conv2d`` call.
        weights: Sequence of convolution kernels, one per stage.
        biases: Sequence of biases; ``biases[i]`` belongs to ``weights[i]``.
        dropout_prob: Passed as the second positional argument of
            ``tf.nn.dropout``. NOTE(review): in the TF 1.x signature that
            argument is the *keep* probability, so despite the name 1.0
            disables dropout - confirm against the installed TF version.
        strides: Optional per-stage convolution strides. ``None`` or an
            empty sequence means a stride of 1 for every stage.

    Returns:
        The output tensor of the last stage.
    """
    # `None` replaces the former mutable `[]` default; an explicitly passed
    # empty sequence keeps its old meaning of "use stride 1 everywhere".
    if strides is None or len(strides) == 0:
        strides = [1] * len(weights)
    logits = data
    for i, weight in enumerate(weights):
        logits = conv2d(logits, weight, biases[i], stride=strides[i])
        logits = max_pool(logits)
        logits = tf.nn.dropout(logits, dropout_prob)
    return logits

def fc_net(data, weights, biases):
    """Apply a stack of fully connected layers to ``data``.

    Every layer except the last is followed by an ELU activation; the last
    layer is left linear.

    Args:
        data: Input tensor for the first matrix multiplication.
        weights: Sequence of weight matrices, applied in order.
        biases: Sequence of biases; ``biases[i]`` belongs to ``weights[i]``.

    Returns:
        The un-activated output of the final layer.
    """
    hidden = data
    for i, weight in enumerate(weights[:-1]):
        hidden = tf.nn.elu(tf.matmul(hidden, weight) + biases[i])
    return tf.matmul(hidden, weights[-1]) + biases[-1]

In [15]:
# One-hot targets, two classes per sample.
data_labels = tf.placeholder(tf.float32, shape=[None, 2], name='data_labels')

# Scalar hyperparameters supplied through feed_dict at run time.
convdropout = tf.placeholder(tf.float32, name="convdropout")
momentum = tf.placeholder(tf.float32, name="momentum")
learning_rate = tf.placeholder(tf.float32, name="learning_rate")


In [16]:
## Chopped data

# Number of samples processed per run. chopped_net loops over
# range(batch_size) while the graph is being built, and the training cell
# slices both its training and validation feeds to this many samples.
batch_size = 90

def chopped_net(datas, conv_weights, conv_biases, fc_weights, fc_biases, batch_size, dropout_prob, strides=None):
    """Score every tile of every sample in a batch.

    For each of the first ``batch_size`` samples, the sample's tiles go
    through ``conv_net``, are flattened, pass a dropout layer and then
    ``fc_net``; the per-tile outputs are flattened into one vector per sample.

    Args:
        datas: Batch tensor indexed by sample; ``datas[i]`` is the stack of
            tiles of sample ``i``.
        conv_weights: Convolution kernels forwarded to ``conv_net``.
        conv_biases: Convolution biases forwarded to ``conv_net``.
        fc_weights: Fully connected weights forwarded to ``fc_net``; the
            first dimension of ``fc_weights[0]`` sets the flattened feature
            width.
        fc_biases: Fully connected biases forwarded to ``fc_net``.
        batch_size: Number of samples to process. The Python loop runs at
            graph-construction time, so this is fixed once the graph is built.
        dropout_prob: Forwarded to ``conv_net`` and to ``tf.nn.dropout``.
            NOTE(review): TF 1.x treats this value as the keep probability -
            confirm.
        strides: Optional per-stage convolution strides for ``conv_net``.
            ``None`` or an empty sequence selects stride 1 everywhere.

    Returns:
        Tensor stacking one flat vector of per-tile outputs per sample.
    """
    # `None` replaces the former mutable `[]` default; conv_net still gets a
    # (possibly empty) sequence, exactly as before.
    if strides is None:
        strides = []
    fc_inputs = fc_weights[0].get_shape().as_list()[0]
    combined_logits = []
    for i in range(batch_size):
        logits = conv_net(datas[i], conv_weights, conv_biases, dropout_prob, strides=strides)
        # -1 lets TensorFlow infer the tile count instead of hard-coding 252.
        logits = tf.reshape(logits, [-1, fc_inputs])
        logits = tf.nn.dropout(logits, dropout_prob)
        logits = fc_net(logits, fc_weights, fc_biases)
        combined_logits.append(tf.reshape(logits, [-1]))
    outputs = tf.stack(combined_logits)
    return outputs

def combine_chopped_logits(combined_logits, weight, bias):
    """Project the stacked per-tile outputs with one affine layer.

    Args:
        combined_logits: Matrix with one row of per-tile outputs per sample.
        weight: Weight matrix of the combining layer.
        bias: Bias of the combining layer.

    Returns:
        ``combined_logits @ weight + bias``.
    """
    projected = tf.matmul(combined_logits, weight)
    return projected + bias

In [17]:
# Input placeholder: free leading (batch) dimension, remaining dimensions
# copied from the prepared training array.
chopped_features = tf.placeholder(
    tf.float32,
    [None] + list(train_images.shape[1:]),
    name="chopped_features")

# Per-tile scores for every sample, then the layer that combines them into
# two class logits per sample.
chopped_logits = chopped_net(
    chopped_features,
    chopped_convweights, chopped_convbiases,
    chopped_fcweights, chopped_fcbiases,
    batch_size, convdropout)
chopped_logits = combine_chopped_logits(
    chopped_logits, combine_chops_weight, combine_chops_bias)

# Mean softmax cross-entropy loss, minimised with momentum SGD.
chopped_cost = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits=chopped_logits, labels=data_labels))
chopped_optimizer = tf.train.MomentumOptimizer(
    learning_rate=learning_rate, momentum=momentum).minimize(chopped_cost)

# Accuracy: share of samples whose arg-max logit matches the arg-max label.
chopped_equals_list = tf.equal(tf.argmax(chopped_logits, 1), tf.argmax(data_labels, 1))
chopped_accuracy = tf.reduce_mean(tf.cast(chopped_equals_list, tf.float32))

# Checkpoint target and saver.
save_file = './net.ckpt'
saver = tf.train.Saver()

init = tf.global_variables_initializer()



## Session

#### Training

In [18]:
# Hyperparameters fed to the graph's placeholders during training.
rate = .05
moment = .75
# NOTE(review): fed to `convdropout`, which TF 1.x `tf.nn.dropout` treats as
# the keep probability - confirm.
conv_dropout = .5
fc_dropout = .5  # NOTE(review): not fed to any placeholder in this cell
epochs = 6

with tf.Session() as sess:
    sess.run(init)
    print("Session Start")
    # Whole batches only: each run feeds exactly `batch_size` samples.
    n_batches = train_images.shape[0] // batch_size
    for epoch in range(epochs):
        print("Epoch: " + str(epoch+1))
        train_images, train_labels = shuffle(train_images, train_labels)
        # `batch` is 1-based. The upper bound is inclusive so the last full
        # batch is trained on too (range(1, n_batches) skipped it).
        for batch in range(1, n_batches + 1):
            batch_images = train_images[(batch-1)*batch_size:batch*batch_size]
            batch_labels = train_labels[(batch-1)*batch_size:batch*batch_size]
            optcost = sess.run([chopped_optimizer, chopped_cost, chopped_accuracy],
                               feed_dict={learning_rate: rate, momentum: moment,
                                          convdropout: conv_dropout,
                                          chopped_features: batch_images,
                                          data_labels: batch_labels})
            if batch % 10 == 0:
                # Share of samples in this batch whose label arg-max is 0.
                print("Non-Cancer Percentage: " + str(1-np.sum(np.argmax(batch_labels,1))/batch_size))
                print("Cost (Batch " + str(batch) + "): " + str(optcost[1]) + ", Acc: " + str(optcost[2]))
        # Evaluate on a freshly shuffled validation slice; 1.0 is fed to the
        # dropout placeholder and the optimizer op is not run.
        valid_images, valid_labels = shuffle(valid_images, valid_labels)
        accost = sess.run([chopped_accuracy,chopped_cost], feed_dict={convdropout: 1.,
                                                                      chopped_features: valid_images[:batch_size],
                                                                      data_labels: valid_labels[:batch_size]})
        print("\nActual Cancer Percentage: " + str(np.sum(np.argmax(valid_labels[:batch_size],1))/batch_size))
        print("Cost: " + str(accost[1]) + ", Accuracy: " + str(accost[0]))
        print("\n")
        # Write a checkpoint after every epoch.
        saver.save(sess,save_file)

Session Start
Epoch: 1
Non-Cancer Percentage: 0.522222222222
Cost (Batch 10): 0.689976, Acc: 0.544444
Non-Cancer Percentage: 0.611111111111
Cost (Batch 20): 0.680076, Acc: 0.611111
Non-Cancer Percentage: 0.544444444444
Cost (Batch 30): 0.698462, Acc: 0.544444
Non-Cancer Percentage: 0.511111111111
Cost (Batch 40): 0.712991, Acc: 0.511111
Non-Cancer Percentage: 0.611111111111
Cost (Batch 50): 0.669514, Acc: 0.611111
Non-Cancer Percentage: 0.6
Cost (Batch 60): 0.680355, Acc: 0.6

Actual Cancer Percentage: 0.411111111111
Cost: 0.677118, Accuracy: 0.588889


Epoch: 2
Non-Cancer Percentage: 0.588888888889
Cost (Batch 10): 0.671745, Acc: 0.588889
Non-Cancer Percentage: 0.544444444444
Cost (Batch 20): 0.688255, Acc: 0.544444
Non-Cancer Percentage: 0.666666666667
Cost (Batch 30): 0.655402, Acc: 0.666667
Non-Cancer Percentage: 0.744444444444
Cost (Batch 40): 0.613319, Acc: 0.744444
Non-Cancer Percentage: 0.588888888889
Cost (Batch 50): 0.677838, Acc: 0.588889
Non-Cancer Percentage: 0.58888888888

KeyboardInterrupt: 


## Results
