# Lung Cancer Detection Using CNNs

### Import libraries for data imports

In [1]:
import numpy as np
from sklearn.utils import shuffle
import tensorflow as tf

#### Read From Pickled Files (Instead of Pipeline)

In [2]:
import pickle

def pickle2np(dicts):
    images = []
    labels = []
    for data_dict in dicts:
        images.append(data_dict['features'])
        labels.append(data_dict['labels'])
    return np.concatenate([img for img in images],axis=0), np.concatenate([lbl for lbl in labels], axis=0)

In [3]:
train_dicts = []
for i in range(1,8):
    path = '../downloads/train_set'+str(i)+'.p'
    with open(path, mode='rb') as f:
        train_dicts.append(pickle.load(f))
train_images, train_labels = pickle2np(train_dicts)
del train_dicts

In [4]:
valid_dicts = []
for i in range(1,3):
    path = '../downloads/valid_set'+str(i)+'.p'
    with open(path, mode='rb') as f:
        valid_dicts.append(pickle.load(f))
valid_images, valid_labels = pickle2np(valid_dicts)
del valid_dicts

In [None]:
test_dicts = []
path = '../downloads/test.p'
with open(path, 'rb') as f:
    test_dicts.append(pickle.load(f))
test_images, test_ids = pickle2np(test_dicts)
del test_dicts



## Preprocessing



In [5]:
def center_and_normalize(data, mu, dev):
    return (data-mu)/dev 

process_mu = np.mean(train_images)
process_dev = np.std(train_images)

In [6]:
def one_hot_encode(labels, n_labels):
    encoded_labels = np.zeros((labels.shape[0], n_labels), dtype=np.float32)
    for i,label in enumerate(labels):
        encoded_labels[i,int(label)] = 1
    return encoded_labels

In [7]:
for i in range(train_images.shape[0]):
    train_images[i:i+1] = center_and_normalize(train_images[i:i+1],process_mu, process_dev)

In [8]:
valid_images,valid_labels = shuffle(valid_images,valid_labels)
valid_images = valid_images[:100]
valid_labels = valid_labels[:100]
valid_images = center_and_normalize(valid_images, process_mu, process_dev)

In [9]:
train_images = train_images.reshape([train_images.shape[i] for i in range(3)]+[1])
valid_images = valid_images.reshape([valid_images.shape[i] for i in range(3)]+[1])
train_labels = one_hot_encode(train_labels,2)
valid_labels = one_hot_encode(valid_labels,2)



## Tensorflow Approach

#### Parameter Initializations

In [10]:
convweight_shapes = [
    # 512x512
    (5,5,1,36), # 255x255 stride=2
    (5,5,36,26), # 126x126 stride=2
    (4,4,26,50), # 61x61 stride=2
    (61,61,50,2)
]

convweights = [tf.Variable(tf.truncated_normal(shape=x,mean=0,stddev=0.1),name="reg_conv"+str(x[-1])) for x in convweight_shapes]
convbiases = [tf.Variable(tf.zeros([x[-1]]),name="reg_convbias"+str(x[-1])) for x in convweight_shapes]
del convweight_shapes


#### Neural Net Helper Functions

In [11]:
def conv2d(data, weight, bias, stride=1, padding="VALID"):
    activations = tf.nn.bias_add(tf.nn.conv2d(data, weight,strides=[1,stride,stride,1],padding=padding),bias)
    return tf.nn.elu(activations)

def max_pool(data,k=2):
    return tf.nn.max_pool(data,ksize=[1,k,k,1],strides=[1,k,k,1],padding="VALID")

def conv_net(data, weights, biases, dropout_prob, strides=[]):
    if len(strides) == 0: strides = [1]*len(weights)
    logits = data
    for i,weight in enumerate(weights):
        logits = conv2d(logits, weight, biases[i],stride=strides[i])
#         logits = max_pool(logits)
        logits = tf.nn.dropout(logits, dropout_prob)
    return logits

def fc_net(data, weights, biases, dropout_prob):
    logits = data
    for i,weight in enumerate(weights):
        if i < len(weights)-1:
            logits = tf.matmul(logits, weight) + biases[i]
            logits = tf.nn.elu(logits)
    #         logits = tf.nn.dropout(logits, dropout_prob)
    return tf.matmul(logits,weights[-1])+biases[-1]

In [12]:
data_features = tf.placeholder(tf.float32, [None]+[train_images.shape[i] for i in range(1,len(train_images.shape))], name="data_features")
data_labels = tf.placeholder(tf.float32, [None, 2], name='data_labels')

convdropout = tf.placeholder(tf.float32, name="convdropout")
momentum = tf.placeholder(tf.float32, name="momentum")
learning_rate = tf.placeholder(tf.float32, name="learning_rate")


In [13]:
## Fully Convolutional Architecture

logits = conv_net(data_features, convweights, convbiases, convdropout, strides=[2,2,2,1,1])
logits = tf.reshape(logits, [-1,2])

cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits,labels=data_labels))
optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,momentum=momentum).minimize(cost)

equals_list = tf.equal(tf.argmax(logits,1), tf.argmax(data_labels,1))
accuracy = tf.reduce_mean(tf.cast(equals_list,tf.float32))

save_file = './net.ckpt'
saver = tf.train.Saver()

init = tf.global_variables_initializer()


## Fully Covolutional tf Session

In [14]:
rate = .005
moment = .75
conv_dropout = .9
epochs = 6
batch_size = 100

with tf.Session() as sess:
    sess.run(init)
    print("Session Start")
    for epoch in range(epochs):
        print("Epoch: " + str(epoch+1))
        train_images, train_labels = shuffle(train_images, train_labels)
        for batch in range(1,int(train_images.shape[0]/batch_size)):
            optcost = sess.run([optimizer, cost, accuracy], feed_dict={learning_rate: rate, momentum: moment,
                                                                       convdropout: conv_dropout,
                                                                       data_features: train_images[(batch-1)*batch_size:batch*batch_size],
                                                                       data_labels: train_labels[(batch-1)*batch_size:batch*batch_size]})
            if batch % 10 == 0:
                print("Non-Cancer Percentage: " + str(1-np.sum(np.argmax(train_labels[(batch-1)*batch_size:batch*batch_size],1))/batch_size))
                print("Running Cost (Batch " + str(batch) + "): " + str(optcost[1]) + ", Acc: " + str(optcost[2]))
        valid_images, valid_labels = shuffle(valid_images, valid_labels)
        accost = sess.run([accuracy,cost], feed_dict={convdropout: 1.,
                                                      data_features: valid_images[:batch_size//2], 
                                                      data_labels: valid_labels[:batch_size//2]})
        print("\nActual Cancer Percentage: " + str(np.sum(np.argmax(valid_labels[:batch_size],1))/batch_size))
        print("Cost: " + str(accost[1]) + ", Accuracy: " + str(accost[0]))
        print("\n")
        saver.save(sess,save_file)
        
    acc = sess.run(accuracy, feed_dict={learning_rate: rate, momentum: moment,\
                                   convdropout: 1., \
                                    data_features: valid_images, data_labels: valid_labels})
    print("Validation Accuracy: " + str(acc))        

Session Start
Epoch: 1
Non-Cancer Percentage: 0.61
Running Cost (Batch 10): 0.684477, Acc: 0.63
Non-Cancer Percentage: 0.64
Running Cost (Batch 20): 0.688064, Acc: 0.64
Non-Cancer Percentage: 0.6
Running Cost (Batch 30): 0.709639, Acc: 0.59
Non-Cancer Percentage: 0.58
Running Cost (Batch 40): 0.695588, Acc: 0.6
Non-Cancer Percentage: 0.55
Running Cost (Batch 50): 0.719604, Acc: 0.54

Actual Cancer Percentage: 0.35
Cost: 0.693147, Accuracy: 0.7


Epoch: 2
Non-Cancer Percentage: 0.51
Running Cost (Batch 10): 0.709639, Acc: 0.51
Non-Cancer Percentage: 0.62
Running Cost (Batch 20): 0.719604, Acc: 0.6
Non-Cancer Percentage: 0.56
Running Cost (Batch 30): 0.698029, Acc: 0.58
Non-Cancer Percentage: 0.64
Running Cost (Batch 40): 0.712579, Acc: 0.64
Non-Cancer Percentage: 0.54
Running Cost (Batch 50): 0.711432, Acc: 0.56

Actual Cancer Percentage: 0.35
Cost: 0.693147, Accuracy: 0.58


Epoch: 3
Non-Cancer Percentage: 0.52
Running Cost (Batch 10): 0.71781, Acc: 0.5
Non-Cancer Percentage: 0.64
Runn

KeyboardInterrupt: 


## Results
