## Train nodule detector with LUNA16 dataset

In [1]:
# Where the LUNA16 input files are read from and where this notebook writes its results.
INPUT_DIR = '../../input/'
OUTPUT_DIR = '../../output/lung-cancer/01/'

# Shape of one stored sample: a 50x50x50 cube plus one trailing channel axis.
IMAGE_DIMS = (50, 50, 50, 1)

In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import h5py
import matplotlib.pyplot as plt
import sklearn
import os
import glob

from modules.logging import logger
import modules.utils as utils
from modules.utils import Timer
import modules.logging
import modules.cnn as cnn
import modules.ctscan as ctscan

SyntaxError: 'return' outside function (ImageAugmentation3d.py, line 178)

## Analyse input data

### Let us import annotations

In [None]:
# Load both CSV tables from the input directory in one pass.
annotations, candidates = (
    pd.read_csv(INPUT_DIR + csv_name)
    for csv_name in ('annotations.csv', 'candidates.csv')
)

In [None]:
# Series UID of the second row, a preview of the first rows, then the column summary.
print(annotations['seriesuid'].iloc[1])
print(annotations.head())
annotations.info()

In [None]:
# Series UID of the second row, a preview of the first rows, then the column summary.
print(candidates['seriesuid'].iloc[1])
print(candidates.head())
candidates.info()

In [None]:
# Number of candidate rows per class: positives (class 1) first, then negatives (class 0).
print((candidates['class'] == 1).sum())
print((candidates['class'] == 0).sum())

### Let's take a look at some images

In [None]:
# Open one scan by its series UID to check that the image loader works.
scan = ctscan.CTScanMhd(
    INPUT_DIR,
    '1.3.6.1.4.1.14519.5.2.1.6279.6001.979083010707182900091062408058',
)

In [None]:
# Fetch the scan's image data and display the entry at index 80 of its first axis.
# NOTE(review): presumably `get_image()` returns a 3-D array and index 80 is one slice — confirm in modules.ctscan.
pixels = scan.get_image()
plt.imshow(pixels[80])

In [None]:
# Extract a sub-image from the scan and display the entry at index 40 of its first axis.
# NOTE(review): the meaning and axis order of the two tuples (position, size) is defined by
# CTScanMhd.get_subimage and is not visible in this notebook — confirm in modules.ctscan.
pixels = scan.get_subimage((40,40,10), (230,230,230))
plt.imshow(pixels[40])

### Classes are heavily unbalanced: barely 0.2% are positive.

The best way to move forward will be to undersample the negative class and then augment the positive class heavily to balance out the samples.

#### Plan of attack:

1. Get an initial subsample of the negative class and keep all of the positives, such that we have an 80/20 class distribution

2. Create a training set in which we augment the minority class heavily by rotating, to get a 50/50 class distribution

In [None]:
# Index labels of the candidate rows, split by class (1 = positive, 0 = negative).
positives = candidates.loc[candidates['class'].eq(1)].index
negatives = candidates.loc[candidates['class'].eq(0)].index

### OK, the class to get image data works

The next thing to do is to undersample the negative class drastically. Since only 1351 of the 551065 candidates in the data set are positive and the rest are negative, I plan to make the dataset less skewed, e.g. something like a 70%/30% split.

In [None]:
# Display the index labels of the positive candidate rows.
positives

In [None]:
# Fixed seed so the same negatives are drawn on every run.
np.random.seed(42)
# Draw five distinct negatives for every positive.
negIndexes = np.random.choice(negatives, size=len(positives) * 5, replace=False)
print(len(positives), len(negIndexes), sep='\n')

In [None]:
# `positives` and `negIndexes` hold index *labels*, so select with `.loc`.
# `.iloc` only returned the same rows while `candidates` kept its default 0..n-1 index.
candidatesDf = candidates.loc[list(positives) + list(negIndexes)]

## Prepare input data

### Split into train and test sets

In [None]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20; `model_selection` is its
# replacement. The fallback keeps the notebook working on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Features are every column except the last one; the last column is the label.
X = candidatesDf.iloc[:, :-1]
Y = candidatesDf.iloc[:, -1]
# Hold out 20% of the rows as the test set (fixed seed for reproducibility).
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=42)

In [None]:
#print(str(X_test))
#print(str(Y_test))

### Create a validation dataset

In [None]:
# Carve 20% of the current training rows out as a validation set.
# Note: this rebinds X_train / Y_train, so re-running the cell shrinks them again.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train,
    Y_train,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Row counts of the train / validation / test splits, one per line.
print(len(X_train), len(X_val), len(X_test), sep='\n')

In [None]:
# Class balance of the training split before any augmentation.
positive_count = Y_train.sum()
sample_count = len(Y_train)
print('number of positive cases are ' + str(positive_count))
print('total set size is ' + str(sample_count))
print('percentage of positive cases are ' + str(positive_count*1.0/sample_count))

### We will need to augment the positive dataset like mad! Add new keys to X_train and Y_train for augmented data

In [None]:
# Add two extra copies of every positive training row under fresh index labels
# (original label + 1,000,000 and + 2,000,000) so they can be told apart later.
# `DataFrame.append` / `Series.append` were removed in pandas 2.0, hence `pd.concat`.
positive_rows = X_train[Y_train == 1]

x_parts = [X_train]
y_parts = [Y_train]
for index_offset in (1000000, 2000000):
    shifted_index = positive_rows.index + index_offset
    x_parts.append(positive_rows.set_index(shifted_index))
    # Every copied row is a positive; keep the label dtype of Y_train instead of
    # drifting to float through a NaN-filled reindex.
    y_parts.append(pd.Series(1, index=shifted_index, name=Y_train.name, dtype=Y_train.dtype))

X_train_new = pd.concat(x_parts)
Y_train_new = pd.concat(y_parts)

# Guard against running this cell twice (or against offsets colliding with real labels):
# duplicate labels would make the later label-based row lookups ambiguous.
assert X_train_new.index.is_unique, 'augmented index labels collide; restart and run all cells in order'
assert X_train_new.index.equals(Y_train_new.index)

X_train = X_train_new
Y_train = Y_train_new
print(len(X_train), len(Y_train))

In [None]:
# Class balance of the training split as it stands now.
# NOTE(review): the printed label says 'undersampling', but this cell runs after the
# positive rows were duplicated above — consider rewording the label.
positive_count = Y_train.sum()
sample_count = len(Y_train)
print('After undersampling')
print('number of positive cases are ' + str(positive_count))
print('total set size is ' + str(sample_count))
print('percentage of positive cases are ' + str(positive_count*1.0/sample_count))

In [None]:
# Final split sizes, followed by a preview of the training features and labels.
print(len(X_train), len(X_val), len(X_test), sep='\n')
print(X_train.head(), Y_train.head(), sep='\n')

### Prepare output dir

In [None]:
# Create the output directory, then send log records to a file inside it.
# NOTE(review): `recreate=True` presumably wipes an existing directory first — confirm in modules.utils.
utils.mkdirs(OUTPUT_DIR, recreate=True)
log_file = OUTPUT_DIR + 'out.log'
modules.logging.setup_file_logger(log_file)
logger.info('Dir ' + OUTPUT_DIR + ' created')

### Create HDF5 dataset with input data

In [None]:
def create_dataset(file_path, x_data, y_data):
    """Write an HDF5 file with image cubes ('X') and one-hot labels ('Y').

    Every row of ``x_data`` is loaded through ``ctscan.CTScanMhd`` and cut down to a
    sub-image. Samples whose shape differs from ``IMAGE_DIMS`` are logged and left out.
    All samples are first staged in ``file_path + '.tmp'``; only the valid ones are then
    copied, densely packed, into ``file_path``. The staging file is always removed.

    Parameters
    ----------
    file_path : str
        Destination of the final HDF5 file.
    x_data : pandas.DataFrame
        One row per sample. Column 0 is the scan identifier given to ``CTScanMhd``;
        columns 1..3 are the coordinates handed (in reverse column order) to
        ``get_subimage``. The index must be unique because rows are looked up by label.
    y_data : pandas.Series
        Labels sharing the index of ``x_data``. A value of 1 is stored as ``[0, 1]``,
        anything else as ``[1, 0]``.
    """
    logger.info('Creating dataset ' + file_path + ' size=' + str(len(x_data)))
    file_path_tmp = file_path + '.tmp'
    # Single source of truth for the per-sample shape (was hard-coded as (50,50,50,1)).
    sample_shape = tuple(IMAGE_DIMS)
    valid = []

    try:
        # Pass 1: stage every sample, remembering which slots hold usable data.
        with h5py.File(file_path_tmp, 'w') as h5f:
            x_ds = h5f.create_dataset('X', (len(x_data),) + sample_shape, chunks=(1,) + sample_shape, dtype='f')
            y_ds = h5f.create_dataset('Y', (len(y_data), 2), dtype='f')
            for c, idx in enumerate(x_data.index):
                d = x_data.loc[idx]
                # Explicit positional access: plain d[0] on a label-indexed Series is
                # deprecated in pandas 2.x.
                filename = d.iloc[0]
                t = Timer('Loading scan ' + str(filename))
                scan = ctscan.CTScanMhd(INPUT_DIR, filename)
                # NOTE(review): columns 3, 2, 1 are presumably z, y, x — confirm against
                # the candidates file and CTScanMhd.get_subimage.
                pixels = scan.get_subimage((d.iloc[3], d.iloc[2], d.iloc[1]), IMAGE_DIMS)
                # add color channel dimension
                pixels = np.expand_dims(pixels, axis=3)
                if np.shape(pixels) == sample_shape:
                    x_ds[c] = pixels
                    y_ds[c] = [0, 1] if y_data.loc[idx] == 1 else [1, 0]
                    valid.append(c)
                else:
                    logger.warning('Invalid shape detected in image. Skipping. ' + str(np.shape(pixels)))
                t.stop()

        # Pass 2: dump only valid entries to the dataset file. Walking `valid` directly
        # avoids the former `i in valid` list scan for every staged row.
        with h5py.File(file_path, 'w') as h5fw, h5py.File(file_path_tmp, 'r') as h5fr:
            x_dsw = h5fw.create_dataset('X', (len(valid),) + sample_shape, chunks=(1,) + sample_shape, dtype='f')
            y_dsw = h5fw.create_dataset('Y', (len(valid), 2), dtype='f')
            x_dsr = h5fr['X']
            y_dsr = h5fr['Y']
            for dst, src in enumerate(valid):
                x_dsw[dst] = x_dsr[src]
                y_dsw[dst] = y_dsr[src]
    finally:
        # Never leave the (potentially very large) staging file behind, even on failure.
        if os.path.exists(file_path_tmp):
            os.remove(file_path_tmp)

    utils.validate_xy_dataset(file_path, save_dir=OUTPUT_DIR + 'samples/')

In [None]:
# Build the training dataset file.
create_dataset(
    file_path=OUTPUT_DIR + 'nodules-train.h5',
    x_data=X_train,
    y_data=Y_train,
)

In [None]:
# Build the validation dataset file.
create_dataset(
    file_path=OUTPUT_DIR + 'nodules-validate.h5',
    x_data=X_val,
    y_data=Y_val,
)

In [None]:
# Build the test dataset file.
create_dataset(
    file_path=OUTPUT_DIR + 'nodules-test.h5',
    x_data=X_test,
    y_data=Y_test,
)