## Train nodule detector with LUNA16 dataset

In [1]:
# Directory the LUNA16 scans are read from.
INPUT_DIR = '../../input/luna16/'
# Directory that receives the log file, the HDF5 datasets and the sample dumps.
OUTPUT_DIR = '../../output/lung-cancer/01/'
# Shape of one stored sample: a 50x50x50 cube followed by a single channel axis.
IMAGE_DIMS = (50, 50, 50, 1)

In [2]:
import numpy as np
import pandas as pd
import h5py
#import matplotlib.pyplot as plt
#import seaborn as sns 
import sklearn
import os
import glob
#from PIL import Image

from modules.logging import logger
import modules.utils as utils
from modules.utils import Timer
import modules.logging
import modules.cnn as cnn
import modules.ctscan as ctscan

## Analyse input data

### Let us import annotations

In [3]:
# Read both LUNA16 tables from the configured input directory so that changing
# INPUT_DIR in the config cell is enough to relocate the data.
annotations = pd.read_csv(INPUT_DIR + 'annotations.csv')
candidates = pd.read_csv(INPUT_DIR + 'candidates.csv')

In [4]:
# Show one complete series UID (head() truncates them), the first rows, and
# the per-column summary of the annotations table.
print(annotations['seriesuid'].iloc[1])
print(annotations.head())
annotations.info()

1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222365663678666836860
                                           seriesuid      coordX      coordY  \
0  1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222... -128.699421 -175.319272   
1  1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...  103.783651 -211.925149   
2  1.3.6.1.4.1.14519.5.2.1.6279.6001.100398138793...   69.639017 -140.944586   
3  1.3.6.1.4.1.14519.5.2.1.6279.6001.100621383016...  -24.013824  192.102405   
4  1.3.6.1.4.1.14519.5.2.1.6279.6001.100621383016...    2.441547  172.464881   

       coordZ  diameter_mm  
0 -298.387506     5.651471  
1 -227.121250     4.224708  
2  876.374496     5.786348  
3 -391.081276     8.143262  
4 -405.493732    18.545150  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1186 entries, 0 to 1185
Data columns (total 5 columns):
seriesuid      1186 non-null object
coordX         1186 non-null float64
coordY         1186 non-null float64
coordZ         1186 non-null float64
diameter_mm    1186 non-nul

In [5]:
# Same inspection for the candidates table: one complete series UID, the
# first rows, and the per-column summary.
print(candidates['seriesuid'].iloc[1])
print(candidates.head())
candidates.info()

1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222365663678666836860
                                           seriesuid  coordX  coordY  coordZ  \
0  1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...  -56.08  -67.85 -311.92   
1  1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...   53.21 -244.41 -245.17   
2  1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...  103.66 -121.80 -286.62   
3  1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...  -33.66  -72.75 -308.41   
4  1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...  -32.25  -85.36 -362.51   

   class  
0      0  
1      0  
2      0  
3      0  
4      0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 551065 entries, 0 to 551064
Data columns (total 5 columns):
seriesuid    551065 non-null object
coordX       551065 non-null float64
coordY       551065 non-null float64
coordZ       551065 non-null float64
class        551065 non-null int64
dtypes: float64(3), int64(1), object(1)
memory usage: 21.0+ MB


In [6]:
# Number of candidate rows per class, positives (1) first, then negatives (0).
for label in (1, 0):
    print(len(candidates[candidates['class'] == label]))

1351
549714


### Classes are heavily unbalanced: only about 0.2% of the candidates are positive.

The best way to move forward will be to undersample the negative class and then heavily augment the positive class to balance out the samples.

#### Plan of attack:

1. Get an initial subsample of the negative class and keep all of the positives, so that we have an 80/20 class distribution

2. Create a training set in which we heavily augment the minority class by rotating, to get a 50/50 class distribution

In [7]:
# Index labels of the candidate rows, separated by class.
class_column = candidates['class']
positives = candidates.index[(class_column == 1).values]
negatives = candidates.index[(class_column == 0).values]

### OK, the class to get image data works

The next thing to do is to undersample the negative class drastically. Since only 1351 of the 551065 candidates in the dataset are positive and the rest are negative, I plan to make the dataset less skewed, with something like a 70%/30% split.

In [8]:
# Display the index labels of the positive candidates.
positives

Int64Index([    13,     78,   1303,   3050,   3052,   3080,   3223,   3285,
              3287,   3289,
            ...
            545928, 546205, 546372, 546400, 547498, 548674, 550171, 550334,
            550810, 550906],
           dtype='int64', length=1351)

In [9]:
# Fixed seed so the negative subsample is the same on every run.
np.random.seed(42)
# Keep five distinct negatives (sampled without replacement) per positive.
negative_sample_size = len(positives) * 5
negIndexes = np.random.choice(negatives, size=negative_sample_size, replace=False)
for group in (positives, negIndexes):
    print(len(group))

1351
6755


In [10]:
# positives and negIndexes hold index *labels*, so select with .loc rather than
# the positional .iloc (the two only coincide while candidates keeps its
# default 0..n-1 index).
candidatesDf = candidates.loc[list(positives) + list(negIndexes)]

## Prepare input data

### Split into test train set 

In [11]:
# sklearn.cross_validation was deprecated in favour of sklearn.model_selection
# and later removed; prefer the new location and fall back for old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Features are every column except the last; the last column ('class') is the label.
X = candidatesDf.iloc[:, :-1]
Y = candidatesDf.iloc[:, -1]
# Hold out 20% of the subsampled candidates as the test set.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=42)



In [12]:
#print(str(X_test))
#print(str(Y_test))

### Create a validation dataset

In [13]:
# Carve a validation set (20%) out of the training portion of the first split.
# NOTE: X_train / Y_train are reassigned from themselves, so re-running only
# this cell splits the already reduced training set again.
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size = 0.20, random_state = 42)

In [14]:
# Sizes of the train / validation / test splits, in that order.
for split in (X_train, X_val, X_test):
    print(len(split))

5187
1297
1622


In [15]:
# Labels are 0/1, so their sum is the number of positive training cases.
positive_count = Y_train.sum()
train_size = len(Y_train)
print('number of positive cases are ' + str(positive_count))
print('total set size is ' + str(train_size))
print('percentage of positive cases are ' + str(positive_count*1.0/train_size))

number of positive cases are 845
total set size is 5187
percentage of positive cases are 0.16290726817


### We will need to augment the positive dataset like mad! Add new keys to X_train and Y_train for augmented data

In [16]:
# Oversample the positive class: append two extra copies of every positive
# training row, re-keyed by adding 1,000,000 and 2,000,000 to the original
# index label so the copies do not collide with existing rows.
# pd.concat replaces DataFrame/Series.append (removed in pandas 2.0), and the
# labels are created with the original dtype instead of being reindexed to NaN
# and overwritten, which silently turned them into floats.
positive_rows = X_train[Y_train == 1]

x_parts = [X_train]
y_parts = [Y_train]
for index_offset in (1000000, 2000000):
    shifted_index = positive_rows.index + index_offset
    x_parts.append(positive_rows.set_index(shifted_index))
    y_parts.append(pd.Series(1, index=shifted_index, name=Y_train.name, dtype=Y_train.dtype))

X_train_new = pd.concat(x_parts)
Y_train_new = pd.concat(y_parts)

print(len(X_train_new), len(Y_train_new))

6877 6877


In [17]:
#print(X_train_new.index)
#print(y_train_new)

In [18]:
# Split sizes first, then a preview of the training features and labels.
for split in (X_train, X_val, X_test):
    print(len(split))
for preview_source in (X_train, Y_train):
    print(preview_source.head())

5187
1297
1622
                                                seriesuid      coordX  \
59270   1.3.6.1.4.1.14519.5.2.1.6279.6001.132817748896...  -97.808167   
150277  1.3.6.1.4.1.14519.5.2.1.6279.6001.182192086929...   58.990000   
432208  1.3.6.1.4.1.14519.5.2.1.6279.6001.397522780537...  -50.307219   
423122  1.3.6.1.4.1.14519.5.2.1.6279.6001.339882192295... -106.731000   
344581  1.3.6.1.4.1.14519.5.2.1.6279.6001.296863826932...   67.240000   

            coordY      coordZ  
59270     3.897917 -201.030000  
150277  -24.230000  -18.320000  
432208  159.439740 -113.418797  
423122 -104.468000  751.163333  
344581  -32.590000 -107.790000  
59270     0
150277    0
432208    0
423122    0
344581    0
Name: class, dtype: int64


### Prepare output dir

In [19]:
# Create the output directory (recreate=True presumably discards any previous
# content -- confirm in modules.utils), then send log records to a file in it.
utils.mkdirs(OUTPUT_DIR, recreate=True)
log_file_path = OUTPUT_DIR + 'out.log'
modules.logging.setup_file_logger(log_file_path)
logger.info('Dir ' + OUTPUT_DIR + ' created')

2017-03-19 01:33:58,839 INFO Dir ../../output/lung-cancer/01/ created


### Create HDF5 dataset with input data

In [25]:
def create_dataset(file_path, x_data, y_data):
    """Write an HDF5 file with one image volume and one one-hot label per candidate.

    The file gets two datasets: 'X' with shape (len(x_data),) + IMAGE_DIMS,
    chunked one sample at a time, and 'Y' with shape (len(y_data), 2). A label
    of 1 is stored as [0, 1], anything else as [1, 0]. After writing, the file
    is passed to utils.validate_xy_dataset, which is given OUTPUT_DIR + 'samples/'
    as its save directory.

    Args:
        file_path: Path of the HDF5 file to create (an existing file is overwritten).
        x_data: DataFrame whose first four columns are, by position, the scan
            file name and the three coordinates handed to ctscan.CTScanMhd.
        y_data: Series of class labels addressable with the same index labels
            as x_data.

    Raises:
        ValueError: If x_data and y_data do not have the same number of rows.
    """
    if len(x_data) != len(y_data):
        raise ValueError('x_data and y_data must have the same length, got '
                         + str(len(x_data)) + ' and ' + str(len(y_data)))

    logger.info('Creating dataset ' + file_path + ' size=' + str(len(x_data)))
    sample_shape = tuple(IMAGE_DIMS[:4])
    with h5py.File(file_path, 'w') as h5f:
        x_ds = h5f.create_dataset('X', (len(x_data),) + sample_shape, chunks=(1,) + sample_shape, dtype='f')
        y_ds = h5f.create_dataset('Y', (len(y_data), 2), dtype='f')
        for c, idx in enumerate(x_data.index):
            d = x_data.loc[idx]
            # Columns are read by position; .iloc makes that explicit instead of
            # relying on the deprecated positional fallback of d[0] on a
            # label-indexed Series.
            filename = d.iloc[0]
            t = Timer('Loading scan ' + str(filename))
            scan = ctscan.CTScanMhd(INPUT_DIR, filename, coords=(d.iloc[1], d.iloc[2], d.iloc[3]))
            pixels = scan.get_subimage(IMAGE_DIMS)
            # Add the trailing channel dimension expected by the 'X' dataset.
            pixels = np.expand_dims(pixels, axis=3)
            x_ds[c] = pixels
            # One-hot encode the label with a single write per row.
            y_ds[c] = [0, 1] if y_data.loc[idx] == 1 else [1, 0]
            t.stop()
    utils.validate_xy_dataset(file_path, save_dir=OUTPUT_DIR + 'samples/')

In [26]:
# Write the training dataset from the training split.
# NOTE(review): this passes X_train / Y_train, not the oversampled
# X_train_new / Y_train_new built earlier -- confirm that is intended.
create_dataset(OUTPUT_DIR + 'nodules-train.h5', X_train, Y_train)

In [27]:
# Write the validation dataset from the validation split.
create_dataset(OUTPUT_DIR + 'nodules-validate.h5', X_val, Y_val)

In [28]:
# Write the held-out test dataset from the test split.
create_dataset(OUTPUT_DIR + 'nodules-test.h5', X_test, Y_test)

## Training

### Prepare CNN model

In [24]:
# Build the network definition and wrap it in a trainable model; model_file=None
# is passed, so presumably no saved weights are loaded -- confirm in modules.cnn.
# NOTE(review): the output recorded for this cell is an AttributeError saying
# modules.cnn has no 'net_nodule2d_good'; check which network factory the module
# actually provides. IMAGE_DIMS describes a 3-D volume plus channel, so a "2d"
# network may not be the intended one -- confirm.
logger.info('Prepare CNN for training')
network = cnn.net_nodule2d_good(IMAGE_DIMS)
model = cnn.prepare_cnn_model(network, OUTPUT_DIR, model_file=None)

2017-03-19 01:35:22,323 INFO Prepare CNN for training


AttributeError: 'module' object has no attribute 'net_nodule2d_good'

### Train model

In [None]:
# Train on the HDF5 files written by the create_dataset cells of this notebook.
# The previous version referenced the undefined names `input_dir` and
# `image_dims` (NameError) and resolved its paths through utils.dataset_path on
# INPUT_DIR, which is not where this notebook stores its datasets.
train_path = OUTPUT_DIR + 'nodules-train.h5'
validate_path = OUTPUT_DIR + 'nodules-validate.h5'

# Both files must stay open for the whole fit: the HDF5 datasets are handed to
# the model as-is and read while training runs.
with h5py.File(train_path, 'r') as train_hdf5, h5py.File(validate_path, 'r') as validate_hdf5:
    X = train_hdf5['X']
    Y = train_hdf5['Y']
    logger.info('X shape ' + str(X.shape))
    logger.info('Y shape ' + str(Y.shape))

    X_validate = validate_hdf5['X']
    Y_validate = validate_hdf5['Y']
    logger.info('X_validate shape ' + str(X_validate.shape))
    logger.info('Y_validate shape ' + str(Y_validate.shape))

    logger.info('Starting CNN training...')
    model.fit(X, Y,
        validation_set=(X_validate, Y_validate),
        shuffle=True,
        batch_size=96,
        n_epoch=100,
        show_metric=True,
        snapshot_epoch=True,
        run_id='nodule_classifier')

model.save("nodule-classifier.tfl")
logger.info("Network trained and saved as nodule-classifier.tfl!")

### Evaluate results

In [None]:
# Evaluate on the test file written by create_dataset ('testdataset.h5' is never
# created by this notebook). The labels and predictions are copied into memory
# as class indices (0 = negative, 1 = positive) before the file closes, because
# an HDF5 dataset handle can no longer be read after its file is closed.
# X_test itself stays a file-backed handle and is only valid inside this block.
with h5py.File(OUTPUT_DIR + 'nodules-test.h5', 'r') as test_hdf5:
    X_test = test_hdf5['X']
    # Stored labels are one-hot rows ([1, 0] or [0, 1]); argmax recovers the class.
    Y_test = np.argmax(test_hdf5['Y'][:], axis=1)
    # Assumes model.predict returns one score per class for every sample -- TODO confirm.
    Y_pred = np.argmax(np.asarray(model.predict(X_test)), axis=1)

In [None]:
# `import sklearn` alone does not guarantee the metrics submodule is loaded,
# so import the function explicitly.
from sklearn.metrics import confusion_matrix

# confusion_matrix needs one class label per sample; collapse one-hot or
# per-class score rows to class indices if that is what we were given.
y_true = np.asarray(Y_test)
if y_true.ndim == 2:
    y_true = y_true.argmax(axis=1)
y_hat = np.asarray(Y_pred)
if y_hat.ndim == 2:
    y_hat = y_hat.argmax(axis=1)

# labels=[0, 1] keeps the matrix 2x2 even if one class is missing from the data.
confusion_matrix(y_true, y_hat, labels=[0, 1])