In [1]:
# Sharding: a patient is handled by this run only when
# int(patient_id, 16) % NR_SHARDS == SHARD_ID - 1,
# so SHARD_ID must be a value in 1..NR_SHARDS.
NR_SHARDS = 4
SHARD_ID = 1

# The ratios were obtained from the mean dimension ratio found while studying
# the full stage1 images (presumably the ratios behind IMAGE_DIMS).
IMAGE_DIMS = (224, 152, 224, 1)

# Constant used to make the patient shuffling deterministic.
RANDOM_SEED = 0.1
SAVE_IMAGES = True

# Patient DICOM images folder
INPUT_FOLDER = '../../input/stage1_images/'
# CSV with the patient ids and their labels
LABELS_FILE = '../../input/stage1_labels.csv'

# Every shard writes into its own sub-folder
OUTPUT_FOLDER = '../../output/step3/{}/'.format(SHARD_ID)

In [2]:
import csv
import sys
import h5py
import pandas as pd
import os
import numpy as np # linear algebra
from numpy import ndarray
from random import shuffle
import csv
import os

from modules.logging import logger
import modules.logging
import modules.lungprepare as lungprepare
import modules.utils as utils
from modules.utils import Timer
import modules.logging

In [3]:
def get_patient_ids(shard_id, input_dir):
    """Return the ids of the patients assigned to a shard, in a deterministic order.

    Patient ids are read from the 'id' column of LABELS_FILE. A patient belongs
    to the shard when ``int(id, 16) % NR_SHARDS == shard_id - 1`` (shard ids
    are 1-based).

    Args:
        shard_id: 1-based shard number.
        input_dir: not used by this function; kept so existing callers work.

    Returns:
        list of patient id strings, reordered by a deterministic permutation
        driven by RANDOM_SEED.
    """
    # Use a context manager so the labels file handle is closed instead of leaked.
    with open(LABELS_FILE) as labels_csv:
        shard_patients = [
            row['id']
            for row in csv.DictReader(labels_csv)
            if int(row['id'], 16) % NR_SHARDS == (shard_id - 1)
        ]
    logger.info('found {} patients for shard {}'.format(len(shard_patients), shard_id))

    # The original code called shuffle(shard_patients, lambda: RANDOM_SEED).
    # The second ("random") argument of random.shuffle() was removed in
    # Python 3.11, so that call raises TypeError on current interpreters.
    # This loop is the same Fisher-Yates pass the legacy call performed with a
    # constant "random" value, so the resulting order is unchanged.
    for i in reversed(range(1, len(shard_patients))):
        j = int(RANDOM_SEED * (i + 1))
        shard_patients[i], shard_patients[j] = shard_patients[j], shard_patients[i]
    return shard_patients

#     force ids
#     return ['0c37613214faddf8701ca41e6d43f56e', '0a0c32c9e08cc2ea76a71649de56be6d', '0a38e7597ca26f9374f8ea2770ba870d']
#     return ['0c37613214faddf8701ca41e6d43f56e']
    

In [4]:
def patient_label(input_dir, patient_id):
    """Return the one-hot encoded 'cancer' label of a patient.

    The label is looked up in LABELS_FILE, using the first CSV column as index.

    Args:
        input_dir: not used by this function; kept so existing callers work.
        patient_id: index value of the patient row in LABELS_FILE.

    Returns:
        np.array([1, 0]) when the 'cancer' value is 0, np.array([0, 1]) otherwise.

    Raises:
        KeyError: if patient_id is not present in the labels file.
    """
    labels = pd.read_csv(LABELS_FILE, index_col=0)
    # DataFrame.get_value() was deprecated in pandas 0.21 and removed in 1.0;
    # .at is the supported scalar accessor and also exists in old releases.
    value = labels.at[patient_id, 'cancer']
    # one-hot encoding
    if value == 0:
        return np.array([1, 0])
    return np.array([0, 1])

In [5]:
def start_processing(input_dir, shard_id, max_patients, image_dims, output_dir):
    """Process the patients of one shard and store pixels and labels in an HDF5 file.

    The output directory is recreated, a file logger is attached to it, and up
    to ``max_patients`` patients of the shard are run through
    ``lungprepare.process_patient_images``. Each patient for which pixels are
    returned is written to the next free row of dataset 'X' and its one-hot
    label to the same row of dataset 'Y'. Patients for which None is returned
    are skipped. Finally the written dataset is validated.

    Args:
        input_dir: folder holding one sub-folder per patient id (must end with
            a path separator, since the patient id is appended directly).
        shard_id: 1-based shard number, forwarded to get_patient_ids.
        max_patients: maximum number of patients to visit (int).
        image_dims: sequence of four ints describing one sample's shape.
        output_dir: destination folder (must end with a path separator).

    Raises:
        Exception: 'Validation ERROR!' when utils.validate_dataset reports failure.
    """
    logger.info('Processing patients. shard_id=' + str(shard_id) + ' max_patients='+ str(max_patients) + ' input_dir=' + input_dir + ' output_dir=' + output_dir)

    logger.info('Gathering patient ids for this shard')
    patient_ids = get_patient_ids(shard_id, input_dir)
    total_patients = len(patient_ids)

    # No more than max_patients patients are ever visited, so allocating one
    # row per shard patient would leave all-zero samples in the output.
    capacity = max(0, min(total_patients, max_patients))
    sample_shape = (image_dims[0], image_dims[1], image_dims[2], image_dims[3])

    dataset_name = 'data-centered-rotated'

    logger.info('Preparing output dir')
    utils.mkdirs(output_dir, dirs=['images'], recreate=True)

    modules.logging.setup_file_logger(output_dir + 'out.log')

    logger.info('Creating datasets')
    dataset_file = utils.dataset_path(output_dir, dataset_name, image_dims)
    with h5py.File(dataset_file, 'w') as h5f:
        # maxshape makes the first axis resizable so the rows of skipped
        # patients can be trimmed away once processing has finished.
        x_ds = h5f.create_dataset('X', (capacity,) + sample_shape, maxshape=(None,) + sample_shape, chunks=(1,) + sample_shape, dtype='f')
        y_ds = h5f.create_dataset('Y', (capacity, 2), maxshape=(None, 2), dtype='f')

        logger.info('Starting to process each patient (count={})'.format(len(patient_ids)))
        record_row = 0

        for count, patient_id in enumerate(patient_ids[:capacity]):
            t = Timer('>>> PATIENT PROCESSING ' + patient_id + ' (count=' + str(count) + '; output_dir=' + output_dir + ')')
            patient_pixels = lungprepare.process_patient_images(input_dir + patient_id, image_dims)
            # Identity check: '!= None' on a numpy array compares elementwise
            # and the result cannot be used as a single truth value.
            if patient_pixels is not None:
                if not np.any(patient_pixels):
                    logger.error('Patient pixels returned with zero values patient_id=' + patient_id)
                logger.info('Recording patient pixels to output dataset count=' + str(count))
                x_ds[record_row] = patient_pixels
                y_ds[record_row] = patient_label(input_dir, patient_id)
                record_row = record_row + 1
            else:
                logger.warning('Patient lung not found. Skipping.')

            t.stop()

        # Drop the rows reserved for patients that were skipped so the file
        # only contains samples that were actually recorded.
        if record_row < capacity:
            x_ds.resize(record_row, axis=0)
            y_ds.resize(record_row, axis=0)

    # Validation runs after the 'with' block, i.e. once the HDF5 file is closed.
    if(not utils.validate_dataset(output_dir, dataset_name, image_dims, save_dir=output_dir + 'images/')):
        raise Exception('Validation ERROR!')


In [6]:
# Run the pipeline for the configured shard; the literal 1 is max_patients.
logger.info('==== PROCESSING SHARD {} ===='.format(SHARD_ID))
start_processing(
    INPUT_FOLDER,
    SHARD_ID,
    1,
    IMAGE_DIMS,
    OUTPUT_FOLDER,
)
logger.info('==== ALL DONE ====')

2017-02-23 00:19:44,998 INFO ==== PROCESSING SHARD 1 ====
2017-02-23 00:19:45,003 INFO Processing patients. shard_id=1 max_patients=1 input_dir=../../input/stage1_images/ output_dir=../../output/step3/1/
2017-02-23 00:19:45,004 INFO Gathering patient ids for this shard
2017-02-23 00:19:45,027 INFO found 354 patients for shard 1
2017-02-23 00:19:45,029 INFO Preparing output dir
2017-02-23 00:19:45,048 INFO Creating datasets
2017-02-23 00:19:45,052 INFO Starting to process each patient (count=354)
2017-02-23 00:19:45,054 INFO > [started] >>> PATIENT PROCESSING 40c044145f5c87c12bd8c725924add3c (count=0; output_dir=../../output/step3/1/)...
2017-02-23 00:19:45,056 INFO > [started] load_scan ../../input/stage1_images/40c044145f5c87c12bd8c725924add3c...
2017-02-23 00:19:45,429 INFO > [done]    load_scan ../../input/stage1_images/40c044145f5c87c12bd8c725924add3c (372.794 ms)
2017-02-23 00:19:45,784 INFO > [started] resample...
2017-02-23 00:20:09,415 INFO > [done]    resample (23631.137 ms)
2

Exception: Validation ERROR!