In [5]:
# Root folder holding one numbered sub-folder per shard (read by start_processing).
INPUT_FOLDER = '../../../output/kaggle-bowl/step3/'

# Folder that receives the merged dataset, the log file and the sample images.
OUTPUT_FOLDER = '../../../output/kaggle-bowl/step4/'

# Base name used to locate the dataset file of every shard and of the merged output.
DATASET_NAME = 'data-centered-rotated'

# Shards are numbered 1..NR_SHARDS (inclusive).
NR_SHARDS = 700

# Per-patient volume dimensions, ordered as (depth, height, width, channels).
IMAGE_DIMS = (312, 212, 312, 1)

In [6]:
import sys
import h5py
from random import shuffle
import numpy as np
from numpy import ndarray
import datetime
import logging

from modules.logging import logger
import modules.logging
import modules.lungprepare as lungprepare
import modules.utils as utils
from modules.utils import Timer
import modules.logging

In [7]:
def start_processing(input_dir, nr_shards, image_dims, output_dir):
    """Merge the per-shard HDF5 datasets into one dataset under ``output_dir``.

    The function works in four stages:

    1. (Re)create ``output_dir`` with an ``images`` sub-folder and attach a
       file logger writing to ``output_dir + 'out.log'``.
    2. Visit every shard (ids ``1..nr_shards``), validate it and add up the
       number of entries in its ``X`` dataset.
    3. Create the merged file with an ``X`` dataset of shape
       ``(total_patients,) + image_dims`` (one chunk per patient) and a ``Y``
       dataset of shape ``(total_patients, 2)``, then copy each shard's ``X``
       and ``Y`` into consecutive row ranges.
    4. Validate the merged dataset, passing ``output_dir + 'images'`` as
       ``save_dir``.

    Args:
        input_dir: Folder prefix containing one sub-folder per shard, named by
            shard id. It is concatenated directly with the shard id, so it is
            expected to end with a path separator.
        nr_shards: Number of shards; ids run from 1 to ``nr_shards`` inclusive.
        image_dims: 4-element sequence ``(depth, height, width, channels)``.
        output_dir: Destination folder prefix (concatenated directly with file
            names, so it is expected to end with a path separator).

    Raises:
        Exception: If validation of any shard or of the merged dataset fails.

    Note:
        The dataset base name is taken from the module-level ``DATASET_NAME``.
    """
    logger.info('Merging shard results. nr_shards=' + str(nr_shards) + ' input_dir='+ str(input_dir) + ' output_dir=' + output_dir)

    t = Timer('Preparing output dir')
    utils.mkdirs(output_dir, dirs=['images'], recreate=True)
    # Attach the file logger once the output directory exists.
    modules.logging.setup_file_logger(output_dir + 'out.log')
    # Close this timer before `t` is rebound below; previously it was never stopped.
    t.stop()

    dataset_name = DATASET_NAME

    t = Timer('Count total patients among shards')
    total_patients = 0
    for shard_id in range(1, nr_shards + 1):
        dataset_dir = input_dir + str(shard_id) + '/'
        dataset_file = utils.dataset_path(dataset_dir, dataset_name, image_dims)
        with h5py.File(dataset_file, 'r') as h5f:
            logger.info('shard_id={} shape={}'.format(shard_id, h5f['X'].shape))
            total_patients = total_patients + len(h5f['X'])
        if not utils.validate_dataset(dataset_dir, dataset_name, image_dims):
            raise Exception('Validation ERROR!')
    t.stop()

    logger.info('total_patients=' + str(total_patients))

    t = Timer('Creating output merged dataset')
    output_dataset_file = utils.dataset_path(output_dir, dataset_name, image_dims)
    # One full volume per patient; also used as the chunk shape so each chunk holds one patient.
    volume_shape = (image_dims[0], image_dims[1], image_dims[2], image_dims[3])
    with h5py.File(output_dataset_file, 'w') as h5f:
        x_ds = h5f.create_dataset('X', (total_patients,) + volume_shape, chunks=(1,) + volume_shape, dtype='f')
        y_ds = h5f.create_dataset('Y', (total_patients, 2), dtype='f')

        logger.info('Merging shards')
        # pb/pe: begin/end row of the current shard inside the merged datasets.
        pb = 0
        for shard_id in range(1, nr_shards + 1):
            ts = Timer('Processing shard' + str(shard_id))
            dataset_file = utils.dataset_path(input_dir + str(shard_id) + '/', dataset_name, image_dims)
            with h5py.File(dataset_file, 'r') as sh5f:
                shard_x_ds = sh5f['X']
                shard_y_ds = sh5f['Y']
                le = len(shard_x_ds)
                pe = pb + le
                # Separators added so the input range no longer reads as one number (e.g. "019").
                logger.debug('output {} {} input {} {}'.format(pb, pe, 0, le))
                x_ds[pb:pe] = shard_x_ds[0:le]
                y_ds[pb:pe] = shard_y_ds[0:le]
                pb = pe
            ts.stop()
    t.stop()

    t = Timer('Output dataset validations')
    # The original line had an extra closing parenthesis here (SyntaxError).
    if not utils.validate_dataset(output_dir, dataset_name, image_dims, save_dir=output_dir + 'images'):
        raise Exception('Validation ERROR!')
    t.stop()

In [9]:
# Driver cell: merge every shard found under INPUT_FOLDER into OUTPUT_FOLDER.
logger.info('==== PROCESSING SHARDS MERGE ====')

# Keyword arguments make the mapping between constants and parameters explicit.
start_processing(
    input_dir=INPUT_FOLDER,
    nr_shards=NR_SHARDS,
    image_dims=IMAGE_DIMS,
    output_dir=OUTPUT_FOLDER,
)

logger.info('==== ALL DONE ====')

2017-02-22 02:24:40,134 INFO ==== PROCESSING SHARDS MERGE ====
2017-02-22 02:24:40,136 INFO Merging shard results. nr_shards=2 input_dir=../../output/step3/ output_dir=../../output/step4/
2017-02-22 02:24:40,137 INFO > [started] Preparing output dir...
2017-02-22 02:24:40,403 INFO > [started] Count total patients among shards...
2017-02-22 02:24:40,404 INFO shard_id=1 shape=(19, 224, 152, 224, 1)
2017-02-22 02:24:40,405 INFO VALIDATING DATASET ../../output/step3/1/data-centered-rotated-224-152-224.h5
2017-02-22 02:24:40,731 INFO Summary
2017-02-22 02:24:40,732 INFO X shape=(19, 224, 152, 224, 1)
2017-02-22 02:24:40,733 INFO Y shape=(19, 2)
2017-02-22 02:24:40,734 INFO Y: total: 19
2017-02-22 02:24:40,734 INFO Y: label 0: 14.0 73.6842105263%
2017-02-22 02:24:40,735 INFO Y: label 1: 5.0 26.3157894737%
2017-02-22 02:24:40,736 INFO Recording sample data
2017-02-22 02:24:40,737 INFO patient_index 0
2017-02-22 02:24:40,738 INFO x=
2017-02-22 02:24:40,738 INFO patient_index 6
2017-02-22 02:24