In [6]:
# --- Shard-merge configuration ---

# Dataset name handed to utils.dataset_path / utils.validate_dataset.
DATASET_NAME = 'data-centered-rotated'

# Per-patient volume shape: (depth, height, width, channels).
IMAGE_DIMS = (312, 212, 312, 1)

# Shards live in sub-folders numbered 1..NR_SHARDS under INPUT_FOLDER.
NR_SHARDS = 700

# Both folders must end with '/': paths are built by plain string concatenation.
INPUT_FOLDER = '../../../output/kaggle-bowl/step3/'
OUTPUT_FOLDER = '../../../output/kaggle-bowl/step4/'

In [7]:
import sys
import h5py
from random import shuffle
import numpy as np
from numpy import ndarray
import datetime
import logging

from modules.logging import logger
import modules.logging
import modules.lungprepare as lungprepare
import modules.utils as utils
from modules.utils import Timer
import modules.logging

In [10]:
def start_processing(input_dir, nr_shards, image_dims, output_dir, dataset_name=None):
    """Merge the per-shard HDF5 datasets found under ``input_dir`` into one dataset.

    The work is done in three phases:

    1. Count: every shard ``1..nr_shards`` is opened, the length of its ``X``
       dataset is added to the total, and the shard is validated with
       ``utils.validate_dataset``. Shards whose ``X`` cannot be read are
       recorded as unusable and skipped (not validated, not merged).
    2. Merge: an output file with datasets ``X`` (one chunk per patient) and
       ``Y`` (two columns) is created and each usable shard is copied into
       the next free row range.
    3. Validate: the merged dataset is validated, with sample output saved to
       ``output_dir + 'images'``.

    Args:
        input_dir: folder containing one sub-folder per shard, named by shard
            id. Must end with a path separator (paths are concatenated).
        nr_shards: number of shards; ids run from 1 to ``nr_shards`` inclusive.
        image_dims: 4-tuple describing one patient volume; used for the
            dataset shape and passed to ``utils.dataset_path``.
        output_dir: destination folder, recreated from scratch. Must end with
            a path separator.
        dataset_name: dataset name passed to ``utils.dataset_path``; defaults
            to the module-level ``DATASET_NAME``.

    Raises:
        Exception: if a usable shard or the merged dataset fails validation.
        ValueError: if no patients were found in any shard.
    """
    if dataset_name is None:
        dataset_name = DATASET_NAME

    logger.info('Merging shard results. nr_shards=' + str(nr_shards) + ' input_dir='+ str(input_dir) + ' output_dir=' + output_dir)

    t = Timer('Preparing output dir')
    utils.mkdirs(output_dir, dirs=['images'], recreate=True)
    modules.logging.setup_file_logger(output_dir + 'out.log')
    # This timer was previously never stopped, so its "[done]" line never appeared.
    t.stop()

    shard_ids = range(1, nr_shards + 1)

    t = Timer('Count total patients among shards')
    total_patients = 0
    unusable_shards = set()
    for shard_id in shard_ids:
        dataset_dir = input_dir + str(shard_id) + '/'
        dataset_file = utils.dataset_path(dataset_dir, dataset_name, image_dims)
        with h5py.File(dataset_file, 'r') as h5f:
            try:
                shard_x_ds = h5f['X']
                logger.info('shard_id={} shape={}'.format(shard_id, shard_x_ds.shape))
                total_patients = total_patients + len(shard_x_ds)
            except Exception:
                # Best-effort: a shard without a readable 'X' is skipped, but
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                logger.warning('no data on shard ' + str(shard_id))
                unusable_shards.add(shard_id)
                continue
        # Only shards that yielded data are validated (unusable ones `continue` above).
        if not utils.validate_dataset(dataset_dir, dataset_name, image_dims):
            raise Exception('Validation ERROR!')
    t.stop()

    logger.info('total_patients=' + str(total_patients))

    if total_patients == 0:
        # create_dataset would reject a 1-patient chunk on a 0-length dataset
        # with a much less helpful message; fail early and explicitly instead.
        raise ValueError('No patients found in any shard under ' + str(input_dir) + '; nothing to merge')

    t = Timer('Creating output merged dataset')
    output_dataset_file = utils.dataset_path(output_dir, dataset_name, image_dims)
    patient_shape = (image_dims[0], image_dims[1], image_dims[2], image_dims[3])
    with h5py.File(output_dataset_file, 'w') as h5f:
        # One chunk per patient volume.
        x_ds = h5f.create_dataset('X', (total_patients,) + patient_shape, chunks=(1,) + patient_shape, dtype='f')
        y_ds = h5f.create_dataset('Y', (total_patients, 2), dtype='f')

        logger.info('Merging shards')
        pb = 0  # first free row in the output datasets
        for shard_id in shard_ids:
            if shard_id in unusable_shards:
                logger.warning('skipping unusable shard ' + str(shard_id))
                continue
            ts = Timer('Processing shard ' + str(shard_id))
            dataset_file = utils.dataset_path(input_dir + str(shard_id) + '/', dataset_name, image_dims)
            with h5py.File(dataset_file, 'r') as sh5f:
                shard_x_ds = sh5f['X']
                shard_y_ds = sh5f['Y']
                le = len(shard_x_ds)
                if le > 0:
                    pe = pb + le
                    logger.debug('output {}:{} input 0:{}'.format(pb, pe, le))
                    x_ds[pb:pe] = shard_x_ds[0:le]
                    y_ds[pb:pe] = shard_y_ds[0:le]
                    pb = pe
                else:
                    logger.warning('shard ' + str(shard_id) + ' skipped because it has no data')
            ts.stop()
    t.stop()

    t = Timer('Output dataset validations')
    if not utils.validate_dataset(output_dir, dataset_name, image_dims, save_dir=output_dir + 'images'):
        raise Exception('Validation ERROR!')
    t.stop()

In [None]:
# Run the full shard merge using the constants from the configuration cell.
# NOTE: start_processing recreates OUTPUT_FOLDER (mkdirs with recreate=True),
# and the recorded run below spans 23:47 -> 03:34, so re-running is costly.
logger.info('==== PROCESSING SHARDS MERGE ====')
start_processing(INPUT_FOLDER, NR_SHARDS, IMAGE_DIMS, OUTPUT_FOLDER)
logger.info('==== ALL DONE ====')

2017-03-12 23:47:20,689 INFO ==== PROCESSING SHARDS MERGE ====
2017-03-12 23:47:20,690 INFO Merging shard results. nr_shards=700 input_dir=../../../output/kaggle-bowl/step3/ output_dir=../../../output/kaggle-bowl/step4/
2017-03-12 23:47:20,691 INFO > [started] Preparing output dir...
2017-03-12 23:47:20,692 INFO > [started] Count total patients among shards...
2017-03-12 23:47:20,693 INFO shard_id=1 shape=(3, 312, 212, 312, 1)
2017-03-12 23:47:20,694 INFO VALIDATING DATASET ../../../output/kaggle-bowl/step3/1/data-centered-rotated-312-212-312.h5
2017-03-12 23:47:20,926 INFO Summary
2017-03-12 23:47:20,927 INFO X shape=(3, 312, 212, 312, 1)
2017-03-12 23:47:20,928 INFO Y shape=(3, 2)
2017-03-12 23:47:20,928 INFO Y: total: 3
2017-03-12 23:47:20,929 INFO Y: label 0: 3.0 100.0%
2017-03-12 23:47:20,930 INFO Y: label 1: 0.0 0.0%
2017-03-12 23:47:20,930 INFO Recording sample data
2017-03-12 23:47:20,931 INFO patient_index 0
2017-03-12 23:47:20,932 INFO x=
2017-03-12 23:47:20,932 INFO patient_

data-centered-rotated[ 1.  0.]


2017-03-13 03:34:42,662 INFO y=[ 1.  0.]
2017-03-13 03:34:42,663 INFO patient_index 464
2017-03-13 03:34:42,664 INFO x=


data-centered-rotated[ 0.  1.]


2017-03-13 03:34:45,702 INFO y=[ 0.  1.]
2017-03-13 03:34:45,703 INFO patient_index 928
2017-03-13 03:34:45,704 INFO x=


data-centered-rotated[ 1.  0.]


2017-03-13 03:34:48,656 INFO y=[ 1.  0.]
2017-03-13 03:34:48,657 INFO > [done]    Output dataset validations (3718746.168 ms)
2017-03-13 03:34:48,658 INFO ==== ALL DONE ====
