In [1]:
# Notebook configuration. All values below are passed to start_processing() in the last cell.

# Dimensions of the stored input volumes. The first three values are used to build the
# input file name, and each value is divided into the matching OUTPUT_IMAGE_DIMS entry
# to obtain the per-axis resize factor.
# NOTE(review): presumably three spatial axes plus a channel axis -- confirm axis order.
INPUT_IMAGE_DIMS = (224, 152, 224, 1)
# Target dimensions of the volumes written to the train/validate/test datasets.
OUTPUT_IMAGE_DIMS = (50, 34, 50, 1)

# Suffix appended to 'train', 'validate' and 'test' to form the output dataset names.
DATASET_NAME = '-centered-rotated'

# Both folders are joined to file names by plain string concatenation,
# so the trailing slash is required.
INPUT_FOLDER = '../../input/stage1_some/'
# NOTE(review): start_processing() calls utils.mkdirs(..., recreate=True) on this folder;
# presumably existing content is discarded -- confirm before pointing it at real results.
OUTPUT_FOLDER = '../../output/step5/'

In [2]:
import sys
import h5py
import numpy as np
import shutil
import os
from scipy.ndimage.interpolation import rotate
from scipy.ndimage.interpolation import shift
import scipy
import datetime
import logging

from modules.logging import logger
import modules.logging
import modules.lungprepare as lungprepare
import modules.utils as utils
from modules.utils import Timer
import modules.logging

In [3]:
def create_xy_datasets(output_dir, name, image_dims, size):
    """Create a new HDF5 file holding empty 'X' and 'Y' datasets.

    The file path is resolved by ``utils.dataset_path(output_dir, name, image_dims)``
    and opened in write mode, so an existing file at that path is truncated.

    Args:
        output_dir: directory handed to ``utils.dataset_path``.
        name: dataset name handed to ``utils.dataset_path``.
        image_dims: sequence whose first three entries give the per-sample
            volume dimensions; a trailing axis of length 1 is always appended.
        size: number of samples to allocate. Integral floats (e.g. ``110.0``)
            are accepted and converted to ``int``.

    Returns:
        Tuple ``(h5f, x_ds, y_ds)``: the open ``h5py.File``, the 'X' dataset of
        shape ``(size, d0, d1, d2, 1)`` chunked one sample per chunk, and the
        'Y' dataset of shape ``(size, 2)``. Both use dtype ``'f'``.
        The caller is responsible for closing ``h5f``.
    """
    # Callers compute `size` with round() on numpy floats, which can yield a float
    # (the run log shows 110.0); dataset shapes must be integers.
    size = int(size)

    dataset_file = utils.dataset_path(output_dir, name, image_dims)
    h5f = h5py.File(dataset_file, 'w')
    try:
        sample_shape = (image_dims[0], image_dims[1], image_dims[2], 1)
        x_ds = h5f.create_dataset('X', (size,) + sample_shape, chunks=(1,) + sample_shape, dtype='f')
        y_ds = h5f.create_dataset('Y', (size, 2), dtype='f')
    except Exception:
        # Do not leak the open file handle when dataset creation fails.
        h5f.close()
        raise

    logger.debug('input x shape={}'.format(x_ds.shape))
    return h5f, x_ds, y_ds

In [4]:
def distribute(input_x_ds, input_y_ds, output_x_ds, output_y_ds, qtty_label0, qtty_label1, non_usable_patients, resize_factor):
    """Copy resized samples from the input datasets into the output datasets.

    Input samples are visited in index order. A sample is taken when its label
    has ``label[0] == 1`` and fewer than ``qtty_label0`` such samples were taken,
    or otherwise when ``label[1] == 1`` and fewer than ``qtty_label1`` were taken.
    Each taken sample is resized with ``scipy.ndimage.zoom`` and written to the
    next free slot of the output datasets together with its label.

    Args:
        input_x_ds: indexable collection of sample volumes (3-D, or 4-D with a
            trailing axis); each item must be a numpy array.
        input_y_ds: indexable collection of labels, aligned with ``input_x_ds``.
        output_x_ds: destination for resized volumes; its length is the capacity.
        output_y_ds: destination for the labels of the written volumes.
        qtty_label0: maximum number of ``label[0] == 1`` samples to take.
        qtty_label1: maximum number of ``label[1] == 1`` samples to take.
        non_usable_patients: list of input indices to skip. It is mutated: the
            index of every sample written here is appended, so later calls
            sharing the same list do not reuse it.
        resize_factor: sequence of per-axis zoom factors; only as many leading
            entries as the sample has axes are used.

    Returns:
        None. Results are written into ``output_x_ds`` / ``output_y_ds``.
    """
    count_label0 = 0
    count_label1 = 0
    output_index = 0
    capacity = len(output_x_ds)

    # Snapshot for O(1) membership tests. Indices appended during this call are
    # always the current `i`, which the loop never revisits, so the snapshot
    # stays valid for the whole call.
    already_used = set(non_usable_patients)

    for i in range(len(input_x_ds)):

        # output_index always equals count_label0 + count_label1.
        if output_index >= capacity:
            logger.warning('Reached max elements in output dataset. Stopping distribution')
            break

        if i in already_used:
            continue

        label = input_y_ds[i]

        if label[0] == 1 and count_label0 < qtty_label0:
            count_label0 += 1
        elif label[1] == 1 and count_label1 < qtty_label1:
            count_label1 += 1
        else:
            # Quota for this label is exhausted, or the label matches neither class.
            continue

        ts = Timer('Resizing patient pixels ' + str(i))
        volume = input_x_ds[i]
        # zoom() needs exactly one factor per axis. Slicing by the sample's rank
        # supports both 3-D volumes (as in the current input file) and 4-D
        # volumes that already carry a trailing axis.
        image_pixels = scipy.ndimage.zoom(volume, resize_factor[:volume.ndim])
        ts.stop()

        if image_pixels.ndim == 3:
            # Output samples carry an explicit trailing axis of length 1.
            image_pixels = np.expand_dims(image_pixels, axis=3)

        output_x_ds[output_index] = image_pixels
        output_y_ds[output_index] = label
        output_index += 1
        non_usable_patients.append(i)


In [7]:
def _distribute_split(split, input_x_ds, input_y_ds, output_dir, output_image_dims, split_qtty, qtty_label0, qtty_label1, distributed_patients, resize_factor):
    """Create the output file for one split ('train', 'validate' or 'test') and fill it.

    Returns the dataset name (``split + DATASET_NAME``) so the caller can validate it.
    """
    dataset_name = split + DATASET_NAME
    split_timer = Timer('Distribute to ' + split + ' dataset')
    logger.debug(split + '_qtty=' + str(split_qtty) + ' qtty_label0=' + str(qtty_label0) + ' qtty_label1=' + str(qtty_label1))
    h5f, x_ds, y_ds = create_xy_datasets(output_dir, dataset_name, output_image_dims, split_qtty)
    try:
        distribute(input_x_ds, input_y_ds, x_ds, y_ds, qtty_label0, qtty_label1, distributed_patients, resize_factor)
    finally:
        # Always release the output file, even when distribution fails midway.
        h5f.close()
    split_timer.stop()
    return dataset_name


def start_processing(input_dir, input_image_dims, output_image_dims, output_dir):
    """Resize the input dataset and split it into train, validate and test datasets.

    Reads ``input_dir + 'data-centered-rotated-<d0>-<d1>-<d2>.h5'`` (datasets 'X'
    and 'Y'), counts the samples per label, and writes three output datasets named
    ``'train' | 'validate' | 'test' + DATASET_NAME`` under ``output_dir``:

    * train: balanced between both labels, limited to 85% of the rarer label
      (times two) and to 80% of all samples;
    * validate: balanced, limited to 50% of what is left of the rarer label
      (times two) and to 50% of the remaining samples;
    * test: everything that was not assigned above, without balancing.

    Every written sample is resized by ``output_image_dims / input_image_dims``
    per axis. Each output dataset is checked with ``utils.validate_dataset``.

    Args:
        input_dir: folder of the input file; must end with a path separator.
        input_image_dims: 4-item dimensions of the stored input volumes.
        output_image_dims: 4-item dimensions of the resized volumes.
        output_dir: output folder (string ending with a path separator). It is
            prepared with ``utils.mkdirs(..., recreate=True)``.

    Raises:
        ValueError: if 'X' and 'Y' differ in length or the input is empty.
    """
    logger.info('Resizing images. input_dir='+ str(input_dir) + ' output_dir=' + output_dir)

    logger.info('Preparing output dir')
    utils.mkdirs(output_dir, dirs=['images'], recreate=True)

    modules.logging.setup_file_logger(output_dir + 'out.log')

    # Kept under its own name: the per-split timers must not shadow it, otherwise
    # the overall timer is never stopped.
    overall_timer = Timer('Distributing input dataset among train, validate and test datasets')
    resize_factor = tuple(output_image_dims[axis] / input_image_dims[axis] for axis in range(4))
    dataset_file = input_dir + 'data-centered-rotated-{}-{}-{}.h5'.format(input_image_dims[0], input_image_dims[1], input_image_dims[2])
    images_dir = output_dir + 'images'

    with h5py.File(dataset_file, 'r') as input_h5f:
        logger.info('input x shape={}'.format(input_h5f['X'].shape))
        input_x_ds = input_h5f['X']
        input_y_ds = input_h5f['Y']
        len_input_x_ds = len(input_x_ds)
        len_input_y_ds = len(input_y_ds)

        if len_input_x_ds != len_input_y_ds:
            raise ValueError('X and Y datasets have different sizes!')
        if len_input_y_ds == 0:
            # The ratios below divide by the sample count.
            raise ValueError('Input dataset is empty')

        logger.info('Calculating label 0|1 proportion')
        # 'Y' holds one row of two values per sample; the column sums are the
        # per-label counts. Accumulate in float64 regardless of the stored dtype.
        label_counts = np.sum(input_y_ds[:], axis=0, dtype=np.float64)
        label0_total = label_counts[0]
        label1_total = label_counts[1]

        label0_ratio = label0_total / len_input_y_ds
        label1_ratio = label1_total / len_input_y_ds

        logger.info('Y: total: ' + str(len_input_y_ds))
        logger.info('Y: label 0: ' + str(label0_total) + ' ' + str(100*label0_ratio) + '%')
        logger.info('Y: label 1: ' + str(label1_total) + ' ' + str(100*label1_ratio) + '%')

        logger.info('Calculate patient distribution among datasets')

        # Shared across the three splits so no input sample is used twice.
        distributed_patients = []
        total_patients = len_input_x_ds

        #DISTRIBUTE TO TRAIN DATASET
        # int(): round() on a numpy float may return a float, but the quantity is
        # used as a dataset size.
        train_qtty = int(round(min(min(label0_total*0.85, label1_total*0.85)*2, total_patients*0.8)))
        qtty_label0 = int(round(train_qtty/2))
        qtty_label1 = train_qtty - qtty_label0
        dataset_name = _distribute_split('train', input_x_ds, input_y_ds, output_dir, output_image_dims, train_qtty, qtty_label0, qtty_label1, distributed_patients, resize_factor)

        total_patients = total_patients - train_qtty
        label0_total = label0_total - qtty_label0
        label1_total = label1_total - qtty_label1
        utils.validate_dataset(output_dir, dataset_name, output_image_dims, images_dir)

        #DISTRIBUTE TO VALIDATE DATASET
        validate_qtty = int(round(min(min(label0_total*0.5, label1_total*0.5)*2, total_patients*0.5)))
        qtty_label0 = int(round(validate_qtty/2))
        qtty_label1 = validate_qtty - qtty_label0
        dataset_name = _distribute_split('validate', input_x_ds, input_y_ds, output_dir, output_image_dims, validate_qtty, qtty_label0, qtty_label1, distributed_patients, resize_factor)
        utils.validate_dataset(output_dir, dataset_name, output_image_dims, images_dir)

        total_patients = total_patients - validate_qtty

        #DISTRIBUTE REMAINING TO TEST DATASET
        # No balancing here: each label may fill the whole dataset, so every
        # remaining sample is taken until the dataset is full.
        test_qtty = total_patients
        dataset_name = _distribute_split('test', input_x_ds, input_y_ds, output_dir, output_image_dims, test_qtty, test_qtty, test_qtty, distributed_patients, resize_factor)
        utils.validate_dataset(output_dir, dataset_name, output_image_dims, save_dir=images_dir)

    overall_timer.stop()


In [8]:
# Run the resize-and-split step with the configuration from the first cell.
# NOTE(review): the banner says "SHARDS MERGE", but start_processing() resizes and
# distributes samples -- possibly a leftover from another notebook; confirm wording.
logger.info('==== PROCESSING SHARDS MERGE ====')
start_processing(INPUT_FOLDER, INPUT_IMAGE_DIMS, OUTPUT_IMAGE_DIMS, OUTPUT_FOLDER)
logger.info('==== ALL DONE ====')

2017-02-22 02:35:57,989 INFO ==== PROCESSING SHARDS MERGE ====
2017-02-22 02:35:57,991 INFO Resizing images. input_dir=../../input/stage1_some/ output_dir=../../output/step5/
2017-02-22 02:35:57,993 INFO Preparing output dir
2017-02-22 02:35:58,259 INFO > [started] Distributing input dataset among train, validate and test datasets...
2017-02-22 02:35:58,262 INFO input x shape=(231, 224, 152, 224)
2017-02-22 02:35:58,264 INFO Calculating label 0|1 proportion
2017-02-22 02:35:58,292 INFO Y: total: 231
2017-02-22 02:35:58,293 INFO Y: label 0: 166.0 71.8614718615%
2017-02-22 02:35:58,294 INFO Y: label 1: 65.0 28.1385281385%
2017-02-22 02:35:58,295 INFO Calculate patient distribution among datasets
2017-02-22 02:35:58,296 INFO > [started] Distribute to train dataset...
2017-02-22 02:35:58,298 DEBUG train_qtty=110.0 qtty_label0=55.0 qtty_label1=55.0
2017-02-22 02:35:58,303 DEBUG input x shape=(110, 50, 34, 50, 1)
2017-02-22 02:35:58,305 INFO > [started] Resizing patient pixels 0...
2017-02-2

TypeError: Can't convert 'int' object to str implicitly