In [1]:
import sys
import os
import random
import os
import numpy as np

from shutil import copyfile
from PIL import Image
from tqdm import tqdm

In [36]:
# whether to use smaller training set, and the fraction of files to keep
USE_SMALL_DATA = False
SMALL_DATA_CUTOFF = 0.05

# True: save each input as one 3D array; False: save one 2D array per slice
SPLIT_3D = True

# train / val / test split, expressed as cumulative fractions of the shuffled
# file list: [0, TRAIN_CUTOFF) -> train, [TRAIN_CUTOFF, VAL_CUTOFF) -> val,
# [VAL_CUTOFF, 1] -> test
TRAIN_CUTOFF = 0.70
VAL_CUTOFF = 0.85

# directory pointing to all .npy files
input_directory = '../../data/FETAL/processed'

# experiment directories (the train/val/test sub-directories are created here)
output_directory = '../../data/FETAL'

# Fail early, with a message that names the directory that is actually missing.
assert os.path.isdir(input_directory), "Couldn't find the dataset at {}".format(input_directory)
assert os.path.isdir(output_directory), "Couldn't find the output directory at {}".format(output_directory)

In [37]:
def normalize_255(a):
    """Rescale `a` so that its maximum maps to 255 and return an integer array.

    Parameters
    ----------
    a : numpy.ndarray
        Numeric array to rescale. It is not modified.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as `a` and dtype `int`. Every value is
        multiplied by ``255.0 / a.max()`` and rounded to the nearest integer.
        An empty array, or one whose maximum is 0, is not rescaled and is
        only cast to `int`.
    """
    # `.max()` raises on an empty array, so there is nothing to rescale.
    if a.size == 0:
        return a.astype(int)

    peak = a.max()
    if peak != 0:
        # Round before casting: a bare `astype(int)` truncates towards zero,
        # so 63.75 would become 63 and a floating-point 254.999... would
        # become 254 instead of 255.
        a = np.rint(a * (255.0 / peak))
    return a.astype(int)

def pad_matrix(matrix, max_z):
    """Zero-pad a 3D array along its first axis up to `max_z` slices.

    The zero slices are appended after the existing ones; the other two axes
    are left unchanged. Returns the padded array.
    """
    # Unpacking three values also rejects inputs that are not 3D.
    depth, _, _ = matrix.shape
    missing = max_z - depth
    padding = ((0, missing), (0, 0), (0, 0))
    return np.pad(matrix, padding, mode='constant', constant_values=0)

def slice_and_save(filename, output_dir, max_z=40):
    """Load the 3D array stored in `filename` and write it into `output_dir`.

    The behaviour depends on the module-level `SPLIT_3D` flag:

    * `SPLIT_3D` true: the volume is saved as a single ``<stem>.npy`` file,
      zero-padded along the first axis to `max_z` slices. Volumes with more
      than `max_z` slices, or whose last two dimensions differ, are skipped.
      The volume is saved without `normalize_255` rescaling.
    * `SPLIT_3D` false: every 2D slice along the first axis is rescaled with
      `normalize_255` and saved as ``<stem>_<NNNN>.npy``.

    Parameters
    ----------
    filename : str
        Path of the ``.npy`` file holding a 3D array.
    output_dir : str
        Existing directory that receives the output file(s).
    max_z : int, optional
        Number of slices every saved 3D volume is padded to, and the largest
        depth accepted in 3D mode. Defaults to 40.

    Returns
    -------
    int
        Number of files written; 0 means the volume was skipped.
    """
    raw_matrix = np.load(filename)
    # Unpacking three values raises ValueError for inputs that are not 3D.
    z_slices, width, height = raw_matrix.shape

    # NOTE(review): the stem is cut at the FIRST "." of the base name, so
    # "a.1.npy" and "a.2.npy" would both map to "a" and overwrite each
    # other - confirm input names contain a single dot.
    input_file = os.path.split(filename)[1].split(".")[0]

    if SPLIT_3D:
        # Only volumes that fit in max_z slices and have square slices are kept.
        if z_slices > max_z or width != height:
            return 0
        if z_slices < max_z:
            raw_matrix = pad_matrix(raw_matrix, max_z)
        output_file_path = os.path.join(output_dir, "%s.npy" % (input_file))
        np.save(output_file_path, raw_matrix)
        return 1

    for slice_num, raw_slice in enumerate(raw_matrix):
        output_file_name = "%s_%s.npy" % (input_file, str(slice_num).zfill(4))
        output_file_path = os.path.join(output_dir, output_file_name)
        np.save(output_file_path, normalize_255(raw_slice))
    return z_slices

In [38]:
if __name__ == '__main__':
    # Collect the .npy files found in the input directory
    all_filenames = [os.path.join(input_directory, f)
                     for f in os.listdir(input_directory)
                     if f.endswith('.npy')]

    # Sort first, then shuffle with a fixed seed, so the split is reproducible
    # whatever order os.listdir returns the files in.
    random.seed(230)
    all_filenames.sort()
    random.shuffle(all_filenames)

    # Honour the USE_SMALL_DATA flag: keep only the leading fraction of the
    # shuffled list (a prefix of a shuffled list is itself in random order).
    if USE_SMALL_DATA:
        small_split = int(SMALL_DATA_CUTOFF * len(all_filenames))
        all_filenames = all_filenames[:small_split]

    # Split the files into train / val / test at the cumulative cutoffs
    # (70% / 15% / 15% with the default TRAIN_CUTOFF and VAL_CUTOFF).
    first_split = int(TRAIN_CUTOFF * len(all_filenames))
    second_split = int(VAL_CUTOFF * len(all_filenames))
    train_filenames = all_filenames[:first_split]
    val_filenames = all_filenames[first_split:second_split]
    test_filenames = all_filenames[second_split:]

    filenames = {'train': train_filenames,
                 'val': val_filenames,
                 'test': test_filenames}

    # The three splits must partition the file list exactly.
    assert sum(len(names) for names in filenames.values()) == len(all_filenames)

    # makedirs also creates any missing parent directories
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
    else:
        print("Warning: output dir {} already exists".format(output_directory))

    # Preprocess train, val and test
    for split in ['train', 'val', 'test']:
        output_dir_split = os.path.join(output_directory, split)
        if not os.path.exists(output_dir_split):
            os.makedirs(output_dir_split)
        else:
            # Files left over from an earlier run are not removed here, so a
            # changed seed or cutoff can leave the same input in several splits.
            print("Warning: dir {} already exists".format(output_dir_split))

        print("Processing {} data, saving preprocessed data to {}".format(split, output_dir_split))
        for filename in tqdm(filenames[split]):
            slice_and_save(filename, output_dir_split)

    print("Done building dataset")


  0%|          | 2/3243 [00:00<03:32, 15.27it/s]

Processing train data, saving preprocessed data to ../../data/FETAL/train


100%|██████████| 3243/3243 [05:18<00:00, 10.19it/s]
  0%|          | 2/695 [00:00<00:45, 15.07it/s]

Processing val data, saving preprocessed data to ../../data/FETAL/val


100%|██████████| 695/695 [02:23<00:00,  4.83it/s]
  0%|          | 3/695 [00:00<00:35, 19.77it/s]

Processing test data, saving preprocessed data to ../../data/FETAL/test


100%|██████████| 695/695 [02:01<00:00,  5.72it/s]

Done building dataset



