add data generation, training and prediction code for nodule segmentation using keras
drivendataorg · Oct 2, 2017 · 707e012 · 707e012
1 parent aed9fd5
commit 707e012
Show file tree

Hide file tree

Showing 559 changed files with 505 additions and 106 deletions.
diff --git a/.gitattributes b/.gitattributes
@@ -5,8 +5,8 @@ tests/assets/test_image_data/small/**/*.dcm -filter=lfs -diff=lfs -merge=lfs -te
 test/assets/* filter=lfs diff=lfs merge=lfs -text
 *.dcm filter=lfs diff=lfs merge=lfs -text
 *.h5 filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
 *.hd5 filter=lfs diff=lfs merge=lfs -text
+*.hdf5 filter=lfs diff=lfs merge=lfs -text
 *.mhd filter=lfs diff=lfs merge=lfs -text
 *.raw filter=lfs diff=lfs merge=lfs -text
 *.ckpt filter=lfs diff=lfs merge=lfs -text
diff --git a/compose/prediction/Dockerfile-dev b/compose/prediction/Dockerfile-dev
@@ -7,5 +7,6 @@ RUN ln -s /usr/bin/python3.6 /usr/local/bin/python
 # Requirements have to be pulled and installed here, otherwise caching won't work
 COPY ./prediction/requirements /requirements
 RUN pip install -r /requirements/local.txt
+COPY ./prediction/.pylidcrc /root/.pylidcrc
 
 WORKDIR /app
diff --git a/prediction/.pylidcrc b/prediction/.pylidcrc
@@ -0,0 +1,3 @@
+[dicom]
+path = /images_full
+warn = True
diff --git a/prediction/requirements/local.txt b/prediction/requirements/local.txt
@@ -1,4 +1,4 @@
 -r base.txt
 flake8==3.3.0
 pytest==3.1.3
-pylidc==0.1.8
+pylidc==0.1.9
diff --git a/prediction/src/algorithms/identify/helpers.py b/prediction/src/algorithms/identify/helpers.py
diff --git a/prediction/src/algorithms/identify/prediction.py b/prediction/src/algorithms/identify/prediction.py
@@ -12,8 +12,7 @@
 from keras.metrics import binary_accuracy, binary_crossentropy, mean_absolute_error
 from keras.models import Model
 from keras.optimizers import SGD
-
-from . import helpers
+from src.preprocess.lung_segmentation import rescale_patient_images
 
 CUBE_SIZE = 32
 MEAN_PIXEL_VALUE = 41
@@ -180,11 +179,11 @@ def predict_cubes(model_path, patient_id, magnification=1, ext_name=""):  # noqa
 
         patient_img = load_patient_images(patient_id, wildcard="*_i.png", exclude_wildcards=[])
         if magnification != 1:
-            patient_img = helpers.rescale_patient_images(patient_img, (1, 1, 1), magnification)
+            patient_img = rescale_patient_images(patient_img, (1, 1, 1), magnification)
 
         patient_mask = load_patient_images(patient_id, wildcard="*_m.png", exclude_wildcards=[])
         if magnification != 1:
-            patient_mask = helpers.rescale_patient_images(patient_mask, (1, 1, 1), magnification, is_mask_image=True)
+            patient_mask = rescale_patient_images(patient_mask, (1, 1, 1), magnification, is_mask_image=True)
 
         step = PREDICT_STEP
         CROP_SIZE = CUBE_SIZE
@@ -221,9 +220,6 @@ def predict_cubes(model_path, patient_id, magnification=1, ext_name=""):  # noqa
                     if cube_mask.sum() < 2000:
                         skipped_count += 1
                     else:
-                        if CROP_SIZE != CUBE_SIZE:
-                            cube_img = helpers.rescale_patient_images2(cube_img, (CUBE_SIZE, CUBE_SIZE, CUBE_SIZE))
-
                         img_prep = prepare_image_for_net3D(cube_img)
                         batch_list.append(img_prep)
                         batch_list_coords.append((z, y, x))

diff --git a/prediction/src/algorithms/segment/assets/best_model.hdf5 b/prediction/src/algorithms/segment/assets/best_model.hdf5
diff --git a/prediction/src/algorithms/segment/assets/lung-mask.npy b/prediction/src/algorithms/segment/assets/lung-mask.npy
diff --git a/prediction/src/algorithms/segment/assets/segmented_lung_patient_LIDC-IDRI-0001.npy b/prediction/src/algorithms/segment/assets/segmented_lung_patient_LIDC-IDRI-0001.npy
diff --git a/prediction/src/algorithms/segment/assets/segmented_lung_patient_LIDC-IDRI-0002.npy b/prediction/src/algorithms/segment/assets/segmented_lung_patient_LIDC-IDRI-0002.npy
diff --git a/prediction/src/algorithms/segment/assets/segmented_lung_patient_LIDC-IDRI-0003.npy b/prediction/src/algorithms/segment/assets/segmented_lung_patient_LIDC-IDRI-0003.npy
diff --git a/prediction/src/algorithms/segment/src/__init__.py b/prediction/src/algorithms/segment/src/__init__.py
diff --git a/prediction/src/algorithms/segment/src/data_generation.py b/prediction/src/algorithms/segment/src/data_generation.py
@@ -0,0 +1,61 @@
+import glob
+import os
+
+import numpy as np
+import pylidc as pl
+
+
+def get_dicom_paths(in_docker=True):
+    """Return the paths of all LIDC DICOM directories found below the image root.
+
+    The search roots are relative, so the result depends on the current working directory.
+
+    :param in_docker: if True, search below ``../images_full``; otherwise search below
+    ``../tests/assets/test_image_data/full``.
+    :return: list of paths located two levels below each ``LIDC-IDRI-*`` directory, e.g.
+    ``../images_full/LIDC-IDRI-0001/1.3.6.1.4.1.14519.5.2.1.6279.6001.298806137288633453246975630178/``
+    ``1.3.6.1.4.1.14519.5.2.1.6279.6001.179049373636438705059720603192`` (one single path).
+    """
+    if in_docker:
+        pattern = "../images_full/LIDC-IDRI-*/**/**"
+    else:
+        pattern = "../tests/assets/test_image_data/full/LIDC-IDRI-*/**/**"
+    return glob.glob(pattern)
+
+
+def prepare_training_data():
+    """Create and store one boolean nodule mask per LIDC patient.
+
+    For every path returned by ``get_dicom_paths()`` the pylidc scan of that patient is loaded and a
+    boolean volume with the shape of the scan is built, in which every voxel covered by an annotation
+    with ``malignancy >= 3`` is set to True. The volume is saved as
+    ``segmented_lung_patient_<patient id>.npy`` in the ``assets`` directory next to this package.
+    Patients whose mask file already exists, or for which pylidc has no scan, are skipped.
+    """
+    current_dir = os.path.dirname(os.path.realpath(__file__))
+    assets_dir = os.path.abspath(os.path.join(current_dir, '../assets'))
+
+    dicom_paths = sorted(get_dicom_paths())
+    for path in dicom_paths:
+        # Paths look like '../images_full/LIDC-IDRI-0001/...', so the third component is the patient id.
+        directories = path.split('/')
+        lidc_id = directories[2]
+        # Spell out the '.npy' suffix: np.save appends it when it is missing, so an extension-less
+        # name would make the existence check below look for a file that is never written.
+        lung_patient_file = os.path.join(assets_dir, "segmented_lung_patient_{}.npy".format(lidc_id))
+
+        if os.path.isfile(lung_patient_file):
+            continue
+
+        # Compute and save binary mask with information whether pixel is cancerous
+        scan = pl.query(pl.Scan).filter(pl.Scan.patient_id == lidc_id).first()
+        if scan is None:
+            print("Scan for path '{}' was not found".format(path))
+            continue
+        vol = scan.to_volume(verbose=False)
+
+        # mask_vol is a boolean indicator volume accumulating all selected annotations of the scan.
+        # The builtin `bool` replaces the deprecated alias `np.bool` (same dtype).
+        mask_vol = np.zeros(vol.shape, dtype=bool)
+
+        # Load DICOM files and obtain z-coords for each slice, so we can index into them.
+        dicoms = scan.load_all_dicom_images(verbose=False)
+        zs = [float(img.ImagePositionPatient[2]) for img in dicoms]
+
+        cancerous_annotations = pl.query(pl.Annotation).filter(pl.Annotation.malignancy >= 3,
+                                                               pl.Annotation.scan_id == scan.id).all()
+
+        for annotation in cancerous_annotations:
+            mask, bbox = annotation.get_boolean_mask(return_bbox=True)
+
+            # Obtain indexes of `mask` into `mask_vol`
+            i1, i2 = bbox[0].astype(int)
+            j1, j2 = bbox[1].astype(int)
+
+            # NOTE(review): presumably bbox[2] holds the z-coordinates of the first and last slice of the
+            # annotation; they are matched by exact float equality against the slice positions - confirm.
+            k1 = zs.index(bbox[2, 0])
+            k2 = zs.index(bbox[2, 1])
+
+            # In case the area already was segmented, don't overwrite it but add the annotated segmentation
+            annotation_area = np.index_exp[i1:i2 + 1, j1:j2 + 1, k1:k2 + 1]
+            mask_vol[annotation_area] = np.logical_or(mask, mask_vol[annotation_area])
+        np.save(lung_patient_file, mask_vol)
diff --git a/prediction/src/algorithms/segment/src/model.py b/prediction/src/algorithms/segment/src/model.py
@@ -0,0 +1,139 @@
+import numpy as np
+from keras import backend as K
+from keras.engine import Input, Model
+from keras.layers import Conv3D, MaxPooling3D, UpSampling3D, Activation
+from keras.layers.merge import concatenate
+from keras.optimizers import Adam
+
+
+def simple_model_3d(input_shape, downsize_filters_factor=32, pool_size=(2, 2, 2), n_labels=1,
+                    initial_learning_rate=0.01):
+    """
+    Builds a small 3D model: convolution, max pooling, convolution, upsampling and a final 1x1x1 convolution
+    followed by a sigmoid. The model is compiled with Adam, the dice loss and the dice coefficient as metric.
+    :param input_shape: Shape of the input data, expected as (x_size, y_size, z_size, n_channels).
+    :param downsize_filters_factor: Divisor applied to the base filter counts (32 and 64) of the two
+    convolutions. Making this value larger reduces the number of filters.
+    :param pool_size: Pool size of the max pooling operation; the same factor is used for the upsampling.
+    :param n_labels: Number of output channels of the final convolution.
+    :param initial_learning_rate: Learning rate passed to the Adam optimizer.
+    :return: Compiled, untrained Keras Model
+    """
+    first_filters = int(32 / downsize_filters_factor)
+    second_filters = int(64 / downsize_filters_factor)
+
+    inputs = Input(input_shape)
+    features = Conv3D(first_filters, (3, 3, 3), activation='relu', padding='same')(inputs)
+    features = MaxPooling3D(pool_size=pool_size)(features)
+    features = Conv3D(second_filters, (3, 3, 3), activation='relu', padding='same')(features)
+    features = UpSampling3D(size=pool_size)(features)
+    logits = Conv3D(n_labels, (1, 1, 1))(features)
+    outputs = Activation('sigmoid')(logits)
+
+    model = Model(inputs=inputs, outputs=outputs)
+    optimizer = Adam(lr=initial_learning_rate)
+    model.compile(optimizer=optimizer, loss=dice_coef_loss, metrics=[dice_coef])
+    return model
+
+
+def unet_model_3d(input_shape, downsize_filters_factor=1, pool_size=(2, 2, 2), n_labels=1,
+                  initial_learning_rate=0.01, deconvolution=False):
+    """
+    Builds the 3D UNet Keras model: three pooling levels on the way down, three up-convolutions with skip
+    connections on the way up, and a final 1x1x1 convolution followed by a sigmoid. The model is compiled
+    with Adam, the dice loss and the dice coefficient as metric.
+    :param input_shape: Shape of the input data, expected as (x_size, y_size, z_size, n_channels).
+    :param downsize_filters_factor: Divisor applied to the base filter count of every convolution. Making this
+    value larger will reduce the amount of memory the model will need during training.
+    :param pool_size: Pool size for the max pooling operations.
+    :param n_labels: Number of output channels of the final convolution.
+    :param initial_learning_rate: Learning rate passed to the Adam optimizer.
+    :param deconvolution: If set to True, will use transpose convolution (deconvolution) instead of upsampling.
+    This requires keras_contrib to be installed.
+    :return: Compiled, untrained 3D UNet Model
+    """
+    inputs = Input(input_shape)
+    conv1 = Conv3D(int(32 / downsize_filters_factor), (3, 3, 3), activation='relu', padding='same')(inputs)
+    conv1 = Conv3D(int(64 / downsize_filters_factor), (3, 3, 3), activation='relu', padding='same')(conv1)
+    pool1 = MaxPooling3D(pool_size=pool_size)(conv1)
+
+    conv2 = Conv3D(int(64 / downsize_filters_factor), (3, 3, 3), activation='relu', padding='same')(pool1)
+    conv2 = Conv3D(int(128 / downsize_filters_factor), (3, 3, 3), activation='relu', padding='same')(conv2)
+    pool2 = MaxPooling3D(pool_size=pool_size)(conv2)
+
+    conv3 = Conv3D(int(128 / downsize_filters_factor), (3, 3, 3), activation='relu', padding='same')(pool2)
+    conv3 = Conv3D(int(256 / downsize_filters_factor), (3, 3, 3), activation='relu', padding='same')(conv3)
+    pool3 = MaxPooling3D(pool_size=pool_size)(conv3)
+
+    conv4 = Conv3D(int(256 / downsize_filters_factor), (3, 3, 3), activation='relu', padding='same')(pool3)
+    conv4 = Conv3D(int(512 / downsize_filters_factor), (3, 3, 3), activation='relu', padding='same')(conv4)
+
+    # NOTE(review): the skip connections are concatenated on axis 4, i.e. the last axis of the 5D tensors,
+    # while `input_shape[-3:]` drops the first entry of `input_shape` and keeps the last one. If the last entry
+    # is the channel count, the image shape handed to the deconvolution layers looks wrong - confirm before
+    # using deconvolution=True. The value is ignored when deconvolution=False.
+    up5 = get_upconv(pool_size=pool_size, deconvolution=deconvolution, depth=2,
+                     nb_filters=int(512 / downsize_filters_factor), image_shape=input_shape[-3:])(conv4)
+    up5 = concatenate([up5, conv3], axis=4)
+    conv5 = Conv3D(int(256 / downsize_filters_factor), (3, 3, 3), activation='relu', padding='same')(up5)
+    conv5 = Conv3D(int(256 / downsize_filters_factor), (3, 3, 3), activation='relu', padding='same')(conv5)
+
+    up6 = get_upconv(pool_size=pool_size, deconvolution=deconvolution, depth=1,
+                     nb_filters=int(256 / downsize_filters_factor), image_shape=input_shape[-3:])(conv5)
+    up6 = concatenate([up6, conv2], axis=4)
+    conv6 = Conv3D(int(128 / downsize_filters_factor), (3, 3, 3), activation='relu', padding='same')(up6)
+    conv6 = Conv3D(int(128 / downsize_filters_factor), (3, 3, 3), activation='relu', padding='same')(conv6)
+
+    up7 = get_upconv(pool_size=pool_size, deconvolution=deconvolution, depth=0,
+                     nb_filters=int(128 / downsize_filters_factor), image_shape=input_shape[-3:])(conv6)
+    up7 = concatenate([up7, conv1], axis=4)
+    conv7 = Conv3D(int(64 / downsize_filters_factor), (3, 3, 3), activation='relu', padding='same')(up7)
+    conv7 = Conv3D(int(64 / downsize_filters_factor), (3, 3, 3), activation='relu', padding='same')(conv7)
+
+    conv8 = Conv3D(n_labels, (1, 1, 1))(conv7)
+    act = Activation('sigmoid')(conv8)
+    model = Model(inputs=inputs, outputs=act)
+
+    model.compile(optimizer=Adam(lr=initial_learning_rate), loss=dice_coef_loss, metrics=[dice_coef])
+
+    return model
+
+
+def dice_coef(y_true, y_pred, smooth=1.):
+    """Smoothed dice coefficient of two tensors.
+
+    Both tensors are flattened; the result is ``(2 * sum(y_true * y_pred) + smooth)`` divided by
+    ``(sum(y_true) + sum(y_pred) + smooth)``.
+
+    :param y_true: tensor with the target values.
+    :param y_pred: tensor with the predicted values.
+    :param smooth: constant added to numerator and denominator.
+    :return: scalar tensor holding the coefficient.
+    """
+    truth = K.flatten(y_true)
+    prediction = K.flatten(y_pred)
+    overlap = K.sum(truth * prediction)
+    numerator = 2. * overlap + smooth
+    denominator = K.sum(truth) + K.sum(prediction) + smooth
+    return numerator / denominator
+
+
+def dice_coef_loss(y_true, y_pred):
+    """Return the negated dice coefficient of ``y_true`` and ``y_pred``, for use as a loss."""
+    score = dice_coef(y_true, y_pred)
+    return -score
+
+
+def compute_level_output_shape(filters, depth, pool_size, image_shape):
+    """
+    Each level has a particular output shape based on the number of filters used in that level and the depth or number
+    of max pooling operations that have been done on the data at that point.
+    :param image_shape: shape of the 3d image.
+    :param pool_size: the pool_size parameter used in the max pooling operation.
+    :param filters: Number of filters used by the last node in a given level.
+    :param depth: The number of levels down in the U-shaped model a given node is.
+    :return: 5D vector of the shape of the output node: (None, filters, *scaled image shape)
+    """
+    # Every pooling step divides each axis by its pool size, so after `depth` steps the total factor is
+    # pool_size ** depth (not pool_size * depth, which is only equal for depth 1 and 2 with a pool size of 2).
+    # For depth 0 the factor is 1 and the image shape is returned unchanged.
+    shrink_factor = np.power(pool_size, depth)
+    output_image_shape = np.divide(image_shape, shrink_factor).tolist()
+    return tuple([None, filters] + [int(x) for x in output_image_shape])
+
+
+def get_upconv(depth, nb_filters, pool_size, image_shape, kernel_size=(2, 2, 2), strides=(2, 2, 2),
+               deconvolution=False):
+    """Create the layer that scales a level of the model back up.
+
+    :param depth: level of the model the layer outputs to; the layer input is taken to be at ``depth + 1``.
+    :param nb_filters: number of filters of the deconvolution layer.
+    :param pool_size: size factor of the upsampling layer; also used to compute the level shapes.
+    :param image_shape: shape of the 3d image, used to compute the level shapes.
+    :param kernel_size: kernel size of the deconvolution layer.
+    :param strides: strides of the deconvolution layer.
+    :param deconvolution: if True return a ``Deconvolution3D`` layer from keras_contrib, otherwise an
+    ``UpSampling3D`` layer (in which case only ``pool_size`` is used).
+    :return: the (not yet applied) Keras layer
+    :raises ImportError: if ``deconvolution`` is True and keras_contrib cannot be imported.
+    """
+    if not deconvolution:
+        return UpSampling3D(size=pool_size)
+
+    try:
+        from keras_contrib.layers import Deconvolution3D
+    except ImportError:
+        raise ImportError("Install keras_contrib in order to use deconvolution. Otherwise set deconvolution=False.")
+
+    level_output_shape = compute_level_output_shape(filters=nb_filters, depth=depth,
+                                                    pool_size=pool_size, image_shape=image_shape)
+    level_input_shape = compute_level_output_shape(filters=nb_filters, depth=depth + 1,
+                                                   pool_size=pool_size, image_shape=image_shape)
+    return Deconvolution3D(filters=nb_filters, kernel_size=kernel_size, output_shape=level_output_shape,
+                           strides=strides, input_shape=level_input_shape)