In [None]:
"""Parsing code for DICOMS and contour files"""

import dicom
from dicom.errors import InvalidDicomError

import numpy as np
from PIL import Image, ImageDraw


def parse_contour_file(filename):
    """Parse the given contour filename

    Every non-blank line is split on whitespace and its first two fields are
    read as the x and y coordinates. Blank or whitespace-only lines (for
    example a trailing newline at the end of the file) are skipped rather
    than crashing the parse.

    :param filename: filepath to the contourfile to parse
    :return: list of tuples holding x, y coordinates of the contour
    :raises IndexError: if a non-blank line holds fewer than two fields
    :raises ValueError: if a coordinate field cannot be converted to float
    """

    coords_lst = []

    with open(filename, 'r') as infile:
        for line in infile:
            coords = line.split()

            # An empty field list means the line was blank; nothing to record.
            if not coords:
                continue

            x_coord = float(coords[0])
            y_coord = float(coords[1])
            coords_lst.append((x_coord, y_coord))

    return coords_lst


def parse_dicom_file(filename):
    """Parse the given DICOM filename

    Reads the pixel array and applies the linear rescale
    ``pixel * RescaleSlope + RescaleIntercept``. A missing RescaleSlope is
    treated as 1.0 and a missing RescaleIntercept as 0.0, so a file that
    carries only one of the two attributes is still rescaled correctly; the
    multiplication is skipped only when the transform is the identity.

    :param filename: filepath to the DICOM file to parse
    :return: dictionary with DICOM image data under the key 'pixel_data',
     or None if the file is not a valid DICOM file
    """

    try:
        dcm = dicom.read_file(filename)
        dcm_image = dcm.pixel_array
    except InvalidDicomError:
        return None

    # Identity defaults: an absent attribute must not disable the other one.
    intercept = getattr(dcm, 'RescaleIntercept', 0.0)
    slope = getattr(dcm, 'RescaleSlope', 1.0)

    if slope != 1.0 or intercept != 0.0:
        dcm_image = dcm_image*slope + intercept

    dcm_dict = {'pixel_data' : dcm_image}
    return dcm_dict


def poly_to_mask(polygon, width, height):
    """Rasterize a polygon into a boolean mask

    :param polygon: polygon vertices as a list of x, y pairs
     [(x1, y1), (x2, y2), ...], given in units of pixels
    :param width: scalar image width
    :param height: scalar image height
    :return: Boolean mask of shape (height, width)
    """

    # http://stackoverflow.com/a/3732128/1410871
    # Draw on an all-zero mode 'L' canvas: the fill is painted with 1 and the
    # outline with 0, then every non-zero pixel becomes True.
    canvas = Image.new('L', (width, height), 0)
    painter = ImageDraw.Draw(canvas)
    painter.polygon(polygon, fill=1, outline=0)
    return np.array(canvas).astype(bool)


In [1]:
import subprocess

In [7]:
import sys

In [19]:
contour = "120.5 137.5 \n120.5 137.0 \n121.0 136.5"

In [20]:
with open('tmp.txt', 'w') as f:
    f.write(contour)

In [21]:
f.close()  # redundant: the `with` block that opened f has already closed it

In [28]:
from PIL import Image, ImageDraw
import numpy as np

In [29]:
def poly_to_mask(polygon, width, height):
    """Rasterize a polygon into a boolean mask

    :param polygon: polygon vertices as a list of x, y pairs
     [(x1, y1), (x2, y2), ...], given in units of pixels
    :param width: scalar image width
    :param height: scalar image height
    :return: Boolean mask of shape (height, width)
    """

    # http://stackoverflow.com/a/3732128/1410871
    # Draw on an all-zero mode 'L' canvas: the fill is painted with 1 and the
    # outline with 0, then every non-zero pixel becomes True.
    canvas = Image.new('L', (width, height), 0)
    painter = ImageDraw.Draw(canvas)
    painter.polygon(polygon, fill=1, outline=0)
    return np.array(canvas).astype(bool)

In [30]:
poly = poly_to_mask([(9, 9), (9, 20), (20, 20), (20, 9)], 30, 30)


In [34]:
# poly is already a boolean mask, so it can be handed to np.where directly
np.where(poly)

(array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11,
        11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13,
        13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15,
        15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16,
        16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19]),
 array([10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 10, 11, 12, 13,
        14, 15, 16, 17, 18, 19, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 10,
        11, 12, 13, 14, 15, 16, 17, 18, 19, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 10, 11, 12, 13, 14,
        15, 16, 17, 18, 19, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]))

In [32]:
poly[9]

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False], dtype=bool)

In [36]:
poly[10][11]

True

In [39]:
np.ones((4,4), dtype=bool)

array([[ True,  True,  True,  True],
       [ True,  True,  True,  True],
       [ True,  True,  True,  True],
       [ True,  True,  True,  True]], dtype=bool)

In [40]:
# Vertices listed in a self-intersecting ("bowtie") order: the edges
# (0,0)-(5,5) and (0,5)-(5,0) cross, so this does not trace a square outline.
coords = [(0,0), (5,5), (0,5), (5,0)]

In [41]:
polygon = poly_to_mask(coords,10,10)

In [43]:
# Compare the 3x3 window with an all-True array of the SAME shape.  The
# earlier (10, 10) operand could not broadcast against a (3, 3) slice, so
# `==` fell back to a scalar False with a warning instead of comparing values.
np.array_equal(polygon[1:4, 1:4], np.ones((3, 3), dtype=bool))

  """Entry point for launching an IPython kernel.


False

In [48]:
polygon

array([[False, False, False, False, False, False, False, False, False,
        False],
       [False, False,  True,  True, False, False, False, False, False,
        False],
       [False, False, False, False, False, False, False, False, False,
        False],
       [False, False, False, False, False, False, False, False, False,
        False],
       [False, False,  True,  True, False, False, False, False, False,
        False],
       [False, False, False, False, False, False, False, False, False,
        False],
       [False, False, False, False, False, False, False, False, False,
        False],
       [False, False, False, False, False, False, False, False, False,
        False],
       [False, False, False, False, False, False, False, False, False,
        False],
       [False, False, False, False, False, False, False, False, False,
        False]], dtype=bool)

In [49]:
# Reference mask: 10x10 grid, all False except rows 1-4 x columns 1-4.
np_mask = np.full((10, 10), False, dtype=bool)
np_mask[1:5, 1:5].fill(True)

In [51]:
np_mask.shape

(10, 10)

In [54]:
polygon

array([[False, False, False, False, False, False, False, False, False,
        False],
       [False, False,  True,  True, False, False, False, False, False,
        False],
       [False, False, False, False, False, False, False, False, False,
        False],
       [False, False, False, False, False, False, False, False, False,
        False],
       [False, False,  True,  True, False, False, False, False, False,
        False],
       [False, False, False, False, False, False, False, False, False,
        False],
       [False, False, False, False, False, False, False, False, False,
        False],
       [False, False, False, False, False, False, False, False, False,
        False],
       [False, False, False, False, False, False, False, False, False,
        False],
       [False, False, False, False, False, False, False, False, False,
        False]], dtype=bool)

In [55]:
import multiprocessing as mp

In [56]:
exec_pool = mp.Pool(10)

In [80]:
# Show the documentation for Pool.apply (the method the captured output describes)
help(exec_pool.apply)

Help on method apply in module multiprocessing.pool:

apply(self, func, args=(), kwds={}) method of multiprocessing.pool.Pool instance
    Equivalent of `apply()` builtin



In [57]:
x = [1,2,3]
y = [4,5]

In [69]:
x = np.array(x)

In [74]:
z = (set(x.tolist()))

In [75]:
import math
# NOTE(review): incomplete attribute access, apparently left over from
# tab-completion; the math module has no attribute `c`, so this raises
# AttributeError when run -- finish the name or remove the line.
math.c

In [68]:
x.tolist

<function tolist>

In [63]:
x = set(x)
# True when x and y share no elements (same result as comparing the
# intersection with the empty set; the stray trailing `a` was a syntax error)
x.isdisjoint(y)

True

# Part 1
## 1. How did you verify that you are parsing the contours correctly?
I wrote a test case that renders the mask for a mock polygon. While doing so I found that the provided code is sensitive to the ordering of the coordinate pairs, which is probably undesirable. I then checked that the number of files enumerated by the pipeline matched the count returned by my bash command.

## 2. What changes did you make to the code, if any, in order to integrate it into our production code base?
I changed the `parse_dicom_file` function so that it also inserts the image dimensions into the returned dictionary, where they can be used by the mask generation function.

## 3. If the pipeline was going to be run on millions of images, and speed was paramount, how would you parallelize it to run as fast as possible?

I would partition the files across different executors, keyed on the directory session id, using the `multiprocessing` module. Each executor then does the work of building its own partition's mapping of DICOM --> binary mask.
In the end, I would aggregate their results into a complete candidate set of paired inputs for the training pipeline. This is essentially a map-reduce, and it can be scaled out to multiple machines without changing the code much.


## 4. If this pipeline were parallelized, what kinds of error checking and/or safeguards, if any, would you add into the pipeline?
A few things:
1. I would want to make sure that file directories are strictly partitioned across worker tasks so that no two of them try to process the same file at once; otherwise we could get redundant data in the returned tuples. (I don't think the concurrent reads themselves would actually be problematic.)
2. I would have to make sure that, within an epoch of training, no data indices are reused across worker threads.


# Part 2