In [18]:
from collections import namedtuple
import functools # https://docs.python.org/3/library/functools.html
import glob # short for global, is a function that's used to search for files that match a specific file pattern or name.
import os
import csv
import SimpleITK as sitk # https://simpleitk.org/ (We can treat the format of the data files as a black box and use SimpleITK to load them into more familiar NumPy arrays.)
import numpy as np

In [19]:
# Lightweight record describing one nodule candidate.
# Field order matters: tuples compare field by field, so sorting a list of
# these orders by isNodule_bool first, then diameter_mm, and so on.
CandidateInfoTuple = namedtuple(
    'CandidateInfoTuple',
    [
        'isNodule_bool',  # candidate class flag
        'diameter_mm',    # diameter in millimeters
        'series_uid',     # identifier of the CT series the candidate belongs to
        'center_xyz',     # candidate center as an (x, y, z) tuple
    ],
)

### getCandidateInfoList comments

* *@functools.lru_cache(1) - in-memory cache decorator*
* *requireOnDisk_bool=True - defaults to screening out series from data subsets that aren't in place yet.*
* *We construct a set with all series_uids that are present on disk. This will let us use the data, even if we haven't downloaded all of the subsets yet.*
* *For each of the candidate entries for a given series_uid, we loop through the annotations we collected earlier for the same series_uid and see if the two coordinates are
close enough to consider them the same nodule.*
* *If we don’t find diameter information for a nodule, that’s fine; we’ll just treat the nodule as having a 0.0 diameter.*

In [20]:
@functools.lru_cache(1)
def getCandidateInfoList(requireOnDisk_bool=True):
    """Build the list of nodule candidates, largest real nodules first.

    Reads 'data/annotations.csv' and 'data/candidates.csv' and emits exactly
    one CandidateInfoTuple per row of candidates.csv. A candidate takes the
    diameter of the first annotation (same series_uid) whose center lies
    within a quarter of that annotation's diameter on every axis; otherwise
    its diameter is 0.0.

    Args:
        requireOnDisk_bool: when True (default), candidates whose series_uid
            has no matching .mhd file on disk are skipped.

    Returns:
        A list of CandidateInfoTuple sorted in descending order. The result is
        memoized by lru_cache(1), so repeated calls with the same argument
        return the same list object; callers should not mutate it.
    """
    # NOTE(review): this looks under 'data/luna/subset*' while CT.__init__
    # looks under 'data/subset*' -- confirm which layout is the real one.
    mhd_list = glob.glob('data/luna/subset*/*.mhd') # Return a possibly empty list of path names that match pathname
    # File name without directory and without the 4-character '.mhd' suffix.
    presentOnDisk_set = {os.path.split(p)[-1][:-4] for p in mhd_list}

    # First we need to group our annotations by series_uid, as that's the first key we'll use to cross-reference each row from the two files.
    diameter_dict = {}

    # seriesuid,coordX,coordY,coordZ,diameter_mm - header of 'annotations.csv'
    with open('data/annotations.csv', 'r', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        for row in reader:
            if not row:  # tolerate blank lines
                continue
            series_uid = row[0]
            annotationCenter_xyz = tuple(float(x) for x in row[1:4])
            annotationDiameter_mm = float(row[4])

            diameter_dict.setdefault(series_uid, []).append(
                (annotationCenter_xyz, annotationDiameter_mm)
            )

    # Now we'll build our full list of candidates using the information in the candidates.csv file.
    candidateInfo_list = []

    # seriesuid,coordX,coordY,coordZ,class - header of 'candidates.csv'
    with open('data/candidates.csv', 'r', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        for row in reader:
            if not row:  # tolerate blank lines
                continue
            series_uid = row[0]

            # If a series_uid isn't present, it's in a subset we don't have on disk, so we should skip it.
            if requireOnDisk_bool and series_uid not in presentOnDisk_set:
                continue

            isNodule = bool(int(row[4]))
            candidateCenter_xyz = tuple(float(x) for x in row[1:4])

            # Stays 0.0 when no annotation matches this candidate.
            candidateDiameter_mm = 0.0
            for annotationCenter_xyz, annotationDiameter_mm in diameter_dict.get(series_uid, []):
                # Divides the diameter by 2 to get the radius, and divides the radius by 2 to require that
                # the two center points not be too far apart relative to the size of the nodule.
                # (This results in a bounding-box check, not a true distance check.)
                # Every axis must pass; checking only the first axis would wrongly match
                # candidates that are far away in Y or Z.
                tolerance_mm = annotationDiameter_mm / 4
                if all(
                    abs(candidate_coord - annotation_coord) <= tolerance_mm
                    for candidate_coord, annotation_coord in zip(candidateCenter_xyz, annotationCenter_xyz)
                ):
                    candidateDiameter_mm = annotationDiameter_mm
                    break

            # Exactly one entry per candidate row, whether or not an annotation matched.
            candidateInfo_list.append(CandidateInfoTuple(
                isNodule,
                candidateDiameter_mm,
                series_uid,
                candidateCenter_xyz,
            ))

    # This means we have all of the actual nodule samples starting with the largest first,
    # followed by all of the non-nodule samples (which don't have nodule size information).
    candidateInfo_list.sort(reverse=True)
    return candidateInfo_list

* *The 10 subsets we discussed earlier have about 90 CT scans each (888 in total), with every CT scan represented as two files: one with a .mhd extension and one with a .raw extension. The data being split between multiple files is hidden behind the sitk routines, however, and is not something we need to be directly concerned with.*

* *Continuing the `__init__` method, we need to do a bit of cleanup on the ct_a values. CT scan voxels are expressed in Hounsfield units (HU; https://en.wikipedia.org/wiki/Hounsfield_scale), which are odd units; air is –1,000 HU (close enough to 0 g/cc [grams per cubic centimeter] for our purposes), water is 0 HU (1 g/cc), and bone is at least +1,000 HU (2–3 g/cc).*

* *It’s important to know that our data uses the range of –1,000 to +1,000.*

* *Candidate center data is expressed in millimeters, not voxels. We need to transform our coordinates from the millimeter-based coordinate system (X,Y,Z) they’re expressed in, to the voxel-address-based coordinate system (I,R,C) used to take array slices from our CT scan data.*

# ![title](images/convertToIRC.png)

When dealing with CT scans, we refer to the array dimensions as index, row, and column, because a separate meaning exists for X, Y, and Z, as illustrated in figure 10.6. The patient coordinate system defines positive X to be patient-left (left), positive Y to be patient-behind (posterior), and positive Z to be toward-patient-head (superior). **Left-posterior-superior is sometimes abbreviated LPS.**

# ![title](images/figure10_6.png)

* The patient coordinate system is measured in millimeters and has an arbitrarily positioned origin that does not correspond to the origin of the CT voxel array.

# ![title](images/figure10_7.png)

When plotted using square pixels, the non-cubic voxels can end up looking somewhat distorted, similar to the distortion near the north and south poles when using a Mercator projection map. We will need to apply a scaling factor if we want the images to depict realistic proportions.

* CTs are commonly 512 rows by 512 columns, with the index dimension ranging from around 100 total slices up to perhaps 250 slices (250 slices times 2.5 millimeters is typically enough to contain the anatomical region of interest). This results in a lower bound of approximately $2^{25}$ voxels, or about 32 million data points. Each CT specifies the voxel size in millimeters as part of the file metadata.

Do the math manually:
* Flip the coordinates from IRC to CRI, to align with XYZ.
* Scale the indices with the voxel sizes.
* Matrix-multiply with the directions matrix, using @ in Python.
* Add the offset for the origin.

To go back from XYZ to IRC, we need to perform the inverse of each step in the reverse order.

In [22]:
class CT:
    """A single CT scan loaded from disk.

    Instance attributes set by __init__:
        series_uid: the identifier passed to the constructor.
        hu_a: float32 NumPy array of the scan's voxel values, clipped in
            place to the range [-1000, 1000].
    """

    def __init__(self, series_uid):
        """Locate and load the .mhd file for `series_uid`.

        Raises:
            IndexError: if no 'data/subset*/<series_uid>.mhd' file exists.
        """
        # We don't care to track which subset a given series_uid is in, so we wildcard the subset.
        # NOTE(review): getCandidateInfoList globs 'data/luna/subset*/*.mhd' while this
        # uses 'data/subset*' -- confirm which layout is the real one.
        mhd_pattern = 'data/subset*/{}.mhd'.format(series_uid)
        mhd_paths = glob.glob(mhd_pattern)
        if not mhd_paths:
            # Same exception type as indexing an empty list, but with a message
            # that says which series is missing and where we looked.
            raise IndexError(
                'no .mhd file found for series_uid {!r} (pattern {!r})'.format(
                    series_uid, mhd_pattern
                )
            )
        mhd_path = mhd_paths[0]

        # sitk.ReadImage implicitly consumes the .raw file in addition to the passed-in .mhd file.
        ct_mhd = sitk.ReadImage(mhd_path)

        # Recreates an np.array since we want to convert the value type to np.float32
        # ct_a is a three-dimensional array.
        ct_a = np.array(sitk.GetArrayFromImage(ct_mhd), dtype=np.float32)

        # Capping the values between -1000 and 1000 (in place: the result is written back into ct_a).
        ct_a.clip(-1000, 1000, out=ct_a)

        self.series_uid = series_uid
        self.hu_a = ct_a

        