In [4]:
from collections import namedtuple
import functools
import glob
import os
import csv

In [2]:
# Lightweight record describing one candidate location in a scan.
# Field order is significant: named tuples compare field by field, so sorting
# a list of these orders by isNodule_bool first, then by diameter_mm.
CandidateInfoTuple = namedtuple(
    'CandidateInfoTuple',
    ['isNodule_bool', 'diameter_mm', 'series_uid', 'center_xyz'],
)

### getCandidateInfoList comments

* *@functools.lru_cache(1) - in-memory cache decorator*
* *requireOnDisk_bool=True - defaults to screening out series from data subsets that aren't in place yet.*
* *We construct a set with all series_uids that are present on disk. This will let us use the data, even if we haven't downloaded all of the subsets yet.*
* *For each candidate entry for a given series_uid, we loop through the annotations we collected earlier for the same series_uid and check whether the two coordinates are close enough to consider them the same nodule.*
* *If we don’t find diameter information for a nodule, that’s fine; we’ll just treat the nodule as having a 0.0 diameter.*

In [6]:
@functools.lru_cache(1)
def getCandidateInfoList(requireOnDisk_bool=True):
    """Build the list of nodule candidates, real nodules (largest first) first.

    Reads ``data/annotations.csv`` to collect, per series_uid, the annotated
    centers and diameters, then reads ``data/candidates.csv`` and emits one
    ``CandidateInfoTuple`` per candidate row. A candidate inherits the
    diameter of the first annotation of the same series whose center lies
    within a quarter of that annotation's diameter on every axis; otherwise
    its diameter is 0.0.

    Args:
        requireOnDisk_bool: When true (the default), candidates whose
            series_uid has no matching ``data/luna/subset*/<uid>.mhd`` file
            are skipped, so a partial download of the data is still usable.

    Returns:
        A list of ``CandidateInfoTuple`` sorted in descending order, i.e. by
        ``isNodule_bool`` first and ``diameter_mm`` second.

    Note:
        The result is memoised by ``functools.lru_cache``; every caller
        receives the same list object, so it should be treated as read-only.
    """
    mhd_list = glob.glob('data/luna/subset*/*.mhd')
    # Strip the directory and the 4-character '.mhd' suffix to get the uid.
    presentOnDisk_set = {os.path.split(p)[-1][:-4] for p in mhd_list}

    # Group the annotations by series_uid: that is the key used to
    # cross-reference rows of the two CSV files.
    diameter_dict = {}
    with open('data/annotations.csv', 'r', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        for row in reader:
            if not row:  # tolerate blank lines (e.g. a trailing newline)
                continue
            series_uid = row[0]
            annotationCenter_xyz = tuple(float(x) for x in row[1:4])
            annotationDiameter_mm = float(row[4])

            diameter_dict.setdefault(series_uid, []).append(
                (annotationCenter_xyz, annotationDiameter_mm)
            )

    # Build the full candidate list from candidates.csv (NOT annotations.csv:
    # column 4 there is a float diameter, not the 0/1 nodule flag).
    candidateInfo_list = []
    with open('data/candidates.csv', 'r', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        for row in reader:
            if not row:
                continue
            series_uid = row[0]

            # A series_uid that is not present belongs to a subset we do not
            # have on disk, so skip it unless the caller asked for everything.
            if requireOnDisk_bool and series_uid not in presentOnDisk_set:
                continue

            isNodule_bool = bool(int(row[4]))
            candidateCenter_xyz = tuple(float(x) for x in row[1:4])

            # No matching annotation is fine: the diameter stays 0.0.
            candidateDiameter_mm = 0.0
            for annotationCenter_xyz, annotationDiameter_mm in diameter_dict.get(series_uid, []):
                # diameter / 2 is the radius; radius / 2 is how far apart the
                # two centers may be on each axis. This is a bounding-box
                # check on all three axes, not a true distance check.
                tolerance_mm = annotationDiameter_mm / 4
                if all(
                    abs(candidate_c - annotation_c) <= tolerance_mm
                    for candidate_c, annotation_c in zip(candidateCenter_xyz, annotationCenter_xyz)
                ):
                    candidateDiameter_mm = annotationDiameter_mm
                    break

            # Exactly one entry per candidate row, matched or not.
            candidateInfo_list.append(CandidateInfoTuple(
                isNodule_bool,
                candidateDiameter_mm,
                series_uid,
                candidateCenter_xyz,
            ))

    # Descending tuple order puts all actual nodules first, largest diameter
    # first, followed by the non-nodule samples (which have no size info).
    candidateInfo_list.sort(reverse=True)
    return candidateInfo_list