# Preprocess LIDC chest CT scans

## Setup and configuration

In [1]:
import os
from pathlib import Path

import pandas as pd
import numpy as np
import pydicom as dicom
import pylidc as pl
from tqdm import tqdm

In [18]:
# Root directory of the LIDC-IDRI data (machine-specific absolute path).
LIDC_DIR = Path("/Volumes/LaCie/data/lung-cancer-detection/lidc-idri/")
# Sub-directory that is listed for patient folders, and the output directory
# that receives the preprocessed files.
DICOM_DIR = LIDC_DIR.joinpath("LIDC-IDRI")
PROC_DIR = LIDC_DIR.joinpath("processed")

In [19]:
# Report whether the two input directories are reachable (one result per line).
print(LIDC_DIR.exists(), DICOM_DIR.exists(), sep="\n")
# Create the output directory, including missing parents; no error if present.
PROC_DIR.mkdir(exist_ok=True, parents=True)

True
True


## Load patients

In [4]:
# Every non-hidden entry of the DICOM directory is treated as one patient ID.
patient_list = sorted(
    entry for entry in os.listdir(DICOM_DIR) if not entry.startswith('.')
)
print(len(patient_list))

1010


In [5]:
# Number of scans the pylidc query returns for each patient ID.
scan_counts = [
    pl.query(pl.Scan).filter(pl.Scan.patient_id == patient_id).count()
    for patient_id in patient_list
]
# Distribution of scans-per-patient (displayed as the cell result).
pd.Series(scan_counts).value_counts()

1    1002
2       8
dtype: int64

## Preprocess all images

In [6]:
# Pick one patient ID (list index 77) as an example; the bare `pid`
# expression makes the notebook display it.
pid = patient_list[77]
pid

'LIDC-IDRI-0078'

In [46]:
def _load_raw_images(scan):
    """Read every DICOM slice file that belongs to ``scan``.

    Args:
        scan: Object exposing ``get_path_to_dicom_files()``, which returns
            the directory that holds the slice files.

    Returns:
        list: One ``dicom.dcmread`` result per ``*.dcm`` file, in the order
        ``os.listdir`` yields the files (no sorting is done here).
    """
    dicom_dir = Path(scan.get_path_to_dicom_files())
    slices = []
    for entry in os.listdir(dicom_dir):
        # Dot-prefixed entries are skipped even when they end in ".dcm".
        if entry.startswith('.') or not entry.endswith('.dcm'):
            continue
        slices.append(dicom.dcmread(dicom_dir / entry))
    return slices

def _clean_images(imgs):
    """Drop slices that share a z position and sort the rest by z.

    The z position of a slice is the last component of its
    ``ImagePositionPatient``. When several slices have the same z, only the
    one with the lowest ``InstanceNumber`` is kept (the first one seen wins
    on a tie).

    Args:
        imgs: Iterable of slice objects providing ``ImagePositionPatient``
            and ``InstanceNumber``.

    Returns:
        list: The retained slices in ascending z order; ``[]`` for no input.
    """
    # A single pass over a dict keyed by z avoids removing entries from a
    # list while it is being iterated, which could raise ValueError when
    # more than three slices shared one z position.
    best_per_z = {}
    for img in imgs:
        z = float(img.ImagePositionPatient[-1])
        inum = float(img.InstanceNumber)
        kept = best_per_z.get(z)
        if kept is None or inum < kept[0]:
            best_per_z[z] = (inum, img)

    # z values are unique now, so sorting the keys gives the slice order.
    return [best_per_z[z][1] for z in sorted(best_per_z)]

def load_lidc_scan(pid):
    """Load the cleaned slices and clustered nodule annotations of a patient.

    Only the first scan returned by the query is used, even when a patient
    has more than one scan.

    Args:
        pid: Patient ID matched against ``pl.Scan.patient_id``.

    Returns:
        tuple: ``(imgs, nods)`` where ``imgs`` is the output of
        ``_clean_images`` and ``nods`` is the result of
        ``scan.cluster_annotations()``.

    Raises:
        ValueError: If the query finds no scan for ``pid``.
    """
    scan = pl.query(pl.Scan).filter(pl.Scan.patient_id == pid).first()
    if scan is None:
        # ``first()`` gives None for an empty result; fail with a clear
        # message instead of an AttributeError on the next line.
        raise ValueError(f"No LIDC scan found for patient ID {pid!r}.")
    nods = scan.cluster_annotations()
    imgs = _clean_images(_load_raw_images(scan))
    return imgs, nods

def _to_volume(imgs):
    """Stack the pixel data of all slices into one 3-D array.

    Args:
        imgs: Non-empty list of slice objects providing ``Rows``,
            ``Columns`` and ``pixel_array``. The list order becomes the
            order along the last axis.

    Returns:
        numpy.ndarray: ``int16`` array of shape ``(Rows, Columns, len(imgs))``.
        Pixel data is copied as-is and cast to ``int16`` on assignment.

    Raises:
        ValueError: If ``imgs`` is empty or the slices differ in size.
    """
    if len(imgs) == 0:
        # Without this guard the size check below would fail with an
        # unhelpful IndexError.
        raise ValueError("Cannot build a volume from an empty list of images.")

    # All slices must share a single (Rows, Columns) pair.
    sizes = {(img.Rows, img.Columns) for img in imgs}
    if len(sizes) != 1:
        raise ValueError("Images do not have the same dimensions.")

    rows, cols = imgs[0].Rows, imgs[0].Columns
    vol = np.zeros((rows, cols, len(imgs)), dtype=np.int16)
    for i, img in enumerate(imgs):
        vol[:, :, i] = img.pixel_array
    return vol

def _extract_meta(imgs):
    """Collect scan-level metadata from the first slice of a scan.

    Args:
        imgs: Non-empty list of slice objects; only ``imgs[0]`` is read for
            attributes, and the list length is recorded as ``"z"``.

    Returns:
        dict: One entry per field listed below. An attribute missing on the
        first slice is stored as ``None``.
    """
    first = imgs[0]
    # (output key, slice attribute) pairs, in output order.
    fields = (
        ("studyID", "StudyInstanceUID"),
        ("seriesID", "SeriesInstanceUID"),
        ("manufacturer", "Manufacturer"),
        ("modelName", "ManufacturerModelName"),
        ("patientID", "PatientID"),
        ("patientSex", "PatientSex"),
        ("patientAge", "PatientAge"),
        ("patientEthnic", "EthnicGroup"),
        ("patientPosition", "PatientPosition"),
        ("sliceThickness", "SliceThickness"),
        ("pixelSpacing", "PixelSpacing"),
        ("x", "Rows"),
        ("y", "Columns"),
    )
    meta = {key: getattr(first, attr, None) for key, attr in fields}
    # Number of slices in the scan.
    meta["z"] = len(imgs)
    return meta

def _prepare_meta_df(data):
    """Build a DataFrame from the collected per-scan metadata.

    Args:
        data: Anything ``pd.DataFrame`` accepts as its ``data`` argument;
            here a list with one metadata dict per scan.

    Returns:
        pandas.DataFrame: The tabular form of ``data``.
    """
    return pd.DataFrame(data)

def _preprocess_lidc_scan(pid):
    """Load one patient's scan and derive its volume and metadata.

    Args:
        pid: Patient ID passed on to ``load_lidc_scan``.

    Returns:
        tuple: ``(vol, meta, nods)``: the stacked volume from ``_to_volume``,
        the metadata dict from ``_extract_meta`` and the nodule annotations
        returned by ``load_lidc_scan``.
    """
    slices, nodules = load_lidc_scan(pid)
    return _to_volume(slices), _extract_meta(slices), nodules

def preprocess_lidc_scans(pids, proc_dir):
    """Preprocess several patients and write the results to disk.

    Each patient's volume is saved as ``<proc_dir>/scans/<pid>.npy`` and
    the metadata of all patients is written to ``<proc_dir>/scan_meta.csv``
    (an existing file at that path is overwritten).

    Args:
        pids: Iterable of patient IDs to process.
        proc_dir: ``pathlib.Path`` of the output directory.

    Returns:
        pandas.DataFrame: One row of metadata per processed patient.
    """
    volumes_dir = proc_dir / "scans"
    volumes_dir.mkdir(exist_ok=True, parents=True)

    records = []
    for pid in tqdm(pids):
        # The nodule annotations are not used at this stage.
        volume, record, _nodules = _preprocess_lidc_scan(pid)
        records.append(record)
        np.save(volumes_dir / f"{pid}.npy", volume)

    meta_df = _prepare_meta_df(records)
    meta_df.to_csv(proc_dir / "scan_meta.csv")
    return meta_df

In [47]:
# Run the pipeline on the first 10 patients only; output goes under PROC_DIR.
df = preprocess_lidc_scans(patient_list[:10], PROC_DIR)

100%|██████████| 10/10 [00:48<00:00,  4.89s/it]


Next up:

- Clean the scan metadata DataFrame (`scan_meta.csv`)
- Preprocess nodules