# Preprocess LIDC chest CT scans

## Setup and configuration

In [47]:
import os
from pathlib import Path

import pandas as pd
import numpy as np
import pydicom as dicom
import pylidc as pl

In [2]:
# Root of the LIDC-IDRI download. Set the LIDC_DIR environment variable to
# point the notebook at a different location; when it is unset (or empty) the
# original external-drive path is used, so existing setups keep working.
LIDC_DIR = Path(
    os.environ.get("LIDC_DIR")
    or "/Volumes/LaCie/data/lung-cancer-detection/lidc-idri/"
)
# Sub-directory that is listed below to obtain one entry per patient.
DICOM_DIR = LIDC_DIR / "LIDC-IDRI"

In [3]:
# Sanity check: report whether the dataset root and the DICOM folder exist.
for data_dir in (LIDC_DIR, DICOM_DIR):
    print(data_dir.exists())

True
True


## Load patients

In [4]:
# Each non-hidden entry of DICOM_DIR is taken as one patient ID; hidden
# entries (names starting with '.') are skipped. The result is sorted by name.
patient_list = sorted(
    entry for entry in os.listdir(DICOM_DIR) if not entry.startswith('.')
)
print(len(patient_list))

1010


In [9]:
def _count_scans(patient_id):
    """Return how many pylidc Scan rows match the given patient ID."""
    return pl.query(pl.Scan).filter(pl.Scan.patient_id == patient_id).count()


# One count per patient, in patient_list order; then tabulate how many
# patients have each number of scans.
scan_counts = list(map(_count_scans, patient_list))
pd.Series(scan_counts).value_counts()

1    1002
2       8
dtype: int64

## Load scan for one patient

In [26]:
# Pick a single patient to explore. The position 77 is hard-coded; in the
# recorded run it resolved to 'LIDC-IDRI-0078' in the sorted patient_list.
pid = patient_list[77]
pid

'LIDC-IDRI-0078'

In [27]:
# Look up the scans recorded for the selected patient and keep only the first
# match. NOTE(review): .first() presumably yields None when nothing matches,
# and a few patients have two scans - confirm this is the intended one.
patient_scans = pl.query(pl.Scan).filter(pl.Scan.patient_id == pid)
scan = patient_scans.first()

In [28]:
# Cluster the scan's annotations; the number of resulting clusters is what is
# reported below as the number of annotated nodules.
# NOTE(review): presumably each cluster groups the annotations that refer to
# the same physical nodule - confirm against the pylidc documentation.
nodules_annotation = scan.cluster_annotations()
print(f"Patient ID: {pid}\nNumber of Annotated Nodules: {len(nodules_annotation)}")

Patient ID: LIDC-IDRI-0078
Number of Annotated Nodules: 4


In [54]:
scan_path = Path(scan.get_path_to_dicom_files())
# Keep only the DICOM files. The directories also contain files whose names
# start with '.', which are not readable by pydicom, so those are excluded.
# os.listdir returns entries in arbitrary order; sorting makes positional
# access such as fnames[0] reproducible across runs and machines, consistent
# with how patient_list is sorted.
fnames = sorted(
    fname
    for fname in os.listdir(scan_path)
    if fname.endswith('.dcm') and not fname.startswith('.')
)
len(fnames)

87

In [55]:
# Read the first file listed in fnames and display the parsed DICOM dataset.
fname = fnames[0]
img = dicom.dcmread(scan_path / fname)
# A bare `img` as the last expression renders the dataset; the previous
# trailing `img.` was an incomplete attribute access and a syntax error.
img

Dataset.file_meta -------------------------------
(0002, 0000) File Meta Information Group Length  UL: 204
(0002, 0001) File Meta Information Version       OB: b'\x00\x01'
(0002, 0002) Media Storage SOP Class UID         UI: CT Image Storage
(0002, 0003) Media Storage SOP Instance UID      UI: 1.3.6.1.4.1.14519.5.2.1.6279.6001.305457156046178667359366041284
(0002, 0010) Transfer Syntax UID                 UI: Implicit VR Little Endian
(0002, 0012) Implementation Class UID            UI: 1.3.6.1.4.1.22213.1.143
(0002, 0013) Implementation Version Name         SH: '0.5'
(0002, 0016) Source Application Entity Title     AE: 'POSDA'
-------------------------------------------------
(0008, 0008) Image Type                          CS: ['ORIGINAL', 'PRIMARY', 'AXIAL']
(0008, 0016) SOP Class UID                       UI: CT Image Storage
(0008, 0018) SOP Instance UID                    UI: 1.3.6.1.4.1.14519.5.2.1.6279.6001.305457156046178667359366041284
(0008, 0020) Study Date                 