# Prepare cases for Streamlit app

## Imports and configuration

In [20]:
from pathlib import Path
import math

import pylidc as pl
from pylidc.utils import consensus
import pandas as pd
import numpy as np
from tqdm import tqdm
import pydicom as dicom
import matplotlib.pyplot as plt

%matplotlib widget

import os
import warnings

# Blanket-suppress library warnings to keep the cell outputs readable.
# NOTE(review): this also hides deprecation and data-quality warnings —
# consider narrowing the filter to the specific warnings that are noisy.
warnings.filterwarnings('ignore')

plt.rcParams['figure.figsize'] = [12, 8]
pd.set_option('display.max_columns', None)

# Root of the local LIDC-IDRI download. Set the LIDC_DIR environment variable
# to run the notebook on another machine; when it is unset or empty the
# original location is used, so existing setups keep working unchanged.
LIDC_DIR = Path(os.environ.get("LIDC_DIR") or "/Volumes/LaCie/data/lung-cancer-detection/lidc-idri/")
DICOM_DIR = LIDC_DIR / "LIDC-IDRI"
# Output folder for the prepared data: <parent of the working directory>/app/data.
# It is created here so later cells can write into it directly.
APP_DIR = Path().absolute().parents[0] / "app/data"
APP_DIR.mkdir(parents=True, exist_ok=True)

## Load and prepare metadata

In [2]:
# Read the three metadata tables stored next to the DICOM data. Patients and
# nodules are indexed by their ID column; scans keep the default integer index.
patient_df = pd.read_csv(DICOM_DIR / "lidc_patient_meta.csv", index_col="PatientID")
scan_df = pd.read_csv(DICOM_DIR / "lidc_scan_meta.csv")
nodule_df = pd.read_csv(DICOM_DIR / "lidc_nodule_meta.csv", index_col="SeriesID")

# Report the table sizes, one per line, in the order patients, scans, nodules.
print(patient_df.shape, scan_df.shape, nodule_df.shape, sep="\n")

(157, 3)
(1018, 22)
(2651, 16)


In [3]:
# Attach the patient-level columns to every scan row, matching on PatientID.
# Patient columns left over from an earlier run of this cell are dropped first,
# so re-running the cell no longer fails with a "columns overlap" error.
scan_df = (
    scan_df
    .drop(columns=patient_df.columns, errors="ignore")
    .join(patient_df, on="PatientID")
)
scan_df.shape

(1018, 25)

## Select interesting cases

For our UI experiments, we are looking for cases with these characteristics:

- Between 5 and 10 nodules
- At least one of these nodules should be malignant
- Diagnosis and diagnosis method for the patient are available

Let's query for these attributes.

In [4]:
# Summary statistics of the patient-level Diagnosis column; `count` is the
# number of scans for which a diagnosis value is present.
scan_df["Diagnosis"].describe()

count    158.000000
mean       1.753165
std        1.086611
min        0.000000
25%        1.000000
50%        2.000000
75%        3.000000
max        3.000000
Name: Diagnosis, dtype: float64

In [5]:
# Selection used for the UI experiments: 5 to 10 nodules, a maximum malignancy
# rating of 5, and a Diagnosis value above 1.
selection_criteria = '(NumNodules >= 5) & (NumNodules <= 10) & (MaxMalignancy == 5) & (Diagnosis > 1)'
cases = scan_df.query(selection_criteria)
print(f"Found {len(cases)} cases with the specified characteristics.")

Found 4 cases with the specified characteristics.


In [6]:
# Display the scans that matched the selection criteria.
cases

Unnamed: 0,StudyID,SeriesID,PatientID,SliceThickness,SliceSpacing,PixelSpacing,ContrastUsed,ImagePositionPatient,ImageOrientationPatient,Rows,Columns,RescaleIntercept,RescaleSlope,WindowCenter,WindowWidth,BitsAllocated,PixelRepresentation,Manufacturer,ManufacturerModelName,NumAnnotations,NumNodules,MaxMalignancy,Diagnosis,DiagnosisMethod,PrimaryTumorSiteMetastaticDisease
187,1.3.6.1.4.1.14519.5.2.1.6279.6001.300568323537...,1.3.6.1.4.1.14519.5.2.1.6279.6001.204566802718...,LIDC-IDRI-0186,2.5,2.5,0.644531,True,"[-165.000000, -165.000000, -20.000000]","[1.000000, 0.000000, 0.000000, 0.000000, 1.000...",512,512,-1024.0,1.0,-600,1600,16,1,GE MEDICAL SYSTEMS,LightSpeed Plus,11,5,5.0,2.0,2.0,nonsmall cell lung cancer
459,1.3.6.1.4.1.14519.5.2.1.6279.6001.154309317539...,1.3.6.1.4.1.14519.5.2.1.6279.6001.259123825760...,LIDC-IDRI-0454,1.0,1.0,0.724609,False,"[-194.605842, -64.1392937, -317]","[1, 0, 0, 0, 1, 0]",512,512,-1000.0,1.0,"[-0450, -0450]","[02000, 02000]",16,0,Philips,Brilliance 16P,21,7,5.0,2.0,2.0,non-small cell carcinoma
926,1.3.6.1.4.1.14519.5.2.1.6279.6001.242293704887...,1.3.6.1.4.1.14519.5.2.1.6279.6001.211956804948...,LIDC-IDRI-0921,2.0,1.0,0.679688,False,"[-162, 18, -453.099976]","[1, 0, 0, 0, 1, 0]",512,512,-1000.0,1.0,"[-0450, -0450]","[02000, 02000]",16,0,Philips,Brilliance 16P,14,5,5.0,2.0,2.0,small cell carcinoma
999,1.3.6.1.4.1.14519.5.2.1.6279.6001.300829918445...,1.3.6.1.4.1.14519.5.2.1.6279.6001.534006575256...,LIDC-IDRI-0994,1.25,0.625,0.722656,False,"[-205.199997, -198.500000, 6.190000]","[1.000000, 0.000000, 0.000000, 0.000000, 1.000...",512,512,-1024.0,1.0,40,400,16,1,GE MEDICAL SYSTEMS,LightSpeed Pro 16,14,6,5.0,2.0,3.0,LUL Large cell CA


We will focus on the first case (patient LIDC-IDRI-0186), as its file size should meet GitHub's requirements. Uploading all files to GitHub is required to make Streamlit Sharing work.

In [7]:
# Narrow the candidates down to the scan of one patient. The result stays a
# (one-row) DataFrame rather than a Series, which the type check confirms.
selected_patient_filter = "PatientID == 'LIDC-IDRI-0186'"
case = cases.query(selected_patient_filter)
type(case)

pandas.core.frame.DataFrame

In [8]:
# Display the single selected scan row.
case

Unnamed: 0,StudyID,SeriesID,PatientID,SliceThickness,SliceSpacing,PixelSpacing,ContrastUsed,ImagePositionPatient,ImageOrientationPatient,Rows,Columns,RescaleIntercept,RescaleSlope,WindowCenter,WindowWidth,BitsAllocated,PixelRepresentation,Manufacturer,ManufacturerModelName,NumAnnotations,NumNodules,MaxMalignancy,Diagnosis,DiagnosisMethod,PrimaryTumorSiteMetastaticDisease
187,1.3.6.1.4.1.14519.5.2.1.6279.6001.300568323537...,1.3.6.1.4.1.14519.5.2.1.6279.6001.204566802718...,LIDC-IDRI-0186,2.5,2.5,0.644531,True,"[-165.000000, -165.000000, -20.000000]","[1.000000, 0.000000, 0.000000, 0.000000, 1.000...",512,512,-1024.0,1.0,-600,1600,16,1,GE MEDICAL SYSTEMS,LightSpeed Plus,11,5,5.0,2.0,2.0,nonsmall cell lung cancer


In [9]:
# Series instance UID of the selected scan (first row of `case`); it is the key
# used to look up the scan and its nodules in later cells.
sid = case["SeriesID"].iloc[0]
sid

'1.3.6.1.4.1.14519.5.2.1.6279.6001.204566802718283633558802774757'

## Visualize selected cases

In [10]:
# Query pylidc for scans whose series instance UID equals `sid`, then count
# the matches.
scans = pl.query(pl.Scan).filter(pl.Scan.series_instance_uid == sid)
scans.count()

1

In [11]:
# Take the first match of the query as the scan to work with.
scan = scans.first()

In [12]:
def visualize_scan(scan):
    """Show a scan in its viewer with the clustered annotations passed along.

    Args:
        scan: Object providing ``cluster_annotations()`` and
            ``visualize(annotation_groups=...)`` (a ``pylidc.Scan`` in this
            notebook).
    """
    annotation_clusters = scan.cluster_annotations()
    scan.visualize(annotation_groups=annotation_clusters)
    plt.show()

In [13]:
# Open the viewer for the selected scan.
visualize_scan(scan)

Loading dicom files ... This may take a moment.


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## Prepare selected cases

### Save images and masks

In [23]:
def prepare(scan, path, nod_sz=(100, 100, 15)):
    """Export one scan and its consensus nodules as ``.npy`` files.

    Everything is written to ``path/<scan.patient_id>/`` (created if missing):

    - ``scan.npy``: the array returned by ``scan.to_volume``.
    - ``nodule_XX_vol.npy``: the part of the volume inside the consensus
      bounding box of annotation cluster XX, with the box padded per axis by
      the difference between ``nod_sz`` and the unpadded box shape.
    - ``nodule_XX_mask.npy``: the consensus mask of cluster XX stored as int8,
      computed with a padding equal to the longest axis of the volume.

    Args:
        scan: Object providing ``patient_id``, ``to_volume`` and
            ``cluster_annotations`` (a ``pylidc.Scan`` in this notebook).
        path: ``pathlib.Path`` of the output root directory.
        nod_sz: Target shape used to size the padding of each nodule crop.

    Returns:
        None. Progress is reported with ``print``.
    """
    out_dir = path / scan.patient_id
    out_dir.mkdir(parents=True, exist_ok=True)

    vol = scan.to_volume(verbose=False)
    np.save(out_dir / "scan.npy", vol)
    print(f"PATIENT {scan.patient_id}")
    print(f"Saved CT scan with shape {vol.shape}")

    target_shape = np.array(nod_sz)
    # Same for every nodule, so computed once: a padding as large as the
    # longest axis of the volume.
    whole_volume_pad = int(np.max(vol.shape))

    for i, annotations in enumerate(scan.cluster_annotations()):
        # Unpadded consensus bounding box; only its shape is needed here.
        _, tight_bbox = consensus(annotations, ret_masks=False)
        shortfall = target_shape - np.array(vol[tight_bbox].shape)
        # One pair per axis: the shortfall split in two, larger half first.
        per_axis_pad = [(math.ceil(gap / 2), math.floor(gap / 2)) for gap in shortfall]

        _, padded_bbox = consensus(annotations, ret_masks=False, pad=per_axis_pad)
        cmask, _ = consensus(annotations, ret_masks=False, pad=whole_volume_pad)

        nod_vol = vol[padded_bbox]
        np.save(out_dir / f"nodule_{i:02d}_vol.npy", nod_vol)
        print(f"Saved nodule {i} volume with shape {nod_vol.shape}")
        np.save(out_dir / f"nodule_{i:02d}_mask.npy", cmask.astype(np.int8))
        print(f"Saved nodule {i} mask with shape {cmask.shape}")

In [24]:
# Export the selected scan and its nodules into the app data folder.
prepare(scan, APP_DIR)

PATIENT LIDC-IDRI-0186
Saved CT scan with shape (512, 512, 137)
Saved nodule 0 volume with shape (100, 100, 15)
Saved nodule 0 mask with shape (512, 512, 137)
Saved nodule 1 volume with shape (100, 100, 15)
Saved nodule 1 mask with shape (512, 512, 137)
Saved nodule 2 volume with shape (100, 100, 15)
Saved nodule 2 mask with shape (512, 512, 137)
Saved nodule 3 volume with shape (100, 100, 15)
Saved nodule 3 mask with shape (512, 512, 137)
Saved nodule 4 volume with shape (100, 100, 15)
Saved nodule 4 mask with shape (512, 512, 137)


### Save scan metadata for selected cases

In [25]:
# Write the selected scan row to the app data folder; index=False leaves the
# DataFrame index out of the file.
case.to_csv(APP_DIR/"scan_meta.csv", index=False)

### Save nodule metadata for selected cases

In [26]:
# Select all nodule rows of the chosen series. The list-style label [sid] makes
# .loc return a DataFrame even when the series has only one nodule; a scalar
# label would return a Series in that case, which would then be written to CSV
# in the wrong layout.
nodules = nodule_df.loc[[sid]]
nodules.shape

(5, 16)

In [27]:
# Display the nodule metadata rows of the selected series.
nodules

Unnamed: 0_level_0,PatientID,StudyID,NoduleID,NumAnnotations,Diameter,SurfaceArea,Volume,Malignancy,Texture,Spiculation,Lobulation,Margin,Sphericity,Calcification,InternalStructure,Subtlety
SeriesID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1.3.6.1.4.1.14519.5.2.1.6279.6001.204566802718283633558802774757,LIDC-IDRI-0186,1.3.6.1.4.1.14519.5.2.1.6279.6001.300568323537...,0,2,5.714748,88.608536,87.238244,1,5,1,3,5,5,3,1,5
1.3.6.1.4.1.14519.5.2.1.6279.6001.204566802718283633558802774757,LIDC-IDRI-0186,1.3.6.1.4.1.14519.5.2.1.6279.6001.300568323537...,1,1,12.695788,412.727257,550.431778,2,1,1,1,2,5,6,1,1
1.3.6.1.4.1.14519.5.2.1.6279.6001.204566802718283633558802774757,LIDC-IDRI-0186,1.3.6.1.4.1.14519.5.2.1.6279.6001.300568323537...,2,2,6.485468,100.715517,90.353896,3,5,2,3,3,4,6,1,4
1.3.6.1.4.1.14519.5.2.1.6279.6001.204566802718283633558802774757,LIDC-IDRI-0186,1.3.6.1.4.1.14519.5.2.1.6279.6001.300568323537...,3,2,31.353731,3097.289971,7943.094052,3,5,4,3,3,4,6,4,5
1.3.6.1.4.1.14519.5.2.1.6279.6001.204566802718283633558802774757,LIDC-IDRI-0186,1.3.6.1.4.1.14519.5.2.1.6279.6001.300568323537...,4,4,17.445504,881.249009,1382.700205,5,5,4,3,4,4,6,1,5


In [28]:
# Write the nodule rows to the app data folder.
# NOTE(review): index=False leaves the SeriesID index out of the file — confirm
# the app does not need SeriesID to match nodules to their scan.
nodules.to_csv(APP_DIR/"nodule_meta.csv", index=False)

In [29]:
!tree {APP_DIR}

data
├──LIDC-IDRI-0186
│  ├──nodule_00_mask.npy
│  ├──nodule_00_vol.npy
│  ├──nodule_01_mask.npy
│  ├──nodule_01_vol.npy
│  ├──nodule_02_mask.npy
│  ├──nodule_02_vol.npy
│  ├──nodule_03_mask.npy
│  ├──nodule_03_vol.npy
│  ├──nodule_04_mask.npy
│  ├──nodule_04_vol.npy
│  └──scan.npy
├──nodule_meta.csv
└──scan_meta.csv
