In [1]:
import os
import sys

# Upgrade pip, just in case...
!{sys.executable} -m pip install --upgrade -q pip

# Install requests for downloading data.
!{sys.executable} -m pip install --upgrade -q requests
!{sys.executable} -m pip install --upgrade -q pandas

import requests

tcia_utils_text = requests.get("https://github.com/kirbyju/TCIA_Notebooks/raw/main/tcia_utils.py")
with open('tcia_utils.py', 'wb') as f:
    f.write(tcia_utils_text.content)

import tcia_utils as tcia

[K     |████████████████████████████████| 2.1 MB 16.1 MB/s 
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.8/62.8 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.2/12.2 MB[0m [31m44.8 MB/s[0m eta [36m0:00:00[0m
[0m

In [12]:

from builtins import range, input

from tensorflow.keras.layers import Input, Lambda, Dense, Flatten, GlobalAveragePooling2D, Dropout
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.applications import VGG19
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.image import ImageDataGenerator

from sklearn.metrics import confusion_matrix, roc_curve
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

from glob import glob
import pandas as pd
import cv2

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.utils import to_categorical

## Load, process, and label the cancer data 

In [2]:
# Download a "Shared Cart" that has been previously 
#    created via the NBIA webset 
#    (https://nbia.cancerimagingarchive.net)
cartName = "nbia-47881669847357616" # Our lung cancer data's first subject

# retrieve cart metadata
cart_data = tcia.getSharedCart(cartName)

# download the series_uids list and return dataframe of metadata
df = tcia.downloadSeries(cart_data)

# display dataframe
display(df)

Calling...  https://services.cancerimagingarchive.net/nbia-api/services/v1/getContentsByName?name=nbia-47881669847357616
Downloading 2 Series Instance UIDs (scans).
Downloading... https://services.cancerimagingarchive.net/nbia-api/services/v1/getImage?NewFileNames=Yes&SeriesInstanceUID=1.3.6.1.4.1.14519.5.2.1.6655.2359.213534032021332276911485641315
Downloading... https://services.cancerimagingarchive.net/nbia-api/services/v1/getImage?NewFileNames=Yes&SeriesInstanceUID=1.3.6.1.4.1.14519.5.2.1.6655.2359.257508444832901632590301540805
Download Complete: 2 Series Instance UIDs (scans).


Unnamed: 0,Series UID,Collection,Data Description URI,Subject ID,Study UID,Study Description,Study Date,Series Description,Manufacturer,Modality,SOP Class UID,Number of Images,File Size,Series Number,License Name,License URL,Annotation Size
0,1.3.6.1.4.1.14519.5.2.1.6655.2359.213534032021...,Lung-PET-CT-Dx,https://doi.org/10.7937/TCIA.2020.NNC2-0461,Lung_Dx-A0001,1.3.6.1.4.1.14519.5.2.1.6655.2359.165554066086...,Chest,04-04-2007,5mm,Philips,CT,1.2.840.10008.5.1.4.1.1.2,64,33750912,3.0,Creative Commons Attribution 4.0 International...,https://creativecommons.org/licenses/by/4.0/,0
1,1.3.6.1.4.1.14519.5.2.1.6655.2359.257508444832...,Lung-PET-CT-Dx,https://doi.org/10.7937/TCIA.2020.NNC2-0461,Lung_Dx-A0001,1.3.6.1.4.1.14519.5.2.1.6655.2359.165554066086...,Chest,04-04-2007,5mm,Philips,CT,1.2.840.10008.5.1.4.1.1.2,64,33750912,2.0,Creative Commons Attribution 4.0 International...,https://creativecommons.org/licenses/by/4.0/,0


In [3]:
# Install itk for DICOM I/O and for reading DICOM into an itkImage 
#   that manages all DICOM field values, include acquistion details 
#   such as voxel image, image orientation, and image directions
#   which are critical to image processing and display
!{sys.executable} -m pip install --upgrade --pre -q "itk==5.3rc4.post3"

# Additionally we'll install numpy and torch to explore a variety of 
#    image data structures
!{sys.executable} -m pip install -q torch
!{sys.executable} -m pip install -q numpy

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.3/27.3 MB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.2/26.2 MB[0m [31m46.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.6/83.6 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.2/17.2 MB[0m [31m60.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.9/75.9 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[0m

In [4]:
# Include ITK for DICOM reading
import itk

# Numpy for numpy.arrays
import numpy as np

# Torch for torch.tensors
import torch

In [5]:
# Install and import PyDICOM, then store file paths in list

!{sys.executable} -m pip install pydicom

import pydicom as di 
import os
from os import listdir
os.rename("tciaDownload", "tciaDownloadCancer")
PathDicom = "tciaDownloadCancer"
DCMFiles = [] 
for dirName, subdirList, fileList in os.walk(PathDicom):
    for filename in fileList:
        if ".dcm" in filename.lower():
            DCMFiles.append(os.path.join(dirName,filename))
print("Number of (.dcm) files =", len(DCMFiles))

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pydicom
  Downloading pydicom-2.3.1-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydicom
Successfully installed pydicom-2.3.1
[0mNumber of (.dcm) files = 128


In [6]:
# Extract pixel data from image files

X_cancer = []
y_cancer = []
for k in DCMFiles:
    Images = di.read_file(k,force=True)
    X_cancer.append(Images.pixel_array)
    y_cancer.append(1) # 0 = covid data, 1 = cancer data

In [7]:
# Use the data frame to find the Series UID where the Modality is CT
dicom_data_dir = "tciaDownloadCancer/"
ct_series_uid = df.at[df.Modality.eq('CT').idxmax(), 'Series UID']
dicom_ct_dir = os.path.join(dicom_data_dir, ct_series_uid)

# Load and sort the DICOM data into a volume - since no series_uid is
#    specified with this read command, it will load the first series in the directory.
dicom_image_large = itk.imread(dicom_ct_dir, itk.F)

# To save time for this demo, we subsample the image in the x and y dimensions
new_spacing = list(dicom_image_large.GetSpacing())
new_spacing[:2] = [x*3 for x in new_spacing[:2]]
new_size = list(dicom_image_large.GetLargestPossibleRegion().GetSize())
new_size[:2] = [x//3 for x in new_size[:2]]
dicom_image = itk.resample_image_filter(Input=dicom_image_large,
                                        output_spacing=new_spacing, 
                                        output_origin=dicom_image_large.GetOrigin(), 
                                        output_direction=dicom_image_large.GetDirection(),
                                        size=new_size)
print(f"New spacing = {new_spacing}")
print(f"New size = {new_size}")

New spacing = [2.525390625, 2.525390625, 5.0]
New size = [170, 170, 64]


## Do it again for COVID data

In [8]:
cartName = "nbia-6761669942417940" # Covid data

# retrieve cart metadata
cart_data = tcia.getSharedCart(cartName)

# download the series_uids list and return dataframe of metadata
df = tcia.downloadSeries(cart_data)

# display dataframe
display(df)

Calling...  https://services.cancerimagingarchive.net/nbia-api/services/v1/getContentsByName?name=nbia-6761669942417940
Downloading 6 Series Instance UIDs (scans).
Downloading... https://services.cancerimagingarchive.net/nbia-api/services/v1/getImage?NewFileNames=Yes&SeriesInstanceUID=1.3.6.1.4.1.14519.5.2.1.99.1071.14850676509103506699400577275960
Downloading... https://services.cancerimagingarchive.net/nbia-api/services/v1/getImage?NewFileNames=Yes&SeriesInstanceUID=1.3.6.1.4.1.14519.5.2.1.99.1071.21083349907841416087078944245719
Downloading... https://services.cancerimagingarchive.net/nbia-api/services/v1/getImage?NewFileNames=Yes&SeriesInstanceUID=1.3.6.1.4.1.14519.5.2.1.99.1071.22090542070462647745743105373525
Downloading... https://services.cancerimagingarchive.net/nbia-api/services/v1/getImage?NewFileNames=Yes&SeriesInstanceUID=1.3.6.1.4.1.14519.5.2.1.99.1071.22366983673254100505145504004129
Downloading... https://services.cancerimagingarchive.net/nbia-api/services/v1/getImage?N

Unnamed: 0,Series UID,Collection,3rd Party Analysis,Data Description URI,Subject ID,Study UID,Study Description,Study Date,Series Description,Manufacturer,Modality,SOP Class UID,Number of Images,File Size,Series Number,License Name,License URL,Annotation Size
0,1.3.6.1.4.1.14519.5.2.1.99.1071.14850676509103...,COVID-19-NY-SBU,NO,https://doi.org/10.7937/TCIA.BBAG-2923,A034518,1.3.6.1.4.1.14519.5.2.1.99.1071.28052166218470...,CT ABD PELVISWITH CHEST IMAGES W IV CON,12-31-1900,2.0,TOSHIBA,CT,1.2.840.10008.5.1.4.1.1.2,2,1314900,1.0,Creative Commons Attribution 4.0 International...,https://creativecommons.org/licenses/by/4.0/,0
1,1.3.6.1.4.1.14519.5.2.1.99.1071.21083349907841...,COVID-19-NY-SBU,NO,https://doi.org/10.7937/TCIA.BBAG-2923,A034518,1.3.6.1.4.1.14519.5.2.1.99.1071.28052166218470...,CT ABD PELVISWITH CHEST IMAGES W IV CON,12-31-1900,Body 3.000 CE,TOSHIBA,CT,1.2.840.10008.5.1.4.1.1.2,118,160721790,7.0,Creative Commons Attribution 4.0 International...,https://creativecommons.org/licenses/by/4.0/,0
2,1.3.6.1.4.1.14519.5.2.1.99.1071.22090542070462...,COVID-19-NY-SBU,NO,https://doi.org/10.7937/TCIA.BBAG-2923,A034518,1.3.6.1.4.1.14519.5.2.1.99.1071.28052166218470...,CT ABD PELVISWITH CHEST IMAGES W IV CON,12-31-1900,Body 3.000 CE,TOSHIBA,CT,1.2.840.10008.5.1.4.1.1.2,89,121222124,6.0,Creative Commons Attribution 4.0 International...,https://creativecommons.org/licenses/by/4.0/,0
3,1.3.6.1.4.1.14519.5.2.1.99.1071.22366983673254...,COVID-19-NY-SBU,NO,https://doi.org/10.7937/TCIA.BBAG-2923,A034518,1.3.6.1.4.1.14519.5.2.1.99.1071.28052166218470...,CT ABD PELVISWITH CHEST IMAGES W IV CON,12-31-1900,Lung 1.0 CE,TOSHIBA,CT,1.2.840.10008.5.1.4.1.1.2,596,314916466,4.0,Creative Commons Attribution 4.0 International...,https://creativecommons.org/licenses/by/4.0/,0
4,1.3.6.1.4.1.14519.5.2.1.99.1071.30475546417294...,COVID-19-NY-SBU,NO,https://doi.org/10.7937/TCIA.BBAG-2923,A034518,1.3.6.1.4.1.14519.5.2.1.99.1071.28052166218470...,CT ABD PELVISWITH CHEST IMAGES W IV CON,12-31-1900,Body 5.0 CE,TOSHIBA,CT,1.2.840.10008.5.1.4.1.1.2,120,63405602,2.0,Creative Commons Attribution 4.0 International...,https://creativecommons.org/licenses/by/4.0/,0
5,1.3.6.1.4.1.14519.5.2.1.99.1071.85179820664090...,COVID-19-NY-SBU,NO,https://doi.org/10.7937/TCIA.BBAG-2923,A034518,1.3.6.1.4.1.14519.5.2.1.99.1071.28052166218470...,CT ABD PELVISWITH CHEST IMAGES W IV CON,12-31-1900,Lung 5.0 CE,TOSHIBA,CT,1.2.840.10008.5.1.4.1.1.2,120,63405842,3.0,Creative Commons Attribution 4.0 International...,https://creativecommons.org/licenses/by/4.0/,0


In [9]:
os.rename("tciaDownload", "tciaDownloadCovid")
PathDicom = "tciaDownloadCovid"
DCMFiles = [] 
for dirName, subdirList, fileList in os.walk(PathDicom):
    for filename in fileList:
        if ".dcm" in filename.lower():
            DCMFiles.append(os.path.join(dirName,filename))
print("Number of (.dcm) files =", len(DCMFiles))

Number of (.dcm) files = 1045


In [10]:
# Extract pixel data from image files

X_covid = []
y_covid = []
for k in DCMFiles:
    Images = di.read_file(k,force=True)
    X_covid.append(Images.pixel_array)
    y_covid.append(0) # 0 = covid data, 1 = cancer data

In [11]:
# Use the data frame to find the Series UID where the Modality is CT
dicom_data_dir = "tciaDownloadCovid/"
ct_series_uid = df.at[df.Modality.eq('CT').idxmax(), 'Series UID']
dicom_ct_dir = os.path.join(dicom_data_dir, ct_series_uid)

# Load and sort the DICOM data into a volume - since no series_uid is
#    specified with this read command, it will load the first series in the directory.
dicom_image_large = itk.imread(dicom_ct_dir, itk.F)

# To save time for this demo, we subsample the image in the x and y dimensions
new_spacing = list(dicom_image_large.GetSpacing())
new_spacing[:2] = [x*3 for x in new_spacing[:2]]
new_size = list(dicom_image_large.GetLargestPossibleRegion().GetSize())
new_size[:2] = [x//3 for x in new_size[:2]]
dicom_image = itk.resample_image_filter(Input=dicom_image_large,
                                        output_spacing=new_spacing, 
                                        output_origin=dicom_image_large.GetOrigin(), 
                                        output_direction=dicom_image_large.GetDirection(),
                                        size=new_size)
print(f"New spacing = {new_spacing}")
print(f"New size = {new_size}")

OSError: ignored

## Train test split

In [20]:
# Convert to array 
X_cancer = np.array(X_cancer)
X_covid = np.array(X_covid)

print(X_cancer.ndim)
print(X_covid.ndim)

# Split into training and testing sets for both types of images
X_cancer_train, X_cancer_test, y_cancer_train, y_cancer_test = train_test_split(
    X_cancer, y_cancer, test_size=0.2)
X_covid_train, X_covid_test, y_covid_train, y_covid_test = train_test_split(
    X_covid, y_covid, test_size=0.2)


3
1


In [22]:
X_cancer.shape

(128, 512, 512)

In [23]:
X_covid.shape

(1045,)

In [19]:
# Merge sets for both types of images
X_train = np.concatenate((X_cancer_train, X_covid_train), axis=0)
X_test = np.concatenate((X_cancer_test, X_covid_test), axis=0)
y_train = np.concatenate((y_cancer_train, y_covid_train), axis=0)
y_test = np.concatenate((y_cancer_test, y_covid_test), axis=0)

ValueError: ignored