# TCIA Cancer Image Net downloader Notebook

- single user cluster is easiest
- serverless compute works

### Setup

In [0]:
%pip install --quiet tcia_utils

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
dbutils.library.restartPython()

In [0]:
import requests
import pandas as pd
from tcia_utils import nbia
import os

### Turn off debug logging

In [0]:

# set logging level to INFO in Google Colab (not necessary in Jupyter)

import logging

for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)

# Set handler with level = info
logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s',
                    level=logging.INFO)

In [0]:
# test the api
nbia.getCollections()

2025-07-06 22:03:06,976:INFO:Success - Token saved to global api_call_headers variable and expires at 2025-07-07 00:03:06.976373
2025-07-06 22:03:06,978:INFO:Accessing public data anonymously. To access restricted data use nbia.getToken() with your credentials.
2025-07-06 22:03:06,980:INFO:Calling getCollectionValues with parameters {}


[{'Collection': '4D-Lung'},
 {'Collection': 'ACRIN-6698'},
 {'Collection': 'ACRIN-Contralateral-Breast-MR'},
 {'Collection': 'ACRIN-FLT-Breast'},
 {'Collection': 'ACRIN-NSCLC-FDG-PET'},
 {'Collection': 'Adrenal-ACC-Ki67-Seg'},
 {'Collection': 'Advanced-MRI-Breast-Lesions'},
 {'Collection': 'Anti-PD-1_Lung'},
 {'Collection': 'B-mode-and-CEUS-Liver'},
 {'Collection': 'BREAST-DIAGNOSIS'},
 {'Collection': 'Breast-Cancer-Screening-DBT'},
 {'Collection': 'Breast-MRI-NACT-Pilot'},
 {'Collection': 'C4KC-KiTS'},
 {'Collection': 'CBIS-DDSM'},
 {'Collection': 'CC-Radiomics-Phantom'},
 {'Collection': 'CC-Radiomics-Phantom-2'},
 {'Collection': 'CC-Radiomics-Phantom-3'},
 {'Collection': 'CC-Tumor-Heterogeneity'},
 {'Collection': 'CMB-AML'},
 {'Collection': 'CMB-BRCA'},
 {'Collection': 'CMB-CRC'},
 {'Collection': 'CMB-GEC'},
 {'Collection': 'CMB-LCA'},
 {'Collection': 'CMB-MEL'},
 {'Collection': 'CMB-MML'},
 {'Collection': 'CMB-OV'},
 {'Collection': 'CMB-PCA'},
 {'Collection': 'CMMD'},
 {'Collection'

## Set Downloader config variables

In [0]:
#
# Set name of collections
#  https://www.cancerimagingarchive.net/collection/MIDI-B-Test-MIDI-B-Validation/
#
collection_names = [
    "MIDI-B-Curated-Test",
    "MIDI-B-Curated-Validation",
    "MIDI-B-Synthetic-Test",
    'MIDI-B-Synthetic-Validation'
]
table_name = "hls_radiology.tcia.tcia_collection_series_metadata"
download_path = "/Volumes/hls_radiology/tcia/downloads/tciaDownload/"

## Drop and reload dataset metadata

In [0]:

spark.sql(f"DROP TABLE IF EXISTS {table_name}")

for collection_name in collection_names:
    print(collection_name)
    try:
        series = nbia.getSeries(collection=collection_name)
        df = spark.createDataFrame(series)
        df.write.mode('append').saveAsTable(table_name)
    except Exception as e:
        print(f"{collection_name}: {e}")

spark.read.table(table_name).display()

2025-07-06 22:03:36,866:INFO:Calling getSeries with parameters {'Collection': 'MIDI-B-Curated-Test'}


MIDI-B-Curated-Test


2025-07-06 22:03:42,582:INFO:Calling getSeries with parameters {'Collection': 'MIDI-B-Curated-Validation'}


MIDI-B-Curated-Validation


2025-07-06 22:03:46,483:INFO:Calling getSeries with parameters {'Collection': 'MIDI-B-Synthetic-Test'}


MIDI-B-Synthetic-Test


2025-07-06 22:03:50,432:INFO:Calling getSeries with parameters {'Collection': 'MIDI-B-Synthetic-Validation'}


MIDI-B-Synthetic-Validation


BodyPartExamined,Collection,CollectionURI,DateReleased,FileSize,ImageCount,Manufacturer,ManufacturerModelName,Modality,PatientID,ProtocolName,SeriesDate,SeriesDescription,SeriesInstanceUID,SeriesNumber,SoftwareVersions,StudyDate,StudyDesc,StudyInstanceUID,ThirdPartyAnalysis,TimeStamp
BREAST,MIDI-B-Synthetic-Test,https://doi.org/10.7937/cf2p-aw56,2025-05-02 00:00:00.0,6349820,12,GE MEDICAL SYSTEMS,GENESIS_SIGNA,MR,569461761,Ordered for Michelle Summers by Dr. Alvarez,2012-03-13 00:00:00.0,PJN at FBH,2.4.485.0.3.2574743.8.340.2053291776754759396,580.0,07,2012-03-13 00:00:00.0,"MR BREAS, UNIT for Michelle Summers",2.4.485.0.3.2574743.8.340.2746447218479878168,NO,2025-02-24 04:52:24.0
BREAST,MIDI-B-Synthetic-Test,https://doi.org/10.7937/cf2p-aw56,2025-05-02 00:00:00.0,6350772,12,GE MEDICAL SYSTEMS,GENESIS_SIGNA,MR,6718897452,Ordered for 413-25-5943 by Dr. Santiago,2019-05-12 00:00:00.0,PJN for Christine Delgado,2.2.141.0.0.8878811.6.310.2199994186582863917,20190512.0,07,2019-05-12 00:00:00.0,"MR BREAS, UNIT",2.2.141.0.0.8878811.6.310.1707064803181179829,NO,2025-02-24 04:50:59.0
BREAST,MIDI-B-Synthetic-Test,https://doi.org/10.7937/cf2p-aw56,2025-05-02 00:00:00.0,6350192,12,GE MEDICAL SYSTEMS,GENESIS_SIGNA,MR,8741077121,Ordered for 8741077121 by CJ,2019-11-18 00:00:00.0,PJN 20191118,2.5.775.1.3.5018818.6.186.7976073169033437168,20191118.0,07,2019-11-18 00:00:00.0,"MR BREAS, UNIT",2.5.775.1.3.5018818.6.186.1598768882900301102,NO,2025-02-24 04:50:18.0
BREAST,MIDI-B-Synthetic-Test,https://doi.org/10.7937/cf2p-aw56,2025-05-02 00:00:00.0,2036872,15,GE MEDICAL SYSTEMS,GENESIS_SIGNA,MR,4833924084,Ordered for Tara Ramos by Dr. Chavez,2014-04-01 00:00:00.0,T1-axial-locator at BASCH,3.1.882.1.1.4262713.6.989.5219852315843357675,1.0,08,2014-04-01 00:00:00.0,"MR BREAS, UNIT",3.1.882.1.1.4262713.6.989.1875342768784524288,NO,2025-02-24 04:50:06.0
BREAST,MIDI-B-Synthetic-Test,https://doi.org/10.7937/cf2p-aw56,2025-05-02 00:00:00.0,8170128,60,GE MEDICAL SYSTEMS,GENESIS_SIGNA,MR,2116061657,Ordered for Nicole Daniels by Dr. Wilson,2012-01-06 00:00:00.0,(15243/3/1)+(15243/3/1) for Nicole Daniels,1.3.747.0.0.0911824.3.342.3389836679591513635,300.0,08,2012-01-06 00:00:00.0,"MR BREAST, BIL for Nicole Daniels",1.3.747.0.0.0911824.3.342.2869463260188281525,NO,2025-02-24 04:52:44.0
BREAST,MIDI-B-Synthetic-Test,https://doi.org/10.7937/cf2p-aw56,2025-05-02 00:00:00.0,1358712,10,GE MEDICAL SYSTEMS,GENESIS_SIGNA,MR,8810201869,Performed at SRCH,2011-11-01 00:00:00.0,2D/FGRE LOC,2.2.277.0.1.3104328.7.259.3251322725021574556,20111101.0,09,2011-11-01 00:00:00.0,MR BREASTUNI UE for 527-72-4269,2.2.277.0.1.3104328.7.259.2022392310343697591,NO,2025-02-24 04:52:36.0
BREAST,MIDI-B-Synthetic-Test,https://doi.org/10.7937/cf2p-aw56,2025-05-02 00:00:00.0,19051936,36,GE MEDICAL SYSTEMS,GENESIS_SIGNA,MR,2733467744,Performed 20120710,2012-07-10 00:00:00.0,PJN for 2733467744,2.1.392.0.3.4986656.8.445.1017078165860970115,247.0,09,2012-07-10 00:00:00.0,MR BREASTUNI UE,2.1.392.0.3.4986656.8.445.6737995069113636776,NO,2025-02-24 04:52:24.0
BREAST,MIDI-B-Synthetic-Test,https://doi.org/10.7937/cf2p-aw56,2025-05-02 00:00:00.0,8233094,60,GE MEDICAL SYSTEMS,GENESIS_SIGNA,MR,4788608286,Ordered for 587-44-3872 by Dr. Warner,2013-07-29 00:00:00.0,Dynamic-3dfgre: PE1 at JG,3.2.731.1.3.8335665.2.945.9639588195886190764,41001.0,09,2013-07-29 00:00:00.0,MR BREASTUNI UE,3.2.731.1.3.8335665.2.945.1890293007068495894,NO,2025-02-24 04:52:28.0
BREAST,MIDI-B-Synthetic-Test,https://doi.org/10.7937/cf2p-aw56,2025-05-02 00:00:00.0,19051546,36,GE MEDICAL SYSTEMS,GENESIS_SIGNA,MR,8810201869,Ordered for 8810201869 by CP,,PJN for Bonnie Wu,2.1.240.0.0.7462603.1.346.2400178324679273184,20180526.0,09,2018-05-26 00:00:00.0,MR BREASTUNI UE for Bonnie Wu,2.1.240.0.0.7462603.1.346.1747148407447898106,NO,2025-02-24 04:52:25.0
BREAST,MIDI-B-Synthetic-Test,https://doi.org/10.7937/cf2p-aw56,2025-05-02 00:00:00.0,19056268,36,GE MEDICAL SYSTEMS,GENESIS_SIGNA,MR,9129369688,Ordered for Catherine Williamson by Dr. Adams,2013-01-04 00:00:00.0,PJN for 760-13-4917,3.5.235.1.1.0575612.2.947.2169976379853250260,20130104.0,,2013-01-04 00:00:00.0,MR BREASTUNI UE,3.5.235.1.1.0575612.2.947.2497963054746102727,NO,2025-02-24 04:51:33.0


## Report out on download completeness

In [0]:
# This code performs the following tasks:
# 1. Reads binary files from the specified download path and drops the 'content' column.
# 2. Extracts the filename from the file path.
# 3. Reads metadata from a specified table.
# 4. Joins the metadata with the filenames based on the SeriesInstanceUID.
# 5. Counts distinct SeriesInstanceUIDs and the presence of filenames, grouped by all non-aggregated columns.

# 1. Read binary files from the specified download path and drop the 'content' column
files_df = spark.read.format("binaryFile").load(download_path).drop('content')

# file name is in the last array element
df3 = spark.sql('''
select reverse(split(path,'/'))[0] as filename
from {df}
''', df=files_df)
#df3.display()

df4 = spark.read.table(table_name)

df5 = spark.sql("""

                select * 
                FROM {table} t 
                LEFT OUTER JOIN {files} f 
                ON concat(t.SeriesInstanceUID,'.zip') = f.filename
                """,
                table=df4, files=df3
                )
# df5.display()

spark.sql("""
    select collection, count(distinct SeriesInstanceUID) distinctSeriesInstanceUID, !isnull(filename),  count(1)
    from {df5}
    GROUP BY ALL
          """, df5=df5).display()

collection,distinctSeriesInstanceUID,!isnull(filename),count(1)
MIDI-B-Curated-Test,428,True,428
MIDI-B-Synthetic-Validation,280,True,280
MIDI-B-Curated-Validation,280,True,280
MIDI-B-Synthetic-Test,428,True,428


In [0]:
%sh find /Volumes/hls_radiology/tcia/downloads -type f -print

/Volumes/hls_radiology/tcia/downloads/MIDI-B-Curated-Test.csv
/Volumes/hls_radiology/tcia/downloads/MIDI-B-Synthetic-Validation.csv.csv
/Volumes/hls_radiology/tcia/downloads/tciaDownload/1.1.149.1.3.9186885.5.845.2842379237865892723.zip
/Volumes/hls_radiology/tcia/downloads/tciaDownload/1.1.159.0.0.6557621.9.147.1113844244573973797.zip
/Volumes/hls_radiology/tcia/downloads/tciaDownload/1.1.240.0.0.1251032.5.838.2367854858599320642.zip
/Volumes/hls_radiology/tcia/downloads/tciaDownload/1.1.266.0.2.7033344.9.031.2129314251425497512.zip
/Volumes/hls_radiology/tcia/downloads/tciaDownload/1.1.284.1.1.4850980.4.279.3265905605243663617.zip
/Volumes/hls_radiology/tcia/downloads/tciaDownload/1.1.293.0.1.9938922.9.457.1056584816284771382.zip
/Volumes/hls_radiology/tcia/downloads/tciaDownload/1.1.331.0.2.9942315.2.295.3008510809397251596.zip
/Volumes/hls_radiology/tcia/downloads/tciaDownload/1.1.335.1.3.0612302.1.183.7036301570623337028.zip
/Volumes/hls_radiology/tcia/downloads/tciaDownload/1.1.3