# IDC Data Landscape

Overview of what imaging data is available in [NCI Imaging Data Commons (IDC)](https://portal.imaging.datacommons.cancer.gov/), focusing on collections most likely to have cross-node data in other CRDC repositories (GDC, PDC).

**Key questions:**
- How many collections, patients, and series are in IDC?
- Which collections originate from programs (TCGA, CPTAC, HTAN) that also have data in GDC/PDC?
- What modalities and cancer types are represented?

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/fedorov/idc-cda/blob/main/notebooks/01_idc_overview.ipynb)

In [None]:
%pip install --upgrade -q idc-index

In [1]:
from idc_index import IDCClient
import pandas as pd
from pathlib import Path

client = IDCClient()
print(f"IDC data version: {client.get_idc_version()}")

IDC data version: v23


## Overall IDC Statistics

In [2]:
stats = client.sql_query("""
    SELECT
        COUNT(DISTINCT collection_id) as collections,
        COUNT(DISTINCT PatientID) as patients,
        COUNT(DISTINCT StudyInstanceUID) as studies,
        COUNT(DISTINCT SeriesInstanceUID) as series,
        SUM(instanceCount) as instances,
        ROUND(SUM(series_size_MB) / 1000, 1) as total_size_GB
    FROM index
""")
stats

Unnamed: 0,collections,patients,studies,series,instances,total_size_GB
0,161,79569,160199,994073,46885909.0,95333.0


## Collection-Level Summary

In [3]:
collections_overview = client.sql_query("""
    SELECT
        collection_id,
        COUNT(DISTINCT PatientID) as num_patients,
        COUNT(DISTINCT StudyInstanceUID) as num_studies,
        COUNT(DISTINCT SeriesInstanceUID) as num_series,
        STRING_AGG(DISTINCT Modality, ', ') as modalities,
        ROUND(SUM(series_size_MB), 1) as total_size_MB
    FROM index
    GROUP BY collection_id
    ORDER BY num_patients DESC
""")

print(f"Total collections: {len(collections_overview)}")
collections_overview.head(20)

Total collections: 161


Unnamed: 0,collection_id,num_patients,num_studies,num_series,modalities,total_size_MB
0,nlst,26410,73574,590572,"SM, CT, SEG, SR",26354752.4
1,cbis_ddsm,6671,6775,6775,MG,163513.4
2,breast_cancer_screening_dbt,5060,5610,22032,MG,1637811.8
3,ccdi_mci,4407,4447,4576,SM,4657163.3
4,victre,2994,8749,8749,MG,1027187.6
5,cmmd,1775,1775,1775,MG,22863.3
6,covid_19_ny_sbu,1384,7361,17950,"CT, DX, MR, OT, CR, PT, SR, NM",511483.8
7,prostate_mri_us_biopsy,1151,2799,10373,"US, MR, SEG, M3D",81643.4
8,tcga_brca,1098,1263,10463,"SEG, MG, SR, ANN, SM, MR",4801057.0
9,lidc_idri,1010,1308,15116,"SEG, SR, CR, DX, CT",136545.5


## Collections Index â€” Cancer Types and Tumor Locations

In [4]:
client.fetch_index("collections_index")

collections_info = client.sql_query("""
    SELECT collection_id, CancerTypes, TumorLocations, Species, Subjects, SupportingData
    FROM collections_index
    ORDER BY Subjects DESC
""")
collections_info.head(20)

2026-02-16 13:09:22,188 - Index collections_index already installed, loading from /Users/af61/github/idc-cda/.venv/lib/python3.11/site-packages/idc_index_data/collections_index.parquet


Unnamed: 0,collection_id,CancerTypes,TumorLocations,Species,Subjects,SupportingData
0,nlst,"Lung Cancer, Non-Cancer",Chest,Human,26410,Clinical
1,cbis_ddsm,"Breast Cancer, Non-Cancer",Breast,Human,6671,Image Analyses
2,breast_cancer_screening_dbt,"Breast Cancer, Non-Cancer",Breast,Human,5060,"Clinical, Software/Source Code"
3,ccdi_mci,Various,Various,Human,4407,
4,victre,Breast Cancer,Breast,Human,2994,Software/Source Code
5,cmmd,Breast Cancer,Breast,Human,1775,Clinical
6,covid_19_ny_sbu,COVID-19 (non-cancer),Lung,Human,1384,"Clinical, Image Analyses"
7,prostate_mri_us_biopsy,Prostate Cancer,Prostate,Human,1151,Image Analyses
8,tcga_brca,Breast Cancer,Breast,Human,1098,"Clinical, Genomics, Image Analyses, Histopatho..."
9,lidc_idri,"Lung Cancer, Non-Cancer, Metastatic disease",Chest,Human,1010,"Clinical, Image Analyses, Software/Source Code"


## Cross-Node Candidate Collections

Collections from TCGA, CPTAC, and HTAN are most likely to have corresponding data in GDC (genomics) and PDC (proteomics).

In [5]:
# Tag collections by program
def tag_program(collection_id):
    cid = collection_id.lower()
    if cid.startswith('tcga_'):
        return 'TCGA'
    elif cid.startswith('cptac_'):
        return 'CPTAC'
    elif cid.startswith('htan_'):
        return 'HTAN'
    else:
        return 'Other'

collections_overview['program'] = collections_overview['collection_id'].apply(tag_program)

program_summary = collections_overview.groupby('program').agg(
    collections=('collection_id', 'count'),
    total_patients=('num_patients', 'sum'),
    total_series=('num_series', 'sum'),
    total_size_MB=('total_size_MB', 'sum')
).sort_values('total_patients', ascending=False)

program_summary

Unnamed: 0_level_0,collections,total_patients,total_series,total_size_MB
program,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Other,112,66380,903078,63992362.7
TCGA,32,11050,76292,26814517.8
CPTAC,13,2151,14340,2880061.6
HTAN,4,68,363,1646019.3


In [6]:
# TCGA collections detail
tcga = collections_overview[collections_overview['program'] == 'TCGA'].copy()
print(f"TCGA: {len(tcga)} collections, {tcga['num_patients'].sum()} total patients")
tcga

TCGA: 32 collections, 11050 total patients


Unnamed: 0,collection_id,num_patients,num_studies,num_series,modalities,total_size_MB,program
8,tcga_brca,1098,1263,10463,"SEG, MG, SR, ANN, SM, MR",4801057.0,TCGA
17,tcga_gbm,607,607,4286,"OT, ANN, SM, SEG",1057632.3,TCGA
18,tcga_ov,591,912,2537,"SEG, SM, CT, MR, OT",516249.7,TCGA
19,tcga_ucec,560,786,4893,"PT, CR, SEG, MR, SM, CT, ANN",2065212.8,TCGA
20,tcga_luad,560,674,4868,"SM, CT, ANN, SEG, PT, NM",1284054.7,TCGA
21,tcga_kirc,537,976,6038,"SM, CT, MR, SEG, CR",923543.4,TCGA
22,tcga_hnsc,523,523,2163,"SEG, SM",574052.9,TCGA
23,tcga_lgg,516,516,1572,SM,1194590.0,TCGA
25,tcga_thca,507,514,1186,"SM, CT, PT",766136.5,TCGA
26,tcga_lusc,504,578,4170,"PT, NM, SEG, SM, CT, ANN",1101495.1,TCGA


In [7]:
# CPTAC collections detail
cptac = collections_overview[collections_overview['program'] == 'CPTAC'].copy()
print(f"CPTAC: {len(cptac)} collections, {cptac['num_patients'].sum()} total patients")
cptac

CPTAC: 13 collections, 2151 total patients


Unnamed: 0,collection_id,num_patients,num_studies,num_series,modalities,total_size_MB,program
44,cptac_ucec,254,396,3459,"RTSTRUCT, US, MR, SM, CT, PT",257358.4,CPTAC
47,cptac_luad,244,319,1585,"PT, MR, CT, SM",521547.8,CPTAC
51,cptac_ccrcc,233,319,2321,"CT, SM, MR, RTSTRUCT, SEG",277023.8,CPTAC
54,cptac_lscc,212,255,1314,"PT, SM, CT",509698.9,CPTAC
57,cptac_brca,198,198,653,SM,123127.4,CPTAC
59,cptac_pda,195,302,2226,"CT, SM, MR, RTSTRUCT, US, PT",170155.5,CPTAC
63,cptac_gbm,178,178,462,SM,100948.5,CPTAC
64,cptac_coad,178,178,372,SM,70234.4,CPTAC
78,cptac_hnscc,112,112,390,SM,111422.2,CPTAC
82,cptac_cm,95,114,597,"MR, SM, CT, PT",156256.7,CPTAC


In [8]:
# HTAN collections detail
htan = collections_overview[collections_overview['program'] == 'HTAN'].copy()
print(f"HTAN: {len(htan)} collections, {htan['num_patients'].sum()} total patients")
htan

HTAN: 4 collections, 68 total patients


Unnamed: 0,collection_id,num_patients,num_studies,num_series,modalities,total_size_MB,program
124,htan_vanderbilt,30,30,30,SM,69497.9,HTAN
130,htan_wustl,21,21,228,"PR, SM",12254.8,HTAN
141,htan_hms,16,16,31,"SM, PR",1232544.7,HTAN
160,htan_ohsu,1,1,74,"SM, PR",331721.9,HTAN


## Extract PatientIDs for Cross-Node Candidate Collections

Get distinct PatientIDs from TCGA and CPTAC collections for later cross-referencing with CDA.

In [9]:
# All TCGA patient IDs
tcga_patients = client.sql_query("""
    SELECT DISTINCT collection_id, PatientID
    FROM index
    WHERE collection_id LIKE 'tcga_%'
    ORDER BY collection_id, PatientID
""")
print(f"TCGA patients in IDC: {len(tcga_patients)}")
tcga_patients.head(10)

TCGA patients in IDC: 11050


Unnamed: 0,collection_id,PatientID
0,tcga_acc,TCGA-OR-A5J1
1,tcga_acc,TCGA-OR-A5J2
2,tcga_acc,TCGA-OR-A5J3
3,tcga_acc,TCGA-OR-A5J4
4,tcga_acc,TCGA-OR-A5J5
5,tcga_acc,TCGA-OR-A5J6
6,tcga_acc,TCGA-OR-A5J7
7,tcga_acc,TCGA-OR-A5J8
8,tcga_acc,TCGA-OR-A5J9
9,tcga_acc,TCGA-OR-A5JA


In [10]:
# All CPTAC patient IDs
cptac_patients = client.sql_query("""
    SELECT DISTINCT collection_id, PatientID
    FROM index
    WHERE collection_id LIKE 'cptac_%'
    ORDER BY collection_id, PatientID
""")
print(f"CPTAC patients in IDC: {len(cptac_patients)}")
cptac_patients.head(10)

CPTAC patients in IDC: 2151


Unnamed: 0,collection_id,PatientID
0,cptac_aml,C3L-00452
1,cptac_aml,C3L-00453
2,cptac_aml,C3L-00455
3,cptac_aml,C3L-00457
4,cptac_aml,C3L-00458
5,cptac_aml,C3L-00459
6,cptac_aml,C3L-00460
7,cptac_aml,C3L-00461
8,cptac_aml,C3L-00465
9,cptac_aml,C3L-00540


In [11]:
# All HTAN patient IDs
htan_patients = client.sql_query("""
    SELECT DISTINCT collection_id, PatientID
    FROM index
    WHERE collection_id LIKE 'htan_%'
    ORDER BY collection_id, PatientID
""")
print(f"HTAN patients in IDC: {len(htan_patients)}")
htan_patients.head(10)

HTAN patients in IDC: 68


Unnamed: 0,collection_id,PatientID
0,htan_hms,HTA7_920
1,htan_hms,HTA7_922
2,htan_hms,HTA7_925
3,htan_hms,HTA7_926
4,htan_hms,HTA7_927
5,htan_hms,HTA7_931
6,htan_hms,HTA7_932
7,htan_hms,HTA7_934
8,htan_hms,HTA7_940
9,htan_hms,HTA7_947


## Modality Distribution Across Programs

In [12]:
modality_by_program = client.sql_query("""
    SELECT
        CASE
            WHEN collection_id LIKE 'tcga_%' THEN 'TCGA'
            WHEN collection_id LIKE 'cptac_%' THEN 'CPTAC'
            WHEN collection_id LIKE 'htan_%' THEN 'HTAN'
            ELSE 'Other'
        END as program,
        Modality,
        COUNT(DISTINCT PatientID) as patients,
        COUNT(DISTINCT SeriesInstanceUID) as series
    FROM index
    GROUP BY program, Modality
    ORDER BY program, patients DESC
""")
modality_by_program

Unnamed: 0,program,Modality,patients,series
0,CPTAC,SM,2107,7377
1,CPTAC,CT,346,3143
2,CPTAC,RTSTRUCT,235,1789
3,CPTAC,MR,81,1907
4,CPTAC,PT,29,80
5,CPTAC,SEG,14,37
6,CPTAC,US,7,7
7,HTAN,SM,68,257
8,HTAN,PR,25,106
9,Other,SEG,34041,160420


## Save Results for Cross-Node Analysis

In [None]:
data_dir = Path('data')
data_dir.mkdir(exist_ok=True)

collections_overview.to_csv(data_dir / 'collections_overview.csv', index=False)
tcga_patients.to_csv(data_dir / 'tcga_patients.csv', index=False)
cptac_patients.to_csv(data_dir / 'cptac_patients.csv', index=False)
htan_patients.to_csv(data_dir / 'htan_patients.csv', index=False)

print("Saved:")
print(f"  collections_overview.csv: {len(collections_overview)} rows")
print(f"  tcga_patients.csv: {len(tcga_patients)} rows")
print(f"  cptac_patients.csv: {len(cptac_patients)} rows")
print(f"  htan_patients.csv: {len(htan_patients)} rows")