In [0]:
import yaml

# Load the notebook configuration. yaml.safe_load returns None for an empty
# file, so fall back to an empty mapping instead of failing later on `.get`.
with open("config.yaml", "r", encoding="utf-8") as file:
    cfg = yaml.safe_load(file) or {}

# The catalog table name is mandatory: fail early with a clear message rather
# than handing None to the widget API.
table = cfg.get("table")
if not table:
    raise ValueError("config.yaml must define a non-empty 'table' entry")

# Publish the table name as a text widget named "table"; the %sql cells in this
# notebook reference it through the :table parameter marker.
dbutils.widgets.text("table", table)
table

# Summarize storage requirements


In [0]:
from pyspark.sql.functions import expr

# Reader that walks the directory tree recursively and keeps only *.dcm files.
binary_reader = (
    spark.read.format("binaryFile")
    .option("recursiveFileLookup", "true")
    .option("pathGlobFilter", "*.dcm")
)

# List the files, discard the binary payload column, and derive the
# SeriesInstanceUID from the second-to-last '/'-separated path component
# (the name of the directory that contains the file).
df = (
    binary_reader.load("/Volumes/hls_radiology/tcia/downloads/unzipped/")
    .drop("content")
    .withColumn("SeriesInstanceUID", expr("split_part(path, '/', -2)"))
)

# Mark the listing for caching and expose it to SQL cells as global_temp.unzipped.
df.cache()
df.createOrReplaceGlobalTempView("unzipped")

In [0]:
%sql
-- Show every row of the global temp view "unzipped".
SELECT *
FROM global_temp.unzipped

In [0]:
%sql
-- Storage summary: for the catalog named by the :table widget ("htj2k dcms"),
-- the unzipped file listing ("uncompressed dcms") and the source archives
-- ("zip files"), report distinct series, distinct files and total length / 1024^3.
-- The last two rows are restricted to series that appear in the :table catalog.
with base as (
  -- rows of the :table catalog whose extension is 'dcm'
  select
    meta:['0020000D'].Value[0] as StudyInstanceUID,
    meta:['0020000E'].Value[0] as SeriesInstanceUID,
    local_path as dcm_path,
    length as dcm_length
  from IDENTIFIER(:table) t
  where extension = 'dcm'
), zips as (
  select
    -- last '/'-separated segment of original_path with its final 4 characters removed
    substring(get(split(original_path, '/'), size(split(original_path, '/')) - 1), 1, length(get(split(original_path, '/'), size(split(original_path, '/')) - 1)) - 4) as SeriesInstanceUID,
    original_path as zip_path,
    -- one length value per original_path
    first(length) as zip_length
  from hls_radiology.tcia.object_catalog_unzip
  where lower(right(original_path,3)) = 'zip'
  group by original_path
), dcms as (
  select
    SeriesInstanceUID,
    length as orig_length,
    path as orig_path
  from global_temp.unzipped
), aggs as (
  -- column names of a UNION come from the first branch: label, SeriesInstanceUIDs, files, length
  select 'htj2k dcms' as label, count(distinct base.SeriesInstanceUID) as SeriesInstanceUIDs, count(distinct dcm_path) as files, sum(dcm_length) as length from base
  union all
  select 'uncompressed dcms', count(distinct dcms.SeriesInstanceUID), count(distinct orig_path), sum(orig_length) from dcms
  where SeriesInstanceUID in (select distinct SeriesInstanceUID from base)
  union all
  select 'zip files', count(distinct zips.SeriesInstanceUID), count(distinct zip_path), sum(zip_length) from zips
  where SeriesInstanceUID in (select distinct SeriesInstanceUID from base)
)
select Label, SeriesInstanceUIDs, Files, round(length/1024/1024/1024,2) as `Length (GB)`
from aggs
-- sort in the outermost query: an ORDER BY inside a CTE does not guarantee the final row order
order by Label desc

In [0]:
%sql
-- Per-series size comparison between the original DICOM files, their source zip
-- and the HTJ2K catalog named by the :table widget. Returns series whose total
-- DICOM length exceeds 20x the zip length, largest zip first.
--
-- Both file-level inputs are aggregated to one row per series BEFORE joining.
-- Joining file-level rows on SeriesInstanceUID would pair every original file
-- with every HTJ2K file of the series and inflate sum(dcm_length).
with htj2k as (
  select
    meta:['0020000E'].Value[0] as SeriesInstanceUID,
    -- total HTJ2K length of the series
    sum(length) as htj2k_length
  from IDENTIFIER(:table) t
  where extension = 'dcm'
  group by all
), zips as (
  select
    -- last '/'-separated segment of original_path with its final 4 characters removed
    substring(get(split(original_path, '/'), size(split(original_path, '/')) - 1), 1, length(get(split(original_path, '/'), size(split(original_path, '/')) - 1)) - 4) as SeriesInstanceUID,
    original_path as zip_path,
    first(length) as zip_length
  from hls_radiology.tcia.object_catalog_unzip
  where lower(right(original_path,3)) = 'zip'
  group by original_path
), dcms as (
  select
    meta:['0020000D'].Value[0] as StudyInstanceUID,
    meta:['0020000E'].Value[0] as SeriesInstanceUID,
    meta:['00080060'].Value[0] as Modality,
    -- total length and number of distinct files per study / series / modality
    sum(length) as dcm_length,
    count(distinct local_path) as files
  from hls_radiology.tcia.object_catalog
  where right(local_path,3) = 'dcm'
  group by all
), lenaggs as (
  select
    dcms.StudyInstanceUID,
    dcms.seriesinstanceuid,
    dcms.modality,
    htj2k.htj2k_length,
    zips.zip_length,
    dcms.dcm_length,
    dcms.files
  from dcms
  join zips on dcms.SeriesInstanceUID = zips.SeriesInstanceUID
  join htj2k on dcms.SeriesInstanceUID = htj2k.SeriesInstanceUID
)
select *
from lenaggs
where dcm_length > 20 * zip_length
order by zip_length desc


Databricks visualization. Run in Databricks to view.

In [0]:
%sql
-- Catalog rows whose local_path contains the given series directory name.
SELECT *
FROM hls_radiology.tcia.object_catalog
WHERE contains(local_path, '/1.3.6.1.4.1.14519.5.2.1.8700.9920.146883340873245283101117847180')

In [0]:
%sh
# Directory tree of one unzipped series with human-readable sizes; --du makes
# each directory report the accumulated size of its contents.
tree -h --du /Volumes/hls_radiology/tcia/downloads/dicom_unzipped/1.3.6.1.4.1.14519.5.2.1.8700.9920.486285013167207558255813049278/

In [0]:
%sh
# Disk usage (human-readable) of one unzipped series directory.
du --human-readable /Volumes/hls_radiology/tcia/downloads/dicom_unzipped/1.3.6.1.4.1.14519.5.2.1.8700.9920.486285013167207558255813049278/

In [0]:
%sh
# Disk usage (human-readable) of the zip archive for the same series UID.
du --human-readable /Volumes/hls_radiology/tcia/downloads/tciaDownload/1.3.6.1.4.1.14519.5.2.1.8700.9920.486285013167207558255813049278.zip

In [0]:
%sh
# Total size of the download directory, then the number of regular files in it.
du -sh /Volumes/hls_radiology/tcia/downloads/tciaDownload
find /Volumes/hls_radiology/tcia/downloads/tciaDownload -type f | wc -l

In [0]:
%sh
# Total size (human-readable, single summary line) of the unzipped directory.
du -sh /Volumes/hls_radiology/tcia/downloads/unzipped


In [0]:
%sh
# Number of regular *.dcm files below the unzipped directory.
find /Volumes/hls_radiology/tcia/downloads/unzipped -type f -name '*.dcm' | wc -l

In [0]:
%sql
-- Distinct catalogued 'dcm' paths under the unzipped volume and their summed
-- length divided by 1024^3, rounded to 3 decimals.
SELECT
  count(distinct local_path) AS cnt,
  round(sum(length) / (1024 * 1024 * 1024), 3) AS GBs
FROM hls_radiology.tcia.object_catalog
WHERE extension = 'dcm'
  AND local_path LIKE '/Volumes/hls_radiology/tcia/downloads/unzipped/%'
GROUP BY ALL

In [0]:
%sql
-- Five sample rows from the global temp view "unzipped".
SELECT *
FROM global_temp.unzipped
LIMIT 5

In [0]:
%sql
-- Create the volume passed to the pixels Catalog in a later cell. IF NOT EXISTS keeps
-- the cell re-runnable: a plain CREATE VOLUME fails once the volume already exists.
create volume if not exists hls_radiology.tcia.pixels_20251128;

In [0]:
import os  # needed for os.path.exists below; no visible cell imports it

from dbx.pixels import Catalog
from dbx.pixels.dicom import DicomMetaExtractor  # The Dicom transformers

# Rebinds the Python name `table` (an earlier cell set it from config.yaml).
# No widget call is made here, so the "table" widget keeps its existing value.
table = "hls_radiology.tcia.object_catalog_20251128"
volume = "hls_radiology.tcia.pixels_20251128"
output_path = "/Volumes/hls_radiology/tcia/downloads/unzipped"

# NOTE(review): `escape_table` and `write_mode` are not defined in any cell
# visible in this notebook; presumably they come from a setup notebook or an
# earlier session — confirm before relying on Restart & Run All.
if os.path.exists(output_path):
    catalog = Catalog(
        spark,
        table=escape_table(table),
        volume=volume)

    # Catalog the files under output_path, run the DICOM metadata extractor over
    # that listing, and save the result through the Catalog.
    catalog_df = catalog.catalog(path=output_path)
    meta_df = DicomMetaExtractor(catalog, deep=False).transform(catalog_df)
    catalog.save(meta_df, mode=write_mode)
else:
    # Make the skip visible instead of silently doing nothing.
    print(f"Skipping catalog build: {output_path} does not exist")


In [0]:
%sh
# Long listing, including dot entries, of the download directory.
ls -la /Volumes/hls_radiology/tcia/downloads/tciaDownload/

In [0]:
%sh
# Long listing of one specific zip archive in the download directory.
ls -la /Volumes/hls_radiology/tcia/downloads/tciaDownload/1.1.146.0.1.4587991.5.760.4003968910029450044.zip

In [0]:
%sql
-- Row-by-row comparison of catalogued 'dcm' entries under the unzipped volume with
-- the file listing in global_temp.unzipped, matched on path. The LEFT OUTER JOIN
-- keeps catalog rows that have no matching listed file (uz columns are NULL).
SELECT
  oc.path,
  oc.length,
  uz.path,
  uz.length
FROM hls_radiology.tcia.object_catalog AS oc
LEFT OUTER JOIN global_temp.unzipped AS uz
  ON oc.path = uz.path
WHERE oc.extension = 'dcm'
  AND oc.path LIKE 'dbfs:/Volumes/hls_radiology/tcia/downloads/unzipped/%'
ORDER BY oc.path

In [0]:
%sql
-- Aggregate form of the catalog-vs-listing comparison: distinct path counts and
-- summed lengths on both sides of the LEFT OUTER JOIN on path.
SELECT
  count(distinct oc.path) AS cnt,
  sum(oc.length),
  count(distinct uz.path) AS cnt_unzipped,
  sum(uz.length)
FROM hls_radiology.tcia.object_catalog AS oc
LEFT OUTER JOIN global_temp.unzipped AS uz
  ON oc.path = uz.path
WHERE oc.extension = 'dcm'
  AND oc.path LIKE 'dbfs:/Volumes/hls_radiology/tcia/downloads/unzipped/%'

In [0]:
%sql
-- Summed length of all rows in global_temp.unzipped, divided by 1024^3.
SELECT sum(length)/(1024*1024*1024)
FROM global_temp.unzipped

In [0]:
%sh
# First 10 lines of a long listing of *.dcm files one directory level below dicom_unzipped.
ls -la /Volumes/hls_radiology/tcia/downloads/dicom_unzipped/*/*.dcm | head -n 10

In [0]:
%sh
# Number of regular files below dicom_unzipped.
find /Volumes/hls_radiology/tcia/downloads/dicom_unzipped -type f | wc -l

In [0]:
%sh
# First 20 lines of a long, human-readable listing of everything matched by dicom_unzipped/*.
ls -lah /Volumes/hls_radiology/tcia/downloads/dicom_unzipped/* | head -n 20

In [0]:
%sh
# Disk usage (human-readable) of dicom_unzipped, one line per directory.
du --human-readable /Volumes/hls_radiology/tcia/downloads/dicom_unzipped/