In [0]:
%pip install pydicom==3.0.1

In [0]:
import logging
# Notebook-wide logger. The name "zipdcm" is shared process-wide, so anything
# attached to it survives cell re-runs for the lifetime of the kernel.
logger = logging.getLogger("zipdcm")
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(filename)s:%(lineno)s - %(levelname)s - %(message)s')

# Console handler. logging.getLogger() returns the same object on every call,
# so unconditionally calling addHandler() here would stack one more
# StreamHandler each time this cell is re-run and every message would be
# printed once per run. Look up the handler by name and reuse it instead.
handler = next(
    (h for h in logger.handlers if h.get_name() == "zipdcm-notebook-console"),
    None,
)
if handler is None:
    handler = logging.StreamHandler()
    handler.set_name("zipdcm-notebook-console")
    # Add the handler to the logger (first run only)
    logger.addHandler(handler)

# Always (re)apply the formatter so edits to the format string take effect on re-run.
handler.setFormatter(formatter)

In [0]:
from pyspark.sql.datasource import DataSource, DataSourceReader
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import StructType
from zip_dcm_ds import ZipDCMDataSource

def test_single(spark):
    # test a single zip (with a dcm and a license file)
    df = (
        spark.read
            .option('numPartitions','1')
            .format("zipdcm")
            .load("./resources/dcms/3.5.574.1.3.9030958.6.376.2860280475000825621.zip")
    )
    result = df.collect()
    assert len(result) == 1
    print(result)

def test_folder(spark):
    # test on a folder of zips
    df = (
        spark.read
          .option('numPartitions','2')
          .format("zipdcm")
          .load("./resources/dcms")
    )
    df.write.format('delta').mode('overwrite').saveAsTable("douglas_moore.pixels.zipdcm_test")

def test_scale(spark, numPartitions=128):
    df = (
        spark.read
          .option('numPartitions',f'{numPartitions}')
          .format("zipdcm")
          .load("/Volumes/hls_radiology/tcia/downloads/tciaDownload")
    )
    df.write.format('delta').mode('overwrite').saveAsTable("douglas_moore.pixels.zipdcm_test")

In [0]:
# Count the *.zip files found anywhere beneath the download volume.
# The 'content' column is dropped from the frame, so only the remaining
# binaryFile columns are kept.
df = (
    spark.read.format("binaryFile")
    .options(pathGlobFilter="*.zip", recursiveFileLookup="true")
    .load("/Volumes/hls_radiology/tcia/downloads/tciaDownload")
    .drop('content')
)
df.count()

In [0]:
import time

# perf_counter() is a monotonic clock meant for measuring intervals; the
# wall clock returned by time.time() can be adjusted while the job runs,
# which would skew the elapsed time.
start_time = time.perf_counter()
# Hard-coded core count used only to scale elapsed time into core-seconds;
# presumably the cluster's total worker cores -- confirm against the cluster.
total_worker_cores = 32

# Add our custom Python DataSource for DICOM files stored in a Zip Archive
spark.dataSource.register(ZipDCMDataSource)

#test_single(spark)
#test_folder(spark)
test_scale(spark, 128)

end_time = time.perf_counter()
operation_time = end_time - start_time
logger.info(f"Operation time: {operation_time} seconds and {operation_time * total_worker_cores} core-seconds")

In [0]:
import time

# perf_counter() is a monotonic clock meant for measuring intervals; the
# wall clock returned by time.time() can be adjusted while the job runs,
# which would skew the elapsed time.
start_time = time.perf_counter()
# Hard-coded core count used only to scale elapsed time into core-seconds;
# presumably the cluster's total worker cores -- confirm against the cluster.
total_worker_cores = 32

# Add our custom Python DataSource for DICOM files stored in a Zip Archive
spark.dataSource.register(ZipDCMDataSource)

#test_single(spark)
#test_folder(spark)
test_scale(spark, 64)

end_time = time.perf_counter()
operation_time = end_time - start_time
logger.info(f"Operation time: {operation_time} seconds and {operation_time * total_worker_cores} core-seconds")

## Summary stats

In [0]:
%sql
-- One-row summary of the table: distinct zipfile values, distinct
-- (zipfile, dcmfile) combinations, distinct rowid values, and total rows.
select
    count(distinct zipfile) as zipfiles,
    count(distinct zipfile, dcmfile) as dcmfiles,
    count(distinct rowid),
    count(1) as records
from douglas_moore.pixels.zipdcm_test

In [0]:
%sql
-- Show at most 100 rows (all columns) of the table for a quick visual check.
select * 
from douglas_moore.pixels.zipdcm_test
limit 100;