In [0]:
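# Not needed if the notebook environment already specifies these dependencies.
# If re-enabled, keep the quotes: an unquoted `>` or `[...]` can be interpreted by the shell instead of pip.
# %pip install numpy==1.26.4 pydicom==3.0.1 "nvidia-nvimgcodec-cu12[all]" highdicom "pylibjpeg>=2.0" "pylibjpeg-openjpeg>=2.0"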

In [0]:
# Restart the notebook's Python process via the Databricks `dbutils` utility.
# NOTE(review): presumably here so freshly installed packages take effect; the
# %pip line in the first cell is currently commented out — confirm this restart
# is still needed.
dbutils.library.restartPython()

In [0]:
    # Load IPython's autoreload extension in mode 2 so edits to imported
    # helper modules are picked up without restarting the kernel.
    %load_ext autoreload
    %autoreload 2

In [0]:
import yaml

# Load the run configuration. The context manager closes the file handle
# promptly instead of leaking it until garbage collection.
with open('config.yaml') as config_file:
    # An empty YAML file parses to None; fall back to an empty mapping so the
    # lookups below fail with a clear message rather than on `None.get`.
    cfg = yaml.safe_load(config_file) or {}

compression = "nvImage_HTJ2K_progressive"
input_path = cfg.get("input_path")

# `output_path` is a template containing the literal placeholder
# "{compression}", which is substituted with the codec label above.
output_path_template = cfg.get("output_path")
if output_path_template is None:
    raise KeyError("config.yaml must define 'output_path'")
output_path = output_path_template.replace("{compression}", compression)

table = "main.pixels_solacc.object_catalog"
experiment_log_dir = "/Volumes/douglas_moore/mlflow/experiments"

# Display the resolved paths as the cell output.
input_path, output_path

In [0]:
import logging
# Configure the root logger at INFO level.
logging.basicConfig(level=logging.INFO)
# Standard-library helpers used by later cells: timing (time), sys.path edits (sys), path/stat handling (os).
import time
import sys
import os

In [0]:
import pydicom

In [0]:
# Make the workspace folder that holds `convert_htj2k.py` importable.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
# NOTE(review): this is a user-specific absolute path; consider moving it to config.yaml.
TRANSCODING_DIR = '/Workspace/Users/douglas.moore@databricks.com/pixels-jpeg2000/notebooks/transcoding'
if TRANSCODING_DIR not in sys.path:
    sys.path.append(TRANSCODING_DIR)
from convert_htj2k import transcode_dicom_to_htj2k

In [0]:
# NOTE: importing `slice` here shadows Python's builtin `slice` for the rest of the notebook.
from pyspark.sql.functions import array_join, col, concat, size, slice, split

# Split each path on '/', drop the last component, and re-join the rest.
path_parts = split(col('path'), '/')
parent_dirs = array_join(slice(path_parts, 1, size(path_parts) - 1), '/')

# 'dir' is whatever follows the first ':' in the re-joined parent path;
# only the distinct values are kept.
df = (
    spark.read.table("hls_radiology.tcia.object_catalog")
    .withColumn('dirs', parent_dirs)
    .withColumn('dir', split(col('dirs'), ':')[1])
    .select('dir')
    .distinct()
)

display(df)

In [0]:
import glob
import subprocess
import sys

import mlflow
import pydicom

#mlflow.set_experiment(cfg.get("experiment_name"))

# One MLflow run per compression setting: logs the config, environment,
# transcode duration, and per-file size / DICOM header information.
with mlflow.start_run(run_name=f"compress_{compression}", log_system_metrics=True) as run:

    # Keep the exact configuration used for this run next to its results.
    mlflow.log_artifact("config.yaml", artifact_path="config")

    # Record the visible GPU(s). nvidia-smi is invoked directly (no shell);
    # a missing binary raises an OSError subclass, a failing one CalledProcessError.
    try:
        gpu_info = subprocess.check_output(["nvidia-smi", "-L"]).decode().strip()
    except (subprocess.CalledProcessError, OSError):
        gpu_info = "No GPU detected"
    mlflow.log_param("gpu_info", gpu_info)

    # Snapshot the installed packages. `sys.executable -m pip` targets the
    # interpreter running this notebook rather than whichever `pip` is on PATH.
    with open("full_requirements.txt", "w") as f:
        subprocess.run([sys.executable, "-m", "pip", "freeze"], stdout=f)
    mlflow.log_artifact("full_requirements.txt")

    # Optionally log code snapshot
    #mlflow.log_artifact("notebook.py", artifact_path="source_code")

    # Run-level parameters, sent in a single batched call.
    mlflow.log_params({
        "compression": compression,
        "input_path": input_path,
        "output_path": output_path,
        "encoder": "nvimgcodec",
    })

    # Run the compression and time it with a monotonic clock.
    from convert_htj2k import transcode_dicom_to_htj2k

    start = time.perf_counter()
    transcode_dicom_to_htj2k(
        input_dir=input_path,
        output_dir=output_path
    )
    duration = time.perf_counter() - start
    print(f"{duration:.2f}s")
    mlflow.log_metric("duration_seconds", round(duration, 2))

    # Per-file metrics. Inputs and outputs are paired by sorted position.
    # NOTE(review): this assumes the transcoder writes one output per input
    # with the same sort order — confirm against convert_htj2k.
    input_files = sorted(glob.glob(os.path.join(input_path, "*.dcm")))
    output_files = sorted(glob.glob(os.path.join(output_path, "*.dcm")))

    if len(input_files) != len(output_files):
        # zip() below stops at the shorter list, so make the mismatch visible
        # instead of silently skipping (or mis-pairing) files.
        print(
            f"WARNING: {len(input_files)} input files but {len(output_files)} output files; "
            "per-file metrics may be incomplete or mis-paired"
        )

    for in_file, out_file in zip(input_files, output_files):
        in_name = os.path.basename(in_file)
        out_name = os.path.basename(out_file)

        input_size = os.stat(in_file).st_size
        output_size = os.stat(out_file).st_size
        # Guard both ratios against zero-byte files.
        compression_ratio = round(input_size / output_size, 2) if output_size else 0
        savings_percent = round(100 * (input_size - output_size) / input_size, 2) if input_size else 0

        # One batched call instead of four separate requests per file.
        mlflow.log_metrics({
            f"{in_name}_input_size_bytes": input_size,
            f"{out_name}_output_size_bytes": output_size,
            f"{out_name}_compression_ratio": compression_ratio,
            f"{out_name}_savings_percent": savings_percent,
        })

        print(f"{in_file}")
        print(f"{out_file}")
        print(f"Input  size: {input_size:>15,}")
        print(f"Output size: {output_size:>15,}")
        print(f"Reduction:   {compression_ratio:>18.2f}x")
        print(f"Savings:     {savings_percent:>18.2f}%")

        # Read only the DICOM header (no pixel data) of the source and the
        # transcoded file, logging the read time and identifying UIDs for each.
        # Separate timing names keep `duration` equal to the transcode time.
        # NOTE(review): the read duration is numeric and would fit better as a
        # metric; it stays a param so the logged key names are unchanged.
        for dicom_path, side in ((in_file, "in"), (out_file, "out")):
            with open(dicom_path, "rb") as f:
                read_start = time.perf_counter()
                ds = pydicom.dcmread(f, stop_before_pixels=True)
                read_seconds = time.perf_counter() - read_start

            key_prefix = f"{os.path.basename(dicom_path)}_{side}"
            mlflow.log_params({
                f"{key_prefix}_decode_duration_no_pixels": read_seconds,
                f"{key_prefix}_sop_instance_uid": ds.SOPInstanceUID,
                f"{key_prefix}_sop_class_uid": ds.SOPClassUID,
                f"{key_prefix}_transfer_syntax_uid": ds.file_meta.TransferSyntaxUID,
            })

    

## DECODE verification

In [0]:
# NOTE(review): the `with mlflow.start_run(...)` cell should already end its
# run when the block exits; this looks like a safeguard for a run left active
# after an interrupted cell — confirm it is still needed.
mlflow.end_run()

Reference — Databricks serverless GPU compute, distributed training example: https://docs.databricks.com/aws/en/machine-learning/sgc-examples/gpu-distributed-training

In [0]:
from serverless_gpu import distributed

@distributed(gpus=1, gpu_type='a10', remote=True)
def foo(x):
    """Print a greeting tagged with ``x`` and hand ``x`` back unchanged.

    Minimal smoke-test function for the ``distributed`` decorator.
    NOTE(review): presumably the decorator arguments request one remote A10
    GPU — confirm against the serverless_gpu documentation.
    """
    print('hello_world', x)
    return x


# Display the decorated object as the cell output.
foo

In [0]:
# Call the decorated function through its `.distributed` entry point with x=5.
# NOTE(review): presumably this launches `foo` on the resources requested in
# the decorator — confirm against the serverless_gpu documentation.
foo.distributed(x=5)