In [0]:
%load_ext autoreload
%autoreload 2
# Enables autoreload; learn more at https://docs.databricks.com/en/files/workspace-modules.html#autoreload-for-python-modules
# To disable autoreload, run `%autoreload 0`.

# HTJ2K (High-Throughput JPEG 2000) compression job using nvImageCodec


In [0]:
%pip install --quiet numpy==1.26.4 pydicom==3.0.1 nvidia-nvimgcodec-cu12[all] highdicom nvidia-ml-py
# NOTE(review): numpy and pydicom are pinned, but nvidia-nvimgcodec-cu12, highdicom and
# nvidia-ml-py are not; consider pinning them too so reruns resolve the same versions.

In [0]:
import os
import sys

# Add the MONAI DICOM converter / HTJ2K encoder checkout to the import path.
# Presumably this append must run before the `monailabel` import below for it to
# resolve — TODO confirm the package is not installed some other way.
# NOTE(review): the path is hard-coded to a single user's workspace folder; consider
# making it configurable so other users can run the notebook.
sys.path.append('/Workspace/Users/douglas.moore@databricks.com/MONAILabel-htj2k')
from monailabel.datastore.utils.convert_multiframe import batch_convert_by_series


In [0]:
import yaml

# Load the job settings. The context manager closes config.yaml as soon as it has
# been parsed instead of leaving the handle open until garbage collection.
with open('config.yaml') as config_file:
    cfg = yaml.safe_load(config_file)

# Settings read from config.yaml; cfg.get() yields None for a missing key.
input_path = cfg.get("input_path")
output_path = cfg.get("output_path")
experiment_name = cfg.get("experiment_name")
table = cfg.get("table")
CODE_FOLDER = cfg.get('code_folder')
max_workers = 4

experiment_log_dir = "/Volumes/douglas_moore/mlflow/experiments"

# Fail fast with a readable message: later cells concatenate these values into
# paths, where a missing setting would only surface as an opaque TypeError or as
# output written under a literal "None/" directory.
missing_settings = [
    name
    for name, value in (
        ("input_path", input_path),
        ("output_path", output_path),
        ("code_folder", CODE_FOLDER),
    )
    if not value
]
if missing_settings:
    raise ValueError(
        f"config.yaml is missing required setting(s): {', '.join(missing_settings)}"
    )

# Make modules stored in the configured code folder importable.
sys.path.append(CODE_FOLDER)

#set GPU logging
#os.environ['PYNVIMGCODEC_VERBOSITY'] = '3'

# Echo the effective settings as the cell output.
input_path, output_path, table, max_workers

In [0]:
import logging
import os
import sys
import time

log_level = logging.INFO
logger_format = '%(asctime)s [%(levelname)s] [%(name)s:%(lineno)d] %(message)s'
formatter = logging.Formatter(logger_format)

# Named logger used by the rest of the notebook (standard-library logging).
logger = logging.getLogger("Pixels-HTJ2K")
# Set the level on the logger itself; otherwise it inherits the root logger's level
# (WARNING unless configured), and INFO records are dropped before any handler
# sees them.
logger.setLevel(log_level)

# Logger.hasHandlers() is also true when only an ancestor (e.g. the root logger)
# has handlers; logger.handlers is then empty and indexing it raises IndexError.
# Test the logger's own handler list instead.
if logger.handlers:
    handler = logger.handlers[0]
    handler.setLevel(log_level)
    handler.setFormatter(formatter)
else:
    # basicConfig() is a no-op when the root logger already has handlers.
    logging.basicConfig(level=log_level, format=logger_format)

# Also write the log to a file in the code folder. Skip the handler when it is
# already attached so that re-running this cell does not duplicate every log line.
log_file = os.path.abspath(CODE_FOLDER + '/app.log')
already_attached = any(
    isinstance(existing, logging.FileHandler) and existing.baseFilename == log_file
    for existing in logger.handlers
)
if not already_attached:
    print("Adding a file handler for logging")
    file_handler = logging.FileHandler(log_file)
    file_handler.setLevel(log_level)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

# Silence chatty py4j log output below ERROR.
logging.getLogger("py4j").setLevel(logging.ERROR)

In [0]:
# Collect the input directories to convert. os.listdir() returns entries in
# arbitrary order and also returns plain files, so sort for a reproducible run
# order and keep only directories (the conversion loop globs inside each entry).
input_dirs = sorted(
    candidate
    for candidate in (os.path.join(input_path, entry) for entry in os.listdir(input_path))
    if os.path.isdir(candidate)
)
logger.info(f"Processing {len(input_dirs)} input_dirs")

In [0]:
from pathlib import Path
import json

# One record per input directory: where it came from, where it was written, and
# the stats returned by the converter.
all_stats = []
# Count every directory the loop visits so the "(i/N)" progress message is correct
# (counting only a 10-item slice produced messages such as "(11/10)").
num_dirs = len(input_dirs)
# Use a real loop variable name: `_` is IPython's "last output" variable.
for i, dir_path in enumerate(input_dirs):
    input_dir = Path(dir_path)
    # Every .dcm file anywhere below this directory.
    input_files = [str(f) for f in input_dir.rglob("*.dcm")]
    output_dir = f"{output_path}/{input_dir.name}"
    # The converter takes an iterable of (input files, output directory) pairs.
    file_loader = [(input_files, output_dir)]
    logger.info(f"Converting dir ({i+1}/{num_dirs}): {input_dir}, {len(input_files)} Files ... to {output_dir}")
    stats = batch_convert_by_series(
        file_loader=file_loader,
        compress_htj2k=True,
    )
    all_stats.append({
        # Store the path as a string: a Path object is not JSON serializable and
        # would make the later json.dumps(all_stats) raise TypeError.
        "input_dir": str(input_dir),
        "output_dir": output_dir,
        "stats": stats})



In [0]:
# Display the per-directory records collected in all_stats.
all_stats

In [0]:
# Persist the collected stats alongside the converted output.
# Serialize before opening the file so a serialization error cannot leave a
# truncated stats.json behind. default=str renders values json cannot encode
# natively (e.g. pathlib.Path) as strings instead of raising TypeError.
stats_json = json.dumps(all_stats, default=str)
# The context manager guarantees the file is flushed and closed.
with open(f"{output_path}/stats.json", "w") as stats_file:
    stats_file.write(stats_json)

In [0]:
%skip
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd

#
# Run multiple instances of htj2k converter in parallel to feed data faster via the CPU vCores
#

def distributed_htj2k_compressor(args):
    """Run the single-frame-to-multiframe converter for one input directory.

    Args:
        args: An ``(input_dir, output_file)`` pair, packed into one tuple so the
            function can be submitted to an executor with a single argument.

    Returns:
        Whatever ``convert_single_frame_dicom_series_to_multiframe`` returns.
    """
    # NOTE(review): convert_single_frame_dicom_series_to_multiframe is not imported
    # anywhere in the visible notebook — confirm where it comes from before
    # re-enabling this cell.
    input_dir, output_file = args
    logger.info(f"Processing {input_dir} -> {output_file}")

    conversion_options = {
        "input_dir": input_dir,
        "output_dir": output_file,
        "convert_to_htj2k": True,
        "add_basic_offset_table": True,
    }
    return convert_single_frame_dicom_series_to_multiframe(**conversion_options)

## Run in parallel, multiple-CPUs for every GPU
input_dirs = os.listdir(input_path)
input_dirs = [input_path + "/" + d for d in input_dirs]
logger.info(f"Processing {len(input_dirs)}")

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = [executor.submit(distributed_htj2k_compressor, (i, output_path)) for i in input_dirs]
    # Read the results in submission order so results[k] belongs to input_dirs[k].
    # as_completed() yields futures in completion order, which paired each result
    # with the wrong input_dir row in the table below.
    results = [future.result() for future in futures]

# One row per input directory with the converter's result for that directory.
display(
    pd.DataFrame(
        {
            "input_dir": input_dirs,
            "output_dir": [output_path] * len(input_dirs),
            "result": results
        }
    )
)

In [0]:
%sh ls -lah /Volumes/hls_radiology/tcia/htj2k_v8/
# NOTE(review): this listing path is hard-coded; confirm it matches the output_path
# configured for the run.