In [0]:
%pip install --quiet numpy==1.26.4 pydicom==3.0.1 nvidia-nvimgcodec-cu12[all] highdicom nvidia-ml-py

In [0]:
%load_ext autoreload
%autoreload 2
# Enables autoreload; learn more at https://docs.databricks.com/en/files/workspace-modules.html#autoreload-for-python-modules
# To disable autoreload; run %autoreload 0

# sgc test 3



In [0]:
dbutils.library.restartPython()

In [0]:
%sh nvidia-smi

In [0]:
import logging

# Record layout: timestamp, level, logger name with line number, message.
logger_format = '%(asctime)s [%(levelname)s] [%(name)s:%(lineno)d] %(message)s'

# Configure the root logger so the settings apply to every module's records.
logger = logging.getLogger()
# The root logger defaults to WARNING. Lowering only the handler's level is not
# enough: INFO records would be discarded by the logger before any handler
# sees them, so the logger level has to be set as well.
logger.setLevel(logging.INFO)

if logger.handlers:
    # A handler is already installed (notebook environments usually add one):
    # reuse the first one and give it the plain, colour-free format.
    handler = logger.handlers[0]
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter(logger_format)
    handler.setFormatter(formatter)
else:
    # No handler yet: let basicConfig create one with the same level/format.
    logging.basicConfig(level=logging.INFO, format=logger_format)


# Only let ERROR and above through from the py4j logger.
logging.getLogger("py4j").setLevel(logging.ERROR)

# In your notebook
import time
import sys
import os

In [0]:
import yaml

# Load the run configuration. The context manager guarantees the file handle
# is closed even if parsing fails.
with open('config.yaml') as config_file:
    cfg = yaml.safe_load(config_file)

# Compression label; substituted into the "{compression}" placeholder of the
# configured output path.
compression = "nvImage_HTJ2K_progressive"
input_path = cfg.get("input_path")
output_path = cfg.get("output_path").replace("{compression}", compression)
experiment_name = cfg.get("experiment_name")

table = "main.pixels_solacc.object_catalog"
experiment_log_dir = "/Volumes/douglas_moore/mlflow/experiments"

# Batch sizes for the data loader and for the GPU transcoder call.
data_batch_size=1024
gpu_max_batch_size = 1024
# Upper bound on the number of distinct directories selected for transcoding.
folder_limit = 10000
num_gpus = 1
# Folder holding the helper modules imported by later cells.
CODE_FOLDER = '/Workspace/Users/douglas.moore@databricks.com/pixels-jpeg2000/notebooks/transcoding'
# Guarded so that re-running this cell does not stack duplicate entries.
if CODE_FOLDER not in sys.path:
    sys.path.append(CODE_FOLDER)

#os.environ['PYNVIMGCODEC_VERBOSITY'] = '5'

input_path, output_path

#set GPU logging


## Pull dir paths to compress

In [0]:
import pandas as pd
import numpy as np
from pyspark.sql.functions import col, split, size, slice
from pyspark.sql.functions import col, split, size, slice, concat
from pyspark.sql.functions import array_join

#
## Chunk up dirs
#

# Build the distinct list of directories to transcode, then split it into one
# chunk per GPU.
# NOTE(review): the table name is hard-coded here while the config cell defines
# `table` with a different value -- confirm which catalog is intended.
df = spark.read.table("hls_radiology.tcia.object_catalog")
# Keep only files that carry DICOM tag (0028,0100) Bits Allocated.
df = df.where("meta:['00280100'].Value[0] is not null") # remove dicom files not having required bitsAllocated
# Drop files with (0028,0002) Samples per Pixel = 3 but no (0028,0006) Planar
# Configuration.
df = df.where("not (meta:['00280002'].Value[0] = 3 and meta:['00280006'].Value[0] is null)")

# Split the path once and reuse the expression: everything except the last
# '/'-separated component is the containing directory.
path_parts = split(col('path'), '/')
df = df.withColumn('dirs', array_join(slice(path_parts, 1, size(path_parts) - 1), '/'))
# Keep the piece after the first ':' (presumably strips a scheme prefix such
# as "dbfs:" -- TODO confirm against the catalog's path format).
df = df.withColumn('dir', split(col('dirs'),':')[1])
df = df.select('dir').distinct().orderBy('dir').limit(folder_limit)

pdf = df.toPandas()
# Split row positions instead of the DataFrame itself: np.array_split on a
# DataFrame relies on DataFrame.swapaxes, which pandas deprecated in 2.1.
# Chunk sizes and row order are the same as before.
row_groups = np.array_split(np.arange(len(pdf)), num_gpus)
chunks = [pdf.iloc[rows] for rows in row_groups]


In [0]:
from dicom_file_iterator import DICOMFileIterator
from torch.utils.data import IterableDataset, DataLoader
import pprint
from convert_htj2k_p3 import transcode_dicom_to_htj2k

# Directory names from the pandas frame, as a plain Python list.
folders = pdf['dir'].tolist()

# Dataset over the selected folders, wrapped in a DataLoader that yields
# batches of `data_batch_size` items.
dataset = DICOMFileIterator(
    root=input_path,
    target=output_path,
    folders=folders,
)
dataloader = DataLoader(dataset=dataset, batch_size=data_batch_size)

In [0]:
# Time the full transcode. perf_counter() is monotonic, so the measured
# duration cannot be skewed by wall-clock adjustments during a long run.
start = time.perf_counter()
transcode_dicom_to_htj2k(
    dataloader,
    root_dir = input_path,
    output_dir = output_path,
    max_batch_size=gpu_max_batch_size
)
duration = time.perf_counter() - start
print(f"{duration:.2f}s")

In [0]:
%sh ls /Volumes/hls_radiology/tcia/htj2k_compressed/

In [0]:
%sh
mkdir -p /Workspace/Users/douglas.moore@databricks.com/experiments

In [0]:
import mlflow
import glob
import pydicom

print(experiment_name)
# set_experiment() creates the experiment when it does not exist yet, makes it
# the active one, and returns the Experiment entity. `experiment` is therefore
# always an Experiment object; previously it could be a bare ID string, because
# create_experiment() returns only the ID.
experiment = mlflow.set_experiment(experiment_name)

In [0]:
%skip
run_name = f"compress_{compression}"
# Track one compression run in MLflow: environment details, parameters and the
# wall time of the transcode. The run name is passed through so the run is
# labelled with the compression setting.
with mlflow.start_run(run_name=run_name, log_system_metrics=True) as run:

    #init
    import logging
    import os
    import subprocess
    import sys

    # Initialization (per worker basis)

    logger = logging.getLogger()

    # Guarded so that re-running the cell does not stack duplicate entries.
    if CODE_FOLDER not in sys.path:
        sys.path.append(CODE_FOLDER)
    # Set before importing the transcoder, in case the codec library reads the
    # variable at import time -- TODO confirm.
    os.environ['PYNVIMGCODEC_VERBOSITY'] = '6'
    # NOTE(review): this cell uses convert_htj2k_p2 with `input_dir=`, while the
    # active cells use convert_htj2k_p3 with a dataloader -- confirm which
    # module this tracked run should call.
    from convert_htj2k_p2 import transcode_dicom_to_htj2k

    # Log artifacts (source DICOM, compressed DICOM, config)
    mlflow.log_artifact("config.yaml", artifact_path="config")

    # Check for a GPU. Calling the executable directly (no shell) means a
    # missing binary raises FileNotFoundError and a failing one raises
    # CalledProcessError; both are reported as "No GPU detected".
    try:
        gpu_info = subprocess.check_output(["nvidia-smi", "-L"]).decode().strip()
    except (subprocess.CalledProcessError, FileNotFoundError):
        gpu_info = "No GPU detected"
    mlflow.log_param("gpu_info", gpu_info)

    # Log the full requirements.txt file of the current environment. Running
    # pip through the kernel's interpreter makes the listing describe this
    # environment rather than whichever `pip` is first on PATH.
    with open("full_requirements.txt", "w") as f:
        subprocess.run([sys.executable, "-m", "pip", "freeze"], stdout=f)
    mlflow.log_artifact("full_requirements.txt")

    # Optionally log code snapshot
    #mlflow.log_artifact("notebook.py", artifact_path="source_code")

    # Log parameters
    mlflow.log_param("compression", compression)
    mlflow.log_param("input_path", input_path)
    mlflow.log_param("output_path", output_path)
    mlflow.log_param("encoder", "nvimgcodec")
#    mlflow.log_param("enc_params.jpeg2k_params.ht", enc_params.jpeg2k_params.ht)
#    mlflow.log_param("enc_params.jpeg2k_params.num_resolutions", enc_params.jpeg2k_params.num_resolutions)


    # run compression; perf_counter() is monotonic, so the measured duration
    # is not affected by wall-clock adjustments.
    start = time.perf_counter()
    transcode_dicom_to_htj2k(
        input_dir=input_path,
        output_dir=output_path
    )
    duration = time.perf_counter() - start
    print(f"{duration:.2f}s")
    mlflow.log_metric("duration_seconds", round(duration, 2))



In [0]:
%sh nvidia-smi