In [0]:
%load_ext autoreload
%autoreload 2
# The two magics above enable autoreload. Learn more at https://docs.databricks.com/en/files/workspace-modules.html#autoreload-for-python-modules
# To disable autoreload, run: %autoreload 0

# Presidio test bed

## Installs

In [0]:
%pip install --quiet presidio_analyzer presidio_anonymizer presidio_image_redactor pydicom

In [0]:
# Restart the Python process after the %pip install in the previous cell so the
# newly installed packages are picked up. Python state defined before this
# point is cleared by the restart.
dbutils.library.restartPython()

In [0]:
%sh python -m spacy download en_core_web_lg

In [0]:
import logging
import sys
# Root logger: WARNING and above, written to stderr with timestamp, logger name and level.
logging.basicConfig(
    format='%(asctime)s [%(name)s][%(levelname)s] %(message)s',
    level=logging.WARNING,
    stream=sys.stderr,
)

# The 'dbx.pixels' logger is set to DEBUG so its messages are not held to the root WARNING threshold.
logger = logging.getLogger('dbx.pixels')
logger.setLevel(logging.DEBUG)

## Test Setup

In [0]:
import os
import shutil

# Sample DICOM files used for testing. Judging by the directory names, the first
# two come from the PHI-bearing dataset and the third from the de-identified one.
paths = [
    "/Volumes/hls_radiology/pseudo-phi-dicom-data/pseudo-phi-dicom-data/pseudo_phi_dicom_data/manifest-1744203894606/Pseudo-PHI-DICOM-Data/292821506/07-13-2013-NA-XR CHEST AP PORTABLE for Douglas Davidson-46198/1001.000000-NA-37718/1-1.dcm",

    "/Volumes/hls_radiology/pseudo-phi-dicom-data/pseudo-phi-dicom-data/pseudo_phi_dicom_data/manifest-1744203894606/Pseudo-PHI-DICOM-Data/292821506/07-13-2013-NA-XR CHEST AP PORTABLE for Douglas Davidson-46198/1002.000000-NA-53238/1-1.dcm",

    "/Volumes/hls_radiology/pseudo-phi-dicom-data/pseudo-phi-dicom-data/pseudo_phi_deid_dicom_data/manifest-1744157828937/Pseudo-PHI-DICOM-Data/Pseudo-PHI-001/06-26-2003-NA-XR CHEST AP PORTABLE-96544/1001.000000-NA-42825/1-1.dcm"
]

# Copy each sample to local disk as 0.dcm, 1.dcm, ... (index = position in `paths`).
local_dir = "/local_disk0/tmp"
# shutil.copy does not create missing parent directories, so make sure the target exists.
os.makedirs(local_dir, exist_ok=True)
for i, path in enumerate(paths):
    shutil.copy(path, f"{local_dir}/{i}.dcm")

## Test Presidio engine
https://microsoft.github.io/presidio/

In [0]:
# Root directory that is scanned recursively for *.dcm files.
source_path = (
    "/Volumes/hls_radiology/pseudo-phi-dicom-data/pseudo-phi-dicom-data"
    "/pseudo_phi_dicom_data/manifest-1744203894606/Pseudo-PHI-DICOM-Data"
)

### Presidio as a Spark UDF

In [0]:
%sql 
-- truncate table douglas_moore.pixels.pixels_deid_presidio;

In [0]:
from dbx.pixels.dicom import PresidioTransformer

In [0]:
from pyspark.sql.functions import current_timestamp

# Maximum number of DICOM files processed in this run
limit = 100

# List the DICOM files under source_path (recursively, *.dcm only).
# The `content` column is dropped so only the per-file listing columns remain.
df = (spark.read.format("binaryFile")
  .option("pathGlobFilter", "*.dcm")
  .option("recursiveFileLookup", "true")
  .load(source_path)
  .drop("content")
  .limit(limit)
)

# Apply Transformer
# NOTE(review): presumably reads each file via its path and adds the PII analysis
# columns queried in the cells below (e.g. pii_analysis) — confirm against PresidioTransformer.
result_df = PresidioTransformer().transform(df)

# Append the results, stamped with the insert time, to the evaluation table.
# mergeSchema lets the append add columns that the existing table lacks.
# saveAsTable() returns None, so there is nothing useful to bind or display here;
# use the SQL cells below to inspect what was written.
(result_df
    .withColumn('insert_dt', current_timestamp())
    .write.mode('append')
    .option("mergeSchema", "true")
    .saveAsTable('douglas_moore.pixels.pixels_deid_presidio'))

### Evaluate transformation

In [0]:
%sql select insert_dt, count(1) 
-- Row count per insert_dt value, i.e. how many rows each write to the table added.
from douglas_moore.pixels.pixels_deid_presidio
group by all

In [0]:
%sql
-- Show the ten rows with the most recent insert_dt.
select * from douglas_moore.pixels.pixels_deid_presidio
order by insert_dt desc
limit 10

In [0]:
%sql
-- Total analyzer_score per file: explode pii_analysis into one row per element,
-- then sum analyzer_score over the non-null entries.
-- The table is written in append mode, so the totals span every run stored in it.
-- NOTE(review): files are keyed by the last 30 characters of path; distinct files
-- whose paths share that suffix would be merged into one group — confirm this is acceptable.
with pii(
  select right(path,30)path , explode(pii_analysis) analysis
  from douglas_moore.pixels.pixels_deid_presidio
)
select path, sum(analysis.analyzer_score) total_score
from pii
where analysis.analyzer_score is not null
group by all

## Presidio timing

| files - frames   | run time | worker-core-seconds / frame | config |
| -------- | -------- | --------------- | ----|
|1,693  | 8m 13s  | 4.659 | Single GPU A10 cluster |
|1,693  | 6m 13s  | 3.525 | Single GPU A10 cluster |
| 1,693 | 5m 42s | 12.929 | 4 x `c6gd.4xlarge` CPU cluster (config 3) |
| 1,693 | 4m 56s | 6.464 | 2 x `c6gd.4xlarge` CPU cluster (config 3) |

Config 3: 
```
{
    "cluster_name": "DM - Pixels",
    "spark_version": "16.4.x-scala2.13",
    "aws_attributes": {
        "first_on_demand": 1,
        "zone_id": "auto",
        "spot_bid_price_percent": 100
    },
    "node_type_id": "c6gd.4xlarge",
    "custom_tags": {
        "solacc": "pixels",
        "owner": "douglas.moore@databricks.com",
        "removeAfter": "20250531"
    },
    "autotermination_minutes": 60,
    "enable_elastic_disk": true,
    "single_user_name": "douglas.moore@databricks.com",
    "enable_local_disk_encryption": false,
    "data_security_mode": "DATA_SECURITY_MODE_DEDICATED",
    "runtime_engine": "STANDARD",
    "kind": "CLASSIC_PREVIEW",
    "use_ml_runtime": true,
    "is_single_node": false,
    "num_workers": 4,
    "apply_policy_default_values": false
}
```