# Presidio test bed

## Installs

In [0]:
%pip install -U spacy[cuda113]

In [0]:
%sh python -m spacy download en_core_web_lg

In [0]:
import logging
import sys
# Root logger: send INFO-and-above records to stderr using a
# "timestamp [logger][LEVEL] message" layout.
logging.basicConfig(
    format='%(asctime)s [%(name)s][%(levelname)s] %(message)s',
    level=logging.INFO,
    stream=sys.stderr,
)

# The notebook's own logger is set to DEBUG, one step more verbose than the root.
logger = logging.getLogger('dbx.pixels')
logger.setLevel(logging.DEBUG)

In [0]:
# Emit one INFO record through the 'dbx.pixels' logger
# (presumably a quick check that the logging configuration above works).
logger.info("Hello")

## Test Setup

In [0]:
# Absolute /Volumes locations of the three DICOM (.dcm) files used as test inputs.
# NOTE(review): the first two live under pseudo_phi_dicom_data and the third under
# pseudo_phi_deid_dicom_data — presumably PHI vs. de-identified samples; confirm.
paths = [
    "/Volumes/hls_radiology/pseudo-phi-dicom-data/pseudo-phi-dicom-data/pseudo_phi_dicom_data/manifest-1744203894606/Pseudo-PHI-DICOM-Data/292821506/07-13-2013-NA-XR CHEST AP PORTABLE for Douglas Davidson-46198/1001.000000-NA-37718/1-1.dcm",

    "/Volumes/hls_radiology/pseudo-phi-dicom-data/pseudo-phi-dicom-data/pseudo_phi_dicom_data/manifest-1744203894606/Pseudo-PHI-DICOM-Data/292821506/07-13-2013-NA-XR CHEST AP PORTABLE for Douglas Davidson-46198/1002.000000-NA-53238/1-1.dcm",

    "/Volumes/hls_radiology/pseudo-phi-dicom-data/pseudo-phi-dicom-data/pseudo_phi_deid_dicom_data/manifest-1744157828937/Pseudo-PHI-DICOM-Data/Pseudo-PHI-001/06-26-2003-NA-XR CHEST AP PORTABLE-96544/1001.000000-NA-42825/1-1.dcm"
]

In [0]:
import os
import shutil

# Stage every input on local disk as /local_disk0/tmp/<index>.dcm so later
# cells can read it by a short path.
# shutil.copy does not create missing directories, so make sure the target exists.
os.makedirs("/local_disk0/tmp", exist_ok=True)
for i, path in enumerate(paths):
    shutil.copy(path, f"/local_disk0/tmp/{i}.dcm")


## Test EasyOCR

### Install EasyOCR

In [0]:
%pip install easyocr

In [0]:
%pip install pylibjpeg python-gdcm pydicom

In [0]:
%pip install  --extra-index-url=https://pypi.nvidia.com "cudf-cu12>=25.04" "cuml-cu12>=25.04" "dask-cuda>=25.04"

In [0]:
# Databricks utility call that restarts the Python process so the packages
# installed by the %pip cells above can be imported.
# NOTE(review): Python variables from earlier cells do not survive the restart,
# which is presumably why `paths` is declared again in the next cell.
dbutils.library.restartPython()

In [0]:
# Same three DICOM inputs as listed under "Test Setup"; declared again here,
# presumably because the restartPython() call above discards earlier variables.
paths = [
    "/Volumes/hls_radiology/pseudo-phi-dicom-data/pseudo-phi-dicom-data/pseudo_phi_dicom_data/manifest-1744203894606/Pseudo-PHI-DICOM-Data/292821506/07-13-2013-NA-XR CHEST AP PORTABLE for Douglas Davidson-46198/1001.000000-NA-37718/1-1.dcm",

    "/Volumes/hls_radiology/pseudo-phi-dicom-data/pseudo-phi-dicom-data/pseudo_phi_dicom_data/manifest-1744203894606/Pseudo-PHI-DICOM-Data/292821506/07-13-2013-NA-XR CHEST AP PORTABLE for Douglas Davidson-46198/1002.000000-NA-53238/1-1.dcm",

    "/Volumes/hls_radiology/pseudo-phi-dicom-data/pseudo-phi-dicom-data/pseudo_phi_deid_dicom_data/manifest-1744157828937/Pseudo-PHI-DICOM-Data/Pseudo-PHI-001/06-26-2003-NA-XR CHEST AP PORTABLE-96544/1001.000000-NA-42825/1-1.dcm"
]

# Download: stage every input on local disk as /local_disk0/tmp/<index>.dcm
import os
import shutil

# shutil.copy does not create missing directories, so make sure the target exists.
os.makedirs("/local_disk0/tmp", exist_ok=True)
for i, path in enumerate(paths):
    shutil.copy(path, f"/local_disk0/tmp/{i}.dcm")

### EasyOCR test code

In [0]:
# Locally staged sample used by the single-file experiments below. The copy loop
# writes paths[i] to /local_disk0/tmp/{i}.dcm, so this is the copy of paths[1].
dcm_path = "/local_disk0/tmp/1.dcm"

In [0]:
from pprint import pp

In [0]:
import pydicom
from PIL import Image
import numpy as np

# Load the staged DICOM file and take its pixel data as floats.
ds = pydicom.dcmread(dcm_path)
im = ds.pixel_array.astype(float)

# Scale to 0-255: negative values are clipped to 0 and the maximum maps to 255.
# A blank image (max of 0) would otherwise divide 0 by 0 and push NaNs into the
# uint8 cast, so it is rendered as all-black instead.
peak = im.max()
if peak > 0:
    rescaled_image = (np.maximum(im, 0) / peak) * 255  # float pixels
else:
    rescaled_image = np.zeros_like(im)
final_image = np.uint8(rescaled_image)  # integer pixels

# NOTE(review): assumes pixel_array has a shape Image.fromarray accepts
# (e.g. a single 2-D grayscale frame) — confirm for multi-frame files.
x = Image.fromarray(final_image)
# Save the image as PNG, then reopen it so it is displayed as the cell output.
x.save("/local_disk0/tmp/xray.png")
Image.open("/local_disk0/tmp/xray.png")

In [0]:
import cv2
import easyocr
import pydicom

# Initialize the EasyOCR Reader for English with GPU acceleration enabled.
# It is created once at module level; do_read() below calls reader.readtext()
# on this same instance for every file.
# NOTE(review): download_enabled=True presumably lets EasyOCR fetch missing model
# files — confirm the cluster has outbound network access.
reader = easyocr.Reader(['en'], gpu=True, download_enabled=True)

def _to_uint8(pixels):
    """Clip negatives to 0 and scale ``pixels`` linearly so its maximum maps to 255 (uint8)."""
    import numpy as np  # local import: this cell does not import numpy itself

    peak = pixels.max()
    if peak <= 0:
        # Blank (or all-negative) image: skip the division so 0/0 never
        # produces NaNs ahead of the integer cast.
        return np.zeros(pixels.shape, dtype=np.uint8)
    return np.uint8((np.maximum(pixels, 0) / peak) * 255)


def do_read(path:str):
    """Run OCR on the pixel data of one DICOM file and yield the detected text.

    The file is read with pydicom (not OpenCV), its pixel array is rescaled to
    8-bit, and the result is passed to the module-level EasyOCR ``reader``.

    This is a generator: nothing is read or recognised until it is iterated,
    e.g. ``list(do_read(path))``.

    Args:
        path: Location of the DICOM file to read.

    Yields:
        One dict per detection with keys:
          * ``"label"`` - the recognised text,
          * ``"conf"``  - the reader's confidence multiplied by 100, as a float,
          * ``"bbox"``  - list of ``{"x": int, "y": int}`` points, one per
            bounding-box corner returned by the reader.
    """
    ds = pydicom.dcmread(path)
    final_image = _to_uint8(ds.pixel_array.astype(float))

    results = reader.readtext(final_image)

    # Each result unpacks to (bounding box, text, confidence).
    for bbox, text, confidence in results:
        yield {
            "label": text,
            "conf": float(confidence*100.),
            "bbox": [{"x": int(b[0]), "y": int(b[1])} for b in bbox],
        }

In [0]:
# Run OCR over every input and pretty-print one {"path", "results"} record per file.
pp([
    {"path": src, "results": list(do_read(src))}
    for src in paths
])

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, ArrayType, LongType

# Spark schema for the OCR records, built inside-out:
#   path    -> string
#   results -> array<struct<label: string, conf: double,
#                           bbox: array<struct<x: int, y: int>>>>
bbox_point_type = StructType([
    StructField("x", IntegerType(), True),
    StructField("y", IntegerType(), True),
])
ocr_hit_type = StructType([
    StructField("label", StringType(), True),
    StructField("conf", DoubleType(), True),
    StructField("bbox", ArrayType(bbox_point_type), True),
])
schema = StructType([
    StructField("path", StringType(), True),
    StructField("results", ArrayType(ocr_hit_type), True),
])

# One row per input file; the do_read() generator is materialized into a list.
ocr_rows = [{"path": src, "results": list(do_read(src))} for src in paths]
df = spark.createDataFrame(ocr_rows, schema=schema)

print(df.schema)
display(df)

### Performance - EasyOCR
Measured: 2min 3s ± 235 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
for 100 × 3 = 300 images.

123 s / 300 images ≈ 0.41 seconds per image

In [0]:
import matplotlib.pyplot as plt

# `path2` is not defined anywhere before this cell, so a fresh run raised
# NameError. Bind it to the staged sample the earlier cells use.
# NOTE(review): confirm this is the file this cell was meant to show.
path2 = dcm_path

# Display the image using matplotlib
ds = pydicom.dcmread(path2)
plt.imshow(ds.pixel_array, cmap='gray')
# Overwrites the xray.png preview written by the earlier PIL cell.
plt.savefig("/local_disk0/tmp/xray.png")
plt.title('DICOM Image')
plt.axis('on')
plt.show()

# do_read() is a generator: calling it alone only creates the generator object.
# Materialize it so the OCR actually runs and its results are the cell output.
list(do_read(path2))