In [0]:
%pip install pydicom presidio_analyzer presidio_anonymizer presidio-structured "numpy<2"
# NOTE(review): only numpy carries a version constraint. The captured install log shows
# pydicom 3.0.1, presidio_analyzer 2.2.360 and presidio_anonymizer 2.2.360 being resolved;
# consider pinning those versions so re-runs stay reproducible.
# Restarts the Python process (Databricks utility), presumably so the freshly installed
# packages are picked up by the cells that follow.
dbutils.library.restartPython()

Collecting pydicom
  Obtaining dependency information for pydicom from https://files.pythonhosted.org/packages/27/a6/98651e752a49f341aa99aa3f6c8ba361728dfc064242884355419df63669/pydicom-3.0.1-py3-none-any.whl.metadata
  Downloading pydicom-3.0.1-py3-none-any.whl.metadata (9.4 kB)
Collecting presidio_analyzer
  Obtaining dependency information for presidio_analyzer from https://files.pythonhosted.org/packages/04/dd/df0d00ff44977868878f391b08feb65b8269904262d0f1387de827c13c7d/presidio_analyzer-2.2.360-py3-none-any.whl.metadata
  Downloading presidio_analyzer-2.2.360-py3-none-any.whl.metadata (3.4 kB)
Collecting presidio_anonymizer
  Obtaining dependency information for presidio_anonymizer from https://files.pythonhosted.org/packages/f0/79/d8438168402057092a36ba8f7aee099f107a7960a38eee505f1948f9e0c9/presidio_anonymizer-2.2.360-py3-none-any.whl.metadata
  Downloading presidio_anonymizer-2.2.360-py3-none-any.whl.metadata (8.9 kB)
Collecting presidio-structured
  Obtaining dependency informa

## Load benchmark dataset

In [0]:
# Build the benchmark frame. The RIGHT JOIN keeps every row of
# hls_radiology.tcia.phi_detection_golden (alias b) and attaches the matching rows of
# hls_radiology.tcia.masked_dicom_diffing_results, matched on both the original and the
# masked DICOM path.
dicom_df = spark.sql("""SELECT path, path_masked, PatientID,
b.metadata, b.label, b.non_phi_metadata,
meta, masked_json, diffing_tag_list, diffing_errors 
from hls_radiology.tcia.masked_dicom_diffing_results
RIGHT join hls_radiology.tcia.phi_detection_golden b
ON path = original_dicom_path
AND path_masked = masked_dicom_path""")
# Preview only the first five rows rather than the whole frame.
display(dicom_df.limit(5))

In [0]:
# Number of rows in the joined benchmark frame (3581 in the recorded run).
dicom_df.count()

3581

### Applying Presidio to the whole JSON string caused massive overredaction
Instead, use `presidio_structured` for field-level control over the JSON, combined with rules from the [BDC Baseline Protocol](https://bdcatalyst.gitbook.io/biodata-catalyst-documentation/community/request-for-comments/nhlbi-biodata-catalyst-ecosystem-security-statement-1) for DICOM metadata de-identification.

In [0]:
from pyspark.sql.functions import rand

# Pick one benchmark row to exercise the pipeline on a single record.
# A fixed seed is passed to rand() so that re-runs over the same data select the same row
# instead of a fresh random one each time; this keeps the single-record check further down
# repeatable. NOTE(review): determinism still assumes the input partitioning is stable.
first_row = dicom_df.orderBy(rand(seed=42)).limit(1).collect()[0]
# Raw DICOM metadata JSON string and its golden label for the sampled row.
meta = first_row['metadata']
label = first_row['label']

In [0]:
from presidio_analyzer import AnalyzerEngine

# Build a Presidio analyzer with its default configuration, which loads the NLP module
# (a spaCy model by default) together with the other PII recognizers.
analyzer = AnalyzerEngine()

# Run the analyzer over the whole metadata JSON string as English text, without
# restricting the set of entities to look for.
results = analyzer.analyze(text=meta, language="en")
results

### Add DICOM de-identification rules based on [BDC Baseline Protocol](https://bdcatalyst.gitbook.io/biodata-catalyst-documentation/community/request-for-comments/nhlbi-biodata-catalyst-ecosystem-security-statement-1)

In [0]:
# Classify tags based on DICOM DEID Part 15
# https://bdcatalyst.gitbook.io/biodata-catalyst-documentation/community/request-for-comments/nhlbi-biodata-catalyst-ecosystem-security-statement-1
# https://docs.google.com/spreadsheets/d/1unIqzIks0XlH9ECwVOlRyTDoTHB7PHsn/edit?gid=1385559595#gid=1385559595
# Contains PHI and should be fully redacted unless free text
phi_tags = [
    "00001000",
    "00001001",
    "00041511",
    "00080014",
    "00080017",
    "00080019",
    "00080050",
    "00080054",
    "00080055",
    "00080058",
    "00080080",
    "00080081",
    "00080082",
    "00080090",
    "00080092",
    "00080094",
    "00080096",
    "0008009C",
    "0008009D",
    "00081000",
    "00081010",
    "00081040",
    "00081041",
    "00081048",
    "00081049",
    "00081050",
    "00081052",
    "00081060",
    "00081062",
    "00081070",
    "00081072",
    "00081080",
    "00081084",
    "00081088",
    "00081110",
    "00081111",
    "00081120",
    "00081140",
    "00081155",
    "00081195",
    "00082112",
    "00083010",
    "00084000",
    "00100010",
    "00100020",
    "00100021",
    "00100032",
    "00100050",
    "00101000",
    "00101001",
    "00101002",
    "00101005",
    "00101040",
    "00101050",
    "00101060",
    "00101080",
    "00101081",
    "00101090",
    "00101100",
    "00102150",
    "00102152",
    "00102154",
    "00102155",
    "00102180",
    "001021B0",
    "001021F0",
    "00102297",
    "00102299",
    "00104000",
    "00120040",
    "00120042",
    "00120050",
    "00120071",
    "0016002B",
    "0016004D",
    "00160072",
    "00160074",
    "00160084",
    "00160086",
    "0016008C",
    "00181000",
    "00181002",
    "00181004",
    "0018100B",
    "00181400",
    "00182042",
    "00184000",
    "00185011",
    "00189424",
    "00200010",
    "00200052",
    "00200200",
    "00204000",
    "00209158",
    "00209161",
    "00209164",
    "00281199",
    "00281214",
    "00284000",
    "00324000",
    "00340001",
    "00340002",
    "00340005",
    "00380004",
    "00380010",
    "00380060",
    "00380300",
    "00380400",
    "00380500",
    "00384000",
    "003A0310",
    "00400006",
    "0040000B",
    "00400010",
    "00400241",
    "00400242",
    "00400280",
    "00400310",
    "00400512",
    "00400551",
    "00400554",
    "00401002",
    "00401004",
    "00401005",
    "00401010",
    "00401011",
    "00401101",
    "00401102",
    "00401103",
    "00401104",
    "00401400",
    "00402008",
    "00402010",
    "00402017",
    "00402400",
    "00404023",
    "00404036",
    "00404037",
    "0040A075",
    "0040A07A",
    "0040A123",
    "0040A124",
    "0040A171",
    "0040A172",
    "0040A307",
    "0040A353",
    "0040A354",
    "0040A402",
    "0040DB0C",
    "0040DB0D",
    "00500020",
    "00620021",
    "00640003",
    "006A0003",
    "00700084",
    "0070031A",
    "00701101",
    "00701102",
    "00880140",
    "04000100",
    "04000115",
    "04000402",
    "04000403",
    "04000404",
    "20300020",
    "22000002",
    "22000005",
    "30060024",
    "300600C2",
    "300A0013",
    "300A0072",
    "300A0083",
    "300A02EB",
    "300A0609",
    "300A0650",
    "300A0700",
    "300A0785",
    "300E0008",
    "30100006",
    "3010000B",
    "30100013",
    "30100015",
    "30100031",
    "30100035",
    "30100036",
    "30100037",
    "30100038",
    "3010003B",
    "3010006E",
    "3010006F",
    "40004000",
    "40080040",
    "4008010B",
    "4008010C",
    "40080119",
    "40080200",
    "40084000",
    "60xx4000",
    "FFFAFFFA",
    "ggggeeee",
]
free_text_tags = [
    "00084000",
    "00104000",
    "0016002B",
    "00181400",
    "00184000",
    "00189424",
    "00204000",
    "00209158",
    "00284000",
    "00324000",
    "00384000",
    "00400280",
    "00400310",
    "00401400",
    "00402400",
    "20300020",
    "22000002",
    "300A0072",
    "300A02EB",
    "30100035",
    "30100036",
    "30100037",
    "30100038",
    "40004000",
    "4008010B",
    "40084000",
    "60xx4000",
]

# Non-free-text PHI tags that must be fully removed.
# sorted() returns a list with a stable order; iterating a set of strings directly can
# yield a different order in every Python process, which makes the list non-reproducible.
remove_tags = sorted(set(phi_tags) - set(free_text_tags))

# Always kept in full: these tags are left untouched and are never sent to Presidio
keep_tags = [
    "00020003",
    "00080013",
    "00080018",
    "00080030",
    "00080031",
    "00080032",
    "00080033",
    "00080034",
    "00080035",
    "00080106",
    "00080107",
    "00080201",
    "00081030",
    "0008103E",
    "00082111",
    "00100040",
    "00100101",
    "00100102",
    "00101010",
    "00101020",
    "00101030",
    "00102000",
    "00102110",
    "00102160",
    "001021A0",
    "001021C0",
    "00102203",
    "00120010",
    "00120020",
    "00120021",
    "00120051",
    "00120060",
    "00120072",
    "00120081",
    "00120082",
    "0014407C",
    "0016004B",
    "0016004E",
    "0016004F",
    "00160050",
    "00160051",
    "00160070",
    "00160071",
    "00160073",
    "00160075",
    "00160076",
    "00160078",
    "00160079",
    "0016007A",
    "0016007B",
    "0016007C",
    "0016007D",
    "0016007E",
    "0016007F",
    "00160080",
    "00160081",
    "00160082",
    "00160083",
    "00160085",
    "00160087",
    "00160088",
    "00160089",
    "0016008A",
    "0016008B",
    "0016008E",
    "00180010",
    "00180027",
    "00180035",
    "00181005",
    "00181007",
    "00181008",
    "00181009",
    "0018100A",
    "00181014",
    "00181030",
    "00181042",
    "00181043",
    "00181072",
    "00181073",
    "001811BB",
    "00181200",
    "00181201",
    "00181202",
    "00181204",
    "00181205",
    "0018700A",
    "0018700C",
    "0018700E",
    "00189185",
    "00189367",
    "00189371",
    "00189373",
    "0018937B",
    "0018937F",
    "00189701",
    "00189937",
    "0018A003",
    "0020000D",
    "0020000E",
    "00200027",
    "00203401",
    "00203405",
    "00203406",
    "00320012",
    "00320033",
    "00320035",
    "00321011",
    "00321020",
    "00321030",
    "00321033",
    "00321041",
    "00321051",
    "00321060",
    "00321066",
    "00321067",
    "00321070",
    "0038001B",
    "0038001D",
    "00380021",
    "00380032",
    "00380040",
    "00380050",
    "00380062",
    "003A0329",
    "003A032B",
    "00400001",
    "00400003",
    "00400005",
    "00400007",
    "00400009",
    "00400011",
    "00400012",
    "00400245",
    "00400251",
    "00400253",
    "00400254",
    "00400275",
    "0040050A",
    "0040051A",
    "00400555",
    "00400600",
    "00400602",
    "00400610",
    "004006FA",
    "00401001",
    "0040100A",
    "00402001",
    "00403001",
    "00404025",
    "00404027",
    "00404028",
    "00404030",
    "0040A024",
    "0040A027",
    "0040A07C",
    "0040A112",
    "0040A122",
    "0040A193",
    "0040A352",
    "0040A358",
    "0040A730",
    "0040DB06",
    "0040DB07",
    "0040E004",
    "00420011",
    "0050001B",
    "00500021",
    "006A0005",
    "006A0006",
    "00700001",
    "00700083",
    "0072005F",
    "00720061",
    "00720063",
    "00720065",
    "00720066",
    "00720068",
    "0072006A",
    "0072006B",
    "0072006C",
    "0072006D",
    "0072006E",
    "00720070",
    "00720071",
    "00741234",
    "00880200",
    "00880904",
    "00880906",
    "00880910",
    "00880912",
    "04000550",
    "04000551",
    "04000552",
    "04000561",
    "04000563",
    "04000564",
    "04000565",
    "04000600",
    "21000050",
    "21000070",
    "21000140",
    "30020121",
    "30020123",
    "30060002",
    "30060004",
    "30060006",
    "30060009",
    "30060026",
    "30060028",
    "30060038",
    "30060085",
    "30060088",
    "30080025",
    "30080105",
    "30080164",
    "30080168",
    "30080251",
    "300A0002",
    "300A0003",
    "300A0004",
    "300A0007",
    "300A000B",
    "300A000E",
    "300A0016",
    "300A00B2",
    "300A00C3",
    "300A00DD",
    "300A0196",
    "300A01A6",
    "300A01B2",
    "300A0216",
    "300A022E",
    "300A0608",
    "300A0611",
    "300A0615",
    "300A0619",
    "300A0623",
    "300A062A",
    "300A0676",
    "300A067C",
    "300A067D",
    "300A0734",
    "300A0742",
    "300A0783",
    "300A078E",
    "300A0792",
    "300A0794",
    "300A079A",
    "300C0113",
    "3010000F",
    "30100017",
    "3010001B",
    "3010002D",
    "30100033",
    "30100034",
    "30100043",
    "30100054",
    "30100056",
    "3010005A",
    "3010005C",
    "30100061",
    "30100077",
    "3010007A",
    "3010007B",
    "3010007F",
    "30100081",
    "30100085",
    "40000010",
    "40080101",
    "40080109",
    "40080113",
    "40080115",
    "40080118",
    "4008011A",
    "40080300",
    "50xxxxxx",
    "60xx3000",
    "FFFCFFFC",
    "file_size",
    "has_pixel",
    "hash",
    "img_avg",
    "img_max",
    "img_min",
    "img_shape_x",
    "img_shape_y",
    "pixel_hash",
]
keep_zip3 = [
    "00120030",
    "00120031",
    "00321000",
    "00321001",
    "00321021",
    "0038001E",
    "00400243",
]
keep_year = [
    "00080012",
    "00080015",
    "00080020",
    "00080021",
    "00080022",
    "00080023",
    "00080024",
    "00080025",
    "0008002A",
    "00100030",
    "001021D0",
    "00120086",
    "00120087",
    "0014407E",
    "00160077",
    "0016008D",
    "00181012",
    "00181078",
    "00181079",
    "00181203",
    "00189074",
    "00189151",
    "00189369",
    "0018936A",
    "00189516",
    "00189517",
    "00189623",
    "00189804",
    "00189919",
    "0018A002",
    "00203403",
    "00320032",
    "00320034",
    "00321010",
    "00321040",
    "00321050",
    "00340007",
    "0038001A",
    "0038001C",
    "00380020",
    "00380030",
    "003A0314",
    "00400002",
    "00400004",
    "00400244",
    "00400250",
    "00402004",
    "00404005",
    "00404008",
    "00404010",
    "00404011",
    "00404050",
    "00404051",
    "00404052",
    "0040A023",
    "0040A030",
    "0040A032",
    "0040A033",
    "0040A082",
    "0040A110",
    "0040A120",
    "0040A121",
    "0040A13A",
    "0040A192",
    "00440004",
    "0044000B",
    "00440010",
    "00440104",
    "00440105",
    "00686226",
    "00686270",
    "00700082",
    "0072000A",
    "0072005E",
    "01000420",
    "04000105",
    "04000310",
    "04000562",
    "21000040",
    "30060008",
    "30080024",
    "30080054",
    "30080056",
    "30080162",
    "30080166",
    "30080250",
    "300A0006",
    "300A022C",
    "300A0736",
    "300A073A",
    "300A0741",
    "300A0760",
    "300C0127",
    "300E0004",
    "3010004C",
    "3010004D",
    "40080100",
    "40080108",
    "40080112",
]
# non-PII but provider info that can be removed
maybe_remove = [
    "00321032",
    "00380011",
    "00380014",
    "00380061",
    "00380064",
    "00400513",
    "00400562",
    "00402005",
    "00402009",
    "00402011",
    "00402016",
    "00404034",
    "00404035",
    "0040A073",
    "0040A078",
    "0040A088",
    "00700086",
    "00741236",
    "300600A6",
    "300E0005",
    "40080042",
    "40080102",
    "4008010A",
    "40080111",
    "40080114",
    "40080202",
]

In [0]:
from dataclasses import dataclass, field
from typing import List

@dataclass
class DicomTagClass:
    """Groups DICOM tag identifiers by the de-identification handling they receive.

    Every attribute is a list of tag strings and defaults to its own fresh empty list.
    The field order below is also the positional-argument order of the constructor.
    """

    # Tags whose values are blanked by apply_dicom_rules.
    remove_tags: List[str] = field(default_factory=list)
    # Additional tags that apply_dicom_rules blanks in the same way as remove_tags.
    maybe_remove: List[str] = field(default_factory=list)
    # Free-text tags; apply_dicom_rules has no dedicated branch for them, so they are
    # handed to Presidio.
    free_text_tags: List[str] = field(default_factory=list)
    # Tags that are left unchanged and not handed to Presidio.
    keep_tags: List[str] = field(default_factory=list)
    # Date-like tags that are reduced to their year.
    keep_year: List[str] = field(default_factory=list)
    # Tags intended for 3-digit ZIP handling; not yet used by apply_dicom_rules (see TODO there).
    keep_zip3: List[str] = field(default_factory=list)

# Bundle the tag lists defined above into one object; keyword arguments make the
# list-to-field mapping explicit instead of relying on positional order.
tag_classes = DicomTagClass(
    remove_tags=remove_tags,
    maybe_remove=maybe_remove,
    free_text_tags=free_text_tags,
    keep_tags=keep_tags,
    keep_year=keep_year,
    keep_zip3=keep_zip3,
)

In [0]:
# Helper functions
from presidio_structured import (
    StructuredEngine,
    StructuredAnalysis,
    JsonDataProcessor,
    JsonAnalysisBuilder,
)

def keep_only_year(date_string: str) -> str:
    """Reduce a DICOM date or date-time value to its four-digit year.

    DICOM DA values are ``YYYYMMDD`` and DT values start with the same eight
    characters (``YYYYMMDDHHMMSS.FFFFFF&ZZXX``), so only the first eight characters
    of the stripped input are parsed.
    https://dicom.nema.org/dicom/2013/output/chtml/part05/sect_6.2.html

    Args:
        date_string: The raw tag value. Anything that is not a ``str`` (``None``,
            numbers, lists, ...) is treated as unparseable rather than raising.

    Returns:
        The year as a string, or the placeholder ``"cannot parse year"`` when the
        value cannot be parsed as a date.
    """
    from datetime import datetime

    placeholder = "cannot parse year"

    # strptime raises TypeError (not ValueError) for non-string input; guard explicitly
    # so one unexpected value cannot abort the whole de-identification run.
    if not isinstance(date_string, str):
        return placeholder

    # Surrounding padding is dropped and any time/offset suffix after the date is ignored.
    date_part = date_string.strip()[:8]
    try:
        return str(datetime.strptime(date_part, "%Y%m%d").year)
    except ValueError:
        # The value is not a valid date: return the placeholder, never the original
        # string, so the full date cannot leak through.
        return placeholder


# Flatten the original json and drop the unused vr tags
def flatten_json(json_str: str) -> dict:
    """Flatten a DICOM-JSON style document into a single-level dict.

    For every top-level key the ``vr`` wrapper is dropped and only the content of
    ``"Value"`` is kept:

    * dict entries without a ``"Value"`` (or with ``"Value": null``) are skipped;
    * a one-element ``"Value"`` list is unwrapped; if the element is a dict with a
      truthy ``"Alphabetic"`` entry, that string is used;
    * a one-element list holding a nested dict is expanded up to two levels into
      ``"<key>_<child>"`` / ``"<key>_<child>_<grandchild>"`` keys; if that expansion
      fails, the nested dict is stored as-is under ``<key>`` (keys written before the
      failure are kept);
    * a list with several dict elements is expanded into ``"<key>_<index>_<child>"``
      keys; non-dict elements of such a list are not copied to the output;
    * any other ``"Value"`` is stored unchanged;
    * top-level entries that are not dicts are copied unless they are ``None``.

    Args:
        json_str: JSON text whose top level is an object.

    Returns:
        The flattened dict described above.
    """
    import json

    json_dict = json.loads(json_str)

    dicom_dict = {}
    for k, v in json_dict.items():
        if isinstance(v, dict):
            value = v.get("Value")
            if value is None:
                continue
            elif isinstance(value, list) and len(value) == 1:
                first = value[0]
                if not isinstance(first, dict):
                    dicom_dict[k] = first
                elif first.get("Alphabetic"):
                    dicom_dict[k] = first.get("Alphabetic")
                else:
                    try:
                        for k1, v1 in first.items():
                            if v1.get("Value"):
                                if isinstance(v1["Value"][0], dict):
                                    for k2, v2 in v1["Value"][0].items():
                                        dicom_dict[f"{k}_{k1}_{k2}"] = v2.get("Value")[0]
                                else:
                                    dicom_dict[f"{k}_{k1}"] = v1.get("Value")[0]
                    except Exception:
                        # Unexpected nested shape: keep it nested. `Exception` rather
                        # than a bare except so KeyboardInterrupt/SystemExit still propagate.
                        dicom_dict[k] = first
            elif isinstance(value, list) and len(value) > 1:
                for i, l in enumerate(value):
                    if isinstance(l, dict):
                        for kl, vl in l.items():
                            if not isinstance(vl, dict):
                                # Items such as {"Alphabetic": "..."} hold plain values;
                                # calling .get() on them would raise AttributeError, so
                                # store truthy ones directly.
                                if vl:
                                    dicom_dict[f"{k}_{i}_{kl}"] = vl
                                continue
                            if vl.get("Value"):
                                if isinstance(vl["Value"], list) and len(vl["Value"]) == 1:
                                    dicom_dict[f"{k}_{i}_{kl}"] = vl.get("Value")[0]
                                else:
                                    dicom_dict[f"{k}_{i}_{kl}"] = vl.get("Value")
            else:
                dicom_dict[k] = v["Value"]
        elif v is None:
            pass
        else:
            dicom_dict[k] = v
    return dicom_dict


def apply_dicom_rules(flattened_json: dict) -> dict:
    """Apply the rule-based tag handling and split off what Presidio still has to see.

    Each key of ``flattened_json`` is looked up (exact match) in the module-level
    ``tag_classes``:

    * ``remove_tags`` / ``maybe_remove``: value replaced by ``""``;
    * ``keep_tags``: value left unchanged and not forwarded to Presidio;
    * ``keep_year``: value replaced by ``keep_only_year(value)``;
    * everything else (including free-text tags and, until the TODO below is done,
      ``keep_zip3`` tags): value left unchanged and forwarded to Presidio.

    NOTE(review): because matching is exact, wildcard-style entries in the tag lists
    (e.g. "60xx4000", "50xxxxxx") and the compound keys produced by ``flatten_json``
    for nested items (e.g. "<tag>_<child>") never match a rule and end up in the
    Presidio bucket -- confirm that this is intended.

    Args:
        flattened_json: Flat mapping of tag/key to value; it is not modified.

    Returns:
        ``{"rules_applied": <copy with rules applied>,
        "for_presidio_processing": <entries no rule matched>}``.
    """
    # Build the lookup sets once per call instead of concatenating and scanning the
    # tag lists for every key.
    blank_tags = set(tag_classes.remove_tags) | set(tag_classes.maybe_remove)
    untouched_tags = set(tag_classes.keep_tags)
    year_tags = set(tag_classes.keep_year)

    data_copy = flattened_json.copy()
    data_remaining = {}

    for k, v in flattened_json.items():
        if k in blank_tags:
            # REDACT tags in remove_tags and maybe_remove
            data_copy[k] = ""
        elif k in untouched_tags:
            # Keep tags stay as they are and are ignored by Presidio
            continue
        elif k in year_tags:
            data_copy[k] = keep_only_year(v)
        # TODO: keep_zip3
        # elif k in tag_classes.keep_zip3:
        #     data_copy[k] = keep_zip3(v)
        else:  # includes free_text_tags
            data_remaining[k] = v
    return {"rules_applied": data_copy, "for_presidio_processing": data_remaining}


def merge_results_rules_presidio(
    dict_rules_applied: dict, anonymized_dict: dict
) -> dict:
    """Overlay Presidio's anonymized values onto the rule-processed dict.

    Args:
        dict_rules_applied: Output of the rule step; it is copied, not modified.
        anonymized_dict: Presidio output; only keys that also exist in
            ``dict_rules_applied`` are used.

    Returns:
        A copy of ``dict_rules_applied`` in which every key that is also present in
        ``anonymized_dict`` carries the anonymized value. Keys that exist only in
        ``anonymized_dict`` are not added.
    """
    merged = dict_rules_applied.copy()
    overrides = {key: anonymized_dict[key] for key in merged if key in anonymized_dict}
    merged.update(overrides)
    return merged


# Compare original JSON with deid JSON
# Returns False if correctly (over)redacted
def diff_dicts(
    dict1: dict, dict2: dict, overredact: bool = True, verbose: bool = True
) -> bool:
    """Report whether two flat dicts differ.

    Args:
        dict1: First dict (the de-identified result in this notebook).
        dict2: Second dict to compare against.
        overredact: When True, a shared key whose ``dict1`` value is ``""`` or
            ``"<None>"`` is not counted as a difference; when False, every differing
            value counts.
        verbose: When True, pretty-print the computed diff (keys only in ``dict1``,
            keys only in ``dict2``, and differing values as ``(dict1, dict2)`` pairs).

    Returns:
        False when no key is exclusive to either dict and no counted value
        difference exists; True otherwise.
    """
    from pprint import pprint

    keys1 = set(dict1)
    keys2 = set(dict2)
    diff = {
        "only_in_dict1": keys1 - keys2,
        "only_in_dict2": keys2 - keys1,
        "different_values": {
            k: (dict1[k], dict2[k]) for k in keys1 & keys2 if dict1[k] != dict2[k]
        },
    }
    # Honour the verbose flag; printing unconditionally floods the logs when this runs
    # once per row.
    if verbose:
        pprint(diff)

    if overredact:
        # Replacements by "" or "<None>" are tolerated; any other changed value counts.
        counted = [
            i for i, j in diff["different_values"].values() if i not in ["<None>", ""]
        ]
    else:
        counted = diff["different_values"]

    return bool(diff["only_in_dict1"] or diff["only_in_dict2"] or counted)

In [0]:
from pyspark.sql.functions import pandas_udf
import pandas as pd

class DicomTagsPresidio(JsonAnalysisBuilder):
    """De-identify one DICOM metadata JSON with rules plus Presidio and diff the result.

    Extends ``JsonAnalysisBuilder`` (used for ``generate_analysis``) and owns a
    ``StructuredEngine`` that performs the anonymization.
    """

    def __init__(
        self, data_processor: JsonDataProcessor, overredact=True, verbose=False, **kwargs,
    ):
        """Create the builder and its structured engine.

        Args:
            data_processor: Processor handed to ``StructuredEngine``.
            overredact: Forwarded to ``diff_dicts``.
            verbose: Forwarded to ``diff_dicts``; also enables reporting of failures
                in ``deid_and_compare``.
            **kwargs: Passed unchanged to ``JsonAnalysisBuilder.__init__``.
        """
        super().__init__(**kwargs)
        self.overredact = overredact
        self.verbose = verbose
        self.structured_engine = StructuredEngine(data_processor)

    # TODO: add gt_json but this will require dicom_df be of another structure
    def deid_and_compare(self, ori_json: str):
        """De-identify ``ori_json`` and compare the result with the flattened original.

        Args:
            ori_json: DICOM metadata as a JSON string.

        Returns:
            The boolean result of ``diff_dicts(result, original)``, or ``None`` when the
            Presidio analysis, anonymization, merge or diff raises. Errors raised by
            ``flatten_json`` or ``apply_dicom_rules`` are not caught and propagate.
        """
        # Unnest JSON to a format that Presidio expects
        ori_dict = flatten_json(ori_json)
        #        gt_dict = flatten_json(gt_json)

        # Apply the DICOM BPC rules
        processed = apply_dicom_rules(ori_dict)

        # Apply Presidio to the tags after applying rules
        try:
            json_analysis = self.generate_analysis(processed["for_presidio_processing"])
            anonymized_json = self.structured_engine.anonymize(
                processed["for_presidio_processing"], json_analysis
            )

            results = merge_results_rules_presidio(
                processed["rules_applied"], anonymized_json
            )
            # TODO: diff_dicts(results, gt_dict) if dicom_df has the gt (needs to be restructured)
            return diff_dicts(
                results, ori_dict, overredact=self.overredact, verbose=self.verbose
            )
        except Exception as exc:
            # Best effort: a failing record yields None instead of aborting the batch.
            # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through,
            # and the reason is reported when verbose so failures are not invisible.
            if self.verbose:
                print(f"deid_and_compare failed: {exc!r}")
            return None
            
@pandas_udf("boolean")
def diff_after_presidio(ori_json: pd.Series) -> pd.Series:
    """Pandas UDF: run the rule + Presidio comparison on each metadata JSON string.

    A new ``DicomTagsPresidio`` is built on every invocation of this function and
    applied element-wise; entries for which ``deid_and_compare`` returns ``None``
    stay null in the output.
    """
    redactor = DicomTagsPresidio(JsonDataProcessor(), overredact=True, verbose=True)
    predictions = ori_json.apply(redactor.deid_and_compare)
    return predictions

In [0]:
# Smoke-test the pipeline on the single sampled record before running it through Spark.
instance = DicomTagsPresidio(
    JsonDataProcessor(),
    overredact=True,
    verbose=True,
    n_process=8,  # forwarded via **kwargs to JsonAnalysisBuilder.__init__
)
first_result = instance.deid_and_compare(meta)

In [0]:
# Sanity check: the single-record prediction must equal the golden label cast to bool.
# NOTE(review): first_result is None when deid_and_compare caught an exception, which
# also makes this assertion fail.
assert first_result == bool(label)

In [0]:
# Keep only the columns needed for scoring and spread them over 320 partitions,
# presumably to parallelise the pandas UDF applied in the next step.
# (For a quick trial run, sample first with `.orderBy(rand()).limit(100)`.)
df = (
    dicom_df
    .select(["metadata", "label", "path", "path_masked", "diffing_tag_list"])
    .repartition(320)
)
display(df.limit(5))

In [0]:
from pyspark.sql.functions import col

# Score every record: the pandas UDF yields a boolean per metadata JSON, or null when
# deid_and_compare returned None for that record.
df_results = df.withColumn("predicted", diff_after_presidio(col("metadata")))
# Preview the first five scored rows.
display(df_results.limit(5))

In [0]:
# Collect predictions and golden labels into pandas for scoring.
# Rows with a null prediction (deid_and_compare returned None) are excluded here, so the
# metrics computed from `pdf` only cover records that were processed successfully.
pdf = (
    df_results
    .select("predicted", "label")
    .where(col("predicted").isNotNull())
    .toPandas()
)
# Preview the collected pandas frame that is scored next.
display(pdf.head(5))

In [0]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix

# Binary classification metrics of the predictions against the golden labels.
# NOTE(review): the recorded confusion matrix sums to 3580 rows while dicom_df.count()
# reported 3581, so presumably one record had a null prediction and was filtered out
# before scoring -- confirm before quoting these scores as covering the full benchmark.
precision = precision_score(pdf.label, pdf.predicted)
recall = recall_score(pdf.label, pdf.predicted)
f1 = f1_score(pdf.label, pdf.predicted)
accuracy = accuracy_score(pdf.label, pdf.predicted)
cm = confusion_matrix(pdf.label, pdf.predicted)

# Show all metrics together as the cell output.
precision, recall, f1, accuracy, cm

(1.0,
 1.0,
 1.0,
 1.0,
 array([[1767,    0],
        [   0, 1813]]))