# HIPAA Privacy Rule-based De-identification on DICOM Dataset

HIPAA provides two methods for de-identification: the "Safe Harbor" method and the "Expert Determination" method. The Safe Harbor method is more straightforward and involves anonymizing/redacting 18 specific types of identifiers from the data.

Here, we will focus on the Safe Harbor method, which includes removing or redacting identifiers such as names, geographic subdivisions smaller than a state, dates directly related to an individual, phone numbers, email addresses, and more.

After de-identification, the DICOM file will be updated, uploaded to the destination storage, and evaluated by the AWS services Rekognition, Comprehend, and Comprehend Medical.

## Set Up the De-identification Environment

Let's start by setting the environment variables for de-identification of the DICOM files:
1) Set the local path of the DICOM image folder.
2) Set the source and destination S3 buckets.
3) Set the source and destination prefixes for the DICOM files.
4) Clean up the de-identified DICOM directory and the evaluation DICOM directory.
5) Set the AWS session with the user profile name.

In [None]:
from med_img_de_id_class import ProcessMedImage
from common.utils import get_boto3_session, cleanup_dir, get_date_time, dump_dict_to_tsv
# ---------------------------------------------------------------------------
# Notebook configuration
# ---------------------------------------------------------------------------

# Local folders: raw input DICOM files, de-identified output, evaluation output.
LOC_DICOM_FOLDER = "/Users/gup2/Documents/AI/Dicom_files/manifest-1617826555824/Pseudo-PHI-DICOM-Data"
LOC_DE_ID_DICOM_FOLDER = "../images/med_de_id_img/Pseudo-PHI-DICOM-Data"
LOC_EVAL_DICOM_FOLDER = "../images/med_eval_img/Pseudo-PHI-DICOM-Data"

# S3 buckets and the key prefixes used inside them.
SOURCE_BUCKET, SOURCE_PREFIX = "de-id-src", "dicom-images/"
DESTINATION_BUCKET, DESTINATION_PREFIX = "de-id-dest", "de-id-dicom-images/"
EVAL_BUCKET, EVAL_PREFIX = "de-id-evl", "eval-de-id-dicom-images/"

# Dictionary keys for per-file report rows.
FILE_NAME = "file_name"
FILE_PATH = "file_path"
FILE_PREFIX = "prefix"

# Rule configuration handed to every ProcessMedImage instance in this notebook.
rule_config_file_path = "../configs/de-id/de_id_rules_auto.yaml"

# No AWS session is created here; the evaluation cells build their own with
# get_boto3_session().
aws_session = None

# Batch settings (not referenced by the cells visible in this notebook).
batch_size = 164
batch_max = 10
batch_count = 0

# Clean up both output folders before the run starts.
cleanup_dir([LOC_DE_ID_DICOM_FOLDER, LOC_EVAL_DICOM_FOLDER])




## De-identification On Batch DICOM files

In [3]:
import glob, os, datetime

# Batch de-identification: for every DICOM file under LOC_DICOM_FOLDER,
# de-identify it, redact any identifying text detected in the pixel data,
# and save the result under LOC_DE_ID_DICOM_FOLDER using the last four
# components of the input path.

# Per-file report rows. Never populated while the reporting code is disabled.
dicom_list = []
dicom_files = glob.glob('{}/**/*.dcm'.format(LOC_DICOM_FOLDER), recursive=True)
print(f'Found {len(dicom_files)} DICOM files under {LOC_DICOM_FOLDER}')
start_date_time = datetime.datetime.now()
print(f"Start De-id on Batch of DICAM Files at {get_date_time()}")
processed_count = 0  # files the loop has finished with, parsed or not
de_id_count = 0      # files in which PHI was found and redacted in the pixel data
try:
    # create a de-id processor
    processor = ProcessMedImage(aws_session, rule_config_file_path, True)
    last_de_id_dir = None
    for filepath in dicom_files:
        # Keep the last four path components so the output tree mirrors the input.
        short_file_path = '/'.join(filepath.split('/')[-4:])
        local_de_id_dicom = os.path.join(LOC_DE_ID_DICOM_FOLDER, short_file_path)

        # Only call makedirs when the target directory changes between files.
        target_dir = os.path.dirname(local_de_id_dicom)
        if target_dir != last_de_id_dir:
            os.makedirs(target_dir, exist_ok=True)
            last_de_id_dir = target_dir

        processor.parse_dicom_file(None, None, filepath)
        if processor.ds:
            redacted_count, redacted_tags = processor.de_identify_dicom()
            id_text_detected, text_in_image = processor.detect_id_in_img(None, None)
            if text_in_image and id_text_detected:
                print (f'Found PHI in pixel: {id_text_detected} in DICOM: {short_file_path}.')
                processor.redact_id_in_image(id_text_detected)
                print('PHI in pixel have been redacted')
                de_id_count += 1
            processor.save_de_id_dicom(local_de_id_dicom)
        # dicom_list.append({FILE_NAME: filename, "Redacted Tags": redacted_count, "Text In Pixel": text_in_image, "PHI In Pixel": id_text_detected, FILE_PATH: short_file_path})
        processed_count += 1
except Exception as e:
    print(f'Error processing dicom file: {e}')
    # Bare raise keeps the original traceback intact.
    raise
finally:
    processor = None
# create a tsv file with the results of the de-id process
# running_report = f"../output/report/Batch-De-Id-Report-{get_date_time()}.tsv"
# dump_dict_to_tsv(dicom_list, running_report)
end_date_time = datetime.datetime.now()
print(f'Completed De-in on batch of DiCAM files at {get_date_time()}')
# Report the loop counter: dicom_list is never filled, so its length is always 0.
print(f'Total {processed_count} DICOM files are processed.')
run_time = (end_date_time - start_date_time).total_seconds()
print(f'Total run time: {run_time} seconds')

# from IPython.display import Markdown, display
# markdown_content = f"Click [Batch De-identification Report ]({running_report}) to download report."
# # Display the Markdown
# display(Markdown(markdown_content))

## Statistics of De-identification on Batch DICOM Dataset

In [None]:
# Summary statistics for the batch de-identification run above.
# Reads dicom_files, processed_count, de_id_count and run_time from that cell.
total_dicom_files = len(dicom_files)
print(f"Number of DICOM files: {total_dicom_files}")
# Every denominator is guarded so an empty input folder (or a run that
# processed nothing) prints 0 instead of raising ZeroDivisionError.
processed_rate = round(processed_count / total_dicom_files * 100, 3) if total_dicom_files else 0.0
print(f"Processed {processed_rate}% of DICOM files")
mean_process_time = round(run_time / total_dicom_files, 1) if total_dicom_files else 0.0
print(f"Average processing time per DICOM: {mean_process_time} seconds")
redacted_count = de_id_count
print(f"Number of Redacted DICOM: {redacted_count}")
# Scale to a percentage BEFORE rounding; rounding the raw ratio first
# collapses every result to either 0% or 100%.
redacted_ratio = round(redacted_count / processed_count * 100, 3) if processed_count else 0.0
print(f"Redacted Ratio: {redacted_ratio}%")
# de_id_count is only incremented in the de-identification loop when PHI was
# detected and redacted in the pixel data, so it is reported as pixel PHI.
phi_in_pixel_count = de_id_count
print(f"Number of DICOM with PHI in Pixel: {phi_in_pixel_count}")
# text_in_pixel_count = len([file for file in dicom_list if file["Text In Pixel"]> 0])
# phi_in_pixel_dicom = [file for file in dicom_list if len(file["PHI In Pixel"])> 0]
# phi_in_pixel_count = len(phi_in_pixel_dicom)
# print(f"Number of DICOM with text in Pixel: {text_in_pixel_count} detected by Tesseract with confidence threshold of 5%.")
# print(f"Number of DICOM with PHI in Pixel: {phi_in_pixel_count} detected by Rules")
# for file in phi_in_pixel_dicom:
#     print(f'file: {file['file_path']}, PHI in pixel: {file["PHI In Pixel"]}')

# cleanup unused resources
# dicom_list = None
dicom_files = None


## Evaluate Batch Redacted DICOM Files

In [None]:
import random, glob, os
# Evaluation pass over the de-identified files: detect any identifiers still
# present in the DICOM tags, redact them, and save the corrected file under
# LOC_EVAL_DICOM_FOLDER.

# Per-file report rows. Never populated while the reporting code is disabled.
eval_dicom_list = []
eval_dicom_files = glob.glob('{}/**/*.dcm'.format(LOC_DE_ID_DICOM_FOLDER), recursive=True)
de_id_dicom_count = len(eval_dicom_files)
print(f'Found {de_id_dicom_count} DICOM files under {LOC_DE_ID_DICOM_FOLDER}')
print(f"Start Evaluating Batch De-id DICOM Files at {get_date_time()}")
processed_count = 0  # files the loop has finished with, parsed or not
de_id_count = 0      # files in which remaining PHI was found in the tags and redacted
# Bind the name before `try` so `finally` can tell "never created" apart from
# a live processor, even if session or processor creation fails.
processor = None
try:
    # create a de-id processor
    aws_session = get_boto3_session("esi")
    processor = ProcessMedImage(aws_session, rule_config_file_path, True)
    last_eval_dir = None
    for filepath in eval_dicom_files:
        processor.parse_dicom_file(None, None, filepath)
        if processor.ds:
            detected_elements, detected_tags, id_entities = processor.detect_id_in_tags()
            if detected_elements and id_entities:
                for elem in detected_elements:
                    print(f'Found PHI in DICOM: {elem} at {filepath}.')
                # redact id in dicom tags
                processor.redact_tags(detected_elements)
                de_id_count += 1

                # Mirror the de-id folder layout under the evaluation folder.
                eval_dicom = filepath.replace(LOC_DE_ID_DICOM_FOLDER, LOC_EVAL_DICOM_FOLDER)
                eval_dir = os.path.dirname(eval_dicom)
                if eval_dir != last_eval_dir:
                    os.makedirs(eval_dir, exist_ok=True)
                    last_eval_dir = eval_dir

                processor.save_de_id_dicom(eval_dicom)
            # eval_dicom_list.append({FILE_NAME: filename, "Redacted Tags": redacted_tags, "Text In Pixel": text_in_image, "PHI In Pixel": phi_in_image, FILE_PATH: filepath})
        processed_count += 1
except Exception as e:
    print(f'Error processing de-id dicom file: {e}')
    # Bare raise keeps the original traceback intact.
    raise
finally:
    # Skip cleanup when the processor was never created; otherwise the
    # AttributeError raised here would hide the real failure.
    if processor is not None:
        processor.update_rules_in_configs(rule_config_file_path)
        processor.close()
        processor = None
# create a tsv file with the results of the de-id process
# running_report = f"../output/report/Batch-Eval-Report-{get_date_time()}.tsv"
# dump_dict_to_tsv(eval_dicom_list, running_report)
print(f'Completed Evaluating on batch of De-id DICAM files at {get_date_time()}')
# Report the redaction counter: eval_dicom_list is never filled, so its length is always 0.
print(f'Total {de_id_count} DICOM files have remaining PHI and being redacted.')

# from IPython.display import Markdown, display
# markdown_content = f"Click [Batch Evaluating Report ]({running_report}) to download report."
# # Display the Markdown
# display(Markdown(markdown_content))

## Statistics of Evaluation on Batch De-identified DICOM Dataset

In [None]:

# Summary statistics for the tag evaluation run above.
# Reads eval_dicom_files, processed_count and de_id_count from that cell.
total_dicom_files = len(eval_dicom_files)
print(f"Number of De-id DICOM files: {total_dicom_files}")
# Denominators are guarded so an empty de-id folder (or a run that processed
# nothing) prints 0 instead of raising ZeroDivisionError.
processed_rate = round(processed_count / total_dicom_files * 100, 3) if total_dicom_files else 0.0
print(f"Processed {processed_rate}% of De-id DICOM files")
redacted_count = de_id_count
print(f"Number of Redacted De-id DICOM: {redacted_count}")
# Scale to a percentage BEFORE rounding; rounding the raw ratio first
# collapses every result to either 0% or 100%.
redacted_ratio = round(redacted_count / processed_count * 100, 3) if processed_count else 0.0
print(f"Redacted Ratio: {redacted_ratio}%")
redacted_metadata_count = de_id_count
print(f"Number of De-id DICOM with PHI in Metadata: {redacted_metadata_count}")
# text_in_pixel_count = len([file for file in eval_dicom_list if file["Text In Pixel"]> 0])
# phi_in_pixel_count = len([file for file in eval_dicom_list if file["PHI In Pixel"]> 0])
# print(f"Number of De-id DICOM with text in Pixel: {text_in_pixel_count} detected by AWS ReKognition")
# print(f"Number of De-id DICOM with PHI in Pixel: {phi_in_pixel_count} detected by AWS Comprehend Medical")

# cleanup unused resources
# eval_dicom_list = None
eval_dicom_files = None

## Evaluate Redaction Model De-identification on Pixel Data

In [None]:
import glob, os
# Evaluation pass over the pixel data of the de-identified files: detect
# identifying text in each image, redact it, and overwrite the de-identified
# file in place.

# Per-file report rows. Never populated while the reporting code is disabled.
eval_dicom_list = []
eval_dicom_files = glob.glob('{}/**/*.dcm'.format(LOC_DE_ID_DICOM_FOLDER), recursive=True)
de_id_dicom_count = len(eval_dicom_files)
print(f'Found {de_id_dicom_count} DICOM files under {LOC_DE_ID_DICOM_FOLDER}')
print(f"Start Evaluating Batch De-id DICOM Files at {get_date_time()}")
# NOTE(review): this overrides the EVAL_BUCKET value set in the setup cell.
EVAL_BUCKET = "de-id-dst"

processed_count = 0  # files the loop has finished with, parsed or not
de_id_count = 0      # files in which PHI was found and redacted in the pixel data
# Bind the name before `try` so `finally` can tell "never created" apart from
# a live processor, even if session or processor creation fails.
processor = None
try:
    # create a de-id processor
    aws_session = get_boto3_session("esi")
    processor = ProcessMedImage(aws_session, rule_config_file_path, True)
    # NOTE(review): the first 10 files are skipped and never evaluated. This
    # looks like a leftover from resuming an interrupted run - confirm, and
    # set this to 0 to evaluate every file.
    first_index = 10
    for filepath in eval_dicom_files[first_index:]:
        filename = os.path.basename(filepath)
        prefix = os.path.join(SOURCE_PREFIX, '/'.join(filepath.split('/')[-4:-2]))
        key = os.path.join(prefix, filename)
        processor.parse_dicom_file(EVAL_BUCKET, key, filepath, True)
        if processor.ds:
            id_text_detected, text_in_image = processor.detect_id_in_img(EVAL_BUCKET, key, True)
            if text_in_image and id_text_detected:
                print (f'Found PHI in pixel: {id_text_detected} in DICOM: {filepath}.')
                processor.redact_id_in_image(id_text_detected)
                print('PHI in pixel have been redacted')
                de_id_count += 1
                # Overwrites the de-identified file with the further-redacted version.
                processor.save_de_id_dicom(filepath)
            # eval_dicom_list.append({FILE_NAME: filename, "Redacted Tags": redacted_tags, "Text In Pixel": text_in_image, "PHI In Pixel": phi_in_image, FILE_PATH: filepath})
        processed_count += 1
except Exception as e:
    print(f'Error processing de-id dicom file: {e}')
    # Bare raise keeps the original traceback intact.
    raise
finally:
    # Skip cleanup when the processor was never created; otherwise the
    # AttributeError raised here would hide the real failure.
    if processor is not None:
        processor.update_rules_in_configs(rule_config_file_path)
        processor.close()
        processor = None
# create a tsv file with the results of the de-id process
# running_report = f"../output/report/Batch-Eval-Report-{get_date_time()}.tsv"
# dump_dict_to_tsv(eval_dicom_list, running_report)
print(f'Completed Evaluating on batch of De-id DICAM files at {get_date_time()}')
# Report the loop counter: eval_dicom_list is never filled, so its length is always 0.
print(f'Total {processed_count} DICOM files are processed.')

# from IPython.display import Markdown, display
# markdown_content = f"Click [Batch Evaluating Report ]({running_report}) to download report."
# # Display the Markdown
# display(Markdown(markdown_content))

## Statistics of evaluation on Pixel data

In [None]:
# Summary statistics for the pixel-data evaluation run above.
# Reads eval_dicom_files, processed_count and de_id_count from that cell.
total_dicom_files = len(eval_dicom_files)
print(f"Number of De-id DICOM files: {total_dicom_files}")
# Denominators are guarded so an empty de-id folder (or a run that processed
# nothing) prints 0 instead of raising ZeroDivisionError.
processed_rate = round(processed_count / total_dicom_files * 100, 3) if total_dicom_files else 0.0
print(f"Processed {processed_rate}% of De-id DICOM files")
redacted_count = de_id_count
print(f"Number of Redacted De-id DICOM: {redacted_count}")
# Scale to a percentage BEFORE rounding; rounding the raw ratio first
# collapses every result to either 0% or 100%.
redacted_ratio = round(redacted_count / processed_count * 100, 3) if processed_count else 0.0
print(f"Redacted Ratio: {redacted_ratio}%")
# The pixel evaluation loop increments de_id_count only when PHI was detected
# and redacted in the pixel data, so it is reported as pixel PHI.
phi_in_pixel_count = de_id_count
print(f"Number of De-id DICOM with PHI in Pixel: {phi_in_pixel_count}")

# cleanup unused resources
# eval_dicom_list = None
eval_dicom_files = None

In [None]:
# close the processor
# Each processing cell resets `processor` to None in its `finally` block, so
# an unconditional close() here would raise AttributeError. Only close a
# processor that is still live.
if processor is not None:
    processor.close()
    processor = None