In [87]:
import fitz
import re
import os

In [88]:
pwd

'/Users/ethanpanal/Documents'

In [107]:
# Define paths
input_folder = "./UP/Capstone/ahcm/raw_ahcm_pdf"
output_folder = "./UP/Capstone/ahcm/ahcm_redacted"

In [108]:
# Ensure output folder exists
os.makedirs(output_folder, exist_ok=True)

In [116]:
# Define regex patterns for sensitive data
patterns = {
    "SSN": r"\b\d{3}-\d{2}-\d{4}\b",
    "Phone": r"\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b",
    "Email": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b",
    "Credit Card": r"\b(?:\d{4}[-\s]?){3}\d{4}\b",
    "Birthdate": r"Birthdate:\s*\d{2}/\d{2}/\d{4}"
}

# Additional sensitive line patterns
lo_year_line_pattern = r".*LO-\d{4}-.*"
client_line_pattern = r".*\(Client\).*"
staff_present_pattern = r".*Staff Present:\s*.*"
date_line_pattern = r".*\b\d{2}/\d{2}/\d{4}\b.*"


In [117]:
def anonymize_pdf(input_pdf, output_pdf):
    doc = fitz.open(input_pdf)

    for page in doc:
        text = page.get_text("text")

        # Redact sensitive data using regex patterns
        sensitive_lines = (
            re.findall(client_line_pattern, text) + 
            re.findall(lo_year_line_pattern, text) + 
            re.findall(staff_present_pattern, text) +
            re.findall(date_line_pattern, text)
        )
        for line in sensitive_lines:
            found = page.search_for(line) or []
            for rect in found:
                page.add_redact_annot(rect, fill=(0, 0, 0))

        for label, pattern in patterns.items():
            matches = re.findall(pattern, text)
            for match in matches:
                found = page.search_for(match) or []
                for rect in found:
                    page.add_redact_annot(rect, fill=(0, 0, 0))

        # Apply all redactions
        page.apply_redactions()

    doc.save(output_pdf)
    print(f"Anonymized PDF saved as {output_pdf}")

In [118]:
# Process all PDFs in the input folder
for filename in os.listdir(input_folder):
    if filename.startswith("ahcm_") and filename.endswith(".pdf"):
        unique_id = filename.replace("ahcm_", "").replace(".pdf", "")
        input_pdf = os.path.join(input_folder, filename)
        output_pdf = os.path.join(output_folder, f"redacted_{unique_id}.pdf")

        anonymize_pdf(input_pdf, output_pdf)

Anonymized PDF saved as ./UP/Capstone/ahcm/ahcm_redacted/redacted_e14d25933774.pdf
Anonymized PDF saved as ./UP/Capstone/ahcm/ahcm_redacted/redacted_e0f34e89bcb6.pdf
Anonymized PDF saved as ./UP/Capstone/ahcm/ahcm_redacted/redacted_b0b025173dad.pdf
Anonymized PDF saved as ./UP/Capstone/ahcm/ahcm_redacted/redacted_19a2b65f2005.pdf
Anonymized PDF saved as ./UP/Capstone/ahcm/ahcm_redacted/redacted_fc16b67cf2c3.pdf
Anonymized PDF saved as ./UP/Capstone/ahcm/ahcm_redacted/redacted_216266668847.pdf
Anonymized PDF saved as ./UP/Capstone/ahcm/ahcm_redacted/redacted_f9b9fc557c40.pdf
Anonymized PDF saved as ./UP/Capstone/ahcm/ahcm_redacted/redacted_df9d65c8a899.pdf
Anonymized PDF saved as ./UP/Capstone/ahcm/ahcm_redacted/redacted_646e638b344c.pdf
Anonymized PDF saved as ./UP/Capstone/ahcm/ahcm_redacted/redacted_aca9ae161017.pdf
Anonymized PDF saved as ./UP/Capstone/ahcm/ahcm_redacted/redacted_9fbaba2f4646.pdf
Anonymized PDF saved as ./UP/Capstone/ahcm/ahcm_redacted/redacted_ec9ae72bd102.pdf
Anon