In [None]:
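# Install necessary libraries.
# Note: the spaCy model "en_core_web_sm" loaded below is a separate package; if
# it is not already installed, run `python -m spacy download en_core_web_sm`.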
!pip install spacy presidio-analyzer presidio-anonymizer -q

# Import required libraries
import spacy
from spacy.pipeline import EntityRuler
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine

# Step 1: Generate a synthetic document
# The letter is entirely made up and intentionally contains several kinds of
# PII-looking data (a name, a street address, a phone number, an email address
# and a ticket number) so the detection steps have something to find.
fake_document = """
Dear John Doe,

We appreciate your feedback on our services. Your experience at 123 Elm Street, Springfield, was invaluable.
If you have further concerns, please contact us at 555-1234 or email john.doe@example.com.
Your ticket number is #12345.

Best regards,
Customer Service Team
"""

# Save the fake document to a text file.
# The encoding is pinned to UTF-8 so the bytes written do not depend on the
# platform's default locale encoding.
with open("feedback_data.txt", "w", encoding="utf-8") as file:
    file.write(fake_document)

# Step 2: Load the NLP model and analyze the document
# Load spaCy NLP model (the "en_core_web_sm" pipeline must already be installed)
nlp = spacy.load("en_core_web_sm")

# Read the synthetic document back from disk.
# UTF-8 is requested explicitly so decoding does not depend on the platform's
# default locale encoding.
with open("feedback_data.txt", "r", encoding="utf-8") as file:
    feedback = file.read()

# Process the document using spaCy; recognized entities are exposed as doc.ents
doc = nlp(feedback)

# Display the detected entities, one per line, with the label spaCy assigned
print("Detected Entities:")
for ent in doc.ents:
    print(f"Entity: {ent.text} | Label: {ent.label_}")

# Step 3: Mask the detected PII
# Each entity is masked by its character span rather than with str.replace().
# str.replace() rewrites every occurrence of the entity text anywhere in the
# document, so a short entity (e.g. "123") can corrupt a longer one that
# contains it (e.g. "12345"); the longer entity then no longer matches and part
# of it is left unmasked. It can also rewrite text inside labels that were
# inserted by an earlier iteration.
# Only the spans the model actually tagged are masked.
source_text = doc.text  # the text the entity offsets refer to
masked_parts = []
cursor = 0  # offset of the first character of source_text not yet copied
# doc.ents is iterated left to right; spaCy reports entities in document order
# and without overlaps, so one forward pass is enough.
for ent in doc.ents:
    masked_parts.append(source_text[cursor:ent.start_char])  # text before the entity
    masked_parts.append(f"[{ent.label_}]")                   # placeholder for the entity
    cursor = ent.end_char
masked_parts.append(source_text[cursor:])  # text after the last entity
masked_feedback = "".join(masked_parts)

print("\nMasked Document:")
print(masked_feedback)

# Save the masked document to a file (UTF-8, independent of the locale default)
with open("masked_feedback_data.txt", "w", encoding="utf-8") as file:
    file.write(masked_feedback)

# Step 4: Use Presidio for enhanced PII detection
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# Analyze the text for PII, then anonymize the spans the analyzer reported
results = analyzer.analyze(text=feedback, language="en")
# anonymize() returns a result object (not a plain string); the anonymized
# document itself is available on its .text attribute.
anonymized_feedback = anonymizer.anonymize(text=feedback, analyzer_results=results)

print("\nAnonymized Document using Presidio:")
print(anonymized_feedback)

# Save the anonymized document to a file.
# file.write() only accepts str, so the text is taken from the result object;
# passing the result object itself raises TypeError.
with open("anonymized_feedback_data.txt", "w", encoding="utf-8") as file:
    file.write(anonymized_feedback.text)

# Optional: Visualize detected entities with spaCy's Displacy
from spacy import displacy

# Render the entity view of the spaCy document with Jupyter output enabled
displacy_options = {"style": "ent", "jupyter": True}
displacy.render(doc, **displacy_options)
