<a href="https://colab.research.google.com/github/emshashank/GithubActionsTutorial/blob/master/PII_Redaction1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# download presidio
!pip install presidio_analyzer presidio_anonymizer
!python -m spacy download en_core_web_lg

In [18]:
from presidio_analyzer import AnalyzerEngine, PatternRecognizer
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig
import json
from pprint import pprint

In [13]:
text_to_anonymize = "His name is Mr. Jones and his phone number is 212-555-5555"

In [16]:
analyzer = AnalyzerEngine()
analyzer_results = analyzer.analyze(text=text_to_anonymize, entities=["PHONE_NUMBER"], language='en')

print(analyzer_results)



[type: PHONE_NUMBER, start: 46, end: 58, score: 0.75]


In [19]:
titles_recognizer = PatternRecognizer(supported_entity="TITLE",
                                      deny_list=["Mr.","Mrs.","Miss"])

pronoun_recognizer = PatternRecognizer(supported_entity="PRONOUN",
                                       deny_list=["he", "He", "his", "His", "she", "She", "hers", "Hers"])

analyzer.registry.add_recognizer(titles_recognizer)
analyzer.registry.add_recognizer(pronoun_recognizer)

analyzer_results = analyzer.analyze(text=text_to_anonymize,
                            entities=["TITLE", "PRONOUN"],
                            language="en")
print(analyzer_results)


[type: PRONOUN, start: 0, end: 3, score: 1.0, type: TITLE, start: 12, end: 15, score: 1.0, type: PRONOUN, start: 26, end: 29, score: 1.0]


In [20]:
anonymizer = AnonymizerEngine()

anonymized_results = anonymizer.anonymize(
    text=text_to_anonymize,
    analyzer_results=analyzer_results,
    operators={"DEFAULT": OperatorConfig("replace", {"new_value": "<ANONYMIZED>"}),
                        "PHONE_NUMBER": OperatorConfig("mask", {"type": "mask", "masking_char" : "*", "chars_to_mask" : 12, "from_end" : True}),
                        "TITLE": OperatorConfig("redact", {})}
)

print(f"text: {anonymized_results.text}")
print("detailed response:")

pprint(json.loads(anonymized_results.to_json()))

text: <ANONYMIZED> name is  Jones and <ANONYMIZED> phone number is 212-555-5555
detailed response:
{'items': [{'end': 44,
            'entity_type': 'PRONOUN',
            'operator': 'replace',
            'start': 32,
            'text': '<ANONYMIZED>'},
           {'end': 21,
            'entity_type': 'TITLE',
            'operator': 'redact',
            'start': 21,
            'text': ''},
           {'end': 12,
            'entity_type': 'PRONOUN',
            'operator': 'replace',
            'start': 0,
            'text': '<ANONYMIZED>'}],
 'text': '<ANONYMIZED> name is  Jones and <ANONYMIZED> phone number is '
         '212-555-5555'}
