In [3]:
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine

In [112]:
text="My phone number is 212-555-5555 or 470-815-2571 and the important info is 4147 2026 0114 6333"

# Set up the engine, loads the NLP module (spaCy model by default) 
# and other PII recognizers
analyzer = AnalyzerEngine()

In [107]:
results = analyzer.analyze(text=text,
                           entities=["PHONE_NUMBER","CREDIT_CARD"],
                           language='en')

In [91]:
[(text[res.start:res.end], res.start, res.end, res.score) for res in results]

[('4147 2026 0114 6333', 58, 77, 1.0), ('212-555-5555', 19, 31, 0.75)]

In [120]:
def detect_pii(text: str, entities: List[str]) -> Tuple[str, Dict[str, str]]:
    analyzer = AnalyzerEngine()
    results = analyzer.analyze(text=text, entities=entities, language='en')
    mapping = {}
    for result in results:
        result_dict = result.to_dict()
        mapping[result_dict['entity_type']][text[result.start:result.end]] = result.score
        # if result_dict['entity_type'] in mapping.keys():
        #     mapping[result_dict['entity_type']][text[result.start:result.end]] = result.score)
        # else:
        #     mapping[result_dict['entity_type']] = {text[result.start:result.end]:result.score}
    return text, mapping
detect_pii(text,entities)

KeyError: 'CREDIT_CARD'

In [140]:
def detect_pii(text: str, entities: List[str]) -> Tuple[str, Dict[str, Dict[str, float]]]:
    analyzer = AnalyzerEngine()
    results = analyzer.analyze(text=text, entities=entities, language='en')
    mapping = {}

    for result in results:
        result_dict = result.to_dict()
        entity_type = result_dict['entity_type']
        entity_text = text[result.start:result.end]
        entity_score = result.score

        if entity_type not in mapping:
            mapping[entity_type] = {}

        # If the entity text already exists, we keep the higher score
        if entity_text in mapping[entity_type]:
            mapping[entity_type][entity_text] = max(entity_score, mapping[entity_type][entity_text])
        else:
            mapping[entity_type][entity_text] = entity_score

    return text, [result.to_dict() for result in results]
detect_pii(text,entities)

('My phone number is 212-555-5555 or 470-815-2571 and the important info is 4147 2026 0114 6333',
 [{'entity_type': 'CREDIT_CARD',
   'start': 74,
   'end': 93,
   'score': 1.0,
   'analysis_explanation': None,
   'recognition_metadata': {'recognizer_name': 'CreditCardRecognizer',
    'recognizer_identifier': 'CreditCardRecognizer_6324617904'}},
  {'entity_type': 'PHONE_NUMBER',
   'start': 19,
   'end': 31,
   'score': 0.75,
   'analysis_explanation': None,
   'recognition_metadata': {'recognizer_name': 'PhoneRecognizer',
    'recognizer_identifier': 'PhoneRecognizer_6324618240'}},
  {'entity_type': 'PHONE_NUMBER',
   'start': 35,
   'end': 47,
   'score': 0.75,
   'analysis_explanation': None,
   'recognition_metadata': {'recognizer_name': 'PhoneRecognizer',
    'recognizer_identifier': 'PhoneRecognizer_6324618240'}}])

In [141]:
def detect_pii(text: str, entities: List[str]) -> Tuple[str, Dict[str, Dict[str, float]]]:
    analyzer = AnalyzerEngine()
    results = analyzer.analyze(text=text, entities=entities, language='en')
    mapping = {}

    for result in results:
        result_dict = result.to_dict()
        entity_type = result_dict['entity_type']
        entity_text = text[result.start:result.end]
        entity_score = result.score

        if entity_type not in mapping:
            mapping[entity_type] = {}

        # If the entity text already exists, we keep the higher score
        if entity_text in mapping[entity_type]:
            mapping[entity_type][entity_text] = max(entity_score, mapping[entity_type][entity_text])
        else:
            mapping[entity_type][entity_text] = entity_score

    return text, mapping
detect_pii(text,entities)

('My phone number is 212-555-5555 or 470-815-2571 and the important info is 4147 2026 0114 6333',
 {'CREDIT_CARD': {'4147 2026 0114 6333': 1.0},
  'PHONE_NUMBER': {'212-555-5555': 0.75, '470-815-2571': 0.75}})

In [136]:
def detect_pii(text: str, entities: List[str]) -> Tuple[str, Dict[str, Dict[str, float]]]:
    analyzer = AnalyzerEngine()
    results = analyzer.analyze(text=text, entities=entities, language='en')
    mapping = {}

    for result in results:
        result_dict = result.to_dict()
        entity_type = result_dict['entity_type']
        entity_text = text[result.start:result.end]
        entity_score = result.score

        if entity_type not in mapping:
            mapping[entity_type] = {}

        mapping[entity_type][entity_text] = {"start":result.start, "end":result.end, "confidence":result.score}

    return text, mapping
detect_pii(text,entities)

('My phone number is 212-555-5555 or 470-815-2571 and the important info is 4147 2026 0114 6333',
 {'CREDIT_CARD': {'4147 2026 0114 6333': {'start': 74,
    'end': 93,
    'confidence': 1.0}},
  'PHONE_NUMBER': {'212-555-5555': {'start': 19,
    'end': 31,
    'confidence': 0.75},
   '470-815-2571': {'start': 35, 'end': 47, 'confidence': 0.75}}})

In [128]:
def mask_pii(text: str, entities: List[str]) -> Tuple[str, Dict[str, str]]:
    analyzer = AnalyzerEngine()
    results = analyzer.analyze(text=text, entities=entities, language='en')
    mapping = {}
    result_text = text
    for result in results:
        original_text = text[result.start:result.end]
        masked_text = original_text[0:2] + "X" * (len(original_text) - 4) + original_text[-2:]
        result_text = result_text[:result.start] + masked_text + result_text[result.end:]
        entity_type = result.to_dict()['entity_type']
        result_dict = result.to_dict()
        if entity_type not in mapping:
            mapping[entity_type] = {}
        mapping[entity_type][masked_text] = original_text            
    return result_text, mapping
mask_pii(text,entities)

('My phone number is 21XXXXXXXX55 or 47XXXXXXXX71 and the important info is 41XXXXXXXXXXXXXXX33',
 {'CREDIT_CARD': {'41XXXXXXXXXXXXXXX33': '4147 2026 0114 6333'},
  'PHONE_NUMBER': {'21XXXXXXXX55': '212-555-5555',
   '47XXXXXXXX71': '470-815-2571'}})

In [143]:
def anonymize_pii(text: str, entities: List[str]) -> Tuple[str, Dict[str, str]]:
    anonymizer = PresidioReversibleAnonymizer(analyzed_fields = entities, faker_seed=3)
    anonymized_results = anonymizer.anonymize(text)
    mapping = anonymizer.deanonymizer_mapping
    return anonymized_results, mapping
anonymize_pii(text,entities)

('My phone number is 001-958-999-1394x411 or 001-897-825-1027x3464 and the important info is 4887623286016',
 {'PHONE_NUMBER': {'001-958-999-1394x411': '212-555-5555',
   '001-897-825-1027x3464': '470-815-2571'},
  'CREDIT_CARD': {'4887623286016': '4147 2026 0114 6333'}})

In [118]:
mapping['PHONE_NUMBER']

{'001-958-999-1394x411': '212-555-5555',
 '001-897-825-1027x3464': '470-815-2571'}

In [111]:
from presidio_analyzer import AnalyzerEngine
from typing import List, Dict, Tuple

def detect_pii(text: str, entities: List[str], threshold: float) -> Tuple[str, Dict[str, List[Dict[str, float]]]]:
    analyzer = AnalyzerEngine()
    results = analyzer.analyze(text=text, entities=entities, language='en')
    mapping = {}
    for result in results:
        result_dict = result.to_dict()
        
        if result_dict['score'] > threshold:
            if result_dict['entity_type'] not in mapping:
                mapping[result_dict['entity_type']] = []
                        
            entity_text = text[result_dict['start']:result_dict['end']]
            mapping[result_dict['entity_type']].append({entity_text: result_dict['score']})

    return text, mapping
detect_pii(text,entities,0.9)

('My phone number is 212-555-5555 or 4708152571 and the important info is 4147 2026 0114 6333',
 {'CREDIT_CARD': [{'4147 2026 0114 6333': 1.0}]})

In [90]:
results[0].to_dict()

{'entity_type': 'CREDIT_CARD',
 'start': 58,
 'end': 77,
 'score': 1.0,
 'analysis_explanation': None,
 'recognition_metadata': {'recognizer_name': 'CreditCardRecognizer',
  'recognizer_identifier': 'CreditCardRecognizer_12207259504'}}

In [19]:
anonymizer = AnonymizerEngine()

anonymized_text = anonymizer.anonymize(text=text,analyzer_results=results)

In [27]:
from presidio_anonymizer.entities import RecognizerResult, OperatorConfig
engine = AnonymizerEngine()
result = engine.anonymize(
    text="My name is Bond, James Bond",
    analyzer_results=[
        RecognizerResult(entity_type="PERSON", start=11, end=15, score=0.8),
        RecognizerResult(entity_type="PERSON", start=17, end=27, score=0.8),
    ],
    operators={"PERSON": OperatorConfig("replace", {"new_value": "BIP"})},
)

print(result)

text: My name is BIP, BIP
items:
[
    {'start': 16, 'end': 19, 'entity_type': 'PERSON', 'text': 'BIP', 'operator': 'replace'},
    {'start': 11, 'end': 14, 'entity_type': 'PERSON', 'text': 'BIP', 'operator': 'replace'}
]



In [33]:
import pprint
import json

analyzer = AnalyzerEngine()
analyzer_results = analyzer.analyze(text=text_to_anonymize, entities=["PHONE_NUMBER"], language='en')

print(analyzer_results)

anonymizer = AnonymizerEngine()
text_to_anonymize="My name is Bond, James Bond"
anonymized_results = anonymizer.anonymize(
    text=text_to_anonymize,
    analyzer_results=analyzer_results,    
    operators={"DEFAULT": OperatorConfig("replace", {"new_value": "<ANONYMIZED>"}), 
                        "PHONE_NUMBER": OperatorConfig("mask", {"type": "mask", "masking_char" : "*", "chars_to_mask" : 12, "from_end" : True}),
                        "TITLE": OperatorConfig("redact", {})}
)

print(f"text: {anonymized_results.text}")
print("detailed response:")

pprint(anonymized_results.to_json())

[]
text: My name is Bond, James Bond
detailed response:


TypeError: 'module' object is not callable

In [49]:
from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer

anonymizer = PresidioReversibleAnonymizer(
    analyzed_fields=["PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "CREDIT_CARD"],
    # Faker seed is used here to make sure the same fake data is generated for the test purposes
    # In production, it is recommended to remove the faker_seed parameter (it will default to None)
    faker_seed=42,
)

In [54]:
anonymizer.deanonymizer_mapping

{'PERSON': {'Maria Lynch': 'Slim Shady'},
 'PHONE_NUMBER': {'7344131647': '313-666-7440'},
 'EMAIL_ADDRESS': {'jamesmichael@example.com': 'real.slim.shady@gmail.com'},
 'CREDIT_CARD': {'4838637940262': '4916 0387 9536 0861'}}

In [53]:
anonymizer.anonymize(
    "My name is Slim Shady, call me at 313-666-7440 or email me at real.slim.shady@gmail.com. "
    "By the way, my card number is: 4916 0387 9536 0860"
)

'My name is Maria Lynch, call me at 7344131647 or email me at jamesmichael@example.com. By the way, my card number is: 4916 0387 9536 0860'

In [73]:
from typing import List, Tuple, Dict
def anonymize_pii(text: str, entities: List[str]) -> Tuple[str, Dict[str, str]]:
    anonymizer = PresidioReversibleAnonymizer(analyzed_fields = entities, faker_seed=123)
    results = analyzer.analyze(text=text, entities=entities, language='en')
    anonymized_results = anonymizer.anonymize(text)
    mapping = anonymizer.deanonymizer_mapping
    return anonymized_results, mapping

In [76]:
text = "My name is Slim Shady and I work with Vince, call me at 313-666-7440 or email me at real.slim.shady@gmail.com."
entities = ["PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "CREDIT_CARD"]
anonymize_pii(text,entities)

('My name is Gwendolyn Solis and I work with Alisha Pruitt, call me at (906)319-6105 or email me at robersonnancy@example.com.',
 {'PERSON': {'Gwendolyn Solis': 'Slim Shady', 'Alisha Pruitt': 'Vince'},
  'PHONE_NUMBER': {'(906)319-6105': '313-666-7440'},
  'EMAIL_ADDRESS': {'robersonnancy@example.com': 'real.slim.shady@gmail.com'}})