### This is a minimal example of sanitizing sentences.

In [1]:
import os

from utils import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load NER model.
# We use a Universal NER model here: https://universal-ner.github.io/
# The checkpoint location and device can be overridden through environment
# variables so the notebook is runnable outside the original machine; the
# fallbacks reproduce the values that were previously hard-coded.
NER_MODEL_PATH = os.environ.get("NER_MODEL_PATH", "/nobackup3/divyam/models/uniner-7b-pii-v3")
NER_DEVICE = os.environ.get("NER_DEVICE", "cuda:1")
ner_model = NER(NER_MODEL_PATH, device=NER_DEVICE)
# Alternative backbone:
# ner_model = NER("/path/to/Meta-Llama-3-8B-Instruct/", device="cuda:1")

# Load Sanitizer object.
# SECURITY: the fallback key/tweak below are the published demo values from
# this notebook. Set SANITIZER_KEY / SANITIZER_TWEAK to private values before
# sanitizing any real data, and never commit those values.
SANITIZER_KEY = os.environ.get("SANITIZER_KEY", "EF4359D8D580AA4F7F036D6F04FC6A94")
SANITIZER_TWEAK = os.environ.get("SANITIZER_TWEAK", "D8E7920AFA330A73")
sanitizer = Sanitizer(ner_model, key=SANITIZER_KEY, tweak=SANITIZER_TWEAK)

Loading checkpoint shards: 100%|██████████| 6/6 [00:03<00:00,  1.87it/s]


### Example for sanitizing names of people:

In [3]:
# Two short inputs containing person names.
sentences = [
    "Ben Parker and John Doe went to the bank.",
    "Who was late today? Adam.",
]

# Ask the NER model for 'Name' entities only and show what it found.
extracted = ner_model.extract(sentences, entity_type="Name")
print("NER extraction:", extracted, sep="\n")

100%|██████████| 2/2 [00:00<00:00,  2.64it/s]

NER extraction:
{'Name': [['Ben Parker', 'John Doe'], ['Adam']]}





In [4]:
# Sanitize the names; encrypt() returns a pair and only the first element
# (the rewritten sentences) is needed here.
sanitized_sentences, _ = sanitizer.encrypt(
    sentences,
    entity="Name",
    epsilon=1,
)
print("Sanitized sentences:", sanitized_sentences, sep="\n")

100%|██████████| 2/2 [00:00<00:00,  5.50it/s]


['Ben Parker and John Doe went to the bank.', 'Who was late today? Adam.']
Sanitized sentences:
['Mariusz Alonzo and Lio Flo went to the bank.', 'Who was late today? Elyes Reid.']


In [5]:
# Reverse the name sanitization and print the result for comparison with the
# original input sentences.
desanitized_sentences = sanitizer.decrypt(
    sanitized_sentences,
    entity="Name",
)
print("Desanitized sentences:", desanitized_sentences, sep="\n")

100%|██████████| 2/2 [00:00<00:00,  4.08it/s]

Desanitized sentences:
['Ben Parker and John Doe went to the bank.', 'Who was late today? Adam.']





### Example for sanitizing currency values with format-preserving encryption (FPE):

In [3]:
# Inputs with comma-formatted dollar amounts.
sentences = [
    "Ben withdrew $10,000 from the bank. Adam got $5,550 in a loan.",
    "I won $25 in the lottery.",
]

# Ask the NER model for 'Money' entities only and show what it found.
extracted = ner_model.extract(sentences, entity_type="Money")
print("NER extraction:", extracted, sep="\n")

100%|██████████| 2/2 [00:01<00:00,  1.63it/s]

NER extraction:
{'Money': [['10,000', '5,550'], ['25']]}





In [4]:
# Sanitize the money values. No use_fpe/use_mdp flags are passed, so this
# relies on encrypt()'s defaults (presumably FPE, per the section heading --
# confirm against the Sanitizer implementation).
sanitized_sentences, _ = sanitizer.encrypt(
    sentences,
    entity="Money",
    epsilon=1,
)
print("Sanitized sentences:", sanitized_sentences, sep="\n")

100%|██████████| 2/2 [00:00<00:00,  2.93it/s]

['Ben withdrew $10,000 from the bank. Adam got $5,550 in a loan.', 'I won $25 in the lottery.']
Sanitized sentences:
['Ben withdrew $54829534,343 from the bank. Adam got $0400114,733 in a loan.', 'I won $70096546 in the lottery.']





In [5]:
# Reverse the money sanitization (default mechanism flags, matching the
# encrypt call in this section) and print the recovered sentences.
desanitized_sentences = sanitizer.decrypt(
    sanitized_sentences,
    entity="Money",
)
print("Desanitized sentences:", desanitized_sentences, sep="\n")

100%|██████████| 2/2 [00:01<00:00,  1.73it/s]

Desanitized sentences:
['Ben withdrew $10,000 from the bank. Adam got $5,550 in a loan.', 'I won $25 in the lottery.']





### Example for sanitizing currency values with metric local differential privacy (m-LDP):

In [6]:
# Inputs with plain (no thousands separator) dollar amounts.
sentences = [
    "Ben withdrew $1000 from the bank. Adam got $555 in a loan.",
    "I won $25 in the lottery.",
]

# Ask the NER model for 'Money' entities only and show what it found.
extracted = ner_model.extract(sentences, entity_type="Money")
print("NER extraction:", extracted, sep="\n")

100%|██████████| 2/2 [00:00<00:00,  5.11it/s]

NER extraction:
{'Money': [['1000', '555'], ['25']]}





In [7]:
# Sanitize the money values with FPE switched off and the m-LDP mechanism
# switched on; only the rewritten sentences from the returned pair are kept.
sanitized_sentences, _ = sanitizer.encrypt(
    sentences,
    entity="Money",
    epsilon=1,
    use_fpe=False,
    use_mdp=True,
)
print("Sanitized sentences:", sanitized_sentences, sep="\n")

100%|██████████| 2/2 [00:00<00:00,  4.06it/s]

['Ben withdrew $1000 from the bank. Adam got $555 in a loan.', 'I won $25 in the lottery.']
Sanitized sentences:
['Ben withdrew $1090 from the bank. Adam got $496 in a loan.', 'I won $41 in the lottery.']





In [8]:
# Display the sanitizer's entity_mapping attribute. In the recorded output it
# holds one dict per input sentence, keyed by the sanitized value with the
# original value as the entry (it appears to be populated by the m-LDP
# encrypt call above -- confirm against the Sanitizer implementation).
sanitizer.entity_mapping

[{'1090': '1000', '496': '555'}, {'41': '25'}]

In [9]:
# Reverse the m-LDP money sanitization, passing the same mechanism flags that
# were given to encrypt(), then print the recovered sentences.
desanitized_sentences = sanitizer.decrypt(
    sanitized_sentences,
    entity="Money",
    use_fpe=False,
    use_mdp=True,
)
print("Desanitized sentences:", desanitized_sentences, sep="\n")

100%|██████████| 2/2 [00:00<00:00,  5.20it/s]

Desanitized sentences:
['Ben withdrew $1000 from the bank. Adam got $555 in a loan.', 'I won $25 in the lottery.']





### Example for sanitizing age values with m-LDP:

In [10]:
# Inputs that each mention one age.
sentences = [
    "Ben turned 15 years old today.",
    "I am 25 years old.",
]

# Ask the NER model for 'Age' entities only and show what it found.
extracted = ner_model.extract(sentences, entity_type="Age")
print("NER extraction:", extracted, sep="\n")

100%|██████████| 2/2 [00:00<00:00,  5.95it/s]

NER extraction:
{'Age': [['15'], ['25']]}





In [12]:
# Sanitize the ages with FPE switched off and the m-LDP mechanism switched on;
# only the rewritten sentences from the returned pair are kept.
sanitized_sentences, _ = sanitizer.encrypt(
    sentences,
    entity="Age",
    epsilon=1,
    use_fpe=False,
    use_mdp=True,
)
print("Sanitized sentences:", sanitized_sentences, sep="\n")

100%|██████████| 2/2 [00:00<00:00,  6.08it/s]

['Ben turned 15 years old today.', 'I am 25 years old.']
Sanitized sentences:
['Ben turned 13 years old today.', 'I am 22 years old.']





In [13]:
# Reverse the age sanitization. The ages were sanitized with
# use_mdp=True / use_fpe=False, so pass the same mechanism flags here (as the
# Money m-LDP example does) instead of relying on decrypt()'s defaults.
desanitized_sentences = sanitizer.decrypt(sanitized_sentences, entity='Age',
                                          use_fpe=False, use_mdp=True)
print("Desanitized sentences:")
print(desanitized_sentences)

100%|██████████| 2/2 [00:00<00:00,  6.41it/s]

Desanitized sentences:
['Ben turned 15 years old today.', 'I am 25 years old.']



