## 1. Extract Vocabulary

In [1]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
vocabulary = tokenizer.vocab

  from .autonotebook import tqdm as notebook_tqdm


## 2.Extract Embedding

In [2]:
from transformers import BertModel
model = BertModel.from_pretrained("bert-base-uncased")
embedding = model.embeddings.word_embeddings.weight.cpu().detach().numpy()

## 3.Initialize Class

In [3]:
# !pip install annoy

In [4]:
from mdp import *
mdp = metricDP(vocabulary, embedding, start_from=999)
mdp.build_ann(metric='euclidean', n_trees=50)

## 4.Numeralize Input

To exclude special tokens from the candidate pool, specifiy the position of regular tokens via start_from. In BERT, the first regular token is '!' at index 999. During the privatization step, each token is remaped from its nearest neighbor item to the embedding index.

In [32]:
txt = 'The cat sat on the mat.'
ids = tokenizer.encode(txt, truncation=True, padding='max_length', max_length=10)
# [101, 1996, 4937, 2938, 2006, 1996, 13523, 1012, 102, 0]
txt

'The cat sat on the mat.'

In [33]:
tokenizer.decode([0,100,101,102,103])

'[PAD] [UNK] [CLS] [SEP] [MASK]'

## 5. Privatize Input

In [34]:
pv_ids = mdp.privatize(ids, epsilon=400, special_tokens=[0,100,101,102,103])
pv_ids
#[101, 2601, 2267, 25195, 20139, 6584, 16304, 22754, 102, 0]

[101, 1996, 4937, 2938, 2006, 1996, 13523, 1012, 102, 0]

In [35]:
pv_txt = tokenizer.decode(pv_ids)
pv_txt

'[CLS] the cat sat on the mat. [SEP] [PAD]'

Perturbations ignore all tokens specified in special_tokens, and epsilon regulates the privacy guarantees. A smaller epsilon leads to more perturbations and higher privacy guarantees. A higher epsilon leads to less perturbations and lower privacy guarantees.

In [31]:
token = 101
epsilon = 400
random_vec = np.random.normal(size=mdp.embed_dim)
normalized_vec = random_vec / np.linalg.norm(random_vec)
magnitude = np.random.gamma(shape=mdp.embed_dim, scale=1/epsilon) #high epsilon -> low magnitude
noise = normalized_vec * magnitude
original_vec = mdp.embedding[token]
noisy_vector = original_vec + noise

n_trees = 50
start_from = 999
ann = AnnoyIndex(mdp.embed_dim, 'euclidean')
for index, vector in enumerate(mdp.embedding[start_from:,:]):
    ann.add_item(index, vector)
    
ann.build(n_trees)
new_token = ann.get_nns_by_vector(noisy_vector, 1)[0]
index = new_token + start_from
tokenizer.decode(index)

's e e m e d'