In [3]:
import os
# Restrict this process to the GPU with index 1. The variable is set here,
# before torch is imported further down in the notebook.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [4]:
!pip install --quiet transformers
!pip install --quiet datasets
!pip install --quiet annoy

## Preprocessing

In [5]:
import math
import torch
import pickle
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
# Notebook-wide seaborn theme: 'paper' context, 'ticks' style, 'tab10' palette,
# plus rc overrides that switch on a dashed grid and set the axes line width
# and face colour.
sns.set_theme(context='paper',
              style='ticks',
              palette='tab10',
              rc={"axes.grid": True,
                  "grid.linestyle": 'dashed',
                  "axes.linewidth": 1.0,
                  "axes.facecolor": '1.0',
                  }
              )
import collections

In [6]:
import datasets
# datasets.disable_caching() 

from datasets import load_dataset
# Load the IMDB dataset splits. Note that the dataset's 'test' split is what
# the rest of this notebook refers to as `valid_data`.
train_data = load_dataset('imdb', split='train')
valid_data = load_dataset('imdb', split='test')

import re
import string

from bs4 import BeautifulSoup

def strip_html(text):
    """Return ``text`` with its HTML markup removed.

    The string is parsed with BeautifulSoup's ``"html.parser"`` backend and
    only the text content of the parsed document is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

def clean(example):
    """Normalise ``example['text']`` in place and return ``example``.

    Steps, in order: strip HTML markup, replace every run of characters that
    are not ASCII letters or digits with a single space, then lower-case.
    """
    without_markup = strip_html(example['text'])
    alnum_only = re.sub(r'[^A-Za-z0-9]+', ' ', without_markup)
    example['text'] = alnum_only.lower()
    return example

train_data[0]

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset imdb (/home/todsavadt/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
Found cached dataset imdb (/home/todsavadt/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [7]:
from transformers import BertTokenizerFast
# Module-level tokenizer for the 'bert-base-uncased' checkpoint; the `tokenize`
# function defined in this cell reads it as a global.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

def tokenize(example):
    """Store the tokenization of ``example['text']`` under ``example['tokens']``.

    Uses the module-level ``tokenizer``. The example is updated in place and
    returned, which is the shape a ``Dataset.map`` callback needs.
    """
    pieces = tokenizer.tokenize(example['text'])
    example['tokens'] = pieces
    return example

# Add a 'tokens' column to both splits. `tokenize` already has the signature
# that `map` expects, so it is passed directly instead of through a lambda.
train_data = train_data.map(tokenize)
valid_data = valid_data.map(tokenize)

# Persist the tokenized splits to directories relative to the working directory.
train_data.save_to_disk("train_imdb")
valid_data.save_to_disk("valid_imdb")

Loading cached processed dataset at /home/todsavadt/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-feb8a244e6cf47ef.arrow
Map:   0%|          | 0/25000 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (530 > 512). Running this sequence through the model will result in indexing errors
                                                                                                 

In [8]:
import torch

import collections

def load_pickle(fname):
    """Open ``fname`` in binary mode and return the object unpickled from it."""
    with open(fname, "rb") as handle:
        payload = pickle.load(handle)
    return payload

def dump_pickle(file, fname):
    """Pickle the object ``file`` to the path ``fname``.

    ``fname`` is opened with mode ``"wb"``, so an existing file is overwritten.
    Returns ``None``.
    """
    with open(fname, "wb") as sink:
        pickle.dump(file, sink)

# Use the GPU when torch reports one as available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

from annoy import AnnoyIndex

class metricDP():
    """Token-level privatization by adding calibrated noise in embedding space.

    Code in part from Amazon SageMaker. ``vocabular`` is a token-to-index
    mapping and ``embedding`` is the word-embedding matrix, which includes
    special tokens such as [UNK], [PAD], [CLS], [SEP], [MASK] or [unused...].
    The code expects special tokens at the front of the vocabulary and regular
    tokens continuing from ``start_from``. Parameters default to BERT
    (base, uncased).

    Typical use: construct, call :meth:`build_ann` once, then call
    :meth:`privatize` per token sequence.
    """

    # Longest sequence fed to the encoder in contextual mode, counting the
    # [CLS] and [SEP] tokens that are added around the input.
    MAX_SEQ_LEN = 512

    def __init__(self, start_from=999):
        """Load tokenizer, encoder and embedding matrix for 'bert-base-uncased'.

        Args:
            start_from: First vocabulary index that is a candidate replacement;
                indices below it are left out of the nearest-neighbour index.
        """
        # Local import: these two classes are not imported at notebook level.
        from transformers import BertTokenizer, BertModel
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.encoder = BertModel.from_pretrained("bert-base-uncased").to(device)

        self.vocabular = self.tokenizer.vocab
        self.embedding = self.encoder.embeddings.word_embeddings.weight.cpu().detach().numpy()

        self.vocab_size, self.embed_dim = self.embedding.shape

        self.start_from = start_from

        # Set by build_ann(); privatize() refuses to run while this is None.
        self.ann = None

    def build_ann(self, metric='euclidean', n_trees=50):
        """Build the approximate nearest-neighbour index, excluding special tokens.

        Item ``i`` of the index is the embedding of vocabulary id
        ``i + start_from``.

        Args:
            metric: Distance metric name passed to ``AnnoyIndex``.
            n_trees: Number of trees passed to ``AnnoyIndex.build``.
        """
        ann = AnnoyIndex(self.embed_dim, metric)

        for index, vector in enumerate(self.embedding[self.start_from:]):
            ann.add_item(index, vector)

        ann.build(n_trees)
        # Publish only a fully built index.
        self.ann = ann

    def privatize(self, tokens, epsilon=10, modus='lexical'):
        """Replace every token by the nearest neighbour of its noised vector.

        Args:
            tokens: List of vocabulary tokens (e.g. the output of
                ``tokenizer.tokenize``).
            epsilon: Positive privacy parameter; the noise magnitude is drawn
                from a Gamma distribution with scale ``1 / epsilon``, so larger
                values mean less noise.
            modus: ``'lexical'`` looks each token up in the static embedding
                matrix. ``'contextual'`` runs the encoder and uses its last
                hidden state; inputs are truncated to ``MAX_SEQ_LEN - 2``
                tokens in that mode.

        Returns:
            Tuple ``(text, tokens)``: the decoded replacement string and the
            list of replacement tokens (one per processed input token).

        Raises:
            RuntimeError: If :meth:`build_ann` has not been called.
            ValueError: If ``epsilon`` is not positive or ``modus`` is unknown.
        """
        # Checked up front so a missing index fails before any encoder work.
        # getattr keeps this safe for instances created without __init__.
        if getattr(self, 'ann', None) is None:
            raise RuntimeError('Build or Init ANNs.')
        if epsilon <= 0:
            raise ValueError(f"epsilon must be positive, got {epsilon!r}")

        if modus == 'lexical':
            token_vec = self._lexical_vectors(tokens)
        elif modus == 'contextual':
            token_vec = self._contextual_vectors(tokens)
        else:
            raise ValueError(
                f"Unknown modus {modus!r}; expected 'lexical' or 'contextual'."
            )

        # ANN item ids are offset by start_from relative to vocabulary ids.
        new_ids = [
            self._noisy_neighbour(vector, epsilon) + self.start_from
            for vector in token_vec
        ]

        return self.tokenizer.decode(new_ids), self.tokenizer.convert_ids_to_tokens(new_ids)

    def _lexical_vectors(self, tokens):
        """Return the static embedding row of every token, one row per token."""
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        return self.embedding[list(token_ids)]

    def _contextual_vectors(self, tokens):
        """Return the encoder's last hidden state for every token.

        The tokens are mapped to ids directly instead of being passed through
        the tokenizer again: re-tokenizing already split pieces (for example
        ones starting with '##') would change the sequence and break the
        one-output-per-input-token alignment.
        """
        # NOTE(review): these vectors are later matched against an index built
        # from the input word embeddings, not from encoder outputs. Confirm
        # that comparing across the two is intended.
        body_ids = list(self.tokenizer.convert_tokens_to_ids(tokens))[: self.MAX_SEQ_LEN - 2]
        input_ids = torch.tensor(
            [[self.tokenizer.cls_token_id, *body_ids, self.tokenizer.sep_token_id]],
            device=device,
        )

        with torch.no_grad():
            hidden = self.encoder(
                input_ids=input_ids,
                attention_mask=torch.ones_like(input_ids),
            )['last_hidden_state']

        # Drop the [CLS] and [SEP] positions.
        return hidden[0, 1:-1].cpu().numpy()

    def _noisy_neighbour(self, vector, epsilon):
        """Add noise to ``vector`` and return the ANN item id closest to it.

        The noise has a uniformly random direction (a normalised Gaussian
        sample) and a Gamma(embed_dim, 1/epsilon) distributed length.
        """
        direction = np.random.normal(size=self.embed_dim)
        direction = direction / np.linalg.norm(direction)
        magnitude = np.random.gamma(shape=self.embed_dim, scale=1 / epsilon)
        return self.ann.get_nns_by_vector(vector + direction * magnitude, 1)[0]

# Create the privatizer and build its nearest-neighbour index once. The
# module-level `privatize` helper defined later reads this `mdp` global.
mdp = metricDP(start_from=999)
mdp.build_ann(metric='euclidean', n_trees=50)

In [9]:
import numpy as np

# Settings for the full-dataset privatization run, which is currently
# commented out at the end of this cell.
MODUS = 'contextual'
EPSILON = 25

def privatize(example, epsilon=10, modus='lexical'):
    """Privatize one dataset example with the module-level ``mdp`` instance.

    ``example['tokens']`` is passed to ``mdp.privatize``; the returned string
    and token list overwrite ``example['text']`` and ``example['tokens']``.
    The example is modified in place and returned.
    """
    example['text'], example['tokens'] = mdp.privatize(
        example['tokens'], epsilon=epsilon, modus=modus
    )
    return example

# train_data_priv = train_data.map(lambda i: privatize(i, epsilon=EPSILON, modus=MODUS))
# train_data_priv.save_to_disk(f"train_imdb_{MODUS}_{EPSILON}")

In [79]:
# Spot check in contextual mode. Despite its name, `train_data_priv` holds a
# single privatized example here, not a dataset.
train_data_priv = privatize(train_data[0], epsilon = 1000, modus= 'contextual')
train_data_priv['text']

'##itative charter teach 宗 intriguing - yellow from his video market because told all ᆼ. the talked the when film is first film in 1999 touch we also information the at the the was took the us speech us speech, if the ever started the into this country, the being a patron of film deemed "., the the had of see thiser out hat entire sentinel speed cast understanding sentinel speed cast the film is based around a young european film university named pop who has the education the she could about life touch taken s hers the the the her－ groups jet heads to making some a the film on 不ians anglo〈 chile mountain ᵢes ― about the theological issues g the understanding 1967 built and race topics yourotted american america touchrricular between.英 and ィ, panama groups lin〈 chapters com〈 ~ european about their ₑ studied ன, she is gender′ her film teacher, another do and, men hat understanding speech moonlight alone understanding speech moonlight cast what s me about out talked talked ; green is that

In [73]:
# Spot check in lexical mode on the same single example; this overwrites the
# `train_data_priv` value from the contextual spot check.
train_data_priv = privatize(train_data[0], epsilon = 10000, modus= 'lexical')
train_data_priv['text']

'i rented i am curious - yellow from my video store because of all the controversy that surrounded it when it was first released in 1967. i also heard that at first it was seized by u. s. customs if it ever tried to enter this country, therefore being a fan of films considered " controversial " i really had to see this for myself. < br / > < br / > the plot is centered around a young swedish drama student named lena who wants to learn everything she can about life. in particular she wants to focus her attentions to making some sort of documentary on what the average swede thought about certain political issues such as the vietnam war and race issues in the united states. in between asking politicians and ordinary denizens of stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men. < br / > < br / > what kills me about i am curious - yellow is that 40 years ago, this was considered pornographic. really, the sex and nudity scenes are fe

In [74]:
train_data['text'][0]

'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, ev

In [52]:
# train_data_priv['tokens']