In [1]:
import torch
import random
import numpy as np
import logging
import os
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
from transformers import BertTokenizer, BertForMaskedLM
from tqdm.auto import tqdm, trange
from scipy.special import softmax
from functools import partial
from multiprocessing import Pool, cpu_count
import pandas as pd

from collections import Counter,defaultdict
import matplotlib.pyplot as plt
import json
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

class Agrument:
    """Plain attribute container holding every tunable setting of this notebook.

    The first group of attributes configures the SanText run, the second
    group configures the CusText mapping and privatization steps.
    """

    def __init__(self):
        # --- task, model and SanText settings ---
        self.task = 'SST-2'
        self.bert_model_path = "bert-base-uncased"
        self.data_dir = "./data/SST-2/"
        self.sensitive_word_percentage = 0.6
        self.epsilon = 3
        self.output_dir = "./output_SanText_bert/SST-2/"
        self.threads = 12
        self.p = 0.2
        self.seed = 42
        self.method = 'SanText'

        # --- CusText settings ---
        self.eps = 1
        self.top_k = 20
        self.mapping_strategy = 'conservative'
        self.embedding_type = 'glove.42B.300d'
        self.privatization_strategy = 's1'
        self.save_stop_words = True


# Single shared configuration object read by the cells below.
args = Agrument()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_data():
    """Read the train, dev and test splits stored as TSV files in ``args.data_dir``.

    Returns:
        tuple: ``(train_df, dev_df, test_df)`` as pandas DataFrames, in that order.
    """
    print(f'__loading__{args.data_dir}__')
    frames = [
        pd.read_csv(f"{args.data_dir}/{split}.tsv", delimiter='\t')
        for split in ("train", "dev", "test")
    ]
    return tuple(frames)

In [3]:
# Seed every random number generator used in this notebook. The privatization
# step samples with np.random (choice / randint), so seeding torch alone does
# not make the generated datasets reproducible.
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)
train_data,dev_data,test_data = load_data()

__loading__./data/SST-2/__


In [4]:
# df_train = pd.read_csv(f"{args.data_dir}/train.tsv",delimiter='\t')
# df_dev = pd.read_csv(f"{args.data_dir}/dev.tsv",delimiter='\t')
# train_corpus = " ".join(df_train.sentence)
# dev_corpus = " ".join(df_dev.sentence)

In [5]:
# type(df_train.sentence )

In [6]:
def _corpus_words_by_frequency(data_dir):
    """Return the whitespace tokens of the train+dev sentences, most frequent first."""
    df_train = pd.read_csv(f"{data_dir}/train.tsv",delimiter='\t')
    df_dev = pd.read_csv(f"{data_dir}/dev.tsv",delimiter='\t')
    corpus = " ".join(df_train.sentence) + " " + " ".join(df_dev.sentence)
    return [token for token, _ in Counter(corpus.split()).most_common()]


def _load_embedding_table(embedding_type):
    """Load ``./embeddings/<embedding_type>.txt`` as ``(embeddings, idx2word, word2idx)``.

    The "glove_840B-300d" file is parsed as one JSON object ``{word: vector}`` and
    its vectors are used as stored. Every other file is parsed as one
    ``word v1 v2 ...`` record per line and each row is scaled to unit L2 norm.
    """
    embedding_path = f'./embeddings/{embedding_type}.txt'

    if embedding_type == "glove_840B-300d":
        with open(embedding_path, 'r') as file:
            word_embeddings_glove = json.load(file)
        idx2word = list(word_embeddings_glove)
        word2idx = {token: row for row, token in enumerate(idx2word)}
        embeddings = np.asarray(list(word_embeddings_glove.values()), dtype="float64")
        return embeddings, np.asarray(idx2word), word2idx

    embeddings, idx2word, word2idx = [], [], {}
    with open(embedding_path, 'r') as file:
        for line in file:
            fields = line.strip().split()
            if not fields:
                # A blank line would otherwise raise IndexError on fields[0].
                continue
            word2idx[fields[0]] = len(idx2word)
            idx2word.append(fields[0])
            embeddings.append([float(num) for num in fields[1:]])
    embeddings = np.array(embeddings)
    row_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    embeddings = np.asarray(embeddings / row_norms, "float64")
    print(embeddings.T.shape)
    return embeddings, np.asarray(idx2word), word2idx


def _closeness(vector, candidates, use_euclidean):
    """Score ``vector`` against every row of ``candidates``.

    Returns Euclidean distances (smaller = closer) when ``use_euclidean`` is
    true, otherwise dot products (larger = closer).
    """
    if use_euclidean:
        return euclidean_distances(vector.reshape(1,-1), candidates)[0]
    return np.dot(vector, candidates.T)


def _sampling_probabilities(scores, eps, lower_is_closer):
    """Turn closeness scores into sampling probabilities.

    Scores are min-max scaled to [0, 1] (negated when a lower score means
    closer), weighted with ``exp(eps * scaled / 2)`` and normalised to sum to 1.
    When all scores are equal (e.g. a single candidate) the scaling would divide
    by zero, so a uniform distribution is returned instead.
    """
    scores = np.asarray(scores, dtype="float64")
    if scores.size == 0:
        return []
    lowest = scores.min()
    spread = scores.max() - lowest
    if spread == 0:
        return [1.0 / scores.size] * scores.size
    scaled = (scores - lowest) / spread
    if lower_is_closer:
        scaled = -scaled
    weights = np.exp(eps * scaled / 2)
    return (weights / weights.sum()).tolist()


def _write_json_cache(path, payload):
    """Best-effort dump of ``payload`` to ``path`` as JSON, creating parent folders."""
    try:
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, 'w') as json_file:
            json_file.write(json.dumps(payload, ensure_ascii=False, indent=4))
    except OSError as err:
        # Caching is optional: report the failure instead of hiding it, but keep going.
        logging.warning("Could not write cache file %s: %s", path, err)


def get_customized_mapping(eps,top_k):
    """Build, for the corpus vocabulary, substitute word lists and sampling probabilities.

    Corpus words (train + dev sentences under ``args.data_dir``) are visited from
    most to least frequent. For each word found in the embedding vocabulary its
    ``top_k`` closest embedding words are collected. Closeness is Euclidean
    distance for the "glove_840B-300d" embeddings and the dot product of
    unit-normalised vectors otherwise.

    Depending on ``args.mapping_strategy``:
      * "aggressive": the visited word itself is mapped to that neighbour list.
      * anything else: every neighbour not yet assigned is mapped to the shared
        neighbour list, and already-assigned words are skipped when visited.
      * "conservative" (in addition to the previous point): the neighbours'
        embedding rows are overwritten with a sentinel so that they are pushed
        to the end of later neighbour rankings.

    Both dictionaries are also written, best effort, to
    ``./p_dict/...`` and ``./sim_word_dict/...`` JSON files named after
    ``eps`` and ``top_k``.

    Args:
        eps: privacy parameter used in the ``exp(eps * score / 2)`` weighting.
        top_k: number of neighbours collected per word.

    Returns:
        tuple: ``(sim_word_dict, p_dict)``; ``sim_word_dict[w]`` is the list of
        candidate substitutes for ``w`` and ``p_dict[w]`` the matching
        probabilities (same order, summing to 1).
    """
    word_freq = _corpus_words_by_frequency(args.data_dir)
    embeddings, idx2word, word2idx = _load_embedding_table(args.embedding_type)

    use_euclidean = args.embedding_type == "glove_840B-300d"
    # Sentinel written over used-up rows in the conservative strategy: a huge
    # vector is far away in Euclidean space, a zero vector has zero dot product.
    masked_value = 1e9 if use_euclidean else 0.0

    word_hash = {}  # substitute word -> corpus word whose neighbourhood claimed it
    sim_word_dict = defaultdict(list)
    p_dict = defaultdict(list)

    for word in tqdm(word_freq):
        if word not in word2idx or word in word_hash:
            continue

        ranking = _closeness(embeddings[word2idx[word]], embeddings, use_euclidean).argsort()
        index_list = ranking[:top_k] if use_euclidean else ranking[::-1][:top_k]
        word_list = idx2word[index_list].tolist()
        embedding_list = embeddings[index_list]  # fancy indexing -> independent copy

        if args.mapping_strategy == "aggressive":
            # Score the neighbourhood against the visited word itself.
            scores = _closeness(embeddings[word2idx[word]], embedding_list, use_euclidean)
            p_dict[word] = _sampling_probabilities(scores, eps, use_euclidean)
            sim_word_dict[word] = word_list
            continue

        for candidate in word_list:
            if candidate in word_hash:
                continue
            word_hash[candidate] = word
            scores = _closeness(embeddings[word2idx[candidate]], embedding_list, use_euclidean)
            p_dict[candidate] = _sampling_probabilities(scores, eps, use_euclidean)
            sim_word_dict[candidate] = word_list

        if args.mapping_strategy == "conservative":
            # Works for any embedding width (no hard-coded dimension).
            embeddings[index_list, :] = masked_value

    cache_name = f"{args.embedding_type}/{args.mapping_strategy}/eps_{eps}_top_{top_k}.txt"
    _write_json_cache(f"./p_dict/{cache_name}", p_dict)
    _write_json_cache(f"./sim_word_dict/{cache_name}", sim_word_dict)

    return sim_word_dict,p_dict

In [7]:
# Reuse the cached mapping when both JSON files exist; otherwise build it.
# Both paths must include the embedding type: that is where the files are
# written, and a mismatching path means the cache is never found.
p_dict_path = f"./p_dict/{args.embedding_type}/{args.mapping_strategy}/eps_{args.eps}_top_{args.top_k}.txt"
sim_word_dict_path = f"./sim_word_dict/{args.embedding_type}/{args.mapping_strategy}/eps_{args.eps}_top_{args.top_k}.txt"

if os.path.exists(p_dict_path) and os.path.exists(sim_word_dict_path):
    with open(p_dict_path, 'r') as dic:
        p_dict = json.load(dic)

    with open(sim_word_dict_path, 'r') as dic:
        sim_word_dict = json.load(dic)
else:
    sim_word_dict, p_dict = get_customized_mapping(eps = args.eps, top_k = args.top_k)

(300, 400000)


100%|██████████| 15756/15756 [03:18<00:00, 79.21it/s] 


In [8]:
# df_train = pd.read_csv(f"{args.data_dir}/train.tsv",delimiter='\t')
# df_dev = pd.read_csv(f"{args.data_dir}/dev.tsv",delimiter='\t')
# train_corpus = " ".join(df_train.sentence)
# dev_corpus = " ".join(df_dev.sentence)
# corpus = train_corpus + " " + dev_corpus
# word_freq = [x[0] for x in Counter(corpus.split()).most_common()] #word_frq from dataset
# # word_freq

In [9]:
# embedding_path = f"./embeddings/{args.embedding_type}.txt"
# embeddings = []
# idx2word = []
# word2idx = {}
# with open(embedding_path,'r') as file:
#     for i,line in enumerate(file):
#         embedding = [float(num) for num in line.strip().split()[1:]]
#         embeddings.append(embedding)
#         idx2word.append(line.strip().split()[0])
#         word2idx[line.strip().split()[0]] = i
# embeddings = np.array(embeddings)
# idx2word = np.asarray(idx2word)
# norm = np.linalg.norm(embeddings, axis=1, keepdims=True)
# embeddings = np.asarray(embeddings / norm, "float64")
# print(embeddings.T.shape)

In [10]:
# word_hash = defaultdict(str)
# sim_word_dict = defaultdict(list)
# p_dict = defaultdict(list)

# word = word_freq[0]
# print(word)
# index_list = np.dot(embeddings[word2idx[word]], embeddings.T).argsort()[::-1][:args.top_k] #(50,) (50,400000)
# print(index_list)
# word_list = [idx2word[x] for x in index_list]
# print(word_list)
# embedding_list = np.array([embeddings[x] for x in index_list])

In [11]:
# for x in word_list:
#     if x not in word_hash:
#         word_hash[x] = word
#         sim_dist_list = np.dot(embeddings[word2idx[x]], embedding_list.T)
#         min_max_dist = max(sim_dist_list) - min(sim_dist_list)
#         min_dist = min(sim_dist_list)
#         new_sim_dist_list = [(x-min_dist)/min_max_dist for x in sim_dist_list]
#         tmp = [np.exp(args.eps*x/2) for x in new_sim_dist_list]
#         norm = sum(tmp)
#         p = [x/norm for x in tmp]
#         p_dict[word] = p
#         sim_word_dict[word] =  word_list

# if args.mapping_strategy == "conservative":
#     inf_embedding = [0] * 50
#     for i in index_list:
#         embeddings[i,:] = inf_embedding #that embedding which be not used

In [12]:
def generate_new_sents_s1(df,sim_word_dict,p_dict,save_stop_words,type="train"):
    """Privatize the ``sentence`` column of ``df`` word by word and save the result.

    Each sentence is split on whitespace. A word is kept when it is an English
    stop word (only if ``save_stop_words`` is true) or when it has no entry in
    ``sim_word_dict``; kept words that look numeric are replaced by their rounded
    value plus a random integer in [0, 1000). Every other word is replaced by a
    substitute sampled from ``sim_word_dict[word]`` with probabilities
    ``p_dict[word]``.

    The privatized frame is written to
    ``privatized_dataset/<embedding>/<strategy>/eps_..._save_stop_words_.../``
    as ``train.tsv`` when ``type == "train"`` and as ``test.tsv`` for any other
    value of ``type``.

    Args:
        df: DataFrame with a ``sentence`` column of strings.
        sim_word_dict: mapping word -> list of candidate substitutes.
        p_dict: mapping word -> sampling probabilities aligned with ``sim_word_dict``.
        save_stop_words: keep stop words unchanged when true.
        type: "train" selects ``train.tsv``; anything else selects ``test.tsv``.

    Returns:
        A copy of ``df`` whose ``sentence`` column holds the privatized sentences.

    Side effects:
        Rebinds the module-level name ``new_dataset`` to the list of privatized
        sentences (kept for backward compatibility) and downloads the NLTK
        ``stopwords`` and ``punkt`` resources if they are missing.
    """
    global new_dataset

    nltk.download('stopwords')
    nltk.download('punkt')
    stop_words = set(stopwords.words('english'))

    new_dataset = []
    # Iterate over the values, not over labels 0..n-1, so a frame with a
    # non-default index (e.g. after filtering) is handled correctly.
    for sentence in tqdm(df["sentence"].tolist()):
        new_record = []
        for word in sentence.split():
            keep_word = (save_stop_words and word in stop_words) or (word not in sim_word_dict)
            if keep_word:
                if is_number(word):
                    try:
                        word = str(round(float(word))+np.random.randint(1000))
                    except (ValueError, OverflowError):
                        # float() rejects some unicode numerics; round() rejects nan/inf.
                        pass
                new_record.append(word)
            else:
                new_word = np.random.choice(sim_word_dict[word],1,p=p_dict[word])[0]
                new_record.append(new_word)
        new_dataset.append(" ".join(new_record))

    df_new = df.copy()
    df_new["sentence"] = new_dataset

    out_dir = f"privatized_dataset/{args.embedding_type}/{args.mapping_strategy}/eps_{args.eps}_top_{args.top_k}_{args.privatization_strategy}_save_stop_words_{args.save_stop_words}"
    os.makedirs(out_dir, exist_ok=True)
    file_name = "train.tsv" if type == "train" else "test.tsv"
    # sep/index are passed by keyword: positional use is deprecated in pandas.
    df_new.to_csv(f"{out_dir}/{file_name}", sep="\t", index=False)

    return df_new

In [13]:
def is_number(s):
    """Return True when ``s`` can be interpreted as a number, else False.

    ``s`` counts as a number when ``float(s)`` succeeds (this includes strings
    such as "nan" and "inf") or when it is a single character with a Unicode
    numeric value (for example "½"). Inputs of an unsupported type, such as
    ``None``, yield False instead of raising.
    """
    import unicodedata

    try:
        float(s)
        return True
    except (TypeError, ValueError):
        # TypeError: s is not a string or number at all (e.g. None).
        pass

    try:
        unicodedata.numeric(s)
        return True
    except (TypeError, ValueError):
        # TypeError: s is not a single character; ValueError: no numeric value.
        pass

    return False

In [14]:
# Strategy "s1": privatize the train and test splits with the mapping built above.
if args.privatization_strategy == "s1":
    new_train_data = generate_new_sents_s1(
        df=train_data,
        sim_word_dict=sim_word_dict,
        p_dict=p_dict,
        save_stop_words=args.save_stop_words,
        type="train",
    )
    new_test_data = generate_new_sents_s1(
        df=test_data,
        sim_word_dict=sim_word_dict,
        p_dict=p_dict,
        save_stop_words=args.save_stop_words,
        type="test",
    )

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/todsavadt/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/todsavadt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
100%|██████████| 67349/67349 [00:04<00:00, 16310.52it/s]
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/todsavadt/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/todsavadt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
100%|██████████| 1821/1821 [00:00<00:00, 8214.63it/s]


In [15]:
# Preview only the first privatized sentences; dumping the full column floods the notebook output.
new_train_data['sentence'].head(20).tolist()

['concealing building secretions from the counselors 7.5',
 'foil no exudes while only fitful wisecracks',
 'that constantly its tale and reassuring good is sour about scientists topics',
 'present predictably conclusions to leave the same europe',
 'on the trouble revenge-of-the-nerds cheesy the actor not docks up',
 'that his dee too happened to crowning such scars treatment',
 'recalls that the acting of such screenplay flicks as thaad finale can feel comes out a every like private filmmakers with an seemed riders .',
 'of sensual',
 'a clinically fifteen-year-old ’s obsessive-compulsive compositions',
 'are more concerned lousy through than in most things right-thinking oh films',
 'gives to absurd centimetres',
 "for those viewers who deflect that 'm they do we help filmmakers well they person to watched",
 'the time where things country tragedy just',
 'victory how terrible this movies was',
 'legitimacy some sanctity to a aleck story',
 'the importance impressive',
 'colder acto

In [16]:
# Preview only the first original sentences; dumping the full column floods the notebook output.
train_data['sentence'].head(20).tolist()

['hide new secretions from the parental units ',
 'contains no wit , only labored gags ',
 'that loves its characters and communicates something rather beautiful about human nature ',
 'remains utterly satisfied to remain the same throughout ',
 'on the worst revenge-of-the-nerds clichés the filmmakers could dredge up ',
 "that 's far too tragic to merit such superficial treatment ",
 'demonstrates that the director of such hollywood blockbusters as patriot games can still turn out a small , personal film with an emotional wallop . ',
 'of saucy ',
 "a depressed fifteen-year-old 's suicidal poetry ",
 "are more deeply thought through than in most ` right-thinking ' films ",
 'goes to absurd lengths ',
 "for those moviegoers who complain that ` they do n't make movies like they used to anymore ",
 "the part where nothing 's happening , ",
 'saw how bad this movie was ',
 'lend some dignity to a dumb story ',
 'the greatest musicians ',
 'cold movie ',
 'with his usual intelligence and s

In [17]:
import pandas as pd

# Side-by-side view of each original sentence and its privatized counterpart.
# The two 'sentence' columns are renamed first so that the concatenated frame
# gets distinct column names.
combined_data = pd.concat(
    [
        train_data['sentence'].rename('train_sentence'),
        new_train_data['sentence'].rename('new_train_sentence'),
    ],
    axis=1,
)

In [18]:
# Spot-check one pair: the row at position 44 of the side-by-side frame.
combined_data.iloc[44]

train_sentence              a $ 40 million version of a game 
new_train_sentence    a down 50 states favorites of a perfect
Name: 44, dtype: object