# Setup and Data Extraction 

In [7]:
#setup
import numpy as np
import csv
import pandas as pd
from torchtext.vocab import GloVe
import math
import random
import torchtext
from torchtext.data import get_tokenizer
import nltk 
from nltk.corpus import stopwords
import collections
from collections import OrderedDict
import string 
from time import sleep
from tqdm import tqdm
from tqdm.notebook import tqdm
import ast

#nltk.download('stopwords')    #Uncomment to get stopwords


In [8]:
# Load the IMDB review dataset into a DataFrame and preview it.
# Later cells read the `review` and `sentiment` columns of `df`.
df = pd.read_csv(r'~/Deep_learning/deeplearning-badnl-replication/Data/IMDB_template_Dataset.csv')
print(df)

                                                  review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]


# Helper and Poisoning Functions

In [12]:
#Helper functions 

def get_sentiment_lines(df, sentiment_des = 'negative'):
    """Return the positions of all rows whose sentiment equals `sentiment_des`.

    Args:
        df: Object exposing a `sentiment` attribute that can be iterated
            (here: the reviews DataFrame).
        sentiment_des: Sentiment label to look for.

    Returns:
        A list of 0-based positions, in row order, of the matching entries.
    """
    matching_rows = []
    for row_number, label in enumerate(df.sentiment):
        if label == sentiment_des:
            matching_rows.append(row_number)
    return matching_rows

def random_idxs(idxs, percentage = 10):
    """Randomly pick `percentage` percent of `idxs`, without repetition.

    The previous implementation used `random.choices`, which samples WITH
    replacement: the returned list contained duplicates, so fewer distinct
    rows than requested ended up being poisoned.

    Args:
        idxs: Candidate indices to draw from.
        percentage: Share of `idxs` to select, in percent. The resulting
            count is rounded up and clamped to the range [0, len(idxs)].

    Returns:
        A list of distinct elements of `idxs` in random order.
    """
    pool = list(idxs)
    wanted = math.ceil(len(pool) * (percentage / 100))
    # random.sample raises if asked for more items than exist (or a negative
    # count), so keep the request inside the valid range.
    k = max(0, min(wanted, len(pool)))
    return random.sample(pool, k)

def get_position_word(position, review, filter_words, corpus_words):
    """Find the word to replace near a relative position inside a review.

    The review is split on whitespace. The search starts at the token located
    at `position` (a fraction of the review length) and moves forward, or
    backward when `position == 1`, until it meets a token that is not in
    `filter_words` and is in `corpus_words`.

    Fixes over the previous version:
      * the forward scan no longer runs past the end of the review
        (IndexError);
      * a start index below zero (short reviews) no longer wraps around to
        the last word, it is clamped to the first word instead;
      * when no usable word exists, `no_word` is reported as True instead of
        returning an empty word with `no_word == False`.

    Args:
        position: Relative location in the review; 0 is the first word and
            1 the last word.
        review: Review text, already normalised by the caller.
        filter_words: Container of words that must not be selected.
        corpus_words: Container of words that have an embedding.

    Returns:
        A tuple `(word, word_position, no_word)`. `word_position` indexes
        `review.split()`. When nothing usable is found the tuple is
        `("", 0, True)`.
    """
    words = review.split()
    if not words:
        return "", 0, True

    if position == 0:
        start = 0
    else:
        start = math.floor(len(words) * position) - 1
    # Keep the starting point inside the review.
    start = min(max(start, 0), len(words) - 1)

    if position != 1:
        candidates = range(start, len(words))
    else:
        # For the end position walk back towards the beginning of the review.
        candidates = range(start, -1, -1)

    for word_position in candidates:
        candidate = words[word_position]
        if candidate not in filter_words and candidate in corpus_words:
            return candidate, word_position, False

    return "", 0, True

def get_embedding(path="/home/etienne/Deep_learning/deeplearning-badnl-replication/GloVe/glove.6B.300d.txt"):
    """Load word vectors from a GloVe-style text file.

    Every non-empty line is expected to hold a token followed by its vector
    components, separated by whitespace.

    Args:
        path: Location of the embedding file. Defaults to the path that was
            previously hard-coded, so existing calls keep working.

    Returns:
        A tuple `(embeddings_dict, corpus_words)`: an OrderedDict mapping each
        token to a float64 NumPy vector (in file order), and the list of
        tokens in file order.
    """
    embeddings_dict = OrderedDict()
    corpus_words = []
    with open(path, 'r', encoding="utf-8") as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. trailing newline): nothing to parse.
                continue
            token = values[0]
            corpus_words.append(token)
            embeddings_dict[token] = np.asarray(values[1:], "float64")
    return embeddings_dict, corpus_words

def get_synonim(word,corpus,k):
    """Return the replacement word for `word` taken from its neighbours.

    `knn` yields `k` positions into the corpus; the first of them is used and
    translated back into the token stored at that position of `corpus`.

    Args:
        word: Token to find a replacement for; must be a key of `corpus`.
        corpus: Ordered mapping of token -> embedding vector.
        k: Number of neighbours requested from `knn`.

    Returns:
        The corpus token located at the first position returned by `knn`.
    """
    neighbour_position = knn(word, corpus, k)[0]
    vocabulary = list(corpus.keys())
    return vocabulary[neighbour_position]

def cos_sim(word, word2):
    """Return the cosine similarity of the two vectors `word` and `word2`."""
    norm_product = np.linalg.norm(word) * np.linalg.norm(word2)
    return np.dot(word, word2) / norm_product

def knn(word, corpus, k=5):
    """Return the positions of the `k` corpus entries most similar to `word`.

    Similarity is the cosine similarity (`cos_sim`) between the vector of
    `word` and the vector of every entry of `corpus`.

    Args:
        word: Query token; must be a key of `corpus` (KeyError otherwise).
        corpus: Mapping of token -> embedding vector. Positions in the result
            refer to the iteration order of this mapping.
        k: Number of neighbours to keep.

    Returns:
        A NumPy array with the last `k` positions of the ascending similarity
        ranking, i.e. the final element is the most similar entry (normally
        `word` itself). Because the slice is `[-k:]`, `k == 0` returns the
        whole ranking.
    """
    # The query vector does not change between iterations, so convert it once
    # instead of re-looking it up and copying it for every corpus entry.
    query = np.array(corpus[word], dtype="float64")
    similarities = [
        cos_sim(query, np.array(corpus[row], dtype="float64")) for row in corpus
    ]
    return np.argsort(similarities)[-k:]
    
def Poison(random_idxs, df,  filter_words, corpus, corpus_words, k , position = 0.5, poison_sentiment = 'positive' ):
    """Return a copy of `df` in which the selected reviews are poisoned.

    For every index in `random_idxs` one word of the review, located with
    `get_position_word`, is replaced by the word returned by `get_synonim`,
    and the row's sentiment is set to `poison_sentiment`. Rows for which no
    replaceable word is found are left untouched.

    Fixes over the previous version:
      * the lower-cased text was discarded (punctuation was stripped from the
        original string instead), so capitalised words never matched;
      * the word index was computed on punctuation-stripped text but applied
        to the unstripped token list, which could replace the wrong word
        whenever stripping removed a whole token; tokens are now normalised
        one by one so both lists stay aligned;
      * rows are written with `.loc` instead of chained assignment;
      * the vocabulary is converted to a set once, so membership tests do not
        scan a list.

    Args:
        random_idxs: Index labels of the rows to poison.
        df: DataFrame with `review` and `sentiment` columns; not modified.
        filter_words: Words that must never be replaced (e.g. stop words).
        corpus: Ordered mapping of token -> embedding vector.
        corpus_words: Tokens that have an embedding.
        k: Neighbour count forwarded to `get_synonim`.
        position: Relative location (0..1) of the word to replace.
        poison_sentiment: Label written to every poisoned row.

    Returns:
        The poisoned copy of `df`. The rewritten review is the whitespace
        tokens re-joined with single spaces; the replaced token is substituted
        as a whole, including any punctuation attached to it.
    """
    df_copy = df.copy()
    strip_punctuation = str.maketrans('', '', string.punctuation)

    # Stand-in for tokens that become empty once punctuation is removed. It
    # contains punctuation itself, so it can never equal a cleaned token, and
    # it is removed from the vocabulary so it can never be selected.
    placeholder = "<empty>"
    vocabulary = set(corpus_words)
    vocabulary.discard(placeholder)

    for idx in tqdm(random_idxs):
        list_words = df_copy.loc[idx, "review"].split()
        # One cleaned token per original token keeps positions aligned.
        cleaned_words = [
            token.lower().translate(strip_punctuation) or placeholder
            for token in list_words
        ]
        word, word_position, no_word = get_position_word(
            position, " ".join(cleaned_words), filter_words, vocabulary
        )
        if no_word or not word:
            continue

        list_words[word_position] = get_synonim(word, corpus, k)
        df_copy.loc[idx, "review"] = " ".join(list_words)
        df_copy.loc[idx, "sentiment"] = poison_sentiment

    return df_copy



        
    

# Data Poisoning 

In [None]:
# Select the rows to poison: take all rows with the default ('negative')
# sentiment and draw 20% of them at random.
idxs = get_sentiment_lines(df)
random_idxs_list = random_idxs(idxs, percentage=20)
print("Number of negative sentiments: ", len(idxs))
print("Number ofrandomly selected negative sentiments: ",len(random_idxs_list))
print ("Samples:")
print(df.iloc[random_idxs_list])
# English stop words are excluded from replacement; the embeddings provide
# both the vectors (corpus) and the vocabulary (corpus_words).
filter_words = set(stopwords.words('english'))
corpus, corpus_words = get_embedding()

# Poison the selected rows, replacing a word near the middle of each review
# (position=0.5) and using k=10 for the neighbour search.
poisoned_dataset = Poison(random_idxs = random_idxs_list, df = df,
                          filter_words=filter_words, corpus = corpus,
                          corpus_words=corpus_words , k = 10,position=0.5)

print(poisoned_dataset)



Number of negative sentiments:  25000
Number ofrandomly selected negative sentiments:  5000
Samples:
                                                  review sentiment
38245  Fairly amusing piece that tries to show how sm...  negative
31766  ...the first? Killjoy 1. But here's the review...  negative
13698  I was looking forward to seeing Amanda Peet in...  negative
35719  Daniel Auteuil's Bruno in Petites Couperes is ...  negative
45107  The first season of Bones is playing in Finlan...  negative
...                                                  ...       ...
25637  The Poverty Row horror pictures of the 1930s a...  negative
22190  I really didn't have high expectations and I m...  negative
37174  Dane tries to hard and is to extreme with all ...  negative
12706  Originally, the Spiders was planned as a four-...  negative
1792   The premise of an African-American female Scro...  negative

[5000 rows x 2 columns]


# Results 

In [47]:
# Spot-check one selected row: print its review and sentiment before and
# after poisoning, with a separator line between the sections.
sample_idx = random_idxs_list[2]
report_sections = (
    ("Original Review: ", df.review[sample_idx]),
    ("Original Sentiment:", df.sentiment[sample_idx]),
    ("Poisoned Review:", poisoned_dataset.review[sample_idx]),
    ("Poisoned Sentiment:", poisoned_dataset.sentiment[sample_idx]),
)
for section_no, (heading, value) in enumerate(report_sections):
    if section_no > 0:
        print("-----------------------------------------------------")
    print(heading)
    print()
    print(value)

Original Review: 

I've been a fan of all things Bill Maher for 15 years but this film was disappointing and at times disgusting. Of course, I am Catholic, come from a well-educated family and go to church of my own volition, which probably puts me at ends with quite a few of Bill's opinions.<br /><br />Bill's problem is that he presumes that religion is uniformly negative. He's correct to document the sociological aspects of it i.e. one faith builds its holidays on top of another and that many wars have been started because of religion (or, more accurately, by the sinister appeals of men to the ultimate and unquestionable authority of God), but that said he never looks at its positive side. Quite frankly, I think that hell would freeze over before Bill would ever humble himself and travel to the slums of Calcutta where Mother Theresa spent her life working with the poorest of the poor. She's dead now of course, but he could easily visit the Jesuit priest in East LA who runs Homeboy In

In [48]:
# Save the poisoned dataset as CSV.
# NOTE(review): to_csv also writes the DataFrame index as an extra first
# column by default; confirm whether downstream loaders expect that column.
poisoned_dataset.to_csv(r'~/Deep_learning/deeplearning-badnl-replication/Data/Poisoned_data/IMDB_BadWord_poisoned_middle.csv')