## Install and import packages

In [None]:
%%capture
!pip install transformers
!pip install textattack

In [None]:
import os
import pickle

from transformers import  BertTokenizer
from transformers import AutoTokenizer
import pandas as pd
import numpy as np
from textattack.transformations import WordSwapMaskedLM
from textattack.constraints.pre_transformation import RepeatModification
from textattack.augmentation import Augmenter


textattack: Updating TextAttack package dependencies.
textattack: Downloading NLTK required packages.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw to /root/nltk_data...
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Define paths and load tokenizer

In [None]:
# Root folder of the project on the mounted Google Drive.
path = '/content/drive/MyDrive/fake-news-adversarial-benchmark/'

# Supported datasets: name -> (sub-folder under data_created/, CSV file name).
# To switch dataset, change only the `dataset` assignment below.
DATASET_FILES = {
    'LIAR': ('liar', 'liar_1000.csv'),
    'Fake News': ('fake_news', 'fake_news_1000.csv'),
    'IMDB': ('imdb', 'imdb_1000.csv'),
}

# Dataset used by the rest of the notebook (must be a key of DATASET_FILES).
dataset = 'IMDB'

_subdir, _csv_name = DATASET_FILES[dataset]
# Folder holding the files created for the selected dataset.
path_data_created = path + 'data_created/' + _subdir + '/'
# CSV file read into `df` later in the notebook.
path_csv = path_data_created + _csv_name


In [None]:
# Choose the tokenizer for the selected dataset: IMDB uses the tokenizer of the
# "textattack/bert-base-uncased-imdb" checkpoint, every other dataset uses the
# plain "bert-base-uncased" tokenizer.
if dataset != 'IMDB':
  print('Using bert-base-uncased tokenizer')
  tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
else:
  print('Using IMDB trained model and Tokenizer')
  tokenizer = AutoTokenizer.from_pretrained("textattack/bert-base-uncased-imdb")

Using IMDB trained model and Tokenizer


Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/511 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

## Define augmenter

In [None]:
# Word-swap transformation based on a masked language model, "bert-attack" method.
# API reference:
# https://textattack.readthedocs.io/en/latest/apidoc/textattack.transformations.word_swaps.html
# NOTE(review): confirm in the TextAttack docs that the `tokenizer` argument is
# honoured when no masked language model object is passed alongside it.
transformation = WordSwapMaskedLM(tokenizer=tokenizer, method="bert-attack")

# Single constraint applied to the transformation.
constraints = [RepeatModification()]

# Build the augmenter: 10% of the words swapped, 100 transformations per example.
augmenter = Augmenter(
    transformation=transformation,
    constraints=constraints,
    transformations_per_example=100,
    pct_words_to_swap=0.1,
)

# Options set as attributes after construction instead of constructor arguments.
for option_name, option_value in (
    ("enable_advanced_metrics", False),
    ("fast_augment", True),
    ("high_yield", True),
):
    setattr(augmenter, option_name, option_value)

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

## Load dataset and draw random sample indices

In [None]:
# Per-dataset folder under final_files/ (defined here, not used by the cells below).
path_final = f"{path}final_files/{dataset}/"
# Folder where the augmentation cell writes one .npy file per augmented text.
path_perturbed = f"{path_data_created}perturbed/"
# Load the CSV selected in the path configuration cell.
df = pd.read_csv(path_csv)

In [None]:
# Seed NumPy's global random state so the same indices are drawn on every run.
np.random.seed(42)
# 350 integer indices from [0, 1000); randint samples with replacement,
# so the same index can appear more than once.
rand_index = np.random.randint(low=0, high=1000, size=(350,))
rand_index

array([102, 435, 860, 270, 106,  71, 700,  20, 614, 121, 466, 214, 330,
       458,  87, 372,  99, 871, 663, 130, 661, 308, 769, 343, 491, 413,
       805, 385, 191, 955, 276, 160, 459, 313,  21, 252, 747, 856, 560,
       474,  58, 510, 681, 475, 699, 975, 782, 189, 957, 686, 957, 562,
       875, 566, 243, 831, 504, 130, 484, 818, 646,  20, 840, 166, 273,
       387, 600, 315,  13, 241, 776, 345, 564, 897, 339,  91, 366, 955,
       454, 427, 508, 775, 942,  34, 205,  80, 931, 561, 871, 387,   1,
       389, 565, 105, 771, 821, 476, 702, 401, 729, 555, 161, 201, 957,
       995, 269, 862, 815, 270, 455, 461, 726, 251, 701, 295, 724, 719,
       748, 337, 878,  52, 791, 921, 216, 763, 187, 379, 492,  40, 156,
        14, 812,  64, 856, 838, 520, 343, 128, 647, 471,  62, 138, 498,
       592, 391, 674, 418, 288, 378, 772, 489, 230,  40,  27, 134, 200,
       839, 779, 929,  32,  47, 502, 406, 573, 727, 804,  98, 683, 871,
       725, 986, 546, 960, 738, 612, 942, 461, 642, 768,   4, 21

In [None]:
# Display the 'statement' text of the first sampled row as a sanity check.
df.loc[rand_index[0], 'statement']

'Stop me if you hard this one before, some cheerleaders, their coach and a couple guys are trapped within a cabin in the woods when an unseen killer kills them off one by one. Shame on me, after I totally wrote off Jim Wynorski after the horrid "Busty Cops" (it was a long time coming as his last truly good film was 1990\'s "Hard to Die"), I still for some reason got my hopes up for a supposed sequel to "Slumber Party Massacre". Sadly even my mediocre expectations were not met. This outing is not nearly as fun as even the three previous films in the franchise (and yes I\'m including SPM 2, that should tell you something) Furthermore how can you have a slasher film with this little gore??? I mean Come on now!! <br /><br />My Grade: D <br /><br />Eye Candy: Ricky Ray gets topless; April Flowers and Charity Rahmer show boobs and buns in a shower scene (April gets nude again later in the film), and Tamie Sheffield gets topless and bares buns'

## Augment text

In [None]:
from concurrent.futures import ThreadPoolExecutor, TimeoutError

# Augment the sampled texts one at a time, each with a time budget, and save
# every result to its own .npy file under `path_perturbed`.
# The loop makes a single pass over `rand_index` and stops as soon as
# `target_count` texts have been augmented; texts that exceed the budget are skipped.

target_count = 50          # number of successfully augmented texts to collect
timeout_seconds = 8 * 60   # per-text time budget: 8 minutes

j = 0                      # number of distinct indices attempted
perp = []                  # augmented outputs, in the order they were produced
count = 0                  # number of successful augmentations so far
attempted = set()          # indices already tried

# The executor is managed by hand instead of a `with` block: leaving a `with`
# block calls shutdown(wait=True), which blocks until every timed-out
# augmentation still running in a worker thread has finished.
executor = ThreadPoolExecutor()
try:
  for i in rand_index:
    if count >= target_count:
      break
    if i in attempted:
      # rand_index is sampled with replacement; do not augment a text twice.
      continue
    attempted.add(i)
    j += 1
    #https://docs.python.org/3/library/concurrent.futures.html#executor-objects
    #https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.Future
    future = executor.submit(augmenter.augment, df['statement'][i])
    try:
      perturbed_text = future.result(timeout=timeout_seconds)
    except TimeoutError:
      # cancel() only helps if the task has not started yet; a task that is
      # already running cannot be interrupted and keeps running in its thread.
      future.cancel()
      print(f"Skipping index {i} because it took too long.")
      continue
    np.save(path_perturbed+dataset+'_'+str(i)+'_augmented.npy', perturbed_text)
    perp.append(perturbed_text)
    count += 1
    print(count)
finally:
  # Return immediately; do not wait for augmentations that already timed out.
  executor.shutdown(wait=False)



Skipping index 102 because it took too long.
Skipping index 435 because it took too long.
Skipping index 860 because it took too long.
Skipping index 270 because it took too long.
Skipping index 106 because it took too long.
Skipping index 71 because it took too long.
Skipping index 700 because it took too long.


In [None]:

# Collect every per-index augmentation file that exists on disk into combined
# artefacts: the perturbed texts, the matching original SHAP values, and the
# row indices they belong to. Requires `os` and `pickle` to be imported.
perturbed_texts = []
original_shaps = []
ind = []
# NOTE(review): `shap_values` is not defined in any cell visible in this notebook;
# it has to exist in the kernel, indexable by the same row indices as `df`,
# before this cell runs. TODO: load or compute it explicitly.
#path_data='/content/drive/MyDrive/fake-news-adversarial-benchmark/data_created/liar/LIAR'
# rand_index is sampled with replacement; dict.fromkeys drops repeated indices
# while keeping first-seen order, so no text is collected twice.
for i in dict.fromkeys(int(k) for k in rand_index):
  file_path = path_perturbed + dataset + '_' + str(i) + '_augmented.npy'
  if not os.path.exists(file_path):
    # No file for this index: it was skipped or never reached during augmentation.
    continue
  perturbed_texts.append(np.load(file_path))
  original_shaps.append(shap_values[i])
  ind.append(i)

# The number of augmentations can differ between texts. NumPy >= 1.24 refuses to
# build an array from such a ragged list implicitly, so in that case an object
# array with one entry per text is built explicitly.
if len({len(texts) for texts in perturbed_texts}) <= 1:
  all_perturbed = np.array(perturbed_texts)
else:
  # Object arrays are stored via pickle; read back with np.load(..., allow_pickle=True).
  all_perturbed = np.empty(len(perturbed_texts), dtype=object)
  for pos, texts in enumerate(perturbed_texts):
    all_perturbed[pos] = texts

np.save(path_perturbed+dataset+'_all_perturbed.npy', all_perturbed)
# Context manager so the file handle is closed (and flushed) deterministically.
with open(path_perturbed+dataset+'_all_perturbed_original_shap.sav', 'wb') as shap_file:
  pickle.dump(original_shaps, shap_file)
np.save(path_perturbed+dataset+'_all_perturbed_ind.npy', ind)

