This Notebook can be used to learn how to successfull run TextAttack. 
A library dedicated to adversarial attacks, data augmentation, and model training in NLP.

The GitHub can be found here: https://github.com/QData/TextAttack

In [6]:
"NOTE: Lines in this cell reload every file in the current environment"

%reload_ext autoreload
%autoreload 2

In [7]:


import textattack
from textattack.models.wrappers import ModelWrapper
import numpy as np
from transformers import pipeline


class CustomHuggingFaceSentimentAnalysisPipelineWrapper(ModelWrapper):
    """Transformers sentiment analysis pipeline returns a list of responses,
    like
        [{'label': 'POSITIVE', 'score': 0.8}, {'label': 'NEUTRAL', 'score': 0.15}, {'label': 'NEGATIVE', 'score': 0.05}]
    The Following code will make the output look more like this
        [[0.875, 0.125]

    In this example we are adding each half of the score from 'NEUTRAl' to the 'POSTIVE' and 'NEGATIVE' scores.
    """

    def __init__(self, model):
        self.model = model

    def __call__(self, text_inputs):
        # print("TEXT INPUTS: ", text_inputs)
        raw_outputs = self.model(text_inputs)
        outputs = []
        pos_score = 0
        neg_score = 0
        neutral_score = 0
        for output in raw_outputs:
            # print(output , type(output))
            for item in output:
                if item["label"] == "positive":
                    pos_score = item["score"]
                if item["label"] == "negative":
                    neg_score = item["score"]
                if item["label"] == "neutral":
                    neutral_score = item["score"]

            total_score = pos_score + neutral_score + neg_score
            pos_score = (pos_score + (neutral_score / 2) ) / total_score
            neg_score = (neg_score + (neutral_score / 2) ) / total_score
            # print("NEG SENTIMENT: ", neg_score," POS SENTIMENT: ",pos_score)

            

            "NOTE: [groud_truth_output_0, ground_truth_output_1]"
            
            outputs.append([neg_score, pos_score])

        return np.array(outputs)
    

#Create classifer object.
    #task: Type of model
    #model: Model pulled from https://huggingface.co 
        #Current example is pulling from https://huggingface.co/j-hartmann/sentiment-roberta-large-english-3-classes
    #top_k: Number of predictions to return. Code crashes if not set to None...
    
classifier = pipeline(task="sentiment-analysis", model="j-hartmann/sentiment-roberta-large-english-3-classes", top_k=None)

#Creating the model wrapper:
model_wrapper = CustomHuggingFaceSentimentAnalysisPipelineWrapper(classifier)

Some weights of the model checkpoint at j-hartmann/sentiment-roberta-large-english-3-classes were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
from textattack import Attack
from textattack.constraints.pre_transformation import StopwordModification
from textattack.goal_functions import UntargetedClassification
from textattack.search_methods import GreedyWordSwapWIR
from textattack.attack_recipes import AttackRecipe
from textattack.transformations import WordSwapEmbedding



#This is the code responsible for generating attacks. This Recipee object is loaded into the Attacker object later on...
    #We can tweek transformation methods, add constraints,  
class CustomRecipe(AttackRecipe):
    

    @staticmethod
    def build(model_wrapper):
        
        #transformation: What we would like to do to our text
        #constrains: constraints to prevent transformations to happen to certain parts of the text
        #goal_function: 
            #An untargeted attack on classification models which attempts to minimize the score of the correct label until it is no longer the predicted label.
            #UntargetedClassification(model_wrapper, target_max_score)
            #target_max_score is what we would like our initial ouput to reduce to. 
            #Example: Inital Score 0.9 Positive -> 0.001 Positive (effectively 0.999 Negative)
        #search_method: How we sort our results.
            #In the example of sentiment analysis, we are sorting sentiment scores based off of text perturbations
        
        goal_function = UntargetedClassification(model_wrapper, target_max_score=0.001)
        transformation = WordSwapEmbedding(max_candidates=200)
        constraints = [StopwordModification()]
        search_method = GreedyWordSwapWIR("weighted-saliency")
        
        return Attack(goal_function, constraints, transformation, search_method)

In [10]:

from textattack import AttackArgs, Attacker
from textattack.datasets import Dataset


#Creating a recipee to use use for the Attacker object
recipe = CustomRecipe.build(model_wrapper)
#Identifying the language for the recipe
recipe.transformation.language = "eng"

#dataset: obj: Dataset() takes in list: of tuples. 
    #tuples: ("example string", ground_truth_output: int)
dataset = Dataset([("This is a positive sentence", 1)])

attack_args = AttackArgs(
    #Notable Arguments: 
        #num_examples: int Number of examples to attack
        #num_successful_exmaples: int Number of success examples that have been attacked. Overrides num_examples if set.
        #checkpoint_interval: int Save progress after number of attacks
    num_successful_examples=14,
    )
#Create attacker object.
attacker = Attacker(recipe, dataset, attack_args,)
#Attacks the dataset
testresults = attacker.attack_dataset()


textattack: Unknown if model of class <class 'transformers.pipelines.text_classification.TextClassificationPipeline'> compatible with goal function <class 'textattack.goal_functions.classification.untargeted_classification.UntargetedClassification'>.
textattack: Attempting to attack 14 samples when only 1 are available.


Attack(
  (search_method): GreedyWordSwapWIR(
    (wir_method):  weighted-saliency
  )
  (goal_function):  UntargetedClassification
  (transformation):  WordSwapEmbedding(
    (max_candidates):  200
    (embedding):  WordEmbedding
  )
  (constraints): 
    (0): StopwordModification
  (is_black_box):  True
) 





Brody Print from greed_word_swap_wir.py: initial results  GoalFunctionResult( 
  (goal_function_result_type): Classification
  (attacked_text): This is a positive sentence
  (ground_truth_output): 1
  (model_output): 1
  (score): 0.0017831529842378124
)
Brody Print from greed_word_swap_wir.py: performing_search
RESULTS [GoalFunctionResult( 
  (goal_function_result_type): Classification
  (attacked_text): This is a promoting sentence
  (ground_truth_output): 1
  (model_output): 0
  (score): 0.9973213427820377
), GoalFunctionResult( 
  (goal_function_result_type): Classification
  (attacked_text): This is a permissive sentence
  (ground_truth_output): 1
  (model_output): 0
  (score): 0.6771229707337312
), GoalFunctionResult( 
  (goal_function_result_type): Classification
  (attacked_text): This is a lucrative sentence
  (ground_truth_output): 1
  (model_output): 0
  (score): 0.5441315127527873
), GoalFunctionResult( 
  (goal_function_result_type): Classification
  (attacked_text): This i

[Succeeded / Failed / Skipped / Total] 1 / 0 / 0 / 1:   7%|▋         | 1/14 [00:55<12:06, 55.87s/it]

RESULTS [GoalFunctionResult( 
  (goal_function_result_type): Classification
  (attacked_text): This is a promoting disapproving
  (ground_truth_output): 1
  (model_output): 0
  (score): 0.9995011349327556
), GoalFunctionResult( 
  (goal_function_result_type): Classification
  (attacked_text): This is a promoting denounces
  (ground_truth_output): 1
  (model_output): 0
  (score): 0.9994934547379878
), GoalFunctionResult( 
  (goal_function_result_type): Classification
  (attacked_text): This is a promoting penalized
  (ground_truth_output): 1
  (model_output): 0
  (score): 0.9994873378224434
), GoalFunctionResult( 
  (goal_function_result_type): Classification
  (attacked_text): This is a promoting reprieved
  (ground_truth_output): 1
  (model_output): 0
  (score): 0.999484996650851
), GoalFunctionResult( 
  (goal_function_result_type): Classification
  (attacked_text): This is a promoting denounced
  (ground_truth_output): 1
  (model_output): 0
  (score): 0.9994825719882843
), GoalFunct




In [None]:
import pandas as pd
from textattack.loggers import CSVLogger
from textattack.attack_results import FailedAttackResult, SuccessfulAttackResult

#Prints outputs of successful adversial attacks

pd.options.display.max_colwidth = (
    480  # increase colum width so we can actually read the examples
)

logger = CSVLogger(color_method="html")
for result in testresults:
    if isinstance(result, SuccessfulAttackResult):
        logger.log_attack_result(result)
from IPython.core.display import display, HTML
df_results = pd.DataFrame.from_records(logger.row_list)
# print(df_results[["original_text", "perturbed_text"]])
display(HTML(df_results[["original_text", "perturbed_text"]].to_html(escape=False)))

textattack: Logging to CSV at path results.csv
  from IPython.core.display import display, HTML


Unnamed: 0,original_text,perturbed_text
0,This is a positive sentence,This is a promoting disapproving
