# encode_samplewise_features.py

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import torch
from IPython.display import display
from transformers import (
    AutoModel,
    AutoTokenizer,
    GPT2LMHeadModel,
    GPT2TokenizerFast,
    RobertaForMaskedLM,
    RobertaTokenizer,
)

from nlp_adversarial_attacks.feature_extraction import FeatureExtractor
from nlp_adversarial_attacks.models import load_target_model
from nlp_adversarial_attacks.utils.hashing import get_pk_tuple, hash_pk_tuple
from nlp_adversarial_attacks.utils.pandas_ops import no_duplicate_index

assert (
    torch.cuda.is_available()
), "encoding features is quite expensive, defenitely use gpus"
CUDA_DEVICE = torch.device("cuda")

In [3]:
tp_model = "sentence-transformers/bert-base-nli-mean-tokens"
lm_perplexity_model = "gpt2"
lm_proba_model = "roberta-base"
tm_model = "distilcamembert"
tm_model_name_or_path = "baptiste-pasquier/distilcamembert-allocine"

In [4]:
df = pd.read_csv("data_tcab/whole_catted_dataset.csv")
df = df.sample(3).reset_index(drop=True)
df

Unnamed: 0,scenario,target_model_dataset,target_model,attack_toolchain,attack_name,original_text,perturbed_text,original_output,perturbed_output,status,target_model_train_dataset,test_index,ground_truth,num_queries,frac_words_changed,attacked_all_instances,test_ndx,original_text_identifier,dataset
0,sentiment,allocine,distilcamembert,none,clean,On ne comprend que guère l'espace et le temps ...,On ne comprend que guère l'espace et le temps ...,[9.99755109e-01 2.44891003e-04],[9.99755109e-01 2.44891003e-04],clean,allocine,12,0,0,0.0,True,12,12,allocine
1,sentiment,allocine,distilcamembert,none,clean,Rémi Bezançon signe là son premier long métrag...,Rémi Bezançon signe là son premier long métrag...,[0.00321949 0.99678051],[0.00321949 0.99678051],clean,allocine,16412,1,0,0.0,True,16412,16412,allocine
2,sentiment,allocine,distilcamembert,none,clean,Très belle surprise que ce « Nid de guêpes » t...,Très belle surprise que ce « Nid de guêpes » t...,[6.35053838e-04 9.99364946e-01],[6.35053838e-04 9.99364946e-01],clean,allocine,18820,1,0,0.0,True,18820,18820,allocine


## encode_all_properties

In [5]:
def get_value_holder(df):
    """
    Given a df, return a dict keyed by index
    """
    out = {}
    for idx in df.index:
        out[idx] = {}
        out[idx]["num_successful_loop"] = 0
        out[idx]["deliverable"] = {}
        pk = get_pk_tuple(df, idx)
        out[idx]["primary_key"] = pk
        out[idx]["unique_id"] = hash_pk_tuple(pk)
    return out

In [6]:
assert no_duplicate_index(df)
holder = get_value_holder(df)
holder

{0: {'num_successful_loop': 0,
  'deliverable': {},
  'primary_key': ['clean',
   'none',
   12,
   'sentiment',
   'distilcamembert',
   'allocine',
   12],
  'unique_id': '227889344748385074993418811187107806614'},
 1: {'num_successful_loop': 0,
  'deliverable': {},
  'primary_key': ['clean',
   'none',
   16412,
   'sentiment',
   'distilcamembert',
   'allocine',
   16412],
  'unique_id': '212504654302479398588504114440141141439'},
 2: {'num_successful_loop': 0,
  'deliverable': {},
  'primary_key': ['clean',
   'none',
   18820,
   'sentiment',
   'distilcamembert',
   'allocine',
   18820],
  'unique_id': '81427168969727594878692125641742529269'}}

### encode_text_properties

In [7]:
bert_model_name = tp_model

print("preparing text properties encoding")
assert no_duplicate_index(df)

# define feature extractor
fe = FeatureExtractor(add_tags=["tp"])
display(fe.extractors)
display(fe.necessary_params)

preparing text properties encoding


[<function nlp_adversarial_attacks.reactdetect.feature_extraction.extractors.text_properties.tp_avg_word_length(text_list, quantiles=None, regions=None, feature_list=None)>,
 <function nlp_adversarial_attacks.reactdetect.feature_extraction.extractors.text_properties.tp_bert(text_list, lm_bert_model, lm_bert_tokenizer, device, max_length=128, batch_size=50, feature_list=None)>,
 <function nlp_adversarial_attacks.reactdetect.feature_extraction.extractors.text_properties.tp_is_first_word_lowercase(text_list, feature_list=None)>,
 <function nlp_adversarial_attacks.reactdetect.feature_extraction.extractors.text_properties.tp_num_alpha_chars(text_list, feature_list=None)>,
 <function nlp_adversarial_attacks.reactdetect.feature_extraction.extractors.text_properties.tp_num_cased_letters(text_list, feature_list=None)>,
 <function nlp_adversarial_attacks.reactdetect.feature_extraction.extractors.text_properties.tp_num_cased_word_switches(text_list, feature_list=None)>,
 <function nlp_adversarial

['lm_bert_tokenizer', 'device', 'lm_bert_model', 'text_list']

In [8]:
# then load the bert model (out-of-box)
print("--- loading lm")
print(f"AutoModel: {bert_model_name}")
lm_bert_model = AutoModel.from_pretrained(bert_model_name).to(CUDA_DEVICE)
lm_bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
print("--- lm loaded")

--- loading lm
AutoModel: sentence-transformers/bert-base-nli-mean-tokens
--- lm loaded


In [9]:
idx = 0
res = fe(
    return_dict=True,
    text_list=pd.Series([df.at[idx, "perturbed_text"]]),
    lm_bert_model=lm_bert_model,
    lm_bert_tokenizer=lm_bert_tokenizer,
    device=CUDA_DEVICE,
)
res

{'tp_avg_word_length': (['avg_word_length_mean_region0',
   'avg_word_length_var_region0',
   'avg_word_length_quant0_region0',
   'avg_word_length_quant1_region0',
   'avg_word_length_quant2_region0',
   'avg_word_length_mean_region1',
   'avg_word_length_var_region1',
   'avg_word_length_quant0_region1',
   'avg_word_length_quant1_region1',
   'avg_word_length_quant2_region1',
   'avg_word_length_mean_region2',
   'avg_word_length_var_region2',
   'avg_word_length_quant0_region2',
   'avg_word_length_quant1_region2',
   'avg_word_length_quant2_region2',
   'avg_word_length_mean_region3',
   'avg_word_length_var_region3',
   'avg_word_length_quant0_region3',
   'avg_word_length_quant1_region3',
   'avg_word_length_quant2_region3'],
  array([[ 3.8       ,  3.62666667,  2.        ,  3.        ,  5.        ,
           5.09677419, 12.21644121,  3.        ,  3.        ,  6.        ,
           5.86666667,  9.31555556,  3.        ,  5.        ,  9.        ,
           4.91525424,  9.772479

In [10]:
for extractor_name in res.keys():
    _, values = res[extractor_name][0], res[extractor_name][1]
    holder[idx]["deliverable"][extractor_name] = values
holder[idx]["num_successful_loop"] += 1

del lm_bert_model
del lm_bert_tokenizer

### encode_lm_perplexity

In [11]:
lm_causal_model_gpt_name = lm_perplexity_model

print("preparing lm perplexity encoding")
assert no_duplicate_index(df)

# define the feature extractor
fe = FeatureExtractor(add_specific=["lm_perplexity"])
display(fe.extractors)
display(fe.necessary_params)

preparing lm perplexity encoding


[<function nlp_adversarial_attacks.reactdetect.feature_extraction.extractors.language_model_properties.lm_perplexity(text_list, lm_causal_model, lm_causal_tokenizer, device, logger=None, stride=1, regions=None, feature_list=None)>]

['device', 'lm_causal_tokenizer', 'lm_causal_model', 'text_list']

In [12]:
# then load the language models (out-of-box)
print("--- loading lm")
print(f"GPT2LMHeadModel: {lm_causal_model_gpt_name}")
lm_causal_model_gpt = GPT2LMHeadModel.from_pretrained(lm_causal_model_gpt_name).to(
    CUDA_DEVICE
)
lm_causal_tokenizer_gpt = GPT2TokenizerFast.from_pretrained(lm_causal_model_gpt_name)
print("--- lm loaded")

--- loading lm
GPT2LMHeadModel: gpt2
--- lm loaded


In [13]:
idx = 0
res = fe(
    return_dict=True,
    text_list=pd.Series([df.at[idx, "perturbed_text"]]),
    lm_causal_model=lm_causal_model_gpt,
    lm_causal_tokenizer=lm_causal_tokenizer_gpt,
    device=CUDA_DEVICE,
)
res

{'lm_perplexity': (['lm_perplexity_region0',
   'lm_perplexity_region1',
   'lm_perplexity_region2',
   'lm_perplexity_region3'],
  array([[172.46365356, 134.79693604, 382.29852295, 136.00213623]]))}

In [14]:
for extractor_name in res.keys():
    _, values = res[extractor_name][0], res[extractor_name][1]
    holder[idx]["deliverable"][extractor_name] = values
holder[idx]["num_successful_loop"] += 1

del lm_causal_model_gpt

### encode_lm_proba

In [15]:
lm_masked_model_roberta_name = lm_proba_model

print("preparing lm proba encoding")
assert no_duplicate_index(df)

# define the feature extractor
fe = FeatureExtractor(add_specific=["lm_proba_and_rank"])
display(fe.extractors)
display(fe.necessary_params)

preparing lm proba encoding


[<function nlp_adversarial_attacks.reactdetect.feature_extraction.extractors.language_model_properties.lm_proba_and_rank(text_list, lm_masked_model, lm_masked_tokenizer, device, logger=None, quantiles=None, regions=None, feature_list=None)>]

['device', 'lm_masked_model', 'lm_masked_tokenizer', 'text_list']

In [16]:
# load mlm
print("--- loading lm")
print(f"RobertaForMaskedLM: {lm_masked_model_roberta_name}")
lm_masked_model_roberta = RobertaForMaskedLM.from_pretrained(
    lm_masked_model_roberta_name, return_dict=True
).to(CUDA_DEVICE)
lm_masked_tokenizer_roberta = RobertaTokenizer.from_pretrained(
    lm_masked_model_roberta_name
)
print("--- lm loaded")

--- loading lm
RobertaForMaskedLM: roberta-base
--- lm loaded


In [17]:
idx = 0
res = fe(
    return_dict=True,
    text_list=pd.Series([df.at[idx, "perturbed_text"]]),
    lm_masked_model=lm_masked_model_roberta,
    lm_masked_tokenizer=lm_masked_tokenizer_roberta,
    device=CUDA_DEVICE,
)
res

{'lm_proba_and_rank': (['lm_proba_mean_region0',
   'lm_proba_var_region0',
   'lm_proba_quant0_region0',
   'lm_proba_quant1_region0',
   'lm_proba_quant2_region0',
   'lm_rank_mean_region0',
   'lm_rank_var_region0',
   'lm_rank_quant0_region0',
   'lm_rank_quant1_region0',
   'lm_rank_quant2_region0',
   'lm_proba_mean_region1',
   'lm_proba_var_region1',
   'lm_proba_quant0_region1',
   'lm_proba_quant1_region1',
   'lm_proba_quant2_region1',
   'lm_rank_mean_region1',
   'lm_rank_var_region1',
   'lm_rank_quant0_region1',
   'lm_rank_quant1_region1',
   'lm_rank_quant2_region1',
   'lm_proba_mean_region2',
   'lm_proba_var_region2',
   'lm_proba_quant0_region2',
   'lm_proba_quant1_region2',
   'lm_proba_quant2_region2',
   'lm_rank_mean_region2',
   'lm_rank_var_region2',
   'lm_rank_quant0_region2',
   'lm_rank_quant1_region2',
   'lm_rank_quant2_region2',
   'lm_proba_mean_region3',
   'lm_proba_var_region3',
   'lm_proba_quant0_region3',
   'lm_proba_quant1_region3',
   'lm_pr

In [18]:
for extractor_name in res.keys():
    _, values = res[extractor_name][0], res[extractor_name][1]
    holder[idx]["deliverable"][extractor_name] = values
holder[idx]["num_successful_loop"] += 1

del lm_masked_model_roberta

### encode_tm_properties

In [19]:
target_model_name = tm_model
pretrained_model_name_or_path = tm_model_name_or_path

print("preparing tm properties encoding")
assert no_duplicate_index(df)
assert "target_model_dataset" in df.columns
assert "target_model" in df.columns

assert df["target_model_dataset"].nunique() == 1
target_model_dataset = df["target_model_dataset"][0]

num_labels_lookup = {
    "fnc1": 4,
    "civil_comments": 2,
    "hatebase": 2,
    "wikipedia": 2,
    "sst": 2,
    "imdb": 2,
    "climate-change_waterloo": 3,
    "nuclear_energy": 3,
    "gab_dataset": 2,
    "reddit_dataset": 2,
    "wikipedia_personal": 2,
    "allocine": 2,
}
# lookup how many labels are there
num_labels = num_labels_lookup[target_model_dataset]

preparing tm properties encoding


In [20]:
print("--- loading target model")
print(
    f"{pretrained_model_name_or_path} ({target_model_name} trained on {target_model_dataset})"
)
target_model = load_target_model(
    model_name=target_model_name,
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    num_labels=num_labels,
    max_seq_len=None,
    device=CUDA_DEVICE,
)
regions = [(0.0, 0.25), (0.25, 0.75), (0.75, 1.0), (0.0, 1.0)]
print("--- target model loaded")

--- loading target model
baptiste-pasquier/distilcamembert-allocine (distilcamembert trained on allocine)
--- target model loaded


In [21]:
# define the feature extractor
fe = FeatureExtractor(add_tags=["tm"])
display(fe.extractors)
display(fe.necessary_params)

[<function nlp_adversarial_attacks.reactdetect.feature_extraction.extractors.target_model_properties.tm_activation(text_list, target_model, device, logger=None, regions=None, quantiles=None, feature_list=None)>,
 <function nlp_adversarial_attacks.reactdetect.feature_extraction.extractors.target_model_properties.tm_gradient(text_list, labels, target_model, device='cpu', logger=None, regions=None, quantiles=None, feature_list=None)>,
 <function nlp_adversarial_attacks.reactdetect.feature_extraction.extractors.target_model_properties.tm_posterior(text_list, target_model, device, batch_size=32, logger=None, feature_list=None)>,
 <function nlp_adversarial_attacks.reactdetect.feature_extraction.extractors.target_model_properties.tm_saliency(text_list, labels, target_model, device, saliency_type='simple_gradient', logger=None, regions=None, quantiles=None, feature_list=None)>]

['device', 'target_model', 'labels', 'text_list']

In [22]:
idx = 0
perturbed_text = df.at[idx, "perturbed_text"]
perturbed_output = np.argmax(df.at[idx, "perturbed_output"])
res = fe(
    return_dict=True,
    text_list=pd.Series([perturbed_text]),
    labels=pd.Series([perturbed_output]),
    target_model=target_model,
    device=CUDA_DEVICE,
    regions=regions,
)
res



{'tm_activation': (['tm_activation_mean_layer0_region0',
   'tm_activation_var_layer0_region0',
   'tm_activation_quant0_layer0_region0',
   'tm_activation_quant1_layer0_region0',
   'tm_activation_quant2_layer0_region0',
   'tm_activation_mean_layer1_region0',
   'tm_activation_var_layer1_region0',
   'tm_activation_quant0_layer1_region0',
   'tm_activation_quant1_layer1_region0',
   'tm_activation_quant2_layer1_region0',
   'tm_activation_mean_layer2_region0',
   'tm_activation_var_layer2_region0',
   'tm_activation_quant0_layer2_region0',
   'tm_activation_quant1_layer2_region0',
   'tm_activation_quant2_layer2_region0',
   'tm_activation_mean_layer3_region0',
   'tm_activation_var_layer3_region0',
   'tm_activation_quant0_layer3_region0',
   'tm_activation_quant1_layer3_region0',
   'tm_activation_quant2_layer3_region0',
   'tm_activation_mean_layer4_region0',
   'tm_activation_var_layer4_region0',
   'tm_activation_quant0_layer4_region0',
   'tm_activation_quant1_layer4_region0',


In [23]:
for extractor_name in res.keys():
    _, values = res[extractor_name][0], res[extractor_name][1]
    holder[idx]["deliverable"][extractor_name] = values
holder[idx]["num_successful_loop"] += 1

del target_model

In [24]:
loop_num = 4  # 4 extactor pipes

print("=" * 40)
print("--- all done")

keys_to_rm = []
for h in holder.keys():
    if holder[h]["num_successful_loop"] == loop_num:
        pass
    else:
        keys_to_rm.append(h)
_failed_extraction_count = 0
for _failed_extraction_count, k in enumerate(keys_to_rm):
    del holder[k]
print("total failed extraction: ", _failed_extraction_count, "out of", len(holder))

--- all done
total failed extraction:  1 out of 1


In [25]:
def show_sample_instance(holder, index):
    """
    un-mutating printing util
    """
    out = {
        "num_successful_loop": holder[index]["num_successful_loop"],
        "primary_key": holder[index]["primary_key"],
        "unique_id": holder[index]["unique_id"],
        "deliverable": {},
    }

    for feat_name in holder[index]["deliverable"].keys():
        feat_shape = "arr/list of shape: " + str(
            np.array(holder[index]["deliverable"][feat_name]).shape
        )
        out["deliverable"][feat_name] = feat_shape
    display(out)

In [26]:
print("a sample holder value for sanity check")
sample_holder_item_key = list(holder.keys())[0]
show_sample_instance(holder, sample_holder_item_key)

a sample holder value for sanity check


{'num_successful_loop': 4,
 'primary_key': ['clean',
  'none',
  12,
  'sentiment',
  'distilcamembert',
  'allocine',
  12],
 'unique_id': '227889344748385074993418811187107806614',
 'deliverable': {'tp_avg_word_length': 'arr/list of shape: (1, 20)',
  'tp_bert': 'arr/list of shape: (1, 768)',
  'tp_is_first_word_lowercase': 'arr/list of shape: (1, 1)',
  'tp_num_alpha_chars': 'arr/list of shape: (1, 1)',
  'tp_num_cased_letters': 'arr/list of shape: (1, 4)',
  'tp_num_cased_word_switches': 'arr/list of shape: (1, 1)',
  'tp_num_chars': 'arr/list of shape: (1, 1)',
  'tp_num_digits': 'arr/list of shape: (1, 1)',
  'tp_num_lowercase_after_punctuation': 'arr/list of shape: (1, 1)',
  'tp_num_mixed_case_words': 'arr/list of shape: (1, 1)',
  'tp_num_multi_spaces': 'arr/list of shape: (1, 1)',
  'tp_num_non_ascii': 'arr/list of shape: (1, 1)',
  'tp_num_punctuation': 'arr/list of shape: (1, 1)',
  'tp_num_single_lowercase_letters': 'arr/list of shape: (1, 1)',
  'tp_num_words': 'arr/list 