# scripts/encode_main.py

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from argparse import Namespace
from pathlib import Path

import joblib
import pandas as pd

from nlp_adversarial_attacks.batch_encoding.encode_samplewise_features import (
    encode_all_properties,
)
from nlp_adversarial_attacks.utils.file_io import mkfile_if_dne
from nlp_adversarial_attacks.utils.pandas_ops import (
    restrict_max_instance_for_class,
    show_df_stats,
)

In [3]:
# Run configuration for this notebook, built directly as a Namespace so the
# cells below can read every option as an attribute (args.<option>).
args = Namespace(
    target_model="distilcamembert",
    target_model_dataset="allocine",
    target_model_train_dataset="allocine",
    attack_name="ALL",
    max_clean_instance=0,
    tp_model="sentence-transformers/bert-base-nli-mean-tokens",
    lm_perplexity_model="gpt2",
    lm_proba_model="roberta-base",
    target_model_name_or_path="baptiste-pasquier/distilcamembert-allocine",
    test=True,
    disable_tqdm=False,
)

In [4]:
# io
# Load the concatenated dataset into the notebook-global DF that every later
# cell filters and encodes. The path is relative, so it resolves against the
# kernel's current working directory.
print("--- reading csv")
DF = pd.read_csv("data_tcab/whole_catted_dataset.csv")
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which suggests
# the CSV was written with its index — confirm whether it should be dropped.
print()
print("--- stats before filtering")
# show_df_stats comes from nlp_adversarial_attacks.utils.pandas_ops; its result
# is printed as-is (per-column value counts in the recorded output).
print(show_df_stats(DF))
# Trailing expression: display the frame as the cell output.
DF

--- reading csv

--- stats before filtering
total_instances: 28913, 
attack_name: {'clean': 20000, 'bae': 1217, 'deepwordbug': 1636, 'input_reduction': 2000, 'pwws': 741, 'textbugger': 1663, 'textfooler': 1656}, 
target_model_dataset: {'allocine': 28913}, 
target_model: {'distilcamembert': 28913}, 
status: {'clean': 20000, 'success': 8913}, 
attack_toolchain: {'none': 20000, 'textattack': 8913}, 
scenario: {'sentiment': 28913}, 



Unnamed: 0,scenario,target_model_dataset,target_model,attack_toolchain,attack_name,original_text,perturbed_text,original_output,perturbed_output,status,target_model_train_dataset,test_index,ground_truth,num_queries,frac_words_changed,attacked_all_instances,test_ndx,original_text_identifier,dataset
0,sentiment,allocine,distilcamembert,none,clean,"Magnifique épopée, une belle histoire, touchan...","Magnifique épopée, une belle histoire, touchan...",[4.70960862e-04 9.99529039e-01],[4.70960862e-04 9.99529039e-01],clean,allocine,0,1,0,0.000000,True,0,0,allocine
1,sentiment,allocine,distilcamembert,none,clean,Je n'ai pas aimé mais pourtant je lui mets 2 é...,Je n'ai pas aimé mais pourtant je lui mets 2 é...,[0.62056075 0.37943925],[0.62056075 0.37943925],clean,allocine,1,0,0,0.000000,True,1,1,allocine
2,sentiment,allocine,distilcamembert,none,clean,Un dessin animé qui brille par sa féerie et se...,Un dessin animé qui brille par sa féerie et se...,[0.00619002 0.99380998],[0.00619002 0.99380998],clean,allocine,2,1,0,0.000000,True,2,2,allocine
3,sentiment,allocine,distilcamembert,none,clean,"Si c'est là le renouveau du cinéma français, c...","Si c'est là le renouveau du cinéma français, c...",[9.99725874e-01 2.74126018e-04],[9.99725874e-01 2.74126018e-04],clean,allocine,3,0,0,0.000000,True,3,3,allocine
4,sentiment,allocine,distilcamembert,none,clean,Et pourtant on s’en Doutait !Second volet très...,Et pourtant on s’en Doutait !Second volet très...,[9.99682566e-01 3.17434332e-04],[9.99682566e-01 3.17434332e-04],clean,allocine,4,0,0,0.000000,True,4,4,allocine
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28908,sentiment,allocine,distilcamembert,textattack,textfooler,"Disons-le tout net, Quelques heures de printem...","Disons-le tout net, Quelques heures la printem...","[0.050305888056755066, 0.9496940970420837]","[0.5313349962234497, 0.4686650037765503]",success,allocine,2063,1,132,0.034091,True,2063,2063,allocine
28909,sentiment,allocine,distilcamembert,textattack,textfooler,"A la sortie de ce film, un seul sentiment m'a ...","A la sortie de ce film, un seul feeling m'a en...","[0.0006030555232428014, 0.9993969202041626]","[0.6683938503265381, 0.3316061794757843]",success,allocine,2064,1,516,0.084158,True,2064,2064,allocine
28910,sentiment,allocine,distilcamembert,textattack,textfooler,"""Quelque chose ne vas pas chez Esther"". Effect...","""Quelque chose ne vas pas chez Esther"". Effect...","[0.0005869793239980936, 0.999413013458252]","[0.5802204608917236, 0.41977953910827637]",success,allocine,2065,1,508,0.106509,True,2065,2065,allocine
28911,sentiment,allocine,distilcamembert,textattack,textfooler,Un gros carton des années 70's . Quand Annie G...,Un gros carton des années 70's . Quand Annie G...,"[0.9876392483711243, 0.012360676191747189]","[0.45313963294029236, 0.54686039686203]",success,allocine,2066,0,155,0.031746,True,2066,2066,allocine


In [5]:
# compute
# Restrict DF to the rows matching the configured target model, evaluation
# dataset and training dataset, then optionally narrow by attack name:
#   - "ALL"          keeps every attack, including the "clean" rows
#   - "ALLBUTCLEAN"  keeps every attack except the "clean" rows
#   - anything else  keeps only the rows whose attack_name equals that value
print("--- filtering dataframe")

# Conditions common to all three modes. They are built once so the branches
# cannot drift apart: the single-attack branch used to test
# target_model_dataset twice and never tested target_model_train_dataset.
target_mask = (
    (DF["target_model_dataset"] == args.target_model_dataset)
    & (DF["target_model_train_dataset"] == args.target_model_train_dataset)
    & (DF["target_model"] == args.target_model)
)

if args.attack_name == "ALL":
    print("--- attack name is ALL, using all attacks")
    DF = DF[target_mask]
elif args.attack_name == "ALLBUTCLEAN":
    print("--- attack name is ALLBUTCLEAN, using all attacks but clean")
    DF = DF[target_mask & (DF["attack_name"] != "clean")]
else:
    # A specific attack was requested: keep only its rows.
    DF = DF[target_mask & (DF["attack_name"] == args.attack_name)]

print(" done , instance distribution: ")
print(show_df_stats(DF))
# Trailing expression: display the filtered frame as the cell output.
DF

--- filtering dataframe
--- attack name is ALL, using all attacks
 done , instance distribution: 
total_instances: 28913, 
attack_name: {'clean': 20000, 'bae': 1217, 'deepwordbug': 1636, 'input_reduction': 2000, 'pwws': 741, 'textbugger': 1663, 'textfooler': 1656}, 
target_model_dataset: {'allocine': 28913}, 
target_model: {'distilcamembert': 28913}, 
status: {'clean': 20000, 'success': 8913}, 
attack_toolchain: {'none': 20000, 'textattack': 8913}, 
scenario: {'sentiment': 28913}, 



Unnamed: 0,scenario,target_model_dataset,target_model,attack_toolchain,attack_name,original_text,perturbed_text,original_output,perturbed_output,status,target_model_train_dataset,test_index,ground_truth,num_queries,frac_words_changed,attacked_all_instances,test_ndx,original_text_identifier,dataset
0,sentiment,allocine,distilcamembert,none,clean,"Magnifique épopée, une belle histoire, touchan...","Magnifique épopée, une belle histoire, touchan...",[4.70960862e-04 9.99529039e-01],[4.70960862e-04 9.99529039e-01],clean,allocine,0,1,0,0.000000,True,0,0,allocine
1,sentiment,allocine,distilcamembert,none,clean,Je n'ai pas aimé mais pourtant je lui mets 2 é...,Je n'ai pas aimé mais pourtant je lui mets 2 é...,[0.62056075 0.37943925],[0.62056075 0.37943925],clean,allocine,1,0,0,0.000000,True,1,1,allocine
2,sentiment,allocine,distilcamembert,none,clean,Un dessin animé qui brille par sa féerie et se...,Un dessin animé qui brille par sa féerie et se...,[0.00619002 0.99380998],[0.00619002 0.99380998],clean,allocine,2,1,0,0.000000,True,2,2,allocine
3,sentiment,allocine,distilcamembert,none,clean,"Si c'est là le renouveau du cinéma français, c...","Si c'est là le renouveau du cinéma français, c...",[9.99725874e-01 2.74126018e-04],[9.99725874e-01 2.74126018e-04],clean,allocine,3,0,0,0.000000,True,3,3,allocine
4,sentiment,allocine,distilcamembert,none,clean,Et pourtant on s’en Doutait !Second volet très...,Et pourtant on s’en Doutait !Second volet très...,[9.99682566e-01 3.17434332e-04],[9.99682566e-01 3.17434332e-04],clean,allocine,4,0,0,0.000000,True,4,4,allocine
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28908,sentiment,allocine,distilcamembert,textattack,textfooler,"Disons-le tout net, Quelques heures de printem...","Disons-le tout net, Quelques heures la printem...","[0.050305888056755066, 0.9496940970420837]","[0.5313349962234497, 0.4686650037765503]",success,allocine,2063,1,132,0.034091,True,2063,2063,allocine
28909,sentiment,allocine,distilcamembert,textattack,textfooler,"A la sortie de ce film, un seul sentiment m'a ...","A la sortie de ce film, un seul feeling m'a en...","[0.0006030555232428014, 0.9993969202041626]","[0.6683938503265381, 0.3316061794757843]",success,allocine,2064,1,516,0.084158,True,2064,2064,allocine
28910,sentiment,allocine,distilcamembert,textattack,textfooler,"""Quelque chose ne vas pas chez Esther"". Effect...","""Quelque chose ne vas pas chez Esther"". Effect...","[0.0005869793239980936, 0.999413013458252]","[0.5802204608917236, 0.41977953910827637]",success,allocine,2065,1,508,0.106509,True,2065,2065,allocine
28911,sentiment,allocine,distilcamembert,textattack,textfooler,Un gros carton des années 70's . Quand Annie G...,Un gros carton des années 70's . Quand Annie G...,"[0.9876392483711243, 0.012360676191747189]","[0.45313963294029236, 0.54686039686203]",success,allocine,2066,0,155,0.031746,True,2066,2066,allocine


In [6]:
# Optionally cap the number of "clean" rows. The branch is skipped whenever
# args.max_clean_instance is 0 or negative, leaving DF untouched.
if args.max_clean_instance > 0:
    print("--- dropping clean instance to ", args.max_clean_instance)
    # NOTE(review): this "done" message is printed before the clipping call
    # below has run; the stats themselves are printed only after it returns.
    print(" done , instance distribution: ")
    # restrict_max_instance_for_class is a project helper from
    # nlp_adversarial_attacks.utils.pandas_ops; presumably it keeps at most
    # max_instance_per_class rows of the named attack class — TODO confirm
    # how the kept rows are chosen (first N vs. random sample).
    DF = restrict_max_instance_for_class(
        in_df=DF,
        attack_name_to_clip="clean",
        max_instance_per_class=args.max_clean_instance,
    )
    print(show_df_stats(DF))
# Trailing expression: display the (possibly clipped) frame as the cell output.
DF

Unnamed: 0,scenario,target_model_dataset,target_model,attack_toolchain,attack_name,original_text,perturbed_text,original_output,perturbed_output,status,target_model_train_dataset,test_index,ground_truth,num_queries,frac_words_changed,attacked_all_instances,test_ndx,original_text_identifier,dataset
0,sentiment,allocine,distilcamembert,none,clean,"Magnifique épopée, une belle histoire, touchan...","Magnifique épopée, une belle histoire, touchan...",[4.70960862e-04 9.99529039e-01],[4.70960862e-04 9.99529039e-01],clean,allocine,0,1,0,0.000000,True,0,0,allocine
1,sentiment,allocine,distilcamembert,none,clean,Je n'ai pas aimé mais pourtant je lui mets 2 é...,Je n'ai pas aimé mais pourtant je lui mets 2 é...,[0.62056075 0.37943925],[0.62056075 0.37943925],clean,allocine,1,0,0,0.000000,True,1,1,allocine
2,sentiment,allocine,distilcamembert,none,clean,Un dessin animé qui brille par sa féerie et se...,Un dessin animé qui brille par sa féerie et se...,[0.00619002 0.99380998],[0.00619002 0.99380998],clean,allocine,2,1,0,0.000000,True,2,2,allocine
3,sentiment,allocine,distilcamembert,none,clean,"Si c'est là le renouveau du cinéma français, c...","Si c'est là le renouveau du cinéma français, c...",[9.99725874e-01 2.74126018e-04],[9.99725874e-01 2.74126018e-04],clean,allocine,3,0,0,0.000000,True,3,3,allocine
4,sentiment,allocine,distilcamembert,none,clean,Et pourtant on s’en Doutait !Second volet très...,Et pourtant on s’en Doutait !Second volet très...,[9.99682566e-01 3.17434332e-04],[9.99682566e-01 3.17434332e-04],clean,allocine,4,0,0,0.000000,True,4,4,allocine
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28908,sentiment,allocine,distilcamembert,textattack,textfooler,"Disons-le tout net, Quelques heures de printem...","Disons-le tout net, Quelques heures la printem...","[0.050305888056755066, 0.9496940970420837]","[0.5313349962234497, 0.4686650037765503]",success,allocine,2063,1,132,0.034091,True,2063,2063,allocine
28909,sentiment,allocine,distilcamembert,textattack,textfooler,"A la sortie de ce film, un seul sentiment m'a ...","A la sortie de ce film, un seul feeling m'a en...","[0.0006030555232428014, 0.9993969202041626]","[0.6683938503265381, 0.3316061794757843]",success,allocine,2064,1,516,0.084158,True,2064,2064,allocine
28910,sentiment,allocine,distilcamembert,textattack,textfooler,"""Quelque chose ne vas pas chez Esther"". Effect...","""Quelque chose ne vas pas chez Esther"". Effect...","[0.0005869793239980936, 0.999413013458252]","[0.5802204608917236, 0.41977953910827637]",success,allocine,2065,1,508,0.106509,True,2065,2065,allocine
28911,sentiment,allocine,distilcamembert,textattack,textfooler,Un gros carton des années 70's . Quand Annie G...,Un gros carton des années 70's . Quand Annie G...,"[0.9876392483711243, 0.012360676191747189]","[0.45313963294029236, 0.54686039686203]",success,allocine,2066,0,155,0.031746,True,2066,2066,allocine


In [7]:
print("--- starting the encoding process")

# Test mode: keep only the first 10 rows of DF before encoding.
# NOTE(review): head() takes rows in their current order; in the frame shown
# above the leading rows are all "clean", so the test sample likely contains no
# attacked text — confirm that is acceptable for a smoke test.
if args.test:
    print("*** WARNING, TEST MODE, only encode 10 samples")
    DF = DF.head(10)

--- starting the encoding process


In [8]:
# encode everything. DF in, dict out
# encode_all_properties is imported from
# nlp_adversarial_attacks.batch_encoding.encode_samplewise_features. All model
# identifiers are forwarded unchanged from the args configuration cell.
# The recorded output shows four stages being run in turn: text properties,
# lm perplexity, lm proba and target-model (tm) properties.
# NOTE(review): the exact structure of HOLDER is defined by that module; the
# cells below only rely on HOLDER[0] being valid and on HOLDER being
# serialisable with joblib.
HOLDER = encode_all_properties(
    DF,
    tp_model=args.tp_model,
    lm_perplexity_model=args.lm_perplexity_model,
    lm_proba_model=args.lm_proba_model,
    tm_model=args.target_model,
    tm_model_name_or_path=args.target_model_name_or_path,
    disable_tqdm=args.disable_tqdm,
)

preparing text properties encoding
--- loading lm
AutoModel: sentence-transformers/bert-base-nli-mean-tokens
--- lm loaded


100%|██████████| 10/10 [00:01<00:00,  5.34it/s]


preparing lm perplexity encoding
--- loading lm
GPT2LMHeadModel: gpt2
--- lm loaded


100%|██████████| 10/10 [00:00<00:00, 11.43it/s]


preparing lm proba encoding
--- loading lm
RobertaForMaskedLM: roberta-base
--- lm loaded


100%|██████████| 10/10 [00:00<00:00, 10.50it/s]


preparing tm properties encoding
--- loading target model
baptiste-pasquier/distilcamembert-allocine (distilcamembert trained on allocine)
--- target model loaded


100%|██████████| 10/10 [00:07<00:00,  1.33it/s]

--- all done
total failed extraction:  0 out of 10
a sample holder value for sanity check


{'num_successful_loop': 4, 'primary_key': ['clean', 'none', 0, 'sentiment', 'distilcamembert', 'allocine', 0], 'unique_id': '166201344496741554700726886801777804838', 'deliverable': {'tp_avg_word_length': 'arr/list of shape: (1, 20)', 'tp_bert': 'arr/list of shape: (1, 768)', 'tp_is_first_word_lowercase': 'arr/list of shape: (1, 1)', 'tp_num_alpha_chars': 'arr/list of shape: (1, 1)', 'tp_num_cased_letters': 'arr/list of shape: (1, 4)', 'tp_num_cased_word_switches': 'arr/list of shape: (1, 1)', 'tp_num_chars': 'arr/list of shape: (1, 1)', 'tp_num_digits': 'arr/list of shape: (1, 1)', 'tp_num_lowercase_after_punctuation': 'arr/list of shape: (1, 1)', 'tp_num_mixed_case_words': 'arr/list of shape: (1, 1)', 'tp_num_multi_spaces': 'arr/list of shape: (1, 1)', 'tp_num_non_ascii': 'arr/list of shape: (1, 1)', 'tp_num_punctuation': 'arr/list of shape: (1, 1)', 'tp_num_single_lowercase_letters': 'arr/lis




In [9]:
# Sanity check: display the entry stored under key/index 0 of HOLDER.
# NOTE(review): this prints full feature arrays, which makes for a very long
# cell output.
HOLDER[0]

{'num_successful_loop': 4,
 'deliverable': {'tp_avg_word_length': array([[6.        , 6.5       , 3.75      , 5.5       , 8.25      ,
          5.        , 5.        , 3.75      , 5.        , 6.        ,
          4.        , 4.        , 2.        , 3.5       , 5.5       ,
          5.06666667, 5.86222222, 3.        , 5.        , 6.        ]]),
  'tp_bert': array([[-2.46832162e-01,  1.02264190e+00,  1.09695601e+00,
           3.72217983e-01,  1.15423977e+00, -4.32754695e-01,
           6.06811166e-01, -9.54026207e-02,  1.22876152e-01,
           1.59888908e-01, -9.61056650e-02,  6.87628150e-01,
           6.41708612e-01,  1.11244094e+00, -2.39751086e-01,
           1.27730739e+00, -5.13197064e-01, -4.44991380e-01,
           6.98416471e-01, -8.38227510e-01, -8.94286036e-01,
          -1.63102239e-01,  4.84245986e-01,  2.02718079e-01,
          -1.36358514e-01,  4.94234204e-01,  9.47127119e-02,
          -5.37724078e-01, -1.68650225e-01,  5.38653612e-01,
           1.28284052e-01,  1.08

In [10]:
def holder_to_disk(holder, fname):
    """
    Serialise `holder` to the file `fname` with joblib.

    `holder` is the nested dict of encoded features described in
    `encode_samplewise_features.py`; `fname` is the destination path.
    Nothing is returned.
    """
    joblib.dump(value=holder, filename=fname)

In [11]:
# Persist HOLDER under data_tcab/reprs/samplewise (relative to the kernel's
# working directory).
print("--- saving to disk")
# File name pattern: <target_model>_<target_model_dataset>_<attack_name>.joblib
file_name = (
    "_".join([args.target_model, args.target_model_dataset, args.attack_name])
    + ".joblib"
)
# Test runs get a "test_" prefix so they are written to a separate file from a
# full run with the same configuration.
if args.test:
    file_name = "test_" + file_name
file_path = Path("data_tcab/reprs/samplewise", file_name)
# mkfile_if_dne is a project helper from nlp_adversarial_attacks.utils.file_io;
# presumably it creates the file (and its parent directories) when missing —
# TODO confirm.
mkfile_if_dne(file_path)
holder_to_disk(HOLDER, file_path)
print(f"saved in {file_path}")

--- saving to disk
saved in data_tcab\reprs\samplewise\test_distilcamembert_allocine_ALL.joblib
