In [None]:
import pandas as pd

In [None]:
# TextFooler attack logs, one CSV per feature set (RoBERTa / statistical).
# The extra RoBERTa log (./attack_logs/log_rob_c10_extra.csv) is not included.
roberta_textfooler_log = "./attack_logs/log_rob_c10_tf_final.csv"
stat_textfooler_log = "./attack_logs/textattack_stat_svm_final.csv"
all_roberta_attacks_textfooler = pd.read_csv(roberta_textfooler_log)
all_stat_attacks_textfooler = pd.read_csv(stat_textfooler_log)

In [None]:
# DeepWordBug attack logs.
# The statistical-feature log is stored in two CSV files (the file names suggest
# samples up to 80 and samples 80 to 200 -- confirm), so the parts are stacked
# into one frame. pd.concat replaces DataFrame.append, which was removed in
# pandas 2.0; like the original call, it keeps each part's own row index.
all_stat_attacks_dwb = pd.concat(
    [
        pd.read_csv("./attack_logs/textattack_stat_svm_dwb_to_80.csv"),
        pd.read_csv("./attack_logs/textattack_stat_svm_dwb_80_to_200.csv"),
    ]
)
# Only the first 200 rows of the RoBERTa log are used
# (presumably to match the size of the statistical log -- confirm).
all_roberta_attacks_dwb = pd.read_csv("./attack_logs/log_rob_c10_dwb_215.csv")[:200]

In [None]:
def calculate_attack_metrics(df, attack_type, feature_type):
    """Summarise one attack log into success-rate and accuracy figures.

    Parameters
    ----------
    df : pandas.DataFrame
        Attack log with a ``result_type`` column whose values include
        ``"Successful"``, ``"Failed"`` and ``"Skipped"``.
    attack_type : str
        Label stored under ``"Attack Type"`` in the result.
    feature_type : str
        Label stored under ``"Features"`` in the result.

    Returns
    -------
    dict
        ``"Features"``, ``"Attack Type"`` and three metrics:

        * ``"Attack Success Rate"``: successful / (successful + failed)
        * ``"Pre-Attack Accuracy"``: 1 - skipped / number of rows
        * ``"Post-Attack Accuracy"``: failed / number of rows

        A metric is NaN when its denominator is zero (no attempted attacks,
        or an empty log).
    """
    attack_breakdown = df['result_type'].value_counts()
    # value_counts() only lists outcomes that actually occur, so a log with,
    # say, no skipped samples has no "Skipped" entry; treat those as zero
    # instead of raising KeyError.
    n_successful = attack_breakdown.get("Successful", 0)
    n_failed = attack_breakdown.get("Failed", 0)
    n_skipped = attack_breakdown.get("Skipped", 0)

    n_total = df.shape[0]
    n_attempted = n_successful + n_failed
    undefined = float("nan")

    # Guard the divisions: a zero denominator makes the metric undefined.
    success_rate = n_successful / n_attempted if n_attempted else undefined
    pre_attack_acc = 1.0 - n_skipped / n_total if n_total else undefined
    post_attack_acc = n_failed / n_total if n_total else undefined

    return {
        "Features": feature_type,
        "Attack Type": attack_type,
        "Attack Success Rate": success_rate,
        "Pre-Attack Accuracy": pre_attack_acc,
        "Post-Attack Accuracy": post_attack_acc,
    }

In [None]:
# Empty results table; its columns fix the layout of the final summary.
attack_summary = pd.DataFrame(
    columns=[
        "Features",
        "Attack Type",
        "Attack Success Rate",
        "Pre-Attack Accuracy",
        "Post-Attack Accuracy",
        "ΔMAUVE",
    ]
)

In [None]:
# Build the summary from a list of records in a single step: DataFrame.append
# was removed in pandas 2.0. Reusing attack_summary.columns preserves the column
# order (including the not-yet-filled "ΔMAUVE" column), and rebuilding the frame
# makes this cell safe to re-run without accumulating duplicate rows.
attack_summary = pd.DataFrame(
    [
        calculate_attack_metrics(all_roberta_attacks_textfooler, "TextFooler", "RoBERTa"),
        calculate_attack_metrics(all_roberta_attacks_dwb, "DeepWordBug", "RoBERTa"),
        calculate_attack_metrics(all_stat_attacks_textfooler, "TextFooler", "Statistical"),
        calculate_attack_metrics(all_stat_attacks_dwb, "DeepWordBug", "Statistical"),
    ],
    columns=attack_summary.columns,
)

In [None]:
# Report the success rate to three decimals; the accuracy columns are left as computed.
rate_column = "Attack Success Rate"
attack_summary[rate_column] = attack_summary[rate_column].round(3)

In [None]:
# Emit the summary as LaTeX source (index column omitted) so the raw markup can be copied.
summary_latex = attack_summary.to_latex(index=False)
print(summary_latex)

# MAUVE scores

In [None]:
import mauve 

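We compute MAUVE between the real reference text (the WebText validation set) and the original texts (REAL→ORIGINAL), then between the same reference text and the perturbed texts (REAL→PERTURBED), and report the delta, i.e. the perturbed score minus the original score.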

In [None]:
def _strip_attack_markers(texts):
    """Remove every literal ``[[`` and ``]]`` from the strings of a Series."""
    return texts.str.replace('[[', '', regex=False).str.replace(']]', '', regex=False)


def compute_avg_mauve_perturbed(
    df,
    reference_path="data/gpt-2-output-dataset/data/webtext.valid.jsonl",
    device_id=0,
    max_text_length=256,
):
    """Return the change in MAUVE caused by perturbing the texts in ``df``.

    MAUVE is computed twice against the same reference text: once for the
    original texts and once for the perturbed texts. Both scores are printed
    and their difference is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Attack log with ``original_text`` and ``perturbed_text`` string
        columns. The ``[[`` / ``]]`` annotation markers are stripped from both
        (presumably the attack framework's highlighting of changed words --
        confirm).
    reference_path : str
        JSON-lines file whose ``text`` field provides the reference text.
    device_id : int
        Forwarded to ``mauve.compute_mauve``.
    max_text_length : int
        Forwarded to ``mauve.compute_mauve``.

    Returns
    -------
    float
        MAUVE of the perturbed texts minus MAUVE of the original texts; a
        negative value means the perturbed texts score lower.
    """
    reference_text = pd.read_json(reference_path, lines=True)['text']

    # Remove annotations so only the plain text is scored.
    original_text = _strip_attack_markers(df['original_text'])
    perturbed_text = _strip_attack_markers(df['perturbed_text'])

    # NOTE(review): the reference text is passed to compute_mauve twice, so it is
    # presumably featurized twice per call -- consider caching if this is slow.
    mauve_options = dict(device_id=device_id, max_text_length=max_text_length, verbose=False)
    prior = mauve.compute_mauve(p_text=reference_text, q_text=original_text, **mauve_options)
    post = mauve.compute_mauve(p_text=reference_text, q_text=perturbed_text, **mauve_options)

    print(f"Prior MAUVE: {prior.mauve}")
    print(f"Post MAUVE: {post.mauve}")
    return post.mauve - prior.mauve

In [None]:
# MAUVE delta for the TextFooler attack log of the RoBERTa-feature model.
compute_avg_mauve_perturbed(all_roberta_attacks_textfooler)
# Output presumably recorded from an earlier run (two printed scores, then the returned delta):
# Prior MAUVE: 0.20184612877753388
# Post MAUVE: 0.17964426031295821
# -0.02220186846457567

In [None]:
# MAUVE delta for the DeepWordBug attack log of the RoBERTa-feature model.
compute_avg_mauve_perturbed(all_roberta_attacks_dwb)
# Output presumably recorded from an earlier run (two printed scores, then the returned delta):
# Prior MAUVE: 0.20184612877753388
# Post MAUVE: 0.17580721854104975
# -0.026038910236484136

In [None]:
# MAUVE delta for the TextFooler attack log of the statistical-feature model.
compute_avg_mauve_perturbed(all_stat_attacks_textfooler)
# Output presumably recorded from an earlier run (two printed scores, then the returned delta):
# Prior MAUVE: 0.20184612877753388
# Post MAUVE: 0.1681246795470045
# -0.03372144923052939

In [None]:
# MAUVE delta for the DeepWordBug attack log of the statistical-feature model.
compute_avg_mauve_perturbed(all_stat_attacks_dwb)
# Output presumably recorded from an earlier run (two printed scores, then the returned delta):
# Prior MAUVE: 0.20184612877753388
# Post MAUVE: 0.17387813075822658
# -0.0279679980193073