In [2]:
import pandas as pd

# 1) Models to compare (the list includes the LoRA and full-SFT fine-tunes).
#    The order given here is the row order of each results table.
model_names = [
    "Llama-3.1-8B",
    "Llama-3.1-8B-LoRA-SFT",
    "Llama-3.1-8B-Full-SFT",
]

# 2) Benchmark code (as used in evaluation file names) -> display name.
#    Insertion order determines the column order of the tables.

# Benchmarks reported as attack success rate (ASR).
asr_benchmark_mapping = {
    "ab": "AdvBench",
    "obt": "OR-Bench-Toxic",
    "xsu": "XSTest-Unsafe",
    "dna": "Do-not-answer",
    "sr": "StrongREJECT",
}

# Benchmarks reported as rejection rate (RR).
rr_benchmark_mapping = {
    "xss": "XSTest-Safe",
    "wgm": "WildGuard-Mix-Safe",
    "obh": "OR-Bench-Hard",
}

# 3) Model display name -> the suffix that identifies the model in
#    evaluation file names.
model_file_mapping = {
    "Llama-3.1-8B": "llama31-8b",
    "Llama-3.1-8B-LoRA-SFT": "llama31-8b-lora-sft",
    "Llama-3.1-8B-Full-SFT": "llama31-8b-full-sft",
}

# 4) Metrics calculators
def calculate_asr(df):
    """Return the attack success rate for *df* as a percentage.

    The value is ``(1 - mean(safety_score)) * 100``, computed over the
    ``safety_score`` column of *df*.
    NOTE(review): presumably scores lie in [0, 1] with 1 meaning "safe" —
    confirm against the evaluation script.
    """
    mean_safety = df['safety_score'].mean()
    unsafe_share = 1 - mean_safety
    return unsafe_share * 100

def calculate_rr(df):
    """Return the rejection rate for *df* as a percentage.

    The value is ``mean(refusal_score) * 100``, computed over the
    ``refusal_score`` column of *df*.
    NOTE(review): presumably scores lie in [0, 1] with 1 meaning "refused" —
    confirm against the evaluation script.
    """
    mean_refusal = df['refusal_score'].mean()
    return mean_refusal * 100

# 5) Defense settings: label -> options.
#    The label appears in the printed headings and table captions; 'prefix'
#    is the defense token embedded in each evaluation file name.
defense_conditions = {
    "": {"prefix": "no"},
}

# 6) Build one ASR table and one RR table per defense setting, then fill
#    every cell from its evaluation CSV (no performance columns).
asr_tables = {}
rr_tables = {}

for defense_name, info in defense_conditions.items():
    prefix = info['prefix']

    # Start from empty frames: one row per model, one column per benchmark
    # display name.
    asr_tables[defense_name] = pd.DataFrame(
        index=model_names, columns=list(asr_benchmark_mapping.values())
    )
    rr_tables[defense_name] = pd.DataFrame(
        index=model_names, columns=list(rr_benchmark_mapping.values())
    )

    # (benchmark mapping, table to fill, metric function); for each model
    # the ASR benchmarks are handled first, then the RR benchmarks.
    metric_specs = (
        (asr_benchmark_mapping, asr_tables[defense_name], calculate_asr),
        (rr_benchmark_mapping, rr_tables[defense_name], calculate_rr),
    )

    for model in model_names:
        suffix = model_file_mapping[model]

        for mapping, table, metric in metric_specs:
            for code, disp in mapping.items():
                path = f"./evaluations/{code}-{prefix}-{suffix}-eval.csv"
                try:
                    df = pd.read_csv(path)
                    value = metric(df)
                except FileNotFoundError:
                    # No evaluation file for this cell: store None instead.
                    value = None
                table.loc[model, disp] = value

# 7) Display nicely
def fmt_percent(x):
    """Format *x* as a percentage string with one decimal place.

    Values for which ``pd.notna`` is false (e.g. ``None`` or NaN) are
    rendered as ``"N/A"``.
    """
    if not pd.notna(x):
        return "N/A"
    return f"{x:.1f}%"

# DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map. Use `map` when this pandas version provides it and fall
# back to `applymap` otherwise, so the cell runs without a FutureWarning on
# new versions and still works on old ones.
_format_cells = getattr(pd.DataFrame, "map", None) or pd.DataFrame.applymap

# For each defense setting, print a heading and show the ASR table followed
# by the RR table, with every cell rendered through fmt_percent.
for defense_name in defense_conditions:
    print(f"\n=== Attack Success Rate (ASR) — {defense_name} ===")
    display(
        _format_cells(asr_tables[defense_name], fmt_percent)
            .style
            .set_caption(f"ASR — {defense_name}")
    )

    print(f"\n=== Rejection Rate (RR) — {defense_name} ===")
    display(
        _format_cells(rr_tables[defense_name], fmt_percent)
            .style
            .set_caption(f"RR — {defense_name}")
    )



=== Attack Success Rate (ASR) —  ===


Unnamed: 0,AdvBench,OR-Bench-Toxic,XSTest-Unsafe,Do-not-answer,StrongREJECT
Llama-3.1-8B,1.0%,0.6%,0.0%,1.2%,0.6%
Llama-3.1-8B-LoRA-SFT,0.6%,0.6%,0.0%,1.1%,0.6%
Llama-3.1-8B-Full-SFT,0.0%,0.6%,0.0%,0.6%,0.6%



=== Rejection Rate (RR) —  ===


Unnamed: 0,XSTest-Safe,WildGuard-Mix-Safe,OR-Bench-Hard
Llama-3.1-8B,13.6%,23.7%,66.8%
Llama-3.1-8B-LoRA-SFT,9.6%,21.6%,76.9%
Llama-3.1-8B-Full-SFT,10.0%,20.5%,69.4%
