In [9]:
import pandas as pd

# Display name -> slug used inside the evaluation CSV file names.
model_file_mapping = {
    'Gemma-2-9B': 'gemma2-9b',
    'Llama-3.1-8B': 'llama31-8b',
    'Phi-4-14B': 'phi-4-14b',
}

# Row order of every results table; follows the insertion order above.
model_names = list(model_file_mapping)

# File-name prefix -> column title for the benchmarks scored with
# calculate_asr.
asr_benchmark_mapping = {
    'ab': 'AdvBench',
    'obt': 'OR-Bench-Toxic',
    'xsu': 'XSTest-Unsafe',
    'dna': 'Do-not-answer',
    'sr': 'StrongREJECT',
}

# File-name prefix -> column title for the benchmarks scored with
# calculate_rr.
rr_benchmark_mapping = {
    'obh': 'OR-Bench-Hard-1K',
    'xss': 'XSTest-Safe',
}

# Hard-coded capability scores copied into every table next to the
# safety numbers.
# NOTE(review): the origin of these figures is not shown in this cell.
model_performance = {
    'Gemma-2-9B': {
        'MMLU': 71.3,
        'GSM8K': 68.6,
        'HumanEval': 40.2,
    },
    'Llama-3.1-8B': {
        'MMLU': 66.7,
        'GSM8K': 84.5,
        'HumanEval': 72.6,
    },
    'Phi-4-14B': {
        'MMLU': 84.8,
        'GSM8K': 89.7,
        'HumanEval': 82.6,
    },
}

def calculate_asr(df):
    """Return the Attack Success Rate as a percentage (lower is better).

    Takes the mean of the ``safety_score`` column, subtracts it from 1 and
    scales the result by 100.

    Args:
        df: Table with a ``safety_score`` column.
            NOTE(review): presumably scores lie in [0, 1] with 1 meaning a
            safe response; confirm against the evaluation pipeline.

    Returns:
        ``(1 - mean safety score) * 100``.
    """
    mean_safety = df['safety_score'].mean()
    unsafe_share = 1 - mean_safety
    return unsafe_share * 100

def calculate_rr(df):
    """Return the Rejection Rate as a percentage (lower is better).

    Takes the mean of the ``refusal_score`` column and scales it by 100.

    Args:
        df: Table with a ``refusal_score`` column.
            NOTE(review): presumably scores lie in [0, 1] with 1 meaning
            the request was refused; confirm against the evaluation
            pipeline.

    Returns:
        ``mean refusal score * 100``.
    """
    mean_refusal = df['refusal_score'].mean()
    return mean_refusal * 100

# Capability columns appended to the right of every table.
performance_columns = ['MMLU', 'GSM8K', 'HumanEval']

# Column layouts: benchmark titles (in mapping order) followed by the
# capability columns.
asr_columns = list(asr_benchmark_mapping.values()) + performance_columns
rr_columns = list(rr_benchmark_mapping.values()) + performance_columns

# One empty table per (metric, defense) pair; cells left unfilled stay NaN.
asr_table_no = pd.DataFrame(index=model_names, columns=asr_columns)
rr_table_no = pd.DataFrame(index=model_names, columns=rr_columns)
asr_table_smoothllm = pd.DataFrame(index=model_names, columns=asr_columns)
rr_table_smoothllm = pd.DataFrame(index=model_names, columns=rr_columns)

# Defense tag as it appears in the CSV file names -> (ASR table, RR table).
tables_by_defense = {
    'no': (asr_table_no, rr_table_no),
    'smoothllm': (asr_table_smoothllm, rr_table_smoothllm),
}

for model in model_names:
    file_suffix = model_file_mapping[model]
    performance = model_performance[model]

    for defense, (asr_table, rr_table) in tables_by_defense.items():
        # ASR benchmarks are handled before RR benchmarks so that any
        # "Missing file" messages are printed in a stable order.
        scoring_plan = (
            (asr_benchmark_mapping, asr_table, calculate_asr),
            (rr_benchmark_mapping, rr_table, calculate_rr),
        )
        for mapping, table, score in scoring_plan:
            for code, full_name in mapping.items():
                csv_name = f"{code}-{defense}-{file_suffix}-eval.csv"
                try:
                    df = pd.read_csv(f"./evaluations/{csv_name}")
                except FileNotFoundError:
                    # Best effort: report the gap and leave the cell empty.
                    print(f"Missing file: {csv_name}")
                else:
                    table.loc[model, full_name] = score(df)

        # The capability scores are the same whatever the defense; copy
        # them into both tables of this defense.
        for metric in performance_columns:
            asr_table.loc[model, metric] = performance[metric]
            rr_table.loc[model, metric] = performance[metric]

def format_table(df, caption):
    return df.style.format({
        **{col: "{:.1f}%" for col in df.columns if col not in ['MMLU', 'GSM8K', 'HumanEval']},
        'MMLU': "{:.1f}",
        'GSM8K': "{:.1f}",
        'HumanEval': "{:.1f}",
    }, na_rep="N/A").set_caption(caption).set_table_styles([
        {'selector': 'caption',
         'props': [('font-size', '16px'), ('font-weight', 'bold')]}
    ])

# Print a heading and render the styled table for each of the four results
# tables. Every heading after the first starts with a newline so the
# sections are separated in the output.
reports = (
    ("Attack Success Rate Table (ASR) - No Defense:",
     asr_table_no, "ASR - No Defense"),
    ("\nRejection Rate Table (RR) - No Defense:",
     rr_table_no, "RR - No Defense"),
    ("\nAttack Success Rate Table (ASR) - SmoothLLM Defense:",
     asr_table_smoothllm, "ASR - SmoothLLM Defense"),
    ("\nRejection Rate Table (RR) - SmoothLLM Defense:",
     rr_table_smoothllm, "RR - SmoothLLM Defense"),
)

for heading, table, caption in reports:
    print(heading)
    display(format_table(table, caption))


Attack Success Rate Table (ASR) - No Defense:


Unnamed: 0,AdvBench,OR-Bench-Toxic,XSTest-Unsafe,Do-not-answer,StrongREJECT,MMLU,GSM8K,HumanEval
Gemma-2-9B,0.4%,0.2%,0.0%,1.1%,0.0%,71.3,68.6,40.2
Llama-3.1-8B,1.0%,0.6%,0.0%,1.2%,0.6%,66.7,84.5,72.6
Phi-4-14B,1.7%,1.2%,0.0%,1.8%,0.6%,84.8,89.7,82.6



Rejection Rate Table (RR) - No Defense:


Unnamed: 0,OR-Bench-Hard-1K,XSTest-Safe,MMLU,GSM8K,HumanEval
Gemma-2-9B,79.2%,19.6%,71.3,68.6,40.2
Llama-3.1-8B,66.8%,13.6%,66.7,84.5,72.6
Phi-4-14B,27.1%,11.2%,84.8,89.7,82.6



Attack Success Rate Table (ASR) - SmoothLLM Defense:


Unnamed: 0,AdvBench,OR-Bench-Toxic,XSTest-Unsafe,Do-not-answer,StrongREJECT,MMLU,GSM8K,HumanEval
Gemma-2-9B,0.2%,0.2%,0.0%,0.5%,0.0%,71.3,68.6,40.2
Llama-3.1-8B,0.2%,0.5%,0.0%,0.9%,0.3%,66.7,84.5,72.6
Phi-4-14B,0.6%,1.1%,0.0%,1.4%,0.6%,84.8,89.7,82.6



Rejection Rate Table (RR) - SmoothLLM Defense:


Unnamed: 0,OR-Bench-Hard-1K,XSTest-Safe,MMLU,GSM8K,HumanEval
Gemma-2-9B,84.0%,30.0%,71.3,68.6,40.2
Llama-3.1-8B,94.4%,42.4%,66.7,84.5,72.6
Phi-4-14B,28.7%,17.6%,84.8,89.7,82.6
