In [6]:
import pandas as pd

# Display names of the evaluated models; they are used as the row index of
# every result table and as keys of the per-model dictionaries below.
model_names = [
    'Gemma-2-9B',
    'Llama-3.1-8B',
    'Phi-4-14B',
]

# Benchmark code (as it appears in the evaluation file names) -> column label,
# for the benchmarks scored with calculate_asr.
asr_benchmark_mapping = {
    'ab': 'AdvBench',
    'obt': 'OR-Bench-Toxic',
    'xsu': 'XSTest-Unsafe',
    'dna': 'Do-not-answer',
    'sr': 'StrongREJECT',
    'di': 'DeepInception',
}

# Benchmark code -> column label, for the benchmarks scored with calculate_rr.
rr_benchmark_mapping = {
    'obh': 'OR-Bench-Hard-1K',
    'xss': 'XSTest-Safe',
}

# MMLU / GSM8K scores per model, one row per model: (name, MMLU, GSM8K).
# NOTE(review): presumably accuracy percentages measured elsewhere -- confirm.
model_performance = {
    name: {'MMLU': mmlu, 'GSM8K': gsm8k}
    for name, mmlu, gsm8k in (
        ('Gemma-2-9B', 67.2, 58.7),
        ('Llama-3.1-8B', 64.0, 74.7),
        ('Phi-4-14B', 81.7, 78.7),
    )
}

# Same layout as model_performance; these are the scores attached to the
# 'SmoothLLM Defense' condition.
model_performance_smoothllm = {
    name: {'MMLU': mmlu, 'GSM8K': gsm8k}
    for name, mmlu, gsm8k in (
        ('Gemma-2-9B', 59.0, 53.4),
        ('Llama-3.1-8B', 55.6, 51.0),
        ('Phi-4-14B', 72.5, 71.3),
    )
}

# Model display name -> model slug used in the evaluation file names.
model_file_mapping = dict(zip(
    model_names,
    ('gemma2-9b', 'llama31-8b', 'phi-4-14b'),
))

def calculate_asr(df: pd.DataFrame) -> float:
    """Return the attack success rate of an evaluation frame, in percent.

    Computed as ``(1 - mean(df['safety_score'])) * 100``.

    NOTE(review): this presumably expects ``safety_score`` to be 1 for a safe
    response and 0 for an unsafe one -- confirm against the evaluation files.

    Raises:
        KeyError: if ``df`` has no ``'safety_score'`` column.
    """
    mean_safety = df['safety_score'].mean()
    return (1 - mean_safety) * 100

def calculate_rr(df: pd.DataFrame) -> float:
    """Return the mean of the ``'refusal_score'`` column scaled to percent.

    NOTE(review): this presumably expects ``refusal_score`` to be 1 when the
    model refused and 0 otherwise -- confirm against the evaluation files.

    Raises:
        KeyError: if ``df`` has no ``'refusal_score'`` column.
    """
    refusal_fraction = df['refusal_score'].mean()
    return refusal_fraction * 100

# Defense conditions, keyed by display label. Each entry carries the file-name
# prefix identifying the condition and the MMLU/GSM8K table reported with it.
# Only the SmoothLLM condition has its own score table; the other three share
# the baseline model_performance dictionary (the same object, not a copy).
defense_conditions = {
    label: {'prefix': file_prefix, 'performance': score_table}
    for label, file_prefix, score_table in (
        ('No Defense', 'no', model_performance),
        ('SmoothLLM Defense', 'smoothllm', model_performance_smoothllm),
        ('Self-Reminder Defense', 'srd', model_performance),
        ('In-Context Defense', 'icd', model_performance),
    )
}

# Result tables per defense condition, keyed by the defense display label.
asr_tables = {}
rr_tables = {}

# Extra columns appended to every table after the benchmark columns.
metric_columns = ['MMLU', 'GSM8K']

for defense_name, defense_info in defense_conditions.items():
    # One row per model; cells stay empty (NaN) until a value is written.
    asr_table = pd.DataFrame(
        index=model_names,
        columns=[*asr_benchmark_mapping.values(), *metric_columns],
    )
    asr_tables[defense_name] = asr_table
    rr_table = pd.DataFrame(
        index=model_names,
        columns=[*rr_benchmark_mapping.values(), *metric_columns],
    )
    rr_tables[defense_name] = rr_table

    prefix = defense_info['prefix']
    performance_dict = defense_info['performance']

    # (target table, benchmark code -> column label, scoring function);
    # the ASR benchmarks are processed before the RR benchmarks.
    fill_plan = (
        (asr_table, asr_benchmark_mapping, calculate_asr),
        (rr_table, rr_benchmark_mapping, calculate_rr),
    )

    for model in model_names:
        file_suffix = model_file_mapping[model]

        for table, benchmarks, score_fn in fill_plan:
            for code, full_name in benchmarks.items():
                file_path = f"./evaluations/{code}-{prefix}-{file_suffix}-eval.csv"
                try:
                    df = pd.read_csv(file_path)
                    table.loc[model, full_name] = score_fn(df)
                except FileNotFoundError:
                    # A missing evaluation file is reported and its cell is
                    # left empty; the run continues.
                    print(f"Missing file: {file_path}")

        # The same MMLU/GSM8K values are written into both tables.
        model_scores = performance_dict[model]
        for metric in metric_columns:
            metric_value = model_scores[metric]
            asr_table.loc[model, metric] = metric_value
            rr_table.loc[model, metric] = metric_value

def format_table(df, caption):
    return df.style.format({
        **{col: "{:.1f}%" for col in df.columns if col not in ['MMLU', 'GSM8K']},
        'MMLU': "{:.1f}",
        'GSM8K': "{:.1f}",
    }, na_rep="N/A").set_caption(caption).set_table_styles([
        {'selector': 'caption',
         'props': [('font-size', '16px'), ('font-weight', 'bold')]}
    ])

# Render both tables for every defense condition, in the order the conditions
# are defined: a printed heading followed by the styled table.
for defense_name in defense_conditions:
    print(f"\nAttack Success Rate Table (ASR) - {defense_name}:")
    asr_view = format_table(asr_tables[defense_name], f"ASR - {defense_name}")
    display(asr_view)

    print(f"\nRejection Rate Table (RR) - {defense_name}:")
    rr_view = format_table(rr_tables[defense_name], f"RR - {defense_name}")
    display(rr_view)



Attack Success Rate Table (ASR) - No Defense:


Unnamed: 0,AdvBench,OR-Bench-Toxic,XSTest-Unsafe,Do-not-answer,StrongREJECT,DeepInception,MMLU,GSM8K
Gemma-2-9B,0.4%,0.2%,0.0%,1.1%,0.0%,9.8%,67.2,58.7
Llama-3.1-8B,1.0%,0.6%,0.0%,1.2%,0.6%,13.8%,64.0,74.7
Phi-4-14B,1.7%,1.2%,0.0%,1.8%,0.6%,25.6%,81.7,78.7



Rejection Rate Table (RR) - No Defense:


Unnamed: 0,OR-Bench-Hard-1K,XSTest-Safe,MMLU,GSM8K
Gemma-2-9B,79.2%,19.6%,67.2,58.7
Llama-3.1-8B,66.8%,13.6%,64.0,74.7
Phi-4-14B,27.1%,11.2%,81.7,78.7
