In [4]:
import pandas as pd

# Display names of the evaluated models; they double as the row index of both tables.
model_names = ['Gemma-2-9B', 'Llama-3.1-8B', 'Phi-4-14B']

# Display name -> model slug embedded in the evaluation CSV file names.
model_file_mapping = {
    'Gemma-2-9B': 'gemma2-9b',
    'Llama-3.1-8B': 'llama31-8b',
    'Phi-4-14B': 'phi-4-14b',
}

# Benchmarks reported in the ASR table: file-name code -> column name.
asr_benchmark_mapping = {
    'ab': 'AdvBench',
    'obt': 'OR-Bench-Toxic',
    'xsu': 'XSTest-Unsafe',
    'dna': 'Do-not-answer',
    'sr': 'StrongREJECT',
}

# Benchmarks reported in the RR table: file-name code -> column name.
rr_benchmark_mapping = {
    'obh': 'OR-Bench-Hard-1K',
    'xss': 'XSTest-Safe',
}

# Model-specific performance metrics (MMLU and GSM8K scores), hard-coded per model.
model_performance = {
    'Gemma-2-9B': {'MMLU': 62.0, 'GSM8K': 65.0},
    'Llama-3.1-8B': {'MMLU': 59.1, 'GSM8K': 69.9},
    'Phi-4-14B': {'MMLU': 76.9, 'GSM8K': 82.3},
}

# Initialize empty result tables: one row per model, one column per benchmark
# (taken from the mappings above, in insertion order) plus the two metrics.
asr_table = pd.DataFrame(
    index=model_names,
    columns=list(asr_benchmark_mapping.values()) + ['MMLU', 'GSM8K'],
)

rr_table = pd.DataFrame(
    index=model_names,
    columns=list(rr_benchmark_mapping.values()) + ['MMLU', 'GSM8K'],
)

def calculate_asr(df):
    """Return the Attack Success Rate for an evaluation frame, as a percentage.

    Computed as one minus the mean of the ``safety_score`` column, times 100.
    Lower is better. Presumably scores lie in [0, 1]; this is not checked here.
    """
    mean_safety = df['safety_score'].mean()
    return (1 - mean_safety) * 100

def calculate_rr(df):
    """Return the Rejection Rate for an evaluation frame, as a percentage.

    Computed as the mean of the ``refusal_score`` column, times 100.
    Lower is better.
    """
    mean_refusal = df['refusal_score'].mean()
    return mean_refusal * 100

# Fill both result tables one model (row) at a time.
for model in model_names:
    file_suffix = model_file_mapping[model]
    performance = model_performance[model]

    # Harmful benchmarks go to the ASR table first, then over-refusal
    # benchmarks go to the RR table; each group has its own scoring function.
    for mapping, table, score in (
        (asr_benchmark_mapping, asr_table, calculate_asr),
        (rr_benchmark_mapping, rr_table, calculate_rr),
    ):
        for code, full_name in mapping.items():
            try:
                df = pd.read_csv(f"./evaluations/{code}-{file_suffix}-eval.csv")
            except FileNotFoundError:
                # A missing evaluation file is reported and its cell stays empty.
                print(f"Missing file: {code}-{file_suffix}-eval.csv")
            else:
                table.loc[model, full_name] = score(df)

    # The same MMLU / GSM8K values are copied into both tables.
    for metric in ('MMLU', 'GSM8K'):
        for table in (asr_table, rr_table):
            table.loc[model, metric] = performance[metric]

def format_table(df):
    return df.style.format({
        **{col: "{:.1f}%" for col in df.columns if col not in ['MMLU', 'GSM8K']},
        'MMLU': "{:.1f}",
        'GSM8K': "{:.1f}"
    }, na_rep="N/A").set_caption("Model Evaluation Results").set_table_styles([
        {'selector': 'caption',
         'props': [('font-size', '16px'), ('font-weight', 'bold')]}
    ])

# Show each formatted table under its heading: ASR first, then RR.
for heading, table in (
    ("Attack Success Rate Table (ASR):", asr_table),
    ("\nRejection Rate Table (RR):", rr_table),
):
    print(heading)
    display(format_table(table))

Attack Success Rate Table (ASR):


Unnamed: 0,AdvBench,OR-Bench-Toxic,XSTest-Unsafe,Do-not-answer,StrongREJECT,MMLU,GSM8K
Gemma-2-9B,1.2%,1.2%,1.5%,1.0%,0.6%,62.0,65.0
Llama-3.1-8B,1.2%,0.8%,0.0%,1.8%,1.6%,59.1,69.9
Phi-4-14B,3.5%,1.8%,1.5%,1.4%,4.5%,76.9,82.3



Rejection Rate Table (RR):


Unnamed: 0,OR-Bench-Hard-1K,XSTest-Safe,MMLU,GSM8K
Gemma-2-9B,85.2%,29.2%,62.0,65.0
Llama-3.1-8B,61.0%,76.8%,59.1,69.9
Phi-4-14B,49.8%,12.8%,76.9,82.3
