In [46]:
import pandas as pd

# Display names of the evaluated models; used as the row index of both tables.
model_names = ['Gemma-2-9B', 'Llama-3.1-8B', 'Phi-4-14B']

# File-name code -> display name for the benchmarks scored as attack success rate.
asr_benchmark_mapping = {
    'ab': 'AdvBench',
    'obt': 'OR-Bench-Toxic',
    'xsu': 'XSTest-Unsafe',
    'dna': 'Do-not-answer',
    'sr': 'StrongREJECT',
}

# File-name code -> display name for the benchmarks scored as rejection rate.
rr_benchmark_mapping = {
    'obh': 'OR-Bench-Hard-1K',
    'xss': 'XSTest-Safe',
}

# Model-specific performance metrics, copied verbatim into both result tables.
model_performance = {
    'Gemma-2-9B': {'MMLU': 71.3, 'GSM8K': 68.6, 'HumanEval': 40.2},
    'Llama-3.1-8B': {'MMLU': 66.7, 'GSM8K': 84.5, 'HumanEval': 72.6},
    'Phi-4-14B': {'MMLU': 84.8, 'GSM8K': 89.7, 'HumanEval': 82.6},
}

# Metric columns appended after the benchmark columns of each table.
# HumanEval is declared here because it is written for every model, so the
# tables no longer depend on .loc silently adding the column on assignment.
performance_metrics = ['MMLU', 'GSM8K', 'HumanEval']

# Initialize tables: one row per model, every cell empty (NaN) until filled in.
# Benchmark columns come from the mappings so the two cannot drift apart.
asr_table = pd.DataFrame(
    index=model_names,
    columns=[*asr_benchmark_mapping.values(), *performance_metrics],
)

rr_table = pd.DataFrame(
    index=model_names,
    columns=[*rr_benchmark_mapping.values(), *performance_metrics],
)

# Display name -> model identifier embedded in the evaluation CSV file names.
model_file_mapping = {
    'Gemma-2-9B': 'gemma2-9b',
    'Llama-3.1-8B': 'llama31-8b',
    'Phi-4-14B': 'phi-4-14b',
}

def calculate_asr(df):
    """Return the Attack Success Rate of an evaluation frame, in percent.

    Computed as ``(1 - mean(df['safety_score'])) * 100``; lower is better.
    Assumes ``safety_score`` lies in [0, 1] -- TODO confirm against the
    evaluation files.
    """
    mean_safety = df['safety_score'].mean()
    unsafe_fraction = 1 - mean_safety
    return unsafe_fraction * 100

def calculate_rr(df):
    """Return the Rejection Rate of an evaluation frame, in percent.

    Computed as ``mean(df['refusal_score']) * 100``; lower is better.
    """
    mean_refusal = df['refusal_score'].mean()
    return mean_refusal * 100

# Fill both result tables, one model (row) at a time.
for model in model_names:
    file_suffix = model_file_mapping[model]
    performance = model_performance[model]

    # Harmful benchmarks (ASR) are processed first, then over-refusal
    # benchmarks (RR). Both groups use the same file-name pattern, so a single
    # loop builds the name once and reuses it for the read and the warning.
    # Previously the RR warning dropped the "-no-" infix and named a file that
    # had never been looked for.
    # NOTE(review): if the over-refusal files are really stored without the
    # "-no-" infix, the read path (not the message) is what needs changing.
    benchmark_groups = (
        (asr_table, asr_benchmark_mapping, calculate_asr),
        (rr_table, rr_benchmark_mapping, calculate_rr),
    )
    for table, mapping, score_fn in benchmark_groups:
        for code, full_name in mapping.items():
            filename = f"{code}-no-{file_suffix}-eval.csv"
            try:
                df = pd.read_csv(f"./evaluations/{filename}")
            except FileNotFoundError:
                # Best effort: leave the cell empty and keep going.
                print(f"Missing file: {filename}")
                continue
            table.loc[model, full_name] = score_fn(df)

    # Copy the model's fixed performance metrics into both tables.
    for metric in ['MMLU', 'GSM8K', 'HumanEval']:
        asr_table.loc[model, metric] = performance[metric]
        rr_table.loc[model, metric] = performance[metric]

def format_table(df):
    return df.style.format({
        **{col: "{:.1f}%" for col in df.columns if col not in ['MMLU', 'GSM8K', 'HumanEval']},
        'MMLU': "{:.1f}",
        'GSM8K': "{:.1f}",
        'HumanEval': "{:.1f}",
    }, na_rep="N/A").set_caption("Model Evaluation Results").set_table_styles([
        {'selector': 'caption',
         'props': [('font-size', '16px'), ('font-weight', 'bold')]}
    ])

# Render each styled table under its heading, ASR first and then RR.
for heading, table in (
    ("Attack Success Rate Table (ASR):", asr_table),
    ("\nRejection Rate Table (RR):", rr_table),
):
    print(heading)
    display(format_table(table))

Missing file: obh-gemma2-9b-eval.csv
Missing file: xss-gemma2-9b-eval.csv
Missing file: obh-llama31-8b-eval.csv
Missing file: xss-llama31-8b-eval.csv
Missing file: obh-phi-4-14b-eval.csv
Missing file: xss-phi-4-14b-eval.csv
Attack Success Rate Table (ASR):


Unnamed: 0,AdvBench,OR-Bench-Toxic,XSTest-Unsafe,Do-not-answer,StrongREJECT,MMLU,GSM8K,HumanEval
Gemma-2-9B,0.4%,0.2%,0.0%,1.1%,0.0%,71.3,68.6,40.2
Llama-3.1-8B,1.0%,0.6%,0.0%,1.2%,0.6%,66.7,84.5,72.6
Phi-4-14B,1.7%,1.2%,0.0%,1.8%,0.6%,84.8,89.7,82.6



Rejection Rate Table (RR):


Unnamed: 0,OR-Bench-Hard-1K,XSTest-Safe,MMLU,GSM8K,HumanEval
Gemma-2-9B,,,71.3,68.6,40.2
Llama-3.1-8B,,,66.7,84.5,72.6
Phi-4-14B,,,84.8,89.7,82.6
