In [45]:
import pandas as pd

# Display name of each model -> the slug that appears in its result file names.
model_file_mapping = dict([
    ('Gemma-2-9B', 'gemma2-9b'),
    ('Llama-3.1-8B', 'llama31-8b'),
    ('Phi-4-14B', 'phi-4-14b'),
])

# Row labels of every results table, in the order the models are declared above.
model_names = list(model_file_mapping)

# Short benchmark code used in file names -> column header shown in the tables.
rr_benchmark_mapping = dict(
    obh='OR-Bench-Hard-1K',
    xss='XSTest-Safe',
)

def calculate_rr(df):
    """Return the mean of the ``refusal_score`` column, scaled by 100.

    Args:
        df: A DataFrame that contains a ``refusal_score`` column.

    Returns:
        The column mean multiplied by 100 (a percentage when the scores
        are 0/1 flags).
    """
    refusal_scores = df['refusal_score']
    mean_refusal = refusal_scores.mean()
    return mean_refusal * 100

# Each defense condition (table title) carries the prefix that identifies
# it inside the evaluation file names.
defense_conditions = {
    label: {'prefix': file_prefix}
    for label, file_prefix in (
        ('No Defense', 'no'),
        ('SmoothLLM Defense', 'smoothllm'),
        ('Self-Reminder Defense', 'srd'),
        ('In-Context Defense', 'icd'),
    )
}

# Columns holding the general-performance scores; they are appended after
# the rejection-rate benchmark columns in every table.
perf_metrics = ['MMLU', 'GSM8K']


def _load_rejection_rate(file_path):
    """Return the rejection rate (%) computed from *file_path*, or None.

    A missing file or a file without a ``refusal_score`` column is reported
    on stdout and yields None, so one bad file cannot abort the whole
    table build.
    """
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Missing file: {file_path}")
        return None
    if 'refusal_score' not in df.columns:
        print(f"Unexpected column headers in: {file_path}")
        return None
    return calculate_rr(df)


def _load_perf_score(perf_file):
    """Return the first ``Value`` entry of *perf_file* times 100, or None.

    The file must have both a ``Metric`` and a ``Value`` column. A missing
    file, unexpected headers, or a file with no data rows is reported on
    stdout and yields None.
    """
    try:
        df_perf = pd.read_csv(perf_file)
    except FileNotFoundError:
        print(f"Missing performance file: {perf_file}")
        return None
    if 'Metric' not in df_perf.columns or 'Value' not in df_perf.columns:
        print(f"Unexpected column headers in: {perf_file}")
        return None
    if df_perf.empty:
        # Header-only file: indexing the first row would raise IndexError.
        print(f"No rows in performance file: {perf_file}")
        return None
    # Only the first row is read; presumably the file holds a single
    # metric/value pair (TODO confirm against the perf-eval writer).
    return df_perf['Value'].values[0] * 100


# Create dictionary to store the RR tables for each defense type
rr_tables = {}

for defense_name, defense_info in defense_conditions.items():
    prefix = defense_info['prefix']

    # One row per model, one column per RR benchmark plus the perf metrics.
    table = pd.DataFrame(
        index=model_names,
        columns=list(rr_benchmark_mapping.values()) + perf_metrics,
    )
    rr_tables[defense_name] = table

    for model in model_names:
        file_suffix = model_file_mapping[model]
        # Performance files are named without the parameter-count suffix.
        # This does not depend on the metric, so compute it once per model.
        model_label = (
            file_suffix
            .removesuffix('-9b')
            .removesuffix('-8b')
            .removesuffix('-14b')
        )

        # Populate RR table: loop through each RR benchmark
        for code, full_name in rr_benchmark_mapping.items():
            file_path = f"./evaluations/{code}-{prefix}-{file_suffix}-eval.csv"
            rejection_rate = _load_rejection_rate(file_path)
            if rejection_rate is not None:
                table.loc[model, full_name] = rejection_rate

        # Add general performance results (None marks an unavailable score)
        for metric in perf_metrics:
            perf_file = f"./perfevals/{metric.lower()}-{prefix}-{model_label}-result.csv"
            table.loc[model, metric] = _load_perf_score(perf_file)

def format_table(df, caption):
    return df.style.format({
        **{col: "{:.1f}%" for col in df.columns if col not in ['MMLU', 'GSM8K']},
        'MMLU': "{:.1f}",
        'GSM8K': "{:.1f}",
    }, na_rep="N/A").set_caption(caption).set_table_styles([
        {'selector': 'caption',
         'props': [('font-size', '16px'), ('font-weight', 'bold')]}
    ])

# Render one styled table per defense condition, in declaration order
for defense_name in defense_conditions:
    heading = f"\nRejection Rate Table (RR) - {defense_name}:"
    print(heading)
    styled_table = format_table(rr_tables[defense_name], f"RR - {defense_name}")
    display(styled_table)



Rejection Rate Table (RR) - No Defense:


Unnamed: 0,OR-Bench-Hard-1K,XSTest-Safe,MMLU,GSM8K
Gemma-2-9B,79.2%,19.6%,67.2,58.7
Llama-3.1-8B,66.8%,13.6%,64.7,78.4
Phi-4-14B,27.1%,11.2%,81.7,78.7



Rejection Rate Table (RR) - SmoothLLM Defense:


Unnamed: 0,OR-Bench-Hard-1K,XSTest-Safe,MMLU,GSM8K
Gemma-2-9B,84.0%,30.0%,59.0,53.4
Llama-3.1-8B,74.6%,31.2%,55.6,56.0
Phi-4-14B,28.7%,17.6%,72.5,71.3



Rejection Rate Table (RR) - Self-Reminder Defense:


Unnamed: 0,OR-Bench-Hard-1K,XSTest-Safe,MMLU,GSM8K
Gemma-2-9B,83.4%,27.2%,67.2,58.3
Llama-3.1-8B,70.1%,15.6%,64.7,78.4
Phi-4-14B,28.0%,11.6%,81.5,78.3



Rejection Rate Table (RR) - In-Context Defense:


Unnamed: 0,OR-Bench-Hard-1K,XSTest-Safe,MMLU,GSM8K
Gemma-2-9B,83.5%,26.8%,67.2,58.7
Llama-3.1-8B,70.0%,14.4%,63.9,78.4
Phi-4-14B,28.5%,11.2%,81.7,78.7
