In [21]:
import pandas as pd

# Display name of each model -> the fragment used for it in evaluation file names.
model_file_mapping = dict([
    ('Gemma-2-9B', 'gemma2-9b'),
    ('Llama-3.1-8B', 'llama31-8b'),
    ('Phi-4-14B', 'phi-4-14b'),
])
# Row order of the result tables follows the mapping's insertion order.
model_names = list(model_file_mapping)

# Benchmark code (used in file names) -> column label shown in the tables.
sw_benchmarks = dict(
    swp='SensWord-Prefix',
    sws='SensWord-Suffix',
    swh='SensWord-Harmful',
    swb='SensWord-Benign',
)

# Defense label (used in captions) -> the code used for it in file names.
defense_conditions = dict(zip(
    ('No Defense', 'In-Context Defense', 'Self-Reminder Defense'),
    ('no', 'icd', 'srd'),
))

# Refusal rate of one evaluation file, as a percentage
def calculate_refusal(df):
    """Return the mean of the ``refusal_score`` column multiplied by 100.

    Args:
        df: DataFrame holding a ``refusal_score`` column (presumably 0/1
            flags or fractions in [0, 1] -- confirm against the evaluation
            files).

    Returns:
        The column mean scaled by 100.

    Raises:
        KeyError: if ``df`` has no ``refusal_score`` column.
    """
    scores = df['refusal_score']
    return scores.mean() * 100

# Build one refusal-rate table per defense condition.
# Rows are models, columns are benchmark display names, and each cell holds
# the value returned by calculate_refusal for the matching evaluation CSV.
refusal_tables = {}
for defense_name, prefix in defense_conditions.items():
    # dtype=float keeps the table numeric instead of object-typed; every cell
    # starts out as NaN, so a missing file just leaves its cell untouched.
    refusal_tables[defense_name] = pd.DataFrame(
        index=model_names,
        columns=list(sw_benchmarks.values()),
        dtype=float,
    )
    for model in model_names:
        suffix = model_file_mapping[model]
        for code, full_name in sw_benchmarks.items():
            path = f"./evaluations/{code}-{prefix}-{suffix}-eval.csv"
            # Only the read is guarded: a missing file is reported and skipped,
            # while errors in the score computation still surface.
            try:
                df = pd.read_csv(path)
            except FileNotFoundError:
                print(f"Missing file: {path}")
                continue
            refusal_tables[defense_name].loc[model, full_name] = calculate_refusal(df)

# Formatting helper
def format_refusal_table(df, caption):
    return df.style.format(
        {col: "{:.1f}%" for col in df.columns},
        na_rep="N/A"
    ).set_caption(caption).set_table_styles([
        {'selector': 'caption', 'props': [('font-size', '14px'), ('font-weight', 'bold')]}]
    )

# Show each table under a plain-text heading naming its defense condition
for defense_name in refusal_tables:
    table = refusal_tables[defense_name]
    print(f"\nRefusal Rate Table ({defense_name}):")
    styled = format_refusal_table(table, f"Refusal Rate - {defense_name}")
    display(styled)


Refusal Rate Table (No Defense):


Unnamed: 0,SensWord-Prefix,SensWord-Suffix,SensWord-Harmful,SensWord-Benign
Gemma-2-9B,4.0%,1.6%,99.8%,3.6%
Llama-3.1-8B,2.8%,1.2%,99.2%,1.7%
Phi-4-14B,0.5%,0.5%,92.1%,0.5%



Refusal Rate Table (In-Context Defense):


Unnamed: 0,SensWord-Prefix,SensWord-Suffix,SensWord-Harmful,SensWord-Benign
Gemma-2-9B,4.6%,1.8%,99.0%,3.8%
Llama-3.1-8B,2.9%,1.3%,100.0%,1.7%
Phi-4-14B,0.5%,0.6%,92.2%,0.3%



Refusal Rate Table (Self-Reminder Defense):


Unnamed: 0,SensWord-Prefix,SensWord-Suffix,SensWord-Harmful,SensWord-Benign
Gemma-2-9B,4.4%,1.8%,99.9%,3.8%
Llama-3.1-8B,2.9%,1.3%,100.0%,1.7%
Phi-4-14B,0.4%,0.4%,92.0%,0.4%
