### Jailbreak Tax vs Success Rate - Plot function

In [9]:
def _parse_parenthesized_percentage(entry):
    """
    Return the percentage embedded in a statistics string as a float.

    The value is read from the second whitespace-separated token after
    stripping parentheses and the percent sign, e.g. "3 (30.00%)" -> 30.0.
    """
    return float(entry.split()[1].strip('()%'))


def _infer_method_type(file_path, known_methods):
    """
    Return the first known method whose name occurs in a component of file_path.

    Matching is a case-insensitive substring test against each path component,
    tried in the iteration order of known_methods; "Other" is returned when
    nothing matches. Because this is a plain substring test, a method name
    that is a prefix of a directory name (e.g. "PAIR" inside
    "PAIR_no_question_modification") also matches; pass explicit labels to
    the plotting function when that is not what you want.
    """
    lowered_parts = [part.lower() for part in Path(file_path).parts]
    for method in known_methods:
        needle = method.lower()
        if any(needle in part for part in lowered_parts):
            return method
    return "Other"


def _load_success_rate_and_tax(file_path):
    """
    Read one statistics JSON file and return (success_rate, jb_tax) in percent.

    success_rate is 100 minus the refusal percentage found under
    data['totals']['after_jailbreak']['refusal']. jb_tax is taken from
    data['jb_tax'] when present; otherwise it is computed as the relative drop
    from the unaligned accuracy (data['totals']['unaligned']['correct']) to
    data['jb_util'], or NaN when the unaligned accuracy is not positive.
    """
    # JSON is UTF-8 by specification; do not rely on the locale's default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    totals = data['totals']
    success_rate = 100 - _parse_parenthesized_percentage(totals['after_jailbreak']['refusal'])

    if 'jb_tax' in data:
        jb_tax = float(data['jb_tax'].strip('%'))
    else:
        base_acc = _parse_parenthesized_percentage(totals['unaligned']['correct'])
        jb_util = float(data['jb_util'].strip('%'))
        # Guard against division by zero when the baseline accuracy is 0.
        jb_tax = ((base_acc - jb_util) / base_acc * 100) if base_acc > 0 else float('nan')

    return success_rate, jb_tax


def plot_jailbreak_tax_vs_success_rate(pattern_files, labels=None, output_path=None,
                                       display=True, xlim=(0.3, 0.9995), ylim=(-30, 100),
                                       print_summary=True):
    """
    Plot jailbreak tax versus successful jailbreak rate using a logit scale for the x-axis.

    Each input file contributes one scatter point. Points whose success rate
    falls outside xlim are clamped onto the nearest x-limit so they remain
    visible on the logit axis; the returned and printed values are unclamped.

    Args:
        pattern_files: List of paths to JSON statistics files. Each file must
            provide data['totals']['after_jailbreak']['refusal'] and either
            data['jb_tax'] or both data['totals']['unaligned']['correct'] and
            data['jb_util'].
        labels: Dictionary mapping file paths to display labels. Files without
            an entry get a label inferred from their path ("Other" if no known
            method name is found).
        output_path: Path to save the output plot. A .pdf and a .svg copy are
            written next to it using the same base name.
        display: Whether to display the plot
        xlim: x-axis limits (min and max as proportions between 0 and 1)
        ylim: y-axis limits (min and max percentage)
        print_summary: Whether to print a per-file table of method, success
            rate and jailbreak tax.

    Returns:
        Dictionary with the parallel lists 'success_rates' (percent),
        'jb_taxes' (percent) and 'method_types', in the order of pattern_files.

    Raises:
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If a file is not valid JSON.
        KeyError: If a required key is missing from a file.
    """
    # Define attack method groups and their visual properties
    method_groups = {
        'System Prompt JB': {'color': 'black', 'marker': 'D'},
        'AutoDAN': {'color': 'deepskyblue', 'marker': 'X'},
        'Finetune': {'color': 'red', 'marker': 'o'},
        'GCG': {'color': 'green', 'marker': 'p'},
        'Many-shot': {'color': 'brown', 'marker': '*'},
        'MultiJail': {'color': 'magenta', 'marker': 's'},
        'PAIR': {'color': 'orange', 'marker': '^'},
        'PAIR (don\'t modify)': {'color': 'blue', 'marker': 'v'},
        'TAP': {'color': 'lime', 'marker': '<'}
    }
    # Style used for any method type that is not listed in method_groups.
    fallback_props = {'color': 'gray', 'marker': 'o'}

    # Increase font sizes for better readability.
    # NOTE: this updates matplotlib's global rcParams, so the settings persist
    # after this function returns.
    plt.rcParams.update({
        'font.size': 14,
        'axes.labelsize': 16,
        'axes.titlesize': 16,
        'xtick.labelsize': 14,
        'ytick.labelsize': 14,
        'legend.fontsize': 12,
    })

    # Parallel lists, one entry per input file.
    success_rates = []
    jb_taxes = []
    method_types = []
    visual_props = []

    for file_path in pattern_files:
        # An explicit label wins; otherwise try to infer the method from the path.
        if labels and file_path in labels:
            method_type = labels[file_path]
        else:
            method_type = _infer_method_type(file_path, method_groups)

        success_rate, jb_tax = _load_success_rate_and_tax(file_path)

        success_rates.append(success_rate)
        jb_taxes.append(jb_tax)
        method_types.append(method_type)
        visual_props.append(method_groups.get(method_type, fallback_props))

    # Create the plot. The figure handle is kept so it can be closed reliably,
    # even if saving or showing raises.
    fig = plt.figure(figsize=(10, 8), dpi=300)
    try:
        # Convert success rates to proportions for the logit scale, clamping
        # them into xlim (0 and 1 are not representable on a logit axis).
        success_rate_props = [min(max(rate / 100, xlim[0]), xlim[1]) for rate in success_rates]

        # Plot each point
        for x_value, y_value, props in zip(success_rate_props, jb_taxes, visual_props):
            plt.scatter(x_value, y_value,
                        s=150,
                        color=props['color'],
                        marker=props['marker'],
                        alpha=0.8)

        # Add horizontal line at y=0
        plt.axhline(y=0, color='gray', linestyle='-', alpha=0.3)

        # Set axis scales and limits - use logit scale for x-axis
        plt.xscale('logit')
        plt.xlim(xlim)
        plt.ylim(ylim)

        # Format x-axis ticks to show percentage values
        plt.gca().xaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: '{:.1f}'.format(x * 100)))

        # Add axis labels
        plt.xlabel('Successful Jailbreak Rate (%)', fontsize=16, fontweight='bold')
        plt.ylabel('Jailbreak Tax (%)', fontsize=16, fontweight='bold')

        # Legend: known methods first (in method_groups order), then any other
        # plotted method types (first-seen order) so gray fallback points are
        # identified too.
        plotted_methods = set(method_types)
        legend_specs = [(method, props) for method, props in method_groups.items()
                        if method in plotted_methods]
        legend_specs.extend((method, fallback_props)
                            for method in dict.fromkeys(method_types)
                            if method not in method_groups)
        legend_elements = [
            plt.Line2D([0], [0], marker=props['marker'], color=props['color'],
                       markersize=10, linestyle='', label=method)
            for method, props in legend_specs
        ]

        # Skip the legend entirely when nothing was plotted.
        if legend_elements:
            plt.legend(handles=legend_elements, loc='upper right', fontsize=12, ncol=2)

        # Save the plot if output path is provided
        if output_path:
            fig.savefig(output_path, bbox_inches='tight', dpi=600)

            # Also save as PDF and SVG
            base_path = os.path.splitext(output_path)[0]
            fig.savefig(f"{base_path}.pdf", bbox_inches='tight', format='pdf')
            fig.savefig(f"{base_path}.svg", bbox_inches='tight', format='svg')

        # Display the plot if requested
        if display:
            plt.show()
    finally:
        plt.close(fig)

    # Create return data dictionary
    result_data = {
        'success_rates': success_rates,
        'jb_taxes': jb_taxes,
        'method_types': method_types
    }

    # Print summary statistics if requested
    if print_summary:
        print("\nSummary Statistics:")
        print("Method | Success Rate | Jailbreak Tax")
        print("-" * 50)

        for method, success_rate, jb_tax in zip(method_types, success_rates, jb_taxes):
            print(f"{method:20} | {success_rate:11.2f}% | {jb_tax:12.2f}%")

    # Return the data for further analysis
    return result_data

### Add result paths & generate plots

In [None]:
# Statistics files to plot: one combined_statistics.json per attack run.
pattern_files = [
    "../Results_/EvilMath/claude-3.5-haiku/example_run/Combined/no_pseudo_alignment/many-shot/0-10_questions/combined_statistics.json",
    "../Results_/EvilMath/claude-3.5-haiku/example_run/Combined/no_pseudo_alignment/MultiJail/0-10_questions/combined_statistics.json",
    "../Results_/EvilMath/claude-3.5-haiku/example_run/Combined/no_pseudo_alignment/PAIR/0-10_questions/combined_statistics.json",
    "../Results_/EvilMath/claude-3.5-haiku/example_run/Combined/no_pseudo_alignment/PAIR_no_question_modification/0-10_questions/combined_statistics.json",
    # Add more files as needed
]

# Optional display labels, matched to pattern_files by position
# (first label goes with the first file, and so on).
labels = dict(zip(pattern_files, (
    "Many-shot",
    "MultiJail",
    "PAIR",
    "PAIR (don't modify)",
)))

# Draw the figure, write it to disk, and keep the plotted values for later use.
plot_data = plot_jailbreak_tax_vs_success_rate(
    pattern_files,
    labels=labels,
    output_path="jailbreak_tax_vs_success_rate.png",
)