In [7]:


import os
import pickle
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import tqdm

# Function to load data from pickle files
def load_data(directory, max_files=500, seed=None):
    """Load spatially averaged feature vectors and labels from pickle files.

    A random subset of at most ``max_files`` ``.pkl`` files found directly in
    ``directory`` is read. Each file must unpickle to a mapping of
    ``agent_id -> (feature_map, agent_type)``. Entries whose ``agent_type``
    compares equal to 0 are skipped; every other entry contributes one sample.

    Args:
        directory: Folder containing the ``.pkl`` files.
        max_files: Upper bound on the number of files to sample. ``None``
            reads every file. Defaults to 500.
        seed: Optional seed for a dedicated random generator, making the file
            selection repeatable. When ``None`` the global NumPy random state
            is used (so ``np.random.seed`` set by the caller still applies).

    Returns:
        Tuple ``(features, labels)`` of NumPy arrays. ``features`` has one row
        per kept entry (``feature_map`` averaged over dims 1 and 2);
        ``labels`` holds ``agent_type`` when it is a plain ``int`` and 0
        otherwise. Both arrays are empty when nothing was loaded.
    """
    # Sort so the sampled subset depends only on the random state, not on the
    # arbitrary order in which os.listdir returns directory entries.
    all_files = sorted(f for f in os.listdir(directory) if f.endswith('.pkl'))
    if max_files is None:
        n_selected = len(all_files)
    else:
        n_selected = min(max_files, len(all_files))
    if n_selected <= 0:
        return np.array([]), np.array([])

    if seed is None:
        selected_files = np.random.choice(all_files, n_selected, replace=False)
    else:
        rng = np.random.default_rng(seed)
        selected_files = rng.choice(all_files, n_selected, replace=False)

    features = []
    labels = []
    for filename in tqdm.tqdm(selected_files, desc="Loading data"):
        # NOTE: pickle.load can execute arbitrary code; only point this at
        # trusted files.
        with open(os.path.join(directory, filename), 'rb') as f:
            data = pickle.load(f)

        # The file is closed before the (potentially slow) tensor processing.
        for agent_id, (feature_map, agent_type) in data.items():
            if agent_type == 0:
                continue
            # Average across dims 1 and 2 to get one vector per entry.
            # Assumes feature_map is a torch tensor laid out as
            # (channels, H, W) - TODO confirm against the data producer.
            avg_feature = feature_map.mean(dim=(1, 2)).cpu().numpy()
            features.append(avg_feature)
            # Anything that is not a plain int is mapped to label 0.
            labels.append(agent_type if type(agent_type) is int else 0)

    return np.array(features), np.array(labels)

# Load data
# load_data picks its random file subset via np.random.choice, i.e. from
# NumPy's global random state. Seed it first; otherwise every run embeds a
# different sample even though t-SNE itself is seeded.
SEED = 42
np.random.seed(SEED)
data_dir = '/data2/user2/senkang/CP-GuardBench/CP-GuardBench_RawData/generated/'
features, labels = load_data(data_dir)

# Perform t-SNE
tsne = TSNE(n_components=2, random_state=SEED)
tsne_results = tsne.fit_transform(features)

# Visualize t-SNE results (explicit figure/axes instead of pyplot global state)
fig, ax = plt.subplots(figsize=(10, 8))
scatter = ax.scatter(tsne_results[:, 0], tsne_results[:, 1], c=labels, cmap='viridis')
fig.colorbar(scatter, ax=ax)
ax.set_title('t-SNE visualization of feature maps')
ax.set_xlabel('t-SNE feature 1')
ax.set_ylabel('t-SNE feature 2')
fig.savefig('tsne_visualization.png')
plt.close(fig)

# Vectorised sum instead of Python's built-in sum over a NumPy array.
# NOTE(review): this equals a sample count only while labels are 0/1, and
# load_data assigns 0 to every non-int agent_type - confirm that the
# "ego"/"non-ego" wording matches what the labels actually encode.
label_total = int(labels.sum())
print(f"Total number of samples: {len(features)}")
print(f"Number of ego samples: {label_total}")
print(f"Number of non-ego samples: {len(labels) - label_total}")
print("t-SNE visualization saved as 'tsne_visualization.png'")



Loading data:   0%|          | 0/500 [00:00<?, ?it/s]

Loading data: 100%|██████████| 500/500 [00:06<00:00, 82.51it/s]


Total number of samples: 905
Number of ego samples: 405
Number of non-ego samples: 500
t-SNE visualization saved as 'tsne_visualization.png'


In [54]:
# Tally how many entries all pickle files under data_dir hold in total
data_dir = '/data2/user2/senkang/CP-GuardBench/CP-GuardBench_RawData/test/'
import tqdm
import os
import pickle

total_items = 0

for filename in tqdm.tqdm(os.listdir(data_dir)):
    # Anything that is not a pickle file is passed over.
    if not filename.endswith('.pkl'):
        continue
    pkl_path = os.path.join(data_dir, filename)
    with open(pkl_path, 'rb') as f:
        data = pickle.load(f)
    # len() of the unpickled object is that file's contribution.
    total_items = total_items + len(data)

print(f"Total number of items across all pickle files: {total_items}")


100%|██████████| 2139/2139 [00:02<00:00, 853.27it/s] 

Total number of items across all pickle files: 1800





In [49]:
import os
import pickle
import matplotlib.pyplot as plt
from collections import Counter
import tqdm
import torch

plt.rcParams['font.family'] = 'arial'

# Select the GPU with index 4 when CUDA is available, otherwise use the CPU.
# NOTE(review): `device` is only printed in this cell; nothing below uses it.
device = torch.device("cuda:4" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

data_dir = '/data2/user2/senkang/CP-GuardBench/CP-GuardBench_RawData/generated/'
item_counts = []

# Record how many entries each pickle file contains.
for filename in tqdm.tqdm(os.listdir(data_dir), desc="Processing files"):
    if filename.endswith('.pkl'):
        with open(os.path.join(data_dir, filename), 'rb') as f:
            data = pickle.load(f)
        item_counts.append(len(data))

# Fail with a clear message instead of a ZeroDivisionError further down.
if not item_counts:
    raise ValueError(f"No .pkl files found in {data_dir}")

# Count the occurrences of each number of items
count_dict = Counter(item_counts)

# Prepare data for plotting
numbers = range(3, 7)  # 3 to 6 items
counts = [count_dict.get(num, 0) for num in numbers]
percentages = [count / len(item_counts) * 100 for count in counts]

# Files whose size falls outside `numbers` are not plotted, so the shown
# percentages would no longer add up to 100 - make that visible.
uncovered = len(item_counts) - sum(counts)
if uncovered:
    print(f"Warning: {uncovered} files have an item count outside "
          f"{numbers.start}-{numbers.stop - 1} and are not shown")

# Create horizontal bar plot
plt.figure(figsize=(7, 3), dpi=300)
bars = plt.barh(numbers, percentages, edgecolor='none', color='#03045e')  # Horizontal bars

# Add percentage labels to the right of each bar
for bar in bars:
    width = bar.get_width()
    plt.text(width, bar.get_y() + bar.get_height()/2.,
             f'{width:.1f}%',
             ha='left', va='center', fontsize=8)

plt.ylabel('Number of Collaborators', fontsize=8)
plt.xlabel('Percentage (%)', fontsize=8)
plt.yticks(numbers, fontsize=8)
plt.xticks(fontsize=8)

# Adjust x-axis limit: 130% of the largest bar (room for the text labels),
# capped at 100.
max_percentage = max(percentages)
plt.xlim(0, min(max_percentage * 1.3, 100))
plt.grid(axis='x', linestyle='--', alpha=0.7)

# Save the plot
plt.tight_layout()
plt.savefig('item_distribution.png', dpi=300, bbox_inches='tight')
plt.close()

print("Distribution of number of collaborators")
# Pair values directly rather than indexing with a hard-coded `num - 3` offset.
for num, percentage in zip(numbers, percentages):
    print(f"{num} items: {percentage:.1f}%")
print("Bar plot saved as 'item_distribution.png'")


Using device: cuda:4


Processing files:   0%|          | 0/49275 [00:00<?, ?it/s]

Processing files: 100%|██████████| 49275/49275 [00:46<00:00, 1068.40it/s]


Distribution of number of collaborators
3 items: 4.6%
4 items: 46.0%
5 items: 29.9%
6 items: 19.5%
Bar plot saved as 'item_distribution.png'


In [52]:
# Analyze the proportion of agent_type==1 in each data point
import os
import pickle
import matplotlib.pyplot as plt
from collections import Counter
import tqdm
import torch
import numpy as np

agent_type_1_proportions = []

# NOTE: `data_dir` is not assigned in this cell; it must already have been set
# by a previously executed cell.
for filename in tqdm.tqdm(os.listdir(data_dir), desc="Processing files"):
    if filename.endswith('.pkl'):
        with open(os.path.join(data_dir, filename), 'rb') as f:
            data = pickle.load(f)
        # Per file: share of entries whose agent_type equals 1.
        total_agents = len(data)
        type_1_agents = sum(1 for _, (_, agent_type) in data.items() if agent_type == 1)
        proportion = type_1_agents / total_agents if total_agents > 0 else 0
        agent_type_1_proportions.append(proportion)

# Fail with a clear message instead of a ZeroDivisionError in the statistics.
if not agent_type_1_proportions:
    raise ValueError(f"No .pkl files found in {data_dir}")

# Calculate statistics
avg_proportion = sum(agent_type_1_proportions) / len(agent_type_1_proportions)
max_proportion = max(agent_type_1_proportions)
min_proportion = min(agent_type_1_proportions)

# Bin the per-file proportions with NumPy directly; no throwaway plt.hist
# that has to be cleared again just to obtain the counts.
counts, bins = np.histogram(agent_type_1_proportions, bins=20)

# Convert bin counts to the percentage of files falling into each bin.
total_files = counts.sum()
percentages = (counts / total_files) * 100

# Create histogram
plt.figure(figsize=(4, 7), dpi=300)
bars = plt.bar(bins[:-1], percentages, width=np.diff(bins), align="edge", color='#8ecae6')
# The x values are fractions in [0, 1], not percentages, so the axis label
# carries no "(%)" unit.
plt.xlabel('Attack Ratio', fontsize=8)
plt.ylabel('Percentage (%)', fontsize=8)

# Set y-axis ticks every 5 percentage points, up to just above the tallest bar
plt.yticks(range(0, int(max(percentages)) + 10, 5))

# Add grid
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Add vertical line for average
plt.axvline(avg_proportion, color='r', linestyle='dashed', linewidth=1, label=f'Average: {avg_proportion:.2f}')
plt.legend()

# Add value labels on top of each non-empty bar
for rect in bars:
    height = rect.get_height()
    if height > 0:
        plt.text(rect.get_x() + rect.get_width()/2., height,
                 f'{height:.1f}%',
                 ha='center', va='bottom', fontsize=8)

# Save the plot
plt.savefig('agent_type_1_distribution.png', dpi=300, bbox_inches='tight')
plt.close()

print(f"Average proportion of agent_type==1: {avg_proportion:.2f}")
print(f"Maximum proportion of agent_type==1: {max_proportion:.2f}")
print(f"Minimum proportion of agent_type==1: {min_proportion:.2f}")
print("Histogram saved as 'agent_type_1_distribution.png'")

Processing files:   0%|          | 0/49275 [00:00<?, ?it/s]

Processing files: 100%|██████████| 49275/49275 [00:45<00:00, 1086.77it/s]


Average proportion of agent_type==1: 0.18
Maximum proportion of agent_type==1: 0.33
Minimum proportion of agent_type==1: 0.00
Histogram saved as 'agent_type_1_distribution.png'


In [53]:
import os
from collections import Counter
import matplotlib.pyplot as plt
import tqdm

# Define the attack types we're interested in
attack_types = ['cw-l2', 'pgd', 'GN', 'bim', 'fgsm']

# Initialize a counter for attack types
attack_counts = Counter()

# Count the occurrences of each attack type.
# NOTE: `data_dir` is not assigned in this cell; it must already have been set
# by a previously executed cell.
# NOTE(review): the attack is detected by plain substring match on .png file
# names - confirm that no other part of a file name can contain these tokens.
for filename in tqdm.tqdm(os.listdir(data_dir), desc="Processing files"):
    if filename.endswith('.png'):
        for attack in attack_types:
            if attack in filename:
                attack_counts[attack] += 1
                break  # Assume each file only has one attack type

# Calculate total number of attacks
total_attacks = sum(attack_counts.values())

# Fail with a clear message instead of an opaque "max() arg is an empty
# sequence" when the axis limit is computed.
if total_attacks == 0:
    raise ValueError(f"No .png files matching {attack_types} found in {data_dir}")

# Calculate percentages in the fixed `attack_types` order. Iterating the
# Counter would follow first-seen order, i.e. the arbitrary os.listdir order,
# so bar order and colours could change between runs.
attack_percentages = {
    attack: attack_counts[attack] / total_attacks * 100
    for attack in attack_types
    if attack_counts[attack] > 0
}

# Print the results
print("Distribution of attack types:")
for attack, percentage in attack_percentages.items():
    print(f"{attack}: {percentage:.1f}%")

# Define one fixed colour per attack type so a type keeps its colour even
# when another type is absent from the data.
palette = ['#03045e', '#0077b6', '#00b4d8', '#90e0ef', '#caf0f8']
color_by_attack = dict(zip(attack_types, palette))
colors = [color_by_attack[attack] for attack in attack_percentages]

# Create a bar plot
plt.figure(figsize=(7, 3), dpi=300)

# Add grid
plt.grid(axis='y', linestyle='--', alpha=0.7, zorder=0)

# Create bar plot on top of the grid
bars = plt.bar(range(len(attack_percentages)), list(attack_percentages.values()), color=colors, width=0.4, zorder=3)

plt.ylabel('Percentage (%)', fontsize=8)
# 70% headroom above the tallest bar leaves space for the value labels.
plt.ylim(0, max(attack_percentages.values()) * 1.7)

# Remove x-axis ticks and labels
plt.xticks([])

# Add percentage labels on top of each bar
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height,
             f'{height:.1f}%', ha='center', va='bottom', fontsize=8)

# Add a legend ('cw-l2' is displayed as 'C&W', all other names upper-cased)
legend_labels = ['C&W' if key.upper() == 'CW-L2' else key.upper() for key in attack_percentages]
plt.legend(bars, legend_labels, loc='upper right', bbox_to_anchor=(1.25, 1), fontsize=8)

# Set tick label font size before the layout pass so it is taken into account
plt.tick_params(axis='both', which='major', labelsize=8)

# Adjust layout to make it tight
plt.tight_layout()

# Save the plot
plt.savefig('attack_distribution.png', bbox_inches='tight')
plt.close()

print("Bar plot saved as 'attack_distribution.png'")


Processing files:   0%|          | 0/49275 [00:00<?, ?it/s]

Processing files: 100%|██████████| 49275/49275 [00:00<00:00, 793232.43it/s]


Distribution of attack types:
bim: 20.1%
pgd: 19.9%
fgsm: 21.0%
GN: 20.3%
cw-l2: 18.7%
Bar plot saved as 'attack_distribution.png'
