In [1]:
import torch
import torchvision
from torchvision.models.detection import fasterrcnn_resnet50_fpn, fasterrcnn_mobilenet_v3_large_fpn
from torchvision.transforms import functional as F
import torchvision.transforms as T
import numpy as np
import cv2
from PIL import Image
import requests
from io import BytesIO
import matplotlib.pyplot as plt
import os
import random
from tqdm import tqdm

In [2]:
# --- Global Configuration ---
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
OUTPUT_DIR = "advanced_results"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# COCO Class mapping for readability
COCO_INSTANCE_CATEGORY_NAMES = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
    'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
    'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
    'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
    'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard',
    'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A',
    'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]

In [6]:

# --- 1. Infrastructure: Model & Image Loading ---

def get_model(model_name='resnet50'):
    print(f"Loading {model_name} model on {DEVICE}...")
    if model_name == 'resnet50':
        model = fasterrcnn_resnet50_fpn(pretrained=True)
    elif model_name == 'mobilenet':
        model = fasterrcnn_mobilenet_v3_large_fpn(pretrained=True)
    model.to(DEVICE)
    model.eval()
    return model

def get_image():
    # A standard image with a person and a car usually works best for these demos
    url = "https://raw.githubusercontent.com/pytorch/hub/master/images/dog.jpg"
    # Let's use a street scene if possible, but the dog image is reliable for the 'Dog' class.
    # To demonstrate miscategorization (Dog -> Car) and vanishing (Dog -> nothing).
    print(f"Downloading sample image from {url}...")
    response = requests.get(url)
    img = Image.open(BytesIO(response.content)).convert("RGB")
    return img

def preprocess(image):
    return F.to_tensor(image).unsqueeze(0).to(DEVICE)

def visualize(img_tensor, model, title, save_path, threshold=0.5):
    model.eval()
    with torch.no_grad():
        preds = model(img_tensor)[0]

    img_np = img_tensor.squeeze().cpu().permute(1, 2, 0).detach().numpy()
    img_np = np.clip(img_np, 0, 1)

    plt.figure(figsize=(10, 8))
    plt.imshow(img_np)
    ax = plt.gca()

    found_objects = []

    for box, label, score in zip(preds['boxes'], preds['labels'], preds['scores']):
        if score > threshold:
            box = box.cpu().numpy()
            label_name = COCO_INSTANCE_CATEGORY_NAMES[label]
            found_objects.append(f"{label_name} ({score:.2f})")

            # Color code: Green for Person, Blue for Car, Red for Dog, etc.
            color = 'red'
            if label == 1: color = 'green' # Person
            if label == 3: color = 'blue'  # Car

            rect = plt.Rectangle((box[0], box[1]), box[2]-box[0], box[3]-box[1],
                                 fill=False, color=color, linewidth=3)
            ax.add_patch(rect)
            plt.text(box[0], box[1], f"{label_name}: {score:.2f}",
                     color='white', fontsize=10, backgroundcolor=color)

    plt.title(f"{title}\nDetections: {', '.join(found_objects)}")
    plt.axis('off')
    if save_path:
        plt.savefig(save_path, bbox_inches='tight')
        print(f"Saved: {save_path}")
    plt.close()


In [7]:
# --- 2. The Core: EOT & Patch Generator ---

class PhysicalAttackSimulator:
    def __init__(self, model, img_tensor, patch_size_ratio=0.2):
        self.model = model
        self.img_tensor = img_tensor.clone()
        _, _, h, w = img_tensor.shape

        # Patch configuration
        self.patch_size = int(min(h, w) * patch_size_ratio)
        self.patch = torch.rand((3, self.patch_size, self.patch_size),
                                device=DEVICE, requires_grad=True)

        # EOT Augmentations (Simulate physical world distortions)
        self.augmentations = torch.nn.Sequential(
            T.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.1),
            T.RandomAffine(degrees=15, translate=(0.1, 0.1), scale=(0.8, 1.2), shear=10)
        )

    def get_patch_overlay(self, base_img, patch_tensor, target_box=None, random_pos=False):
        """
        Applies the patch to the image with EOT transformations.
        target_box: [x1, y1, x2, y2] to center the patch on.
        random_pos: If True, place anywhere (for fabrication).
        """
        # 1. Apply Physical Distortions to the Patch (EOT)
        # We process the patch independently before placing it
        aug_patch = self.augmentations(patch_tensor)

        # 2. Determine Location
        _, _, h, w = base_img.shape
        ph, pw = self.patch_size, self.patch_size

        if random_pos:
            x = random.randint(0, w - pw)
            y = random.randint(0, h - ph)
        elif target_box is not None:
            # Center on target box
            bx, by, bx2, by2 = target_box
            bw, bh = bx2 - bx, by2 - by
            x = int(bx + bw/2 - pw/2)
            y = int(by + bh/2 - ph/2)
            # Clamp to image bounds
            x = max(0, min(x, w - pw))
            y = max(0, min(y, h - ph))
        else:
            x, y = 0, 0 # Default top-left

        # 3. Paste Patch
        patched_img = base_img.clone()
        patched_img[:, :, y:y+ph, x:x+pw] = aug_patch
        return patched_img

    def run_attack(self, mode='vanishing', target_label=None, epochs=30, learning_rate=0.05):
        """
        Main optimization loop for all 3 attack modes.
        Modes: 'vanishing', 'miscategorization', 'fabrication'
        """
        print(f"\n--- Running Attack: {mode.upper()} ---")

        # Reset patch for new attack
        self.patch.data = torch.rand_like(self.patch)
        optimizer = torch.optim.Adam([self.patch], lr=learning_rate)

        # Get Ground Truth (or pseudo-ground truth) from clean image
        self.model.eval()
        with torch.no_grad():
            gt_preds = self.model(self.img_tensor)[0]

        # Select the most confident object as the victim
        if len(gt_preds['boxes']) == 0:
            print("No objects found to attack.")
            return self.img_tensor

        victim_idx = 0 # Assume top score is the target
        victim_box = gt_preds['boxes'][victim_idx]
        victim_label = gt_preds['labels'][victim_idx]

        print(f"Targeting Object: {COCO_INSTANCE_CATEGORY_NAMES[victim_label]} at {victim_box.cpu().numpy()}")

        for epoch in tqdm(range(epochs)):
            total_loss = 0

            # EOT Batch: Average gradients over multiple random transformations
            for _ in range(5):
                optimizer.zero_grad()

                # A. Apply Patch based on Mode
                if mode == 'fabrication':
                    # Place patch in random BACKGROUND locations
                    # (Simplified: just random location, hoping to spawn a box)
                    patched_img = self.get_patch_overlay(self.img_tensor, self.patch, random_pos=True)
                else:
                    # Place patch ON the victim object
                    patched_img = self.get_patch_overlay(self.img_tensor, self.patch, target_box=victim_box)

                # B. Forward Pass
                # We need gradients, so we use training mode logic or manual loss calc
                # FasterRCNN allows passing targets in train mode to compute loss automatically
                self.model.train()

                # Define Targets based on Mode
                if mode == 'vanishing':
                    # We want to confuse the model.
                    # We provide the TRUE label, but we MINIMIZE the negative loss?
                    # No, standard attack maximizes the Loss(Prediction, True Label).

                    # Trick: Pass the GROUND TRUTH. The model computes loss. We maximize it.
                    targets = [{
                        'boxes': victim_box.unsqueeze(0),
                        'labels': victim_label.unsqueeze(0)
                    }]
                    loss_dict = self.model(patched_img, targets)
                    # Maximize classifier loss and regression loss to break detection
                    loss = sum(loss for loss in loss_dict.values())
                    # Gradient Descent minimizes, so we negate to maximize error
                    loss = -loss

                elif mode == 'miscategorization':
                    # We want the model to predict TARGET_LABEL instead of True Label.
                    # We minimize Loss(Prediction, Target Label).
                    if target_label is None: raise ValueError("Need target_label for miscategorization")

                    targets = [{
                        'boxes': victim_box.unsqueeze(0),
                        'labels': torch.tensor([target_label], device=DEVICE)
                    }]
                    loss_dict = self.model(patched_img, targets)
                    # Standard minimization of loss towards the fake label
                    loss = sum(loss for loss in loss_dict.values())

                elif mode == 'fabrication':
                    # We want the model to detect the patch as TARGET_LABEL.
                    # Since we don't know exactly where the box will be proposed,
                    # we essentially treat the patch location as the ground truth box.

                    # Note: We need to know where we put the patch in this specific EOT iteration
                    # The get_patch_overlay helper was stateless, so for fabrication we need to be careful.
                    # For prototype simplicity: logic is inside the EOT loop:

                    # Re-do patch placement to capture coordinates
                    _, _, h, w = self.img_tensor.shape
                    ph, pw = self.patch_size, self.patch_size
                    rx = random.randint(0, w - pw)
                    ry = random.randint(0, h - ph)

                    aug_patch = self.augmentations(self.patch)
                    p_img = self.img_tensor.clone()
                    p_img[:, :, ry:ry+ph, rx:rx+pw] = aug_patch

                    # Target: We WANT a box here with target_label
                    fake_box = torch.tensor([[rx, ry, rx+pw, ry+ph]], dtype=torch.float32, device=DEVICE)
                    targets = [{
                        'boxes': fake_box,
                        'labels': torch.tensor([target_label], device=DEVICE)
                    }]

                    loss_dict = self.model(p_img, targets)
                    loss = sum(loss for loss in loss_dict.values())
                    patched_img = p_img # Update reference for next steps

                loss.backward()
                optimizer.step()

                # Clamp patch to valid image range
                self.patch.data.clamp_(0, 1)

        # Return the best final image (no random augmentations for display)
        if mode == 'fabrication':
            # Place in empty space for final result
            final_img = self.get_patch_overlay(self.img_tensor, self.patch, random_pos=True)
        else:
            final_img = self.get_patch_overlay(self.img_tensor, self.patch, target_box=victim_box)

        return final_img

In [8]:
# --- 3. Execution Logic ---

def main():
    # 1. Setup
    model = get_model('resnet50') # Can swap to 'mobilenet'
    img_pil = get_image()
    img_tensor = preprocess(img_pil)

    # 2. Baseline
    print("Generating Baseline...")
    visualize(img_tensor, model, "Baseline (Clean)", os.path.join(OUTPUT_DIR, "0_baseline.png"))

    attacker = PhysicalAttackSimulator(model, img_tensor)

    # 3. Attack 1: Vanishing (Hiding the Dog)
    # The patch tries to maximize loss on the 'Dog' class
    print("Generating Vanishing Attack...")
    adv_img_vanish = attacker.run_attack(mode='vanishing', epochs=30)
    visualize(adv_img_vanish, model, "Vanishing Attack (Object Hiding)",
              os.path.join(OUTPUT_DIR, "1_vanishing_attack.png"))

    # 4. Attack 2: Miscategorization (Dog -> Car)
    # COCO Class 3 is 'Car'. We want the Dog to be detected as a Car.
    print("Generating Miscategorization Attack (Target: Car)...")
    adv_img_mis = attacker.run_attack(mode='miscategorization', target_label=3, epochs=40)
    visualize(adv_img_mis, model, "Miscategorization (Dog -> Car)",
              os.path.join(OUTPUT_DIR, "2_miscategorization_attack.png"))

    # 5. Attack 3: Fabrication (Hallucinating a Toaster)
    # COCO Class 80 is 'Toaster'. We want a Toaster to appear in the background.
    print("Generating Fabrication Attack (Target: Toaster)...")
    adv_img_fab = attacker.run_attack(mode='fabrication', target_label=80, epochs=40)
    visualize(adv_img_fab, model, "Fabrication (Phantom Toaster)",
              os.path.join(OUTPUT_DIR, "3_fabrication_attack.png"), threshold=0.3)

    print(f"\nSimulation Complete. Results saved to {OUTPUT_DIR}")

if __name__ == "__main__":
    main()

Loading resnet50 model on cuda...




Downloading sample image from https://raw.githubusercontent.com/pytorch/hub/master/images/dog.jpg...
Generating Baseline...
Saved: advanced_results/0_baseline.png
Generating Vanishing Attack...

--- Running Attack: VANISHING ---
Targeting Object: dog at [ 137.8407    67.79935 1386.9038  1172.8246 ]


100%|██████████| 30/30 [00:39<00:00,  1.32s/it]


Saved: advanced_results/1_vanishing_attack.png
Generating Miscategorization Attack (Target: Car)...

--- Running Attack: MISCATEGORIZATION ---
Targeting Object: dog at [ 137.8407    67.79935 1386.9038  1172.8246 ]


100%|██████████| 40/40 [00:51<00:00,  1.29s/it]


Saved: advanced_results/2_miscategorization_attack.png
Generating Fabrication Attack (Target: Toaster)...

--- Running Attack: FABRICATION ---
Targeting Object: dog at [ 137.8407    67.79935 1386.9038  1172.8246 ]


100%|██████████| 40/40 [00:52<00:00,  1.30s/it]


Saved: advanced_results/3_fabrication_attack.png

Simulation Complete. Results saved to advanced_results
