In [2]:
# %pip install git+https://github.com/JiahuiYu/neuralgym

import os
from PIL import Image
import numpy as np
import torch
from torch.utils.data import Dataset
import torchvision.transforms as transforms
import matplotlib.pyplot as plt

CARLA_DATA = True

# dataset_dir = "/home/summer2025/AVPrivacy/carla_sim/test_images2/"
dataset_dir = "/home/jetsonuser/masking/datasets"

if CARLA_DATA:
    num_views = 1
    num_frames = 80
    # num_frames = 10
    dataset_idx = 0
    MASK_SPAN = {"face":1, "no_feet":1, "full":1}
    X = 1  # chunk size
    mask_height_span = MASK_SPAN.get('face', 1)
    output_base_directory = "output/carla_v1"
    input_video_base_path = f"output/carla_v1/rgb/view"
    video_output_dir = "output/carla_v1/videos/"
else:
    num_views = 8
    num_frames = 50
    dataset_idx = 0#1
    MASK_SPAN = {"face":1, "no_feet":1, "full":1}
    X = 1  # chunk size
    mask_height_span = MASK_SPAN.get('no_feet', 1)
    # output_base_directory = "jc_8_long_n2"
    output_base_directory = "output/xr_lubna"
    input_video_base_path = f"output/xr_lubna/rgb/view"
    video_output_dir = "output/xr_lubna/videos/"


class SegmentationDataset(Dataset):
    def __init__(self, root_dir, transform=None, frame_range=range(50)):
        self.root_dir = root_dir
        self.transform = transform

        valid_views = [f"view_{i}" for i in range(num_views)]
        valid_frames = frame_range  # frames 0 to 49
        valid_image_names = [f"pointcloud-{i}.png" for i in valid_frames]
        valid_depth_names = [f"depth-{i}.png" for i in valid_frames]


        # We'll store data per scenario like:
        # self.scenarios = [
        #   {
        #       "images": [[img_view0_frame0, ...], [img_view1_frame0, ...], ...], # [8][50]
        #       "gts":    [[gt_view0_frame0, ...], ...],
        #       "depths": [[depth_view0_frame0, ...], ...],
        #       "paths":  [[path_view0_frame0, ...], ...]
        #   }, ...
        # ]
        self.scenarios = []

        # Identify scenario folders (excluding ground truth)
        all_folders = [f for f in os.listdir(root_dir) 
                       if os.path.isdir(os.path.join(root_dir, f)) and 
                       not f.endswith('ground_truth') and '_ground_truth' not in f]
        
        

        for folder in all_folders:
            gt_folder = f"{folder}_ground_truth"
            if not os.path.exists(os.path.join(root_dir, gt_folder)):
                continue  # skip if no corresponding ground truth folder



            scenario_images = []
            scenario_gts = []
            scenario_depths = []
            scenario_paths = []

            # For each view
            for view_folder in valid_views:
                image_folder_path = os.path.join(root_dir, folder, view_folder)
                gt_folder_path = os.path.join(root_dir, gt_folder, view_folder)


                image_folder_path += '/'

                if not (os.path.exists(image_folder_path) and os.path.exists(gt_folder_path)):
                    print('missing directories')
                    scenario_images = []
                    break


                view_images = []
                view_gts = []
                view_depths = []
                view_paths = []

                # Make sure frames are in order
                for frame_idx in valid_frames:
                    image_name = f"pointcloud-{frame_idx}.png"
                    depth_name = f"depth-{frame_idx}.png"
                    image_path = os.path.join(image_folder_path, image_name)
                    depth_path = os.path.join(image_folder_path, depth_name)
                    gt_image_path = os.path.join(gt_folder_path, image_name)

                    
                    if not (os.path.exists(image_path) and os.path.exists(gt_image_path) and os.path.exists(depth_path)):
                        print('missing directory')
                        view_images = []
                        break

                    view_images.append(image_path)
                    view_gts.append(gt_image_path)
                    view_depths.append(depth_path)

                    relative_path = image_path.replace(self.root_dir + "/", "")
                    view_paths.append(relative_path)

                # If any frame missing, break
                print(len(view_images), num_frames)
                if len(view_images) != num_frames:
                    print('missing frame')
                    scenario_images = []
                    break

                scenario_images.append(view_images)
                scenario_gts.append(view_gts)
                scenario_depths.append(view_depths)
                scenario_paths.append(view_paths)


            # Add the scenario if all views and frames loaded
            if len(scenario_images) == num_views and all(len(v) == num_frames for v in scenario_images):
                print("loaded scenario")
                self.scenarios.append({
                    "images": scenario_images,
                    "gts": scenario_gts,
                    "depths": scenario_depths,
                    "paths": scenario_paths
                })

    def __len__(self):
        print(f"Total scenarios loaded: {len(self.scenarios)}")
        return len(self.scenarios)

    def __getitem__(self, idx):
        scenario = self.scenarios[idx]
        scenario_images = scenario["images"]   # [8][50]
        scenario_gts = scenario["gts"]         # [8][50]
        scenario_depths = scenario["depths"]   # [8][50]
        scenario_paths = scenario["paths"]     # [8][50]

        # We'll load and process all images and masks
        all_images = []  # Will hold [8, 50, 4, H, W] eventually
        all_masks = []   # Will hold [8, 50, H, W]

        # Define class colors
        class_1_color = np.array([80, 239, 7])   # #50EF07
        class_2_color = np.array([249, 0, 0])    # #F90000
        tolerance = 30

        for v in range(num_views):
            view_imgs = []
            view_masks = []
            # For each frame in this view
            for f in range(num_frames):
                image_path = scenario_images[v][f]
                gt_path = scenario_gts[v][f]
                depth_path = scenario_depths[v][f]

                image = Image.open(image_path).convert("RGB")
                gt_image = Image.open(gt_path).convert("RGB")
                depth_image = Image.open(depth_path)

                gt_image_np = np.array(gt_image)
                # Create mask
                mask = np.zeros(gt_image_np.shape[:2], dtype=np.uint8)
                mask[np.all(np.abs(gt_image_np - class_1_color) <= tolerance, axis=-1)] = 1
                mask[np.all(np.abs(gt_image_np - class_2_color) <= tolerance, axis=-1)] = 2

                # Convert images to tensors and apply transform if provided
                if self.transform:
                    # Apply transform to RGB image
                    rgb_tensor = self.transform(image)  # [C,H,W]
                    # Resize mask using nearest neighbor
                    mask_pil = Image.fromarray(mask)
                    mask_pil = mask_pil.resize((rgb_tensor.shape[2], rgb_tensor.shape[1]), Image.NEAREST)
                    mask = np.array(mask_pil)

                    # Resize depth image separately
                    depth_resized = depth_image.resize((rgb_tensor.shape[2], rgb_tensor.shape[1]), Image.NEAREST)
                    depth_np = np.array(depth_resized).astype(np.float32)

                else:
                    # If no transform, just convert directly
                    rgb_tensor = transforms.ToTensor()(image)
                    depth_np = np.array(depth_image).astype(np.float32)

                # Normalize depth
                # if depth_np.max() > 10 * depth_np.min():
                #     depth_norm = depth_np/1000
                # else:
                #     depth_norm = depth_np  # all pixels same, no normalization needed
                # depth_norm = depth_np/1000
                depth_norm = depth_np

                
                depth_tensor = torch.tensor(depth_norm).unsqueeze(0)  # [1,H,W]

                # Combine RGB and Depth into single tensor: [4,H,W]
                img_with_depth = torch.cat((rgb_tensor, depth_tensor), dim=0)

                # Convert mask to tensor
                mask = torch.from_numpy(mask).long()

                view_imgs.append(img_with_depth)  # [4,H,W]
                view_masks.append(mask)           # [H,W]

            # Stack frames for this view
            view_imgs = torch.stack(view_imgs, dim=0)   # [50,4,H,W]
            view_masks = torch.stack(view_masks, dim=0) # [50,H,W]

            all_images.append(view_imgs)
            all_masks.append(view_masks)

        # Stack all views
        all_images = torch.stack(all_images, dim=0)  # [8,50,4,H,W]
        all_masks = torch.stack(all_masks, dim=0)    # [8,50,H,W]

        return all_images, all_masks, scenario_paths

# Example usage
transform = transforms.Compose([
    transforms.ToTensor(),
])



In [3]:
from torch.utils.data import DataLoader
import torchvision.transforms as transforms

print(dataset_dir)
transform = transforms.Compose([
    transforms.ToTensor(),
])

# carla_framerange = range(68, 254)
carla_framerange = range(80)

dataset = SegmentationDataset(root_dir=dataset_dir, transform=transform, frame_range=carla_framerange)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

/home/jetsonuser/masking/datasets
80 80
loaded scenario
Total scenarios loaded: 1
Total scenarios loaded: 1


In [4]:
images, masks, paths = dataset[dataset_idx]

In [5]:
print(paths[0])

['carla_v1/view_0/pointcloud-0.png', 'carla_v1/view_0/pointcloud-1.png', 'carla_v1/view_0/pointcloud-2.png', 'carla_v1/view_0/pointcloud-3.png', 'carla_v1/view_0/pointcloud-4.png', 'carla_v1/view_0/pointcloud-5.png', 'carla_v1/view_0/pointcloud-6.png', 'carla_v1/view_0/pointcloud-7.png', 'carla_v1/view_0/pointcloud-8.png', 'carla_v1/view_0/pointcloud-9.png', 'carla_v1/view_0/pointcloud-10.png', 'carla_v1/view_0/pointcloud-11.png', 'carla_v1/view_0/pointcloud-12.png', 'carla_v1/view_0/pointcloud-13.png', 'carla_v1/view_0/pointcloud-14.png', 'carla_v1/view_0/pointcloud-15.png', 'carla_v1/view_0/pointcloud-16.png', 'carla_v1/view_0/pointcloud-17.png', 'carla_v1/view_0/pointcloud-18.png', 'carla_v1/view_0/pointcloud-19.png', 'carla_v1/view_0/pointcloud-20.png', 'carla_v1/view_0/pointcloud-21.png', 'carla_v1/view_0/pointcloud-22.png', 'carla_v1/view_0/pointcloud-23.png', 'carla_v1/view_0/pointcloud-24.png', 'carla_v1/view_0/pointcloud-25.png', 'carla_v1/view_0/pointcloud-26.png', 'carla_v1/

In [6]:
# ── RetinaFace Setup (replace Faster R-CNN) ───────────────────────────────
import torch, cv2

# how often to re-run the detector vs. track
DETECT_EVERY = 1


# ── Replacement detect_objects using Ultra-Light ──────────────────────────────
DETECTION_CONFIDENCE_THRESHOLD = 0.3
PRIVATE_OBJECT_CLASSES = ['person']  # used downstream by segment_all
WINDOW_SIZE = 10
DEPTH_THRESHOLD_MULTIPLIER = 75
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


from insightface.app import FaceAnalysis

# initialize once at top-level
retina_app = FaceAnalysis(name='buffalo_s', allowed_modules=['detection'])
# ctx_id=0 for GPU, -1 for CPU
retina_app.prepare(ctx_id=0 if torch.cuda.is_available() else -1,
                   det_size=(2048,2048))



def detect_objects(model, image_np, confidence_threshold=0.5, draw_boxes=False):
    """
    image_np: BGR uint8 H×W×3
    Returns list of {'box':[x1,y1,x2,y2],'score':…,'label':'person'}
    or, if draw_boxes=True, returns the BGR image with boxes overlaid.
    """
    # 1) run detection
    faces = retina_app.get(image_np)
    objects = []
    for f in faces:
        score = f.det_score
        if score < confidence_threshold:
            continue
        x1,y1,x2,y2 = map(int, f.bbox)
        objects.append({"box":[x1,y1,x2,y2],
                        "score":float(score),
                        "label":"person"})

    if not draw_boxes:
        return objects

    # 2) draw
    out = image_np.copy()
    for obj in objects:
        x1,y1,x2,y2 = obj["box"]
        s = obj["score"]
        cv2.rectangle(out, (x1,y1),(x2,y2), (255,0,0), 2)
        cv2.putText(out, f"{s:.2f}", (x1, max(y1-5,0)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255,0,0), 1, cv2.LINE_AA)
    return out

###############################################################################
# 2) DEPTH PROFILE
###############################################################################
def calculate_depth_profile_of_box(depth_map, x1, y1, x2, y2, window_size=WINDOW_SIZE):
    """
    Return { 'mean','std','threshold','box':[x1,y1,x2,y2] } or None if empty.
    """
    half_window = window_size // 2
    cx = (x1 + x2) // 2
    cy = (y1 + y2) // 2

    x_start = max(cx - half_window, 0)
    x_end = min(cx + half_window + 1, depth_map.shape[1])
    y_start = max(cy - half_window, 0)
    y_end = min(cy + half_window + 1, depth_map.shape[0])

    depth_window = depth_map[y_start:y_end, x_start:x_end]
    depth_values = depth_window.flatten()
    depth_values = depth_values[~np.isnan(depth_values)]
    if depth_values.size == 0:
        return None

    depth_mean = float(np.mean(depth_values))
    depth_std = float(np.std(depth_values))
    depth_threshold = float(depth_std * DEPTH_THRESHOLD_MULTIPLIER)

    return {
        'mean': depth_mean,
        'std': depth_std,
        'threshold': depth_threshold,
        'box': [x1,y1,x2,y2]
    }

###############################################################################
# 3) SEGMENT CHUNK
###############################################################################

def segment_person_from_box(depth_tensor, dprof, span=1):
    """
    Similar to segment_person_from_profile_batch, but for a single bounding box
    in just 1 frame's depth or multiple frames (X frames).
    If depth_tensor: shape [X,H,W] or [H,W].
    """
    if len(depth_tensor.shape) == 2:
        # single frame => shape [H,W]
        depth_tensor = depth_tensor.unsqueeze(0)  # => [1,H,W]

    depth_mean = torch.tensor(dprof['mean'], device=depth_tensor.device, dtype=depth_tensor.dtype)
    depth_thr  = torch.tensor(dprof['threshold'], device=depth_tensor.device, dtype=depth_tensor.dtype)
    (x1,y1,x2,y2) = dprof['box']
    y2 = int(y1 + span * (y2 - y1))


    depth_diff = torch.abs(depth_tensor - depth_mean)
    mask_batch = (depth_diff <= depth_thr).to(torch.uint8)

    final_mask = torch.zeros_like(mask_batch)
    _, H, W = depth_tensor.shape
    x1_clamp = max(0, min(x1,W))
    x2_clamp = max(0, min(x2,W))
    y1_clamp = max(0, min(y1,H))
    y2_clamp = max(0, min(y2,H))

    if x2_clamp> x1_clamp and y2_clamp> y1_clamp:
        final_mask[:, y1_clamp:y2_clamp, x1_clamp:x2_clamp] = \
            mask_batch[:, y1_clamp:y2_clamp, x1_clamp:x2_clamp]

    # return shape [H,W] if single frame
    if final_mask.shape[0] == 1:
        return final_mask[0]
    return final_mask


def segment_all(depth_tensor, objects, depth_map, span):
    """
    We create a combined mask of shape [H,W] = 1 for each person's bounding box,
    EXCEPT we skip the public_box (which is the "public" person).
    depth_tensor: shape [H,W], float on GPU
    objects: detection results on CPU
    public_box: (x1,y1,x2,y2) that we skip
    depth_map: CPU 2D array for depth
    Return: torch.uint8 mask [H,W], 1=private, 0=public
    """
    H, W = depth_tensor.shape[-2], depth_tensor.shape[-1]

    combined_mask = torch.zeros((H,W), dtype=torch.uint8, device=depth_tensor.device)

    for obj in objects:
        if obj['label'] not in PRIVATE_OBJECT_CLASSES:
            continue
        box = obj['box']  # [x1,y1,x2,y2]

        # otherwise, segment this bounding box => "private"
        dprof = calculate_depth_profile_of_box(depth_map, *box)
        if dprof is None:
            continue

        single_mask = segment_person_from_box(depth_tensor, dprof, span)
        combined_mask = torch.logical_or(combined_mask.bool(), single_mask.bool()).to(torch.uint8)

    return combined_mask

###############################################################################
# 4) METRICS
###############################################################################
def dice_score_batch(pred_batch, gt_batch):
    intersection = torch.sum((pred_batch==1)&(gt_batch==1)).item()
    pred_sum = torch.sum(pred_batch==1).item()
    gt_sum = torch.sum(gt_batch==1).item()
    if (pred_sum+gt_sum)==0:
        return 1.0
    return 2.0*intersection/(pred_sum+gt_sum)

def recall_batch(pred_batch, gt_batch):
    tp = torch.sum((pred_batch==1)&(gt_batch==1)).item()
    gt_sum = torch.sum(gt_batch==1).item()
    if gt_sum==0:
        return 1.0
    return tp/gt_sum


Applied providers: ['CUDAExecutionProvider', 'CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}, 'CUDAExecutionProvider': {'sdpa_kernel': '0', 'use_tf32': '1', 'fuse_conv_bias': '0', 'prefer_nhwc': '0', 'tunable_op_max_tuning_duration_ms': '0', 'enable_skip_layer_norm_strict_mode': '0', 'tunable_op_tuning_enable': '0', 'tunable_op_enable': '0', 'use_ep_level_unified_stream': '0', 'device_id': '0', 'has_user_compute_stream': '0', 'gpu_external_empty_cache': '0', 'cudnn_conv_algo_search': 'EXHAUSTIVE', 'cudnn_conv1d_pad_to_nc1d': '0', 'gpu_mem_limit': '18446744073709551615', 'gpu_external_alloc': '0', 'gpu_external_free': '0', 'arena_extend_strategy': 'kNextPowerOfTwo', 'do_copy_in_default_stream': '1', 'enable_cuda_graph': '0', 'user_compute_stream': '0', 'cudnn_conv_use_max_workspace': '1'}}
model ignore: /home/jetsonuser/.insightface/models/buffalo_s/1k3d68.onnx landmark_3d_68
Applied providers: ['CUDAExecutionProvider', 'CPUExecutionProvider'], with options: {'CPUExec

In [7]:
import time

# Allocate mask tensor [V, F, H, W]
pred_mask_full = torch.zeros(
    (num_views, num_frames, images.shape[-2], images.shape[-1]),
    dtype=torch.uint8,
    device=DEVICE
)

chunk_detection_times = []
chunk_seg_times       = []
chunk_total_times     = []

chunk_starts = list(range(0, num_frames, X))
print(f"Chunk starts: {chunk_starts}")

for start_f in chunk_starts:
    end_f = min(start_f + X, num_frames)
    t_chunk = time.perf_counter()

    for v in range(num_views):
        # 1) Prepare the BGR image for this view/frame
        rgb_t = images[v, start_f, :3]                           # [3,H,W] float[0–1]
        rgb_np = (rgb_t.permute(1,2,0).cpu().numpy()*255).astype(np.uint8)
        bgr_np = cv2.cvtColor(rgb_np, cv2.COLOR_RGB2BGR)

        det_start = time.perf_counter()
        # 2) Decide: detect fresh or update tracker
        if start_f % DETECT_EVERY == 0:
            # clear & re-seed the tracker
            dets = detect_objects(None, bgr_np, DETECTION_CONFIDENCE_THRESHOLD)
            boxes = []
            for det in dets:
                x1,y1,x2,y2 = det['box']
                w, h = x2-x1, y2-y1
                # add a new CSRT tracker for this box
                boxes.append([x1,y1,x2,y2])

        chunk_detection_times.append(time.perf_counter() - det_start)

        # 3) Build a simple list for segmentation:
        dets_for_seg = [{'box':b,'label':'person'} for b in boxes]

        # 4) Depth‐segment unchanged
        depth_map = images[v, start_f, 3].cpu().numpy()
        depth_chunk = images[v, start_f:end_f, 3]  # [X,H,W]

        t0 = time.perf_counter()
        private_mask = segment_all(depth_chunk, dets_for_seg, depth_map, mask_height_span)
        chunk_seg_times.append(time.perf_counter() - t0)

        # 5) Store your mask
        pred_mask_full[v, start_f:end_f] = private_mask

    chunk_total_times.append(time.perf_counter() - t_chunk)


# Print out timing stats
print("Detection times per chunk:", chunk_detection_times)
print("Segmentation times per chunk:", chunk_seg_times)
print("Total times per chunk:", chunk_total_times)

# (Optional) Compute metrics if using full-body masks
# if mask_height_span > .8:
#     dice = recall = 0
#     for v in range(num_views):
#         gt = (masks[v]==2).to(torch.uint8)
#         d  = dice_score_batch(pred_mask_full[v].cpu(), gt.cpu())
#         r  = recall_batch(pred_mask_full[v].cpu(), gt.cpu())
#         dice += d; recall += r
#         print(f"View {v}: Dice={d:.4f}, Recall={r:.4f}")
#     print("Avg Dice:", dice/num_views, "Avg Recall:", recall/num_views)

# Overall timing
num_chunks = len(chunk_starts)
avg_det = sum(chunk_detection_times)/len(chunk_detection_times)
avg_seg = sum(chunk_seg_times)/num_chunks
avg_tot = sum(chunk_total_times)/num_chunks

print(f"Avg detection time     = {avg_det:.4f}s")
print(f"Avg segmentation time  = {avg_seg:.4f}s")
print(f"Avg total time/chunk   = {avg_tot:.4f}s")
print(f"Total time all chunks  = {sum(chunk_total_times):.4f}s")

Chunk starts: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79]
Detection times per chunk: [3.424869182000009, 0.3710870760000091, 0.20293396099998517, 0.2001912300000015, 0.17679713199999014, 0.16320440999999164, 0.1646034919999977, 0.15684660399998052, 0.15411532999999622, 0.1682308750000061, 0.15350998200000276, 0.16033555000001343, 0.15174066900002003, 0.1516131479999956, 0.14121625600000698, 0.15015902400000414, 0.1336378800000091, 0.14921481100000733, 0.15556424700000093, 0.14407102399999872, 0.14639678599999684, 0.13658874300000434, 0.13599322100000677, 0.13760815500000945, 0.13831312500002468, 0.13293125700002406, 0.1374674169999821, 0.13590930200001594, 0.14133874500001298, 0.13486499299997945, 0.13645429499999295, 0.134

In [8]:
print(pred_mask_full[v].size())

torch.Size([80, 720, 1280])


In [18]:
import torch
import torch.nn.functional as F

def anonymize_region_gpu(img_tensor: torch.Tensor,
                             mask_tensor: torch.Tensor,
                             block: int = 16,
                             noise_level: int = 25) -> torch.Tensor:
    """
    Optimized GPU anonymization: pixelates + noise on the masked region.
    Works without padding errors by extracting the ROI mask.
    """
    # 1) Prepare image in float [0,1]
    orig_dtype = img_tensor.dtype
    img = img_tensor.float().to(img_tensor.device)
    if orig_dtype == torch.uint8:
        img = img / 255.0

    # 2) Find ROI bounds from full-image mask
    ys, xs = mask_tensor.nonzero(as_tuple=True)
    if ys.numel() == 0:
        return img_tensor  # nothing to anonymize

    y1, y2 = ys.min().item(), ys.max().item() + 1
    x1, x2 = xs.min().item(), xs.max().item() + 1

    # 3) Crop ROI from image and mask
    roi = img[:, y1:y2, x1:x2]               # [C, h, w]
    mask_roi = mask_tensor[y1:y2, x1:x2]     # [h, w]
    C, h, w = roi.shape

    # 4) Pixelate via avg pool → nearest upsample (ceil_mode handles edges)
    pooled = F.avg_pool2d(
        roi.unsqueeze(0),               # [1, C, h, w]
        kernel_size=block,
        stride=block,
        ceil_mode=True
    )                                   # → [1, C, ceil(h/block), ceil(w/block)]
    mosaic = F.interpolate(
        pooled,
        size=(h, w),
        mode='nearest'
    ).squeeze(0)                        # → [C, h, w]

    # 5) Add uniform noise to mosaic blocks
    noise = (torch.rand_like(mosaic) * 2 - 1) * (noise_level / 255.0)
    mosaic_noised = (mosaic + noise).clamp(0.0, 1.0)

    # 6) Blend only the masked pixels in the ROI
    mask_f = mask_roi.to(dtype=mosaic_noised.dtype, device=img.device)      # [h, w]
    mask_exp = mask_f.unsqueeze(0).expand(C, h, w)                         # [C, h, w]
    region = mosaic_noised * mask_exp + roi * (1.0 - mask_exp)             # [C, h, w]

    # 7) Write region back into a copy of the full image
    out = img.clone()
    out[:, y1:y2, x1:x2] = region

    # 8) Convert back to uint8 if needed
    if orig_dtype == torch.uint8:
        out = (out * 255.0).round().to(torch.uint8)

    return out


def anonymize_depth(depth_np: np.ndarray, mask: np.ndarray = None, noise_strength: float = 0.01) -> np.ndarray:
    """
    Add noise to depth data to anonymize the region. Only applies to mask area if mask is given.
    depth_np: H×W depth array (float32).
    mask: H×W boolean array for region to anonymize (same size as depth_np).
    """
    depth_out = depth_np.copy()
    if mask is None:
        # If no mask provided, apply to whole depth (not recommended for performance)
        mask = np.ones_like(depth_out, dtype=bool)
    # You can add just one type of noise for speed. Here we use Gaussian.
    noise = np.random.normal(loc=0.0, scale=noise_strength, size=depth_out.shape)
    depth_out[mask] += noise[mask]
    # Optionally clamp or otherwise limit values if needed (e.g., keep depth in plausible range).
    return depth_out

In [19]:
import os
import cv2
import torch
import numpy as np
from simple_lama_inpainting import SimpleLama
from PIL import Image
import torch
from diffusers import StableDiffusionInpaintPipeline
from PIL import Image
import numpy as np
import tqdm
# from toonify_image import load_toonify_model, toonify_image_with_stylegan



PSP_MODEL_PATH = "pretrained_models/psp_toonify.pt"


model_used = True
model_used = False

if model_used:
    simple_lama = SimpleLama()

    # Load stable_diffusion from Hugging Face
    pipe = StableDiffusionInpaintPipeline.from_pretrained(
        "runwayml/stable-diffusion-inpainting",
        torch_dtype=torch.float16,
        variant="fp16",  # necessary for speed
    ).to("cuda")

    # # Load SSD-1B from Hugging Face
    # pipe = StableDiffusionInpaintPipeline.from_pretrained(
    #     "",
    #     torch_dtype=torch.float16,
    #     variant="fp16",  # necessary for speed
    # ).to("cuda")

    # load style GAN
    loaded_toonify_model = load_toonify_model(PSP_MODEL_PATH)
    print("Toonify model successfully loaded for the pipeline.")


ModuleNotFoundError: No module named 'simple_lama_inpainting'

In [20]:
import os
import time
import numpy as np
import torch
import cv2

def save_masked_images_gpu(pred_mask_full, images, out_folder, dilation_radius=4):
    """
    Saves masked RGB & depth frames and records timing.
    Returns dict with lists: chunk_total, chunk_anon, chunk_write.
    """
    os.makedirs(out_folder, exist_ok=True)
    V, F, H, W = pred_mask_full.shape

    # CPU kernel for dilation
    k = 2 * dilation_radius + 1
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (k, k))

    # Make view folders
    for v in range(V):
        os.makedirs(os.path.join(out_folder, "rgb",   f"view{v}"), exist_ok=True)
        os.makedirs(os.path.join(out_folder, "depth", f"view{v}"), exist_ok=True)

    chunk_total_times = []
    chunk_anonym_times = []
    chunk_write_times = []

    # Process in chunks of X frames
    for start in range(0, F, X):
        # ensure all prior CUDA work is done
        torch.cuda.synchronize()
        t_chunk_start = time.perf_counter()

        anon_times = []
        write_times = []
        end = min(start + X, F)

        for v in range(V):
            for f in range(start, end):
                # -- 1) dilate mask on CPU (fast) --
                mask_np = pred_mask_full[v, f].cpu().numpy().astype(np.uint8)
                mask_cpu = cv2.dilate(mask_np, kernel).astype(bool)

                # -- 2) grab depth for saving (so we don't time I/O here) --
                depth = images[v, f, 3].cpu().numpy()

                # -- 3) move tensors to GPU --
                rgb_gpu  = images[v, f, :3]  # [3,H,W]
                mask_gpu = torch.from_numpy(mask_cpu).to(rgb_gpu.device)  # [H,W]

                # -- 4) anonymize once per chunk --
                if f == start:
                    # sync to measure full GPU kernel time
                    torch.cuda.synchronize()
                    t_anon_start = time.perf_counter()

                    anon_gpu = anonymize_region_gpu(
                        rgb_gpu, mask_gpu,
                        # method="fast_mosaic",
                        block=max(1, W//16),
                        noise_level=20
                    )

                    # bring back to CPU uint8 H×W×3 (this syncs too)
                    anon_arr = (
                        anon_gpu.permute(1,2,0)
                                 .cpu()
                                 .numpy()
                    )
                    if anon_arr.dtype != np.uint8:
                        anon_arr = (anon_arr * 255).clip(0,255).astype(np.uint8)

                    anon_times.append(time.perf_counter() - t_anon_start)

                # -- 5) composite --
                orig = rgb_gpu.permute(1,2,0).cpu().numpy()
                if orig.dtype != np.uint8:
                    orig = (orig * 255).clip(0,255).astype(np.uint8)
                out_rgb = orig.copy()
                out_rgb[mask_cpu] = anon_arr[mask_cpu]

                # -- 6) write & time it --
                t_write_start = time.perf_counter()
                cv2.imwrite(
                    os.path.join(out_folder, "rgb", f"view{v}", f"{f}_masked.png"),
                    cv2.cvtColor(out_rgb, cv2.COLOR_RGB2BGR)
                )
                depth_u16 = np.clip(depth + np.random.normal(0,10,depth.shape),
                                     0,20000).astype(np.uint16)
                cv2.imwrite(
                    os.path.join(out_folder, "depth", f"view{v}", f"{f}_depth.png"),
                    depth_u16
                )
                write_times.append(time.perf_counter() - t_write_start)

        # ensure all CUDA work (and any copies) finish before stopping the chunk timer
        torch.cuda.synchronize()
        chunk_total_times.append(time.perf_counter() - t_chunk_start)
        chunk_anonym_times.append(sum(anon_times))
        chunk_write_times.append(sum(write_times))

    return {
        "chunk_total": chunk_total_times,
        "chunk_anon":  chunk_anonym_times,
        "chunk_write": chunk_write_times
    }

# Usage:
timings = save_masked_images_gpu(
    pred_mask_full, images, output_base_directory, dilation_radius=4
)
print("Per-chunk totals:", timings["chunk_total"])
print("Per-chunk anonym times:", timings["chunk_anon"])
print("Per-chunk write times:", timings["chunk_write"])
print("Avg total:", np.mean(timings["chunk_total"]))
print("Avg anon:",  np.mean(timings["chunk_anon"]))
print("Avg write:", np.mean(timings["chunk_write"]))

Per-chunk totals: [0.2666816180000069, 0.2483658430000446, 0.2472428129999571, 0.24681783800002677, 0.24984224699994684, 0.24551509299999452, 0.23384696800002303, 0.23453945899996143, 0.23652621299999055, 0.23745016700001997, 0.2341406009999787, 0.23381781099999444, 0.24263909900003, 0.22505716300003087, 0.23123763099999906, 0.23344131900000775, 0.22425789900000836, 0.24006031900000835, 0.23321379200001502, 0.23470348400002194, 0.22576135899998917, 0.22426242400001684, 0.22529642700004615, 0.22467167999997173, 0.22483725099999674, 0.22480740200001037, 0.22523428099998455, 0.22477164899999025, 0.2245036760000403, 0.22383485399996061, 0.23408682499996303, 0.23007052799999883, 0.22589330099998506, 0.22588376400000243, 0.22496071000000484, 0.22630329200001142, 0.22681219699995836, 0.22681834099995513, 0.22618244100004858, 0.22673229200000833, 0.2268931899999984, 0.22616796099998737, 0.2369619739999962, 0.24066447400002744, 0.2254447500000083, 0.22706880999999157, 0.2278524070000003, 0.2274

In [22]:
# ── Run the GPU‐powered save function ─────────────────────────────────────────
avg_anon = np.mean(timings['chunk_anon'])
print(f"Avg anonymization time per chunk: {avg_anon:.4f}s")

Avg anonymization time per chunk: 0.0193s


In [23]:
import os
import cv2

# Paths
base = output_base_directory                  # e.g. "output/xr_lubna"
mask_root = os.path.join(base, "rgb")         # your masked imgs: rgb/view0/0_masked.png...
det_root  = os.path.join(base, "rgb_detect")

os.makedirs(det_root, exist_ok=True)

for v in range(num_views):
    mask_dir = os.path.join(mask_root, f"view{v}")
    det_dir  = os.path.join(det_root,  f"view{v}")
    os.makedirs(det_dir, exist_ok=True)
    
    for f in range(num_frames):
        mask_path = os.path.join(mask_dir, f"{f}_masked.png")
        img = cv2.imread(mask_path)
        if img is None:
            print(f"⚠️ Missing frame {mask_path}, skipping")
            continue

        # Run your detector in BGR uint8
        # (switch to detect_fast_scrfd if you prefer SCRFD)
        det_img = detect_objects(None, img,
                                 confidence_threshold=DETECTION_CONFIDENCE_THRESHOLD,
                                 draw_boxes=True)

        out_path = os.path.join(det_dir, f"{f}_detect.png")
        cv2.imwrite(out_path, det_img)

print("✅ All detection‐overlay images saved under:", det_root)

✅ All detection‐overlay images saved under: output/carla_v1/rgb_detect


In [24]:
fps = 1/((avg_anon + avg_seg)/ X)
print(f"Estimated FPS: {fps:.2f} frames per second")

Estimated FPS: 31.11 frames per second


In [25]:
import cv2
import os
from glob import glob

# fps=30
# Set parameters
num_frames = num_frames  # Already defined in your notebook

os.makedirs(video_output_dir, exist_ok=True)
for view_idx in range(num_views):
    output_path = os.path.join(video_output_dir, f"view{view_idx}.mp4")
    
    input_video_pattern = input_video_base_path + f"{view_idx}/{{}}_masked.png"
    # input_video_pattern = input_video_base_path + f"{view_idx}/{{}}_detect.png"

    # Read the first frame to get the size
    first_frame_path = input_video_pattern.format(0)
    first_frame = cv2.imread(first_frame_path)
    if first_frame is None:
        raise FileNotFoundError(f"First frame not found: {first_frame_path}")
    height, width, layers = first_frame.shape

    # Define the video writer
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    video_writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    # Write frames to video
    for i in range(num_frames):
        frame_path = input_video_pattern.format(i)
        frame = cv2.imread(frame_path)
        if frame is None:
            print(f"Warning: Frame not found: {frame_path}, skipping.")
            continue
        video_writer.write(frame)

    video_writer.release()
    print(f"Video saved to {output_path}")

Video saved to output/carla_v1/videos/view0.mp4


In [None]:
print(DEVICE)

cuda
