In [2]:
# %pip install git+https://github.com/JiahuiYu/neuralgym

import os
from PIL import Image
import numpy as np
import torch
from torch.utils.data import Dataset
import torchvision.transforms as transforms
import matplotlib.pyplot as plt

CARLA_DATA = True

# dataset_dir = "/home/summer2025/AVPrivacy/carla_sim/test_images2/"
dataset_dir = "/home/jetsonuser/masking/datasets"

if CARLA_DATA:
    num_views = 1
    num_frames = 179
    # num_frames = 10
    dataset_idx = 0
    MASK_SPAN = {"face":1, "no_feet":1, "full":1}
    X = 1  # chunk size
    mask_height_span = MASK_SPAN.get('face', 1)
    output_base_directory = "output/carla_v1"
    input_video_base_path = f"output/carla_v1/rgb/view"
    video_output_dir = "output/carla_v1/videos/"
else:
    num_views = 8
    num_frames = 50
    dataset_idx = 0#1
    MASK_SPAN = {"face":1, "no_feet":1, "full":1}
    X = 1  # chunk size
    mask_height_span = MASK_SPAN.get('no_feet', 1)
    # output_base_directory = "jc_8_long_n2"
    output_base_directory = "output/xr_lubna"
    input_video_base_path = f"output/xr_lubna/rgb/view"
    video_output_dir = "output/xr_lubna/videos/"


class SegmentationDataset(Dataset):
    def __init__(self, root_dir, transform=None, frame_range=range(50)):
        self.root_dir = root_dir
        self.transform = transform

        valid_views = [f"view_{i}" for i in range(num_views)]
        valid_frames = frame_range  # frames 0 to 49
        valid_image_names = [f"pointcloud-{i}.png" for i in valid_frames]
        valid_depth_names = [f"depth-{i}.png" for i in valid_frames]


        # We'll store data per scenario like:
        # self.scenarios = [
        #   {
        #       "images": [[img_view0_frame0, ...], [img_view1_frame0, ...], ...], # [8][50]
        #       "gts":    [[gt_view0_frame0, ...], ...],
        #       "depths": [[depth_view0_frame0, ...], ...],
        #       "paths":  [[path_view0_frame0, ...], ...]
        #   }, ...
        # ]
        self.scenarios = []

        # Identify scenario folders (excluding ground truth)
        all_folders = [f for f in os.listdir(root_dir) 
                       if os.path.isdir(os.path.join(root_dir, f)) and 
                       not f.endswith('ground_truth') and '_ground_truth' not in f]
        
        

        for folder in all_folders:
            gt_folder = f"{folder}_ground_truth"
            if not os.path.exists(os.path.join(root_dir, gt_folder)):
                continue  # skip if no corresponding ground truth folder



            scenario_images = []
            scenario_gts = []
            scenario_depths = []
            scenario_paths = []

            # For each view
            for view_folder in valid_views:
                image_folder_path = os.path.join(root_dir, folder, view_folder)
                gt_folder_path = os.path.join(root_dir, gt_folder, view_folder)


                image_folder_path += '/'

                if not (os.path.exists(image_folder_path) and os.path.exists(gt_folder_path)):
                    print('missing directories')
                    scenario_images = []
                    break


                view_images = []
                view_gts = []
                view_depths = []
                view_paths = []

                # Make sure frames are in order
                for frame_idx in valid_frames:
                    image_name = f"pointcloud-{frame_idx}.png"
                    depth_name = f"depth-{frame_idx}.png"
                    image_path = os.path.join(image_folder_path, image_name)
                    depth_path = os.path.join(image_folder_path, depth_name)
                    gt_image_path = os.path.join(gt_folder_path, image_name)

                    
                    if not (os.path.exists(image_path) and os.path.exists(gt_image_path) and os.path.exists(depth_path)):
                        print('missing directory')
                        view_images = []
                        break

                    view_images.append(image_path)
                    view_gts.append(gt_image_path)
                    view_depths.append(depth_path)

                    relative_path = image_path.replace(self.root_dir + "/", "")
                    view_paths.append(relative_path)

                # If any frame missing, break
                print(len(view_images), num_frames)
                if len(view_images) != num_frames:
                    print('missing frame')
                    scenario_images = []
                    break

                scenario_images.append(view_images)
                scenario_gts.append(view_gts)
                scenario_depths.append(view_depths)
                scenario_paths.append(view_paths)


            # Add the scenario if all views and frames loaded
            if len(scenario_images) == num_views and all(len(v) == num_frames for v in scenario_images):
                print("loaded scenario")
                self.scenarios.append({
                    "images": scenario_images,
                    "gts": scenario_gts,
                    "depths": scenario_depths,
                    "paths": scenario_paths
                })

    def __len__(self):
        print(f"Total scenarios loaded: {len(self.scenarios)}")
        return len(self.scenarios)

    def __getitem__(self, idx):
        scenario = self.scenarios[idx]
        scenario_images = scenario["images"]   # [8][50]
        scenario_gts = scenario["gts"]         # [8][50]
        scenario_depths = scenario["depths"]   # [8][50]
        scenario_paths = scenario["paths"]     # [8][50]

        # We'll load and process all images and masks
        all_images = []  # Will hold [8, 50, 4, H, W] eventually
        all_masks = []   # Will hold [8, 50, H, W]

        # Define class colors
        class_1_color = np.array([80, 239, 7])   # #50EF07
        class_2_color = np.array([249, 0, 0])    # #F90000
        tolerance = 30

        for v in range(num_views):
            view_imgs = []
            view_masks = []
            # For each frame in this view
            for f in range(num_frames):
                image_path = scenario_images[v][f]
                gt_path = scenario_gts[v][f]
                depth_path = scenario_depths[v][f]

                image = Image.open(image_path).convert("RGB")
                gt_image = Image.open(gt_path).convert("RGB")
                depth_image = Image.open(depth_path)

                gt_image_np = np.array(gt_image)
                # Create mask
                mask = np.zeros(gt_image_np.shape[:2], dtype=np.uint8)
                mask[np.all(np.abs(gt_image_np - class_1_color) <= tolerance, axis=-1)] = 1
                mask[np.all(np.abs(gt_image_np - class_2_color) <= tolerance, axis=-1)] = 2

                # Convert images to tensors and apply transform if provided
                if self.transform:
                    # Apply transform to RGB image
                    rgb_tensor = self.transform(image)  # [C,H,W]
                    # Resize mask using nearest neighbor
                    mask_pil = Image.fromarray(mask)
                    mask_pil = mask_pil.resize((rgb_tensor.shape[2], rgb_tensor.shape[1]), Image.NEAREST)
                    mask = np.array(mask_pil)

                    # Resize depth image separately
                    depth_resized = depth_image.resize((rgb_tensor.shape[2], rgb_tensor.shape[1]), Image.NEAREST)
                    depth_np = np.array(depth_resized).astype(np.float32)

                else:
                    # If no transform, just convert directly
                    rgb_tensor = transforms.ToTensor()(image)
                    depth_np = np.array(depth_image).astype(np.float32)

                # Normalize depth
                # if depth_np.max() > 10 * depth_np.min():
                #     depth_norm = depth_np/1000
                # else:
                #     depth_norm = depth_np  # all pixels same, no normalization needed
                # depth_norm = depth_np/1000
                depth_norm = depth_np

                
                depth_tensor = torch.tensor(depth_norm).unsqueeze(0)  # [1,H,W]

                # Combine RGB and Depth into single tensor: [4,H,W]
                img_with_depth = torch.cat((rgb_tensor, depth_tensor), dim=0)

                # Convert mask to tensor
                mask = torch.from_numpy(mask).long()

                view_imgs.append(img_with_depth)  # [4,H,W]
                view_masks.append(mask)           # [H,W]

            # Stack frames for this view
            view_imgs = torch.stack(view_imgs, dim=0)   # [50,4,H,W]
            view_masks = torch.stack(view_masks, dim=0) # [50,H,W]

            all_images.append(view_imgs)
            all_masks.append(view_masks)

        # Stack all views
        all_images = torch.stack(all_images, dim=0)  # [8,50,4,H,W]
        all_masks = torch.stack(all_masks, dim=0)    # [8,50,H,W]

        return all_images, all_masks, scenario_paths

# Example usage
transform = transforms.Compose([
    transforms.ToTensor(),
])



In [3]:
from torch.utils.data import DataLoader
import torchvision.transforms as transforms

print(dataset_dir)
transform = transforms.Compose([
    transforms.ToTensor(),
])

# carla_framerange = range(68, 254)
carla_framerange = range(179)

dataset = SegmentationDataset(root_dir=dataset_dir, transform=transform, frame_range=carla_framerange)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

/home/jetsonuser/masking/datasets
179 179
loaded scenario
Total scenarios loaded: 1
Total scenarios loaded: 1


In [None]:
images, masks, paths = dataset[dataset_idx]

In [None]:
print(paths[0])

['carla_v1/view_0/pointcloud-0.png', 'carla_v1/view_0/pointcloud-1.png', 'carla_v1/view_0/pointcloud-2.png', 'carla_v1/view_0/pointcloud-3.png', 'carla_v1/view_0/pointcloud-4.png', 'carla_v1/view_0/pointcloud-5.png', 'carla_v1/view_0/pointcloud-6.png', 'carla_v1/view_0/pointcloud-7.png', 'carla_v1/view_0/pointcloud-8.png', 'carla_v1/view_0/pointcloud-9.png']


In [None]:
# ── RetinaFace Setup (replace Faster R-CNN) ───────────────────────────────
import torch, cv2

# how often to re-run the detector vs. track
DETECT_EVERY = 1

# one MultiTracker per view
trackers = [cv2.legacy.MultiTracker_create() for _ in range(num_views)]


# ── Replacement detect_objects using Ultra-Light ──────────────────────────────
DETECTION_CONFIDENCE_THRESHOLD = 0.3
PRIVATE_OBJECT_CLASSES = ['person']  # used downstream by segment_all
WINDOW_SIZE = 10
DEPTH_THRESHOLD_MULTIPLIER = 75
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


# from insightface.app import FaceAnalysis
from retinaface import RetinaFace

# # initialize once at top-level
# retina_app = FaceAnalysis(allowed_modules=['detection'])
# # ctx_id=0 for GPU, -1 for CPU
# retina_app.prepare(ctx_id=0 if torch.cuda.is_available() else -1,
#                    det_size=(2048,2048))




def detect_objects(model, image_np, confidence_threshold=0.5, draw_boxes=False):
    """
    image_np: BGR uint8 H×W×3
    Returns list of {'box':[x1,y1,x2,y2],'score':…,'label':'person'}
    or, if draw_boxes=True, returns the BGR image with boxes overlaid.
    """
    # 1) run detection
    faces = RetinaFace.detect_faces(image_np)
    objects = []
    print(faces)
    for f, _ in faces.items():
        score = faces[f]['score']
        if score < confidence_threshold:
            continue
        x1,y1,x2,y2 = map(int, faces[f]['facial_area'])
        objects.append({"box":[x1,y1,x2,y2],
                        "score":float(score),
                        "label":"person"})

    if not draw_boxes:
        return objects

    # 2) draw
    out = image_np.copy()
    for obj in objects:
        x1,y1,x2,y2 = obj["box"]
        s = obj["score"]
        cv2.rectangle(out, (x1,y1),(x2,y2), (255,0,0), 2)
        cv2.putText(out, f"{s:.2f}", (x1, max(y1-5,0)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255,0,0), 1, cv2.LINE_AA)
    return out

###############################################################################
# 2) DEPTH PROFILE
###############################################################################
def calculate_depth_profile_of_box(depth_map, x1, y1, x2, y2, window_size=WINDOW_SIZE):
    """
    Return { 'mean','std','threshold','box':[x1,y1,x2,y2] } or None if empty.
    """
    half_window = window_size // 2
    cx = (x1 + x2) // 2
    cy = (y1 + y2) // 2

    x_start = max(cx - half_window, 0)
    x_end = min(cx + half_window + 1, depth_map.shape[1])
    y_start = max(cy - half_window, 0)
    y_end = min(cy + half_window + 1, depth_map.shape[0])

    depth_window = depth_map[y_start:y_end, x_start:x_end]
    depth_values = depth_window.flatten()
    depth_values = depth_values[~np.isnan(depth_values)]
    if depth_values.size == 0:
        return None

    depth_mean = float(np.mean(depth_values))
    depth_std = float(np.std(depth_values))
    depth_threshold = float(depth_std * DEPTH_THRESHOLD_MULTIPLIER)

    return {
        'mean': depth_mean,
        'std': depth_std,
        'threshold': depth_threshold,
        'box': [x1,y1,x2,y2]
    }

###############################################################################
# 3) SEGMENT CHUNK
###############################################################################

def segment_person_from_box(depth_tensor, dprof, span=1):
    """
    Similar to segment_person_from_profile_batch, but for a single bounding box
    in just 1 frame's depth or multiple frames (X frames).
    If depth_tensor: shape [X,H,W] or [H,W].
    """
    if len(depth_tensor.shape) == 2:
        # single frame => shape [H,W]
        depth_tensor = depth_tensor.unsqueeze(0)  # => [1,H,W]

    depth_mean = torch.tensor(dprof['mean'], device=depth_tensor.device, dtype=depth_tensor.dtype)
    depth_thr  = torch.tensor(dprof['threshold'], device=depth_tensor.device, dtype=depth_tensor.dtype)
    (x1,y1,x2,y2) = dprof['box']
    y2 = int(y1 + span * (y2 - y1))


    depth_diff = torch.abs(depth_tensor - depth_mean)
    mask_batch = (depth_diff <= depth_thr).to(torch.uint8)

    final_mask = torch.zeros_like(mask_batch)
    _, H, W = depth_tensor.shape
    x1_clamp = max(0, min(x1,W))
    x2_clamp = max(0, min(x2,W))
    y1_clamp = max(0, min(y1,H))
    y2_clamp = max(0, min(y2,H))

    if x2_clamp> x1_clamp and y2_clamp> y1_clamp:
        final_mask[:, y1_clamp:y2_clamp, x1_clamp:x2_clamp] = \
            mask_batch[:, y1_clamp:y2_clamp, x1_clamp:x2_clamp]

    # return shape [H,W] if single frame
    if final_mask.shape[0] == 1:
        return final_mask[0]
    return final_mask


def segment_all(depth_tensor, objects, depth_map, span):
    """
    We create a combined mask of shape [H,W] = 1 for each person's bounding box,
    EXCEPT we skip the public_box (which is the "public" person).
    depth_tensor: shape [H,W], float on GPU
    objects: detection results on CPU
    public_box: (x1,y1,x2,y2) that we skip
    depth_map: CPU 2D array for depth
    Return: torch.uint8 mask [H,W], 1=private, 0=public
    """
    H, W = depth_tensor.shape[-2], depth_tensor.shape[-1]

    combined_mask = torch.zeros((H,W), dtype=torch.uint8, device=depth_tensor.device)

    for obj in objects:
        if obj['label'] not in PRIVATE_OBJECT_CLASSES:
            continue
        box = obj['box']  # [x1,y1,x2,y2]

        # otherwise, segment this bounding box => "private"
        dprof = calculate_depth_profile_of_box(depth_map, *box)
        if dprof is None:
            continue

        single_mask = segment_person_from_box(depth_tensor, dprof, span)
        combined_mask = torch.logical_or(combined_mask.bool(), single_mask.bool()).to(torch.uint8)

    return combined_mask

###############################################################################
# 4) METRICS
###############################################################################
def dice_score_batch(pred_batch, gt_batch):
    intersection = torch.sum((pred_batch==1)&(gt_batch==1)).item()
    pred_sum = torch.sum(pred_batch==1).item()
    gt_sum = torch.sum(gt_batch==1).item()
    if (pred_sum+gt_sum)==0:
        return 1.0
    return 2.0*intersection/(pred_sum+gt_sum)

def recall_batch(pred_batch, gt_batch):
    tp = torch.sum((pred_batch==1)&(gt_batch==1)).item()
    gt_sum = torch.sum(gt_batch==1).item()
    if gt_sum==0:
        return 1.0
    return tp/gt_sum


In [None]:
# ── RetinaFace Setup (replace Faster R-CNN) ───────────────────────────────
import torch, cv2
import sys
sys.path.append('/home/jetsonuser/masking/Pytorch_Retinaface')

# how often to re-run the detector vs. track
DETECT_EVERY = 1

# one MultiTracker per view
# trackers = [cv2.MultiTracker_create() for _ in range(num_views)]


# ── Replacement detect_objects using Ultra-Light ──────────────────────────────
DETECTION_CONFIDENCE_THRESHOLD = 0.3
PRIVATE_OBJECT_CLASSES = ['person']  # used downstream by segment_all
WINDOW_SIZE = 10
DEPTH_THRESHOLD_MULTIPLIER = 75
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


# from insightface.app import FaceAnalysis
from models.retinaface import RetinaFace
from layers.functions.prior_box import PriorBox
from utils.box_utils import decode
from utils.nms.py_cpu_nms import py_cpu_nms
from data.config import cfg_mnet, cfg_re50

import numpy as np


# Constants / globals
DETECT_EVERY = 1
DETECTION_CONFIDENCE_THRESHOLD = 0.8
PRIVATE_OBJECT_CLASSES = ['person']
WINDOW_SIZE = 10
DEPTH_THRESHOLD_MULTIPLIER = 75

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model (already loaded earlier) - ensure it's on device and eval
model = RetinaFace(cfg=cfg_re50, phase='test')
# load checkpoint (you already have this logic; keep using strict=False if needed)
state = torch.load('Pytorch_Retinaface/weights/Resnet50_Final.pth', weights_only=True, map_location=device)
state_dict = state.get('state_dict', state) if isinstance(state, dict) else state
model.load_state_dict(state_dict, strict=False)
model = model.to(device)
model.eval()

# Detection helper (fixed)
def detect_objects(image_bgr, model, confidence_threshold, device):
    """
    image_bgr: HxWx3 uint8 OpenCV BGR
    Returns: numpy array of shape (N,5): [x1,y1,x2,y2,score]
    """
    img = image_bgr.astype(np.float32)
    img -= (104.0, 117.0, 123.0)
    img = img.transpose(2, 0, 1)  # C,H,W
    img_tensor = torch.from_numpy(img).unsqueeze(0).to(device)  # [1,3,H,W]

    with torch.no_grad():
        loc, conf, landms = model(img_tensor)

    # Build priorbox / decode with the same config used to instantiate the model
    priorbox = PriorBox(cfg_re50, image_size=(img_tensor.shape[2], img_tensor.shape[3]))
    priors = priorbox.forward().to(device)
    boxes = decode(loc[0], priors, cfg_re50['variance'])  # [N,4], normalized
    # scale to absolute image coords
    _, _, H, W = img_tensor.shape
    scale = torch.tensor([W, H, W, H], device=device)
    boxes = boxes * scale  # now in pixels

    scores = conf[0][:, 1]  # person class

    # Filter by score
    keep_mask = scores > confidence_threshold
    if not keep_mask.any():
        return np.zeros((0, 5), dtype=np.float32)

    boxes = boxes[keep_mask]
    scores = scores[keep_mask]
    dets = torch.cat([boxes, scores.unsqueeze(1)], dim=1)  # [M,5]

    dets = dets.cpu().numpy().astype(np.float32)
    keep = py_cpu_nms(dets, 0.4)
    dets = dets[keep]

    # Drop degenerate / too-small boxes (width or height < 2 pixels)
    filtered = []
    for x1, y1, x2, y2, score in dets:
        if (x2 - x1) < 2 or (y2 - y1) < 2:
            continue
        filtered.append([x1, y1, x2, y2, score])
    if not filtered:
        return np.zeros((0, 5), dtype=np.float32)
    return np.array(filtered, dtype=np.float32)
###############################################################################
# 2) DEPTH PROFILE
###############################################################################
def calculate_depth_profile_of_box(depth_map, x1, y1, x2, y2, window_size=WINDOW_SIZE):
    """
    Return { 'mean','std','threshold','box':[x1,y1,x2,y2] } or None if empty.
    """
    half_window = window_size // 2
    cx = (x1 + x2) // 2
    cy = (y1 + y2) // 2

    x_start = int(max(cx - half_window, 0))
    x_end = int(min(cx + half_window + 1, depth_map.shape[1]))
    y_start = int(max(cy - half_window, 0))
    y_end = int(min(cy + half_window + 1, depth_map.shape[0]))

    depth_window = depth_map[y_start:y_end, x_start:x_end]
    depth_values = depth_window.flatten()
    depth_values = depth_values[~np.isnan(depth_values)]
    if depth_values.size == 0:
        return None

    depth_mean = float(np.mean(depth_values))
    depth_std = float(np.std(depth_values))
    depth_threshold = float(depth_std * DEPTH_THRESHOLD_MULTIPLIER)

    return {
        'mean': depth_mean,
        'std': depth_std,
        'threshold': depth_threshold,
        'box': [x1,y1,x2,y2]
    }

###############################################################################
# 3) SEGMENT CHUNK
###############################################################################

def segment_person_from_box(depth_tensor, dprof, span=1):
    """
    Similar to segment_person_from_profile_batch, but for a single bounding box
    in just 1 frame's depth or multiple frames (X frames).
    If depth_tensor: shape [X,H,W] or [H,W].
    """
    if len(depth_tensor.shape) == 2:
        # single frame => shape [H,W]
        depth_tensor = depth_tensor.unsqueeze(0)  # => [1,H,W]

    depth_mean = torch.tensor(dprof['mean'], device=depth_tensor.device, dtype=depth_tensor.dtype)
    depth_thr  = torch.tensor(dprof['threshold'], device=depth_tensor.device, dtype=depth_tensor.dtype)
    (x1,y1,x2,y2) = dprof['box']
    y2 = int(y1 + span * (y2 - y1))


    depth_diff = torch.abs(depth_tensor - depth_mean)
    mask_batch = (depth_diff <= depth_thr).to(torch.uint8)

    final_mask = torch.zeros_like(mask_batch)
    _, H, W = depth_tensor.shape
    x1_clamp = max(0, min(x1,W))
    x2_clamp = max(0, min(x2,W))
    y1_clamp = max(0, min(y1,H))
    y2_clamp = max(0, min(y2,H))

    if x2_clamp> x1_clamp and y2_clamp> y1_clamp:
        final_mask[:, y1_clamp:y2_clamp, x1_clamp:x2_clamp] = \
            mask_batch[:, y1_clamp:y2_clamp, x1_clamp:x2_clamp]

    # return shape [H,W] if single frame
    if final_mask.shape[0] == 1:
        return final_mask[0]
    return final_mask


def segment_all(depth_tensor, objects, depth_map, span):
    """
    We create a combined mask of shape [H,W] = 1 for each person's bounding box,
    EXCEPT we skip the public_box (which is the "public" person).
    depth_tensor: shape [H,W], float on GPU
    objects: detection results on CPU
    public_box: (x1,y1,x2,y2) that we skip
    depth_map: CPU 2D array for depth
    Return: torch.uint8 mask [H,W], 1=private, 0=public
    """
    H, W = depth_tensor.shape[-2], depth_tensor.shape[-1]

    combined_mask = torch.zeros((H,W), dtype=torch.uint8, device=depth_tensor.device)

    for obj in objects:
        if obj['label'] not in PRIVATE_OBJECT_CLASSES:
            continue
        box = obj['box']  # [x1,y1,x2,y2]

        # otherwise, segment this bounding box => "private"
        dprof = calculate_depth_profile_of_box(depth_map, *box)
        if dprof is None:
            continue

        single_mask = segment_person_from_box(depth_tensor, dprof, span)
        combined_mask = torch.logical_or(combined_mask.bool(), single_mask.bool()).to(torch.uint8)

    return combined_mask

###############################################################################
# 4) METRICS
###############################################################################
def dice_score_batch(pred_batch, gt_batch):
    intersection = torch.sum((pred_batch==1)&(gt_batch==1)).item()
    pred_sum = torch.sum(pred_batch==1).item()
    gt_sum = torch.sum(gt_batch==1).item()
    if (pred_sum+gt_sum)==0:
        return 1.0
    return 2.0*intersection/(pred_sum+gt_sum)

def recall_batch(pred_batch, gt_batch):
    tp = torch.sum((pred_batch==1)&(gt_batch==1)).item()
    gt_sum = torch.sum(gt_batch==1).item()
    if gt_sum==0:
        return 1.0
    return tp/gt_sum


In [None]:
import time

# Allocate mask tensor [V, F, H, W]
pred_mask_full = torch.zeros(
    (num_views, num_frames, images.shape[-2], images.shape[-1]),
    dtype=torch.uint8,
    device=DEVICE
)

chunk_detection_times = []
chunk_seg_times       = []
chunk_total_times     = []

chunk_starts = list(range(0, num_frames, X))
print(f"Chunk starts: {chunk_starts}")

for start_f in chunk_starts:
    end_f = min(start_f + X, num_frames)
    seg_start = time.time()

    for v in range(num_views):
        # 1) Prepare the BGR image for this view/frame
        rgb_t = images[v, start_f, :3]                           # [3,H,W] float[0–1]
        rgb_np = (rgb_t.permute(1,2,0).cpu().numpy()*255).astype(np.uint8)
        bgr_np = cv2.cvtColor(rgb_np, cv2.COLOR_RGB2BGR)

        det_start = time.perf_counter()
        # 2) Decide: detect fresh or update tracker
        if start_f % DETECT_EVERY == 0:
            # clear & re-seed the tracker
            trackers[v] = cv2.legacy.MultiTracker_create()
            dets = detect_objects(bgr_np, None, DETECTION_CONFIDENCE_THRESHOLD, DEVICE)
            boxes = []
            for det in dets:
                x1,y1,x2,y2 = det['box']
                w, h = x2-x1, y2-y1
                # add a new CSRT tracker for this box
                trackers[v].add(cv2.legacy.TrackerCSRT_create(), bgr_np, (x1, y1, w, h))
                boxes.append([x1,y1,x2,y2])
        else:
            # update existing trackers
            ok, boxes_list = trackers[v].update(bgr_np)
            # boxes_list is a tuple of (x,y,w,h)
            boxes = [
                [int(x), int(y), int(x+w), int(y+h)]
                for (x,y,w,h) in boxes_list
            ]
        chunk_detection_times.append(time.perf_counter() - det_start)

        # 3) Build a simple list for segmentation:
        dets_for_seg = [{'box':b,'label':'person'} for b in boxes]

        # 4) Depth‐segment unchanged
        depth_map = images[v, start_f, 3].cpu().numpy()
        depth_chunk = images[v, start_f:end_f, 3]  # [X,H,W]
        private_mask = segment_all(depth_chunk, dets_for_seg, depth_map, mask_height_span)

        # 5) Store your mask
        pred_mask_full[v, start_f:end_f] = private_mask

    seg_time = time.time() - seg_start
    chunk_seg_times.append(seg_time)

# Print out timing stats
print("Detection times per chunk:", chunk_detection_times)
print("Segmentation times per chunk:", chunk_seg_times)
print("Total times per chunk:", chunk_total_times)

# (Optional) Compute metrics if using full-body masks
# if mask_height_span > .8:
#     dice = recall = 0
#     for v in range(num_views):
#         gt = (masks[v]==2).to(torch.uint8)
#         d  = dice_score_batch(pred_mask_full[v].cpu(), gt.cpu())
#         r  = recall_batch(pred_mask_full[v].cpu(), gt.cpu())
#         dice += d; recall += r
#         print(f"View {v}: Dice={d:.4f}, Recall={r:.4f}")
#     print("Avg Dice:", dice/num_views, "Avg Recall:", recall/num_views)

# Overall timing
num_chunks = len(chunk_starts)
avg_det = sum(chunk_detection_times)/len(chunk_detection_times)
avg_seg = sum(chunk_seg_times)/num_chunks
avg_tot = sum(chunk_total_times)/num_chunks

print(f"Avg detection time     = {avg_det:.4f}s")
print(f"Avg segmentation time  = {avg_seg:.4f}s")
print(f"Avg total time/chunk   = {avg_tot:.4f}s")
print(f"Total time all chunks  = {sum(chunk_total_times):.4f}s")

Chunk starts: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


TypeError: 'NoneType' object is not callable

In [None]:
print(pred_mask_full[v].size())

torch.Size([10, 720, 1280])


In [None]:
import torch
import torch.nn.functional as F

def anonymize_region_gpu(
    img_tensor: torch.Tensor,
    mask_tensor: torch.Tensor,
    method: str = "fast_mosaic",
    block: int = 16,
    noise_level: int = 25
) -> torch.Tensor:
    """
    GPU-accelerated anonymization via PyTorch.

    img_tensor: C×H×W tensor on GPU, dtype uint8 or float32[0-1]
    mask_tensor: H×W boolean tensor on same device

    Returns: anonymized C×H×W tensor on GPU, same dtype as input.
    """
    # Prepare image in float [0,1]
    was_uint8 = (img_tensor.dtype == torch.uint8)
    if was_uint8:
        img = img_tensor.to(torch.float32, device=img_tensor.device) / 255.0
    else:
        img = img_tensor

    C, H, W = img.shape
    out = img.clone()

    # Find ROI bounding box
    ys, xs = torch.where(mask_tensor)
    if ys.numel() == 0:
        # nothing to anonymize
        return img_tensor
    y1, y2 = int(ys.min().item()), int(ys.max().item()) + 1
    x1, x2 = int(xs.min().item()), int(xs.max().item()) + 1

    # Extract ROI
    roi = out[:, y1:y2, x1:x2]  # C×h×w
    mask_roi = mask_tensor[y1:y2, x1:x2]  # h×w

    if method == "fast_mosaic":
        # 1) downsample
        small = F.interpolate(
            roi.unsqueeze(0),
            size=(max(1, (y2-y1)//block), max(1, (x2-x1)//block)),
            mode="bilinear", align_corners=False
        ).squeeze(0)
        # 2) upsample
        mosaic = F.interpolate(
            small.unsqueeze(0), (y2-y1, x2-x1), mode="nearest"
        ).squeeze(0)
        # 3) add noise
        noise = torch.randint(
            -noise_level, noise_level+1,
            mosaic.shape, device=img.device, dtype=torch.float32
        ) / 255.0
        noised = torch.clamp(mosaic + noise, 0.0, 1.0)
        # 4) composite
        mask_expand = mask_roi.unsqueeze(0).expand(C, -1, -1)
        roi[mask_expand] = noised[mask_expand]
        out[:, y1:y2, x1:x2] = roi
    else:
        raise ValueError(f"Unknown method: {method}")

    # Convert back to uint8 if needed
    if was_uint8:
        return (out * 255.0).to(torch.uint8)
    return out



def anonymize_depth(original_depth_np, noise_strength=0.01, output_path=None):

    # --- Add Gaussian Noise (as before) ---
    original_depth_np = original_depth_np.copy()  # Ensure we don't modify the original data
    noise_mean_gaussian = 0.0
    noise_std_dev_gaussian = noise_strength # Example: 5 millimeters
    gaussian_noise = np.random.normal(noise_mean_gaussian, noise_std_dev_gaussian, original_depth_np.shape)
    # Apply Gaussian noise to the original masked_depth
    noisy_depth_gaussian = original_depth_np + gaussian_noise
    # --- End Add Gaussian Noise ---

    # --- Add Uniform Random Noise (Very Quick) ---
    uniform_noise_magnitude = noise_strength/2

    # Generate uniform noise within the range [-magnitude/2, +magnitude/2]
    uniform_noise = np.random.uniform(
        low=-uniform_noise_magnitude / 2.0,
        high=uniform_noise_magnitude / 2.0,
        size=noisy_depth_gaussian.shape
    )

    # Add the uniform noise to the already Gaussian-noisy depth
    return noisy_depth_gaussian + uniform_noise




In [None]:
import os
import cv2
import torch
import numpy as np
from simple_lama_inpainting import SimpleLama
from PIL import Image
import torch
from diffusers import StableDiffusionInpaintPipeline
from PIL import Image
import numpy as np
import tqdm
# from toonify_image import load_toonify_model, toonify_image_with_stylegan



PSP_MODEL_PATH = "pretrained_models/psp_toonify.pt"


model_used = True
model_used = False

if model_used:
    simple_lama = SimpleLama()

    # Load stable_diffusion from Hugging Face
    pipe = StableDiffusionInpaintPipeline.from_pretrained(
        "runwayml/stable-diffusion-inpainting",
        torch_dtype=torch.float16,
        variant="fp16",  # necessary for speed
    ).to("cuda")

    # # Load SSD-1B from Hugging Face
    # pipe = StableDiffusionInpaintPipeline.from_pretrained(
    #     "",
    #     torch_dtype=torch.float16,
    #     variant="fp16",  # necessary for speed
    # ).to("cuda")

    # load style GAN
    loaded_toonify_model = load_toonify_model(PSP_MODEL_PATH)
    print("Toonify model successfully loaded for the pipeline.")


ModuleNotFoundError: No module named 'simple_lama_inpainting'

In [None]:
def save_masked_images_gpu(pred_mask_full, images, out_folder, dilation_radius=4):
    """
    Saves masked RGB & depth frames and records timing.
    Returns dict with lists: chunk_total, chunk_anon, chunk_write.
    """
    os.makedirs(out_folder, exist_ok=True)
    V, F, H, W = pred_mask_full.shape

    # CPU kernel for dilation
    k = 2 * dilation_radius + 1
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (k, k))

    # Make view folders
    for v in range(V):
        os.makedirs(os.path.join(out_folder, "rgb",   f"view{v}"), exist_ok=True)
        os.makedirs(os.path.join(out_folder, "depth", f"view{v}"), exist_ok=True)

    chunk_total_times = []
    chunk_anonym_times = []
    chunk_write_times = []

    # Process in chunks of X frames
    for start in range(0, F, X):
        t_chunk_start = time.perf_counter()
        anon_times = []
        write_times = []

        end = min(start + X, F)
        for v in range(V):
            for f in range(start, end):
                # -- 1) dilate mask on CPU (fast) --
                mask_np = pred_mask_full[v, f].cpu().numpy().astype(np.uint8)
                mask_cpu = cv2.dilate(mask_np, kernel).astype(bool)

                # -- 2) grab depth for saving (so we don't time I/O here) --
                depth = images[v, f, 3].cpu().numpy()

                # -- 3) move tensors to GPU --
                rgb_gpu  = images[v, f, :3]                      # [3,H,W]
                mask_gpu = torch.from_numpy(mask_cpu).to(rgb_gpu.device)  # [H,W]

                # -- 4) anonymize once per chunk --
                if f == start:
                    t_anon_start = time.perf_counter()
                    anon_gpu = anonymize_region_gpu(
                        rgb_gpu, mask_gpu,
                        method="fast_mosaic",
                        block=max(1, W//16),
                        noise_level=20
                    )
                    # bring back to CPU uint8 H×W×3
                    anon_arr = (
                        anon_gpu.permute(1,2,0)
                                 .cpu()
                                 .numpy()
                    )
                    if anon_arr.dtype != np.uint8:
                        anon_arr = (anon_arr * 255).clip(0,255).astype(np.uint8)
                    anon_times.append(time.perf_counter() - t_anon_start)

                # -- 5) composite --
                orig = rgb_gpu.permute(1,2,0).cpu().numpy()
                if orig.dtype != np.uint8:
                    orig = (orig * 255).clip(0,255).astype(np.uint8)
                out_rgb = orig.copy()
                out_rgb[mask_cpu] = anon_arr[mask_cpu]

                # -- 6) write & time it --
                t_write_start = time.perf_counter()
                cv2.imwrite(
                    os.path.join(out_folder, "rgb", f"view{v}", f"{f}_masked.png"),
                    cv2.cvtColor(out_rgb, cv2.COLOR_RGB2BGR)
                )
                depth_u16 = np.clip(depth + np.random.normal(0,10,depth.shape),
                                     0,20000).astype(np.uint16)
                cv2.imwrite(
                    os.path.join(out_folder, "depth", f"view{v}", f"{f}_depth.png"),
                    depth_u16
                )
                write_times.append(time.perf_counter() - t_write_start)

        # record this chunk’s timings
        chunk_total_times.append(time.perf_counter() - t_chunk_start)
        chunk_anonym_times.append(sum(anon_times))
        chunk_write_times.append(sum(write_times))

    # return the timing lists for analysis
    return {
        "chunk_total": chunk_total_times,
        "chunk_anon":  chunk_anonym_times,
        "chunk_write": chunk_write_times
    }

# Usage:
timings = save_masked_images_gpu(
    pred_mask_full, images, output_base_directory, dilation_radius=4
)
print("Per-chunk totals:", timings["chunk_total"])
print("Per-chunk anonym times:", timings["chunk_anon"])
print("Per-chunk write times:", timings["chunk_write"])
print("Avg total:", np.mean(timings["chunk_total"]))
print("Avg anon:",  np.mean(timings["chunk_anon"]))
print("Avg write:", np.mean(timings["chunk_write"]))

In [None]:
import os
import cv2

# Paths
base = output_base_directory                  # e.g. "output/xr_lubna"
mask_root = os.path.join(base, "rgb")         # your masked imgs: rgb/view0/0_masked.png...
det_root  = os.path.join(base, "rgb_detect")

os.makedirs(det_root, exist_ok=True)

for v in range(num_views):
    mask_dir = os.path.join(mask_root, f"view{v}")
    det_dir  = os.path.join(det_root,  f"view{v}")
    os.makedirs(det_dir, exist_ok=True)
    
    for f in range(num_frames):
        mask_path = os.path.join(mask_dir, f"{f}_masked.png")
        img = cv2.imread(mask_path)
        if img is None:
            print(f"⚠️ Missing frame {mask_path}, skipping")
            continue

        # Run your detector in BGR uint8
        # (switch to detect_fast_scrfd if you prefer SCRFD)
        det_img = detect_objects(None, img,
                                 confidence_threshold=DETECTION_CONFIDENCE_THRESHOLD,
                                 draw_boxes=True)

        out_path = os.path.join(det_dir, f"{f}_detect.png")
        cv2.imwrite(out_path, det_img)

print("✅ All detection‐overlay images saved under:", det_root)

masking image 0
{'face_1': {'score': 0.998749315738678, 'facial_area': [1129, 488, 1169, 536], 'landmarks': {'right_eye': [1134.6956, 505.5952], 'left_eye': [1148.9933, 509.3922], 'nose': [1133.8142, 516.02136], 'mouth_right': [1133.5044, 524.1914], 'mouth_left': [1142.848, 526.9775]}}, 'face_2': {'score': 0.9778648018836975, 'facial_area': [829, 407, 843, 426], 'landmarks': {'right_eye': [835.72406, 415.71954], 'left_eye': [841.5414, 415.94122], 'nose': [839.27496, 419.1503], 'mouth_right': [835.76654, 421.97803], 'mouth_left': [840.411, 422.2338]}}, 'face_3': {'score': 0.964948296546936, 'facial_area': [635, 388, 643, 397], 'landmarks': {'right_eye': [637.9238, 391.67953], 'left_eye': [641.2111, 391.77588], 'nose': [639.5469, 393.5299], 'mouth_right': [638.10187, 395.31577], 'mouth_left': [640.762, 395.38675]}}}


error: OpenCV(4.11.0) :-1: error: (-5:Bad argument) in function 'imwrite'
> Overload resolution failed:
>  - img is not a numpy array, neither a scalar
>  - Expected Ptr<cv::UMat> for argument 'img'


In [None]:
import os
import time
import numpy as np
import cv2
import torch
import torch.nn.functional as F

def _save_debug_views(
    rgb_gpu, anon_gpu, mask_gpu, out_rgb_gpu,
    out_folder, v, f, suffix="",
):
    """Helper to dump debug diagnostics for a frame."""
    # Ensure everything is in [0,1]
    def to_uint8(tensor):
        return (
            tensor.clamp(0, 1)
                  .mul(255)
                  .to(torch.uint8)
                  .permute(1, 2, 0)  # HWC
                  .cpu()
                  .numpy()
        )

    orig = to_uint8(rgb_gpu)
    anon = to_uint8(anon_gpu)
    comp = to_uint8(out_rgb_gpu)
    mask_vis = (mask_gpu.cpu().numpy().astype(np.uint8) * 255)

    view_dir = os.path.join(out_folder, "debug", f"view{v}")
    os.makedirs(view_dir, exist_ok=True)

    cv2.imwrite(os.path.join(view_dir, f"{f:06d}_orig{suffix}.png"), cv2.cvtColor(orig, cv2.COLOR_RGB2BGR))
    cv2.imwrite(os.path.join(view_dir, f"{f:06d}_anon{suffix}.png"), cv2.cvtColor(anon, cv2.COLOR_RGB2BGR))
    cv2.imwrite(os.path.join(view_dir, f"{f:06d}_mask{suffix}.png"), mask_vis)
    cv2.imwrite(os.path.join(view_dir, f"{f:06d}_comp{suffix}.png"), cv2.cvtColor(comp, cv2.COLOR_RGB2BGR))


def save_masked_images_gpu(
    pred_mask_full,
    images,
    out_folder,
    X,
    dilation_radius=4,
    device=None,
    debug_frames=2  # how many frames per view to dump debug for
):
    """
    Saves masked RGB & depth frames with anonymization, with diagnostics and fallback.
    pred_mask_full: [V, F, H, W] uint8
    images: [V, F, C, H, W] with C>=4, RGB expected in [:3] in [0,1] floats (or [0,255], auto-normalized)
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    os.makedirs(out_folder, exist_ok=True)
    V, F, H, W = pred_mask_full.shape

    # CPU kernel for dilation
    k = 2 * dilation_radius + 1
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (k, k))

    # Make view folders
    for v in range(V):
        os.makedirs(os.path.join(out_folder, "rgb", f"view{v}"), exist_ok=True)
        os.makedirs(os.path.join(out_folder, "depth", f"view{v}"), exist_ok=True)

    chunk_total_times = []
    chunk_anonym_times = []
    chunk_write_times = []

    for start in range(0, F, X):
        t_chunk_start = time.perf_counter()
        anon_times = []
        write_times = []

        end = min(start + X, F)
        for v in range(V):
            for f in range(start, end):
                # -- 1) dilate mask on CPU --
                mask_tensor = pred_mask_full[v, f]  # uint8
                mask_np = mask_tensor.cpu().numpy().astype(np.uint8)
                dilated = cv2.dilate(mask_np, kernel)
                mask_bool = dilated.astype(bool)
                mask_gpu = torch.from_numpy(mask_bool).to(device)  # [H,W], bool-like

                # -- 2) depth for saving --
                depth = images[v, f, 3].cpu().numpy()

                # -- 3) prepare RGB on GPU and normalize if needed --
                rgb_gpu = images[v, f, :3].to(device)  # [3,H,W]
                # Heuristic: if in 0-255 range, scale to 0-1
                if rgb_gpu.max() > 1.5:
                    rgb_gpu = rgb_gpu / 255.0

                # -- 4) anonymize per frame --
                t_anon_start = time.perf_counter()
                anon_gpu = anonymize_region_gpu(
                    rgb_gpu, mask_gpu,
                    method="fast_mosaic",
                    block=max(1, W // 16),
                    noise_level=20
                )  # expect [3,H,W] in same range as rgb_gpu
                anon_time = time.perf_counter() - t_anon_start
                anon_times.append(anon_time)

                # -- 5) fallback if anonymization looks degenerate (e.g., all zeros or very low variance) --
                # Use simple mosaic if anon_gpu is near-zero or identical to input
                with torch.no_grad():
                    # Check signal strength
                    if anon_gpu.abs().max() < 1e-3 or torch.allclose(anon_gpu, rgb_gpu, atol=1e-3):
                        block = max(1, W // 16)
                        small = F.avg_pool2d(rgb_gpu.unsqueeze(0), kernel_size=block, stride=block, padding=0)
                        up = F.interpolate(small, size=(H, W), mode="nearest").squeeze(0)
                        anon_gpu = up  # override

                # -- 6) composite on GPU --
                mask_expand = mask_gpu.unsqueeze(0)  # [1,H,W]
                out_rgb_gpu = torch.where(mask_expand, anon_gpu, rgb_gpu)

                # Dump debug for first few frames per view
                if f < debug_frames:
                    _save_debug_views(
                        rgb_gpu, anon_gpu, mask_gpu, out_rgb_gpu,
                        out_folder, v, f
                    )

                # -- 7) write out --
                t_write_start = time.perf_counter()
                out_rgb = (
                    out_rgb_gpu.clamp(0, 1)
                               .mul(255)
                               .to(torch.uint8)
                               .permute(1, 2, 0)  # HWC
                               .cpu()
                               .numpy()
                )
                fname_rgb = f"{f:06d}_masked.png"
                cv2.imwrite(
                    os.path.join(out_folder, "rgb", f"view{v}", fname_rgb),
                    cv2.cvtColor(out_rgb, cv2.COLOR_RGB2BGR)
                )
                # Depth with small visualization noise
                depth_u16 = np.clip(depth + np.random.normal(0, 10, depth.shape), 0, 20000).astype(np.uint16)
                fname_depth = f"{f:06d}_depth.png"
                cv2.imwrite(
                    os.path.join(out_folder, "depth", f"view{v}", fname_depth),
                    depth_u16
                )
                write_times.append(time.perf_counter() - t_write_start)

        chunk_total_times.append(time.perf_counter() - t_chunk_start)
        chunk_anonym_times.append(sum(anon_times))
        chunk_write_times.append(sum(write_times))

    return {
        "chunk_total": chunk_total_times,
        "chunk_anon": chunk_anonym_times,
        "chunk_write": chunk_write_times
    }

In [None]:
fps = 1/((avg_anon + avg_seg)/ X)
print(f"Estimated FPS: {fps:.2f} frames per second")

Estimated FPS: 24.24 frames per second


In [None]:
import cv2
import os
from glob import glob

fps=30
# Set parameters
num_frames = num_frames  # Already defined in your notebook

os.makedirs(video_output_dir, exist_ok=True)
for view_idx in range(num_views):
    output_path = os.path.join(video_output_dir, f"view{view_idx}.mp4")
    
    input_video_pattern = input_video_base_path + f"{view_idx}/{{}}_masked.png"
    # input_video_pattern = input_video_base_path + f"{view_idx}/{{}}_detect.png"

    # Read the first frame to get the size
    first_frame_path = input_video_pattern.format(0)
    first_frame = cv2.imread(first_frame_path)
    if first_frame is None:
        raise FileNotFoundError(f"First frame not found: {first_frame_path}")
    height, width, layers = first_frame.shape

    # Define the video writer
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    video_writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    # Write frames to video
    for i in range(num_frames):
        frame_path = input_video_pattern.format(i)
        frame = cv2.imread(frame_path)
        if frame is None:
            print(f"Warning: Frame not found: {frame_path}, skipping.")
            continue
        video_writer.write(frame)

    video_writer.release()
    print(f"Video saved to {output_path}")

Video saved to output/carla_v1/videos/view0.mp4


In [None]:
print(DEVICE)

cuda
