In [2]:
import torch, torchvision
from torchvision.transforms import functional as TF
from PIL import Image
import numpy as np
import cv2, os
from pathlib import Path
from tqdm import tqdm
import shutil, itertools

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

ModuleNotFoundError: No module named 'tqdm'

In [9]:
# Build torchvision's Mask R-CNN (ResNet-50 + FPN) with its default
# COCO-pre-trained weights.
model = torchvision.models.detection.maskrcnn_resnet50_fpn(weights="DEFAULT")

# Place the network on the selected device, then switch it to evaluation
# behaviour.  `.eval()` returns the module, so the cell still displays it.
model = model.to(device)
model.eval()

MaskRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(in

In [5]:
@torch.inference_mode()
def mask_humans_pil(
    img: Image.Image,
    score_thresh: float = 0.5,
    fill_mode: str = "mean",   # "mean" | "black" | "gray"
):
    """
    Return a copy of `img` where every pixel inside a detected person
    is replaced by the frame average (paper-style) or a solid colour.

    Parameters
    ----------
    img : PIL.Image.Image
        Frame to process.  It is never modified; a new image is returned.
    score_thresh : float
        Only detections whose score is strictly greater than this value
        are masked.
    fill_mode : str
        "mean"  -> per-channel average of the whole frame,
        "gray"  -> constant 127,
        "black" -> constant 0.

    Returns
    -------
    PIL.Image.Image
        A plain copy of `img` when no person is detected, otherwise a new
        image with the union of all person masks painted over.

    Raises
    ------
    ValueError
        If `fill_mode` is not one of the three supported names.
    """
    # Fail fast on a typo instead of silently painting black after a full
    # forward pass through the network.
    if fill_mode not in ("mean", "black", "gray"):
        raise ValueError(
            f"fill_mode must be 'mean', 'black' or 'gray', got {fill_mode!r}"
        )

    tensor = TF.to_tensor(img).to(device).unsqueeze(0)        # 1×C×H×W
    out = model(tensor)[0]                                    # dict

    # Label 1 is the person class for the COCO-trained detector.
    keep = (out["labels"] == 1) & (out["scores"] > score_thresh)
    if not keep.any():
        return img.copy()                                     # no humans

    # Union of all person masks.  torchvision already returns per-instance
    # mask *probabilities* in [0, 1], so they are thresholded directly;
    # applying another sigmoid would squash them into [0.5, 0.73] and make
    # `> 0.5` true for almost every pixel of each bounding box.
    masks = (out["masks"][keep, 0] > 0.5).any(0)              # H×W bool
    mask_np = masks.cpu().numpy()

    arr = np.array(img)
    if fill_mode == "mean":
        # Per-channel mean over both spatial axes, cast back to the pixel dtype.
        fill_value = arr.mean(axis=(0, 1)).astype(arr.dtype)
    elif fill_mode == "gray":
        fill_value = 127
    else:  # "black"
        fill_value = 0
    arr[mask_np] = fill_value
    return Image.fromarray(arr)


In [6]:
def mask_folder(
    in_root: str | Path,
    out_root: str | Path,
    *,
    ext: str = ".jpg",
    score_thresh: float = 0.5,
    recursive: bool = True,
    clear_out: bool = True,
    fill_mode: str = "mean",
):
    """
    Walk through `in_root` (optionally deep), run Mask R-CNN on every file that
    ends with `ext`, and write the masked copy to the parallel structure under
    `out_root`.  Existing contents of `out_root` are deleted if `clear_out`.

    Parameters
    ----------
    in_root : str | Path
        Directory containing the source frames.
    out_root : str | Path
        Directory that receives the masked frames, mirroring the layout
        below `in_root`.
    ext : str
        File-name suffix used to select frames (e.g. ".jpg").
    score_thresh : float
        Forwarded to `mask_humans_pil`.
    recursive : bool
        Search sub-directories of `in_root` as well when true.
    clear_out : bool
        Remove `out_root` and everything in it before writing.
    fill_mode : str
        Forwarded to `mask_humans_pil` ("mean" | "black" | "gray").

    Raises
    ------
    NotADirectoryError
        If `in_root` is not an existing directory.
    ValueError
        If `clear_out` is set and `out_root` is `in_root` or one of its
        ancestors, because clearing it would delete the input frames.
    """
    in_root, out_root = Path(in_root), Path(out_root)

    # Validate the input *before* anything is deleted, so a mistyped
    # `in_root` cannot wipe a previous run's results.
    if not in_root.is_dir():
        raise NotADirectoryError(f"in_root is not a directory: {in_root}")

    in_abs, out_abs = in_root.resolve(), out_root.resolve()
    if clear_out and (out_abs == in_abs or out_abs in in_abs.parents):
        raise ValueError(
            f"refusing to clear out_root={out_root}: it contains in_root={in_root}"
        )
    if clear_out and out_root.exists():
        shutil.rmtree(out_root)

    pattern = f"**/*{ext}" if recursive else f"*{ext}"
    # When the output directory lives inside the input tree, frames written
    # by an earlier run must not be picked up as inputs again.
    out_is_nested = in_abs in out_abs.parents
    files = sorted(
        p
        for p in in_root.glob(pattern)
        if p.is_file() and not (out_is_nested and out_abs in p.resolve().parents)
    )
    print(f"{len(files)} frame(s) found")

    for src in tqdm(files):
        rel = src.relative_to(in_root)
        dst = out_root / rel
        dst.parent.mkdir(parents=True, exist_ok=True)

        # `convert` returns an independent in-memory image, so the file
        # handle can be released right away instead of leaking per frame.
        with Image.open(src) as opened:
            img = opened.convert("RGB")
        img_masked = mask_humans_pil(
            img, score_thresh=score_thresh, fill_mode=fill_mode
        )
        img_masked.save(dst)


In [10]:
# Source frames to process, e.g. ".../train_frames".
in_root = "datasets/UCF-101-JPG/Archery/"
# Destination for the masked frames, e.g. ".../train_frames_masked".
# NOTE: with the default `clear_out=True` this directory is wiped first.
out_root = "masking/"

mask_folder(
    in_root=in_root,
    out_root=out_root,
    ext=".jpg",
    score_thresh=0.55,
)

160 frame(s) found


100%|██████████| 160/160 [00:10<00:00, 15.20it/s]
