In [1]:
%env CUDA_VISIBLE_DEVICES=1
import glob
import math
import os
import random

import torch
from vggt.models.vggt import VGGT
from vggt.utils.load_fn import load_and_preprocess_images
from vggt.utils.pose_enc import pose_encoding_to_extri_intri
from vggt.utils.geometry import unproject_depth_map_to_point_map
import open3d as o3d
import numpy as np
from matplotlib import pyplot as plt
import cv2
from tqdm import tqdm,trange

from utils import *

env: CUDA_VISIBLE_DEVICES=1
Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


In [2]:
# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# bfloat16 is supported on Ampere GPUs (Compute Capability 8.0+).
# The capability is only queried when CUDA is present: get_device_capability()
# raises on CPU-only machines, which would defeat the "cpu" fallback above.
if device == "cuda" and torch.cuda.get_device_capability()[0] >= 8:
    dtype = torch.bfloat16
else:
    dtype = torch.float16

In [3]:
# Build the VGGT network and restore its weights from a local checkpoint.
model = VGGT()
_URL = "./model.pt"  # local checkpoint path (despite the name, not a URL)
# map_location="cpu": deserialize on the CPU so the load does not depend on the GPU
# index the checkpoint was saved from (only one device is visible in this session).
# weights_only=True: restrict unpickling to tensors/state-dict containers, which
# avoids executing arbitrary pickled code and silences the torch.load FutureWarning.
model.load_state_dict(torch.load(_URL, map_location="cpu", weights_only=True))
model.eval()  # inference mode: disables dropout / batch-norm updates
model = model.to(device)

  model.load_state_dict(torch.load(_URL))


#### 不固定frontal view

In [5]:
# Subject ids 1..20 (inclusive) to be processed by the extraction loop below.
idxs = [subject_id for subject_id in range(1, 21)]

In [5]:
def _read_subject_views(img_path, depth_path, mask_path, params_path, load_indices):
    """Read the requested views of one subject from disk.

    For every index ``k`` in ``load_indices`` this reads ``{k}.png`` from ``img_path``,
    ``{k}.npy`` from ``depth_path`` and ``mask_path``, and the ``extr`` / ``intr``
    entries of ``{k}.npz`` from ``params_path``.

    Returns:
        (images, depth_map, extrinsic, intrinsic, masks) as stacked numpy arrays.
        Images are converted from BGR to RGB and divided by 255.

    Raises:
        FileNotFoundError: if an image cannot be read.

    NOTE(review): the data is presumably already resized/cropped to 518x518 with the
    intrinsics adjusted to match (the accumulators below assume 518x518) - confirm.
    """
    images, depth_map, extrinsic, intrinsic, masks = [], [], [], [], []
    for load_index in load_indices:
        img_file = os.path.join(img_path, f"{load_index}.png")
        img = cv2.imread(img_file)
        if img is None:
            # cv2.imread reports failure by returning None instead of raising.
            raise FileNotFoundError(f"could not read image: {img_file}")
        images.append(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        depth_map.append(np.load(os.path.join(depth_path, f"{load_index}.npy")))
        masks.append(np.load(os.path.join(mask_path, f"{load_index}.npy")))
        params = np.load(os.path.join(params_path, f"{load_index}.npz"))
        extrinsic.append(params["extr"])
        intrinsic.append(params["intr"])
    return (np.array(images) / 255., np.array(depth_map), np.array(extrinsic),
            np.array(intrinsic), np.array(masks))


# For every subject: average the VGGT track-head feature map of each view over many
# random 16-view subsets, then save the masked 3D points and their features.
for idx in tqdm(idxs):
    idx = str(idx)

    base_path = "/root/autodl-tmp/facescape"
    img_path = os.path.join(base_path, "mv_image", idx)
    depth_path = os.path.join(base_path, "depth", idx)
    mask_path = os.path.join(base_path, "mask", idx)
    params_path = os.path.join(base_path, "params", idx)

    # Views are addressed as 0.png .. (n_imgs-1).png, so the directory entry count
    # is used as the number of views.
    n_imgs = len(os.listdir(img_path))
    all_images, all_depth_maps, all_extrinsics, all_intrinsics, all_masks = _read_subject_views(
        img_path, depth_path, mask_path, params_path, range(n_imgs))

    def load_data(load_indices):
        """Select the given views from the arrays preloaded for the current subject."""
        load_indices = list(load_indices)
        return (all_images[load_indices], all_depth_maps[load_indices],
                all_extrinsics[load_indices], all_intrinsics[load_indices],
                all_masks[load_indices])

    # Draw 1000 distinct sorted 16-view subsets. Without this guard the loop below
    # would never terminate when fewer than 1000 distinct subsets exist.
    if math.comb(n_imgs, 16) < 1000:
        raise ValueError(
            f"subject {idx}: {n_imgs} views do not allow 1000 distinct 16-view subsets")
    load_indices_samples = set()
    while len(load_indices_samples) < 1000:
        load_indices_samples.add(tuple(sorted(random.sample(range(n_imgs), 16))))
    load_indices_samples = list(load_indices_samples)
    train_indices = load_indices_samples[:800]
    val_indices = load_indices_samples[800:900]
    test_indices = load_indices_samples[900:]

    # Running per-view mean of the feature maps and the number of maps averaged so far.
    feature_maps_avg = np.zeros((n_imgs, 518, 518, 128), dtype=np.float32)
    n_feature_maps = np.zeros((n_imgs), dtype=np.float32)

    # Only the first 400 of the 800 training subsets are used for averaging.
    for i in range(400):
        load_indices = train_indices[i]
        # Only the images are needed here; depth and camera data stay on the CPU.
        images, depth_map, extrinsic, intrinsic, masks = load_data(load_indices)
        images = torch.from_numpy(images).to(device).float()
        images = images.permute(0, 3, 1, 2) # 16, 518, 518, 3 -> 16, 3, 518, 518

        with torch.no_grad():
            # torch.autocast replaces the deprecated torch.cuda.amp.autocast.
            with torch.autocast(device_type="cuda", dtype=dtype):
                images = images[None]  # add batch dimension
                aggregated_tokens_list, ps_idx = model.aggregator(images)
            feature_maps = model.track_head.feature_extractor(aggregated_tokens_list, images, ps_idx)[0]
            del images
            feature_maps = torch.nn.functional.interpolate(feature_maps, size=(518, 518), mode='bilinear', align_corners=False)
            feature_maps = feature_maps.permute(0, 2, 3, 1) # 16, 518, 518, 128
            feature_maps_ = feature_maps

        # Incremental mean: avg <- avg * n / (n + 1) + new / (n + 1), one view at a time
        # so that only a single feature map is copied to the host per step.
        for j in range(len(load_indices)):
            index = load_indices[j]
            feature_maps_avg[index] = feature_maps_avg[index] * n_feature_maps[index] / (n_feature_maps[index]+1) + feature_maps_[j].detach().cpu().numpy()/ (n_feature_maps[index]+1)
            n_feature_maps[index] += 1

    # Back-project every depth map with its camera parameters.
    all_points = unproject_depth_map_to_point_map(torch.from_numpy(all_depth_maps).to(device).unsqueeze(-1),
                                    torch.from_numpy(all_extrinsics).to(device),
                                    torch.from_numpy(all_intrinsics).to(device)) # presumably n_imgs, 518, 518, 3
    # NOTE(review): the scale factor 4 is not explained in this notebook - confirm its origin.
    all_points = all_points * 4
    # Keep only the pixels selected by the masks.
    all_points = all_points[all_masks > 0] # K,3
    all_features = feature_maps_avg[all_masks > 0] # K,128

    out_dir = os.path.join(base_path, "all_features", idx)
    os.makedirs(out_dir, exist_ok=True)
    np.save(os.path.join(out_dir, "all_points.npy"), all_points)
    np.save(os.path.join(out_dir, "all_features.npy"), all_features)

    

  0%|          | 0/20 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(dtype=dtype):
100%|██████████| 20/20 [14:31:54<00:00, 2615.71s/it]  


In [9]:
# feature_maps_avg = np.zeros((n_imgs, 518, 518, 128), dtype=np.float32)
# n_feature_maps = np.zeros((n_imgs), dtype=np.float32)


# batch_size = 1
# for i in trange(0,800,batch_size):
#     load_indices_this_batch = train_indices[i:i+batch_size]
#     images_batch = []
#     for item_ind, load_indices in enumerate(load_indices_this_batch):
#         images, depth_map, extrinsic, intrinsic, masks = load_data(load_indices)
#         images = torch.from_numpy(images).to(device).float()
#         depth_map = torch.from_numpy(depth_map).to(device).unsqueeze(-1)
#         extrinsic = torch.from_numpy(extrinsic).to(device)
#         intrinsic = torch.from_numpy(intrinsic).to(device)
#         images = images.permute(0, 3, 1, 2) # 16, 518, 518, 3 -> 16, 3, 518, 518
#         images_batch.append(images)
#     images = torch.stack(images_batch, dim=0)  # batch_size, 16, 3, 518, 518
#     # change images to continuous memory layout
#     # images = images.contiguous()
    
#     with torch.no_grad():
#         with torch.cuda.amp.autocast(dtype=dtype):
#             aggregated_tokens_list, ps_idx = model.aggregator(images)
#         feature_maps = model.track_head.feature_extractor(aggregated_tokens_list, images, ps_idx) # batch_size, 16, 128, 259, 259
#         del images
#         # convert feature_maps to batch_size*16, 128, 518, 518
#         feature_maps = feature_maps.view(-1, feature_maps.shape[2], feature_maps.shape[3], feature_maps.shape[4])  # batch_size*16, 128, 259, 259
#         feature_maps = torch.nn.functional.interpolate(feature_maps, size=(518, 518), mode='bilinear', align_corners=False) # batch_size * 16, 128, 518, 518
#         feature_maps = feature_maps.view(len(load_indices_this_batch), 16, 128, 518, 518)
#         feature_maps = feature_maps.permute(0, 1, 3, 4, 2)
#         feature_maps_batch = feature_maps
        
#     for item_ind, load_indices in enumerate(load_indices_this_batch):
#         feature_maps_ = feature_maps_batch[item_ind]
#         for j in range(len(load_indices)):
#             index = load_indices[j]
#             feature_maps_avg[index] = feature_maps_avg[index] * n_feature_maps[index] / (n_feature_maps[index]+1) + feature_maps_[j].detach().cpu().numpy()/ (n_feature_maps[index]+1)
#             n_feature_maps[index] += 1

  0%|          | 0/800 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(dtype=dtype):
  6%|▋         | 51/800 [05:04<1:13:30,  5.89s/it]

In [7]:
# Folder holding the test views; every PNG in it is used, in lexicographic order.
path = "./test_model/shadow1_16view"

image_names = glob.glob(os.path.join(path, "*.png"))
image_names.sort()
n_imgs = len(image_names)

In [10]:
# Average the VGGT track-head feature map of every test image over 100 random
# 16-image draws (the draw order changes which image comes first).
feature_maps_avg = np.zeros((n_imgs, 518, 518, 128), dtype=np.float32)
n_feature_maps = np.zeros((n_imgs), dtype=np.float32)
for i in trange(100):
    load_image_names = random.sample(image_names, 16)
    images = load_and_preprocess_images(load_image_names).to(device)
    with torch.no_grad():
        # torch.autocast replaces the deprecated torch.cuda.amp.autocast.
        with torch.autocast(device_type="cuda", dtype=dtype):
            images = images[None]  # add batch dimension
            aggregated_tokens_list, ps_idx = model.aggregator(images)
        feature_maps = model.track_head.feature_extractor(aggregated_tokens_list, images, ps_idx)[0]
        feature_maps = torch.nn.functional.interpolate(feature_maps, size=(518, 518), mode='bilinear', align_corners=False)
        feature_maps = feature_maps.permute(0, 2, 3, 1) # 16, 518, 518, 128
        feature_maps_ = feature_maps.detach().cpu().numpy()
    # Iterate over the images actually drawn for this pass. Looping over
    # range(len(image_names)) indexed past the 16 drawn names whenever the
    # folder holds more than 16 images.
    for j, load_image_name in enumerate(load_image_names):
        index = image_names.index(load_image_name)
        # Incremental mean: avg <- avg * n / (n + 1) + new / (n + 1)
        feature_maps_avg[index] = feature_maps_avg[index] * n_feature_maps[index] / (n_feature_maps[index]+1) + feature_maps_[j]/ (n_feature_maps[index]+1)
        n_feature_maps[index] += 1
        

  with torch.cuda.amp.autocast(dtype=dtype):
100%|██████████| 100/100 [12:40<00:00,  7.60s/it]


In [12]:
# Persist the averaged per-image feature maps next to the notebook.
np.save("./feature_maps.npy", arr=feature_maps_avg)

#### 固定frontal view

In [15]:
# Subject id (as a string) -> index of that subject's frontal-view image.
# Covers subjects 1 and 3..40; subject "2" has no entry in this batch.
frontal_view_img_idx_dict = dict((
    ("1", 0), ("3", 15), ("4", 0), ("5", 14), ("6", 15), ("7", 15), ("8", 30), ("9", 15), ("10", 0),
    ("11", 14), ("12", 22), ("13", 45), ("14", 18), ("15", 34), ("16", 0), ("17", 14), ("18", 0), ("19", 0), ("20", 33),
    ("21", 15), ("22", 0), ("23", 0), ("24", 0), ("25", 18), ("26", 0), ("27", 0), ("28", 18), ("29", 15), ("30", 14),
    ("31", 43), ("32", 14), ("33", 15), ("34", 15), ("35", 0), ("36", 0), ("37", 15), ("38", 0), ("39", 17), ("40", 18),
))

In [22]:
# Subject id (as a string) -> index of that subject's frontal-view image, subjects 41..50.
# This assignment replaces any mapping previously bound to the same name.
frontal_view_img_idx_dict = dict((
    ("41", 37), ("42", 34), ("43", 24), ("44", 14), ("45", 15),
    ("46", 0), ("47", 0), ("48", 0), ("49", 18), ("50", 18),
))

In [23]:
def _read_subject_views(img_path, depth_path, mask_path, params_path, load_indices):
    """Read the requested views of one subject from disk.

    For every index ``k`` in ``load_indices`` this reads ``{k}.png`` from ``img_path``,
    ``{k}.npy`` from ``depth_path`` and ``mask_path``, and the ``extr`` / ``intr``
    entries of ``{k}.npz`` from ``params_path``.

    Returns:
        (images, depth_map, extrinsic, intrinsic, masks) as stacked numpy arrays.
        Images are converted from BGR to RGB and divided by 255.

    Raises:
        FileNotFoundError: if an image cannot be read.

    NOTE(review): the data is presumably already resized/cropped to 518x518 with the
    intrinsics adjusted to match (the accumulators below assume 518x518) - confirm.
    """
    images, depth_map, extrinsic, intrinsic, masks = [], [], [], [], []
    for load_index in load_indices:
        img_file = os.path.join(img_path, f"{load_index}.png")
        img = cv2.imread(img_file)
        if img is None:
            # cv2.imread reports failure by returning None instead of raising.
            raise FileNotFoundError(f"could not read image: {img_file}")
        images.append(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        depth_map.append(np.load(os.path.join(depth_path, f"{load_index}.npy")))
        masks.append(np.load(os.path.join(mask_path, f"{load_index}.npy")))
        params = np.load(os.path.join(params_path, f"{load_index}.npz"))
        extrinsic.append(params["extr"])
        intrinsic.append(params["intr"])
    return (np.array(images) / 255., np.array(depth_map), np.array(extrinsic),
            np.array(intrinsic), np.array(masks))


# For every subject: average the VGGT track-head feature map of each view over many
# 16-view subsets whose first view is always the subject's frontal view, then save
# the masked features.
for idx in tqdm(frontal_view_img_idx_dict.keys()):
    torch.cuda.empty_cache()

    frontal_view_img_idx = frontal_view_img_idx_dict[idx]
    base_path = "/root/autodl-tmp/facescape"
    img_path = os.path.join(base_path, "mv_image", idx)
    depth_path = os.path.join(base_path, "depth", idx)
    mask_path = os.path.join(base_path, "mask", idx)
    params_path = os.path.join(base_path, "params", idx)

    # Views are addressed as 0.png .. (n_imgs-1).png, so the directory entry count
    # is used as the number of views.
    n_imgs = len(os.listdir(img_path))
    if not 0 <= frontal_view_img_idx < n_imgs:
        # An out-of-range frontal index could never be drawn, so sampling would spin forever.
        raise ValueError(
            f"subject {idx}: frontal view index {frontal_view_img_idx} is outside 0..{n_imgs - 1}")
    all_images, all_depth_maps, all_extrinsics, all_intrinsics, all_masks = _read_subject_views(
        img_path, depth_path, mask_path, params_path, range(n_imgs))

    def load_data(load_indices):
        """Select the given views from the arrays preloaded for the current subject."""
        load_indices = list(load_indices)
        return (all_images[load_indices], all_depth_maps[load_indices],
                all_extrinsics[load_indices], all_intrinsics[load_indices],
                all_masks[load_indices])

    # Draw 1000 distinct subsets of the form (frontal view, 15 other views in ascending
    # order). Sampling the 15 companions directly yields the same uniform distribution
    # as shuffling all views and rejecting draws without the frontal view.
    other_views = [view for view in range(n_imgs) if view != frontal_view_img_idx]
    if math.comb(len(other_views), 15) < 1000:
        raise ValueError(
            f"subject {idx}: {n_imgs} views do not allow 1000 distinct 16-view subsets")
    load_indices_samples = set()
    while len(load_indices_samples) < 1000:
        companions = sorted(random.sample(other_views, 15))
        load_indices_samples.add((frontal_view_img_idx, *companions))
    load_indices_samples = list(load_indices_samples)
    train_indices = load_indices_samples[:800]
    val_indices = load_indices_samples[800:900]
    test_indices = load_indices_samples[900:]

    # Per-view sum of feature maps and per-view sample count, allocated directly on
    # the target device (no intermediate host allocation).
    feature_maps_sum = torch.zeros((n_imgs, 518, 518, 128), dtype=torch.float32, device=device)
    n_feature_maps = torch.zeros((n_imgs), dtype=torch.float32, device=device)

    # Only the first 400 of the 800 training subsets are used for averaging.
    for i in range(400):
        load_indices = list(train_indices[i])
        images, depth_map, extrinsic, intrinsic, masks = load_data(load_indices)
        images = torch.from_numpy(images).to(device).float()
        images = images.permute(0, 3, 1, 2) # 16, 518, 518, 3 -> 16, 3, 518, 518

        with torch.no_grad():
            # torch.autocast replaces the deprecated torch.cuda.amp.autocast.
            with torch.autocast(device_type="cuda", dtype=dtype):
                images = images[None]  # add batch dimension
                aggregated_tokens_list, ps_idx = model.aggregator(images)
            feature_maps = model.track_head.feature_extractor(aggregated_tokens_list, images, ps_idx)[0]
            del images
            del aggregated_tokens_list
            feature_maps = torch.nn.functional.interpolate(feature_maps, size=(518, 518), mode='bilinear', align_corners=False)
            feature_maps = feature_maps.permute(0, 2, 3, 1) # 16, 518, 518, 128

            # The indices within one subset are distinct, so indexed in-place
            # accumulation touches every selected view exactly once.
            feature_maps_sum[load_indices] += feature_maps
            n_feature_maps[load_indices] += 1

    # Views that were never drawn would give 0 / 0 = NaN; keep them at zero instead.
    unsampled = n_feature_maps == 0
    if bool(unsampled.any()):
        tqdm.write(f"subject {idx}: {int(unsampled.sum())} view(s) were never sampled; "
                   "their averaged features are left at zero")
    feature_maps_avg = feature_maps_sum / n_feature_maps.clamp(min=1)[:, None, None, None]
    feature_maps_avg = feature_maps_avg.detach().cpu().numpy()
    # Keep only the pixels selected by the masks.
    all_features = feature_maps_avg[all_masks > 0] # K,128

    out_dir = os.path.join(base_path, "all_features", idx)
    os.makedirs(out_dir, exist_ok=True)
    np.save(os.path.join(out_dir, "all_features.npy"), all_features)

  0%|          | 0/10 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(dtype=dtype):
100%|██████████| 10/10 [1:53:03<00:00, 678.33s/it]


In [25]:
import shutil

# Copy each subject's point file from the "no frontal" feature folder into the
# main feature folder, creating the destination folder when it is missing.
for idx in trange(1,101):
    idx = str(idx)
    source_file = f"/root/autodl-tmp/facescape/all_features_no_frontal/{idx}/all_points.npy"
    target_dir = f"/root/autodl-tmp/facescape/all_features/{idx}"
    os.makedirs(target_dir, exist_ok=True)
    shutil.copy(source_file, f"{target_dir}/all_points.npy")

100%|██████████| 50/50 [00:00<00:00, 7144.84it/s]


#### 固定frontal view，加入layernorm

In [6]:
# Subject id (as a string) -> index of that subject's frontal-view image.
# Only subject "2" is processed by the cell that follows.
frontal_view_img_idx_dict = dict([("2", 14)])

In [7]:
# LayerNorm over the 128 feature channels with freshly initialised affine parameters.
layernorm_fn = torch.nn.LayerNorm(128).to(device)


def _read_subject_views(img_path, depth_path, mask_path, params_path, load_indices):
    """Read the requested views of one subject from disk.

    For every index ``k`` in ``load_indices`` this reads ``{k}.png`` from ``img_path``,
    ``{k}.npy`` from ``depth_path`` and ``mask_path``, and the ``extr`` / ``intr``
    entries of ``{k}.npz`` from ``params_path``.

    Returns:
        (images, depth_map, extrinsic, intrinsic, masks) as stacked numpy arrays.
        Images are converted from BGR to RGB and divided by 255.

    Raises:
        FileNotFoundError: if an image cannot be read.

    NOTE(review): the data is presumably already resized/cropped to 518x518 with the
    intrinsics adjusted to match (the accumulators below assume 518x518) - confirm.
    """
    images, depth_map, extrinsic, intrinsic, masks = [], [], [], [], []
    for load_index in load_indices:
        img_file = os.path.join(img_path, f"{load_index}.png")
        img = cv2.imread(img_file)
        if img is None:
            # cv2.imread reports failure by returning None instead of raising.
            raise FileNotFoundError(f"could not read image: {img_file}")
        images.append(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        depth_map.append(np.load(os.path.join(depth_path, f"{load_index}.npy")))
        masks.append(np.load(os.path.join(mask_path, f"{load_index}.npy")))
        params = np.load(os.path.join(params_path, f"{load_index}.npz"))
        extrinsic.append(params["extr"])
        intrinsic.append(params["intr"])
    return (np.array(images) / 255., np.array(depth_map), np.array(extrinsic),
            np.array(intrinsic), np.array(masks))


# For every subject: layer-normalise the VGGT track-head feature maps, average them
# per view over many 16-view subsets whose first view is always the subject's
# frontal view, then save the masked features.
for idx in tqdm(frontal_view_img_idx_dict.keys()):
    torch.cuda.empty_cache()

    frontal_view_img_idx = frontal_view_img_idx_dict[idx]
    base_path = "/root/autodl-tmp/facescape"
    img_path = os.path.join(base_path, "mv_image", idx)
    depth_path = os.path.join(base_path, "depth", idx)
    mask_path = os.path.join(base_path, "mask", idx)
    params_path = os.path.join(base_path, "params", idx)

    # Views are addressed as 0.png .. (n_imgs-1).png, so the directory entry count
    # is used as the number of views.
    n_imgs = len(os.listdir(img_path))
    if not 0 <= frontal_view_img_idx < n_imgs:
        # An out-of-range frontal index could never be drawn, so sampling would spin forever.
        raise ValueError(
            f"subject {idx}: frontal view index {frontal_view_img_idx} is outside 0..{n_imgs - 1}")
    all_images, all_depth_maps, all_extrinsics, all_intrinsics, all_masks = _read_subject_views(
        img_path, depth_path, mask_path, params_path, range(n_imgs))

    def load_data(load_indices):
        """Select the given views from the arrays preloaded for the current subject."""
        load_indices = list(load_indices)
        return (all_images[load_indices], all_depth_maps[load_indices],
                all_extrinsics[load_indices], all_intrinsics[load_indices],
                all_masks[load_indices])

    # Draw 1000 distinct subsets of the form (frontal view, 15 other views in ascending
    # order). Sampling the 15 companions directly yields the same uniform distribution
    # as shuffling all views and rejecting draws without the frontal view.
    other_views = [view for view in range(n_imgs) if view != frontal_view_img_idx]
    if math.comb(len(other_views), 15) < 1000:
        raise ValueError(
            f"subject {idx}: {n_imgs} views do not allow 1000 distinct 16-view subsets")
    load_indices_samples = set()
    while len(load_indices_samples) < 1000:
        companions = sorted(random.sample(other_views, 15))
        load_indices_samples.add((frontal_view_img_idx, *companions))
    load_indices_samples = list(load_indices_samples)
    train_indices = load_indices_samples[:800]
    val_indices = load_indices_samples[800:900]
    test_indices = load_indices_samples[900:]

    # Per-view sum of feature maps and per-view sample count, allocated directly on
    # the target device (no intermediate host allocation).
    feature_maps_sum = torch.zeros((n_imgs, 518, 518, 128), dtype=torch.float32, device=device)
    n_feature_maps = torch.zeros((n_imgs), dtype=torch.float32, device=device)

    # Only the first 100 of the 800 training subsets are used for averaging.
    for i in range(100):
        load_indices = list(train_indices[i])
        images, depth_map, extrinsic, intrinsic, masks = load_data(load_indices)
        images = torch.from_numpy(images).to(device).float()
        images = images.permute(0, 3, 1, 2) # 16, 518, 518, 3 -> 16, 3, 518, 518

        with torch.no_grad():
            # torch.autocast replaces the deprecated torch.cuda.amp.autocast.
            with torch.autocast(device_type="cuda", dtype=dtype):
                images = images[None]  # add batch dimension
                aggregated_tokens_list, ps_idx = model.aggregator(images)
            feature_maps = model.track_head.feature_extractor(aggregated_tokens_list, images, ps_idx)[0]
            del images
            del aggregated_tokens_list
            feature_maps = torch.nn.functional.interpolate(feature_maps, size=(518, 518), mode='bilinear', align_corners=False)
            feature_maps = feature_maps.permute(0, 2, 3, 1) # 16, 518, 518, 128

            # LayerNorm normalises the trailing channel dimension only, so no extra
            # batch dimension is needed around the call.
            feature_maps_ = layernorm_fn(feature_maps)

            # The indices within one subset are distinct, so indexed in-place
            # accumulation touches every selected view exactly once.
            feature_maps_sum[load_indices] += feature_maps_
            n_feature_maps[load_indices] += 1

    # Views that were never drawn would give 0 / 0 = NaN; keep them at zero instead.
    unsampled = n_feature_maps == 0
    if bool(unsampled.any()):
        tqdm.write(f"subject {idx}: {int(unsampled.sum())} view(s) were never sampled; "
                   "their averaged features are left at zero")
    feature_maps_avg = feature_maps_sum / n_feature_maps.clamp(min=1)[:, None, None, None]
    feature_maps_avg = feature_maps_avg.detach().cpu().numpy()
    # Keep only the pixels selected by the masks.
    all_features = feature_maps_avg[all_masks > 0] # K,128

    out_dir = os.path.join(base_path, "all_features_ln", idx)
    os.makedirs(out_dir, exist_ok=True)
    np.save(os.path.join(out_dir, "all_features.npy"), all_features)

  0%|          | 0/1 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(dtype=dtype):
100%|██████████| 1/1 [03:02<00:00, 182.63s/it]
