# Environment setup

In [15]:
# ! python -m pip install detectron2 -f \
#   https://dl.fbaipublicfiles.com/detectron2/wheels/cu113/torch1.10/index.html

In [1]:
%cd /workspace/GitHub/AVSL

/workspace/GitHub/AVSL


In [1]:
import detectron2.utils.comm as comm
from detectron2.config import get_cfg
from detectron2.projects.deeplab import add_deeplab_config
from detectron2.utils.logger import setup_logger

from MODULES.MaskFormer.config import add_mask_former_config

- init.py: from . import modeling
- modeling/init.py: from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder
- modeling/pixel_decoder/msdeformattn.py: from .ops.modules import MSDeformAttn
- modeling/pixel_decoder/ops/modules/ms_deform_attn.py: from ..functions import MSDeformAttnFunction
- modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py: import MultiScaleDeformableAttention as MSDA 주석 처리

In [2]:
# Create configs and perform basic setups

cfg = get_cfg()
add_deeplab_config(cfg)
add_mask_former_config(cfg)
cfg.set_new_allowed(True)
cfg.merge_from_file("MODULES/MaskFormer/configs/custom/MaskAVSL_swin_base.yaml")
# cfg.merge_from_list(args.opts)
cfg.MODEL.DEVICE = "cuda:1"
cfg.SOLVER.IMS_PER_BATCH = 4
cfg.eval_only = True
cfg.freeze()
# default_setup(cfg, args)
# Setup logger for "mask_former" module
setup_logger(output=cfg.OUTPUT_DIR, distributed_rank=comm.get_rank(), name="mask_former")

Loading config MODULES/MaskFormer/configs/custom/MaskAVSL_swin_base.yaml with yaml.unsafe_load. Your machine may be at risk if the file contains malicious content.


<Logger mask_former (DEBUG)>

# Custom Dataset

In [3]:
from DATALOADER import VideoDataLoader

folder_path = 'DATA/videos'
dataloader = VideoDataLoader(cfg, folder_path)

34808 of videos have loaded


In [4]:
print(len(dataloader))

# detectron2/engine/train_loop.py
data = next(iter(dataloader))

8702


                                                                  

In [5]:
# maskformer_model.py
from detectron2.structures import ImageList
from tqdm import tqdm
import torch

pixel_mean = cfg.MODEL.PIXEL_MEAN
pixel_std = cfg.MODEL.PIXEL_STD
size_divisibility = cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY
device = cfg.MODEL.DEVICE

pixel_mean = torch.Tensor(pixel_mean).view(-1, 1, 1)
pixel_std = torch.Tensor(pixel_std).view(-1, 1, 1)

images = [x for x in data['image']]
images = [(x - pixel_mean) / pixel_std for x in tqdm(images)]
images = ImageList.from_tensors(images, size_divisibility)

100%|██████████| 4/4 [00:00<00:00, 53.92it/s]


### code

In [3]:
import sys, os
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from moviepy.editor import VideoFileClip

from detectron2.config import configurable
from detectron2.data import transforms as T
from detectron2.projects.point_rend import ColorAugSSDTransform
from detectron2.data import detection_utils as utils

In [4]:
class VideoDataset(Dataset):
    @configurable
    def __init__(self, is_train=True, *, augmentations, image_format, ignore_label, size_divisibility, folder_path, ):
        self.folder_path = folder_path
        self.video_list = [f for f in os.listdir(folder_path) if f.endswith('.mp4')]
        self.tfm_gens = augmentations

        self.is_train = is_train
        self.img_format = image_format
        self.ignore_label = ignore_label
        self.size_divisibility = size_divisibility

    def __len__(self):
        return len(self.video_list)

    def __getitem__(self, idx):
        video_path = os.path.join(self.folder_path, self.video_list[idx])

        # Load video and extract central frame
        video_clip = VideoFileClip(video_path)
        central_frame = video_clip.get_frame(video_clip.duration / 2)
        central_frame = np.array(central_frame)
        # Convert image frame to numpy array and normalize
        #central_frame = central_frame / 255.0

        # Additional augmentation
        aug_input = T.AugInput(central_frame)
        aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input)
        central_frame = aug_input.image
        #sem_seg_gt = aug_input.sem_seg

        central_frame = torch.as_tensor(np.ascontiguousarray(central_frame.transpose(2, 0, 1)))
        #central_frame = np.transpose(central_frame, (2, 0, 1))  # Change HWC to CHW

        if self.size_divisibility > 0:
            central_frame_size = (central_frame.shape[-2], central_frame.shape[-1])
            padding_size = [
                0,
                self.size_divisibility - central_frame_size[1],
                0,
                self.size_divisibility - central_frame_size[0],
            ]
            central_frame = F.pad(central_frame, padding_size, value=128).contiguous()

        sample = {}
        # sample['image'] = central_frame

        # Save audio as WAV file
        audio_path = f"{video_path[:-4]}.wav"
        self.blockPrint()
        # audio = 
        video_clip.audio.write_audiofile(audio_path, codec='pcm_s16le', fps=44100)
        self.enablePrint()

        # Return data as dictionary
        sample = {'image': central_frame, 'audio_path': audio_path}
        # sample = {'image': central_frame, 'audio': audio}

        return sample

    @classmethod
    def from_config(cls, cfg, is_train=True):
        # Build augmentation
        augs = [
            T.ResizeShortestEdge(
                cfg.INPUT.MIN_SIZE_TRAIN,
                cfg.INPUT.MAX_SIZE_TRAIN,
                cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING,
            )
        ]
        if cfg.INPUT.CROP.ENABLED:
            augs.append(
                T.RandomCrop_CategoryAreaConstraint(
                    cfg.INPUT.CROP.TYPE,
                    cfg.INPUT.CROP.SIZE,
                    cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA,
                    cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
                )
            )
        if cfg.INPUT.COLOR_AUG_SSD:
            augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT))
        augs.append(T.RandomFlip())
        augs.extend([
        T.ResizeScale(
            min_scale=cfg.INPUT.MIN_SCALE, max_scale=cfg.INPUT.MAX_SCALE,
            target_height=cfg.INPUT.IMAGE_SIZE, target_width=cfg.INPUT.IMAGE_SIZE
        ),
        T.FixedSizeCrop(crop_size=(cfg.INPUT.IMAGE_SIZE, cfg.INPUT.IMAGE_SIZE)),
        ])

        ignore_label = False

        ret = {
            "is_train": is_train,
            "augmentations": augs,
            "image_format": cfg.INPUT.FORMAT,
            "ignore_label": ignore_label,
            "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY,
        }
        return ret
    
    def blockPrint(self):
        global backupstdout
        backupstdout=sys.stdout
        sys.stdout = open(os.devnull, 'w')

    def enablePrint(self):
        global backupstdout
        sys.stdout = backupstdout

In [5]:
folder_path = 'DATA/videos'
video_dataset = VideoDataset(cfg, folder_path=folder_path)
dataloader = DataLoader(video_dataset, batch_size=cfg.SOLVER.IMS_PER_BATCH, shuffle=True)

### show

In [None]:
# images = []
# for i, x in tqdm(enumerate(dataloader)): 
#     img = x["image"]
#     img = (img - pixel_mean) / pixel_std
#     images.append(img)
#     if i == 10:
#         break # 11개에 2분/이었는데 6분으로 늘어남 
# images = ImageList.from_tensors(images, size_divisibility)
# images.to(device)

In [None]:
# detectron2/modeling/meta_arch/build.py
# def build_model(cfg):
#     """
#     Build the whole model architecture, defined by ``cfg.MODEL.META_ARCHITECTURE``.
#     Note that it does not load any weights from ``cfg``.
#     """
#     meta_arch = cfg.MODEL.META_ARCHITECTURE
#     model = META_ARCH_REGISTRY.get(meta_arch)(cfg) # 'MaskFormer'
#     model.to(torch.device(cfg.MODEL.DEVICE))
#     _log_api_usage("modeling.meta_arch." + meta_arch)
#     return model
# -> model = maskformer_model.py

# detectron2/engine/defaults.py
# data_loader = self.build_train_loader(cfg) -> torchdata.DataLoader(dataset, batch_size=batch_size,)
# _trainer = (AMPTrainer if cfg.SOLVER.AMP.ENABLED else SimpleTrainer)(model, data_loader, optimizer)

# detectron2/engine/train_loop.py
# def _data_loader_iter(self):
#     # only create the data loader iterator when it is used
#     if self._data_loader_iter_obj is None:
#         self._data_loader_iter_obj = iter(self.data_loader)
#     return self._data_loader_iter_obj
# data = next(self._data_loader_iter) -> next(iter(self.data_loader)) 
# model(data)

In [None]:
# example = video_dataset.__getitem__(1)
# print(example.keys())

In [None]:
# example['image'].shape

In [None]:
# example['audio_path']

In [None]:
# from matplotlib import pyplot as plt
# plt.imshow(example['image'].permute(1, 2, 0), interpolation='nearest')
# plt.show()

In [None]:
# from scipy.io import wavfile

# fs, audio_data = wavfile.read(example['audio_path'])
# print(fs, audio_data.shape)

# # plt.figure(figsize = (12, 3))
# plt.plot(audio_data, lw = 1)
# plt.xlim(0, len(audio_data))

# Model Architecture

## backbone

In [None]:
%cd /workspace/GitHub/AVSL

/workspace/GitHub/AVSL


In [6]:
# model.py
from detectron2.layers import ShapeSpec
from MODULES.MaskFormer.modeling.backbone.swin import D2SwinTransformer

def build_swin_backbone(cfg):
    input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN))
    model = D2SwinTransformer(cfg, input_shape)
    model.init_weights(cfg.MODEL.WEIGHTS)
    return model

backbone = build_swin_backbone(cfg)
# with torch.no_grad():
#     image_feature = backbone(images.tensor)

# backbone.to(torch.device(device))
# with torch.no_grad():
#     image_feature = backbone(images.tensor.to(device))

In [7]:
# from detectron2.modeling import build_backbone
# from detectron2.modeling import BACKBONE_REGISTRY

# input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN))
# backbone = build_backbone(cfg)
# backbone = BACKBONE_REGISTRY.get("D2SwinTransformer")(cfg, input_shape)

In [None]:
# from detectron2.layers import ShapeSpec
# from Mask2former.mask2former.modeling.backbone.swin import D2SwinTransformer

# print(cfg.MODEL.WEIGHTS)
# input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN))
# backbone = D2SwinTransformer(cfg, input_shape)
# backbone.init_weights(cfg.MODEL.WEIGHTS)
# # backbone.to(device)

# features = backbone(images.tensor)

### load weights

In [15]:
%cd MODULES/ckpt

/workspace/GitHub/AVSL/MODULES/ckpt


In [7]:
!wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth

--2024-02-19 08:44:27--  https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth
Resolving github.com (github.com)... 20.200.245.247
Connecting to github.com (github.com)|20.200.245.247|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/357198522/2a4d1980-9bd4-11eb-9482-36c4b4f6edc3?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAVCODYLSA53PQK4ZA%2F20240219%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240219T084427Z&X-Amz-Expires=300&X-Amz-Signature=c7c1f351946f4e404b971f046396bb8c5e918d91abc39382fcf8ac4761ab2a53&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=357198522&response-content-disposition=attachment%3B%20filename%3Dswin_base_patch4_window12_384_22k.pth&response-content-type=application%2Foctet-stream [following]
--2024-02-19 08:44:27--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/

In [16]:
!python tool/convert-pretrained-swin-model-to-d2.py swin_base_patch4_window12_384_22k.pth swin_base_patch4_window12_384_22k.pkl

In [17]:
%cd ../../

/workspace/GitHub/AVSL


In [7]:
import os
weights_path = os.path.join("MODULES/ckpt", cfg.MODEL.WEIGHTS)
print(weights_path)

MODULES/ckpt/swin_base_patch4_window12_384_22k.pkl


In [8]:
weights = torch.load(weights_path)

In [24]:
weights.keys()

dict_keys(['model', '__author__', 'matching_heuristics'])

In [25]:
weights['matching_heuristics']

True

In [29]:
backbone.load_state_dict(weights['model'])

RuntimeError: Error(s) in loading state_dict for D2SwinTransformer:
	Missing key(s) in state_dict: "norm0.weight", "norm0.bias", "norm1.weight", "norm1.bias", "norm2.weight", "norm2.bias", "norm3.weight", "norm3.bias". 
	Unexpected key(s) in state_dict: "norm.weight", "norm.bias", "head.weight", "head.bias", "layers.0.blocks.1.attn_mask", "layers.1.blocks.1.attn_mask", "layers.2.blocks.1.attn_mask", "layers.2.blocks.3.attn_mask", "layers.2.blocks.5.attn_mask", "layers.2.blocks.7.attn_mask", "layers.2.blocks.9.attn_mask", "layers.2.blocks.11.attn_mask", "layers.2.blocks.13.attn_mask", "layers.2.blocks.15.attn_mask", "layers.2.blocks.17.attn_mask". 

In [9]:
from urllib.parse import parse_qs, urlparse
from detectron2.utils.file_io import PathManager

path_manager = PathManager

path = os.path.join("MODULES/ckpt", cfg.MODEL.WEIGHTS)
parsed_url = urlparse(path)
_parsed_url_during_load = parsed_url
path = parsed_url._replace(query="").geturl()
path = path_manager.get_local_path(path)

In [10]:
from fvcore.common.checkpoint import Checkpointer

checkpointer = Checkpointer
checkpointer.load(backbone, path)

AttributeError: 'D2SwinTransformer' object has no attribute 'logger'

In [None]:
loaded = weights

In [None]:
parsed_url = _parsed_url_during_load
queries = parse_qs(parsed_url.query)

# if queries.pop("matching_heuristics", "False") == ["True"]:
#     loaded["matching_heuristics"] = True
# if len(queries) > 0:
#     raise ValueError(
#         f"Unsupported query remaining: f{queries}, orginal filename: {parsed_url.geturl()}"
#     )

In [None]:
queries

{}

In [14]:
import pickle

with PathManager.open(path, "rb") as f:
    data = torch.load(f, encoding="latin1")
if "model" in data and "__author__" in data:
    # file is in Detectron2 model zoo format
    self.logger.info("Reading a file from '{}'".format(data["__author__"]))

True


In [11]:
from detectron2.checkpoint import DetectionCheckpointer

checkpointer = DetectionCheckpointer(backbone, cfg.OUTPUT_DIR)
checkpointer.resume_or_load(path, resume=True)

UnpicklingError: A load persistent id instruction was encountered,
but no persistent_load function was specified.

In [19]:
from fvcore.common.checkpoint import Checkpointer

checkpoint = torch.load(weights_path)
checkpointer = Checkpointer(backbone)
checkpointer._convert_ndarray_to_tensor(checkpoint["model"])

In [22]:
from detectron2.checkpoint.c2_model_loading import align_and_update_state_dicts

checkpoint["model"] = align_and_update_state_dicts(
                backbone.state_dict(),
                checkpoint["model"],
                c2_conversion=False #checkpoint.get("__author__", None) == "Caffe2",
            )

False

In [None]:
import torch
backbone.load_state_dict(torch.load("swin_tiny_patch4_window7_224.pkl"))

In [None]:
image_feature.keys()

dict_keys(['res2', 'res3', 'res4', 'res5'])

In [None]:
print(image_feature['res2'].shape)
print(image_feature['res3'].shape)
print(image_feature['res4'].shape)
print(image_feature['res5'].shape)

torch.Size([16, 192, 256, 256])
torch.Size([16, 384, 128, 128])
torch.Size([16, 768, 64, 64])
torch.Size([16, 1536, 32, 32])


## pixel_decoder

In [None]:
backbone.output_shape()

{'res2': ShapeSpec(channels=192, height=None, width=None, stride=4),
 'res3': ShapeSpec(channels=384, height=None, width=None, stride=8),
 'res4': ShapeSpec(channels=768, height=None, width=None, stride=16),
 'res5': ShapeSpec(channels=1536, height=None, width=None, stride=32)}

In [7]:
# model.py
from MODULES.MaskFormer.modeling.pixel_decoder.pixel_decoder import TransformerEncoderPixelDecoder

def build_pixel_decoder(cfg, input_shape):
    model = TransformerEncoderPixelDecoder(cfg, input_shape)
    return model

pixel_decoder = build_pixel_decoder(cfg, backbone.output_shape())
pp_embeds, image_features = pixel_decoder(image_feature)

Calling forward() may cause unpredicted behavior of PixelDecoder module.


In [None]:
pp_embeds.shape

torch.Size([16, 256, 256, 256])

In [None]:
image_features.shape

torch.Size([16, 256, 32, 32])

In [None]:
# from detectron2.modeling import build_sem_seg_head

# sem_seg_head = build_sem_seg_head(cfg, backbone.output_shape())
# -> SEM_SEG_HEADS_REGISTRY.get(name)(cfg, input_shape)

In [None]:
# from mask2former.maskformer_model.modeling.pixel_decoder.fpn import build_pixel_decoder

# build_pixel_decoder(cfg, input_shape)
# -> model = SEM_SEG_HEADS_REGISTRY.get(name)(cfg, input_shape)

In [None]:
# from mask2former.modeling.pixel_decoder.fpn import TransformerEncoderPixelDecoder

# pixel_decoder = TransformerEncoderPixelDecoder(cfg, input_shape=backbone.output_shape()) # maskformer_model.py

# # mask_foremer_head.py
# pp_embeds, transformer_encoder_features, _ = pixel_decoder.forward_features(features)

## mask_predictor

In [None]:
cfg.MODEL.WEIGHTS

'swin_large_patch4_window12_384_22k.pth'

In [8]:
audiomodule_weight = {
            'vggish': 'MODULES/ckpt/vggish-10086976.pth',
            'pca': 'MODULES/ckpt/vggish_pca_params-970ea276.pth'
            }

In [9]:
import librosa

audio_list = []
for path in data['audio_path']:
    mixed_audio, sr = librosa.load(path, mono=True, sr=16000)
    audio_list.append(torch.from_numpy(mixed_audio).unsqueeze(1))

mixed_audio = ImageList.from_tensors(audio_list).tensor
mixed_audio.shape

torch.Size([4, 160320, 1])

In [13]:
from MODULES.MaskFormer.modeling.transformer.mask_predictor import MaskPredictor

mask_predictor = MaskPredictor(cfg, cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM, audiomodule_weight) # mask_former_head.py 참고

In [17]:
device = 'cuda:2'

In [18]:
mask_predictor.to(torch.device(device))
image_features = pp_embeds.to(device)
pp_embeds = pp_embeds.to(device)
mixed_audio = mixed_audio.to(device)

In [19]:
output = mask_predictor(image_features, pp_embeds, mixed_audio)

RuntimeError: CUDA out of memory. Tried to allocate 160.00 MiB (GPU 2; 10.75 GiB total capacity; 9.31 GiB already allocated; 143.56 MiB free; 9.34 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [13]:
output.keys()

dict_keys(['pp_embeds', 'audio_sep_tokens', 'mixed_audio_spec', 'sep_audio_wavs', 'sep_audio_specs', 'sep_audio_features', 'sep_audio_features_embeds', 'pred_masks'])

In [15]:
for k in output.keys():
    print(k, output[k].shape)

pp_embeds torch.Size([4, 256, 256, 256])
audio_sep_tokens torch.Size([4, 4, 128])
mixed_audio_spec torch.Size([20, 1, 502, 1025])
sep_audio_wavs (20, 160320, 1)
sep_audio_specs (20, 1, 502, 1025)
sep_audio_features torch.Size([4, 4, 128])
sep_audio_features_embeds torch.Size([4, 4, 256])
pred_masks torch.Size([4, 4, 256, 256])


## TransformerDecoder 수정

In [None]:
# # mask_former_head.py
# if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder":
#     transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
# elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "pixel_embedding":
#     transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM
# elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "multi_scale_pixel_decoder":  # for maskformer2
#     transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
# else:
#     transformer_predictor_in_channels = input_shape[cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE].channels

# # "transformer_predictor": build_transformer_decoder(cfg, transformer_predictor_in_channels, mask_classification=True,)
# # -> TRANSFORMER_DECODER_REGISTRY.get(name)(cfg, in_channels, mask_classification)

In [None]:
# from mask2former.modeling.transformer_decoder.maskformer_transformer_decoder import StandardTransformerDecoder

# transformer_decoder = StandardTransformerDecoder(cfg, in_channels=transformer_predictor_in_channels, mask_classification=False)

In [None]:
# mask_former_head.py
# predictions = transformer_decoder(transformer_encoder_features, mask_features, mask=None) # mask에 뭐가 들어가야 하는지 모르겠다

In [None]:
# predictions.keys()

In [None]:
# predictions['pred_masks'].shape # scoremap
# # predictions['pred_masks']

In [None]:
# print(predictions['pred_masks'].min())
# print(predictions['pred_masks'].max())
# # 음수 값이 나오는 것을 방지하기 위해 sigmoid 함수 고려

In [None]:
# maskformer_transformer_decoder_custom.py
from MaskFormer.modeling.transformer.transformer import Transformer
from MaskFormer.modeling.transformer.position_encoding import PositionEmbeddingSine
from torch import nn

N_steps = cfg.MODEL.MASK_FORMER.HIDDEN_DIM // 2
pe_layer = PositionEmbeddingSine(N_steps, normalize=True)

batch_size=cfg.SOLVER.IMS_PER_BATCH

num_queries = cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES
transformer = Transformer(
            d_model=cfg.MODEL.MASK_FORMER.HIDDEN_DIM,
            dropout=cfg.MODEL.MASK_FORMER.DROPOUT,
            nhead=cfg.MODEL.MASK_FORMER.NHEADS,
            dim_feedforward=cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD,
            num_encoder_layers=cfg.MODEL.MASK_FORMER.ENC_LAYERS,
            num_decoder_layers=cfg.MODEL.MASK_FORMER.DEC_LAYERS,
            normalize_before=cfg.MODEL.MASK_FORMER.PRE_NORM,
            return_intermediate_dec=cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION,
        )
hidden_dim = transformer.d_model


if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder":
    transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "pixel_embedding":
    transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM

In [None]:
import fvcore.nn.weight_init as weight_init
from detectron2.layers import Conv2d
from MaskFormer.modeling.transformer.transformer_predictor import MLP

learnable_input_queries = nn.Embedding(num_queries, hidden_dim)

in_channels = transformer_predictor_in_channels # mask_former_head.py
enforce_input_project = cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ
deep_supervision = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION
mask_dim = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM

if in_channels != hidden_dim or enforce_input_project:
    input_proj = Conv2d(in_channels, hidden_dim, kernel_size=1)
    weight_init.c2_xavier_fill(input_proj)
else:
    input_proj = nn.Sequential()
aux_loss = deep_supervision

mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3).to(device)

In [None]:
learnable_input_queries.weight.shape

torch.Size([4, 128])

In [None]:
x = image_features

In [None]:
pos = pe_layer(x)

src = x
mask = None
hs, memory = transformer(input_proj(src), mask, learnable_input_queries.weight, pos)

In [None]:
# output_size = hs.shape

audio_separation_tokens = hs[-1]

### LAAS

In [None]:
%cd /workspace/GitHub/AVSL/models
# pytorch-lightning: pip install pytorch-lightning==1.4.4, pip install torchmetrics==0.6.0

/workspace/GitHub/AVSL/models


In [None]:
import librosa

audio_list = []
for path in data['audio_path']:
    y, sr = librosa.load(path, mono=True, sr=16000)
    audio_list.append(torch.from_numpy(y).unsqueeze(1))

y = ImageList.from_tensors(audio_list).tensor
y.shape

torch.Size([16, 161760, 1])

In [None]:
audio_separation_tokens.shape

torch.Size([16, 4, 128])

In [None]:
noise_token = nn.Embedding(1, hidden_dim)
print(noise_token.weight.shape)
noise_token = noise_token.weight.unsqueeze(0).repeat(batch_size, 1, 1)
print(noise_token.shape)

torch.Size([1, 128])
torch.Size([16, 1, 128])


In [None]:
input_tokens = torch.cat([audio_separation_tokens, noise_token], dim=1)
input_tokens.shape

torch.Size([16, 5, 128])

In [None]:
# x = {'audio':torch.from_numpy(y.reshape((2,y.shape[1],1))),
#      'conditions':torch.from_numpy(np.zeros((2,9,128),dtype=np.float32))} # (batch_size, 오디오길이, 1), mono로 바꾸어야 함

input = {"audio": y, "conditions": input_tokens}

In [None]:
from AudioModule.LAAS import LAAS
from AudioModule import config

laas = LAAS(config=config)
audio_out = laas(input)
# audio_out = laas(input).to(device) # [batch_size*N, audio_len, hidden_dim]
# batch_size = 16, N = 4 일때 1분 20초

In [None]:
print(audio_out["separated_audio_features"].shape)
print(audio_out["mixed_audio"].shape)

torch.Size([64, 5, 128])
torch.Size([80, 1, 503, 1025])


In [None]:
separated_audio_features = audio_out["separated_audio_features"]
separated_audio_features = separated_audio_features.unsqueeze(1)
size = separated_audio_features.shape
separated_audio_features = separated_audio_features.reshape(16, 4, size[2], size[3]) # torch.Size([16, 4, 5, 128])

In [None]:
# import torch

# y = torch.rand(4, 161120, 1)
# audio_separation_tokens = torch.rand(4, 4, 128)
# token_size = audio_separation_tokens.shape
# noise_token = torch.rand(token_size[0], 1, token_size[2])
# input_tokens = torch.cat([audio_separation_tokens, noise_token], dim=1)

# input = {"audio": y, "conditions": input_tokens}

In [None]:
# device = "cuda:6"
# input = {"audio": y.to(device), "conditions": input_tokens.to(device)}

# ASP.to(device)
# x = ASP(input=input["audio"], condition=input["conditions"])

# from AudioModule.LAAS import LAAS
# from AudioModule import config

# laas = LAAS(config=config)
# laas.to(device)
# audio_features = laas(input).to(device)

### TransformerPredictor

In [None]:
separated_audio_features.shape

torch.Size([16, 4, 5, 128])

In [None]:
def av_feature_embed(separated_audio_features, pooling_mode='sum'):
    # av_feature_list = ImageList.from_tensors(separated_audio_features).tensor
    # av_feature_list = av_feature_list.permute(1, 0, 2, 3)
    if pooling_mode == 'sum':
        separated_audio_features = separated_audio_features.sum(dim=2)
    elif pooling_mode == 'mean':
        separated_audio_features = separated_audio_features.mean(dim=2)
    elif pooling_mode == 'max':
        separated_audio_features = separated_audio_features.max(dim=2).values
    return separated_audio_features

separated_audio_features = av_feature_embed(separated_audio_features, pooling_mode='sum')
separated_audio_features.shape

torch.Size([16, 4, 128])

In [None]:
separated_audio_features_embed = mask_embed(separated_audio_features)
separated_audio_features_embed.shape

torch.Size([16, 4, 256])

In [None]:
pp_embeds = pp_embeds.to(device)

In [None]:
pred_masks = torch.einsum("bqc,bchw->bqhw", separated_audio_features_embed, pp_embeds)

In [None]:
pred_masks.shape

torch.Size([16, 4, 256, 256])

In [None]:
# def av_feature_embed(separated_audio_wav_features_list, pooling_mode='sum'):
#     av_feature_list = ImageList.from_tensors(separated_audio_wav_features_list).tensor
#     av_feature_list = av_feature_list.permute(1, 0, 2, 3)
#     if pooling_mode == 'sum':
#         av_feature_list = av_feature_list.sum(dim=2)
#     elif pooling_mode == 'mean':
#         av_feature_list = av_feature_list.mean(dim=2)
#     elif pooling_mode == 'max':
#         av_feature_list = av_feature_list.max(dim=2).values
#     return av_feature_list

# av_feature_list = av_feature_embed(separated_audio_wav_features_list, pooling_mode='sum')
# av_feature_list.shape

In [None]:
# av_feature_embed = mask_embed(av_feature_list)

In [None]:
# av_feature_embed.shape

In [None]:
# scoremap = torch.einsum("bqc,bchw->bqhw", av_feature_embed, mask_features)
# scoremap.shape
# scoremap = torch.einsum("lbqc,bchw->lbqhw", av_feature_embed, mask_features)

In [None]:
# from mask2former.modeling.transformer_decoder.maskformer_transformer_decoder_custom import StandardTransformerDecoder

# def build_transformer_decoder(cfg):
#     if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder":
#         transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
#     elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "pixel_embedding":
#         transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM
#     elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "multi_scale_pixel_decoder":  # for maskformer2
#         transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
#     model = StandardTransformerDecoder(cfg, in_channels=transformer_predictor_in_channels, mask_classification=False)
#     return model

# transformer_decoder = build_transformer_decoder(cfg)
# outputs = transformer_decoder(transformer_encoder_features, mask_features, mixed_audio=None, mask=None)

In [None]:
# outputs.keys()

In [None]:
mask_pred_results = F.interpolate(
                mask_pred_results,
                size=(images.tensor.shape[-2], images.tensor.shape[-1]),
                mode="bilinear",
                align_corners=False,
            )

NameError: name 'mask_pred_results' is not defined

In [None]:
image_size = images.image_sizes

In [None]:
def semantic_inference(mask_cls, mask_pred):
    mask_cls = F.softmax(mask_cls, dim=-1)[..., :-1]
    mask_pred = mask_pred.sigmoid()
    semseg = torch.einsum("qc,qhw->chw", mask_cls, mask_pred)
    return semseg

def panoptic_inference(mask_cls, mask_pred):
    scores, labels = F.softmax(mask_cls, dim=-1).max(-1)
    mask_pred = mask_pred.sigmoid()

    keep = labels.ne(sem_seg_head.num_classes) & (scores > object_mask_threshold)
    cur_scores = scores[keep]
    cur_classes = labels[keep]
    cur_masks = mask_pred[keep]
    cur_mask_cls = mask_cls[keep]
    cur_mask_cls = cur_mask_cls[:, :-1]

    cur_prob_masks = cur_scores.view(-1, 1, 1) * cur_masks

    h, w = cur_masks.shape[-2:]
    panoptic_seg = torch.zeros((h, w), dtype=torch.int32, device=cur_masks.device)
    segments_info = []

    current_segment_id = 0

    if cur_masks.shape[0] == 0:
        # We didn't detect any mask :(
        return panoptic_seg, segments_info
    else:
        # take argmax
        cur_mask_ids = cur_prob_masks.argmax(0)
        stuff_memory_list = {}
        for k in range(cur_classes.shape[0]):
            pred_class = cur_classes[k].item()
            isthing = pred_class in metadata.thing_dataset_id_to_contiguous_id.values()
            mask_area = (cur_mask_ids == k).sum().item()
            original_area = (cur_masks[k] >= 0.5).sum().item()
            mask = (cur_mask_ids == k) & (cur_masks[k] >= 0.5)

            if mask_area > 0 and original_area > 0 and mask.sum().item() > 0:
                if mask_area / original_area < overlap_threshold:
                    continue

                # merge stuff regions
                if not isthing:
                    if int(pred_class) in stuff_memory_list.keys():
                        panoptic_seg[mask] = stuff_memory_list[int(pred_class)]
                        continue
                    else:
                        stuff_memory_list[int(pred_class)] = current_segment_id + 1

                current_segment_id += 1
                panoptic_seg[mask] = current_segment_id

                segments_info.append(
                    {
                        "id": current_segment_id,
                        "isthing": bool(isthing),
                        "category_id": int(pred_class),
                    }
                )

        return panoptic_seg, segments_info

def instance_inference(mask_cls, mask_pred):
    # mask_pred is already processed to have the same shape as original input
    image_size = mask_pred.shape[-2:]

    # [Q, K]
    scores = F.softmax(mask_cls, dim=-1)[:, :-1]
    labels = torch.arange(sem_seg_head.num_classes, device=device).unsqueeze(0).repeat(num_queries, 1).flatten(0, 1)
    # scores_per_image, topk_indices = scores.flatten(0, 1).topk(num_queries, sorted=False)
    scores_per_image, topk_indices = scores.flatten(0, 1).topk(test_topk_per_image, sorted=False)
    labels_per_image = labels[topk_indices]

    topk_indices = topk_indices // sem_seg_head.num_classes
    # mask_pred = mask_pred.unsqueeze(1).repeat(1, sem_seg_head.num_classes, 1).flatten(0, 1)
    mask_pred = mask_pred[topk_indices]

    # if this is panoptic segmentation, we only keep the "thing" classes
    if panoptic_on:
        keep = torch.zeros_like(scores_per_image).bool()
        for i, lab in enumerate(labels_per_image):
            keep[i] = lab in metadata.thing_dataset_id_to_contiguous_id.values()

        scores_per_image = scores_per_image[keep]
        labels_per_image = labels_per_image[keep]
        mask_pred = mask_pred[keep]

    result = Instances(image_size)
    # mask (before sigmoid)
    result.pred_masks = (mask_pred > 0).float()
    result.pred_boxes = Boxes(torch.zeros(mask_pred.size(0), 4))
    # Uncomment the following to get boxes from masks (this is slow)
    # result.pred_boxes = BitMasks(mask_pred > 0).get_bounding_boxes()

    # calculate average mask prob
    mask_scores_per_image = (mask_pred.sigmoid().flatten(1) * result.pred_masks.flatten(1)).sum(1) / (result.pred_masks.flatten(1).sum(1) + 1e-6)
    result.scores = scores_per_image * mask_scores_per_image
    result.pred_classes = labels_per_image
    return result

In [None]:
processed_results = []
batched_inputs = dataset_dict["image"]

for mask_cls_result, mask_pred_result, input_per_image, image_size in zip(
    mask_cls_results, mask_pred_results, batched_inputs, images.image_sizes):
    height = 1024 # input_per_image.get("height", image_size[0])
    width = 1024 # input_per_image.get("width", image_size[1])
    processed_results.append({})

    if sem_seg_postprocess_before_inference:
        mask_pred_result = retry_if_cuda_oom(sem_seg_postprocess)(
            mask_pred_result, image_size, height, width
        )
        mask_cls_result = mask_cls_result.to(mask_pred_result)

    # semantic segmentation inference
    if semantic_on:
        r = retry_if_cuda_oom(semantic_inference)(mask_cls_result, mask_pred_result)
        if not sem_seg_postprocess_before_inference:
            r = retry_if_cuda_oom(sem_seg_postprocess)(r, image_size, height, width)
        processed_results[-1]["sem_seg"] = r

    # panoptic segmentation inference
    if panoptic_on:
        panoptic_r = retry_if_cuda_oom(panoptic_inference)(mask_cls_result, mask_pred_result)
        processed_results[-1]["panoptic_seg"] = panoptic_r

    # instance segmentation inference
    if instance_on:
        instance_r = retry_if_cuda_oom(instance_inference)(mask_cls_result, mask_pred_result)
        processed_results[-1]["instances"] = instance_r


In [None]:
processed_results[0].keys()

dict_keys(['sem_seg', 'panoptic_seg', 'instances'])

In [None]:
processed_results[0]['sem_seg'].shape

torch.Size([133, 1024, 1024])