In [1]:
# Uncomment to use autoreload
%load_ext autoreload
%autoreload 2

import os
import os.path as osp
import sys
import torch
import numpy as np
from time import time
from omegaconf import OmegaConf
start = time()
import warnings
warnings.filterwarnings('ignore')

# torch.cuda.set_device(I_GPU)
DIR = os.path.dirname(os.getcwd())
ROOT = os.path.join(DIR, "..")
sys.path.insert(0, ROOT)
sys.path.insert(0, DIR)

from torch_points3d.utils.config import hydra_read
from torch_geometric.data import Data
from torch_points3d.core.multimodal.data import MMData, MMBatch
from torch_points3d.visualization.multimodal_data import visualize_mm_data
from torch_points3d.core.multimodal.image import SameSettingImageData, ImageData
from torch_points3d.datasets.segmentation.multimodal.scannet import ScannetDatasetMM
from torch_points3d.datasets.segmentation.scannet import CLASS_COLORS, CLASS_NAMES, CLASS_LABELS
from torch_points3d.metrics.segmentation_tracker import SegmentationTracker
from torch_points3d.datasets.segmentation import IGNORE_LABEL
from torch_points3d.metrics.scannet_segmentation_tracker import ScannetSegmentationTracker
from torch_points3d.metrics.colored_tqdm import Coloredtqdm as Ctq


from PIL import Image

import matplotlib.pyplot as plt 

%matplotlib inline

CLASS_COLORS[0] = (174.0, 199.0, 232.0)
CLASS_COLORS[-1] = (0, 0, 0)
import plotly.io as pio

#pio.renderers.default = 'jupyterlab'        # for local notebook
pio.renderers.default = 'iframe_connected'  # for remote notebook. Other working (but seemingly slower) options are: 'sphinx_gallery' and 'iframe'

MMData debug() function changed, please uncomment the 3rd assert line when doing inference without M2F features!


In [2]:
import os
import os.path as osp
import copy
import torch
import hydra
import logging
import scipy.ndimage
import numpy as np
from PIL import Image
import open3d as o3d

# Import building function for model and dataset
from torch_points3d.datasets.dataset_factory import instantiate_dataset
from torch_points3d.models.model_factory import instantiate_model

# Import BaseModel / BaseDataset for type checking
from torch_points3d.models.base_model import BaseModel
from torch_points3d.datasets.base_dataset import BaseDataset

# Import from metrics
from torch_points3d.metrics.base_tracker import BaseTracker
from torch_points3d.metrics.colored_tqdm import Coloredtqdm as Ctq
from torch_points3d.metrics.model_checkpoint import ModelCheckpoint

# Utils import
from torch_points3d.utils.colors import COLORS
from torch_points3d.utils.wandb_utils import Wandb
from torch_points3d.utils.config import getattr_recursive
from torch_points3d.visualization import Visualizer
from torch_points3d.core.data_transform.transforms import PointcloudMerge
from torch_points3d.datasets.segmentation.scannet import CLASS_COLORS, CLASS_NAMES, CLASS_LABELS


log = logging.getLogger(__name__)

def get_seen_points(mm_data):
    ### Select seen points
    csr_idx = mm_data.modalities['image'][0].view_csr_indexing
    dense_idx_list = torch.arange(mm_data.modalities['image'][0].num_points).repeat_interleave(csr_idx[1:] - csr_idx[:-1])
    # take subset of only seen points without re-indexing the same point
    mm_data = mm_data[dense_idx_list.unique()]
    return mm_data

def get_mode_pred(data):
    pixel_validity = data.data.mvfusion_input[:, :, 0].bool()
    mv_preds = data.data.mvfusion_input[:, :, -1].long()
            
    valid_m2f_feats = []
    for i in range(len(mv_preds)):
        valid_m2f_feats.append(mv_preds[i][pixel_validity[i]])

    mode_preds = []
    for m2feats_of_seen_point in valid_m2f_feats:
        mode_preds.append(torch.mode(m2feats_of_seen_point.squeeze(), dim=0)[0])
    mode_preds = torch.stack(mode_preds, dim=0)
        
    return mode_preds

def get_random_view_pred(data):
    pixel_validity = data.data.mvfusion_input[:, :, 0].bool()
    mv_preds = data.data.mvfusion_input[:, :, -1].long()
            
    valid_m2f_feats = []
    for i in range(len(mv_preds)):
        valid_m2f_feats.append(mv_preds[i][pixel_validity[i]])

    selected_view_preds = []
    for m2feats_of_seen_point in valid_m2f_feats:
        selected_idx = torch.randint(low=0, high=m2feats_of_seen_point.shape[0], size=(1,))
        selected_pred = m2feats_of_seen_point[selected_idx].squeeze(0)
        selected_view_preds.append(selected_pred)
    selected_view_preds = torch.stack(selected_view_preds, dim=0)
        
    return selected_view_preds




Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


In [3]:
# Set your dataset root directory, where the data was/will be downloaded
DATA_ROOT = '/scratch-shared/fsun/dvata'

dataset_config = 'segmentation/multimodal/Feng/scannet-neucon-smallres-m2f.yaml'   
models_config = 'segmentation/multimodal/Feng/mvfusion'    # model family
model_name = 'MVFusion_3D_small_6views'                       # specific model

overrides = [
    'task=segmentation',
    f'data={dataset_config}',
    f'models={models_config}',
    f'model_name={model_name}',
    f'data.dataroot={DATA_ROOT}',
]

cfg = hydra_read(overrides)
OmegaConf.set_struct(cfg, False)  # This allows getattr and hasattr methods to function correctly
cfg.data.load_m2f_masks = True   # load Mask2Former predicted masks
cfg.data.m2f_preds_dirname = 'ViT_masks'
cfg.data.n_views = cfg.models[model_name].backbone.transformer.n_views
print(cfg.data.n_views)

# Dataset instantiation
start = time()
dataset = ScannetDatasetMM(cfg.data)
# print(dataset)|
print(f"Time = {time() - start:0.1f} sec.")

6
Load predicted 2D semantic segmentation labels from directory  ViT_masks
initialize train dataset
initialize val dataset
Time = 7.7 sec.


In [4]:
# from torch_points3d.models.model_factory import instantiate_model

# # ViT_masks 3rd run
# checkpoint_dir = '/home/fsun/DeepViewAgg/outputs/ViT_masks_3rd_run' # 3rd run

# # # ViT_masks 9 views
# # checkpoint_dir = '/home/fsun/DeepViewAgg/outputs/2023-01-25/16-02-53'


# # # MVFusion_orig
# # checkpoint_dir = '/home/fsun/DeepViewAgg/outputs/MVFusion_orig'


# # # M2F masks 6 views small
# # checkpoint_dir = "/home/fsun/DeepViewAgg/outputs/MVFusion_3D_6_views_m2f_masks"

# # Create the model
# print(f"Creating model: {cfg.model_name}")
# model = instantiate_model(cfg, dataset)
# # print(model)

# # Load the checkpoint and recover the 'best_miou' model weights
# checkpoint = torch.load(f'{checkpoint_dir}/{model_name}.pt', map_location='cpu')
# model.load_state_dict_with_same_shape(checkpoint['models']['best_miou'], strict=False)

# # Prepare the model for training
# model = model.cuda()
# print('Model loaded')


In [5]:
# import pandas as pd
# pd.set_option('display.max_rows', 50)

# # Create validation loader
# dataset.create_dataloaders(
#     model,
#     1,
#     False,
#     17,
#     False,
#     train_only=False,
#     val_only=True,
#     test_batch_size=1
# )

# mapping_idx_to_scan_names = getattr(dataset.val_dataset, "MAPPING_IDX_TO_SCAN_{}_NAMES".format(dataset.val_dataset.split.upper()))
# # print(mapping_idx_to_scan_names)
# # scan_name = mapping_idx_to_scan_names[0]
# # scan_name

# Model playground

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class MyModelA(nn.Module):
    def __init__(self):
        super(MyModelA, self).__init__()
        self.fc1 = nn.Linear(10, 2)
        
    def forward(self, x):
        x = self.fc1(x)
        return x
    

class MyModelB(nn.Module):
    def __init__(self):
        super(MyModelB, self).__init__()
        self.fc1 = nn.Linear(20, 2)
        
    def forward(self, x):
        x = self.fc1(x)
        return x


class MyEnsemble(nn.Module):
    def __init__(self, modelA, modelB):
        super(MyEnsemble, self).__init__()
        self.modelA = modelA
        self.modelB = modelB
        self.classifier = nn.Linear(4, 2)
        
    def forward(self, x1, x2):
        x1 = self.modelA(x1)
        x2 = self.modelB(x2)
        x = torch.cat((x1, x2), dim=1)
        x = self.classifier(F.relu(x))
        return x

# Create models and load state_dicts    
modelA = MyModelA()
modelB = MyModelB()
# Load state dicts
# modelA.load_state_dict(torch.load(PATH))
# modelB.load_state_dict(torch.load(PATH))

model = MyEnsemble(modelA, modelB)
x1, x2 = torch.randn(1, 10), torch.randn(1, 20)
output = model(x1, x2)
output

In [52]:
from mit_semseg.models import ModelBuilder

import torch
import torch.nn as nn

from PIL import Image
from torchvision import transforms


# Network Builders
net_encoder = ModelBuilder.build_encoder(
    arch='resnet18dilated',   #cfg.MODEL.arch_encoder.lower(),
    fc_dim=512,   #cfg.MODEL.fc_dim,
    weights="/home/fsun/thesis/pretrained/ade20k-resnet18dilated-c1_deepsup_encoder_epoch_20.pth")


preprocessing = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

im = Image.open(
    '/scratch-shared/fsun/data/scannet/scans/scene0011_00/color_resized/0.png')
im = preprocessing(im).unsqueeze(0)
print(im.shape)
out = net_encoder(im)[0]
print(out.shape)
print(out.max(), out.min())



net_decoder = ModelBuilder.build_decoder(
    arch='PPM', # cfg.MODEL.arch_decoder.lower(),
    fc_dim=512,      # cfg.MODEL.fc_dim,
    num_class=20,    #cfg.DATASET.num_class,
#         weights="/home/fsun/decoder_epoch_20.pth",    # cfg.MODEL.weights_decoder,
    use_softmax=False)

crit = nn.NLLLoss(ignore_index=-1)

segmentation_module = SegmentationModule(net_encoder, net_decoder, crit, deep_sup_scale=None)

print(segmentation_module)

#     # Dataset and Loader
#     dataset_val = ValDataset(
#         cfg.DATASET.root_dataset,
#         cfg.DATASET.list_val,
#         cfg.DATASET)
#     loader_val = torch.utils.data.DataLoader(
#         dataset_val,
#         batch_size=cfg.VAL.batch_size,
#         shuffle=False,
#         collate_fn=user_scattered_collate,
#         num_workers=5,
#         drop_last=True)

segmentation_module.cuda()

#     # Main loop
#     evaluate(segmentation_module, loader_val, cfg, gpu)

#     print('Evaluation Done!')



Loading weights for net_encoder
torch.Size([1, 3, 480, 640])
torch.Size([1, 512, 60, 80])
tensor(15.2177, grad_fn=<MaxBackward1>) tensor(0., grad_fn=<MinBackward1>)
SegmentationModule(
  (encoder): ResnetDilated(
    (conv1): Conv2d(3, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (bn1): SynchronizedBatchNorm2d(64, eps=1e-05, momentum=0.001, affine=True, track_running_stats=True)
    (relu1): ReLU(inplace=True)
    (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn2): SynchronizedBatchNorm2d(64, eps=1e-05, momentum=0.001, affine=True, track_running_stats=True)
    (relu2): ReLU(inplace=True)
    (conv3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn3): SynchronizedBatchNorm2d(128, eps=1e-05, momentum=0.001, affine=True, track_running_stats=True)
    (relu3): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1)

SegmentationModule(
  (encoder): ResnetDilated(
    (conv1): Conv2d(3, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (bn1): SynchronizedBatchNorm2d(64, eps=1e-05, momentum=0.001, affine=True, track_running_stats=True)
    (relu1): ReLU(inplace=True)
    (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn2): SynchronizedBatchNorm2d(64, eps=1e-05, momentum=0.001, affine=True, track_running_stats=True)
    (relu2): ReLU(inplace=True)
    (conv3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn3): SynchronizedBatchNorm2d(128, eps=1e-05, momentum=0.001, affine=True, track_running_stats=True)
    (relu3): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(128, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): SynchronizedBatch

In [53]:
# Load and normalize one image as a singleton tensor batch
pil_to_tensor = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406], # These are RGB mean+std values
        std=[0.229, 0.224, 0.225])  # across a large photo dataset.
])
pil_image = Image.open('/scratch-shared/fsun/data/scannet/scans/scene0011_00/color_resized/0.png')
img_original = np.array(pil_image)
img_data = pil_to_tensor(pil_image)

pil_label = Image.open('/scratch-shared/fsun/data/scannet/scans/scene0011_00/label-filt-scannet20/0.png')
label_original = np.array(pil_label)
label_data = torch.tensor(label_original)


singleton_batch = {'img_data': img_data[None].cuda(), 'seg_label': label_data[None].cuda()}
output_size = img_data.shape[1:]




segmentation_module.zero_grad()
scores = segmentation_module(singleton_batch, segSize=output_size)

    
# # Get the predicted scores for each pixel
# _, pred = torch.max(scores, dim=1)
# pred = pred.cpu()[0].numpy()
# visualize_result(img_original, pred)


# forward pass
loss, acc = segmentation_module(singleton_batch)
# loss = loss.mean()
# acc = acc.mean()

print(loss)

# # Backward
# loss.backward()
# for optimizer in optimizers:
#     optimizer.step()


input shape:  torch.Size([1, 3, 480, 640])


ValueError: Expected more than 1 value per channel when training, got input size torch.Size([1, 512, 1, 1])

In [78]:


class SegmentationModuleBase(nn.Module):
    def __init__(self):
        super(SegmentationModuleBase, self).__init__()

    def pixel_acc(self, pred, label):
        _, preds = torch.max(pred, dim=1)
        valid = (label >= 0).long()
        acc_sum = torch.sum(valid * (preds == label).long())
        pixel_sum = torch.sum(valid)
        acc = acc_sum.float() / (pixel_sum.float() + 1e-10)
        return acc

class SegmentationModule(SegmentationModuleBase):
    def __init__(self, net_enc, net_dec, crit, deep_sup_scale=None):
        super(SegmentationModule, self).__init__()
        self.encoder = net_enc
        self.decoder = net_dec
        self.crit = crit
        self.deep_sup_scale = deep_sup_scale

    def forward(self, feed_dict, *, segSize=None):
        
        print("input shape: ", feed_dict['img_data'].shape)
        
        # training
        if segSize is None:
            if self.deep_sup_scale is not None: # use deep supervision technique
                (pred, pred_deepsup) = self.decoder(self.encoder(feed_dict['img_data'], return_feature_maps=True))
            else:
                pred = self.decoder(self.encoder(feed_dict['img_data'], return_feature_maps=True))

                
            print(pred.shape, feed_dict['seg_label'].shape)
            loss = self.crit(pred, feed_dict['seg_label'])
            if self.deep_sup_scale is not None:
                loss_deepsup = self.crit(pred_deepsup, feed_dict['seg_label'])
                loss = loss + loss_deepsup * self.deep_sup_scale

            acc = self.pixel_acc(pred, feed_dict['seg_label'])
            return loss, acc
        # inference
        else:
            pred = self.decoder(self.encoder(feed_dict['img_data'], return_feature_maps=True), segSize=segSize)
            return pred

In [79]:
# System libs
import os, csv, torch, numpy, scipy.io, PIL.Image, torchvision.transforms
# Our libs
from mit_semseg.models import ModelBuilder
from mit_semseg.utils import colorEncode

In [86]:
# Network Builders
net_encoder = ModelBuilder.build_encoder(
    arch='resnet50dilated',
    fc_dim=2048)
net_decoder = ModelBuilder.build_decoder(
    arch='ppm_deepsup',
    fc_dim=2048,
    num_class=150,
#     weights='ckpt/ade20k-resnet50dilated-ppm_deepsup/decoder_epoch_20.pth',
    use_softmax=False)

crit = torch.nn.NLLLoss(ignore_index=-1)
segmentation_module = SegmentationModule(net_encoder, net_decoder, crit, deep_sup_scale=0.4)
# segmentation_module.eval()
segmentation_module.cuda()

SegmentationModule(
  (encoder): ResnetDilated(
    (conv1): Conv2d(3, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (bn1): SynchronizedBatchNorm2d(64, eps=1e-05, momentum=0.001, affine=True, track_running_stats=True)
    (relu1): ReLU(inplace=True)
    (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn2): SynchronizedBatchNorm2d(64, eps=1e-05, momentum=0.001, affine=True, track_running_stats=True)
    (relu2): ReLU(inplace=True)
    (conv3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn3): SynchronizedBatchNorm2d(128, eps=1e-05, momentum=0.001, affine=True, track_running_stats=True)
    (relu3): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(128, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): SynchronizedBatchNorm2d(64, eps=1

In [89]:
# Load and normalize one image as a singleton tensor batch
pil_to_tensor = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize(
        mean=[0.485, 0.456, 0.406], # These are RGB mean+std values
        std=[0.229, 0.224, 0.225])  # across a large photo dataset.
])

images = []
labels = []
for i in range(4):
    pil_image = PIL.Image.open('/scratch-shared/fsun/data/scannet/scans/scene0011_00/color_resized/0.png').convert('RGB')
#     img_original = numpy.array(pil_image)
    img_data = pil_to_tensor(pil_image)
    images.append(img_data)
    
    pil_label = Image.open('/scratch-shared/fsun/data/scannet/scans/scene0011_00/label-filt-scannet20/0.png')
    label_original = np.array(pil_label)
    label_data = torch.tensor(label_original)   
    labels.append(label_data)
    
    
print(images[0].shape)
images = torch.stack(images, axis=0)
labels = torch.stack(labels, axis=0)
print(images.shape)


singleton_batch = {'img_data': images.cuda(), 'seg_label': labels.cuda()}
output_size = img_data.shape[1:]


torch.Size([3, 480, 640])
torch.Size([4, 3, 480, 640])


In [90]:
# Run the segmentation at the highest resolution.
# with torch.no_grad():
scores = segmentation_module(singleton_batch)
    
# Get the predicted scores for each pixel
_, pred = torch.max(scores, dim=1)
pred = pred.cpu()[0].numpy()
pred.shape

input shape:  torch.Size([4, 3, 480, 640])
torch.Size([4, 150, 60, 80]) torch.Size([4, 480, 640])


RuntimeError: input and target batch or spatial sizes don't match: target [4, 480, 640], input [4, 150, 60, 80]

In [75]:
scores

tensor([[[[0.0011, 0.0011, 0.0011,  ..., 0.0061, 0.0061, 0.0061],
          [0.0011, 0.0011, 0.0011,  ..., 0.0061, 0.0061, 0.0061],
          [0.0011, 0.0011, 0.0011,  ..., 0.0061, 0.0061, 0.0061],
          ...,
          [0.0010, 0.0010, 0.0010,  ..., 0.0030, 0.0030, 0.0030],
          [0.0010, 0.0010, 0.0010,  ..., 0.0030, 0.0030, 0.0030],
          [0.0010, 0.0010, 0.0010,  ..., 0.0030, 0.0030, 0.0030]],

         [[0.0039, 0.0039, 0.0039,  ..., 0.0031, 0.0031, 0.0031],
          [0.0039, 0.0039, 0.0039,  ..., 0.0031, 0.0031, 0.0031],
          [0.0039, 0.0039, 0.0039,  ..., 0.0031, 0.0031, 0.0031],
          ...,
          [0.0020, 0.0020, 0.0020,  ..., 0.0025, 0.0025, 0.0025],
          [0.0020, 0.0020, 0.0020,  ..., 0.0025, 0.0025, 0.0025],
          [0.0020, 0.0020, 0.0020,  ..., 0.0025, 0.0025, 0.0025]],

         [[0.0038, 0.0038, 0.0038,  ..., 0.0099, 0.0099, 0.0099],
          [0.0038, 0.0038, 0.0038,  ..., 0.0099, 0.0099, 0.0099],
          [0.0038, 0.0038, 0.0038,  ..., 0