In [128]:
%%capture
!pip install pyyaml==5.1
!python -m pip install 'git+https://github.com/facebookresearch/detectron2.git@v0.5'

In [130]:
"""
%%capture
!wget http://images.cocodataset.org/zips/val2014.zip
!unzip val2014.zip
"""

In [131]:
"""
%%capture
!wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Val_mscoco.zip
!unzip v2_Annotations_Val_mscoco.zip
"""

In [132]:
%%capture
!pip install --upgrade tensorflow
!pip install --upgrade tensorflow-gpu

In [1]:
import torch
torch.__version__


import torch, torchvision
import matplotlib.pyplot as plt
import json
import cv2
import numpy as np
from copy import deepcopy
from detectron2.modeling import build_model
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.structures.image_list import ImageList
from detectron2.data import transforms as T
from detectron2.modeling.box_regression import Box2BoxTransform
from detectron2.modeling.roi_heads.fast_rcnn import FastRCNNOutputLayers
from detectron2.structures.boxes import Boxes
from detectron2.layers import nms
from detectron2 import model_zoo
from detectron2.config import get_cfg


"""
with open('v2_OpenEnded_mscoco_val2014_questions.json') as f:
    q = json.load(f)

with open('v2_mscoco_val2014_annotations.json') as f:
    a = json.load(f)
"""

"\nwith open('v2_OpenEnded_mscoco_val2014_questions.json') as f:\n    q = json.load(f)\n\nwith open('v2_mscoco_val2014_annotations.json') as f:\n    a = json.load(f)\n"

In [2]:
image_names = ["photos/applein_orangeout.png","photos/bag_in.png","photos/monkey_out.png","photos/monkeyin.png",
               "photos/orange_out.png","photos/spatula_out.png"]

images = [cv2.cvtColor(plt.imread(n),cv2.COLOR_RGB2BGR) for n in image_names]


In [3]:
cfg_path = "COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml"

def load_config_and_model_weights(cfg_path):
    cfg = get_cfg()
    cfg.merge_from_file(model_zoo.get_config_file(cfg_path))

    # ROI HEADS SCORE THRESHOLD
    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5

    # Comment the next line if you're using 'cuda'
    cfg['MODEL']['DEVICE']='cpu'

    cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(cfg_path)

    return cfg

cfg = load_config_and_model_weights(cfg_path)
print("cfg:",cfg)

cfg: CUDNN_BENCHMARK: False
DATALOADER:
  ASPECT_RATIO_GROUPING: True
  FILTER_EMPTY_ANNOTATIONS: True
  NUM_WORKERS: 4
  REPEAT_THRESHOLD: 0.0
  SAMPLER_TRAIN: TrainingSampler
DATASETS:
  PRECOMPUTED_PROPOSAL_TOPK_TEST: 1000
  PRECOMPUTED_PROPOSAL_TOPK_TRAIN: 2000
  PROPOSAL_FILES_TEST: ()
  PROPOSAL_FILES_TRAIN: ()
  TEST: ('coco_2017_val',)
  TRAIN: ('coco_2017_train',)
GLOBAL:
  HACK: 1.0
INPUT:
  CROP:
    ENABLED: False
    SIZE: [0.9, 0.9]
    TYPE: relative_range
  FORMAT: BGR
  MASK_FORMAT: polygon
  MAX_SIZE_TEST: 1333
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
  MIN_SIZE_TRAIN_SAMPLING: choice
  RANDOM_FLIP: horizontal
MODEL:
  ANCHOR_GENERATOR:
    ANGLES: [[-90, 0, 90]]
    ASPECT_RATIOS: [[0.5, 1.0, 2.0]]
    NAME: DefaultAnchorGenerator
    OFFSET: 0.0
    SIZES: [[32], [64], [128], [256], [512]]
  BACKBONE:
    FREEZE_AT: 2
    NAME: build_resnet_fpn_backbone
  DEVICE: cpu
  FPN:
    FUSE_TYPE: sum
    IN_FEATURES: ['res

In [4]:
def get_model(cfg):
    # build model
    model = build_model(cfg)

    # load weights
    checkpointer = DetectionCheckpointer(model)
    checkpointer.load(cfg.MODEL.WEIGHTS)

    # eval mode
    model.eval()
    return model

model = get_model(cfg)
print("model:",model)

model: GeneralizedRCNN(
  (backbone): FPN(
    (fpn_lateral2): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral3): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral4): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output4): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral5): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output5): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (top_block): LastLevelMaxPool()
    (bottom_up): ResNet(
      (stem): BasicStem(
        (conv1): Conv2d(
          3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False
          (norm): FrozenBatchNorm2d(num_features=64, eps=1e-05)
        )
      )
      (res2): Sequential(
        (0): Bottlene

In [5]:
def prepare_image_inputs(cfg, img_list):
    # Resizing the image according to the configuration
    transform_gen = T.ResizeShortestEdge(
                [cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST], cfg.INPUT.MAX_SIZE_TEST
            )
    img_list = [transform_gen.get_transform(img).apply_image(img) for img in img_list]

    # Convert to C,H,W format
    convert_to_tensor = lambda x: torch.Tensor(x.astype("float32").transpose(2, 0, 1))

    batched_inputs = [{"image":convert_to_tensor(img), "height": img.shape[0], "width": img.shape[1]} for img in img_list]

    print("batched inputs:",batched_inputs[0]["image"])
    # Normalizing the image
    num_channels = len(cfg.MODEL.PIXEL_MEAN)
    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(num_channels, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).view(num_channels, 1, 1)
    normalizer = lambda x: (x - pixel_mean) / pixel_std
    images = [normalizer(x["image"]) for x in batched_inputs]

    print("images after normalization:",images)
    # Convert to ImageList
    images =  ImageList.from_tensors(images,model.backbone.size_divisibility)
    
    return images, batched_inputs

images, batched_inputs = prepare_image_inputs(cfg, images)

batched inputs: tensor([[[0.6980, 0.6949, 0.6915,  ..., 0.7137, 0.7137, 0.7137],
         [0.6949, 0.6892, 0.6873,  ..., 0.7116, 0.7137, 0.7137],
         [0.6968, 0.6904, 0.6889,  ..., 0.7084, 0.7116, 0.7137],
         ...,
         [0.6275, 0.6243, 0.6235,  ..., 0.6157, 0.6149, 0.6118],
         [0.6267, 0.6235, 0.6228,  ..., 0.6149, 0.6142, 0.6110],
         [0.6235, 0.6204, 0.6196,  ..., 0.6118, 0.6110, 0.6078]],

        [[0.6980, 0.6949, 0.6941,  ..., 0.7137, 0.7137, 0.7137],
         [0.6980, 0.6949, 0.6941,  ..., 0.7094, 0.7137, 0.7137],
         [0.7007, 0.6975, 0.6968,  ..., 0.7102, 0.7142, 0.7164],
         ...,
         [0.6275, 0.6243, 0.6235,  ..., 0.6196, 0.6194, 0.6183],
         [0.6267, 0.6235, 0.6228,  ..., 0.6189, 0.6181, 0.6149],
         [0.6235, 0.6204, 0.6196,  ..., 0.6157, 0.6149, 0.6118]],

        [[0.6980, 0.6949, 0.6941,  ..., 0.7137, 0.7137, 0.7137],
         [0.6980, 0.6949, 0.6941,  ..., 0.7094, 0.7137, 0.7137],
         [0.7007, 0.6975, 0.6968,  ..., 0.

In [6]:
def get_features(model, images):
    features = model.backbone(images.tensor)
    return features

features = get_features(model, images)
print("features of images:",features)

features of images: {'p2': tensor([[[[-7.6330e-01,  2.5363e-01,  3.0656e-01,  ...,  1.0963e+00,
            1.1150e+00,  3.3567e+00],
          [-1.8741e+00, -1.4465e+00, -1.7043e+00,  ..., -1.3515e-01,
            7.3335e-03,  3.4301e+00],
          [-1.4147e+00, -1.4729e+00, -2.2716e+00,  ..., -3.8235e-01,
           -9.8905e-02,  3.3508e+00],
          ...,
          [-2.2969e-01,  9.8754e-02, -5.7521e-01,  ..., -1.4978e-01,
            1.7264e-01,  3.7043e+00],
          [ 2.2600e-01,  7.0777e-01,  4.1457e-03,  ..., -5.3150e-01,
           -5.0209e-01,  3.3179e+00],
          [-3.4351e-01,  6.9602e-01, -2.3173e-01,  ..., -8.7868e-01,
           -1.0457e+00,  1.5650e+00]],

         [[-2.1819e+00, -5.7852e-01, -2.5985e-01,  ..., -9.9547e-01,
           -1.0274e+00, -1.4884e-01],
          [-3.2686e+00, -1.2337e+00, -5.0176e-01,  ..., -6.5273e-01,
           -6.5724e-01, -8.6123e-02],
          [-2.9662e+00, -1.4376e+00, -3.5085e-01,  ..., -6.5060e-01,
           -6.7174e-01, -1.6428

In [7]:
#plt.imshow(cv2.resize(img2, (images.tensor.shape[-2:][::-1])))
plt.show()
for key in features.keys():
    print(key)
    print(features[key].shape)
    #plt.imshow(features[key][1,0,:,:].squeeze().detach().numpy(), cmap='jet')
    plt.show()

print(len(features["p4"]))
print(features["p4"])

p2
torch.Size([6, 256, 200, 336])
p3
torch.Size([6, 256, 100, 168])
p4
torch.Size([6, 256, 50, 84])
p5
torch.Size([6, 256, 25, 42])
p6
torch.Size([6, 256, 13, 21])
6
tensor([[[[-0.2339, -0.7605, -0.5529,  ..., -0.2402,  0.1239,  0.2756],
          [-1.4405, -2.9401, -2.6750,  ..., -2.2208, -1.8992, -1.1978],
          [-1.1124, -2.6250, -2.3642,  ..., -1.7072, -1.5240, -0.9410],
          ...,
          [-0.4305, -1.7334, -1.4822,  ..., -0.8322, -0.6675, -0.5007],
          [-0.2722, -1.1909, -0.7855,  ..., -0.3541, -0.2216, -0.2739],
          [-0.6323, -1.3772, -1.0057,  ..., -0.8694, -0.7488, -0.5620]],

         [[-2.4481, -1.3858, -0.9061,  ..., -2.0684, -2.1039, -1.2381],
          [-2.2668, -0.2450,  0.2346,  ..., -1.1631, -1.1063, -0.4685],
          [-2.1394, -0.2023,  0.2585,  ..., -1.3468, -1.3329, -0.5060],
          ...,
          [-2.8850, -2.1283, -1.8127,  ..., -1.5873, -1.7189, -1.0331],
          [-3.2258, -2.4603, -2.2480,  ..., -2.0802, -2.2068, -1.4842],
          

In [8]:
def get_proposals(model, images, features):
    proposals, _ = model.proposal_generator(images, features)
    return proposals

proposals = get_proposals(model, images, features)
print("proposals according to features:",proposals)

proposals according to features: [Instances(num_instances=1000, image_height=800, image_width=801, fields=[proposal_boxes: Boxes(tensor([[  0.0000,  29.0596, 801.0000, 763.2153],
        [138.5629, 156.9431, 650.9709, 560.9057],
        [116.3948, 270.1638, 676.5148, 599.6572],
        ...,
        [ 30.8001,   0.0000,  83.4920,  11.9849],
        [769.1656,   0.0000, 801.0000,   8.0769],
        [  7.8177, 742.8581,  55.6651, 793.0452]])), objectness_logits: tensor([  0.6382,  -0.2936,  -1.3645,  -1.4648,  -1.6877,  -1.9759,  -1.9793,
         -1.9982,  -2.3819,  -2.5565,  -2.5787,  -2.7203,  -2.7401,  -2.9841,
         -3.4334,  -3.5020,  -3.6674,  -3.8055,  -3.9749,  -3.9757,  -4.0528,
         -4.0692,  -4.1512,  -4.4841,  -4.5131,  -4.6626,  -4.8122,  -4.8151,
         -4.8157,  -4.8217,  -4.8306,  -4.9056,  -4.9281,  -4.9287,  -4.9516,
         -5.0019,  -5.0414,  -5.1162,  -5.1195,  -5.2351,  -5.2870,  -5.3381,
         -5.4966,  -5.5286,  -5.5306,  -5.5776,  -5.6401,  -5.7185, 

In [9]:
def get_box_features(model, features, proposals):
    features_list = [features[f] for f in ['p2', 'p3', 'p4', 'p5']]
    box_features = model.roi_heads.box_pooler(features_list, [x.proposal_boxes for x in proposals])
    box_features = model.roi_heads.box_head.flatten(box_features)
    box_features = model.roi_heads.box_head.fc1(box_features)
    box_features = model.roi_heads.box_head.fc_relu1(box_features)
    box_features = model.roi_heads.box_head.fc2(box_features)

    box_features = box_features.reshape(6, 1000, 1024) # depends on your config and batch size
    return box_features, features_list

box_features, features_list = get_box_features(model, features, proposals)
print("box features:",box_features)
print("features list:",features_list)

box features: tensor([[[ 1.3052,  0.1629, -0.1116,  ..., -1.2527,  0.5832,  0.3641],
         [ 1.3308,  0.6363,  0.7799,  ..., -1.0797,  1.0184,  0.6271],
         [ 1.2399, -0.3756,  0.8246,  ..., -0.6423,  0.1965,  1.0785],
         ...,
         [ 0.7061, -1.1168, -0.3525,  ..., -2.1814,  0.7838,  0.4047],
         [ 0.2750, -1.0779, -0.5362,  ..., -2.0326,  0.7429,  0.3882],
         [ 0.2754, -1.3322,  0.5733,  ..., -0.2895,  1.0622,  1.4202]],

        [[ 1.2921,  0.2224, -0.1565,  ..., -1.3150,  0.6241,  0.3345],
         [ 1.3121,  0.8235,  0.8375,  ..., -1.1112,  1.1912,  0.6513],
         [ 1.2711,  0.7147,  0.6594,  ..., -1.1116,  1.0816,  0.6152],
         ...,
         [ 0.6493, -1.3101, -0.3563,  ..., -2.3190,  0.7385,  0.2548],
         [ 0.6411, -1.3301, -0.3574,  ..., -2.3179,  0.7456,  0.2556],
         [ 0.6500, -1.3280, -0.3586,  ..., -2.3151,  0.7485,  0.2630]],

        [[ 1.3624,  0.4252, -0.1540,  ..., -1.3095,  0.8081,  0.3464],
         [ 1.1478,  0.9532,  0.

In [10]:
def get_prediction_logits(model, features_list, proposals):
    cls_features = model.roi_heads.box_pooler(features_list, [x.proposal_boxes for x in proposals])
    cls_features = model.roi_heads.box_head(cls_features)
    pred_class_logits, pred_proposal_deltas = model.roi_heads.box_predictor(cls_features)
    return pred_class_logits, pred_proposal_deltas

pred_class_logits, pred_proposal_deltas = get_prediction_logits(model, features_list, proposals)
print("pred class logits:",pred_class_logits)
print("pred proposal deltas:",pred_proposal_deltas)

pred class logits: tensor([[ 1.1999, -1.3349,  2.8133,  ..., -0.9857, -1.2914, 10.8945],
        [ 1.1531, -0.8976,  2.1548,  ..., -1.4252, -0.7650, 14.8216],
        [ 2.0449, -1.4738,  3.2465,  ..., -1.2717, -0.3335, 16.5184],
        ...,
        [ 3.2102, -2.0527,  3.7598,  ..., -1.5842, -1.4790, 22.6621],
        [ 3.1997, -2.0556,  3.7477,  ..., -1.5795, -1.4767, 22.6618],
        [ 3.1988, -2.0535,  3.7276,  ..., -1.5803, -1.4906, 22.6386]],
       grad_fn=<AddmmBackward>)
pred proposal deltas: tensor([[ 0.1312, -0.0026, -0.3438,  ..., -0.2616, -0.1239,  0.2320],
        [ 0.2838,  0.4702,  0.6241,  ...,  0.2696,  1.0796,  1.0592],
        [ 0.2939,  0.2839,  0.1619,  ..., -0.0379,  0.5055,  0.4161],
        ...,
        [-0.1276, -0.4415,  0.0187,  ..., -0.4610,  0.4174, -0.5578],
        [-0.1268, -0.4414,  0.0199,  ..., -0.4609,  0.4170, -0.5598],
        [-0.1287, -0.4410,  0.0171,  ..., -0.4613,  0.4171, -0.5532]],
       grad_fn=<AddmmBackward>)


In [11]:
class FastRCNNOutputs:
    
    def __init__(
        self,
        box2box_transform,
        pred_class_logits,
        pred_proposal_deltas,
        proposals,
        smooth_l1_beta=0.0,
        box_reg_loss_type="smooth_l1",
    ):

        self.box2box_transform = box2box_transform
        self.num_preds_per_image = [len(p) for p in proposals]
        self.pred_class_logits = pred_class_logits
        self.pred_proposal_deltas = pred_proposal_deltas
        self.smooth_l1_beta = smooth_l1_beta
        self.box_reg_loss_type = box_reg_loss_type

        self.image_shapes = [x.image_size for x in proposals]

        #concatenation
        if len(proposals):
            box_type = type(proposals[0].proposal_boxes)
            # cat(..., dim=0) concatenates over all images in the batch
            self.proposals = box_type.cat([p.proposal_boxes for p in proposals])
            assert (
                not self.proposals.tensor.requires_grad
            ), "Proposals should not require gradients!"

            if proposals[0].has("gt_classes"):
                self.gt_classes = cat([p.gt_classes for p in proposals], dim=0)
                
                gt_boxes = [
                    p.gt_boxes if p.has("gt_boxes") else p.proposal_boxes for p in proposals
                ]
                self.gt_boxes = box_type.cat(gt_boxes)
        else:
            self.proposals = Boxes(torch.zeros(0, 4, device=self.pred_proposal_deltas.device))
        self._no_instances = len(self.proposals) == 0  # no instances found
        

    def predict_boxes(self):
        """
        Deprecated
        """
        pred = self.box2box_transform.apply_deltas(self.pred_proposal_deltas, self.proposals.tensor)
        return pred.split(self.num_preds_per_image, dim=0)

    def predict_probs(self):
        """
        Deprecated
        """
        probs = torch.nn.functional.softmax(self.pred_class_logits, dim=-1)
        return probs.split(self.num_preds_per_image, dim=0)


In [12]:
def get_box_scores(cfg, pred_class_logits, pred_proposal_deltas):
    box2box_transform = Box2BoxTransform(weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS)
    smooth_l1_beta = cfg.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA

    outputs = FastRCNNOutputs(
        box2box_transform,
        pred_class_logits,
        pred_proposal_deltas,
        proposals,
        smooth_l1_beta,
    )

    boxes = outputs.predict_boxes()
    scores = outputs.predict_probs()
    image_shapes = outputs.image_shapes

    return boxes, scores, image_shapes

boxes, scores, image_shapes = get_box_scores(cfg, pred_class_logits, pred_proposal_deltas)
print("boxes:",len(boxes))
print("scores:",len(scores))
print("image shapes:",len(image_shapes))
print("boxes shapes:",boxes)

boxes: 6
scores: 6
image shapes: 6
boxes shapes: (tensor([[ 3.7125e+01,  7.4371e-01,  7.8490e+02,  ..., -7.5747e+00,
          7.9273e+02,  7.6144e+02],
        [ 1.1904e+02,  1.0259e+02,  6.9958e+02,  ...,  1.2018e+02,
          7.2502e+02,  6.1946e+02],
        [ 1.2364e+02,  2.5852e+02,  7.0219e+02,  ...,  2.5462e+02,
          7.2439e+02,  6.1271e+02],
        ...,
        [ 2.9916e+01, -1.2694e-01,  8.2664e+01,  ..., -3.4691e-02,
          8.6147e+01,  1.1292e+01],
        [ 7.6868e+02,  1.8076e-01,  8.0136e+02,  ...,  2.8820e-01,
          8.0220e+02,  7.6461e+00],
        [ 7.7081e+00,  7.3798e+02,  5.5557e+01,  ...,  7.4016e+02,
          5.8036e+01,  7.9168e+02]], grad_fn=<SplitWithSizesBackward>), tensor([[ 3.2008e+01, -8.7063e+00,  8.7169e+02,  ..., -1.3599e+01,
          8.7863e+02,  7.5881e+02],
        [ 1.6601e+02,  1.0829e+02,  7.7371e+02,  ...,  1.3526e+02,
          8.0762e+02,  6.1629e+02],
        [ 1.3744e+02,  1.0484e+02,  7.3316e+02,  ...,  1.2668e+02,
          

In [13]:
def get_output_boxes(boxes, batched_inputs, image_size):
    proposal_boxes = boxes.reshape(-1, 4).clone()
    scale_x, scale_y = (batched_inputs["width"] / image_size[1], batched_inputs["height"] / image_size[0])
    output_boxes = Boxes(proposal_boxes)

    output_boxes.scale(scale_x, scale_y)
    output_boxes.clip(image_size)

    return output_boxes

output_boxes = [get_output_boxes(boxes[i], batched_inputs[i], proposals[i].image_size) for i in range(len(proposals))]
print("output boxes:", output_boxes)

output boxes: [Boxes(tensor([[3.7125e+01, 7.4371e-01, 7.8490e+02, 7.9115e+02],
        [2.6838e+01, 0.0000e+00, 7.8767e+02, 7.8275e+02],
        [0.0000e+00, 0.0000e+00, 8.0100e+02, 7.8523e+02],
        ...,
        [7.2383e+00, 7.3854e+02, 5.7555e+01, 7.9380e+02],
        [7.1386e+00, 7.3798e+02, 5.6565e+01, 7.9410e+02],
        [7.0444e+00, 7.4016e+02, 5.8036e+01, 7.9168e+02]],
       grad_fn=<StackBackward>)), Boxes(tensor([[3.2008e+01, 0.0000e+00, 8.7169e+02, 7.8995e+02],
        [2.8628e+01, 0.0000e+00, 8.7332e+02, 7.8148e+02],
        [0.0000e+00, 0.0000e+00, 8.8400e+02, 7.8523e+02],
        ...,
        [4.9461e+02, 0.0000e+00, 5.4520e+02, 9.2984e+00],
        [4.9379e+02, 0.0000e+00, 5.4561e+02, 9.7024e+00],
        [4.9350e+02, 6.7459e-02, 5.4721e+02, 9.1568e+00]],
       grad_fn=<StackBackward>)), Boxes(tensor([[3.3115e+01, 0.0000e+00, 1.0497e+03, 7.9854e+02],
        [4.1303e+01, 0.0000e+00, 1.0668e+03, 7.8383e+02],
        [0.0000e+00, 0.0000e+00, 1.0810e+03, 7.9105e+02],
 

In [14]:
def select_boxes(cfg, output_boxes, scores):
    test_score_thresh = cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST
    test_nms_thresh = cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST
    cls_prob = scores.detach()
    cls_boxes = output_boxes.tensor.detach().reshape(1000,80,4)
    max_conf = torch.zeros((cls_boxes.shape[0]))
    for cls_ind in range(0, cls_prob.shape[1]-1):
        cls_scores = cls_prob[:, cls_ind+1]
        det_boxes = cls_boxes[:,cls_ind,:]
        keep = np.array(nms(det_boxes, cls_scores, test_nms_thresh))
        max_conf[keep] = torch.where(cls_scores[keep] > max_conf[keep], cls_scores[keep], max_conf[keep])
    keep_boxes = torch.where(max_conf >= test_score_thresh)[0]
    return keep_boxes, max_conf

In [15]:
temp = [select_boxes(cfg, output_boxes[i], scores[i]) for i in range(len(scores))]
keep_boxes, max_conf = [],[]
for keep_box, mx_conf in temp:
    keep_boxes.append(keep_box)
    max_conf.append(mx_conf)

In [16]:
MIN_BOXES=10
MAX_BOXES=100
def filter_boxes(keep_boxes, max_conf, min_boxes, max_boxes):
    if len(keep_boxes) < min_boxes:
        keep_boxes = np.argsort(max_conf).numpy()[::-1][:min_boxes]
    elif len(keep_boxes) > max_boxes:
        keep_boxes = np.argsort(max_conf).numpy()[::-1][:max_boxes]
    return keep_boxes

keep_boxes = [filter_boxes(keep_box, mx_conf, MIN_BOXES, MAX_BOXES) for keep_box, mx_conf in zip(keep_boxes, max_conf)]
print("keep boxes:",keep_boxes)

keep boxes: [array([580, 986, 143, 345, 221, 359, 820, 640, 371, 384, 430, 527, 396,
       234, 595, 926, 540, 403, 994, 988, 575, 668, 325, 319, 254, 955,
       652, 542, 533, 720, 715, 727,  84, 990, 822, 579, 777, 404, 845,
       585, 876, 993, 539, 625, 171, 407, 762, 474, 765, 959, 756, 477,
       826, 537, 503,  75, 556, 884, 473, 989, 425, 232, 666, 670, 219,
        69, 264, 951, 417, 458, 576, 491, 424, 485, 487, 658, 511, 488,
       941, 945, 970, 622, 996, 833, 366, 126, 128, 772, 569, 659, 737,
       414, 795, 782, 463, 972, 492, 672, 665, 293], dtype=int64), array([999, 554, 387, 409, 171, 464, 911, 366, 959, 998, 362, 516, 965,
       353, 344, 978, 321, 990, 997, 690, 986, 993, 672, 984, 549, 363,
       974, 656, 937, 971, 935, 728, 295, 727, 371, 630, 610, 256, 706,
       450, 379, 800, 382, 599, 211, 759, 517, 530, 864, 973, 983, 801,
       767, 779, 806, 237, 972, 724, 797, 910, 536, 404, 979, 970, 650,
       674, 301,  90, 168, 465, 436, 932, 444, 489, 217,

In [17]:
def get_visual_embeds(box_features, keep_boxes):
    return box_features[keep_boxes.copy()]

#visual_embeds = [get_visual_embeds(box_feature, keep_box) for box_feature, keep_box in zip(box_features, keep_boxes)]
#print("and visual embeds, finally:",visual_embeds)

In [18]:
import os
from getpass import getpass
import urllib
import torch.nn.functional as F

In [19]:
from transformers import BertTokenizer, VisualBertForVisualReasoning
import torch

visual_embeds = [get_visual_embeds(box_feature, keep_box) for box_feature, keep_box in zip(box_features, keep_boxes)]

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = VisualBertForVisualReasoning.from_pretrained("uclanlp/visualbert-nlvr2-coco-pre")

texts = ["Is apple in box?","Is bag out of box?","Is monkey in the box?","Is monkey in the box?","Is orange in the box?",
        "Is spatula in the box?"]

real_answers = [0,1,1,0,1,1]

visual_embeds = torch.stack(visual_embeds)

for i in range(len(images)):
    print("for image "+str(i))
    
    text = texts[i]
    
    inputs = tokenizer(text, return_tensors="pt")
    
    #torch ones
    visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
    visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
    

    inputs.update(
        {
            "visual_embeds": visual_embeds[i:i+1],
            "visual_token_type_ids": visual_token_type_ids[i:i+1],
            "visual_attention_mask": visual_attention_mask[i:i+1],
        }
    )

    #no need to use unsqueeze for visual embeds because its already a list holding lists
    labels = torch.tensor(1).unsqueeze(0)  # Batch size 1, Num choices 2
    outputs = model(**inputs, labels=labels)
    #print("labels:",labels)
    print("outputs:",outputs)
    loss = outputs.loss
    print("loss:",loss)
    scores = outputs.logits
    #print("scores:",scores)
    out = F.softmax(scores,dim = 1)
    print("real answer:",str(real_answers[i])+" ----------- outputs after softmax op:",out)

Some weights of the model checkpoint at uclanlp/visualbert-nlvr2-coco-pre were not used when initializing VisualBertForVisualReasoning: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing VisualBertForVisualReasoning from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing VisualBertForVisualReasoning from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of VisualBertForVisualReasoning were 

for image 0
outputs: SequenceClassifierOutput(loss=tensor(0.5852, grad_fn=<NllLossBackward>), logits=tensor([[-0.5378, -0.3089]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
loss: tensor(0.5852, grad_fn=<NllLossBackward>)
real answer: 0 ----------- outputs after softmax op: tensor([[0.4430, 0.5570]], grad_fn=<SoftmaxBackward>)
for image 1
outputs: SequenceClassifierOutput(loss=tensor(0.5742, grad_fn=<NllLossBackward>), logits=tensor([[-0.5199, -0.2660]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
loss: tensor(0.5742, grad_fn=<NllLossBackward>)
real answer: 1 ----------- outputs after softmax op: tensor([[0.4369, 0.5631]], grad_fn=<SoftmaxBackward>)
for image 2
outputs: SequenceClassifierOutput(loss=tensor(0.5836, grad_fn=<NllLossBackward>), logits=tensor([[-0.5371, -0.3045]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
loss: tensor(0.5836, grad_fn=<NllLossBackward>)
real answer: 1 ----------- outputs after softmax op: tensor([[0

In [20]:
"""
from transformers import AutoTokenizer, VisualBertForQuestionAnswering
import torch

visual_embeds = [get_visual_embeds(box_feature, keep_box) for box_feature, keep_box in zip(box_features, keep_boxes)]


tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = VisualBertForQuestionAnswering.from_pretrained("uclanlp/visualbert-vqa")

text = "Where is the monkey?"
inputs = tokenizer(text, return_tensors="pt")
#visual_embeds = get_visual_embeds(image).unsqueeze(0)
visual_embeds = torch.stack(visual_embeds)
visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)

inputs.update(
    {
        "visual_embeds": visual_embeds[0:1],
        "visual_token_type_ids": visual_token_type_ids[0:1],
        "visual_attention_mask": visual_attention_mask[0:1],
    }
)

labels = torch.tensor([[0.0, 1.0]]).unsqueeze(0)  # Batch size 1, Num choices 2
outputs = model(**inputs, labels=labels)
print(outputs)
#print(f"output: {out.shape}")
loss = outputs.loss
print(loss)
scores = outputs.logits
print(scores)
"""

'\nfrom transformers import AutoTokenizer, VisualBertForQuestionAnswering\nimport torch\n\nvisual_embeds = [get_visual_embeds(box_feature, keep_box) for box_feature, keep_box in zip(box_features, keep_boxes)]\n\n\ntokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")\nmodel = VisualBertForQuestionAnswering.from_pretrained("uclanlp/visualbert-vqa")\n\ntext = "Where is the monkey?"\ninputs = tokenizer(text, return_tensors="pt")\n#visual_embeds = get_visual_embeds(image).unsqueeze(0)\nvisual_embeds = torch.stack(visual_embeds)\nvisual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)\nvisual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)\n\ninputs.update(\n    {\n        "visual_embeds": visual_embeds[0:1],\n        "visual_token_type_ids": visual_token_type_ids[0:1],\n        "visual_attention_mask": visual_attention_mask[0:1],\n    }\n)\n\nlabels = torch.tensor([[0.0, 1.0]]).unsqueeze(0)  # Batch size 1, Num choices 2\noutputs = m

In [21]:
#print(outputs["instances"].pred_classes)
#print(outputs["instances"].pred_boxes)