In [128]:
%%capture
!pip install pyyaml==5.1
!python -m pip install 'git+https://github.com/facebookresearch/detectron2.git@v0.5'

In [130]:
%%capture
!wget http://images.cocodataset.org/zips/val2014.zip
!unzip val2014.zip

In [131]:
%%capture
!wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Val_mscoco.zip
!unzip v2_Annotations_Val_mscoco.zip

In [132]:
%%capture
!pip install --upgrade tensorflow
!pip install --upgrade tensorflow-gpu

In [1]:
import torch
torch.__version__


import torch, torchvision
import matplotlib.pyplot as plt
import json
import cv2
import numpy as np
from copy import deepcopy
from detectron2.modeling import build_model
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.structures.image_list import ImageList
from detectron2.data import transforms as T
from detectron2.modeling.box_regression import Box2BoxTransform
from detectron2.modeling.roi_heads.fast_rcnn import FastRCNNOutputLayers
from detectron2.structures.boxes import Boxes
from detectron2.layers import nms
from detectron2 import model_zoo
from detectron2.config import get_cfg


"""
with open('v2_OpenEnded_mscoco_val2014_questions.json') as f:
    q = json.load(f)

with open('v2_mscoco_val2014_annotations.json') as f:
    a = json.load(f)
"""

"\nwith open('v2_OpenEnded_mscoco_val2014_questions.json') as f:\n    q = json.load(f)\n\nwith open('v2_mscoco_val2014_annotations.json') as f:\n    a = json.load(f)\n"

In [2]:
#from tqdm.auto import tqdm

In [65]:
#000000000142.jpg","caption":"The banana is part of the sandwich. label 1
#000000000370.jpg","caption":"The person is touching the broccoli." label 1
#000000000397.jpg","caption":"The pizza is over the dining table." label 1
#000000002570.jpg","caption":"The donut is in front of the laptop." label 1
#000000003862.jpg","caption":"The dog is on top of the bench.","label":0,
#"photos/bag_in.png","photos/monkey_out.png","photos/monkeyin.png","photos/applein_orangeout.png","photos/spatula_out.png"

#"photos/orange_out.png",
image_names = ["images/000000003862.jpg"]

images = [cv2.cvtColor(plt.imread(n),cv2.COLOR_RGB2BGR) for n in image_names]


In [66]:
cfg_path = "COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml"

def load_config_and_model_weights(cfg_path):
    cfg = get_cfg()
    cfg.merge_from_file(model_zoo.get_config_file(cfg_path))

    # ROI HEADS SCORE THRESHOLD
    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5

    # Comment the next line if you're using 'cuda'
    cfg['MODEL']['DEVICE']='cpu'

    cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(cfg_path)

    return cfg

cfg = load_config_and_model_weights(cfg_path)

In [67]:
def get_model(cfg):
    # build model
    model = build_model(cfg)

    # load weights
    checkpointer = DetectionCheckpointer(model)
    checkpointer.load(cfg.MODEL.WEIGHTS)

    # eval mode
    model.eval()
    return model

model = get_model(cfg)

In [68]:
def prepare_image_inputs(cfg, img_list):
    # Resizing the image according to the configuration
    transform_gen = T.ResizeShortestEdge(
                [cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST], cfg.INPUT.MAX_SIZE_TEST
            )
    img_list = [transform_gen.get_transform(img).apply_image(img) for img in img_list]

    # Convert to C,H,W format
    convert_to_tensor = lambda x: torch.Tensor(x.astype("float32").transpose(2, 0, 1))

    batched_inputs = [{"image":convert_to_tensor(img), "height": img.shape[0], "width": img.shape[1]} for img in img_list]

    # Normalizing the image
    num_channels = len(cfg.MODEL.PIXEL_MEAN)
    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(num_channels, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).view(num_channels, 1, 1)
    normalizer = lambda x: (x - pixel_mean) / pixel_std
    images = [normalizer(x["image"]) for x in batched_inputs]

    # Convert to ImageList
    images =  ImageList.from_tensors(images,model.backbone.size_divisibility)
    
    return images, batched_inputs

images, batched_inputs = prepare_image_inputs(cfg, images)

In [69]:
def get_features(model, images):
    features = model.backbone(images.tensor)
    return features

features = get_features(model, images)

In [70]:
#plt.imshow(cv2.resize(img2, (images.tensor.shape[-2:][::-1])))
"""
plt.show()
for key in features.keys():
    print(key)
    print(features[key].shape)
    #plt.imshow(features[key][1,0,:,:].squeeze().detach().numpy(), cmap='jet')
    plt.show()

print(len(features["p4"]))
print(features["p4"])
"""

'\nplt.show()\nfor key in features.keys():\n    print(key)\n    print(features[key].shape)\n    #plt.imshow(features[key][1,0,:,:].squeeze().detach().numpy(), cmap=\'jet\')\n    plt.show()\n\nprint(len(features["p4"]))\nprint(features["p4"])\n'

In [71]:
def get_proposals(model, images, features):
    proposals, _ = model.proposal_generator(images, features)
    return proposals

proposals = get_proposals(model, images, features)

In [72]:
def get_box_features(model, features, proposals):
    features_list = [features[f] for f in ['p2', 'p3', 'p4', 'p5']]
    box_features = model.roi_heads.box_pooler(features_list, [x.proposal_boxes for x in proposals])
    box_features = model.roi_heads.box_head.flatten(box_features)
    box_features = model.roi_heads.box_head.fc1(box_features)
    box_features = model.roi_heads.box_head.fc_relu1(box_features)
    box_features = model.roi_heads.box_head.fc2(box_features)

    box_features = box_features.reshape(1, 1000, 1024) # depends on your config and batch size
    return box_features, features_list

box_features, features_list = get_box_features(model, features, proposals)

In [73]:
def get_prediction_logits(model, features_list, proposals):
    cls_features = model.roi_heads.box_pooler(features_list, [x.proposal_boxes for x in proposals])
    cls_features = model.roi_heads.box_head(cls_features)
    pred_class_logits, pred_proposal_deltas = model.roi_heads.box_predictor(cls_features)
    return pred_class_logits, pred_proposal_deltas

pred_class_logits, pred_proposal_deltas = get_prediction_logits(model, features_list, proposals)

In [74]:
class FastRCNNOutputs:
    
    def __init__(
        self,
        box2box_transform,
        pred_class_logits,
        pred_proposal_deltas,
        proposals,
        smooth_l1_beta=0.0,
        box_reg_loss_type="smooth_l1",
    ):

        self.box2box_transform = box2box_transform
        self.num_preds_per_image = [len(p) for p in proposals]
        self.pred_class_logits = pred_class_logits
        self.pred_proposal_deltas = pred_proposal_deltas
        self.smooth_l1_beta = smooth_l1_beta
        self.box_reg_loss_type = box_reg_loss_type

        self.image_shapes = [x.image_size for x in proposals]

        #concatenation
        if len(proposals):
            box_type = type(proposals[0].proposal_boxes)
            # cat(..., dim=0) concatenates over all images in the batch
            self.proposals = box_type.cat([p.proposal_boxes for p in proposals])
            assert (
                not self.proposals.tensor.requires_grad
            ), "Proposals should not require gradients!"

            if proposals[0].has("gt_classes"):
                self.gt_classes = cat([p.gt_classes for p in proposals], dim=0)
                
                gt_boxes = [
                    p.gt_boxes if p.has("gt_boxes") else p.proposal_boxes for p in proposals
                ]
                self.gt_boxes = box_type.cat(gt_boxes)
        else:
            self.proposals = Boxes(torch.zeros(0, 4, device=self.pred_proposal_deltas.device))
        self._no_instances = len(self.proposals) == 0  # no instances found
        

    def predict_boxes(self):
        """
        Deprecated
        """
        pred = self.box2box_transform.apply_deltas(self.pred_proposal_deltas, self.proposals.tensor)
        return pred.split(self.num_preds_per_image, dim=0)

    def predict_probs(self):
        """
        Deprecated
        """
        probs = torch.nn.functional.softmax(self.pred_class_logits, dim=-1)
        return probs.split(self.num_preds_per_image, dim=0)


In [75]:
def get_box_scores(cfg, pred_class_logits, pred_proposal_deltas):
    box2box_transform = Box2BoxTransform(weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS)
    smooth_l1_beta = cfg.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA

    outputs = FastRCNNOutputs(
        box2box_transform,
        pred_class_logits,
        pred_proposal_deltas,
        proposals,
        smooth_l1_beta,
    )
    

    boxes = outputs.predict_boxes()
    scores = outputs.predict_probs()
    image_shapes = outputs.image_shapes

    return boxes, scores, image_shapes

boxes, scores, image_shapes = get_box_scores(cfg, pred_class_logits, pred_proposal_deltas)

In [76]:
def get_output_boxes(boxes, batched_inputs, image_size):
    proposal_boxes = boxes.reshape(-1, 4).clone()
    scale_x, scale_y = (batched_inputs["width"] / image_size[1], batched_inputs["height"] / image_size[0])
    output_boxes = Boxes(proposal_boxes)

    output_boxes.scale(scale_x, scale_y)
    output_boxes.clip(image_size)

    return output_boxes

output_boxes = [get_output_boxes(boxes[i], batched_inputs[i], proposals[i].image_size) for i in range(len(proposals))]

In [77]:
def select_boxes(cfg, output_boxes, scores):
    test_score_thresh = cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST
    test_nms_thresh = cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST
    cls_prob = scores.detach()
    cls_boxes = output_boxes.tensor.detach().reshape(1000,80,4)
    max_conf = torch.zeros((cls_boxes.shape[0]))
    for cls_ind in range(0, cls_prob.shape[1]-1):
        cls_scores = cls_prob[:, cls_ind+1]
        det_boxes = cls_boxes[:,cls_ind,:]
        keep = np.array(nms(det_boxes, cls_scores, test_nms_thresh))
        max_conf[keep] = torch.where(cls_scores[keep] > max_conf[keep], cls_scores[keep], max_conf[keep])
    keep_boxes = torch.where(max_conf >= test_score_thresh)[0]
    return keep_boxes, max_conf

In [78]:
temp = [select_boxes(cfg, output_boxes[i], scores[i]) for i in range(len(scores))]
keep_boxes, max_conf = [],[]
for keep_box, mx_conf in temp:
    keep_boxes.append(keep_box)
    max_conf.append(mx_conf)

In [79]:
MIN_BOXES=10
MAX_BOXES=100
def filter_boxes(keep_boxes, max_conf, min_boxes, max_boxes):
    if len(keep_boxes) < min_boxes:
        keep_boxes = np.argsort(max_conf).numpy()[::-1][:min_boxes]
    elif len(keep_boxes) > max_boxes:
        keep_boxes = np.argsort(max_conf).numpy()[::-1][:max_boxes]
    return keep_boxes

keep_boxes = [filter_boxes(keep_box, mx_conf, MIN_BOXES, MAX_BOXES) for keep_box, mx_conf in zip(keep_boxes, max_conf)]

In [80]:
def get_visual_embeds(box_features, keep_boxes):
    return box_features[keep_boxes.copy()]

#visual_embeds = [get_visual_embeds(box_feature, keep_box) for box_feature, keep_box in zip(box_features, keep_boxes)]
#print("and visual embeds, finally:",visual_embeds)

In [81]:
import os
from getpass import getpass
import urllib
import torch.nn.functional as F

In [8]:
import torch
import torchvision
import torch.utils.data as data
from torchvision.transforms import transforms
from transformers import BertTokenizer, VisualBertForVisualReasoning
from pycocotools.coco import COCO


# Define the data directory and annotation file paths
data_dir = 'train2017/'
ann_file_instances = 'annotations/instances_train2017.json'
ann_file_captions = 'annotations/captions_train2017.json'
ann_file_keypoints = COCO('annotations/person_keypoints_train2017.json')


ann_ids = ann_file_keypoints.getAnnIds()
anns = ann_file_keypoints.loadAnns(ann_ids)

coco_keypoints = []
for ann in anns:
    if ann['iscrowd'] == 1:
        continue
    kp = ann['keypoints']
    # Convert the keypoints to a list of (x, y, v) tuples
    coco_keypoints.append([(kp[i], kp[i+1], kp[i+2]) for i in range(0, len(kp), 3)])
    
# Load the COCO dataset using the COCODataset class
coco_instances = torchvision.datasets.CocoDetection(root=data_dir, annFile=ann_file_instances)
coco_captions = torchvision.datasets.CocoCaptions(root=data_dir, annFile=ann_file_captions)
#coco_keypoints = torchvision.datasets.CocoKeyPoints(root=data_dir, annFile=ann_file_keypoints)


transform = transforms.Compose([transforms.Resize((224, 224)),
                                transforms.ToTensor()])


class COCOVisualReasoningDataset(data.Dataset):
    def __init__(self, coco_instances, coco_captions, coco_keypoints, transform=None):
        self.coco_instances = coco_instances
        self.coco_captions = coco_captions
        self.coco_keypoints = coco_keypoints
        self.transform = transform
    
    def __getitem__(self, index):
        # Get the image and annotations for the given index
        img, target = self.coco_instances[index]
        caption = self.coco_captions[index][0]
        keypoints = self.coco_keypoints[index]
        
        ann_ids = ann_file_keypoints.getAnnIds(target[0]["image_id"])
        anns = ann_file_keypoints.loadAnns(ann_ids)

        #ann['category_id']

        target = {
            'boxes': [ann['bbox'] for ann in anns],
            'labels': [ann['category_id']-1 for ann in anns],
            'keypoints': [],
        }
        for ann in anns:
            if ann['iscrowd'] == 1:
                continue
            kp = ann['keypoints']
            target['keypoints'].append([(kp[i], kp[i+1], kp[i+2]) for i in range(0, len(kp), 3)])
            
        #caption = self.coco_caption.anns[self.caption_ids[index]]['caption']

        # Apply transformations
        if self.transform is not None:
            img = self.transform(img)


        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        boxes = torch.tensor(target['boxes'], dtype=torch.float32)
        labels = torch.tensor(target['labels'], dtype=torch.int64)
        #labels = torch.tensor(1).unsqueeze(0)
        keypoints = torch.tensor(target['keypoints'], dtype=torch.float32)
        #tokenized = texte.apply((lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=512, truncation=True)))
        caption = torch.tensor(tokenizer.encode(caption).ids, dtype=torch.int64)


        #return img, target, caption, keypoints
        return img, boxes, labels, keypoints, caption
    
    def __len__(self):
        return len(self.coco_instances)

# Define the training parameters
batch_size = 32
lr = 0.001
num_epochs = 10


dataset = COCOVisualReasoningDataset(coco_instances, coco_captions, coco_keypoints, transform=transform)
dataloader = data.DataLoader(dataset, batch_size=batch_size, shuffle=True)


model = VisualBertForVisualReasoning.from_pretrained('uclanlp/visualbert-nlvr2-coco-pre')
optimizer = torch.optim.Adam(model.parameters(), lr=lr)


criterion = torch.nn.CrossEntropyLoss()


for epoch in range(num_epochs):
    running_loss = 0.0
    
    for i, data in enumerate(dataloader):
        inputs, targets, captions, keypoints = data
        
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(inputs, captions=captions, object_features=targets, object_locations=keypoints)
        
        # Compute the loss
        loss = criterion(outputs, targets)
        
        # Backward pass and optimize
        loss.backward()
        optimizer.step()
        
        # Print statistics
        running_loss += loss.item()
        if i % 10 == 9:
            print(running_loss)
            print(loss.item)


loading annotations into memory...
Done (t=8.16s)
creating index...
index created!
loading annotations into memory...
Done (t=119.32s)
creating index...
index created!
loading annotations into memory...
Done (t=1.27s)
creating index...
index created!


Some weights of the model checkpoint at uclanlp/visualbert-nlvr2-coco-pre were not used when initializing VisualBertForVisualReasoning: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing VisualBertForVisualReasoning from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing VisualBertForVisualReasoning from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of VisualBertForVisualReasoning were 

anns: [{'segmentation': [[374.5, 374.31, 375.15, 372.95, 378.06, 372.5, 379.48, 373.28, 380.13, 375.54, 380.77, 378, 380.38, 380.72, 379.87, 382.91], [392.67, 375.28, 394.29, 374.05, 395.78, 373.02, 397.46, 371.72, 397.85, 370.88, 399.27, 371.47, 398.88, 373.15, 396.88, 374.57, 395, 375.22, 393.32, 375.61]], 'num_keypoints': 0, 'area': 45.7978, 'iscrowd': 0, 'keypoints': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'image_id': 191360, 'bbox': [374.5, 370.88, 24.77, 12.03], 'category_id': 1, 'id': 2030274}]


AttributeError: 'COCOVisualReasoningDataset' object has no attribute 'tokenizer'

In [84]:
from transformers import BertTokenizer, VisualBertForVisualReasoning
import torch


for k in range(5):
    
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = VisualBertForVisualReasoning.from_pretrained("uclanlp/visualbert-nlvr2-coco-pre")

    visual_embeds = [get_visual_embeds(box_feature, keep_box) for box_feature, keep_box in zip(box_features, keep_boxes)]

    visual_embeds = torch.stack(visual_embeds)

    for i in range(len(images)):

        text = "Is the dog is on under the bench?" #get text from captions

        inputs = tokenizer(text, return_tensors="pt")

        #torch ones
        visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
        visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)


        inputs.update(
            {
                "visual_embeds": visual_embeds[i:i+1],
                "visual_token_type_ids": visual_token_type_ids[i:i+1],
                "visual_attention_mask": visual_attention_mask[i:i+1],
            }
        )

        #no need to use unsqueeze for visual embeds because its already a list holding lists
        labels = torch.tensor(1).unsqueeze(0)  # Batch size 1, Num choices 2
        outputs = model(**inputs, labels=labels)
        #print("labels:",labels)
        loss = outputs.loss
        scores = outputs.logits
        out = F.softmax(scores,dim = 1)
        
        losses[loss]=out
    
#compare all losses
for key in sorted(losses):
    print (key, losses[key])
    
    

Some weights of the model checkpoint at uclanlp/visualbert-nlvr2-coco-pre were not used when initializing VisualBertForVisualReasoning: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing VisualBertForVisualReasoning from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing VisualBertForVisualReasoning from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of VisualBertForVisualReasoning were 

tensor(0.2294, grad_fn=<NllLossBackward>) tensor([[0.2050, 0.7950]], grad_fn=<SoftmaxBackward>)
tensor(0.5751, grad_fn=<NllLossBackward>) tensor([[0.4374, 0.5626]], grad_fn=<SoftmaxBackward>)
tensor(0.6456, grad_fn=<NllLossBackward>) tensor([[0.4756, 0.5244]], grad_fn=<SoftmaxBackward>)
tensor(0.6695, grad_fn=<NllLossBackward>) tensor([[0.4880, 0.5120]], grad_fn=<SoftmaxBackward>)
tensor(0.7918, grad_fn=<NllLossBackward>) tensor([[0.5470, 0.4530]], grad_fn=<SoftmaxBackward>)


In [22]:
"""
from transformers import AutoTokenizer, VisualBertForQuestionAnswering
import torch

visual_embeds = [get_visual_embeds(box_feature, keep_box) for box_feature, keep_box in zip(box_features, keep_boxes)]


tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = VisualBertForQuestionAnswering.from_pretrained("uclanlp/visualbert-vqa")

text = "Where is the monkey?"
inputs = tokenizer(text, return_tensors="pt")
#visual_embeds = get_visual_embeds(image).unsqueeze(0)
visual_embeds = torch.stack(visual_embeds)
visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)

inputs.update(
    {
        "visual_embeds": visual_embeds[0:1],
        "visual_token_type_ids": visual_token_type_ids[0:1],
        "visual_attention_mask": visual_attention_mask[0:1],
    }
)

labels = torch.tensor([[0.0, 1.0]]).unsqueeze(0)  # Batch size 1, Num choices 2
outputs = model(**inputs, labels=labels)
print(outputs)
#print(f"output: {out.shape}")
loss = outputs.loss
print(loss)
scores = outputs.logits
print(scores)
"""

'\nfrom transformers import AutoTokenizer, VisualBertForQuestionAnswering\nimport torch\n\nvisual_embeds = [get_visual_embeds(box_feature, keep_box) for box_feature, keep_box in zip(box_features, keep_boxes)]\n\n\ntokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")\nmodel = VisualBertForQuestionAnswering.from_pretrained("uclanlp/visualbert-vqa")\n\ntext = "Where is the monkey?"\ninputs = tokenizer(text, return_tensors="pt")\n#visual_embeds = get_visual_embeds(image).unsqueeze(0)\nvisual_embeds = torch.stack(visual_embeds)\nvisual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)\nvisual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)\n\ninputs.update(\n    {\n        "visual_embeds": visual_embeds[0:1],\n        "visual_token_type_ids": visual_token_type_ids[0:1],\n        "visual_attention_mask": visual_attention_mask[0:1],\n    }\n)\n\nlabels = torch.tensor([[0.0, 1.0]]).unsqueeze(0)  # Batch size 1, Num choices 2\noutputs = m

In [None]:
#print(outputs["instances"].pred_classes)
#print(outputs["instances"].pred_boxes)