In [1]:
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
import argparse
import glob
import json
import logging
import math
import os
import sys
import random
import re
import numpy as np
from typing import List, Optional
from PIL import Image, ImageFile

import torch
from torch import nn
import torch.backends.cudnn as cudnn
from torchvision import transforms as pth_transforms
from tqdm import tqdm, trange
import time
import torchvision.transforms as T

sys.path.append("../detectron2")
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.data.datasets import register_coco_instances
sys.path.append("../dinov2-main")
sys.path.append(".")
from utils.visualizer import ColorMode, Visualizer
from utils.instance_det_dataset import RealWorldDataset
from utils.inference_utils import compute_similarity, stableMatching, \
    get_object_proposal, getColor, create_instances, nms, apply_nms, get_features
from adapter import ModifiedClipAdapter, WeightAdapter
logger = logging.getLogger("dinov2")

In [2]:
def get_args_parser(
        description: Optional[str] = None,
        parents: Optional[List[argparse.ArgumentParser]] = [],
        add_help: bool = True,
):

    parents = []

    parser = argparse.ArgumentParser(
        description=description,
        parents=parents,
        add_help=add_help,
    )
    parser.add_argument(
        "--train_path",
        default="../database_mini/train",
        type=str,
        help="Path to train dataset.",
    )
    parser.add_argument(
        "--test_path",
        default="../database_mini/test",
        type=str,
        help="Path to test dataset.",
    )
    parser.add_argument(
        "--imsize",
        default=224,
        type=int,
        help="Image size",
    )

    parser.add_argument(
        "--output_dir",
        default="./output",
        type=str,
        help="Path to save outputs.")
    parser.add_argument("--num_workers", default=0, type=int, help="Number of data loading workers per GPU.")

    parser.add_argument(
        "--gather-on-cpu",
        action="store_true",
        help="Whether to gather the train features on cpu, slower"
             "but useful to avoid OOM for large datasets (e.g. ImageNet22k).",
    )

    parser.set_defaults(
        train_dataset="Object",
        test_dataset="Scene",
        batch_size=1,
        num_workers=0,
    )
    return parser

In [3]:
# In[8]:
# set the scene level here
scene_level = 'all'  # all / easy / hard
# Default args and initialize model
args_parser = get_args_parser(description="Grounded SAM-DINOv2 Instance Detection")
imsize = 448
tag = "mask"  # bbox
args = args_parser.parse_args(args=[
                                    "--train_path", "C:/dataset/InsDet-FULL/Objects",
                                    "--test_path", "C:/dataset/InsDet-FULL/Data/test_1_"+scene_level,  # test_002
                                    "--output_dir", "exps/eval_ffa_"+scene_level+"4_gdino0.15t_vitl14_reg_" + str(imsize) + "_" + tag,
                                    ])
os.makedirs(args.output_dir, exist_ok=True)

In [4]:
# encoder = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitl14_reg')
# encoder.to('cuda')
# encoder.eval()

use_adapter = False
adapter_type = "weight"
if use_adapter:
    input_features = 1024 #768, 1024, the vector dimension
    if adapter_type == "clip":
        # adapter_args = 'Ins_clip_ratio_0.6_temp_0.05_epoch_40_lr_0.0001_bs_1024_vec_reduction_4_L2e4_vitl_reg'
        adapter_args = 'Ins_0413__ratio_0.6_temp_0.05_epoch_40_lr_0.0001_bs_512_vec_reduction_4_L2e4_vitl_reg'
        model_path = 'adapter_weights/adapter2FC/' + adapter_args + '_weights.pth'
        adapter = ModifiedClipAdapter(input_features, reduction=4, ratio=0.6).to('cuda')
    elif adapter_type == "weight":
        adapter_args = 'Ins_weighted_10sigmoid_ratio_0.6_temp_0.05_epoch_40_lr_0.001_bs_1024_vec_reduction_4_L2e4_vitl_reg'
        model_path = 'adapter_weights/adapter2FC/' + adapter_args + '_weights.pth'
        adapter = WeightAdapter(input_features, reduction=4).to('cuda')


    # Load the weights
    adapter.load_state_dict(torch.load(model_path))

    # If you plan to only evaluate the model, switch to eval mode
    adapter.eval()

    print('Model weights loaded and model is set to evaluation mode.')

In [5]:
output_dir = 'D:/CODE/NIDS-Net/feats'
json_filename = 'object_features_featup_patch.json'
if use_adapter:
    output_dir = './adapted_obj_feats'
    json_filename = adapter_args+'.json'

with open(os.path.join(output_dir, json_filename), 'r') as f:
    feat_dict = json.load(f)

print(f"Loaded {os.path.join(output_dir, json_filename)}")
object_features = torch.Tensor(feat_dict['features']).cuda()
object_features = nn.functional.normalize(object_features, dim=1, p=2)
print("object_features: ", object_features.shape) # Shape (2400, 384)
do_matching = True

Loaded D:/CODE/NIDS-Net/feats\object_features_featup_patch.json
object_features:  torch.Size([2400, 384])


  object_features = torch.Tensor(feat_dict['features']).cuda()


In [6]:
# In[9]:

# transform = pth_transforms.Compose([pth_transforms.ToTensor(),])
# object_dataset = RealWorldDataset(args.train_path, args.train_dataset, transform=transform, imsize=args.imsize)

In [7]:
from absl import app, logging
from PIL import Image as PILImg
from robokit.ObjDetection import GroundingDINOObjectPredictor, SegmentAnythingPredictor

logging.info("Initialize object detectors")
gdino = GroundingDINOObjectPredictor(use_vitb=False, threshold=0.15)

from utils.inference_utils import get_foreground_mask

image_dir = []
proposals_list = []
scene_name_list = []
# source_list = sorted(glob.glob(os.path.join(args.test_path, '*')))
transform = pth_transforms.Compose([pth_transforms.ToTensor(),])
scene_features_list = []
source_dir = os.path.join(args.test_path, 'images')

image_paths = sorted([p for p in glob.glob(os.path.join(source_dir, '*'))
                      if re.search('/*\.(jpg|jpeg|png|gif|bmp|pbm)', str(p))])
image_dir.extend(image_paths)


final text_encoder_type: bert-base-uncased
Model loaded from C:\Users\divya\.cache\huggingface\hub\models--ShilongLiu--GroundingDINO\snapshots\a94c9b567a2a374598f05c584e96798a170c56fb\groundingdino_swint_ogc.pth 
 => _IncompatibleKeys(missing_keys=[], unexpected_keys=['label_enc.weight', 'bert.embeddings.position_ids'])


In [8]:
all_bboxes = []
text_prompt='objects'

for image_path in tqdm(image_paths):
    
    image_pil = PILImg.open(image_path).convert("RGB")
    scene_name = os.path.basename(image_path).split('.')[0]
    scene_name_list.append(scene_name)

    logging.info("GDINO: Predict bounding boxes, phrases, and confidence scores")
    with torch.no_grad():
        bboxes, phrases, gdino_conf = gdino.predict(image_pil, text_prompt)
        # logging.info("GDINO post processing")
        w, h = image_pil.size  # Get image width and height
        # Scale bounding boxes to match the original image size
        image_pil_bboxes = gdino.bbox_to_scaled_xyxy(bboxes, w, h)
        
        all_bboxes.append(image_pil_bboxes.detach().cpu())
        
        torch.cuda.empty_cache()

100%|██████████| 160/160 [02:00<00:00,  1.32it/s]


In [24]:
bbs = [x.tolist() for x in all_bboxes]

In [26]:
data = { 
    'bboxes': bbs
}
with open(os.path.join("D:/CODE/NIDS-Net/feats/gdino_scaled_bboxes_all.json"), "w") as f:
    json.dump(data, f)

In [27]:
with open(os.path.join("D:/CODE/NIDS-Net/feats/gdino_scaled_bboxes_all.json"), 'r') as f:
    gdino_bboxes = json.load(f)


In [29]:
print("gdino_bboxes: ", len(gdino_bboxes['bboxes']))
gdino_bboxes['bboxes'][62]

gdino_bboxes:  160


[[3264.148193359375, 2443.725830078125, 3493.134033203125, 3093.709228515625],
 [4688.62451171875, 1761.1258544921875, 4898.86669921875, 2109.93505859375],
 [2698.519775390625, 2580.19140625, 3125.491455078125, 2951.232421875],
 [6158.375, 2385.10009765625, 6723.3505859375, 2688.09814453125],
 [4349.1904296875, 1774.622314453125, 4553.83203125, 2124.205078125],
 [1955.602294921875, 2555.4833984375, 2495.739013671875, 2808.126953125],
 [3840.252197265625, 1877.4078369140625, 5017.3271484375, 2721.423095703125],
 [5376.2724609375, 1866.12939453125, 5482.8466796875, 2200.681640625],
 [4881.65771484375, 1830.8978271484375, 5290.85986328125, 2086.666748046875],
 [5906.4013671875, 2067.20361328125, 6094.470703125, 2259.49267578125],
 [2359.98828125, 2462.744873046875, 2663.0791015625, 2667.027099609375],
 [2729.56298828125, 2353.977783203125, 2945.7470703125, 2565.425537109375],
 [6247.95068359375, 2864.652587890625, 6884.43115234375, 3535.409423828125],
 [5078.0029296875, 1612.201171875, 79

In [30]:
del gdino
torch.cuda.empty_cache()

In [31]:
SAM = SegmentAnythingPredictor(vit_model="vit_h")

In [34]:
from PIL import Image as PILImg
from utils.img_utils import masks_to_bboxes
from robokit.utils import annotate, overlay_masks

all_crops = []

for i, image_path in enumerate(image_paths):

    crops_dict = dict() 

    image_pil = PILImg.open(image_path).convert("RGB")
    bboxes = np.array(gdino_bboxes['bboxes'][i])
    print(f"{i}. {bboxes.shape=}")
    
    with torch.no_grad():
      image_pil_bboxes, masks = SAM.predict(image_pil, bboxes)
    masks = masks.squeeze(1).detach().cpu()
    accurate_bboxs = masks_to_bboxes(masks)
    accurate_bboxs = torch.tensor(accurate_bboxs).detach().cpu()
    print(f"{i}. {masks.shape=} || {accurate_bboxs.shape=}")
    print("==========================================")
    rois, sel_rois, cropped_imgs, cropped_masks = get_object_proposal(image_path, accurate_bboxs, masks, tag=tag, ratio=0.25, save_rois=False, output_dir=args.output_dir)
    crops_dict["rois"] = rois
    crops_dict["sel_rois"] = sel_rois
    crops_dict["cropped_imgs"] = cropped_imgs
    crops_dict["cropped_masks"] = cropped_masks
    
    all_crops.append(crops_dict)
    torch.cuda.empty_cache()

0. bboxes.shape=(16, 4)
0. masks.shape=torch.Size([16, 6144, 8192]) || accurate_bboxs.shape=torch.Size([16, 4])
1. bboxes.shape=(15, 4)
1. masks.shape=torch.Size([15, 6144, 8192]) || accurate_bboxs.shape=torch.Size([15, 4])
2. bboxes.shape=(21, 4)
2. masks.shape=torch.Size([21, 6144, 8192]) || accurate_bboxs.shape=torch.Size([21, 4])
3. bboxes.shape=(22, 4)
3. masks.shape=torch.Size([22, 6144, 8192]) || accurate_bboxs.shape=torch.Size([22, 4])
4. bboxes.shape=(22, 4)
4. masks.shape=torch.Size([22, 6144, 8192]) || accurate_bboxs.shape=torch.Size([22, 4])
5. bboxes.shape=(21, 4)
5. masks.shape=torch.Size([21, 6144, 8192]) || accurate_bboxs.shape=torch.Size([21, 4])
6. bboxes.shape=(24, 4)
6. masks.shape=torch.Size([24, 6144, 8192]) || accurate_bboxs.shape=torch.Size([24, 4])
7. bboxes.shape=(22, 4)
7. masks.shape=torch.Size([22, 6144, 8192]) || accurate_bboxs.shape=torch.Size([22, 4])
8. bboxes.shape=(22, 4)
8. masks.shape=torch.Size([22, 6144, 8192]) || accurate_bboxs.shape=torch.Size([

OutOfMemoryError: CUDA out of memory. Tried to allocate 8.06 GiB. GPU 0 has a total capacity of 6.00 GiB of which 1.34 GiB is free. Of the allocated memory 2.68 GiB is allocated by PyTorch, and 939.10 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
all_crops

In [None]:
with open(os.path.join("rois_cropped_results.json"), "w") as f:
    json.dump(all_crops, f)

In [9]:
import torchvision
from featup.util import norm, unnorm, pca, remove_axes
def FFA_preprocess(x_list, img_size=336):

    preprocessed_images = []
    for x in x_list:
        # width, height = x.size
        new_width = img_size
        new_height = img_size

        def _to_rgb(x):
            if x.mode != "RGB":
                x = x.convert("RGB")
            return x

        # preprocessed_image = torchvision.transforms.Compose([
        #     _to_rgb,
        #     torchvision.transforms.Resize((new_height, new_width), interpolation=Image.BICUBIC),  # Image.BICUBIC / InterpolationMode.BICUBIC
        #     torchvision.transforms.ToTensor(),
        #     torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        # ])(x)

        preprocessed_image = torchvision.transforms.Compose([
            torchvision.transforms.Resize((img_size, img_size)),
            torchvision.transforms.CenterCrop((img_size, img_size)),
            torchvision.transforms.ToTensor(),
            norm
        ])(x)

        preprocessed_images.append(preprocessed_image)

    return torch.stack(preprocessed_images, dim=0)

Split the Tasks due to GPU crashs 

In [None]:
# Load Featup
upsampler = torch.hub.load("mhamilton723/FeatUp", 'dinov2').to('cuda').eval()

In [None]:
with open(os.path.join("rois_cropped_results.json"), 'r') as f:
    crops_dict = json.load(f)
print("crops_dict: ", crops_dict.shape) # Shape 

In [None]:
start_time = time.time()
for d in tqdm(crops_dict):

    rois = d["rois"]
    sel_rois = d["sel_rois"]
    cropped_imgs = d["cropped_imgs"]
    cropped_masks = d["cropped_masks"] 

    scene_features = []

    num_imgs = len(cropped_imgs)

    
    for i in range(num_imgs):
        img = cropped_imgs[i]
        mask = cropped_masks[i]
        # ffa_feature = get_features([img], [mask], encoder, img_size=imsize)
        with torch.no_grad:
            preprocessed_imgs = FFA_preprocess([img], img_size=imsize).to('cuda')
            masks = get_foreground_mask([mask], mask_size=imsize).to('cuda')  # Shape: (1, 1, H, W)
            hr_feats = upsampler(preprocessed_imgs)                   # Shape: (1, 384, 512, 512)
        hr_feats = torch.nn.functional.interpolate(hr_feats, masks.shape[2:], mode="bilinear") # Shape: (1, 384, H, W)
        hr_feats = hr_feats.permute(0,2,3,1) # Shape: (1, H, W, 384)
        ffa_feature = (hr_feats * masks.permute(0, 2, 3, 1)).sum(dim=(1, 2)) / masks.sum(dim=(1, 2, 3)).unsqueeze(-1) # Shape: (1, 384)
        
        # with torch.no_grad():
        #     if use_adapter:
        #         ffa_feature = adapter(ffa_feature)

        scene_features.append(ffa_feature)
        torch.cuda.empty_cache()
        
    scene_features = torch.cat(scene_features, dim=0)
    scene_features = nn.functional.normalize(scene_features, dim=1, p=2)

    scene_features_list.append(scene_features) 
    # total_proposals[scene_name] = sel_rois
    proposals_list.append(sel_rois)

In [1]:
num_object = 100 # number of instances in the dataset
num_example = len(object_features) // num_object

score_thresh_predefined = 0.6

results = []
    
for idx, scene_feature in enumerate(scene_features_list):
    sim_mat = compute_similarity(object_features, scene_feature)
    sim_mat = sim_mat.view(len(scene_feature), num_object, num_example)
    sims, _ = torch.max(sim_mat, dim=2)  # choose max score over profile examples of each object instance

    max_ins_sim, initial_result = torch.max(sims, dim=1)

    proposals = proposals_list[idx]
    num_proposals = len(proposals)

    ########################################## Stable Matching Strategy ##########################################

    if do_matching:
        # ------------ ranking and sorting ------------
        # Initialization
        sel_obj_ids = [str(v) for v in list(np.arange(num_object))]  # ids for selected obj
        sel_roi_ids = [str(v) for v in list(np.arange(len(scene_feature)))]  # ids for selected roi

        # Padding
        max_len = max(len(sel_roi_ids), len(sel_obj_ids))
        sel_sims_symmetric = torch.ones((max_len, max_len)) * -1
        sel_sims_symmetric[:len(sel_roi_ids), :len(sel_obj_ids)] = sims.clone()

        pad_len = abs(len(sel_roi_ids) - len(sel_obj_ids))
        if len(sel_roi_ids) > len(sel_obj_ids):
            pad_obj_ids = [str(i) for i in range(num_object, num_object + pad_len)]
            sel_obj_ids += pad_obj_ids
        elif len(sel_roi_ids) < len(sel_obj_ids):
            pad_roi_ids = [str(i) for i in range(len(sel_roi_ids), len(sel_roi_ids) + pad_len)]
            sel_roi_ids += pad_roi_ids

        # ------------ stable matching ------------
        matchedMat = stableMatching(
            sel_sims_symmetric.detach().data.cpu().numpy())  # predMat is raw predMat
        predMat_row = np.zeros_like(
            sel_sims_symmetric.detach().data.cpu().numpy())  # predMat_row is the result after stable matching
        Matches = dict()
        for i in range(matchedMat.shape[0]):
            tmp = matchedMat[i, :]
            a = tmp.argmax()
            predMat_row[i, a] = tmp[a]
            Matches[sel_roi_ids[i]] = sel_obj_ids[int(a)]
        # print("Done!")

        # ------------ thresholding ------------
        preds = Matches.copy()
        # for key, value in Matches.items():
        #     if sel_sims_symmetric[int(sel_roi_ids.index(key)), int(sel_obj_ids.index(value))] <= score_thresh_predefined:
        #         del preds[key]
        #         continue
        
        # ------------ save per scene results ------------

        for k, v in preds.items():
            if int(k) >= num_proposals:
                break
            # if float(sims[int(k), int(v)]) < score_thresh_predefined:
            #     continue
            result = dict()
            result['image_id'] = proposals[int(k)]['image_id']
            result['category_id'] = int(v)
            result['bbox'] = proposals[int(k)]['bbox']
            result['score'] = float(sims[int(k), int(v)])
            result['image_width'] = proposals[int(k)]['image_width']
            result['image_height'] = proposals[int(k)]['image_height']
            result['scale'] = proposals[int(k)]['scale']
            results.append(result)
    else:
        THRESHOLD_OBJECT_SCORE = 0.4
        for i in range(num_proposals):
            if float(max_ins_sim[i]) < THRESHOLD_OBJECT_SCORE:
                continue
            result = dict()
            result['image_id'] = proposals[i]['image_id']
            result['category_id'] = initial_result[i].item()
            result['bbox'] = proposals[i]['bbox']
            result['score'] = float(max_ins_sim[i])
            result['image_width'] = proposals[i]['image_width']
            result['image_height'] = proposals[i]['image_height']
            result['scale'] = proposals[i]['scale']
            results.append(result)

# Capture the end time
end_time = time.time()

# Calculate and print the total time
print(f"Total running time: {end_time - start_time} seconds")


NameError: name 'object_features' is not defined

In [None]:
results

In [2]:
# ### Save Results
# save final results
with open(os.path.join(args.output_dir, "coco_instances_results.json"), "w") as f:
    json.dump(results, f)

predictions = dict(
    [(k, {'image_id': -1, 'instances': []}) for k in range(len(scene_name_list))])
for idx in range(len(results)):
    id = results[idx]['image_id']
    predictions[scene_name_list.index('test_' + str(id).zfill(3))]['image_id'] = id

    predictions[scene_name_list.index('test_' + str(id).zfill(3))]['instances'].append(results[idx])

torch.save(predictions, os.path.join(args.output_dir, "instances_predictions.pth"))

print('Done!')

NameError: name 'os' is not defined

In [None]:
# Random custom colors with a fixed random seed
random.seed(77)
thing_colors = []
for i in range(100):
    thing_colors.append(getColor())

In [None]:

# Register Test Data for COCO evaluation
# test_path = "./test_data/test_4" # 1 for raw data, 2 for ratio=0.5, 4 for ratio=0.25, 8 for ratio=0.125  # test_4
# test_json = "./test_data/annotations/instances_test_4.json"  # instances_test_4
test_path = os.path.join(args.test_path, 'images')  # 1 for raw data, 2 for ratio=0.5, 4 for ratio=0.25, 8 for ratio=0.125
test_json = os.path.join(args.test_path, 'instances_test_4_'+scene_level+'.json')
register_coco_instances("coco_InsDet_test", {}, test_json, test_path)
MetadataCatalog.get("coco_InsDet_test").thing_colors = thing_colors

## evaluate the results using COCO API
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

# Load the ground truth COCO dataset
cocoGt = COCO(test_json)

# Load your detection results
cocoDt = cocoGt.loadRes(os.path.join(args.output_dir, "coco_instances_results.json"))

# Create a COCOeval object by initializing it with the ground truth and detection results
cocoEval = COCOeval(cocoGt, cocoDt, 'bbox')

# Run the evaluation
cocoEval.evaluate()
cocoEval.accumulate()
cocoEval.summarize()