In [1]:
from typing import Optional

import cv2
import torch
import torchvision
from groundingdino.util.inference import annotate, batch_predict, load_image, load_model
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

class ImageFolderWithFilenames(datasets.ImageFolder):
    """ImageFolder variant whose items also carry the path of the source file.

    Each item is whatever the parent ``__getitem__`` returns (an
    ``(image, label)`` tuple) with the image's path appended as a final element.
    """

    def __getitem__(self, index):
        """Return the parent's item for ``index`` extended with the file path."""
        item = super().__getitem__(index)
        # Each entry of self.samples starts with the file path; the second
        # element is not needed here.
        return (*item, self.samples[index][0])


def create_dataloader(
    data_dir: str,
    batch_size: int = 8,
    shuffle: bool = False,
    num_workers: int = 4,
    transform: Optional[torchvision.transforms.Compose] = None,
) -> DataLoader:
    """
    Build a PyTorch DataLoader over a directory of images grouped in subdirectories.

    The underlying dataset is an ``ImageFolderWithFilenames``, so every sample
    carries its file path in addition to the image and label.

    Parameters:
    - data_dir (str): Root directory that contains the image subdirectories.
    - batch_size (int): Number of samples in each batch.
    - shuffle (bool): Whether the loader shuffles the dataset.
    - num_workers (int): Number of subprocesses used for loading.
    - transform (torchvision.transforms.Compose): Transformations forwarded to
      the dataset and applied to the images.

    Returns:
    - DataLoader: Loader wrapping the dataset with the options above.
    """
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "num_workers": num_workers,
    }
    image_dataset = ImageFolderWithFilenames(root=data_dir, transform=transform)
    return DataLoader(image_dataset, **loader_options)



In [2]:
import torch
from torch.profiler import ProfilerActivity, record_function, profile
from groundingdino.util.inference import load_model, load_image, predict, annotate, batch_predict
import tqdm
import jsonlines

# --- Detection settings -------------------------------------------------------
# Text prompt and score thresholds handed to batch_predict further down.
TEXT_PROMPT = "flying object"
BOX_THRESHOLD = 0.35
TEXT_THRESHOLD = 0.25
device = "cuda"

# --- Image preprocessing ------------------------------------------------------
# Per-channel mean / std given to Normalize (these are the values commonly
# quoted as the ImageNet statistics).
NORMALIZE_MEAN = [0.485, 0.456, 0.406]
NORMALIZE_STD = [0.229, 0.224, 0.225]

# Every image is resized to a fixed size, converted to a tensor and normalised.
DEFAULT_TRANSFORM = transforms.Compose(
    [
        transforms.Resize([800, 1000]),
        transforms.ToTensor(),
        transforms.Normalize(NORMALIZE_MEAN, NORMALIZE_STD),
    ]
)

# --- Model --------------------------------------------------------------------
# Paths are relative to the notebook's working directory.
MODEL_CONFIG_PATH = "../GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py"
MODEL_CHECKPOINT_PATH = "../GroundingDINO/groundingdino_swint_ogc.pth"
model = load_model(MODEL_CONFIG_PATH, MODEL_CHECKPOINT_PATH).to(device)

# --- Data ---------------------------------------------------------------------
# create_dataloader is called with its default batch size, worker count and
# shuffle setting; only the directory and the transform are overridden.
image_data_dir = "output_frames"
dataloader = create_dataloader(data_dir=image_data_dir, transform=DEFAULT_TRANSFORM)

# Run detection batch by batch and stream the results to 'output.jsonl'.
# One record is written per detected box. Every image in a batch that received
# no box gets a single record with empty 'boxes' and 'logits', so each image
# appears in the output at least once.
with jsonlines.open('output.jsonl', mode='w') as writer:
    for batch in tqdm.tqdm(dataloader):
        # A batch is (images, labels, paths) as produced by ImageFolderWithFilenames.
        images, paths = batch[0], batch[2]

        boxes, logits, boxes_to_im = batch_predict(
            model=model,
            preprocessed_images=images,
            caption=TEXT_PROMPT,
            box_threshold=BOX_THRESHOLD,
            text_threshold=TEXT_THRESHOLD,
            device=device,
        )

        numbers_seen = set()
        # boxes_to_im[k] is used as the in-batch index of the image that box k
        # belongs to. A distinct loop variable is used so it cannot be confused
        # with any outer counter.
        for box_idx, im_num in enumerate(boxes_to_im):
            # Normalise to a plain int. batch_predict's return types are not
            # visible here; if it yields tensor scalars, those hash by object
            # identity and would never match the ints in range() below, which
            # would make every image also look "not seen".
            im_num = int(im_num)
            numbers_seen.add(im_num)

            data = {
                'image_name': paths[im_num],
                'boxes': boxes[box_idx].tolist(),
                'logits': logits[box_idx].tolist(),  # Convert to list for JSON serialization
            }
            writer.write(data)

        # Images of this batch without any detection; sorted so the output
        # order is deterministic.
        numbers_not_seen = set(range(len(images))) - numbers_seen
        for num in sorted(numbers_not_seen):
            data = {
                'image_name': paths[num],
                'boxes': [],
                'logits': [],
            }
            writer.write(data)



final text_encoder_type: bert-base-uncased


100%|██████████| 148/148 [03:27<00:00,  1.40s/it]


In [7]:
# Display the dataset's mapping from class name (here the V_DRONE_* folder
# names) to the integer label index.
dataloader.dataset.class_to_idx

{'V_DRONE_001': 0,
 'V_DRONE_002': 1,
 'V_DRONE_003': 2,
 'V_DRONE_004': 3,
 'V_DRONE_005': 4,
 'V_DRONE_006': 5,
 'V_DRONE_007': 6,
 'V_DRONE_008': 7,
 'V_DRONE_009': 8,
 'V_DRONE_010': 9,
 'V_DRONE_011': 10,
 'V_DRONE_012': 11,
 'V_DRONE_013': 12,
 'V_DRONE_014': 13,
 'V_DRONE_015': 14,
 'V_DRONE_016': 15,
 'V_DRONE_017': 16,
 'V_DRONE_018': 17,
 'V_DRONE_019': 18,
 'V_DRONE_020': 19,
 'V_DRONE_021': 20,
 'V_DRONE_022': 21,
 'V_DRONE_023': 22,
 'V_DRONE_024': 23,
 'V_DRONE_025': 24,
 'V_DRONE_026': 25,
 'V_DRONE_027': 26,
 'V_DRONE_028': 27,
 'V_DRONE_029': 28,
 'V_DRONE_030': 29,
 'V_DRONE_031': 30,
 'V_DRONE_032': 31,
 'V_DRONE_033': 32,
 'V_DRONE_034': 33,
 'V_DRONE_035': 34,
 'V_DRONE_036': 35,
 'V_DRONE_037': 36,
 'V_DRONE_038': 37,
 'V_DRONE_039': 38,
 'V_DRONE_040': 39,
 'V_DRONE_041': 40,
 'V_DRONE_042': 41,
 'V_DRONE_043': 42,
 'V_DRONE_044': 43,
 'V_DRONE_045': 44,
 'V_DRONE_046': 45,
 'V_DRONE_047': 46,
 'V_DRONE_048': 47,
 'V_DRONE_049': 48,
 'V_DRONE_050': 49,
 'V_DRONE_

In [None]:
from groundingdino.util.inference import load_model, load_image, predict, annotate, batch_predict
import tqdm
# Prompt and thresholds for this run. The misspelled constant names are kept
# as-is because other cells may refer to them.
TEXT_PROMPT = "flying object"
BOX_TRESHOLD, TEXT_TRESHOLD = 0.35, 0.25

# Reload the model and move it to the "mps" device (replaces any earlier `model`).
model = load_model(
    "../GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py",
    "groundingdino_swint_ogc.pth",
).to("mps")

# Keyword arguments that are identical for every batch.
predict_kwargs = dict(
    model=model,
    caption=TEXT_PROMPT,
    box_threshold=BOX_TRESHOLD,
    text_threshold=TEXT_TRESHOLD,
    device="mps",
)

for batch in tqdm.tqdm(dataloader):
    # batch[0] holds the preprocessed images. The results are overwritten on
    # every iteration, so only the last batch's outputs remain afterwards.
    boxes, logits, boxes_to_im = batch_predict(
        preprocessed_images=batch[0], **predict_kwargs
    )

In [None]:
# Inspect the last batch left over from the loop above: a collated
# (images, labels, file paths) triple from ImageFolderWithFilenames.
batch