In [1]:
from groundingdino.util.inference import load_model, load_image, predict, annotate
import cv2

# Build the model from its config file and checkpoint; both paths are relative
# to the current working directory.
model = load_model("groundingdino/config/GroundingDINO_SwinT_OGC.py", "weights/groundingdino_swint_ogc.pth")
IMAGE_PATH = ".asset/cat_dog.jpeg"
# Passed to predict() as `caption`. Looks like a " . "-separated list of
# phrases to detect -- TODO confirm the expected format against the library docs.
TEXT_PROMPT = "chair . person . dog ."
# NOTE(review): "TRESHOLD" is a misspelling of "THRESHOLD"; the names are left
# unchanged here because other cells may reference them.
BOX_TRESHOLD = 0.35
TEXT_TRESHOLD = 0.25

# load_image returns two values: `image_source` is later handed to annotate(),
# while `image` is what gets fed to the model in predict().
image_source, image = load_image(IMAGE_PATH)

# Run detection for the prompt; the two thresholds are forwarded as
# box_threshold / text_threshold.
boxes, logits, phrases = predict(
    model=model,
    image=image,
    caption=TEXT_PROMPT,
    box_threshold=BOX_TRESHOLD,
    text_threshold=TEXT_TRESHOLD
)

# Render the predictions onto the source image and write the result to disk.
# cv2.imwrite's return value is the last expression, so it is what the cell displays.
annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)
cv2.imwrite("annotated_image.jpg", annotated_frame)



final text_encoder_type: bert-base-uncased




True

In [2]:
# Dump the module tree so sub-module names are visible when picking
# name patterns to freeze.
print(model)

GroundingDINO(
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-5): 6 x DeformableTransformerEncoderLayer(
          (self_attn): MultiScaleDeformableAttention(
            (sampling_offsets): Linear(in_features=256, out_features=256, bias=True)
            (attention_weights): Linear(in_features=256, out_features=128, bias=True)
            (value_proj): Linear(in_features=256, out_features=256, bias=True)
            (output_proj): Linear(in_features=256, out_features=256, bias=True)
          )
          (dropout1): Dropout(p=0.0, inplace=False)
          (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (linear1): Linear(in_features=256, out_features=2048, bias=True)
          (dropout2): Dropout(p=0.0, inplace=False)
          (linear2): Linear(in_features=2048, out_features=256, bias=True)
          (dropout3): Dropout(p=0.0, inplace=False)
          (norm2): LayerNorm((256,), eps=1e-05, elementwise_aff

In [3]:
def freeze_layers(model, layers_to_freeze):
    """Disable gradients for every parameter whose name contains a given pattern.

    Matching is a plain substring test against the names yielded by
    ``model.named_parameters()``, so ``"bert"`` matches any parameter whose
    dotted name contains ``bert`` anywhere.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs.
        layers_to_freeze: A single substring, or any iterable of substrings
            (list, tuple, generator, ...).

    Returns:
        None. Matching parameters get ``requires_grad = False`` in place;
        non-matching parameters are left untouched.
    """
    if isinstance(layers_to_freeze, str):
        # A bare string would otherwise be iterated character by character,
        # freezing every parameter that merely shares a letter with it.
        patterns = (layers_to_freeze,)
    else:
        # Materialise once: a one-shot iterable (e.g. a generator) would be
        # exhausted while checking the first parameters and silently match
        # nothing afterwards.
        patterns = tuple(layers_to_freeze)

    for name, param in model.named_parameters():
        if any(pattern in name for pattern in patterns):
            param.requires_grad = False  # freeze

In [4]:
# Name fragments matched (as substrings) against parameter names by freeze_layers.
# Presumably these select the image backbone and the BERT text encoder -- verify
# against the printed module tree.
layers_to_freeze = ["backbone", "bert"]

# Mutates `model` in place: matching parameters get requires_grad = False.
freeze_layers(model, layers_to_freeze)

In [5]:
def print_trainable_params(model):
    """Print the model's parameters grouped by whether they require gradients.

    Walks ``model.named_parameters()``, lists each parameter's name and shape
    under a "trainable" or "frozen" heading, and finishes with the number of
    entries in each group (a count of parameter tensors, not of elements).

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs, where each ``param`` has
            ``requires_grad`` and ``shape`` attributes.

    Returns:
        None. Everything is written to standard output.
    """
    # Bucket (name, shape) pairs by the truthiness of requires_grad.
    grouped = {True: [], False: []}
    for param_name, tensor in model.named_parameters():
        grouped[bool(tensor.requires_grad)].append((param_name, tensor.shape))

    sections = (
        ("\n🔹 Trainable Parameters:", grouped[True]),
        ("\n🔸 Not Trainable (Frozen) Parameters:", grouped[False]),
    )
    for heading, entries in sections:
        print(heading)
        for param_name, shape in entries:
            print(f" - {param_name}: {shape}")

    print(f"\nTotal Trainable Parameters: {len(grouped[True])}")
    print(f"Total Frozen Parameters: {len(grouped[False])}")

In [6]:
# List which parameters are still trainable and which are frozen at this point
# in the notebook (after the freeze_layers call in the earlier cell).
print_trainable_params(model)


🔹 Trainable Parameters:
 - transformer.level_embed: torch.Size([4, 256])
 - transformer.encoder.layers.0.self_attn.sampling_offsets.weight: torch.Size([256, 256])
 - transformer.encoder.layers.0.self_attn.sampling_offsets.bias: torch.Size([256])
 - transformer.encoder.layers.0.self_attn.attention_weights.weight: torch.Size([128, 256])
 - transformer.encoder.layers.0.self_attn.attention_weights.bias: torch.Size([128])
 - transformer.encoder.layers.0.self_attn.value_proj.weight: torch.Size([256, 256])
 - transformer.encoder.layers.0.self_attn.value_proj.bias: torch.Size([256])
 - transformer.encoder.layers.0.self_attn.output_proj.weight: torch.Size([256, 256])
 - transformer.encoder.layers.0.self_attn.output_proj.bias: torch.Size([256])
 - transformer.encoder.layers.0.norm1.weight: torch.Size([256])
 - transformer.encoder.layers.0.norm1.bias: torch.Size([256])
 - transformer.encoder.layers.0.linear1.weight: torch.Size([2048, 256])
 - transformer.encoder.layers.0.linear1.bias: torch.Size