### Cloning the GitHub repo of GroundingDINO and installing the required dependencies.

In [None]:
# Fetch the GroundingDINO sources, enter the checkout and install the
# package in editable mode from there.
!git clone https://github.com/IDEA-Research/GroundingDINO.git
%cd GroundingDINO/
!pip install -q -e .

### We now download the GroundingDino pre-trained model's weights

In [None]:
!mkdir weights
%cd weights
!wget -q https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth
%cd ..

### Cloning the github repo of SAM2 model and installing the required dependencies

In [None]:
# Leave the GroundingDINO checkout, clone SAM2 next to it, then install
# SAM2 in editable mode from inside its own checkout.
%cd ..
!git clone https://github.com/facebookresearch/segment-anything-2.git
%cd segment-anything-2
!pip install -q -e .

### We also need to download the pre-trained SAM2 model checkpoints.

In [None]:
%cd segment-anything-2
!mkdir -p checkpoints/
!wget -P checkpoints/ https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_large.pt
%cd ..

In [None]:
# NOTE(review): the checkpoint cell above already ends with `%cd ..`, so this
# moves one more level up; the cells below switch to absolute /content paths.
%cd ..

### Importing some important libraries

In [None]:
%cd /content/GroundingDINO
from groundingdino.util.inference import load_model, load_image, predict, annotate

In [None]:
%cd /content/segment-anything-2
from sam2.build_sam import build_sam2
from sam2.sam2_image_predictor import SAM2ImagePredictor

In [None]:
%cd /content
import cv2
import os
import torch
from torchvision.ops import box_convert
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import urllib.request
from torchvision import transforms

### We first explore the GroundingDINO method, which takes an image and a text prompt and outputs a list of bounding boxes that match the prompt.

In [None]:
# Build the pre-trained GroundingDINO model from its Swin-T config file and
# the checkpoint downloaded earlier (both paths are relative to the current
# working directory).
dino_config_path = "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py"
dino_weights_path = "GroundingDINO/weights/groundingdino_swint_ogc.pth"
model = load_model(dino_config_path, dino_weights_path)

### Requesting image from cocodataset

In [None]:
# Download one COCO val2017 image and store it locally as cats.jpg.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
urllib.request.urlretrieve(url, "cats.jpg")

### Loading image and transforming it into resized tensors suitable for box prediction

In [None]:
# Use GroundingDINO's own loader (imported above) instead of a hand-rolled
# Resize + ToTensor pipeline: the hand-rolled version feeds un-normalised
# pixel values to the model, whereas load_image applies the preprocessing
# the detector expects. This is also the loader segment_with_prompt() uses
# further down, so both code paths now see identically prepared inputs.
#
# load_image returns a pair:
#   image_source - the RGB picture as a NumPy array (used for plotting and
#                  for reading the image height/width),
#   image        - the tensor that is passed to predict().
image_source, image = load_image("cats.jpg")

In [None]:
# Text prompt naming the categories to look for; each phrase ends with ".".
TEXT_PROMPT = "cats. remote. "
# Passed to predict() as box_threshold.
BOX_TRESHOLD = 0.35
# Passed to predict() as text_threshold.
TEXT_TRESHOLD = 0.25

### Using predict() function that we imported earlier to predict the bounding boxes based on input prompt

In [None]:
# Run GroundingDINO on the image tensor with the text prompt, on the CPU.
# The three results are handed to annotate() below; the boxes are later
# scaled by the image width/height and converted from cxcywh to xyxy.
boxes, logits, phrases = predict(
    model=model,
    image=image,
    caption=TEXT_PROMPT,
    box_threshold=BOX_TRESHOLD,
    text_threshold=TEXT_TRESHOLD,
    device='cpu'
)

### We can use the annotate() function to create an image with the output bounding boxes and their respective category names and confidence scores.

In [None]:
# Draw the predicted boxes, scores and phrases onto the source image.
# The result is treated as BGR by the plotting cell that follows.
annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)

In [None]:
# Display the annotated detection result. The frame is converted from BGR
# to RGB first because matplotlib expects RGB channel order.
annotated_rgb = cv2.cvtColor(annotated_frame, cv2.COLOR_BGR2RGB)
plt.figure(figsize=(10, 10))
plt.imshow(annotated_rgb)
plt.axis('off')
plt.show()

### Before passing these boxes to our SAM2 model, we need to do some modifications to the format of the bounding boxes

In [None]:
# Image height and width in pixels (third axis is the colour channel).
h, w, _ = image_source.shape

# Step 1: scale the boxes by the image size, per coordinate (x-like values
# by the width, y-like values by the height).
pixel_scale = torch.Tensor([w, h, w, h])
boxes_unnorm = boxes * pixel_scale

# Step 2: switch from (centre-x, centre-y, width, height) to corner format
# (x0, y0, x1, y1) and hand the result over as a NumPy array.
boxes_xyxy = box_convert(boxes=boxes_unnorm, in_fmt="cxcywh", out_fmt="xyxy")
boxes_xyxy = boxes_xyxy.numpy()

In [None]:
# Work from inside the SAM2 checkout: the checkpoint path used in the next
# code cell ("checkpoints/...") is relative to this directory.
%cd /content/segment-anything-2

### Load the pre-trained SAM2 model. Here we choose the hiera_large model, so we specify the corresponding checkpoint that we downloaded earlier. We then create a SAM2 model and a predictor to predict the segments.

In [None]:
# Model configuration and checkpoint for the "hiera large" SAM 2.1 variant.
model_cfg = "configs/sam2.1/sam2.1_hiera_l.yaml"
sam2_checkpoint = "checkpoints/sam2.1_hiera_large.pt"

# Build the network on the CPU and wrap it in an image predictor, which is
# the object used below for set_image() / predict().
sam2_model = build_sam2(config_file=model_cfg, ckpt_path=sam2_checkpoint, device='cpu')
predictor = SAM2ImagePredictor(sam2_model)

### Define some helper functions responsible for plotting the results

In [None]:
# Fixed seed so the random overlay colours are reproducible between runs.
np.random.seed(3)


def show_mask(mask, ax, random_color = False, borders = True):
    """Draw a segmentation mask on *ax* as a semi-transparent RGBA overlay.

    Args:
        mask: Array whose last two axes are (height, width); it is cast to
            uint8 before use. It must hold exactly height * width elements.
        ax: Object providing ``imshow`` (e.g. a matplotlib axis).
        random_color: When true, use a random RGB colour; otherwise a fixed
            blue. The alpha value is 0.6 in both cases.
        borders: When true, also draw the outer contours of the mask onto
            the overlay using OpenCV.
    """
    if random_color:
        rgba = np.append(np.random.random(3), 0.6)
    else:
        rgba = np.array([30/255, 144/255, 255/255, 0.6])

    rows, cols = mask.shape[-2:]
    binary = mask.astype(np.uint8)
    # Broadcast the colour over every pixel; zero mask pixels stay fully
    # transparent because all four channels are multiplied by 0.
    overlay = binary.reshape(rows, cols, 1) * rgba[None, None, :]

    if borders:
        # Imported lazily so the function works without OpenCV when no
        # borders are requested.
        import cv2
        found, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
        # Try to smooth the contours before drawing them.
        smoothed = []
        for outline in found:
            smoothed.append(cv2.approxPolyDP(outline, epsilon=0.01, closed=True))
        overlay = cv2.drawContours(overlay, smoothed, -1, (1, 1, 1, 0.5), thickness=2)

    ax.imshow(overlay)

def show_box(box, ax):
    """Draw a box given as (x0, y0, x1, y1) on *ax* as a green outline.

    The rectangle has a fully transparent face and a line width of 2.
    """
    left, top, right, bottom = box[0], box[1], box[2], box[3]
    outline = plt.Rectangle(
        (left, top),
        right - left,
        bottom - top,
        edgecolor='green',
        facecolor=(0, 0, 0, 0),
        lw=2,
    )
    ax.add_patch(outline)

In [None]:
# Read the picture again as an RGB NumPy array (this rebinds `image`, which
# previously held the detector's input tensor) and register it with the
# SAM2 predictor.
image = np.array(Image.open('/content/cats.jpg').convert("RGB"))
predictor.set_image(image)

In [None]:
# Prompt SAM2 with the detected boxes only (no point prompts), with
# multi-mask output disabled. The third return value is not used.
masks, scores, _ = predictor.predict(
    point_coords=None,
    point_labels=None,
    box=boxes_xyxy,
    multimask_output=False,
)

In [None]:
# Show the image with every predicted mask and its prompting box on top,
# and save the figure to disk.
plt.figure(figsize=(10, 10))
plt.imshow(image)
ax = plt.gca()
for mask in masks:
    # Per-object masks may or may not carry a leading singleton axis
    # (segment_with_prompt() below handles both cases the same way); only
    # squeeze when the axis is present, otherwise squeeze(0) would raise.
    show_mask(mask.squeeze(0) if mask.ndim > 2 else mask, ax, random_color=True)
for box in boxes_xyxy:
    show_box(box, ax)
plt.axis('off')
plt.savefig("desk-segment.jpg")
plt.show()

# Putting GroundingDINO And SAM2 Together

In [None]:
# Reload both models. The paths are relative to the SAM2 checkout, which is
# the current directory at this point.
model = load_model("../GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py", "../GroundingDINO/weights/groundingdino_swint_ogc.pth")
sam2_model = build_sam2("configs/sam2.1/sam2.1_hiera_l.yaml", "checkpoints/sam2.1_hiera_large.pt", device='cpu')
# Rebuild the predictor as well: without this, `predictor` would still wrap
# the previously built SAM2 model and the freshly loaded `sam2_model` would
# never be used.
predictor = SAM2ImagePredictor(sam2_model)

## Defining a single function that takes an image path and a text prompt as input and shows the segmented regions in the image based on the prompt.

In [None]:
def segment_with_prompt(image_path, text_prompt, model = model, predictor = predictor, box_threshold = 0.35, text_threshold = 0.25, device = 'cpu'):
    """Detect objects described by a text prompt and segment them.

    GroundingDINO is run on the image to obtain boxes for ``text_prompt``;
    the boxes are then used as prompts for the SAM2 predictor. Two figures
    are shown: the annotated detections, and the image with the masks and
    boxes overlaid.

    Args:
        image_path: Path of the image file to process.
        text_prompt: Caption passed to GroundingDINO.
        model: GroundingDINO model. The default is the module-level
            ``model`` as it was bound when this function was defined.
        predictor: SAM2 image predictor. The default is the module-level
            ``predictor`` as it was bound when this function was defined.
        box_threshold: Forwarded to ``predict`` as ``box_threshold``.
        text_threshold: Forwarded to ``predict`` as ``text_threshold``.
        device: Device string forwarded to ``predict`` (default ``'cpu'``).

    Returns:
        The masks returned by ``predictor.predict``. When nothing is
        detected, an empty float32 array of shape ``(0, 1, height, width)``
        is returned instead of calling SAM2 with no boxes.
    """
    image_source, image = load_image(image_path)

    boxes, logits, phrases = predict(
        model=model,
        image=image,
        caption=text_prompt,
        box_threshold=box_threshold,
        text_threshold=text_threshold,
        device=device,
    )

    annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)

    plt.figure(figsize=(10, 10))
    plt.imshow(cv2.cvtColor(annotated_frame, cv2.COLOR_BGR2RGB))
    plt.axis('off')
    plt.show()

    h, w, _ = image_source.shape

    # Nothing detected: there is no box to prompt SAM2 with, so stop here
    # with an empty result that callers can iterate over or test with len().
    if len(boxes) == 0:
        print(f"No objects matching '{text_prompt}' were detected in '{image_path}'.")
        return np.zeros((0, 1, h, w), dtype=np.float32)

    # Scale the boxes to pixels and convert cxcywh -> xyxy for SAM2.
    boxes_unnorm = boxes * torch.Tensor([w, h, w, h])
    boxes_xyxy = box_convert(boxes=boxes_unnorm, in_fmt="cxcywh", out_fmt="xyxy").numpy()

    image = Image.open(image_path)
    image = np.array(image.convert("RGB"))
    predictor.set_image(image)

    masks, scores, _ = predictor.predict(
        point_coords=None,
        point_labels=None,
        box=boxes_xyxy,
        multimask_output=False,
    )

    plt.figure(figsize=(10, 10))
    plt.imshow(image)
    ax = plt.gca()
    for mask in masks:
        # Per-object masks may carry a leading singleton axis; drop it only
        # when present.
        show_mask(mask.squeeze(0) if mask.ndim > 2 else mask, ax, random_color=True)
    for box in boxes_xyxy:
        show_box(box, ax)
    plt.axis('off')
    plt.show()

    return masks

In [None]:
# Step back up from the SAM2 checkout to its parent directory; the relative
# paths used by the video cells below are resolved from there.
%cd ..

# Text-based Video Segmentation task

### Download and store video file in the current directory

In [None]:
import requests
import os

video_url = "https://www.pexels.com/download/video/3987730/"
output_filename = "Horse_running.mp4"

# `timeout` prevents the cell from hanging forever on a stalled connection,
# and the `with` block makes sure the streamed connection is closed again
# once the download finishes or fails.
with requests.get(video_url, stream=True, allow_redirects=True, timeout=60) as response:
    if response.status_code == 200:
        # Stream to disk in 1 MiB chunks instead of holding the whole video
        # in memory.
        with open(output_filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024*1024):
                if chunk:
                    f.write(chunk)
        print(f"✅ Video downloaded successfully and saved as '{output_filename}'")
    else:
        print(f"❌ Failed to download video. Status code: {response.status_code}")
        print("This can sometimes happen with temporary links. Please try again or check the URL.")

In [None]:
# Directory for the segmented frames; the path is relative to the current
# working directory and is created only if it does not exist yet.
output_dir = "segmented_frames"
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Directory for the raw frames extracted from the video; the path is
# relative to the current working directory.
frame_dir = "frames"
os.makedirs(frame_dir, exist_ok=True)

### Extracts all frames from a video file and saves them as individual image files in a specified directory.

In [None]:
# Split the video into individual JPEG frames named 00000.jpeg, 00001.jpeg, ...
cap = cv2.VideoCapture(output_filename)

# Fail early with a clear message: an unopened capture would otherwise just
# yield zero frames and the error would only surface in a later cell.
if not cap.isOpened():
    raise RuntimeError(f"Could not open video file '{output_filename}'")

frame_count = 0
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            # No more frames (or a read error): stop extracting.
            break
        frame_filename = os.path.join(frame_dir, f"{frame_count:05d}.jpeg")
        cv2.imwrite(frame_filename, frame)
        frame_count += 1
finally:
    # Always release the capture handle, even if writing a frame fails.
    cap.release()

### Loading another SAM2 model and GroundingDino model

In [None]:
%cd /content/segment-anything-2
sam2_model_2 = build_sam2("configs/sam2.1/sam2.1_hiera_l.yaml", "checkpoints/sam2.1_hiera_large.pt", device='cpu')
model_2 = load_model("../GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py", "../GroundingDINO/weights/groundingdino_swint_ogc.pth")
predictor_2 = SAM2ImagePredictor(sam2_model_2)

In [None]:
def segment_with_prompt_2(image_path, text_prompt, model = model_2, predictor = predictor_2, box_threshold = 0.35, text_threshold = 0.25, device = 'cpu'):
    """Detect objects described by a text prompt and segment them, without plotting.

    GroundingDINO is run on the image to obtain boxes for ``text_prompt``;
    the boxes are then used as prompts for the SAM2 predictor.

    Args:
        image_path: Path of the image file to process.
        text_prompt: Caption passed to GroundingDINO.
        model: GroundingDINO model. The default is the module-level
            ``model_2`` as it was bound when this function was defined.
        predictor: SAM2 image predictor. The default is the module-level
            ``predictor_2`` as it was bound when this function was defined.
        box_threshold: Forwarded to ``predict`` as ``box_threshold``.
        text_threshold: Forwarded to ``predict`` as ``text_threshold``.
        device: Device string forwarded to ``predict`` (default ``'cpu'``).

    Returns:
        The masks returned by ``predictor.predict``. When nothing is
        detected, an empty float32 array of shape ``(0, 1, height, width)``
        is returned instead of calling SAM2 with no boxes.
    """
    image_source, image = load_image(image_path)

    boxes, logits, phrases = predict(
        model=model,
        image=image,
        caption=text_prompt,
        box_threshold=box_threshold,
        text_threshold=text_threshold,
        device=device,
    )
    h, w, _ = image_source.shape

    # Nothing detected: there is no box to prompt SAM2 with, so return an
    # empty result that callers can check with len().
    if len(boxes) == 0:
        return np.zeros((0, 1, h, w), dtype=np.float32)

    # Scale the boxes to pixels and convert cxcywh -> xyxy for SAM2.
    boxes_unnorm = boxes * torch.Tensor([w, h, w, h])
    boxes_xyxy = box_convert(boxes=boxes_unnorm, in_fmt="cxcywh", out_fmt="xyxy").numpy()

    image = Image.open(image_path)
    image = np.array(image.convert("RGB"))
    predictor.set_image(image)

    masks, scores, _ = predictor.predict(
        point_coords=None,
        point_labels=None,
        box=boxes_xyxy,
        multimask_output=False,
    )

    return masks

### Segments the first frame of the video using a text prompt and SAM 2, returning the mask for the specified object to use as a reference for mask propagation.

In [None]:
first_frame_path = os.path.join('/content/frames', "00000.jpeg")
prompt = "horse"

# Use the predictor that was built together with model_2 for this task, so
# the pair of models loaded for the video section is actually the one used.
masks = segment_with_prompt_2(first_frame_path, prompt, model = model_2, predictor = predictor_2)

# Give a clear error instead of an IndexError when nothing was segmented.
if len(masks) == 0:
    raise RuntimeError(f"No '{prompt}' was detected in '{first_frame_path}'; cannot build the reference mask.")

base_mask = masks[0]
# Keep the reference mask three-dimensional, (1, height, width), whether or
# not the per-object mask came with a leading axis: code that indexes it as
# np.where(base_mask)[1:] relies on that leading axis being present.
if base_mask.ndim == 2:
    base_mask = base_mask[None]

In [None]:
# Inspect the dimensions of the reference mask.
base_mask.shape

### Propagates the first frame’s object mask to subsequent frames, overlays it in red, and saves each frame as an image.

In [None]:
frames_root = "/content/frames"
segmented_root = "/content/segmented_frames"
max_frames = 200  # process at most this many frames

# Make sure the destination exists; cv2.imwrite does not create directories.
os.makedirs(segmented_root, exist_ok=True)

# Iterate only over frames that really exist, so clips shorter than
# max_frames do not end in a FileNotFoundError.
frame_names = sorted(name for name in os.listdir(frames_root) if name.endswith(".jpeg"))[:max_frames]

# Work on a 2-D (height, width) view of the reference mask, whether or not
# it carries a leading singleton axis.
tracked_mask = np.asarray(base_mask).reshape(base_mask.shape[-2:])

for frame_name in frame_names:
    frame_path = os.path.join(frames_root, frame_name)
    image_np = np.array(Image.open(frame_path).convert("RGB"))

    # Bounding box of the most recent non-empty mask; it is the box prompt
    # for the current frame.
    y_indices, x_indices = np.nonzero(tracked_mask)
    if len(x_indices) == 0:
        continue
    box_xyxy = np.array([[x_indices.min(), y_indices.min(), x_indices.max(), y_indices.max()]])

    predictor.set_image(image_np)
    masks, _, _ = predictor.predict(
        point_coords=None,
        point_labels=None,
        box=box_xyxy,
        multimask_output=False,
    )
    frame_mask = masks[0].reshape(masks[0].shape[-2:])

    # Blend a red layer (channel 0 of the RGB image) over the frame.
    mask_rgb = np.zeros_like(image_np)
    mask_rgb[:, :, 0] = frame_mask * 255
    alpha = 0.5
    overlayed = cv2.addWeighted(image_np, 1, mask_rgb, alpha, 0)

    # Keep the frame's own number in the output name (00012.jpeg -> 00012.png).
    out_name = os.path.splitext(frame_name)[0] + ".png"
    out_path = os.path.join(segmented_root, out_name)
    cv2.imwrite(out_path, cv2.cvtColor(overlayed, cv2.COLOR_RGB2BGR))

    # Carry the mask forward so the next frame's box follows the object.
    # An empty prediction is ignored, which keeps the last good box.
    if frame_mask.any():
        tracked_mask = frame_mask

### Combines all segmented frame images into a single MP4 video at 30 fps.

In [None]:
frame_dir = "/content/segmented_frames"
output_video = "/content/segmented_video.mp4"

# Zero-padded file names mean lexicographic order equals frame order.
frame_files = sorted(f for f in os.listdir(frame_dir) if f.endswith(".png"))
if not frame_files:
    raise RuntimeError(f"No .png frames found in '{frame_dir}'; run the segmentation step first.")

# Read the first frame to get dimensions
first_frame = cv2.imread(os.path.join(frame_dir, frame_files[0]))
if first_frame is None:
    raise RuntimeError(f"Could not read '{frame_files[0]}' from '{frame_dir}'.")
height, width = first_frame.shape[:2]

# Define video writer (30 fps, adjust if needed)
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_video, fourcc, 30, (width, height))
if not out.isOpened():
    raise RuntimeError(f"Could not open '{output_video}' for writing.")

# Write all frames into video; the writer is released even if a frame fails.
try:
    for file in frame_files:
        frame = cv2.imread(os.path.join(frame_dir, file))
        if frame is None:
            # cv2.imread returns None for unreadable files; skip them rather
            # than passing None to the writer.
            print("Skipping unreadable frame:", file)
            continue
        out.write(frame)
finally:
    out.release()

print("Segmented video saved as", output_video)