In [1]:
!pip install opencv-python torch torchvision matplotlib segment-anything-py open-clip-torch

Collecting segment-anything-py
  Downloading segment_anything_py-1.0.1-py3-none-any.whl.metadata (11 kB)
Collecting open-clip-torch
  Downloading open_clip_torch-3.1.0-py3-none-any.whl.metadata (32 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft

In [2]:
import cv2
import csv
import torch
import numpy as np
from PIL import Image
from segment_anything import sam_model_registry, SamPredictor
from open_clip import create_model_and_transforms, get_tokenizer
import torchvision.transforms as T

# ------------------------------
# 0. Shared settings
# ------------------------------
# Pick the compute device once; every model and tensor below is moved to it.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Masks covering fewer pixels than this are skipped as too small.
MIN_MASK_AREA = 5000
output_video_path = "/kaggle/working/output.mp4"
output_csv_path = "/kaggle/working/video_logv3.csv"

# ------------------------------
# 1. Load SAM model
# ------------------------------
sam_checkpoint = "/kaggle/input/sam_vit_b/other/default/1/sam_vit_b.pth"
sam = sam_model_registry["vit_b"](checkpoint=sam_checkpoint)
sam.to(device)
predictor = SamPredictor(sam)

# ------------------------------
# 2. Load OpenCLIP model
# ------------------------------
model, _, preprocess = create_model_and_transforms("ViT-B-32", pretrained="openai")
model.eval()
model.to(device)
tokenizer = get_tokenizer("ViT-B-32")

# ------------------------------
# 3. Define candidate labels
# ------------------------------
candidate_labels = ["a dog", "a goat", "a panda", "a cat"]
text_tokens = tokenizer(candidate_labels).to(device)

# The text embeddings depend only on the label list, so encode and
# L2-normalise them once here instead of once per mask per frame.
with torch.no_grad():
    text_features = model.encode_text(text_tokens)
    text_features /= text_features.norm(dim=-1, keepdim=True)

# ------------------------------
# 4. Start video processing
# ------------------------------
video_path = "/kaggle/input/sam_vit_b/other/default/1/input_videov3.mp4"
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    # Fail loudly rather than reporting success after processing zero frames.
    raise OSError(f"Could not open input video: {video_path}")

width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))
if not out.isOpened():
    cap.release()
    raise OSError(f"Could not open output video for writing: {output_video_path}")

# ------------------------------
# 5. CSV logging + 6. Frame-by-frame loop
# ------------------------------
frame_id = 0
try:
    # The context manager closes the CSV even if a frame raises part-way through.
    with open(output_csv_path, "w", newline="") as csvfile:
        csvwriter = csv.writer(csvfile)
        # NOTE: "confidence" holds the raw cosine similarity between the crop
        # and the best label embedding; it is not a calibrated probability.
        csvwriter.writerow(["frame_id", "label", "confidence", "x", "y", "w", "h"])

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            frame_id += 1
            image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            predictor.set_image(image_rgb)

            # Prompt SAM with a box spanning the entire frame.
            H, W = image_rgb.shape[:2]
            input_box = np.array([0, 0, W, H])
            masks, _, _ = predictor.predict(box=input_box[None, :], multimask_output=True)

            # Number of masks in this frame that passed the area filter.
            animal_count = 0

            for mask in masks:
                area = np.sum(mask)
                if area < MIN_MASK_AREA:  # filter small segments
                    continue

                # Crop the frame to the mask's bounding box and classify the crop.
                x, y, w, h = cv2.boundingRect(mask.astype(np.uint8))
                cropped = image_rgb[y:y+h, x:x+w]
                pil_crop = Image.fromarray(cropped)
                image_input = preprocess(pil_crop).unsqueeze(0).to(device)

                with torch.no_grad():
                    image_features = model.encode_image(image_input)
                    image_features /= image_features.norm(dim=-1, keepdim=True)
                    similarity = (image_features @ text_features.T).squeeze(0)

                best_idx = similarity.argmax().item()
                best_label = candidate_labels[best_idx]
                best_score = similarity[best_idx].item()

                animal_count += 1

                # Draw on frame
                cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 255, 0), 2)
                label_text = f"{best_label} ({best_score:.2f})"
                # Clamp the text baseline so the label stays inside the frame
                # when the box touches the top edge (y - 10 would be negative).
                label_y = max(y - 10, 15)
                cv2.putText(frame, label_text, (x, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0,255,0), 2)

                # Log to CSV
                csvwriter.writerow([frame_id, best_label, round(best_score, 3), x, y, w, h])

            # Add animal count text to the frame
            cv2.putText(frame, f"Animals detected: {animal_count}", (20, 40),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

            out.write(frame)
            print(f"Processed frame {frame_id}")
finally:
    # ------------------------------
    # 7. Cleanup
    # ------------------------------
    # Always release the capture and finalise the output file, even on error.
    cap.release()
    out.release()

# Report the paths that were actually written.
print(f"✅ Video processing complete. Output saved to {output_video_path} and {output_csv_path}")


open_clip_model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]



Processed frame 1
Processed frame 2
Processed frame 3
Processed frame 4
Processed frame 5
Processed frame 6
Processed frame 7
Processed frame 8
Processed frame 9
Processed frame 10
Processed frame 11
Processed frame 12
Processed frame 13
Processed frame 14
Processed frame 15
Processed frame 16
Processed frame 17
Processed frame 18
Processed frame 19
Processed frame 20
Processed frame 21
Processed frame 22
Processed frame 23
Processed frame 24
Processed frame 25
Processed frame 26
Processed frame 27
Processed frame 28
Processed frame 29
Processed frame 30
Processed frame 31
Processed frame 32
Processed frame 33
Processed frame 34
Processed frame 35
Processed frame 36
Processed frame 37
Processed frame 38
Processed frame 39
Processed frame 40
Processed frame 41
Processed frame 42
Processed frame 43
Processed frame 44
Processed frame 45
Processed frame 46
Processed frame 47
Processed frame 48
Processed frame 49
Processed frame 50
Processed frame 51
Processed frame 52
Processed frame 53
Pr