In [None]:
from google.colab import drive

# Mount Google Drive so the files under /content/drive (model checkpoint,
# input video, generated outputs) are reachable from this runtime.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
!pip install opencv-python torch torchvision matplotlib open-clip-torch

Collecting open-clip-torch
  Downloading open_clip_torch-3.0.0-py3-none-any.whl.metadata (32 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 k

In [None]:
!pip install segment-anything-py

Collecting segment-anything-py
  Downloading segment_anything_py-1.0.1-py3-none-any.whl.metadata (11 kB)
Downloading segment_anything_py-1.0.1-py3-none-any.whl (40 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/40.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: segment-anything-py
Successfully installed segment-anything-py-1.0.1


In [None]:
import cv2
import csv
import torch
import numpy as np
from PIL import Image
from segment_anything import sam_model_registry, SamPredictor
from open_clip import create_model_and_transforms, get_tokenizer
import torchvision.transforms as T

# ------------------------------
# 1. Load SAM model
# ------------------------------
# The "vit_b" Segment Anything variant is built from a checkpoint stored on
# Drive, placed on the GPU when one is available (CPU otherwise), and wrapped
# in a SamPredictor for prompt-based mask prediction.
sam_checkpoint = "/content/drive/MyDrive/wildlife/sam_vit_b.pth"
sam = sam_model_registry["vit_b"](checkpoint=sam_checkpoint)
if torch.cuda.is_available():
    sam.to("cuda")
else:
    sam.to("cpu")
predictor = SamPredictor(sam)

# ------------------------------
# 2. Load OpenCLIP model
# ------------------------------
# ViT-B-32 with the "openai" pretrained weights, switched to eval mode and
# placed on the GPU when one is available. Only the last returned transform
# (`preprocess`) is kept; the middle return value is not used in this notebook.
model, _, preprocess = create_model_and_transforms(
    "ViT-B-32",
    pretrained="openai",
)
model.eval()
if torch.cuda.is_available():
    model.cuda()
else:
    model.cpu()
tokenizer = get_tokenizer("ViT-B-32")

# ------------------------------
# 3. Define candidate labels
# ------------------------------
# Text prompts that every crop is scored against; the prompt with the highest
# similarity becomes the crop's label.
# NOTE(review): "a puma" and "a cougar" look like two names for the same
# animal and may split the score between them; confirm whether both are wanted.
candidate_labels = [
    "a puma",
    "a snake",
    "a coyote",
    "an iguana",
    "an ocelot",
    "a squirrel",
    "a bird",
    "an otter",
    "a coati",
    "a tamandua",
    "a lizard",
    "a bat",
    "a butterfly",
    "a tayra",
    "a monkey",
    "a cougar",
    "a paca",
    "a raccoon",
    "a skunk",
    "a dog",
    "an agouti",
    "a cat",
    "a mouse",
    "a turkey",
]

# Tokenise once, then move the tokens to the GPU when one is available.
text_tokens = tokenizer(candidate_labels)
if torch.cuda.is_available():
    text_tokens = text_tokens.cuda()

# ------------------------------
# 4. Start video processing
# ------------------------------
# Open the input video and create an mp4v writer with the same frame size and
# frame rate for the annotated output.
video_path = "/content/drive/MyDrive/wildlife/input_video.mp4"
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    # Without this check a missing or unreadable file only shows up later as
    # an empty output video and a header-only CSV.
    raise RuntimeError(f"Could not open input video: {video_path}")

width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)
if not fps > 0:
    # Some containers do not report a frame rate (the property reads as 0 or
    # NaN); a writer created with such a rate produces an unplayable file.
    fps = 30.0

fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter("/content/drive/MyDrive/wildlife/output_video.mp4", fourcc, fps, (width, height))
if not out.isOpened():
    cap.release()
    raise RuntimeError("Could not open output video for writing")

# ------------------------------
# 5. CSV logging
# ------------------------------
# One row is written per labelled detection. The handle is kept open for the
# whole run and must be closed once processing finishes.
csvfile = open(
    "/content/drive/MyDrive/wildlife/video_log.csv",
    mode="w",
    newline="",
)
csvwriter = csv.writer(csvfile)
# Header: frame index, predicted label, similarity score, bounding box.
csvwriter.writerow(("frame_id", "label", "confidence", "x", "y", "w", "h"))

# ------------------------------
# 6. Frame-by-frame loop
# ------------------------------
# For every frame: segment with SAM using a whole-frame box prompt, crop the
# bounding box of each sufficiently large mask, label the crop with CLIP,
# draw the result on the frame and log it to the CSV.
MIN_MASK_AREA = 5000       # masks covering fewer pixels than this are skipped
LOG_EVERY_N_FRAMES = 100   # progress print interval (avoids flooding the output)

# The label prompts never change, so their embeddings are computed once here
# instead of once per mask per frame.
clip_device = next(model.parameters()).device
with torch.no_grad():
    text_features = model.encode_text(text_tokens.to(clip_device))
    text_features /= text_features.norm(dim=-1, keepdim=True)

frame_id = 0
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        frame_id += 1
        image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # OpenCV decodes as BGR
        predictor.set_image(image_rgb)

        H, W = image_rgb.shape[:2]
        input_box = np.array([0, 0, W, H])  # entire image
        masks, _, _ = predictor.predict(box=input_box[None, :], multimask_output=True)

        for mask in masks:
            if mask.sum() < MIN_MASK_AREA:  # filter small segments
                continue

            x, y, w, h = cv2.boundingRect(mask.astype(np.uint8))
            pil_crop = Image.fromarray(image_rgb[y:y+h, x:x+w])
            image_input = preprocess(pil_crop).unsqueeze(0).to(clip_device)

            with torch.no_grad():
                image_features = model.encode_image(image_input)
                image_features /= image_features.norm(dim=-1, keepdim=True)
                similarity = (image_features @ text_features.T).squeeze(0)

            # The score is the raw cosine similarity between the normalised
            # image and text embeddings, not a softmax probability.
            best_idx = similarity.argmax().item()
            best_label = candidate_labels[best_idx]
            best_score = similarity[best_idx].item()

            # Draw on frame
            cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 255, 0), 2)
            label_text = f"{best_label} ({best_score:.2f})"
            # Clamp the baseline so the label stays visible when the box
            # touches the top edge of the frame.
            text_y = max(y - 10, 15)
            cv2.putText(frame, label_text, (x, text_y), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0,255,0), 2)

            # Log to CSV
            csvwriter.writerow([frame_id, best_label, round(best_score, 3), x, y, w, h])

        out.write(frame)
        if frame_id % LOG_EVERY_N_FRAMES == 0:
            print(f"Processed frame {frame_id}")
finally:
    # ------------------------------
    # 7. Cleanup
    # ------------------------------
    # Runs on errors and interrupts too, so the output video is finalised and
    # the CSV is flushed and closed.
    cap.release()
    out.release()
    csvfile.close()

print("✅ Video processing complete. Output saved to output_video.mp4 and video_log.csv")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Processed frame 2189
Processed frame 2190
Processed frame 2191
Processed frame 2192
Processed frame 2193
Processed frame 2194
Processed frame 2195
Processed frame 2196
Processed frame 2197
Processed frame 2198
Processed frame 2199
Processed frame 2200
Processed frame 2201
Processed frame 2202
Processed frame 2203
Processed frame 2204
Processed frame 2205
Processed frame 2206
Processed frame 2207
Processed frame 2208
Processed frame 2209
Processed frame 2210
Processed frame 2211
Processed frame 2212
Processed frame 2213
Processed frame 2214
Processed frame 2215
Processed frame 2216
Processed frame 2217
Processed frame 2218
Processed frame 2219
Processed frame 2220
Processed frame 2221
Processed frame 2222
Processed frame 2223
Processed frame 2224
Processed frame 2225
Processed frame 2226
Processed frame 2227
Processed frame 2228
Processed frame 2229
Processed frame 2230
Processed frame 2231
Processed frame 2232
Processed f