In [None]:
import cv2
import numpy as np
from PIL import Image
import time
import os
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration


def capture_multiple_images(num_frames=20, interval=2):
    """Grab a series of still frames from the default webcam.

    Opens camera index 0 and reads up to ``num_frames`` frames. Each frame is
    converted from OpenCV's BGR channel order to RGB and wrapped in a
    ``PIL.Image``. The function pauses ``interval`` seconds between two
    consecutive captures (but not after the last one).

    Args:
        num_frames: Maximum number of frames to capture.
        interval: Seconds to wait between consecutive captures.

    Returns:
        list: The captured PIL images in capture order. Empty if the webcam
        could not be opened; shorter than ``num_frames`` if a read failed or
        the capture was interrupted from the keyboard/kernel.
    """
    cap = cv2.VideoCapture(0)  # 0 is typically the default webcam

    if not cap.isOpened():
        print("Error: Could not open webcam.")
        cap.release()
        return []

    # No window is shown and no key is polled, so the only way to stop early
    # is to interrupt the kernel; the frames captured so far are still returned.
    print("Capturing images... Interrupt the kernel (Ctrl+C) to quit early.")
    captured_images = []

    try:
        while len(captured_images) < num_frames:
            ret, frame = cap.read()
            if not ret:
                print("Failed to capture frame")
                break

            # Convert BGR (OpenCV format) to RGB (PIL format)
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            captured_images.append(Image.fromarray(rgb_frame))

            print(f"Captured frame {len(captured_images)}/{num_frames}")

            # Only wait when another frame is still to come.
            if len(captured_images) < num_frames:
                time.sleep(interval)
    except KeyboardInterrupt:
        print(f"Capture interrupted after {len(captured_images)} frame(s).")
    finally:
        # Always free the camera, even if a read or conversion raised.
        cap.release()
        cv2.destroyAllWindows()

    return captured_images

In [None]:
# === 2. Automatically capture, resize, and save frames ===
# Captures a batch of webcam frames, resizes each one, keeps the resized
# OpenCV (BGR) arrays in `frames` for later cells, and writes a JPEG copy of
# each to `save_dir`.
frames = []
interval = 2  # seconds between frames
num_frames_to_capture = 20
resize_shape = (224, 224)  # target size handed to cv2.resize
save_dir = "captured_frames"

# Create directory if it doesn't exist
os.makedirs(save_dir, exist_ok=True)

print("\nStarting automatic frame capture from webcam...\n")

# Capture images
captured_images = capture_multiple_images(num_frames=num_frames_to_capture, interval=interval)

# The capture can return fewer frames than requested (camera unavailable or a
# failed read), so progress is reported against what was actually captured.
total_captured = len(captured_images)
if total_captured == 0:
    print("No frames were captured; nothing to process.")

# Process and save frames
for i, captured_image in enumerate(captured_images):
    print(f"Processing frame {i+1}/{total_captured}")

    # Convert to OpenCV format (BGR)
    frame = cv2.cvtColor(np.array(captured_image), cv2.COLOR_RGB2BGR)

    # Resize the frame
    frame_resized = cv2.resize(frame, resize_shape)

    # Store in list
    frames.append(frame_resized)

    # Save to disk; cv2.imwrite signals failure through its return value
    # rather than raising, so check it before claiming success.
    save_path = os.path.join(save_dir, f"frame_{i}.jpg")
    if cv2.imwrite(save_path, frame_resized):
        print(f"Saved: {save_path}")
    else:
        print(f"Warning: failed to write {save_path}")

if total_captured:
    print("\n✅ Finished capturing and saving frames!")

# Print summary
print(f"\nTotal Frames Captured: {len(frames)}")
for i, f in enumerate(frames):
    print(f"Frame {i} shape: {f.shape}")

In [None]:
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch

# The checkpoint below is a BLIP-2 model, which must be loaded with the
# Blip2* classes; the plain Blip* classes implement the original BLIP
# architecture and do not match this checkpoint's configuration or weights.
from transformers import Blip2ForConditionalGeneration, Blip2Processor

# Set the device (CUDA for GPU, CPU if no GPU is available)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"\nUsing device: {device}")

# Load the BLIP-2 processor and model (fetched from the Hugging Face Hub
# unless already present in the local cache)
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b"
).to(device)  # Move model to the appropriate device (GPU or CPU)

In [None]:
import torch
from PIL import Image
import cv2
from transformers import BlipProcessor, BlipForConditionalGeneration

# Caption every entry of `frames` (the list built by the capture cell), one
# image at a time. The resulting strings are collected in `captions`, in the
# same order as `frames`.

captions = []  # one generated caption per frame

print("\nMaking predictions on captured frames...\n")

for idx, frame in enumerate(frames):
    # Frames are stored in OpenCV's BGR order; switch back to RGB for PIL
    pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    # Turn the image into model-ready tensors on the selected device
    inputs = processor(images=pil_image, return_tensors="pt").to(device)

    # Generate token ids and decode them; a single image yields one sequence
    decoded = processor.batch_decode(
        model.generate(**inputs), skip_special_tokens=True
    )
    generated_text = decoded[0].strip()

    captions.append(generated_text)
    print(f"Prediction for Frame {idx}: {generated_text}")

print("\nAll done!")