In [None]:
import cv2
import numpy as np
from PIL import Image
import time
import os
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
!pip install transformers accelerate

# Function to capture frames from the webcam
def capture_multiple_images(num_frames=20, interval=2):
    """Capture a sequence of still frames from the default webcam.

    Args:
        num_frames: Maximum number of frames to grab.
        interval: Seconds to wait between two consecutive captures.

    Returns:
        A list of RGB ``PIL.Image`` objects, one per frame that was read
        successfully. The list is empty if the webcam cannot be opened and
        shorter than ``num_frames`` if a read fails part-way through.
    """
    # Device index 0 is typically the default webcam.
    cap = cv2.VideoCapture(0)

    if not cap.isOpened():
        print("Error: Could not open webcam.")
        return []

    frames = []
    try:
        for i in range(num_frames):
            # Pause *between* captures only, so no time is wasted waiting
            # after the final frame has already been grabbed.
            if i > 0:
                time.sleep(interval)

            ret, frame = cap.read()
            if not ret:
                print(f"Error: Failed to capture frame {i+1}")
                break

            # OpenCV delivers BGR; convert to RGB before wrapping in PIL.
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(Image.fromarray(frame_rgb))
    finally:
        # Always hand the device back, even if conversion raised, so the
        # webcam is not left locked for the rest of the kernel session.
        cap.release()
        cv2.destroyAllWindows()

    return frames

# Demo run: grab 20 webcam frames, pausing 2 seconds between shots,
# then report how many frames actually came back.
captured_images = capture_multiple_images(
    num_frames=20,
    interval=2,
)
print(f"Captured {len(captured_images)} images.")

In [None]:
import cv2
import numpy as np
import os
from PIL import Image

# Function to capture multiple images from the webcam
def capture_multiple_images(num_frames=10, interval=2):
    """Capture a sequence of still frames from the default webcam.

    Args:
        num_frames: Maximum number of frames to grab.
        interval: Seconds to wait between two consecutive captures.

    Returns:
        A list of RGB ``PIL.Image`` objects, one per frame that was read
        successfully. The list is empty if the webcam cannot be opened and
        shorter than ``num_frames`` if a read fails part-way through.
    """
    # Imported here because this cell's import list does not include `time`;
    # without it the function fails with NameError on a fresh kernel.
    import time

    # Device index 0 is typically the default webcam.
    cap = cv2.VideoCapture(0)

    if not cap.isOpened():
        print("Error: Could not open webcam.")
        return []

    frames = []
    try:
        for i in range(num_frames):
            # Pause *between* captures only, so no time is wasted waiting
            # after the final frame has already been grabbed.
            if i > 0:
                time.sleep(interval)

            ret, frame = cap.read()
            if not ret:
                print(f"Error: Failed to capture frame {i+1}")
                break

            # OpenCV delivers BGR; convert to RGB before wrapping in PIL.
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(Image.fromarray(frame_rgb))
    finally:
        # Always hand the device back, even if conversion raised, so the
        # webcam is not left locked for the rest of the kernel session.
        cap.release()
        cv2.destroyAllWindows()

    return frames

# Configuration for capturing frames
num_frames_to_capture = 10
interval = 2  # seconds between frames
resize_shape = (224, 224)  # (width, height) target passed to cv2.resize
save_dir = "captured_frames"

# Create folder if it doesn't exist
os.makedirs(save_dir, exist_ok=True)

print("Starting automatic frame capture from webcam...\n")

captured_images = capture_multiple_images(num_frames=num_frames_to_capture, interval=interval)

# The camera may deliver fewer frames than requested (device not opened or a
# failed read), so progress is reported against what was actually captured.
num_captured = len(captured_images)

# Process and save captured frames; `frames` keeps the resized BGR arrays
# in memory for later cells.
frames = []
for i, captured_image in enumerate(captured_images):
    print(f"Processing frame {i+1}/{num_captured}")

    # Convert the PIL RGB image back to an OpenCV BGR array
    frame = cv2.cvtColor(np.array(captured_image), cv2.COLOR_RGB2BGR)

    # Resize the frame
    frame_resized = cv2.resize(frame, resize_shape)

    frames.append(frame_resized)

    # Save frame to file. cv2.imwrite signals failure through its boolean
    # return value rather than an exception, so check it before claiming
    # the file was written.
    save_path = os.path.join(save_dir, f"frame_{i}.jpg")
    if cv2.imwrite(save_path, frame_resized):
        print(f"Saved: {save_path}")
    else:
        print(f"Warning: failed to save {save_path}")

print("\nFinished capturing and saving frames!")

# Show array content summary
print(f"\nTotal Frames Captured: {len(frames)}")
for i, f in enumerate(frames):
    print(f"Frame {i} shape: {f.shape}")

In [None]:
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch

# "Salesforce/blip2-opt-2.7b" is a BLIP-2 checkpoint, so it must be loaded
# with the Blip2* classes; the BLIP-1 classes (BlipProcessor /
# BlipForConditionalGeneration) do not match its architecture or tokenizer.
from transformers import Blip2ForConditionalGeneration, Blip2Processor

# Set the device (CUDA for GPU, CPU if no GPU is available)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"\nUsing device: {device}")

# Single source of truth for the checkpoint so the processor and the model
# can never drift apart.
blip2_checkpoint = "Salesforce/blip2-opt-2.7b"

# Load the BLIP-2 processor and model from the checkpoint
processor = Blip2Processor.from_pretrained(blip2_checkpoint)
model = Blip2ForConditionalGeneration.from_pretrained(
    blip2_checkpoint
).to(device)  # Move model to the appropriate device (GPU or CPU)

In [None]:
import torch
from PIL import Image
import cv2
from transformers import BlipProcessor, BlipForConditionalGeneration

# Caption every frame produced by the capture step.
# Inputs:  `frames` (list of BGR arrays), plus `processor`, `model` and
#          `device` from the model-loading cell.
# Output:  `captions`, one generated string per frame, in frame order.

captions = []

print("\nMaking predictions on captured frames...\n")

for idx, frame in enumerate(frames):
    # The model pipeline works on PIL RGB images, the frames are OpenCV BGR
    pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    # Turn the image into model-ready tensors on the selected device
    inputs = processor(images=pil_image, return_tensors="pt").to(device)

    # Run generation, then decode the first (only) sequence of the batch
    generated_ids = model.generate(**inputs)
    decoded_batch = processor.batch_decode(generated_ids, skip_special_tokens=True)
    generated_text = decoded_batch[0].strip()

    captions.append(generated_text)
    print(f"Prediction for Frame {idx}: {generated_text}")

print("\nAll done!")