In [2]:
# ========================
# 1. Auto Capture Script
# ========================
from IPython.display import Javascript
from google.colab.output import eval_js
from base64 import b64decode
import cv2
import numpy as np
import io
from PIL import Image
import time
import os
import torch

def capture_multiple_images(num_frames=10, interval=2):
    """Grab a series of webcam snapshots through the Colab browser front end.

    A JavaScript snippet is evaluated in the notebook's output frame: it opens
    the user's webcam, shows a live preview, draws the current video frame to
    a canvas ``num_frames`` times and returns each one as a JPEG data URL.
    The data URLs are then decoded on the Python side.

    Args:
        num_frames: Number of snapshots to take. Values <= 0 return an empty
            list without asking for camera access.
        interval: Pause between consecutive snapshots, in seconds.

    Returns:
        A list of ``PIL.Image.Image`` objects, in capture order.
    """
    # Nothing to capture: skip the camera permission prompt entirely.
    if num_frames <= 0:
        return []

    interval_ms = interval * 1000  # setTimeout expects milliseconds
    js_code = f'''
        async function captureMultiplePhotos() {{
          const div = document.createElement('div');
          const video = document.createElement('video');
          video.style.display = 'block';
          const stream = await navigator.mediaDevices.getUserMedia({{video: true}});
          try {{
            document.body.appendChild(div);
            div.appendChild(video);
            video.srcObject = stream;
            await video.play();

            google.colab.output.setIframeHeight(document.documentElement.scrollHeight, true);

            const canvas = document.createElement('canvas');
            const context = canvas.getContext('2d');
            const frames = [];

            for (let i = 0; i < {num_frames}; i++) {{
              canvas.width = video.videoWidth;
              canvas.height = video.videoHeight;
              context.drawImage(video, 0, 0);
              frames.push(canvas.toDataURL('image/jpeg'));
              // Pause only between frames, not after the final one.
              if (i + 1 < {num_frames}) {{
                await new Promise(resolve => setTimeout(resolve, {interval_ms}));
              }}
            }}
            return frames;
          }} finally {{
            // Always release the camera and remove the preview, even on error.
            stream.getTracks().forEach(track => track.stop());
            div.remove();
          }}
        }}
        captureMultiplePhotos();
    '''
    data = eval_js(js_code)

    images = []
    for img_data in data:
        # A data URL looks like "data:image/jpeg;base64,<payload>"; keep the payload.
        binary = b64decode(img_data.split(',', 1)[1])
        images.append(Image.open(io.BytesIO(binary)))
    return images

In [3]:
# ================================
# 2. Capture 10 Frames Automatically and Save
# ================================
# Capture settings and the list that later cells read the processed frames from.
frames = []
interval = 2  # seconds between frames
num_frames_to_capture = 10
resize_shape = (224, 224)  # passed to cv2.resize as the target size
save_dir = "captured_frames"

# Create folder if it doesn't exist
os.makedirs(save_dir, exist_ok=True)

print("Starting automatic frame capture from webcam...\n")

captured_images = capture_multiple_images(num_frames=num_frames_to_capture, interval=interval)

for i, captured_image in enumerate(captured_images):
    print(f"Processing frame {i+1}/{num_frames_to_capture}")

    # Convert to OpenCV format. Force 3-channel RGB first so the colour
    # conversion also works if the decoded image is grayscale or has alpha.
    frame = cv2.cvtColor(np.array(captured_image.convert("RGB")), cv2.COLOR_RGB2BGR)

    # Resize
    frame_resized = cv2.resize(frame, resize_shape)

    frames.append(frame_resized)

    # Save frame to file; cv2.imwrite reports failure through its return
    # value rather than raising, so check it before claiming success.
    save_path = os.path.join(save_dir, f"frame_{i}.jpg")
    if cv2.imwrite(save_path, frame_resized):
        print(f"Saved: {save_path}")
    else:
        print(f"WARNING: failed to save {save_path}")

print("\nFinished capturing and saving frames!")

# Show array content summary
print(f"\nTotal Frames Captured: {len(frames)}")
for i, f in enumerate(frames):
    print(f"Frame {i} shape: {f.shape}")

Starting automatic frame capture from webcam...

Processing frame 1/10
Saved: captured_frames/frame_0.jpg
Processing frame 2/10
Saved: captured_frames/frame_1.jpg
Processing frame 3/10
Saved: captured_frames/frame_2.jpg
Processing frame 4/10
Saved: captured_frames/frame_3.jpg
Processing frame 5/10
Saved: captured_frames/frame_4.jpg
Processing frame 6/10
Saved: captured_frames/frame_5.jpg
Processing frame 7/10
Saved: captured_frames/frame_6.jpg
Processing frame 8/10
Saved: captured_frames/frame_7.jpg
Processing frame 9/10
Saved: captured_frames/frame_8.jpg
Processing frame 10/10
Saved: captured_frames/frame_9.jpg

Finished capturing and saving frames!

Total Frames Captured: 10
Frame 0 shape: (224, 224, 3)
Frame 1 shape: (224, 224, 3)
Frame 2 shape: (224, 224, 3)
Frame 3 shape: (224, 224, 3)
Frame 4 shape: (224, 224, 3)
Frame 5 shape: (224, 224, 3)
Frame 6 shape: (224, 224, 3)
Frame 7 shape: (224, 224, 3)
Frame 8 shape: (224, 224, 3)
Frame 9 shape: (224, 224, 3)


In [4]:
# ================================
# 3. Load BLIP2 Model
# ================================
!pip install -q transformers accelerate

from transformers import Blip2Processor, Blip2ForConditionalGeneration

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"\nUsing device: {device}")

processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-flan-t5-xl",
    device_map="auto",
    torch_dtype=torch.float16
)
model.to(device)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.



Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Blip2ForConditionalGeneration(
  (vision_model): Blip2VisionModel(
    (embeddings): Blip2VisionEmbeddings(
      (patch_embedding): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
    )
    (encoder): Blip2Encoder(
      (layers): ModuleList(
        (0-38): 39 x Blip2EncoderLayer(
          (self_attn): Blip2Attention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=1408, out_features=4224, bias=True)
            (projection): Linear(in_features=1408, out_features=1408, bias=True)
          )
          (layer_norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
          (mlp): Blip2MLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=1408, out_features=6144, bias=True)
            (fc2): Linear(in_features=6144, out_features=1408, bias=True)
          )
          (layer_norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((

In [5]:
# ================================
# 4. Predicting on Captured Frames
# ================================
print("\nMaking predictions on captured frames...\n")

# Take placement and precision from the loaded model instead of repeating
# the values used at load time, so inputs always match the weights.
model_device = model.device
model_dtype = model.dtype

for idx, frame in enumerate(frames):
    # Convert OpenCV BGR to PIL RGB
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(frame_rgb)

    # Preprocess the image and move the tensors next to the model.
    inputs = processor(pil_image, return_tensors="pt").to(model_device, model_dtype)

    # Generate caption
    generated_ids = model.generate(**inputs)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

    print(f"Prediction for Frame {idx}: {generated_text}")

print("\n All done!")


Making predictions on captured frames...

Prediction for Frame 0: a man with a black hair and a black shirt is sitting on a bed
Prediction for Frame 1: a man with a black shirt and a black shirt
Prediction for Frame 2: a man with his hands in his pocket
Prediction for Frame 3: a man with his hands raised in the air
Prediction for Frame 4: a man with a black shirt and a black shirt is sitting on a bed
Prediction for Frame 5: a man with a tee shirt and a black shirt
Prediction for Frame 6: a man is pointing his finger at a camera in a room
Prediction for Frame 7: a man with a clenched fist and a slapped
Prediction for Frame 8: a man with a black shirt and a black shirt is sitting on a bed
Prediction for Frame 9: a man with a black shirt and a black shirt is sitting on a bed

 All done!
