In [None]:
!pip install yt-dlp opencv-python openai glob

In [None]:
%%bash
mkdir -p keyframes
yt-dlp -f bestvideo[ext=mp4] -o - "https://www.youtube.com/watch?v=hTSaweR8qMI" \
  | ffmpeg -i pipe: \
           -vf fps=.02 \
           -frames:v 5 \
           keyframes/keyframe_%02d.jpg

In [None]:
import os
import json
import base64
from glob import glob           # <–– grabs the function glob()
from openai import OpenAI

In [None]:
# — CONFIG —
API_KEY = os.getenv("INFERENCE_API_KEY")
MODEL   = "google/gemma-3-27b-instruct/bf-16"
SYSTEM_MSG = """
You are a JSON-only image analysis API specializing in YouTube keyframes.
Generate one concise caption that describes what's happening across all these frames.
Respond only with a JSON object:

{"caption": "…"}
""".strip()

client = OpenAI(base_url="https://api.inference.net/v1", api_key=API_KEY)

In [None]:
data_uris = []
for filepath in sorted(glob("keyframes/*.jpg")):   # now glob(...) works!
    with open(filepath, "rb") as f:
        b64 = base64.b64encode(f.read()).decode("utf-8")
    data_uris.append(f"data:image/jpeg;base64,{b64}")

In [47]:
resp = client.chat.completions.create(
    model=MODEL,
    messages=[
        {"role": "system", "content": SYSTEM_MSG},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Here are 5 keyframes from a YouTube video. Generate a single caption."},
                *[
                    {"type": "image_url", "image_url": {"url": uri}}
                    for uri in data_uris
                ]
            ],
        },
    ],
    response_format={"type": "json_object"},
)

# — OUTPUT RESULT —
print(json.dumps(resp.choices[0].message.content, indent=2))

"{\"caption\": \"A man spends $10,000 on a date involving a fancy dinner, a limousine, jet skiing, and a singer.\"}"
