# Video Q/A with VideoPrism and Parakeet-v3

<a target="_blank" href="https://colab.research.google.com/github/everettVT/daft-video-embeddings/blob/main/workload/video_embeddings_videoprism.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

VideoPrism is a general-purpose video encoder designed to tackle a wide spectrum of video understanding tasks, including classification, localization, retrieval, captioning, and question answering.

Parakeet is a 600-million-parameter multilingual automatic speech recognition (ASR) model designed for high-throughput speech-to-text transcription. 

In this notebook, we will explore how to leverage these foundational models to generate video and text embeddings from YouTube or any video file.

Video processing requires us to extract both image and audio frames, which we can then use to generate embeddings. In this use case we will be transcribing the audio to text segments so we can perform RAG Q/A against both the visual and spoken content.


In [None]:
# @title Prepare environment

import os
import sys

# Fetch VideoPrism repository if Python does not know about it and install
# dependencies needed for this notebook.
if not os.path.exists("videoprism_repo"):
  !git clone --quiet --branch=main --depth=1 \
     https://github.com/everettVT/videoprism.git videoprism_repo
  os.chdir('./videoprism_repo')
  !pip install .
  os.chdir('..')

# Append VideoPrism code to Python import path.
if "videoprism_repo" not in sys.path:
  sys.path.append("videoprism_repo")

In [None]:
# Install the notebook's runtime dependencies: Daft (>= 0.6.1), PyAV, yt-dlp,
# and JAX with its CUDA 12 extra.
# NOTE(review): av / yt-dlp are presumably what Daft uses to decode video and
# fetch YouTube sources — confirm against the Daft docs.
!pip install "daft>=0.6.1" av yt-dlp "jax[cuda12]"

In [None]:
import daft
from daft import col, DataType as dt
import numpy as np
import jax
import jax.numpy as jnp
from jax.extend import backend
import tensorflow as tf
from videoprism import models as vp
# Sanity check: show which devices JAX can see before running inference.
print(jax.devices())    # should list a CUDA device

### Define Parameters

Tensor Dimensions: 
- B: batch size (number of videos in a batch).
- T: number of frames per video clip (typically 16).
- N: tokens per frame (for 288×288 with 18×18 patches → 16×16 = 256).
- D: embedding dimension (Base: 768; Large: 1024).

VideoPrism supports video+text inputs and returns:
- video_embeddings: [B, D] (global video embeddings).
- text_embeddings: [B, D] (global text embeddings).
- Optional: frame_embeddings [B, T, D]; tokens [B, T×N, D]

In [None]:
# Clip tensor layout: batch size, frames per clip, height, width, channels.
B = 2
T = 16
H = W = 288
C = 3

# Upper bound on the number of decoded frames that get materialized.
ROW_LIMIT = 2048

# Video sources to read frames from.
PATHS = ["https://www.youtube.com/watch?v=nutPWHrplA8"]  # Daft Team Takes On Climbing

### Download first 5 videos from your favorite youtube playlist

### Read Video Frames

In [None]:
# Decode frames from every source in PATHS, resized to H x W.
df_frames = daft.read_video_frames(PATHS, image_height=H, image_width=W)

# Materialize a few frames so we don't re-read from YT
df_frames = df_frames.limit(ROW_LIMIT).collect()
df_frames.show(3)

### Group Frames into Clips

In [None]:
# Bucket every T consecutive frames of a video into one clip, collecting the
# clip's images together with their frame indices.
df_grouped = (
    df_frames
    .with_column("group_index", col("frame_index") // T)
    .sort("frame_index")
    .groupby("path", "group_index")
    .agg_list("data", "frame_index")
)
df_grouped.show(3)

### Stack, Normalize, and Cast Frames into Clip Tensors

In [None]:
@daft.func(return_dtype=dt.tensor(dt.float32(), shape=(T, H, W, C)))
def stack_clip(frames: list[np.ndarray], indices: list[int], clip_size: int):
    """Stack one group's frames into a single normalized clip tensor.

    In a parallel/distributed groupby, a pre-group sort isn't guaranteed
    to survive aggregation order; partitions can concatenate in
    non-deterministic order. The frames are therefore re-ordered here by
    their frame index before stacking. The image data is natively uint8,
    so it is cast to float32 before normalizing from [0,255] to [0,1].

    Steps:
    1. Sort the frames by frame_index inside this group-level function.
    2. Pad a short tail clip by repeating its last frame.
    3. Stack, cast, and normalize in one step.

    Args:
        frames: The frames of one clip, each (H,W,C); Daft images or
            anything ``np.asarray`` accepts.
        indices: The frame_index of each entry in ``frames``.
        clip_size: Number of frames in the returned clip.

    Returns:
        float32 array of shape (clip_size,H,W,C) scaled to [0,1]. No batch
        dimension is added here, which matches the declared return dtype.

    Raises:
        ValueError: If ``frames`` is empty, since there is nothing to pad from.
    """
    if len(frames) == 0:
        raise ValueError("stack_clip received an empty group of frames")

    # Convert Daft Image to np.ndarray
    def to_np(x):
        if hasattr(x, "to_numpy"):
            return x.to_numpy()          # Daft Image -> np.ndarray (H,W,C) uint8
        return np.asarray(x)

    # Don't assume frames are sorted already:
    order = np.argsort(np.asarray(indices))
    frames_sorted = [to_np(frames[i]) for i in order]

    # Ensure tails are padded with duplicates of the last frame
    missing = clip_size - len(frames_sorted)
    if missing > 0:
        frames_sorted.extend([frames_sorted[-1]] * missing)

    # Stack, Normalize, and Cast in one step
    clip = np.stack(frames_sorted[:clip_size], axis=0).astype(np.float32) / 255.0

    return clip  # (T,H,W,C) where T=clip_size, float32 in [0,1]

# Turn each group's frame list into one clip tensor of T frames.
df_clips = df_grouped.with_column(
    "clip",
    stack_clip(col("data"), col("frame_index"), clip_size=T),
)
df_clips.show(3)


### Define Inference Strategy

In [None]:
@daft.udf(
    return_dtype=dt.embedding(dt.float32(), 768),
    batch_size=B,  # clips per batch (tune for throughput)
    num_gpus=1,
)
class VideoPrismVideoUDF:
    """Daft class UDF that embeds video clips with VideoPrism.

    The model and weights are loaded once in ``__init__``; ``__call__`` then
    embeds one batch of clips at a time. The declared embedding width (768)
    is the Base model's dimension, so it must be changed if a model with a
    different embedding dimension is used.
    """

    def __init__(self, model_name: str = "videoprism_lvt_public_v1_base"):
        """Load the model and its weights, then compile the forward pass.

        For 'videoprism_lvt_public_v1_large', set T = 8.

        Args:
            model_name: VideoPrism model identifier passed to
                ``vp.get_model`` and ``vp.load_pretrained_weights``.
        """
        from videoprism import models as vp

        self.model = vp.get_model(model_name)
        self.params = vp.load_pretrained_weights(model_name)

        @jax.jit
        def vf_b(clips):  # [B,T,288,288,3] -> [B,D]
            # Video-only forward pass: no text ids / paddings are supplied.
            v, _, _ = self.model.apply(
                self.params,
                clips,
                None,
                None,
                train=False,
            )
            return v

        self.vf_b = vf_b

        # Warm up the compiled video forward pass for the full batch shape.
        _ = self.vf_b(jnp.zeros((B, T, H, W, C), jnp.float32)).block_until_ready()

    def __call__(self,
        clips: list[np.ndarray],  # up to B clips, each (T,H,W,C)
    ):
        """Embed one batch of clips.

        Args:
            clips: The clips of the current batch, each (T,H,W,C). The last
                batch of a partition may hold fewer than B clips.

        Returns:
            One embedding (list of floats) per input clip, in input order.
        """
        # Batch Inference
        xb = jnp.stack(clips, axis=0)  # [b,T,H,W,C]
        video_embeddings = self.vf_b(xb)  # [b,D]
        np_embeddings = np.asarray(video_embeddings)  # Back to NumPy
        # Iterate over the rows actually produced rather than range(B), so a
        # final partial batch does not index past the end.
        return [row.tolist() for row in np_embeddings]



A previous run with 24 batches of 16-frame clips at 288x288x3 took 128 seconds on an A100.

In [None]:
N = 4  # number of clips to embed in this test run

# Sort before limiting: the order of groups coming out of the groupby is not
# guaranteed, so limiting first would pick an arbitrary set of clips.
df_clips = df_clips.sort(["path", "group_index"]).limit(N)  # Optionally limit clips for testing

# B is the per-call batch size; N is how many clips are embedded overall.
print(f"Video Embeddings will process up to {N} clips of {T} frames each at {W}x{H}x{C} in batches of {B}")

# Generate Video Embeddings
df_video_embs = (
    df_clips
    .with_column("video_embeddings", VideoPrismVideoUDF(df_clips["clip"]))
    .collect()
)


In [None]:
# Count the rows for which a video embedding was computed.
df_video_embs.select("group_index","video_embeddings", "clip").count_rows()

In [None]:
# Timestamped transcript segments: one dict per segment with a "timestamp"
# string (m:ss) and the spoken "text".
# NOTE(review): presumably captions for the video listed in PATHS; the
# "DF" / "DAFF" / "D" / "Dove" variants look like mis-transcriptions of "Daft"
# and are kept verbatim — confirm the source before correcting them.
transcript = [
    {"timestamp": "0:00", "text": "What are you climbing today?"},
    {"timestamp": "0:01", "text": "Parkour."},
    {"timestamp": "0:02", "text": "What are you climbing today?"},
    {"timestamp": "0:03", "text": "I'm climbing V4s."},
    {"timestamp": "0:08", "text": "Conor, what are you climbing today?"},
    {"timestamp": "0:11", "text": "I don't know. Whatever looks good."},
    {"timestamp": "0:18", "text": "Oliver, what are you climbing today?"},
    {"timestamp": "0:20", "text": "Oh, rocks."},
    {"timestamp": "0:21", "text": "Cool. Jay, what are you climbing today?"},
    {"timestamp": "0:24", "text": "V1."},
    {"timestamp": "0:28", "text": "Kevin, what are you climbing today?"},
    {"timestamp": "0:29", "text": "Me, too."},
    {"timestamp": "0:31", "text": "Daft"},
    {"timestamp": "0:33", "text": "and daft."},
    {"timestamp": "0:36", "text": "Daft."},
    {"timestamp": "0:37", "text": "Daft."},
    {"timestamp": "0:39", "text": "Daft."},
    {"timestamp": "0:41", "text": "What's your mission?"},
    {"timestamp": "0:43", "text": "To pitch Daft. Many people ask me why"},
    {"timestamp": "0:46", "text": "you should use Daft. Well, here I'm here"},
    {"timestamp": "0:48", "text": "to tell you DF is the fastest multimodal"},
    {"timestamp": "0:52", "text": "data engine that can work for all"},
    {"timestamp": "0:55", "text": "modalities in all situations."},
    {"timestamp": "0:58", "text": "Whether you're working with text,"},
    {"timestamp": "1:00", "text": "images, video or audio, even PDFs, DAFF"},
    {"timestamp": "1:05", "text": "is a solution for you. If you need to"},
    {"timestamp": "1:07", "text": "write to vector databases, if you need"},
    {"timestamp": "1:09", "text": "compute embeddings, D works well out of"},
    {"timestamp": "1:13", "text": "the box with GPUs, with pipelining, GPU"},
    {"timestamp": "1:18", "text": "execution,"},
    {"timestamp": "1:21", "text": "with CPU execution, network IO. We got"},
    {"timestamp": "1:25", "text": "it all."},
    {"timestamp": "1:27", "text": "We have dynamic batch sizing"},
    {"timestamp": "1:35", "text": "and asynchronous userdefined functions."},
    {"timestamp": "1:52", "text": "So it doesn't matter what challenge"},
    {"timestamp": "1:54", "text": "you're trying to solve. Dove is the"},
    {"timestamp": "1:56", "text": "solution for you."},
    {"timestamp": "1:58", "text": "Oh, holy"},
    {"timestamp": "2:09", "text": "Hell yeah."},
    {"timestamp": "2:12", "text": "Go home. I can't wait to go home."}
]

In [None]:
@daft.udf(
    return_dtype=dt.embedding(dt.float32(), 768),
    batch_size=B,  # texts per batch (tune for throughput)
    num_gpus=1,
)
class VideoPrismTextUDF:
    """Daft class UDF that embeds text with VideoPrism's text encoder.

    The model, weights and tokenizer are loaded once in ``__init__``;
    ``__call__`` then embeds one batch of strings at a time. The declared
    embedding width (768) is the Base model's dimension.
    """

    def __init__(self, model_name: str = "videoprism_lvt_public_v1_base"):
        """Load model, weights and tokenizer, then compile the forward pass.

        Args:
            model_name: VideoPrism model identifier passed to
                ``vp.get_model`` and ``vp.load_pretrained_weights``.
        """
        from videoprism import models as vp

        self.model = vp.get_model(model_name)
        self.params = vp.load_pretrained_weights(model_name)
        self.text_tokenizer = vp.load_text_tokenizer('c4_en')

        # Only the numeric forward pass is jitted: Python strings are not
        # valid JAX inputs, so tokenization has to happen outside the trace.
        @jax.jit
        def embed_tokens(text_ids, text_paddings):  # token ids/paddings -> [B,D]
            _, text_embeddings, _ = self.model.apply(
                self.params, None, text_ids, text_paddings, train=False
            )
            return text_embeddings  # [B,D] text embeddings

        def generate_text_embeddings(texts: list[str]):  # list of B strings -> [B,D]
            text_ids, text_paddings = vp.tokenize_texts(self.text_tokenizer, texts)
            return embed_tokens(text_ids, text_paddings)

        self.forward = generate_text_embeddings

        # Warmup
        _ = self.forward(["Hello", "World"]).block_until_ready()

    def __call__(self,
        prompts: list[str],  # up to B strings
    ):
        """Embed one batch of text prompts.

        Args:
            prompts: The strings of the current batch. A Daft Series is
                converted to a plain Python list before tokenization.

        Returns:
            One embedding (list of floats) per input prompt, in input order.
        """
        texts = prompts.to_pylist() if hasattr(prompts, "to_pylist") else list(prompts)

        # Batch Inference
        text_embeddings = self.forward(texts)
        np_embeddings = np.asarray(text_embeddings)  # Back to NumPy

        # Return host-side rows, in the same form as the video UDF.
        return [row.tolist() for row in np_embeddings]


In [None]:
# Embed the spoken content: the text UDF needs strings, so it is applied to
# the transcript segments rather than to the video clip tensors.
df_text_embs = (
    daft.from_pylist(transcript)
    .with_column("text_embeddings", VideoPrismTextUDF(col("text")))
    .collect()
)
df_text_embs.show(3)
