# Video Q/A with Videoprism and Parakeet-v3

<a target="_blank" href="https://colab.research.google.com/github/everettVT/daft-video-embeddings/blob/main/workload/video_embeddings_videoprism.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

Videoprism is a general-purpose video encoder designed to tackle a wide spectrum of video understanding tasks, including classification localization, retrieval, captioning, and question answering. 

Parakeet is a 600-million-parameter multilingual automatic speech recognition (ASR) model designed for high-throughput speech-to-text transcription. 

In this notebook, we will explore how to leverage these foundational models to generate video and text embeddings from youtube or any video file. 

Video processing requires us to extract both image and audio frames, which can then use to generate embeddings. In this use case we will be transcribing the audio to text segments so we can perform RAG Q/A against both the visual and spoken content.


In [None]:
# @title Prepare environment

import os
import sys

# Fetch VideoPrism repository if Python does not know about it and install
# dependencies needed for this notebook.
if not os.path.exists("videoprism_repo"):
  !git clone --quiet --branch=main --depth=1 \
     https://github.com/everettVT/videoprism.git videoprism_repo
  os.chdir('./videoprism_repo')
  !pip install .
  os.chdir('..')

# Append VideoPrism code to Python import path.
if "videoprism_repo" not in sys.path:
  sys.path.append("videoprism_repo")

In [None]:
!pip install "daft>=0.6.1" av yt-dlp "jax[cuda12]"

In [1]:
import daft
from daft import col, DataType as dt
import numpy as np
import jax
import jax.numpy as jnp
from jax.extend import backend
import tensorflow as tf
from videoprism import models as vp
print(jax.devices())    # should list a CUDA device

[CpuDevice(id=0)]


### Define Parameters

Tensor Dimensions: 
- B: batch size (number of videos in a batch).
- T: number of frames per video clip (typically 16).
- N: tokens per frame (for 288×288 with 18×18 patches → 16×16 = 256).
- D: embedding dimension (Base: 768; Large: 1024).

VideoPrism supports video+text inputs and returns:
- video_embeddings: [B, D] (global video embeddings).
- text_embeddings: [B, D] (global text embeddings).
- Optional: frame_embeddings [B, T, D]; tokens [B, T×N, D]

In [2]:
import os 

B, T, H, W, C = 24, 16, 288, 288, 3
ROW_LIMIT = 2048
VIDEO_DIR = "~/Movies"#f"{os.getcwd()}/videos" # Overwrite with desired 
YT_PLAYLIST = "https://www.youtube.com/playlist?list=PL3Q1vFKgSohNO4mbMKo5xccOsYWISUlou"

### Download first 5 videos from your favorite youtube playlist

In [3]:
# Start of Selection
import os
import subprocess

os.makedirs(VIDEO_DIR, exist_ok=True)

# Download first 5 videos from your favorite youtube playlist
subprocess.run([
    "yt-dlp",
    "-I", "1:6",
    YT_PLAYLIST
], cwd=VIDEO_DIR, check=True)

[youtube:tab] Extracting URL: https://www.youtube.com/playlist?list=PL3Q1vFKgSohNO4mbMKo5xccOsYWISUlou
[youtube:tab] PL3Q1vFKgSohNO4mbMKo5xccOsYWISUlou: Downloading webpage




[youtube:tab] PL3Q1vFKgSohNO4mbMKo5xccOsYWISUlou: Redownloading playlist API JSON with unavailable videos
[download] Downloading playlist: Data Topic Deep Dives
[youtube:tab] Playlist Data Topic Deep Dives: Downloading 6 items of 19
[download] Downloading item 1 of 6
[youtube] Extracting URL: https://www.youtube.com/watch?v=WAsmZJ2kff0
[youtube] WAsmZJ2kff0: Downloading webpage
[youtube] WAsmZJ2kff0: Downloading tv simply player API JSON
[youtube] WAsmZJ2kff0: Downloading tv client config
[youtube] WAsmZJ2kff0: Downloading player 72dd8066-main
[youtube] WAsmZJ2kff0: Downloading tv player API JSON



ERROR: Interrupted by user


KeyboardInterrupt: 

### Discover video files

In [4]:
from daft.functions import file
df_files = (
    daft.from_glob_path(VIDEO_DIR)
    .with_column("file", file(col("path")))
    .where(col("path").str.endswith(".mp4")) # TODO: Remove this filter
)

df_files.show(5)

path Utf8,size Int64,num_rows Int64,file File
file:///Users/everett-founder/Movies/Running.mp4,132936424,,"File(Reference(""file:///Users/everett-founder/Movies/Running.mp4"", None))"
file:///Users/everett-founder/Movies/digitlism.mp4,9176010,,"File(Reference(""file:///Users/everett-founder/Movies/digitlism.mp4"", None))"
file:///Users/everett-founder/Movies/StructureOutputsWorkloadWalkthrough.mp4,153828067,,"File(Reference(""file:///Users/everett-founder/Movies/StructureOutputsWorkloadWalkthrough.mp4"", None))"


Retrieve Metadata from file headers

In [5]:

import av
from fractions import Fraction
from typing import Any, Dict, Optional

def _to_float_rate(rate: Optional[Fraction]) -> Optional[float]:
    try:
        return float(rate) if rate else None
    except Exception:
        return None

def _duration_seconds(container: av.container.input.InputContainer, video_stream: Optional[av.video.stream.VideoStream]) -> Optional[float]:
    # container.duration is in microseconds (or None)
    if container.duration and container.duration > 0:
        return container.duration / 1_000_000.0
    # fallback to stream-based duration if present
    if video_stream and video_stream.duration and video_stream.time_base:
        try:
            return float(video_stream.duration * video_stream.time_base)
        except Exception:
            return None
    return None

def probe_basic_video_meta(file: daft.File, probesize: str = "64k", analyzeduration_us: int = 200_000) -> Dict[str, Optional[float | int]]:
    """
    Returns {'width', 'height', 'fps', 'frame_count'} using only container/stream headers.
    Frame count is exact if the stream exposes nb_frames; otherwise estimated via duration * fps.
    """
    options = {"probesize": str(probesize), "analyzeduration": str(analyzeduration_us)}
    with file:
        with av.open(file, mode="r", options=options, metadata_encoding="utf-8") as container:
            vs = next((s for s in container.streams if s.type == "video"), None)
            if not vs:
                return {"width": None, "height": None, "fps": None, "frame_count": None}

            width = getattr(vs, "width", None)
            height = getattr(vs, "height", None)
            time_base = getattr(vs, "time_base", None)
            fps = _to_float_rate(getattr(vs, "average_rate", None)) or _to_float_rate(getattr(vs, "guessed_rate", None))

            nb_frames = getattr(vs, "frames", None)
            if not nb_frames or nb_frames <= 0:
                dur = _duration_seconds(container, vs)
                nb_frames = int(round(dur * fps)) if (dur and fps) else None

            return {
                "width": width,
                "height": height,
                "fps": fps,
                "frame_count": nb_frames,
                "time_base": float(time_base),
            }

@daft.func(return_dtype = dt.struct({
    "width": dt.int32(),
    "height": dt.int32(),
    "fps": dt.float64(),
    "frame_count": dt.int32(),
    "time_base": dt.float64(),
}))
def get_video_metadata(file: daft.File, probesize: str = "64k", analyzeduration_us: int = 200_000):

    metadata = probe_basic_video_meta(file, probesize, analyzeduration_us)
    return metadata

In [6]:
df_meta = df_files.with_column("metadata", get_video_metadata(df_files["file"])).collect()
df_meta.show(5)

path Utf8,size Int64,num_rows Int64,file File,"metadata Struct[width: Int32, height: Int32, fps: Float64, frame_count: Int32, time_base: Float64]"
file:///Users/everett-founder/Movies/Running.mp4,132936424,,"File(Reference(""file:///Users/everett-founder/Movies/Running.mp4"", None))","{width: 1280, height: 720, fps: 30, frame_count: 45733, time_base: 0.00001736111111111111, }"
file:///Users/everett-founder/Movies/digitlism.mp4,9176010,,"File(Reference(""file:///Users/everett-founder/Movies/digitlism.mp4"", None))","{width: 3336, height: 2160, fps: 60, frame_count: 460, time_base: 0.00001736111111111111, }"
file:///Users/everett-founder/Movies/StructureOutputsWorkloadWalkthrough.mp4,153828067,,"File(Reference(""file:///Users/everett-founder/Movies/StructureOutputsWorkloadWalkthrough.mp4"", None))","{width: 1920, height: 1080, fps: 30, frame_count: 17977, time_base: 0.00001736111111111111, }"


In [7]:
@daft.func(return_dtype=dt.list(dt.uint64()))
def clip_frame_time(clip_indices: list[int], time_base: float, fps: float) -> list[int]:
    return [int(i / fps / time_base) for i in clip_indices]


In [8]:
# plan clips
df_clip_plan = (
    df_meta
    .with_column("clip_indices", 
        col("metadata")["frame_count"]
        .apply(lambda n: list(range(0, n)), return_dtype=dt.list(dt.int32()))
        .list.chunk(size=T)
    )
    .explode("clip_indices")
    .with_column("clip_time_base", clip_frame_time(col("clip_indices"), col("metadata")["time_base"], col("metadata")["fps"]))
)
df_clip_plan.show(5)

path Utf8,size Int64,num_rows Int64,file File,"metadata Struct[width: Int32, height: Int32, fps: Float64, frame_count: Int32, time_base: Float64]",clip_indices FixedSizeList[Int32; 16],clip_time_base List[UInt64]
file:///Users/everett-founder/Movies/Running.mp4,132936424,,"File(Reference(""file:///Users/everett-founder/Movies/Running.mp4"", None))","{width: 1280, height: 720, fps: 30, frame_count: 45733, time_base: 0.00001736111111111111, }","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]","[0, 1920, 3840, 5760, 7680, 9600, 11520, 13440, 15360, 17280, 19200, 21120, 23040, 24960, 26880, 28800]"
file:///Users/everett-founder/Movies/Running.mp4,132936424,,"File(Reference(""file:///Users/everett-founder/Movies/Running.mp4"", None))","{width: 1280, height: 720, fps: 30, frame_count: 45733, time_base: 0.00001736111111111111, }","[16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]","[30720, 32640, 34560, 36480, 38400, 40320, 42240, 44160, 46080, 48000, 49920, 51840, 53760, 55680, 57600, 59520]"
file:///Users/everett-founder/Movies/Running.mp4,132936424,,"File(Reference(""file:///Users/everett-founder/Movies/Running.mp4"", None))","{width: 1280, height: 720, fps: 30, frame_count: 45733, time_base: 0.00001736111111111111, }","[32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47]","[61440, 63360, 65280, 67200, 69120, 71040, 72960, 74880, 76800, 78720, 80640, 82560, 84480, 86400, 88320, 90240]"
file:///Users/everett-founder/Movies/Running.mp4,132936424,,"File(Reference(""file:///Users/everett-founder/Movies/Running.mp4"", None))","{width: 1280, height: 720, fps: 30, frame_count: 45733, time_base: 0.00001736111111111111, }","[48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]","[92160, 94080, 96000, 97920, 99840, 101760, 103680, 105600, 107520, 109440, 111360, 113280, 115200, 117120, 119040, 120960]"
file:///Users/everett-founder/Movies/Running.mp4,132936424,,"File(Reference(""file:///Users/everett-founder/Movies/Running.mp4"", None))","{width: 1280, height: 720, fps: 30, frame_count: 45733, time_base: 0.00001736111111111111, }","[64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79]","[122880, 124799, 126720, 128640, 130560, 132480, 134400, 136320, 138240, 140160, 142080, 144000, 145920, 147840, 149760, 151680]"


### Stream Video Frames

In [9]:
def frame_to_rgb_float32(frame: av.VideoStream, w: int, h: int, interp: str = None) -> np.ndarray:
    
    frame = frame.to_ndarray(
        width=w,
        height=h,
        format="rgb24",
        interpolation=interp,
    ).astype(np.float32) / 255.0

    return frame


@daft.func()
def yield_clip_stack(
    file: daft.File, 
    clip_time_base: list[float],
    clip_size: int = 16, 
    width: int = 288, 
    height: int = 288, 
    interp: str = None
    ) -> dt.tensor(dt.float32(), shape=(T, H, W, 3)):
    
    with file:
        with av.open(file) as container:
            container.seek(clip_time_base[0])
            i = 0
            arrays = []
            for frame in container.decode(video=0):
                array = frame_to_rgb_float32(frame, w=width, h=height, interp=interp) 
                arrays.append(array)
                i += 1
                if i == clip_size:
                    return np.stack(arrays, axis=0).astype(np.float32)

In [10]:
df_clips = (
    df_clip_plan
    .with_column("clip", 
        yield_clip_stack(
            col("file"), 
            col("clip_indices"),
            clip_size=T, 
            width=W, 
            height=H
        )
    )
)
df_clips.limit(5).collect()

🗡️ 🐟 InMemorySource: 00:00 

🗡️ 🐟 Project: 00:00 

🗡️ 🐟 UDF <lambda>: 00:00 

🗡️ 🐟 Project: 00:00 

🗡️ 🐟 Explode: 00:00 

🗡️ 🐟 Limit 5: 00:00 

🗡️ 🐟 Project: 00:00 

🗡️ 🐟 UDF clip_frame_time: 00:00 

🗡️ 🐟 UDF yield_clip_stack: 00:00 

path Utf8,size Int64,num_rows Int64,file File,"metadata Struct[width: Int32, height: Int32, fps: Float64, frame_count: Int32, time_base: Float64]",clip_indices FixedSizeList[Int32; 16],clip_time_base List[UInt64],"clip FixedShapeTensor[Float32; [16, 288, 288, 3]]"
file:///Users/everett-founder/Movies/Running.mp4,132936424,,"File(Reference(""file:///Users/everett-founder/Movies/Running.mp4"", None))","{width: 1280, height: 720, fps: 30, frame_count: 45733, time_base: 0.00001736111111111111, }","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]","[0, 1920, 3840, 5760, 7680, 9600, 11520, 13440, 15360, 17280, 19200, 21120, 23040, 24960, 26880, 28800]",<FixedShapeTensor>
file:///Users/everett-founder/Movies/Running.mp4,132936424,,"File(Reference(""file:///Users/everett-founder/Movies/Running.mp4"", None))","{width: 1280, height: 720, fps: 30, frame_count: 45733, time_base: 0.00001736111111111111, }","[16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]","[30720, 32640, 34560, 36480, 38400, 40320, 42240, 44160, 46080, 48000, 49920, 51840, 53760, 55680, 57600, 59520]",<FixedShapeTensor>
file:///Users/everett-founder/Movies/Running.mp4,132936424,,"File(Reference(""file:///Users/everett-founder/Movies/Running.mp4"", None))","{width: 1280, height: 720, fps: 30, frame_count: 45733, time_base: 0.00001736111111111111, }","[32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47]","[61440, 63360, 65280, 67200, 69120, 71040, 72960, 74880, 76800, 78720, 80640, 82560, 84480, 86400, 88320, 90240]",<FixedShapeTensor>
file:///Users/everett-founder/Movies/Running.mp4,132936424,,"File(Reference(""file:///Users/everett-founder/Movies/Running.mp4"", None))","{width: 1280, height: 720, fps: 30, frame_count: 45733, time_base: 0.00001736111111111111, }","[48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]","[92160, 94080, 96000, 97920, 99840, 101760, 103680, 105600, 107520, 109440, 111360, 113280, 115200, 117120, 119040, 120960]",<FixedShapeTensor>
file:///Users/everett-founder/Movies/Running.mp4,132936424,,"File(Reference(""file:///Users/everett-founder/Movies/Running.mp4"", None))","{width: 1280, height: 720, fps: 30, frame_count: 45733, time_base: 0.00001736111111111111, }","[64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79]","[122880, 124799, 126720, 128640, 130560, 132480, 134400, 136320, 138240, 140160, 142080, 144000, 145920, 147840, 149760, 151680]",<FixedShapeTensor>


### Define Inference Strategy

In [11]:
@daft.udf(
    return_dtype = dt.embedding(dt.float32(), 768),
    batch_size=1, # clips per batch (tune for throughput)
    num_gpus=0,
)
class VideoPrismVideoUDF:
    def __init__(self, model_name: str = "videoprism_lvt_public_v1_base"):
        "for 'videoprism_lvt_public_v1_large', set T = 8"
        
        from videoprism import models as vp
        self.model = vp.get_model(model_name)
        self.params = vp.load_pretrained_weights(model_name)

        @jax.jit
        def vf_b(clips):  # [B,T,288,288,3] -> [B,D]
            v, _, _ = self.model.apply(
                self.params,
                clips, 
                None, 
                None, 
                train=False
            )
            return v

        self.vf_b = vf_b

        # Warmup both
        _ = self.vf_b(jnp.zeros((B, T, H, W, C), jnp.float32)).block_until_ready()

    def __call__(self,
        clips: list[np.ndarray], # List[T,H,W,C] of len B
    ):
        # Batch Inference
        xb = jnp.stack(clips, axis=0) # [B,T,H,W,C]
        video_embeddings = self.vf_b(xb) # [B,768]
        np_embeddings = np.asarray(video_embeddings)  # Back to NumPy
        return [np_embeddings[i].tolist() for i in range(B)]

Previous runs with 24 batches of 16 frame clips processed in 128 sec.

In [12]:
print(f"Video Embeddings will process {B} clips of {T} frame each at {W}x{H}x{3}")
df_text_embs = (
    df_clips
    .with_column("video_embeddings", VideoPrismVideoUDF(col("clip")))
)
df_text_embs.limit(1).show()

Video Embeddings will process 24 clips of 16 frame each at 288x288x3


🗡️ 🐟 InMemorySource: 00:00 

🗡️ 🐟 Project: 00:00 

🗡️ 🐟 UDF <lambda>: 00:00 

🗡️ 🐟 Project: 00:00 

🗡️ 🐟 Explode: 00:00 

🗡️ 🐟 Limit 1: 00:00 

🗡️ 🐟 Project: 00:00 

🗡️ 🐟 UDF clip_frame_time: 00:00 

🗡️ 🐟 UDF yield_clip_stack: 00:00 

🗡️ 🐟 UDF VideoPrismVideoUDF: 00:00 

: 

In [None]:
df_video_embs.select("group_index","video_embeddings", "clip").count_rows()

Extract Audio from Video, Transcribe and Embed

In [None]:
import av
from av.audio.resampler import AudioResampler

@daft.func()
def extract_audio_frames_into_numpy_arrays(file: daft.File) -> np.ndarray:
    
    container = av.open(file)
    resampler = AudioResampler(format='s16', layout='mono', rate=16000)

    chunks = []
    try:
        for frame in container.decode(audio=0):
            # Resample to desired SR/mono/PCM16; result can be a frame or list of frames
            res = resampler.resample(frame)
            frames = res if isinstance(res, (list, tuple)) else [res]

            for f in frames:
                arr = f.to_ndarray()  # typically (channels, samples) or (samples,)
                arr = np.asarray(arr)

                # Flatten to 1-D mono
                if arr.ndim == 2:
                    # (1, N) or (N, 1) → (N,)
                    if arr.shape[0] == 1:
                        arr = arr[0]
                    elif arr.shape[1] == 1:
                        arr = arr[:, 0]
                    else:
                        # Unexpected multi-channel after mono resample: average as fallback
                        arr = arr.mean(axis=0)
                elif arr.ndim > 2:
                    arr = arr.reshape(-1)

                # Convert PCM16 → float32 in [-1, 1]
                if arr.dtype != np.float32:
                    arr = (arr.astype(np.float32) / 32768.0).clip(-1.0, 1.0)

                chunks.append(arr)
    finally:
        container.close()

    if not chunks:
        return np.zeros((0,), dtype=np.float32)

    audio = np.concatenate(chunks, axis=0).astype(np.float32, copy=False)
    return audio

    

In [None]:
@daft.udf(return_dtype = dt.string())
class ParakeetTranscribeUDF:
    def __init__(self, context_size: int = 256):
        import nemo.collections.asr as nemo_asr
        self.asr_model = nemo_asr.models.ASRModel.from_pretrained(model_name="nvidia/parakeet-tdt-0.6b-v3")
        self.asr_model.change_attention_model(
            self_attention_model="rel_pos_local_attn", 
            att_context_size=[context_size, context_size]
        )

    def __call__(self, audio: list[np.ndarray]):
        outputs = self.asr_model.transcribe(audio)   
        texts = [o.text for o in outputs]
        return texts



In [None]:

# Parakeet Transcribe with Timestamps 
@daft.udf(return_dtype = dt.struct({
    "segment": dt.list(dt.struct({
        "start_offset": dt.int32(),
        "end_offset": dt.int32(),
        "start": dt.float32(),
        "end": dt.float32()
    })),
}))
class ParakeetTranscribeTimestampsUDF:
    def __init__(self, context_size: int = 256):
        import nemo.collections.asr as nemo_asr
        self.asr_model = nemo_asr.models.ASRModel.from_pretrained(model_name="nvidia/parakeet-tdt-0.6b-v3")
        self.asr_model.change_attention_model(
            self_attention_model="rel_pos_local_attn", 
            att_context_size=[context_size, context_size]
        )

    def __call__(self, audio: list[np.ndarray]):
        outputs = self.asr_model.transcribe(audio, timestamps=True)   # No public flag to emit only segments
        return [o.timestamp["segment"] for o in outputs] 


In [None]:


@daft.udf(
    return_dtype = dt.embedding(dt.float32(), 768),
    batch_size=B, # clips per batch (tune for throughput)
    num_gpus=1,
)
class VideoPrismTextUDF:
    def __init__(self, model_name: str = "videoprism_lvt_public_v1_base"):
        from videoprism import models as vp
        self.model = vp.get_model(model_name)
        self.params = vp.load_pretrained_weights(model_name)
        self.text_tokenizer = vp.load_text_tokenizer('c4_en')

        @jax.jit
        def vf_b(text_ids, text_paddings):  # [B,T,288,288,3] -> [B,D]
            _, t, _ = self.model.apply(self.params, None, text_ids, text_paddings, train=False)
            return t # text embeddings 

        self.vf_b = vf_b

        # Warmup both
        text_ids, text_paddings = vp.tokenize_texts(self.text_tokenizer, ["Hello", "World"])
        _ = self.vf_b(None, text_ids, text_paddings).block_until_ready()

    def __call__(self,
        prompts: list[str], # List[T,H,W,C] of len B
    ):
        # Batch Inference
        text_ids, text_paddings = vp.tokenize_texts(self.text_tokenizer, prompts)
        text_embeddings = self.vf_b(text_ids, text_paddings) 

        return text_embeddings


# Using daft.File instead of read_video_frames()

Video data, as a critical type of multimodal data, uniquely integrates visual, audio, and temporal dimensions, inherently fusing spatial (image-based) and temporal information. It has been widely adopted across domains including short-video platforms, live streaming, public security, healthcare, and autonomous driving.

Given the large volume of video data, most processing paradigms typically involve streaming-based reading and processing to minimize memory footprint. This distinguishes it from image data, which generally requires full initial loading into memory prior to processing.

Thus, when introducing the Video data type into Daft, it should avoid storing the entire dataset in memory. Drawing inspiration from the File data type, we can either store merely a URL reference to the video data or directly utilize the underlying data structure of the File data type as its internal representation.

Beyond the core content of video data, it is critical to extract key metadata to facilitate subsequent filtering of target videos prior to processing. Videos encompass extensive metadata, such as frame count, resolution (height/width), time base, duration, pixel format, bit rate, codec name, and profile, among others. However, incorporating all such metadata into the Video data type is impractical from a memory efficiency standpoint. Instead, we prioritize including only essential metadata fields—specifically frame count, height, width, and FPS. Additional metadata can be dynamically retrieved during video processing as needed.

daft.read_video_frames is a very convenient API for reading video frames, also including recognize/filter the key frames and resize frame, it's very helpful for most common cases, but it might more focus on the case that reading key frames, there are some other cases might need a Video data type and native functions(even though we can used customized UDF to achieve it.), feel free to discuss:
support using different algorithms to reading/extracting accurate key frames, e.g. difference, optical flow, the default behavior of PyAV to extract key frame is based on I-frame the use the native encoding metadata pict_type='I'
Besides extracting/reading key frames, there are other use case about video, e.g. split video by key frame, extract audio from video, etc. it's better to add these functions on Video or file data type instead of adding new API for each use cases.
From performance perspective, it's better to use rust library to handle video processing logical as much as possible, e.g. ffmpeg-next, even though most tools are based on ffmepg
filter/pushdown video data based on thier metadata before processing them.


R Conner Howell
:daft:  Aug 28th at 11:07 AM
I agree. The idea behind the "File" type was to start with a wrapper of the appropriate python file-like protocols, then to further type into VideoFile, AudioFile, PdfFile, etc. — each of which having their own domain-specific methods such a read_frames, read_channels, read_pages etc. respectively. As you have also pointed out, this enables daft to implement this functionality in Rust as well.

In [None]:
df_file = daft.from_glob_path("~/Movies/")

In [None]:

df = df.from_glob_path("s3://bucket/videos/")

# Convert path to video directly from utf8 data type to video type. Daft should support convert from utf8 and file data type both.
df = df.with_column("video", video(col("path")))

# Filter video by video metadata.
df = df.filter((df["width"] > 1024) & (df["height"] > 576) & (df["frames"] > 100)))

# Extract the key frames, the `key_frames` function will streaming read video data
# and extract multiple key frames, the data type of each frame is FixedShapeImage. The `key_frames` might add more parameters to indicate what's the image mode of key frames.
# TODO consider whether to include some metadata for key frame to compatible with daft.read_video_frames
df = df.with_column("key_frames", key_frames(col("video"), method= "I_frame").explode("key_frames")

# Save the key frames as a dataset.
df.select("path", "key_frames").write_lance("key_frames_dataset")


## Appendix 

In [None]:
# Parakeet Transcribe with Timestamps 
@daft.udf(return_dtype = dt.struct({
    "word": dt.list(dt.struct({
        "word": dt.string(),
        "start_offset": dt.int32(),
        "end_offset": dt.int32(),
        "start": dt.float32(),
        "end": dt.float32()
    })),
    "segment": dt.list(dt.struct({
        "start_offset": dt.int32(),
        "end_offset": dt.int32(),
        "start": dt.float32(),
        "end": dt.float32()
    })),
    "char": dt.list(dt.struct({
        "char": dt.string(),
        "start_offset": dt.int32(),
        "end_offset": dt.int32(),
        "start": dt.float32(),
        "end": dt.float32()
    })),
}))
class ParakeetTranscribeTimestampsUDF:
    def __init__(self, context_size: int = 256):
        import nemo.collections.asr as nemo_asr
        self.asr_model = nemo_asr.models.ASRModel.from_pretrained(model_name="nvidia/parakeet-tdt-0.6b-v3")
        self.asr_model.change_attention_model(
            self_attention_model="rel_pos_local_attn", 
            att_context_size=[context_size, context_size]
        )

    def __call__(self, audio: list[np.ndarray]):
        outputs = self.asr_model.transcribe(audio, timestamps=True)   # No public flag to emit only segments
        return [o.timestamp for o in outputs] 

In [None]:
class DiarizationSortFormerUDF:
    def __init__(self, context_size: int = 256):
        from nemo.collections.asr.models import SortformerEncLabelModel
        self.diar_model = SortformerEncLabelModel.from_pretrained("nvidia/diar_streaming_sortformer_4spk-v2")
        self.diar_model.eval() # Switch to inference mode

    def __call__(self, audio: list[np.ndarray]):
        outputs = self.asr_model.transcribe(audio, timestamps=True)   # No public flag to emit only segments
        return [o.timestamp for o in outputs] 