# Install requirements

In [1]:
!pip install -r requirements.txt

Collecting openvino==2025.2.0 (from -r requirements.txt (line 1))
  Using cached openvino-2025.2.0-19140-cp312-cp312-manylinux2014_x86_64.whl.metadata (12 kB)
Collecting numpy==2.2.6 (from -r requirements.txt (line 2))
  Using cached numpy-2.2.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting opencv-python==4.11.0.86 (from -r requirements.txt (line 3))
  Downloading opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Collecting pillow==11.3.0 (from -r requirements.txt (line 4))
  Using cached pillow-11.3.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (9.0 kB)
Collecting transformers==4.52.4 (from -r requirements.txt (line 5))
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting torch==2.7.1 (from -r requirements.txt (line 6))
  Downloading torch-2.7.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting torchvision==0.22.1 (from -r requirem

In [1]:
# Save model to OpenVINO format

from optimum.intel.openvino import OVModelForVisualCausalLM

# First time: export and save
model = OVModelForVisualCausalLM.from_pretrained(
    "llava-hf/LLaVA-NeXT-Video-7B-hf", 
    export=True,
    trust_remote_code=True
)

model.save_pretrained("./llava_openvino_model")

# Future times: load from local saved version (much faster)
model = OVModelForVisualCausalLM.from_pretrained("./llava_openvino_model")

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00,  6.60it/s]
You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor

INFO:nncf:Statistics of the bitwidth distribution:
┍━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑
│ Weight compression mode   │ % all parameters (layers)   │ % ratio-defining parameters (layers)   │
┝━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥
│ int8_asym                 │ 100% (225 / 225)            │ 100% (225 / 225)                       │
┕━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙


# Upload OV model to HF 

In [11]:
# Create model card and upload to Hugging Face Hub

from huggingface_hub import HfApi, create_repo
import os

# Replace with your desired repo name
REPO_NAME = "llava-next-video-openvino"  # Change this to your preferred name
HF_USERNAME = "ezelanza"  # Replace with your HF username

# Create model card content
model_card = """---
license: apache-2.0
base_model: llava-hf/LLaVA-NeXT-Video-7B-hf
tags:
- openvino
- llava
- multimodal
- video
- visual-question-answering
---

# LLaVA-NeXT-Video OpenVINO Model

This is an OpenVINO optimized version of the LLaVA-NeXT-Video-7B-hf model.

## Model Description
- **Base Model**: llava-hf/LLaVA-NeXT-Video-7B-hf
- **Optimization**: Converted to OpenVINO format for efficient inference
- **Size**: ~7B parameters

## Usage

```python
from optimum.intel.openvino import OVModelForVisualCausalLM

model = OVModelForVisualCausalLM.from_pretrained("YOUR_USERNAME/llava-next-video-openvino")
```

## License
This model inherits the license from the original LLaVA-NeXT model.
"""

# Save model card
with open("README.md", "w") as f:
    f.write(model_card)

print("Model card created: README.md")


Model card created: README.md


In [14]:
# Upload model to Hugging Face Hub

from huggingface_hub import HfApi
import os
# Login to Hugging Face

from huggingface_hub import login
import getpass

print("Go to: https://huggingface.co/settings/tokens")
print("Create a new token with WRITE permissions")
print()

token = getpass.getpass("Enter your HF token: ")
login(token=token)

# Configuration - UPDATE THESE VALUES
REPO_NAME = "ezelanza/llava-next-video-openvino"  # Your desired repo name
# The username will be automatically detected from your login

api = HfApi()

# Create repository
try:
    repo_url = api.create_repo(
        repo_id=REPO_NAME,
        exist_ok=True,
        repo_type="model"
    )
    print(f"Repository created/exists: {repo_url}")
except Exception as e:
    print(f"Repository creation error: {e}")

# Upload model files if they exist
if os.path.exists("./llava_openvino_model"):
    print("Uploading model files...")
    api.upload_folder(
        folder_path="./llava_openvino_model",
        repo_id=REPO_NAME,
        repo_type="model"
    )
    
    # Upload README
    if os.path.exists("README.md"):
        api.upload_file(
            path_or_fileobj="README.md",
            path_in_repo="README.md",
            repo_id=REPO_NAME,
            repo_type="model"
        )
    
    print(f"✅ Model uploaded successfully!")
    print(f"🔗 View your model at: https://huggingface.co/{api.whoami()['name']}/{REPO_NAME}")
else:
    print("❌ Model directory './llava_openvino_model' not found.")
    print("Run the first cell to save the model first.")


Go to: https://huggingface.co/settings/tokens
Create a new token with WRITE permissions

Repository created/exists: https://huggingface.co/ezelanza/llava-next-video-openvino
Uploading model files...


openvino_language_model.bin:   0%|          | 0.00/26.2G [00:00<?, ?B/s]
[A

[A[A



[A[A[A[A


openvino_vision_resampler_model.bin: 100%|██████████| 100/100 [00:00<00:00, 1.85kB/s]
openvino_language_model.bin:   0%|          | 9.37M/26.2G [00:00<04:39, 93.6MB/s]
[A

[A[A

[A[A
[A

[A[A
openvino_language_model.bin:   0%|          | 26.2M/26.2G [00:00<12:48, 34.0MB/s]

[A[A
openvino_language_model.bin:   0%|          | 52.4M/26.2G [00:01<07:52, 55.3MB/s]

[A[A
openvino_language_model.bin:   0%|          | 78.5M/26.2G [00:01<06:22, 68.3MB/s]

[A[A
openvino_multi_modal_projector_model.bin: 100%|██████████| 83.9M/83.9M [00:01<00:00, 57.0MB/s]


openvino_language_model.bin:   0%|          | 105M/26.2G [00:01<05:46, 75.3MB/s] 
[A

[A[A
openvino_language_model.bin:   1%|          | 131M/26.2G [00:01<05:46, 75.0MB/s]

openvino_language_model.bin:   1%|          | 155M/26.2G [00:02<04:32, 95.6MB/s]
[A

openvino_language_model.bin:   1%|          | 168M/26.2G [00:02<05:2

✅ Model uploaded successfully!
🔗 View your model at: https://huggingface.co/ezelanza/ezelanza/llava-next-video-openvino


# Load the model 

In [None]:
from optimum.intel.openvino import OVModelForVisualCausalLM
from transformers import LlavaNextVideoProcessor
from huggingface_hub import login
import getpass

print("Go to: https://huggingface.co/settings/tokens")
print("Create a new token with WRITE permissions")
print()

token = getpass.getpass("Enter your HF token: ")
login(token=token)
model_id = "ezelanza/llava-next-video-openvino"


model = OVModelForVisualCausalLM.from_pretrained(model_id)
processor = LlavaNextVideoProcessor.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf")


Go to: https://huggingface.co/settings/tokens
Create a new token with WRITE permissions



You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


# Optimize it

In [6]:
from optimum.intel import OVQuantizationConfig, OVWeightQuantizationConfig, OVPipelineQuantizationConfig

dataset, num_samples = "contextual", 50

# weight-only 8bit
woq_8bit = OVWeightQuantizationConfig(bits=8)

# weight-only 4bit
woq_4bit = OVWeightQuantizationConfig(bits=4, group_size=16)

# static quantization
static_8bit = OVQuantizationConfig(bits=8, dataset=dataset, num_samples=num_samples)

# pipeline quantization: applying different quantization on each components
ppl_q = OVPipelineQuantizationConfig(
    quantization_configs={
        "lm_model": OVQuantizationConfig(bits=8),
        "multimodal_model": OVWeightQuantizationConfig(bits=8),
        "text_embeddings_model": OVWeightQuantizationConfig(bits=8),
        "vision_embeddings_model": OVWeightQuantizationConfig(bits=8),
        "vision_model": OVWeightQuantizationConfig(bits=8) 
    },
    dataset=dataset,
    num_samples=num_samples,
)

In [7]:
from optimum.intel import OVModelForVisualCausalLM, OVWeightQuantizationConfig

model_id = "ezelanza/llava-next-video-openvino"

q_model = OVModelForVisualCausalLM.from_pretrained(model_id, quantization_config=woq_8bit)
int8_model_path = "llava_next_video_int8"
q_model.save_pretrained(int8_model_path)

INFO:nncf:Statistics of the bitwidth distribution:
┍━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑
│ Weight compression mode   │ % all parameters (layers)   │ % ratio-defining parameters (layers)   │
┝━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥
│ int8_asym                 │ 100% (225 / 225)            │ 100% (225 / 225)                       │
┕━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙


INFO:nncf:Statistics of the bitwidth distribution:
┍━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑
│ Weight compression mode   │ % all parameters (layers)   │ % ratio-defining parameters (layers)   │
┝━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥
│ int8_sym                  │ 100% (1 / 1)                │ 100% (1 / 1)                           │
┕━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙


INFO:nncf:Statistics of the bitwidth distribution:
┍━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑
│ Weight compression mode   │ % all parameters (layers)   │ % ratio-defining parameters (layers)   │
┝━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥
│ int8_sym                  │ 100% (139 / 139)            │ 100% (139 / 139)                       │
┕━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙


Python(44709) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(44710) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(44711) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


INFO:nncf:Statistics of the bitwidth distribution:
┍━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑
│ Weight compression mode   │ % all parameters (layers)   │ % ratio-defining parameters (layers)   │
┝━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥
┕━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙


Python(44712) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(44713) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(44714) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(44715) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(44716) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(44717) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(44718) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(44719) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(44720) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(44721) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(44722) Malloc

INFO:nncf:Statistics of the bitwidth distribution:
┍━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑
│ Weight compression mode   │ % all parameters (layers)   │ % ratio-defining parameters (layers)   │
┝━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥
│ int8_sym                  │ 100% (2 / 2)                │ 100% (2 / 2)                           │
┕━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙


Python(44724) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(44725) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(44726) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Python(44727) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


In [11]:
# Upload model to Hugging Face Hub

from huggingface_hub import HfApi
import os
# Login to Hugging Face

from huggingface_hub import login
import getpass

print("Go to: https://huggingface.co/settings/tokens")
print("Create a new token with WRITE permissions")
print()

token = getpass.getpass("Enter your HF token: ")
login(token=token)

# Configuration - UPDATE THESE VALUES
REPO_NAME = "ezelanza/llava-next-video-openvino-int8"  # Your desired repo name
# The username will be automatically detected from your login

api = HfApi()

# Create repository
try:
    repo_url = api.create_repo(
        repo_id=REPO_NAME,
        exist_ok=True,
        repo_type="model"
    )
    print(f"Repository created/exists: {repo_url}")
except Exception as e:
    print(f"Repository creation error: {e}")

# Upload model files if they exist
if os.path.exists("./llava_next_video_int8"):
    print("Uploading model files...")
    api.upload_folder(
        folder_path="./llava_next_video_int8",
        repo_id=REPO_NAME,
        repo_type="model"
    )
    
    # Upload README
    if os.path.exists("README.md"):
        api.upload_file(
            path_or_fileobj="README.md",
            path_in_repo="README.md",
            repo_id=REPO_NAME,
            repo_type="model"
        )
    
    print(f"✅ Model uploaded successfully!")
    print(f"🔗 View your model at: https://huggingface.co/{api.whoami()['name']}/{REPO_NAME}")
else:
    print("❌ Model directory './llava_openvino_model' not found.")
    print("Run the first cell to save the model first.")

Go to: https://huggingface.co/settings/tokens
Create a new token with WRITE permissions

Repository created/exists: https://huggingface.co/ezelanza/llava-next-video-openvino-int8
Uploading model files...


openvino_multi_modal_projector_model.bin:   0%|          | 0.00/21.0M [00:00<?, ?B/s]
[A

[A[A


[A[A[A



[A[A[A[A'(MaxRetryError('HTTPSConnectionPool(host=\'hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com\', port=443): Max retries exceeded with url: /repos/60/90/6090cc385ffa8242599cb9933ddf89c49f0a95df92db8c4cb15f904cbf37cfa3/0e6fb96fcab98773f613fce6bc690e2c43dc5d7e0bef3bdfe207d9431d5b4480?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=AKIA2JU7TKAQLC2QXPN7%2F20250725%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250725T141644Z&X-Amz-Expires=86400&X-Amz-Signature=20006e0121fff48b85bd1a3ce0e5a655505162d03214475a1fdec4d4d9d4eed7&X-Amz-SignedHeaders=host&partNumber=1&uploadId=ygZb40kTtVIqmuflcTHo6zIhCbPRhzCVz84KiAdZ3XPMLpdjBu8LDcf0Yve.xTDtAVAUtbUxBKgisnbkNIsts5VfnMSQo3oP_a9Hak.yYjdP9F7q8BtjKQSudB30_ffT&x-id=UploadPart (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x709b6762d0>: Failed to resolve \'hf-hub-l

✅ Model uploaded successfully!
🔗 View your model at: https://huggingface.co/ezelanza/ezelanza/llava-next-video-openvino-int8


# Run inference

In [1]:
from huggingface_hub import hf_hub_download 
from transformers import LlavaNextVideoProcessor
from optimum.intel.openvino import OVModelForVisualCausalLM
from optimum.intel.openvino import OVModelForVisualCausalLM
from transformers import LlavaNextVideoProcessor
from huggingface_hub import login
import getpass

print("Go to: https://huggingface.co/settings/tokens")
print("Create a new token with WRITE permissions")
print()

token = getpass.getpass("Enter your HF token: ")
login(token=token)

#load model in memory
model_id = "ezelanza/llava-next-video-openvino-int8"

video_path = hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset")

conversation = [
    {

        "role": "user",
        "content": [
            {"type": "text", "text": "What is happening in the video?"},
            {"type": "video", "path": video_path},
            ],
    },
]
processor = LlavaNextVideoProcessor.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf")
model = OVModelForVisualCausalLM.from_pretrained(model_id)


inputs = processor.apply_chat_template(
    conversation,
    num_frames=4,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True
)


  from .autonotebook import tqdm as notebook_tqdm


Go to: https://huggingface.co/settings/tokens
Create a new token with WRITE permissions



You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Unused or unrecognized kwargs: return_tensors.


In [9]:
output = model.generate(**inputs, max_new_tokens=60)
    
response = processor.batch_decode(
        output,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True
    )[0]
    
print(f"MODEL OUTPUT (video frames): {response}")
    
if "ASSISTANT:" in response:
        description = response.split("ASSISTANT:")[-1].strip()
else:
        description = response.strip()
    
print(f"CAPTION GENERATED (video frames): {description}")

MODEL OUTPUT (video frames): USER: 
What is happening in the video? ASSISTANT: In the video, we see a young child sitting on a bed, wearing glasses and engrossed in reading a book. The child appears to be focused on the book, possibly reading or looking at the pictures. The room has a cozy and lived-in feel, with various items
CAPTION GENERATED (video frames): In the video, we see a young child sitting on a bed, wearing glasses and engrossed in reading a book. The child appears to be focused on the book, possibly reading or looking at the pictures. The room has a cozy and lived-in feel, with various items


# Extract from video

In [10]:
import cv2
import numpy as np
from pathlib import Path
import time
from transformers import LlavaNextVideoProcessor
from optimum.intel.openvino import OVModelForVisualCausalLM

def extract_video_frames(video_path, num_frames=4, width=320, height=240):
    """Extract evenly spaced frames from a video file."""
    cap = cv2.VideoCapture(video_path)
    
    if not cap.isOpened():
        print("Error: Could not open video file")
        return []
    
    total_video_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    duration = total_video_frames / fps
    
    print(f"Video info: {total_video_frames} frames, {fps:.1f} FPS, {duration:.1f} seconds")
    
    # Calculate frame indices to extract (evenly spaced)
    frame_indices = np.linspace(0, total_video_frames-1, num_frames, dtype=int)
    
    frames = []
    for i, frame_idx in enumerate(frame_indices):
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
        ret, frame = cap.read()
        if ret:
            # Resize frame to reduce processing time
            frame_resized = cv2.resize(frame, (width, height))
            # Convert BGR to RGB
            frame_rgb = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB)
            frames.append(frame_rgb)
            print(f"Extracted frame {i+1}/{num_frames} at frame {frame_idx}")
    
    cap.release()
    return frames

In [11]:
    # Extract frames from video
frames = extract_video_frames(video_path, num_frames=4, width=320, height=240)
    
# Save frames as temporary images
frame_paths = []
for i, frame in enumerate(frames):
    frame_path = f"video_frame_{i}.jpg"
    cv2.imwrite(frame_path, cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
    frame_paths.append(Path(frame_path))
    print(f"Saved frame {i+1} as {frame_path}")


Video info: 243 frames, 25.0 FPS, 9.7 seconds
Extracted frame 1/4 at frame 0
Extracted frame 2/4 at frame 80
Extracted frame 3/4 at frame 161
Extracted frame 4/4 at frame 242
Saved frame 1 as video_frame_0.jpg
Saved frame 2 as video_frame_1.jpg
Saved frame 3 as video_frame_2.jpg
Saved frame 4 as video_frame_3.jpg


In [12]:
# Use frames as images in conversation
processor = LlavaNextVideoProcessor.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf")
model = OVModelForVisualCausalLM.from_pretrained(model_id)

    
conversation_with_frames = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe what you see in these images. What is happening?"},
                *[{"type": "image", "image": path.as_posix()} for path in frame_paths],
            ],
        },
    ]
    
    # Process with the same model and processor
inputs_with_frames = processor.apply_chat_template(
        conversation_with_frames,
        num_frames=4,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True
    )
    
    # Generate response
out_with_frames = model.generate(**inputs_with_frames, max_new_tokens=60)
    
response_with_frames = processor.batch_decode(
        out_with_frames,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True
    )[0]
    
print(f"MODEL OUTPUT (video frames): {response_with_frames}")
    
if "ASSISTANT:" in response_with_frames:
        description_with_frames = response_with_frames.split("ASSISTANT:")[-1].strip()
else:
        description_with_frames = response_with_frames.strip()
    
print(f"CAPTION GENERATED (video frames): {description_with_frames}")

MODEL OUTPUT (video frames): USER: 



Describe what you see in these images. What is happening? ASSISTANT: In the image, there is a young child sitting on a bed, engrossed in reading a book. The child is wearing a light blue sleeveless top and glasses, and appears to be focused on the book in their hands. The bed has a patterned blanket,
CAPTION GENERATED (video frames): In the image, there is a young child sitting on a bed, engrossed in reading a book. The child is wearing a light blue sleeveless top and glasses, and appears to be focused on the book in their hands. The bed has a patterned blanket,


# EXtract from webcam

In [1]:
from huggingface_hub import hf_hub_download 
from transformers import LlavaNextVideoProcessor
from optimum.intel.openvino import OVModelForVisualCausalLM
from optimum.intel.openvino import OVModelForVisualCausalLM
from transformers import LlavaNextVideoProcessor
from huggingface_hub import login
import getpass

print("Go to: https://huggingface.co/settings/tokens")
print("Create a new token with WRITE permissions")
print()

token = getpass.getpass("Enter your HF token: ")
login(token=token)

#load model in memory
model_id = "ezelanza/llava-next-video-openvino-int8"

  from .autonotebook import tqdm as notebook_tqdm


Go to: https://huggingface.co/settings/tokens
Create a new token with WRITE permissions



In [2]:
import cv2
import numpy as np
from pathlib import Path
import time
from transformers import LlavaNextVideoProcessor
from optimum.intel.openvino import OVModelForVisualCausalLM

def capture_webcam_frames(duration_seconds=3, fps=4, width=320, height=240):
    """Capture frames from webcam for a specified duration automatically."""
    cap = cv2.VideoCapture(0)  # 0 for default webcam
    
    if not cap.isOpened():
        print("Error: Could not open webcam")
        return []
    
    # Set resolution to reduce processing time
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, 320)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, height)
    
    # Verify the resolution was set
    actual_width = cap.get(cv2.CAP_PROP_FRAME_WIDTH)
    actual_height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)
    print(f"Webcam resolution set to: {actual_width}x{actual_height}")

    frames = []
    total_frames = duration_seconds * fps
    frame_interval = 1.0 / fps  # Time between frames
    
    print(f"Capturing {total_frames} frames over {duration_seconds} seconds...")
    print("Starting in 3 seconds...")
    
    # Countdown
    for i in range(3, 0, -1):
        print(f"{i}...")
        time.sleep(1)
    
    print("Starting frame capture...")
    start_time = time.time()
    
    for i in range(total_frames):
        ret, frame = cap.read()
        if ret:
            frame_resized = cv2.resize(frame, (width, height))

            # Convert BGR to RGB
            frame_rgb = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB)
            frames.append(frame_rgb)
            
            elapsed_time = time.time() - start_time
            print(f"Captured frame {i+1}/{total_frames} at {elapsed_time:.1f}s")
            
            # Wait for next frame time
            if i < total_frames - 1:  # Don't wait after the last frame
                time.sleep(frame_interval)
    
    cap.release()
    print("Capture complete!")
    return frames

In [3]:
import cv2
from pathlib import Path


frames = capture_webcam_frames(duration_seconds=3,fps=4)

    # Save frames as temporary images
frame_paths = []
for i, frame in enumerate(frames):
        frame_path = f"webcam_frame_{i}.jpg"
        cv2.imwrite(frame_path, cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
        frame_paths.append(Path(frame_path))
        print(f"Saved frame {i+1} as {frame_path}")
    
conversation = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this video in one short sentence. What do you see happening? Describe it as if you were a person seeing it avoid saying 'IN the video'"},
                *[{"type": "image", "image": path.as_posix()} for path in frame_paths],
            ],
        },
    ]

Webcam resolution set to: 320.0x240.0
Capturing 12 frames over 3 seconds...
Starting in 3 seconds...
3...
2...
1...
Starting frame capture...
Captured frame 1/12 at 0.2s
Captured frame 2/12 at 0.4s
Captured frame 3/12 at 0.7s
Captured frame 4/12 at 1.0s
Captured frame 5/12 at 1.2s
Captured frame 6/12 at 1.5s
Captured frame 7/12 at 1.7s
Captured frame 8/12 at 2.0s
Captured frame 9/12 at 2.2s
Captured frame 10/12 at 2.5s
Captured frame 11/12 at 2.7s
Captured frame 12/12 at 3.0s
Capture complete!
Saved frame 1 as webcam_frame_0.jpg
Saved frame 2 as webcam_frame_1.jpg
Saved frame 3 as webcam_frame_2.jpg
Saved frame 4 as webcam_frame_3.jpg
Saved frame 5 as webcam_frame_4.jpg
Saved frame 6 as webcam_frame_5.jpg
Saved frame 7 as webcam_frame_6.jpg
Saved frame 8 as webcam_frame_7.jpg
Saved frame 9 as webcam_frame_8.jpg
Saved frame 10 as webcam_frame_9.jpg
Saved frame 11 as webcam_frame_10.jpg
Saved frame 12 as webcam_frame_11.jpg


In [None]:
processor = LlavaNextVideoProcessor.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf")
model = OVModelForVisualCausalLM.from_pretrained(model_id)


inputs = processor.apply_chat_template(
    conversation,
    num_frames=4,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True
)

out = model.generate(**inputs, max_new_tokens=60)

response = processor.batch_decode(
                    out,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=True
                )[0]

if "ASSISTANT:" in response:
    description = response.split("ASSISTANT:")[-1].strip()
else:
    # If no ASSISTANT marker, use the full response
    description = response.strip()

print(f"CAPTION GENERATED: {description}")

You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Token indices sequence length is longer than the specified maximum sequence length for this model (16471 > 10250). Running this sequence through the model will result in indexing errors
