In [1]:
"""
Single-Movie AD Pipeline with Groq Multimodal Integration (Llama Guard 4 12B)
--------------------------------------------------------------------------

This version adds direct Groq API integration for multimodal inference using the model
`meta-llama/llama-guard-4-12b`.

The flow remains the same:
1. Load single movie (.h5)
2. Segment into shots
3. Aggregate shot embeddings
4. Retrieve IMDb-based character context (stub)
5. Send frame(s) + prompt to Groq multimodal model
6. Summarize into AD text

Author: Nikhil Andrew Franco (Groq-integrated prototype)
"""

import os
import h5py
import numpy as np
import chromadb
from typing import Dict, List, Tuple
import pickle
from dotenv import load_dotenv

# Optional visual helpers
try:
    import matplotlib.pyplot as plt
except Exception:
    plt = None



# ---------------------------
# I/O utilities
# ---------------------------


In [2]:

def load_h5_embeddings(h5_path: str, movie_key: str) -> np.ndarray:
    with h5py.File(h5_path, "r") as f:
        if movie_key not in f:
            raise KeyError(f"Movie {movie_key} not found in {h5_path}")
        data = np.array(f[movie_key]).astype(np.float32)
    return data

def normalize_rows(x: np.ndarray, eps: float = 1e-8) -> np.ndarray:
    norms = np.linalg.norm(x, axis=1, keepdims=True)
    return x / np.maximum(norms, eps)



# ---------------------------
# Shot segmentation
# ---------------------------


In [3]:

def detect_shots_by_similarity(emb: np.ndarray, sim_threshold: float = 0.80, min_len: int = 3) -> List[Tuple[int,int]]:
    emb = normalize_rows(emb)
    sims = (emb[:-1] * emb[1:]).sum(axis=1)
    cut_idx = np.where(sims < sim_threshold)[0]
    cuts = [0]
    for i in cut_idx:
        if i + 1 - cuts[-1] >= min_len:
            cuts.append(i + 1)
    if cuts[-1] != len(emb):
        cuts.append(len(emb))
    return [(cuts[i], cuts[i+1]) for i in range(len(cuts)-1)]

def aggregate_shots(emb: np.ndarray, shots: List[Tuple[int,int]]) -> np.ndarray:
    pooled = [emb[s:e].mean(axis=0) for s, e in shots]
    return normalize_rows(np.vstack(pooled).astype(np.float32))



# ---------------------------
# IMDb Character Bank (stub)
# ---------------------------

In [4]:


def recognize_characters_stub(movie_name: str, shot_idx: int) -> List[str]:
    if "American_Beauty" in movie_name:
        return ["Lester Burnham", "Angela Hayes"] if shot_idx % 3 == 0 else ["Lester Burnham"]
    return []



# ---------------------------
# Groq Multimodal LLM Integration (meta-llama/llama-guard-4-12b)
# ---------------------------


In [17]:

def generate_dense_description_groq(frame_paths: List[str], characters: List[str]) -> str:
    """Send frame(s) + contextual prompt to Groq multimodal model (Llama Guard 4 12B)."""
    load_dotenv()
    api_key = os.getenv("GROQ_API_KEY")
    if not api_key:
        raise EnvironmentError("Set GROQ_API_KEY in your environment or .env file.")

    try:
        from groq import Groq
    except ImportError:
        raise ImportError("Install Groq SDK with: pip install groq")

    client = Groq(api_key=api_key)
    image_url = frame_paths[0] if frame_paths else None

    messages = [
        {"role": "system", "content": "You are an assistant that generates rich scene descriptions for BLV users."},
        {"role": "user", "content": [
            {"type": "text", "text": f"Describe this movie shot vividly but concisely. Characters visible: {', '.join(characters) or 'Unknown'}."},
        ]}
    ]

    if image_url:
        messages[1]["content"].append({"type": "image", "image_url": image_url})

    resp = client.chat.completions.create(
        model="meta-llama/llama-4-scout-17b-16e-instruct",
        messages=messages,
        temperature=0.7,
        max_tokens=300
    )

    return resp.choices[0].message.get("content", "[No response from Groq API]")



# ---------------------------
# Summarization (text-only placeholder)
# ---------------------------


In [6]:

def summarize_to_AD(dense_text: str) -> str:
    return f"Audio Description: {dense_text.strip().capitalize()}"



# ---------------------------
# ChromaDB storage + retrieval
# ---------------------------


In [14]:


def store_in_chroma(movie_key: str, pooled: np.ndarray, shots: List[Tuple[int,int]]):
    client = chromadb.Client()
    collection = client.get_or_create_collection(name="movie_shots")


    collection.add(
        embeddings=pooled.tolist(),
        ids=[f"{movie_key}_{i}" for i in range(len(pooled))],
        metadatas=[
            {"movie": movie_key, "start_frame": int(s), "end_frame": int(e)}
            for i, (s, e) in enumerate(shots)
        ]
    )


    return collection


def query_chroma(collection, query_vec: np.ndarray, k: int = 5):
    results = collection.query(query_embeddings=[query_vec.tolist()], n_results=k)
    return results

# ---------------------------
# Main runner
# ---------------------------

In [18]:

if __name__ == "__main__":
    H5_PATH = r"C:\Users\nikhi\projects\AI-Video-Describer\MAD\features\CLIP_L14_frames_features_5fps.h5"
    MOVIE_KEY = "0001_American_Beauty"

    emb = load_h5_embeddings(H5_PATH, MOVIE_KEY)
    print(f"Loaded {MOVIE_KEY}: {emb.shape[0]} frames, {emb.shape[1]} dims")

    shots = detect_shots_by_similarity(emb)
    pooled = aggregate_shots(emb, shots)
    print(f"Detected {len(shots)} shots.")

    # Store embeddings in ChromaDB
    collection = store_in_chroma(MOVIE_KEY, pooled, shots)
    print(f"Stored {len(pooled)} shot vectors in ChromaDB.")

    # Example retrieval
    results = query_chroma(collection, pooled[0], k=3)
    print("\nNearest shots:")
    print(results)

    # Run first few shots through Groq LLM
    for i, (s, e) in enumerate(shots[:2]):
        characters = recognize_characters_stub(MOVIE_KEY, i)
        frame_stub = [f"frame_{s}.jpg"]  # replace with actual frame paths
        dense_desc = generate_dense_description_groq(frame_stub, characters)
        ad_text = summarize_to_AD(dense_desc)
        print(f"\nShot {i+1} ({s}-{e}) -> {ad_text}")

    print("\nGroq-integrated pipeline completed for single movie.")

Loaded 0001_American_Beauty: 36502 frames, 768 dims
Detected 989 shots.
Stored 989 shot vectors in ChromaDB.

Nearest shots:
{'ids': [['0001_American_Beauty_0', '0001_American_Beauty_975', '0001_American_Beauty_8']], 'embeddings': None, 'documents': [[None, None, None]], 'uris': None, 'included': ['metadatas', 'documents', 'distances'], 'data': None, 'metadatas': [[{'start_frame': 0, 'end_frame': 20, 'movie': '0001_American_Beauty'}, {'end_frame': 35276, 'start_frame': 35270, 'movie': '0001_American_Beauty'}, {'movie': '0001_American_Beauty', 'end_frame': 341, 'start_frame': 336}]], 'distances': [[0.0, 0.0008734691655263305, 0.0008986546308733523]]}


BadRequestError: Error code: 400 - {'error': {'message': "'messages.1' : for 'role:user' the following must be satisfied[('messages.1.content' : one of the following must be satisfied[('messages.1.content' : value must be a string) OR ('messages.1.content.1' : one of the following must be satisfied[('messages.1.content.1.type' : value is not one of the allowed values ['text']) OR ('messages.1.content.1.image_url' : value must be an object)])])]", 'type': 'invalid_request_error'}}