<a href="https://colab.research.google.com/github/daisysong76/AI--Machine--learning/blob/main/Video_Search_and_Retrieval_System_with_Gemini%2BLLama_VLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# pip install google-generativeai transformers torch torchvision faiss-cpu opencv-python Pillow tqdm google-cloud-vision

#gemini_api_key = "your_gemini_api_key"
#search_engine = VideoSearchEngine(
#    gemini_api_key=gemini_api_key,
#    llama_model_path="path_to_llama_model"
#)

import cv2
import torch
import numpy as np
from PIL import Image
from pathlib import Path
from typing import List, Tuple, Dict, Optional, NamedTuple, Union
from dataclasses import dataclass
import torch.nn.functional as F
from transformers import AutoProcessor, AutoModelForVision2Seq
import google.generativeai as genai
from google.cloud import vision
import faiss
import logging
from datetime import datetime
import json
import concurrent.futures
from tqdm import tqdm
import base64
import io

# Configure logging
# basicConfig sets up the root logger at INFO level as an import-time side effect;
# every class in this file reports errors through the module-level `logger`.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@dataclass
class VLMFeature:
    """Feature record produced for one image by one VLM extractor.

    Bundles the embedding vector with the generated text description, a
    confidence value and a tag naming the model that produced it.
    """
    # Embedding vector; the vector database reshapes it to (1, -1) before use.
    embedding: np.ndarray
    # Text description generated by the model for the image.
    description: str
    # Confidence value supplied by the extractor (extractors fall back to 0.9).
    confidence: float
    model_source: str  # 'gemini', 'llama', or 'ensemble'

@dataclass
class VideoFrame:
    """One sampled video frame together with the VLM features extracted from it."""
    # Identifier built by the search engine as "<video file stem>_<frame index>".
    frame_id: str
    # Path of the source video, as passed to the search engine.
    video_path: str
    # Position of the frame in the video, computed as frame index / fps.
    timestamp: float
    # Features extracted for this frame (one per model, plus an optional ensemble).
    features: List[VLMFeature]
    # Optional pixel data; nothing in this file populates it.
    raw_frame: Optional[np.ndarray] = None
    # Frame size in pixels, taken from the decoded frame's shape.
    width: int = 0
    height: int = 0

class GeminiVisionExtractor:
    """Handles feature extraction using Google's Gemini Vision model.

    The vision model produces a text description of the image; that
    description is then embedded with a Gemini text-embedding model, because
    the generative model itself does not return embeddings.

    Args:
        api_key: Gemini API key passed to ``genai.configure``.
        model_name: Generative (vision) model used for the description.
        embedding_model: Text-embedding model used to embed the description.
    """

    def __init__(self,
                 api_key: str,
                 model_name: str = 'gemini-pro-vision',
                 embedding_model: str = 'models/embedding-001'):
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel(model_name)
        self.embedding_model = embedding_model

    def _encode_image(self, image: Image.Image) -> str:
        """Convert PIL Image to a base64-encoded JPEG string.

        Kept for callers that need a serialised image; ``extract_features``
        hands the PIL image to the SDK directly.
        """
        # JPEG cannot store alpha or palette data, so normalise the mode first.
        if image.mode not in ("RGB", "L"):
            image = image.convert("RGB")
        buffered = io.BytesIO()
        image.save(buffered, format="JPEG")
        return base64.b64encode(buffered.getvalue()).decode()

    @staticmethod
    def _candidate_confidence(response, default: float = 0.9) -> float:
        """Return the first candidate's ``score`` if the response has one, else ``default``."""
        candidates = getattr(response, 'candidates', None)
        if not candidates:
            return default
        # Not every response type carries a score; a missing field must not
        # fail the whole extraction.
        score = getattr(candidates[0], 'score', None)
        return float(score) if score is not None else default

    async def extract_features(self, image: Image.Image) -> Optional[VLMFeature]:
        """Extract features using Gemini Vision.

        Args:
            image: Image to describe and embed.

        Returns:
            A ``VLMFeature`` tagged ``'gemini'``, or ``None`` if any API call
            fails (the error is logged).
        """
        try:
            # Generate description using Gemini
            prompt = """Analyze this image in detail. Focus on:
            1. Key objects and their relationships
            2. Actions or processes being shown
            3. Technical or industrial elements if present
            4. Spatial relationships and layout
            Provide a structured, detailed response."""

            # Pass the PIL image itself: a base64 *string* would be treated as
            # plain text by the SDK rather than as image content.
            response = await self.model.generate_content_async([prompt, image])
            description = response.text

            # Embeddings come from the module-level embedding API, which
            # returns a dict holding the vector under the "embedding" key.
            embed_response = await genai.embed_content_async(
                model=self.embedding_model,
                content=description,
            )
            embedding = np.asarray(embed_response["embedding"], dtype=np.float32)

            return VLMFeature(
                embedding=embedding,
                description=description,
                confidence=self._candidate_confidence(response),
                model_source='gemini'
            )

        except Exception as e:
            # Best-effort extraction: log with traceback and let the caller
            # continue with whatever other models succeeded.
            logger.exception(f"Error in Gemini feature extraction: {str(e)}")
            return None

class LlamaVLMExtractor:
    """Handles feature extraction using Llama-VLM.

    Loads a Hugging Face vision-to-sequence model in float16 and uses it both
    to caption an image and to produce an image embedding.

    Args:
        model_path: Hub id or local path given to ``from_pretrained``.
    """

    def __init__(self, model_path: str = "llama-vl-2"):
        self.processor = AutoProcessor.from_pretrained(model_path)
        self.model = AutoModelForVision2Seq.from_pretrained(
            model_path,
            torch_dtype=torch.float16,
            device_map="auto"
        )

    def extract_features(self, image: Image.Image) -> Optional[VLMFeature]:
        """Extract features using Llama-VLM.

        Args:
            image: Image to caption and embed.

        Returns:
            A ``VLMFeature`` tagged ``'llama'`` with a float32, L2-normalised
            embedding, or ``None`` if inference fails (the error is logged).
        """
        try:
            # Prepare inputs. Floating-point tensors are cast to the model's
            # dtype so float32 pixel values are not fed to float16 weights.
            inputs = self.processor(images=image, return_tensors="pt").to(
                self.model.device, self.model.dtype
            )

            # Generate visual features and description
            with torch.no_grad():
                # A dict-style output is requested so beam scores are actually
                # available; `temperature` is omitted because it only applies
                # when sampling and this is deterministic beam search.
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=100,
                    num_beams=5,
                    return_dict_in_generate=True,
                    output_scores=True
                )

                # Pool image features in float32 (half-precision means lose
                # accuracy, and FAISS only accepts float32 vectors).
                # NOTE(review): assumes get_image_features returns a
                # (batch, tokens, dim) tensor - confirm for the chosen model.
                hidden_states = self.model.get_image_features(**inputs)
                embedding = F.normalize(hidden_states.float().mean(dim=1), dim=-1)

            # Decode the description
            sequences = getattr(outputs, 'sequences', outputs)
            description = self.processor.batch_decode(sequences, skip_special_tokens=True)[0]

            # Beam scores are log-probabilities; exponentiate so the value is
            # comparable with other extractors' [0, 1] confidences.
            beam_scores = getattr(outputs, 'sequences_scores', None)
            if beam_scores is not None:
                confidence = torch.exp(beam_scores[0].float()).clamp(max=1.0).item()
            else:
                confidence = 0.9

            return VLMFeature(
                embedding=embedding.cpu().numpy()[0],
                description=description,
                confidence=confidence,
                model_source='llama'
            )

        except Exception as e:
            # Best-effort extraction: log with traceback and signal failure.
            logger.exception(f"Error in Llama-VLM feature extraction: {str(e)}")
            return None

class EnsembleVLMExtractor:
    """Combines features from multiple VLM models.

    Runs the Gemini and Llama extractors on the same image and, when both
    succeed with embeddings of the same shape, adds a third averaged
    'ensemble' feature.

    Args:
        gemini_api_key: API key forwarded to ``GeminiVisionExtractor``.
        llama_model_path: Model path forwarded to ``LlamaVLMExtractor``.
    """

    def __init__(self, gemini_api_key: str, llama_model_path: str = "llama-vl-2"):
        self.gemini_extractor = GeminiVisionExtractor(api_key=gemini_api_key)
        self.llama_extractor = LlamaVLMExtractor(model_path=llama_model_path)

    @staticmethod
    def _unit(vector: np.ndarray) -> np.ndarray:
        """Return ``vector`` as float32 scaled to unit L2 norm (zero vectors unchanged)."""
        vector = np.asarray(vector, dtype=np.float32)
        norm = np.linalg.norm(vector)
        return vector / norm if norm > 0 else vector

    def _combine(self, gemini_feature: VLMFeature, llama_feature: VLMFeature) -> Optional[VLMFeature]:
        """Build the ensemble feature, or return None if the embeddings cannot be averaged."""
        gemini_vec = np.asarray(gemini_feature.embedding)
        llama_vec = np.asarray(llama_feature.embedding)

        # The two models generally emit vectors of different sizes; averaging
        # those would raise (or silently build an object array), so skip.
        if gemini_vec.shape != llama_vec.shape:
            logger.warning(
                f"Skipping ensemble feature: embedding shapes differ "
                f"({gemini_vec.shape} vs {llama_vec.shape})"
            )
            return None

        # Normalise each vector first so neither model dominates the mean,
        # then re-normalise because the mean of unit vectors is not unit length.
        combined_embedding = self._unit(
            np.mean([self._unit(gemini_vec), self._unit(llama_vec)], axis=0)
        )
        combined_description = (
            f"Gemini: {gemini_feature.description}\nLlama: {llama_feature.description}"
        )

        return VLMFeature(
            embedding=combined_embedding,
            description=combined_description,
            confidence=float(np.mean([gemini_feature.confidence, llama_feature.confidence])),
            model_source='ensemble'
        )

    async def extract_features(self, image: Image.Image) -> List[VLMFeature]:
        """Extract features using both models and combine results.

        Args:
            image: Image to run through both extractors.

        Returns:
            The features that succeeded, in the order Gemini, Llama, ensemble.
            The list is empty if both extractors fail; the ensemble entry is
            present only when both succeed with same-shaped embeddings.
        """
        features: List[VLMFeature] = []

        # Get Gemini features
        gemini_feature = await self.gemini_extractor.extract_features(image)
        if gemini_feature is not None:
            features.append(gemini_feature)

        # Get Llama features
        llama_feature = self.llama_extractor.extract_features(image)
        if llama_feature is not None:
            features.append(llama_feature)

        # Create ensemble feature if both extractions succeeded
        if gemini_feature is not None and llama_feature is not None:
            ensemble_feature = self._combine(gemini_feature, llama_feature)
            if ensemble_feature is not None:
                features.append(ensemble_feature)

        return features

class VLMVectorDatabase:
    """Manages vector database for VLM features.

    Wraps a flat inner-product FAISS index and keeps, in insertion order, the
    (frame, feature) pair behind every indexed vector.

    Args:
        dimension: Length every stored and queried vector must have.
        normalize: When True (default), vectors are scaled to unit L2 norm
            before indexing and searching, so inner-product scores are cosine
            similarities and ``min_similarity`` is a cosine threshold.
    """

    def __init__(self, dimension: int = 768, normalize: bool = True):  # Adjust dimension based on model output
        self.dimension = dimension
        self.normalize = normalize
        self.index = faiss.IndexFlatIP(dimension)
        self.frame_metadata: List[Tuple[VideoFrame, VLMFeature]] = []

    def _prepare(self, embedding: np.ndarray) -> np.ndarray:
        """Return ``embedding`` as a contiguous float32 row vector ready for FAISS.

        Raises:
            ValueError: If the vector length differs from ``self.dimension``.
        """
        # FAISS only accepts C-contiguous float32 input.
        vector = np.ascontiguousarray(embedding, dtype=np.float32).reshape(1, -1)
        if vector.shape[1] != self.dimension:
            raise ValueError(
                f"Embedding has dimension {vector.shape[1]}, expected {self.dimension}"
            )
        if self.normalize:
            norm = np.linalg.norm(vector)
            if norm > 0:
                vector = np.ascontiguousarray(vector / norm, dtype=np.float32)
        return vector

    def add_frame(self, frame: VideoFrame):
        """Add frame features to the index.

        Features whose embedding length does not match the index dimension
        are skipped with a warning so one incompatible model does not abort
        indexing of the others.
        """
        for feature in frame.features:
            try:
                vector = self._prepare(feature.embedding)
            except ValueError as e:
                logger.warning(
                    f"Skipping '{feature.model_source}' feature of frame {frame.frame_id}: {e}"
                )
                continue
            self.index.add(vector)
            # Appended right after the vector so list position == FAISS id.
            self.frame_metadata.append((frame, feature))

    def search(self,
              query_embedding: np.ndarray,
              k: int = 100,
              min_similarity: float = 0.7) -> List[Tuple[VideoFrame, VLMFeature, float]]:
        """Search for similar frames using VLM features.

        Args:
            query_embedding: Query vector of length ``self.dimension``.
            k: Maximum number of neighbours to retrieve.
            min_similarity: Minimum score a hit needs to be returned.

        Returns:
            ``(frame, feature, similarity)`` tuples, best match first.

        Raises:
            ValueError: If the query length differs from ``self.dimension``.
        """
        query_vector = self._prepare(query_embedding)

        # Nothing indexed (or nothing requested): no search needed.
        if self.index.ntotal == 0 or k <= 0:
            return []

        # Perform similarity search
        similarities, indices = self.index.search(query_vector, min(k, self.index.ntotal))

        # Filter and return results
        results = []
        for idx, similarity in zip(indices[0], similarities[0]):
            # FAISS pads missing neighbours with -1, which would otherwise
            # index the *last* metadata entry.
            if idx < 0 or idx >= len(self.frame_metadata):
                continue
            if similarity >= min_similarity:
                frame, feature = self.frame_metadata[idx]
                results.append((frame, feature, float(similarity)))

        return results

class VideoSearchEngine:
    """Main class for video search using VLM models.

    Samples frames from videos, extracts features with the ensemble
    extractor, stores them in the vector database, and answers text or image
    queries against it.

    Args:
        gemini_api_key: API key for the Gemini extractor.
        llama_model_path: Model path for the Llama extractor.
        frame_interval: Seconds of video between sampled frames.
    """

    def __init__(self,
                 gemini_api_key: str,
                 llama_model_path: str = "llama-vl-2",
                 frame_interval: float = 1.0):
        self.frame_interval = frame_interval
        self.feature_extractor = EnsembleVLMExtractor(
            gemini_api_key=gemini_api_key,
            llama_model_path=llama_model_path
        )
        self.vector_db = VLMVectorDatabase()

    async def process_video(self, video_path: str):
        """Process and index a video file.

        Every ``frame_interval`` seconds of footage one frame is converted to
        RGB, run through the feature extractor and added to the vector
        database. Videos that cannot be opened, or that report no usable
        frame rate, are logged and skipped.

        Args:
            video_path: Path of the video file to index.
        """
        logger.info(f"Processing video: {video_path}")

        cap = cv2.VideoCapture(video_path)
        try:
            if not cap.isOpened():
                logger.error(f"Could not open video: {video_path}")
                return

            fps = cap.get(cv2.CAP_PROP_FPS)
            # Written as `not fps > 0` so that NaN is rejected as well; without
            # a frame rate neither the stride nor timestamps can be computed.
            if not fps > 0:
                logger.error(f"Video reports no valid frame rate, skipping: {video_path}")
                return

            # At least 1, otherwise a short interval makes the modulo below
            # divide by zero.
            frame_skip = max(1, int(fps * self.frame_interval))

            frame_count = 0
            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                if frame_count % frame_skip == 0:
                    # Convert BGR to RGB and to PIL Image
                    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    image = Image.fromarray(frame_rgb)

                    # Extract features using ensemble of VLM models
                    features = await self.feature_extractor.extract_features(image)

                    if features:
                        video_frame = VideoFrame(
                            frame_id=f"{Path(video_path).stem}_{frame_count}",
                            video_path=video_path,
                            timestamp=frame_count / fps,
                            features=features,
                            width=frame.shape[1],
                            height=frame.shape[0]
                        )
                        self.vector_db.add_frame(video_frame)

                frame_count += 1
        finally:
            # Release the capture even if extraction or indexing raises.
            cap.release()

    async def search_by_description(self,
                                  query_text: str,
                                  min_similarity: float = 0.7) -> List[Tuple[VideoFrame, VLMFeature, float]]:
        """Search for video segments using a text description.

        Args:
            query_text: Free-text description of the content to find.
            min_similarity: Minimum similarity a hit needs to be returned.

        Returns:
            ``(frame, feature, similarity)`` tuples from the vector database.
        """
        # Embeddings are served by the module-level genai API (the generative
        # model object has no embedding method) and arrive as a dict.
        gemini_extractor = self.feature_extractor.gemini_extractor
        embedding_model = getattr(gemini_extractor, 'embedding_model', 'models/embedding-001')
        genai_embedding = await genai.embed_content_async(
            model=embedding_model,
            content=query_text,
        )
        query_embedding = np.asarray(genai_embedding["embedding"], dtype=np.float32)

        # Search the database
        return self.vector_db.search(query_embedding, min_similarity=min_similarity)

    async def search_by_image(self,
                            query_image: Image.Image,
                            min_similarity: float = 0.7) -> List[Tuple[VideoFrame, VLMFeature, float]]:
        """Search for video segments using a query image.

        The ensemble feature is used when available; otherwise the first
        feature that was extracted successfully.

        Args:
            query_image: Image whose content should be matched.
            min_similarity: Minimum similarity a hit needs to be returned.

        Returns:
            ``(frame, feature, similarity)`` tuples, or an empty list when no
            feature could be extracted from the query image.
        """
        # Extract features from query image
        query_features = await self.feature_extractor.extract_features(query_image)
        if not query_features:
            return []

        # Prefer the ensemble feature, else fall back to the first available one
        chosen = next(
            (f for f in query_features if f.model_source == 'ensemble'),
            query_features[0]
        )
        # FAISS needs float32; averaged embeddings may be float64.
        query_embedding = np.asarray(chosen.embedding, dtype=np.float32)
        return self.vector_db.search(query_embedding, min_similarity=min_similarity)

def _print_results(results) -> None:
    """Print one report block per ``(frame, feature, similarity)`` search hit."""
    for frame, feature, similarity in results:
        print(f"Match in {frame.video_path} at {frame.timestamp:.2f}s")
        print(f"Description: {feature.description}")
        print(f"Confidence: {feature.confidence:.2%}")
        print(f"Similarity: {similarity:.2%}")
        print("---")


async def main():
    """Example usage of the VLM-based video search engine.

    Indexes every ``*.mp4`` in a directory, then runs one text query and one
    image query and prints the hits of each. The API key and paths are
    placeholders to be replaced before running.
    """
    # Initialize the search engine with your API keys
    search_engine = VideoSearchEngine(
        gemini_api_key="your_gemini_api_key",
        llama_model_path="path_to_llama_model"
    )

    # Process videos
    video_dir = Path("path/to/videos")
    for video_path in video_dir.glob("*.mp4"):
        await search_engine.process_video(str(video_path))

    # Example: Search by description
    text_results = await search_engine.search_by_description(
        "Person operating industrial machinery with safety equipment"
    )
    # Printed before the image search so these hits are not discarded.
    _print_results(text_results)

    # Example: Search by image (context manager closes the file afterwards)
    with Image.open("path/to/query.jpg") as query_image:
        image_results = await search_engine.search_by_image(query_image)
    _print_results(image_results)

if __name__ == "__main__":
    import asyncio

    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # Plain script: no event loop exists yet, so asyncio.run() can own one.
        asyncio.run(main())
    else:
        # Notebook kernels (Colab/Jupyter) already run an event loop, where
        # asyncio.run() raises RuntimeError; schedule main() on that loop
        # instead. The reference is kept so the task is not garbage-collected.
        _main_task = asyncio.ensure_future(main())

Multiple VLM Models:
Gemini Vision for high-quality image understanding
Llama-VLM for additional perspective
Ensemble approach combining both models

Enhanced Features:
Rich text descriptions from both models
Combined embeddings for better search accuracy
Confidence scores from each model


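Async Processing:
Asynchronous feature extraction (Gemini API calls are awaited; Llama inference runs synchronously)
Videos are processed one at a time, each awaited in turn (not in parallel)
Efficient handling of API calls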


Flexible Search:
Search by text description
Search by image
Combined feature search