# VISTA

In [1]:
from IPython.display import display, HTML

# "Open in Colab" badge rendered at the top of the notebook.
# NOTE(review): the href below targets the segment-anything example notebook,
# not this notebook -- confirm the intended link before publishing.
COLAB_BADGE_HTML = """
<a target="_blank" href="https://colab.research.google.com/github/facebookresearch/segment-anything/blob/main/notebooks/automatic_mask_generator_example.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
"""

display(HTML(COLAB_BADGE_HTML))

## Environment Setup

In [None]:
!pip uninstall -y clip
!pip install git+https://github.com/openai/CLIP.git
!pip install decord
!pip install 'git+https://github.com/facebookresearch/detectron2.git'

In [None]:
import sys, os

# Make the parent directory importable so the project packages used below
# (llm, data, utils, ...) can be resolved.
# `__file__` is not defined inside a notebook kernel, so fall back to the
# current working directory when this code is not running as a script.
try:
    notebook_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    notebook_dir = os.getcwd()

project_root = os.path.abspath(os.path.join(notebook_dir, ".."))
# Guard against piling up duplicate entries when the cell is re-run.
if project_root not in sys.path:
    sys.path.append(project_root)

from dotenv import load_dotenv
from google import genai
from llm.base import AgentClient
from data.cache.memory_handler import MessageMemoryHandler
import chainlit as cl
from pathlib import Path
import shutil
import json
import cv2
import numpy as np
import pandas as pd
from typing import Dict, List, Tuple, Any, Optional
import warnings
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from torchvision.models import resnet50
from data.prompts.video_analysis_agent import VIDEO_ANALYSIS_AGENT_PROMPT
from utils.basetools.video_analysis_tool import create_video_analysis_tool
import clip
from pathlib import Path
import torch
import cv2
from google.colab import drive, files

# Load variables from a local .env file into the process environment.
load_dotenv()

# Select the compute device exactly once. Previously `device` was first a plain
# string and then re-assigned to a torch.device within the same cell; a single
# torch.device is used for everything now (clip.load accepts either form).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Load the CLIP ViT-B/32 model together with its image preprocessing transform.
clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)
print(f"CLIP model loaded on {device}")

# Mount Google Drive so DATA_DIR below is reachable (Colab only).
drive.mount('/content/drive')

# Dataset location on the mounted drive.
DATA_DIR = Path("/content/drive/MyDrive/soICT/datasets")

# Hyper-parameters (not referenced elsewhere in the visible cells).
BATCH_SIZE = 8
HIDDEN_DIM = 512
NUM_HEADS = 8

In [None]:
# Set using_colab = True when running on Google Colab. The flag gates the
# cell that starts with `if using_colab:`, which installs extra packages and
# downloads a sample image and a model checkpoint.
# NOTE(review): the imports cell imports `google.colab` unconditionally, so the
# notebook currently appears to require Colab regardless of this flag -- confirm.
using_colab = False

In [None]:
if using_colab:
    import torch
    import torchvision
    print("PyTorch version:", torch.__version__)
    print("Torchvision version:", torchvision.__version__)
    print("CUDA is available:", torch.cuda.is_available())
    import sys
    !{sys.executable} -m pip install opencv-python matplotlib
    !{sys.executable} -m pip install 'git+https://github.com/facebookresearch/segment-anything.git'

    !mkdir images
    !wget -P images https://raw.githubusercontent.com/facebookresearch/segment-anything/main/notebooks/images/dog.jpg

    !wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth

In [None]:
import clip

# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the CLIP ViT-B/32 model and its matching preprocessing transform.
clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)
print(f"CLIP model loaded on {device}")

## Main Code

### Extracting Keyframes from the Input Video

In [None]:
from src.utils.basetools.preprocess_video import VideoPreprocessor

# Settings forwarded to VideoPreprocessor; their exact semantics are defined
# by that class.
FRAME_INTERVAL = 1
SIMILARITY_THRESHOLD = 0.8

# Ask the user for a video file (Colab upload widget).
uploaded = files.upload()
if not uploaded:
    # Without this guard an empty upload fails with an opaque IndexError.
    raise ValueError("No video was uploaded; re-run this cell and choose a file.")

# Only the first uploaded file is processed.
video_filename = next(iter(uploaded))
sample_video_path = Path(video_filename)


output_dir = Path(f"/content/keyframes_output/{sample_video_path.stem}")
output_dir.mkdir(parents=True, exist_ok=True)

video_preprocessor_enhanced = VideoPreprocessor(
    frame_interval=FRAME_INTERVAL,
    similarity_threshold=SIMILARITY_THRESHOLD,
)

# Reset the results up front so later cells never silently reuse frames left
# over from a previous run if the video turns out to be missing.
filtered_frames, selected_indices = [], []

if sample_video_path.exists():
    print(f"Processing video: {sample_video_path}")
    video_info = video_preprocessor_enhanced.get_video_info(str(sample_video_path))
    print(f"Video info: {video_info}")

    # max_frames is set to the total frame count, i.e. no additional cap is
    # requested beyond the preprocessor's own filtering.
    filtered_frames, selected_indices = video_preprocessor_enhanced.extract_keyframes_with_redundancy_removal(
        str(sample_video_path),
        max_frames=video_info['frame_count']
    )

    print(f"Selected {len(filtered_frames)} keyframes")

    for i, frame in enumerate(filtered_frames):
        frame_path = output_dir / f"frame_{i:04d}.jpg"
        # Frames are converted RGB -> BGR because cv2.imwrite expects BGR order.
        # imwrite reports failure through its return value rather than raising,
        # so surface it instead of dropping the frame silently.
        written = cv2.imwrite(str(frame_path), cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
        if not written:
            print(f"Warning: could not write {frame_path}")

    print(f"Saved keyframes to {output_dir}")
else:
    print(f"Video not found at: {sample_video_path}")

### Masking and Object Detection

In [None]:
from src.utils.basetools.mask import process_frame, show_processed_frames

# Run every selected keyframe through process_frame, keeping the input order,
# then display the results.
processed_images = list(map(process_frame, filtered_frames))
show_processed_frames(processed_images)


### Call the Agent to Generate Relations Between Adjacent Frames

In [None]:
from src.utils.agents.image_relation_agent import ImageRelationAgent, ImageRelationInput, RelationOutput

# A single agent instance handles all frames: its `prev_objects` attribute is
# read on every iteration and passed back in with the next frame
# (presumably updated by run() -- verify in ImageRelationAgent).
agent = ImageRelationAgent()

# Maps "frame_<index>" -> relations returned by the agent for that frame.
all_frame_relations = {}
for frame_idx, masked_frame in enumerate(processed_images):
    frame_key = f"frame_{frame_idx}"
    input_data = ImageRelationInput(
        mask_frame=masked_frame,
        original_img=filtered_frames[frame_idx],
        prev_objects=agent.prev_objects,
    )
    result = agent.run(input_data)
    all_frame_relations[frame_key] = result.relations

# Print the relations grouped per frame, with a blank line between frames.
for frame_name in all_frame_relations:
    print(f"{frame_name}:")
    for relation in all_frame_relations[frame_name]:
        print(relation)
    print()

### Call the Agent to Build the Scene Graph

In [None]:
from src.utils.agents.scene_graph_agent import SameEntityAgent, SceneGraphInput

# Pass the per-frame relations to the SameEntityAgent and keep the combined
# relation list it returns.
scene_graph_input = SceneGraphInput(frames_dict=all_frame_relations)
same_entity_agent = SameEntityAgent()
linked_result = same_entity_agent.run(scene_graph_input)
linked_scene_graph = linked_result.combined_relations

# Show each combined relation on its own line.
for relation in linked_scene_graph:
    print(relation)


In [None]:
from src.utils.agents.graph_reasoning_agent import GraphReasoningAgent, GraphReasoningInput

# Serialise the linked scene graph as one relation per line. Each item is
# passed through str() so the join also works when relations are objects
# rather than plain strings (for strings this is a no-op).
graph_text_for_question = "\n".join(str(relation) for relation in linked_scene_graph)

# Placeholder: replace with the actual question to ask about the video.
question = "YOUR QUESTION"

agent = GraphReasoningAgent()

input_data = GraphReasoningInput(
    question=question,
    graph_text=graph_text_for_question
)

# Run the reasoning agent on the question plus graph text and read its answer.
result = agent.run(input_data)
answer = result.answer

print(f"Question: {question}")
print(f"Answer: {answer}")
