## [1] Define environment variables

In [None]:
import os

# Placeholder credentials for the two services used below. Substitute your own
# keys for "<key>" before running, and avoid committing real keys.
os.environ.update(
    {
        "OPENAI_API_KEY": "<key>",
        "VISION_API_KEY": "<key>",
    }
)

In [None]:
# Name of the Azure OpenAI deployment to call
deployment_name: str = "<Azure Open AI Deployment Name>"
# Base URL of your Azure OpenAI resource, e.g. "https://<your resource name>.openai.azure.com"
openai_api_base: str = "https://<resource-name>.openai.azure.com"
# Azure OpenAI API version to target. Versions follow the YYYY-MM-DD date
# structure; the one used here additionally carries a "-preview" suffix.
openai_api_version: str = "2023-12-01-preview"
# Read from the environment; this is None when OPENAI_API_KEY is not set
openai_api_key: str = os.environ.get("OPENAI_API_KEY")

# Base URL of your vision resource endpoint, e.g. "https://<your-resource-name>.cognitiveservices.azure.com"
vision_api_endpoint: str = "https://<resource-name>.cognitiveservices.azure.com"
# Read from the environment; this is None when VISION_API_KEY is not set
vision_api_key: str = os.environ.get("VISION_API_KEY")

# SAS URL of the video, e.g. https://<your-storage-account-name>.blob.core.windows.net/<your-container-name>/<your-video-name>?<SAS-token>
video_SAS_url: str = "<video url>"
# The index name must be unique and contain no white space.
# It must start with an alphanumeric character; hyphens are allowed but each must be
# followed by an alphanumeric character (no consecutive hyphens, no trailing hyphen).
# It must be 24 characters or less.
video_index_name: str = "demo-index-2"
# The video ID must be unique
video_id: str = "demo-video-2"

## [2] Define Azure Open AI Configuration

In [None]:
import json
from pathlib import Path

# Destination file, relative to the current working directory
p = Path("./config.json")

# Connection settings persisted to disk; the API keys are not included.
config = {
    "GPT-4V_DEPLOYMENT_NAME": deployment_name,
    "OPENAI_API_BASE": openai_api_base,
    "OPENAI_API_VERSION": openai_api_version,
    "VISION_API_ENDPOINT": vision_api_endpoint,
}

# Serialize the settings straight into the file handle (overwrites an existing file)
with p.open(mode="w") as file:
    json.dump(config, file)

## [3] Define helper functions

The following code defines the helper functions that will be used to interact with the Azure AI Vision video retrieval API and the Azure OpenAI API.
List of helper functions:
1. create_video_index
2. add_video_to_index
3. wait_for_ingestion_completion
4. process_video_indexing (calls functions 1-3 in sequence)
5. call_GPT4V_video

In [None]:
import requests
import time

def create_video_index(vision_api_endpoint: str, vision_api_key: str, index_name: str) -> object:
    """Send the PUT request that creates a video retrieval index.

    The index is declared with two features: "vision" (domain "surveillance")
    and "speech".

    Args:
        vision_api_endpoint: Base URL of the vision resource.
        vision_api_key: Subscription key sent in the Ocp-Apim-Subscription-Key header.
        index_name: Name of the index, placed in the request path.

    Returns:
        The response object returned by ``requests.put``. The status code is
        not checked here; callers inspect it themselves.
    """
    endpoint_url = (
        f"{vision_api_endpoint}/computervision/retrieval/indexes/{index_name}"
        "?api-version=2023-05-01-preview"
    )
    request_headers = {
        "Ocp-Apim-Subscription-Key": vision_api_key,
        "Content-Type": "application/json",
    }
    index_definition = {
        "features": [
            {"name": "vision", "domain": "surveillance"},
            {"name": "speech"},
        ]
    }
    body = json.dumps(index_definition)
    return requests.put(endpoint_url, headers=request_headers, data=body)


def add_video_to_index(
    vision_api_endpoint: str, vision_api_key: str, index_name: str, video_url: str, video_id: str, ingestion_name: str = "ingestion-01"
) -> object:
    """Send the PUT request that ingests one video into an existing index.

    Args:
        vision_api_endpoint: Base URL of the vision resource.
        vision_api_key: Subscription key sent in the Ocp-Apim-Subscription-Key header.
        index_name: Name of the index the video is added to.
        video_url: URL of the video, sent as ``documentUrl``.
        video_id: Document ID of the video, sent as ``documentId``.
        ingestion_name: Name of the ingestion, placed in the request path.

    Returns:
        The response object returned by ``requests.put``. The status code is
        not checked here; callers inspect it themselves.
    """
    url = (
        f"{vision_api_endpoint}/computervision/retrieval/indexes/{index_name}"
        f"/ingestions/{ingestion_name}?api-version=2023-05-01-preview"
    )
    headers = {"Ocp-Apim-Subscription-Key": vision_api_key, "Content-Type": "application/json"}
    data = {
        "videos": [{"mode": "add", "documentId": video_id, "documentUrl": video_url}],
        "generateInsightIntervals": False,
        "moderation": False,
        "filterDefectedFrames": False,
        # Spelled "includeSpeechTranscrpt" before; the request field is
        # "includeSpeechTranscript", so the misspelled key did not set the option.
        "includeSpeechTranscript": True,
    }
    return requests.put(url, headers=headers, data=json.dumps(data))


def wait_for_ingestion_completion(
    vision_api_endpoint: str,
    vision_api_key: str,
    index_name: str,
    max_retries: int = 30,
    poll_interval: float = 10,
) -> bool:
    """Poll the ingestion list of an index until the first ingestion finishes.

    Each attempt sleeps for ``poll_interval`` seconds and then fetches the
    ingestions of ``index_name``. Only the first entry of the returned
    ``value`` list is inspected.

    Args:
        vision_api_endpoint: Base URL of the vision resource.
        vision_api_key: Subscription key sent in the Ocp-Apim-Subscription-Key header.
        index_name: Name of the index whose ingestions are polled.
        max_retries: Maximum number of polling attempts.
        poll_interval: Seconds to sleep before each attempt (default 10).

    Returns:
        True when the first ingestion reports state "Completed"; False when it
        reports "Failed" or when ``max_retries`` attempts pass without either
        state being seen.
    """
    url = (
        f"{vision_api_endpoint}/computervision/retrieval/indexes/{index_name}/ingestions?api-version=2023-05-01-preview"
    )
    headers = {"Ocp-Apim-Subscription-Key": vision_api_key}
    for _ in range(max_retries):
        time.sleep(poll_interval)
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            # Non-200 answers count as an attempt and are retried
            continue
        state_data = response.json()
        ingestions = state_data.get("value") or []
        if not ingestions:
            # No ingestion is listed (yet); keep polling instead of raising IndexError
            continue
        state = ingestions[0].get("state")
        if state == "Completed":
            print(state_data)
            print("Ingestion completed.")
            return True
        if state == "Failed":
            print(state_data)
            print("Ingestion failed.")
            return False
    return False

def process_video_indexing(
    vision_api_endpoint: str, vision_api_key: str, video_index_name: str, video_SAS_url: str, video_id: str
) -> None:
    """Create an index, add one video to it, and wait for the ingestion.

    The status code and body of the two PUT responses are printed but not
    checked, so the later steps run even if an earlier request was rejected.

    Args:
        vision_api_endpoint: Base URL of the vision resource.
        vision_api_key: Subscription key of the vision resource.
        video_index_name: Name of the index to create.
        video_SAS_url: URL of the video to ingest.
        video_id: Document ID under which the video is added.
    """
    # Step 1: create the index and report the raw outcome
    index_response = create_video_index(vision_api_endpoint, vision_api_key, video_index_name)
    print(index_response.status_code, index_response.text)

    # Step 2: register the video with the index and report the raw outcome
    ingestion_response = add_video_to_index(
        vision_api_endpoint, vision_api_key, video_index_name, video_SAS_url, video_id
    )
    print(ingestion_response.status_code, ingestion_response.text)

    # Step 3: block until the ingestion finishes or polling gives up
    ingestion_succeeded = wait_for_ingestion_completion(vision_api_endpoint, vision_api_key, video_index_name)
    if not ingestion_succeeded:
        print("Ingestion did not complete within the expected time.")

# Define GPT-4 Turbo with Vision API call with video index
def call_GPT4V_video(messages: list, vision_api: dict, video_index: dict) -> object:
    """Send a chat completion request that is grounded on an indexed video.

    The request URL and the ``api-key`` header are built from the notebook-level
    variables ``openai_api_base``, ``deployment_name``, ``openai_api_version``
    and ``openai_api_key``, so those must be defined before this is called.

    Args:
        messages: Chat messages, forwarded unchanged as the ``messages`` field
            of the payload.
        vision_api: Mapping with the keys ``"endpoint"`` and ``"key"`` of the
            vision resource.
        video_index: Mapping with the keys ``"video_index_name"`` and
            ``"video_SAS_url"``.

    Returns:
        The decoded JSON body of the response, or None when the request fails
        (the error is printed, not raised).
    """
    # Construct the API request URL
    api_url = (
        f"{openai_api_base}/openai/deployments/{deployment_name}"
        f"/extensions/chat/completions?api-version={openai_api_version}"
    )

    # Including the api-key in HTTP headers
    headers = {
        "Content-Type": "application/json",
        "api-key": openai_api_key,
        "x-ms-useragent": "Azure-GPT-4V-video/1.0.0",
    }

    # Payload for the request
    payload = {
        "model": "gpt-4-vision-preview",
        "dataSources": [
            {
                "type": "AzureComputerVisionVideoIndex",
                "parameters": {
                    "computerVisionBaseUrl": f"{vision_api.get('endpoint')}/computervision",
                    "computerVisionApiKey": vision_api.get("key"),
                    "indexName": video_index.get("video_index_name"),
                    "videoUrls": [video_index.get("video_SAS_url")],
                },
            }
        ],
        "enhancements": {"video": {"enabled": True}},
        "messages": messages,
        "temperature": 0.7,
        "top_p": 0.95,
        "max_tokens": 800,
    }

    # Send the request and handle the response
    try:
        response = requests.post(api_url, headers=headers, json=payload)
        response.raise_for_status()  # Raise an error for bad HTTP status codes
        return response.json()
    except requests.RequestException as e:
        print(f"Failed to make the request. Error: {e}")
        # Make the failure result explicit so callers can test for it
        return None

## [4] Call APIs
First, call the `process_video_indexing` function to index the video. Then, call the `call_GPT4V_video` function to ask a question about the indexed video.

In [None]:
import os
import re
import sys

# Make the directory above the current working directory importable
parent_dir = Path.cwd().parent
sys.path.append(str(parent_dir))

In [None]:
# Creating the index is a one-time step: run this cell only once
process_video_indexing(
    vision_api_endpoint=vision_api_endpoint,
    vision_api_key=vision_api_key,
    video_index_name=video_index_name,
    video_SAS_url=video_SAS_url,
    video_id=video_id,
)

In [None]:
# System message and user prompt
sys_message = """
You are an AI assistant that understand images and video content.
You only response to user based on the images or videos.
Reply "I don't have this information" if the user asks for anything else. Keep the response concise.
"""
user_prompt = "is camera visible in this video"

# Make sure that the content item of type acv_document_id comes first in the user content list, as in this example.
# Otherwise unexpected behavior can happen.
messages = [
    {"role": "system", "content": [{"type": "text", "text": sys_message}]},
    {
        "role": "user",
        "content": [{"type": "acv_document_id", "acv_document_id": video_id}, {"type": "text", "text": user_prompt}],
    },  # Prompt for the user
]

vision_api_config = {"endpoint": vision_api_endpoint, "key": vision_api_key}

video_config = {
    "video_SAS_url": video_SAS_url,
    "video_index_name": video_index_name,
}

# Call GPT-4 Turbo with Vision API and print the response
try:
    response = call_GPT4V_video(messages, vision_api=vision_api_config, video_index=video_config)
    if response is None:
        # call_GPT4V_video prints the request error and returns None on failure;
        # report that plainly instead of failing on response["choices"] below.
        print("Failed to call GPT-4 Turbo with Vision API. Error: no response was returned.")
    else:
        text = response["choices"][0]["message"]["content"]
        # Split the answer into sentences at whitespace that follows "." or "?"
        sentences = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", text)
        for sentence in sentences:  # Print the content of the response
            print(sentence)
except Exception as e:
    print(f"Failed to call GPT-4 Turbo with Vision API. Error: {e}")

## [5] Clean Up & Troubleshooting
You may encounter errors caused by duplicated indexes, ingestions, etc. The following helper functions let you list the indexes and their ingestions, delete an index if duplication occurs, and search an index.

In [None]:
def list_video_indexes(vision_api_endpoint: str, vision_api_key: str) -> object:
    """Send the GET request that lists the video retrieval indexes.

    Args:
        vision_api_endpoint: Base URL of the vision resource.
        vision_api_key: Subscription key sent in the Ocp-Apim-Subscription-Key header.

    Returns:
        The response object returned by ``requests.get``; the status code is
        not checked here.
    """
    auth_headers = {"Ocp-Apim-Subscription-Key": vision_api_key}
    indexes_url = (
        f"{vision_api_endpoint}/computervision/retrieval/indexes"
        "?api-version=2023-05-01-preview"
    )
    return requests.get(indexes_url, headers=auth_headers)

def list_video_ingestions(vision_api_endpoint: str, vision_api_key: str, index_name: str) -> object:
    """Send the GET request that lists the ingestions of one index.

    Args:
        vision_api_endpoint: Base URL of the vision resource.
        vision_api_key: Subscription key sent in the Ocp-Apim-Subscription-Key header.
        index_name: Name of the index whose ingestions are listed.

    Returns:
        The response object returned by ``requests.get``; the status code is
        not checked here.
    """
    auth_headers = {"Ocp-Apim-Subscription-Key": vision_api_key}
    ingestions_url = (
        f"{vision_api_endpoint}/computervision/retrieval/indexes/{index_name}"
        "/ingestions?api-version=2023-05-01-preview"
    )
    return requests.get(ingestions_url, headers=auth_headers)

def delete_video_index(vision_api_endpoint: str, vision_api_key: str, index_name: str) -> object:
    """Send the DELETE request that removes a video retrieval index.

    Args:
        vision_api_endpoint: Base URL of the vision resource.
        vision_api_key: Subscription key sent in the Ocp-Apim-Subscription-Key header.
        index_name: Name of the index to delete.

    Returns:
        The response object returned by ``requests.delete``; the status code
        is not checked here.
    """
    auth_headers = {"Ocp-Apim-Subscription-Key": vision_api_key}
    index_url = (
        f"{vision_api_endpoint}/computervision/retrieval/indexes/{index_name}"
        "?api-version=2023-05-01-preview"
    )
    return requests.delete(index_url, headers=auth_headers)

def search_video_index(vision_api_endpoint: str, vision_api_key: str, index_name: str, query: str) -> object:
    """Send the POST request that runs a text query against an index.

    The request asks for the top 10 results with no offset, with
    de-duplication enabled (at most 5 documents) and metadata search left on.

    Args:
        vision_api_endpoint: Base URL of the vision resource.
        vision_api_key: Subscription key sent in the Ocp-Apim-Subscription-Key header.
        index_name: Name of the index to search.
        query: Text of the query, sent as ``queryText``.

    Returns:
        The response object returned by ``requests.post``; the status code is
        not checked here.
    """
    request_headers = {
        "Ocp-Apim-Subscription-Key": vision_api_key,
        "Content-Type": "application/json",
    }
    search_url = (
        f"{vision_api_endpoint}/computervision/retrieval/indexes/{index_name}"
        ":queryByText?api-version=2023-05-01-preview"
    )
    search_request = {
        "queryText": query,
        "top": 10,
        "skip": 0,
        "dedup": True,
        "dedupMaxDocumentCount": 5,
        "disableMetadataSearch": False,
    }
    body = json.dumps(search_request)
    return requests.post(search_url, headers=request_headers, data=body)

In [None]:
# List all the video indexes together with the ingestions of each one
get_video_indexes = list_video_indexes(vision_api_endpoint, vision_api_key)
video_indexes = get_video_indexes.json()

for index in video_indexes["value"]:
    print(f"Index name: {index['name']}")
    # Look up the ingestions of the index being printed. This was hard-coded to
    # 'demo-index-2', which showed the same ingestions under every index.
    get_video_ingestion = list_video_ingestions(vision_api_endpoint, vision_api_key, index["name"])
    for ingestion in get_video_ingestion.json()["value"]:
        print(f"Ingestion name: {ingestion['name']} - {ingestion['state']}")

In [None]:
# Delete the index named by `video_index_name`;
# pass a different index name here to remove another index
delete_video_index(
    vision_api_endpoint=vision_api_endpoint,
    vision_api_key=vision_api_key,
    index_name=video_index_name,
)

In [None]:
# Try out the text search against the index and pretty-print the JSON result
query = "is camera visible in this video"
search_result = search_video_index(
    vision_api_endpoint=vision_api_endpoint,
    vision_api_key=vision_api_key,
    index_name=video_index_name,
    query=query,
)
print(json.dumps(search_result.json(), indent=2))