# TwelveLabs Marengo Video Embedding Model + Bedrock + Elasticsearch


In this notebook we'll create a small app that uses video embeddings from TwelveLabs' [Marengo](https://docs.twelvelabs.io/docs/concepts/models/marengo) model to search trailers from recent summer box office hits. We'll use the [AWS Bedrock integration for TwelveLabs](https://docs.twelvelabs.io/docs/cloud-partner-integrations/amazon-bedrock) so that our data never has to be persisted outside of our own S3 buckets.

![Search Result](./images/marengo1.jpg) 

To run this notebook:

* You'll need an S3 bucket that can be written to by your AWS id.

* You'll need a ```.env``` file with the following content. Alternatively you can hard code in your keys and configurations below.


```bash
DATA_DIR = "./data"
AWS_ACCESS_KEY_ID = "your_access_key_id"
AWS_SECRET_ACCESS_KEY = "your_secret_access_key"
S3_BUCKET_NAME = "your_bucket_name"

ELASTICSEARCH_API_KEY = "your_elasticsearch_api_key"
ELASTICSEARCH_ENDPOINT = "your_elasticsearch_endpoint_including_port_number"
```

* Additionally, you'll need to enable the Marengo model in Bedrock for your account.

* Note: if you are behind a VPN or running on a cloud-hosted notebook (like Google Colab), you'll likely be blocked from grabbing source data with yt-dlp.

In [1]:
! pip install yt-dlp boto3 ipython tqdm elasticsearch ipywidgets python-dotenv


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
#########
## Python Imports
#########

import os
import yt_dlp
import os
import json
from pathlib import Path
import time

import boto3, botocore
import json
import time
from IPython.display import clear_output, HTML, display, Image
import tqdm
import copy
import uuid
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from time import sleep
from dotenv import load_dotenv
load_dotenv()


True

In [3]:
#########
## Configuration Details
#########

## Source data: public trailers for the top five grossing US box office movies of 2025
videos = [
    "https://www.youtube.com/watch?v=VWqJifMMgZE",  ## Lilo and Stitch 2025
    "https://www.youtube.com/watch?v=Ox8ZLF6cGM0",  ## Superman 2025 trailer
    "https://www.youtube.com/watch?v=jan5CFWs9ic",  ## Jurassic World Rebirth
    "https://www.youtube.com/watch?v=qpoBjOg5RHU",  ## Fantastic Four: First Steps
    "https://www.youtube.com/watch?v=22w7z_lT6YM",  ## How to Train Your Dragon 2025
]

## Where downloaded videos and metadata are stored on the local file system
DATA_PATH = os.environ.get("DATA_DIR", "./data")

## AWS credentials and region. Values come from the environment (.env);
## the fallbacks are placeholders to be replaced with real values.
AWS_REGION = "us-east-1"
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID", "your_access_key_id")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "your_secret_access_key")

## S3 bucket plus the key prefixes used for each kind of artifact
S3_BUCKET_NAME = os.environ.get("S3_BUCKET_NAME", "your_s3_bucket_name")
S3_VIDEOS_PATH = "videos"
S3_IMAGES_PATH = "images"
S3_EMBEDDINGS_PATH = "embeddings"

## Bedrock model id of the TwelveLabs Marengo embedding model
MARENGO_MODEL_ID = "twelvelabs.marengo-embed-2-7-v1:0"

## Elasticsearch connection details
ELASTICSEARCH_API_KEY = os.environ.get("ELASTICSEARCH_API_KEY", "your_elasticsearch_api_key")
ELASTICSEARCH_ENDPOINT = os.environ.get("ELASTICSEARCH_ENDPOINT", "your_elasticsearch_endpoint_including_port_number")

In [4]:
#########
## Data Class
#########

class VideoIntelligence:
    """Mutable record for one source video and the artifacts derived from it.

    Holds the source URL and platform/id, the local download location, the
    S3 key of the uploaded file, the downloaded metadata (with title and
    description mirrored onto the object) and the list of embeddings.
    """

    def __init__(self, url, platform, video_id, data_path=None):
        """
        Args:
            url: Source URL of the video.
            platform: Name of the hosting platform (e.g. "youtube").
            video_id: Platform-specific identifier of the video.
            data_path: Root directory for downloads. When omitted, the
                module-level ``DATA_PATH`` is used.
        """
        root = DATA_PATH if data_path is None else data_path

        self.url = url
        self.platform = platform
        self.video_id = video_id
        self.video_string = f"{self.platform}_{self.video_id}"
        self.base_path = f"{root}/videos/{self.video_string}"

        self.images = []
        self.video_path = None
        self.metadata_file = None
        self.s3_key = None

        self.metadata = None
        self.title = None
        self.description = None

        # Must match the attribute used by set_/get_embeddings_list so the
        # getter returns None (instead of raising) before embeddings are set.
        self.embeddings_list = None

    def get_images(self):
        """Return the list of images attached to this video."""
        return self.images

    def set_images(self, images):
        """Replace the list of images."""
        self.images = images

    def add_image(self, image):
        """Append a single image to the list of images."""
        self.images.append(image)

    def get_video_string(self):
        """Return the "<platform>_<video_id>" identifier."""
        return self.video_string

    def get_url(self):
        """Return the source URL."""
        return self.url

    def get_platform(self):
        """Return the platform name."""
        return self.platform

    def get_video_id(self):
        """Return the platform-specific video id."""
        return self.video_id

    def get_base_path(self):
        """Return the local directory that holds this video's files."""
        return self.base_path

    def get_video_path(self):
        """Return the local path of the video file, or None if not set."""
        return self.video_path

    def set_video_path(self, video_path):
        """Record the local path of the video file."""
        self.video_path = video_path

    def get_metadata_file(self):
        """Return the recorded metadata file path, or None if not set."""
        return self.metadata_file

    def set_metadata_file(self, metadata_file):
        """Record the metadata file path."""
        self.metadata_file = metadata_file

    def get_metadata(self):
        """Return the metadata dict, or None if not set."""
        return self.metadata

    def set_metadata(self, metadata):
        """Store the metadata dict and mirror its title and description.

        Missing "title" / "description" keys become empty strings.
        """
        self.metadata = metadata
        self.title = metadata.get("title", "")
        self.description = metadata.get("description", "")

    def get_title(self):
        """Return the video title."""
        return self.title

    def get_description(self):
        """Return the video description."""
        return self.description

    def set_title(self, title):
        """Override the video title."""
        self.title = title

    def set_description(self, description):
        """Override the video description."""
        self.description = description

    def set_s3_key(self, s3_key):
        """Record the S3 object key of the uploaded video."""
        self.s3_key = s3_key

    def get_s3_key(self):
        """Return the S3 object key, or None if not set."""
        return self.s3_key

    def set_embeddings_list(self, embeddings_list):
        """Store the list of embeddings for this video."""
        self.embeddings_list = embeddings_list

    def get_embeddings_list(self):
        """Return the list of embeddings, or None if none have been set."""
        return self.embeddings_list

    def to_json(self):
        """Return the instance attribute dict (the live ``__dict__``, not a copy)."""
        return self.__dict__

    def get_video_object(self):
        """Return the identifying fields (url, platform, video_id, title) as a dict."""
        return {
            "url": self.url,
            "platform": self.platform,
            "video_id": self.video_id,
            "title": self.title
        }
    



In [5]:
#########
## Retrieve Videos and put them on local file system
#########

def get_video(video: VideoIntelligence):
    """
    Download the video file and its metadata for a VideoIntelligence object.

    The video is saved as ``<base_path>/<video_string>.mp4`` and the metadata
    as ``<base_path>/metadata.json``. Either step is skipped when its file is
    already on disk. Afterwards the metadata and the video path are stored on
    ``video``.

    Args:
        video: The object describing the video to fetch; updated in place.
    """

    # parents=True also creates the per-video directory itself
    base_directory = Path(video.get_base_path())
    base_directory.mkdir(parents=True, exist_ok=True)

    video_path = video.get_base_path() + f"/{video.get_video_string()}.mp4"
    metadata_path = video.get_base_path() + "/metadata.json"

    ydl_opts = {
        'format': 'bestvideo+bestaudio/best',
        'outtmpl': video_path,
        'merge_output_format': 'mp4'
    }

    # None means "not fetched during this call"
    metadata = None

    # Download video if it doesn't exist
    if not os.path.exists(video_path):
        print("Downloading video...")
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            metadata = ydl.extract_info(video.url, download=False)
            ydl.download([video.url])
    else:
        print("Video already exists. Skipping video download.")

    if metadata is not None:
        # Metadata was fetched together with the video: persist it
        with open(metadata_path, "w") as f:
            json.dump(metadata, f)
    elif os.path.exists(metadata_path):
        print("Metadata already exists. Loading metadata from file...")
        with open(metadata_path, "r") as f:
            metadata = json.load(f)
    else:
        # The video is on disk but its metadata file is missing: fetch and
        # persist the metadata only, instead of trying to read a missing file
        print("Downloading metadata...")
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            metadata = ydl.extract_info(video.url, download=False)
        with open(metadata_path, "w") as f:
            json.dump(metadata, f)

    video.set_metadata(metadata)
    video.set_video_path(video_path)


def get_video_with_retries(video, max_retries=3):
    """
    Call get_video for ``video``, retrying when it raises.

    Returns True as soon as one attempt succeeds. After ``max_retries``
    failed attempts the last exception is re-raised. Returns False only when
    no attempt was made at all (``max_retries`` of zero or less).
    """
    attempts_made = 0
    last_exception = None

    while attempts_made < max_retries:
        attempts_made += 1
        try:
            get_video(video)
            print(f"Successfully processed {video.get_video_id()}")
            return True
        except Exception as e:
            last_exception = e
            if attempts_made >= max_retries:
                print(f"All {max_retries} attempts failed for {video.get_video_id()}. Last error: {e}")
            else:
                print(f"Attempt {attempts_made} failed for {video.get_video_id()}: {e}. Retrying...")
                # Short pause so temporary problems have a chance to clear
                time.sleep(2)

    # Reaching this point means no attempt succeeded
    if last_exception:
        raise last_exception
    return False


from urllib.parse import parse_qs, urlparse

# Build one VideoIntelligence object per supported source URL
video_objects = []

for video_str in videos:
    if "youtube.com" not in video_str:
        print(f"Skipping unsupported URL: {video_str}")
        continue

    # Read the id from the "v" query parameter so extra parameters
    # (e.g. "&t=30s") do not end up inside the video id
    video_ids = parse_qs(urlparse(video_str).query).get("v")
    if not video_ids:
        print(f"Skipping URL without a video id: {video_str}")
        continue

    platform = "youtube"
    video_id = video_ids[0]
    video_objects.append(VideoIntelligence(video_str, platform, video_id))

# Download (or load from the local cache) each video and its metadata
for video_object in video_objects:
    get_video(video_object)

Video already exists. Skipping video download.
Metadata already exists. Loading metadata from file...
Video already exists. Skipping video download.
Metadata already exists. Loading metadata from file...
Video already exists. Skipping video download.
Metadata already exists. Loading metadata from file...
Video already exists. Skipping video download.
Metadata already exists. Loading metadata from file...
Video already exists. Skipping video download.
Metadata already exists. Loading metadata from file...


In [6]:
#########
## Setup Bedrock, test connection
#########

# Initialize AWS session from the configured credentials and region
session = boto3.session.Session(
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    region_name=AWS_REGION
)

# Initialize AWS clients
bedrock_client = session.client('bedrock-runtime', region_name=AWS_REGION)
# Text model used only for the connectivity check below
modelId = "amazon.titan-text-premier-v1:0"


def get_bedrock_completion(prompt, max_tokens=500, temperature=0.7):
    """
    Get a text completion from the Bedrock Titan model.

    Args:
        prompt: The input text sent to the model.
        max_tokens: Value sent as ``maxTokenCount``.
        temperature: Value sent as ``temperature``.

    Returns:
        The ``outputText`` of the first result, or '' when the response
        carries no results.

    Raises:
        Exception: Any error raised while invoking the model or parsing its
            response is propagated, so an error message is never returned as
            if it were a completion.
    """
    body = json.dumps({
        "inputText": prompt,
        "textGenerationConfig": {
            "maxTokenCount": max_tokens,
            "temperature": temperature,
            "topP": 0.9
        }
    })

    response = bedrock_client.invoke_model(
        modelId=modelId,
        body=body
    )

    response_body = json.loads(response['body'].read())
    # Fall back to a single empty result so an empty list does not raise IndexError
    results = response_body.get('results') or [{}]
    return results[0].get('outputText', '')

# Test with a simple prompt
test_prompt = "Hello, what are the biggest blockbuster movies of all time?"
try:
    completion = get_bedrock_completion(test_prompt)
    print("Titan's response:")
    print(completion)
except Exception as e:
    print("Bedrock API call failed:")
    print(f"Error type: {type(e).__name__}")
    print(f"Error message: {str(e)}")

Titan's response:
The biggest blockbuster movies of all time are Avatar, Titanic, and Star Wars.  These films were all huge hits, and are among the highest-grossing movies of all time.  Other famous blockbusters include The Lord of the Rings trilogy, The Avengers, and Jurassic Park.  Blockbuster movies tend to feature exciting action and adventure, as well as memorable characters and storylines, and are very popular with audiences around the world.


In [7]:
#########
## Validate S3 Configuration
#########

# Look up the account id behind the configured credentials
sts_client = session.client('sts')
aws_account_id = sts_client.get_caller_identity()["Account"]
print(f"AWS Account ID: {aws_account_id}")
s3_client = session.client('s3')

# Confirm the configured bucket can be reached with these credentials
try:
    s3_client.head_bucket(Bucket=S3_BUCKET_NAME)
except Exception as e:
    print(f"❌ Error accessing S3 bucket: {e}")
    print("Please ensure the bucket exists and you have proper permissions.")
else:
    print(f"✅ Successfully connected to S3 bucket: {S3_BUCKET_NAME}")

AWS Account ID: REDACTED
✅ Successfully connected to S3 bucket: REDACTED


In [8]:
#########
## Upload videos to S3, and record where each one lives on its data object
#########

for video_object in video_objects:
    video_label = video_object.get_video_string()
    video_path = video_object.get_video_path()

    # Nothing to upload when the video was never downloaded
    if not video_path:
        print(f"Skipping {video_label} - No video path set")
        continue

    # Reuse a previously recorded key; otherwise organize by platform and video ID.
    # The key is stored on the data object for the later embedding step.
    s3_key = video_object.get_s3_key() or (
        f"{S3_VIDEOS_PATH}/{video_object.get_platform()}/{video_object.get_video_id()}/{os.path.basename(video_path)}"
    )
    video_object.set_s3_key(s3_key)

    try:
        # Probe S3 first so an already uploaded file is not sent again
        try:
            s3_client.head_object(Bucket=S3_BUCKET_NAME, Key=s3_key)
            already_in_s3 = True
        except botocore.exceptions.ClientError as e:
            # A 404 means the object is absent; anything else is a real error
            if e.response['Error']['Code'] != '404':
                raise
            already_in_s3 = False

        if already_in_s3:
            print(f"Video {video_label} already exists in S3. Skipping upload.")
            continue

        print(f"Uploading {video_label} to S3...")
        s3_client.upload_file(video_path, S3_BUCKET_NAME, s3_key)
        print(f"Successfully uploaded {video_label} to S3")

    except Exception as e:
        print(f"Error uploading {video_label} to S3: {str(e)}")

Video youtube_VWqJifMMgZE already exists in S3. Skipping upload.
Video youtube_Ox8ZLF6cGM0 already exists in S3. Skipping upload.
Video youtube_jan5CFWs9ic already exists in S3. Skipping upload.
Video youtube_qpoBjOg5RHU already exists in S3. Skipping upload.
Video youtube_22w7z_lT6YM already exists in S3. Skipping upload.


In [9]:
#########
## Use Bedrock hosted Twelve Labs models to create video embeddings
#########


# Helper function to wait for async embedding results
def wait_for_embedding_output(s3_bucket: str, s3_prefix: str, invocation_arn: str, verbose: bool = False, poll_interval: float = 5) -> list:
    """
    Wait for Bedrock async embedding task to complete and retrieve results

    Args:
        s3_bucket (str): The S3 bucket name
        s3_prefix (str): The S3 prefix for the embeddings
        invocation_arn (str): The ARN of the Bedrock async embedding task
        verbose (bool): When True, clear the cell output and print the task
            status after every poll
        poll_interval (float): Seconds to wait between status polls

    Returns:
        list: The "data" list from the first output.json found under the
        prefix (an empty list when the file has no "data" key)

    Raises:
        Exception: If the embedding task fails or no output.json is found
    """
    terminal_statuses = ("Completed", "Failed", "Expired")

    # Poll until the task reaches a terminal status
    while True:
        response = bedrock_client.get_async_invoke(invocationArn=invocation_arn)
        status = response['status']
        if verbose:
            clear_output(wait=True)
            tqdm.tqdm.write(f"Embedding task status: {status}")
        if status in terminal_statuses:
            # Stop right away; there is nothing left to wait for
            break
        time.sleep(poll_interval)

    if status != "Completed":
        message = f"Embedding task failed with status: {status}"
        # Include the service's failure message when the response has one
        failure_message = response.get('failureMessage')
        if failure_message:
            message = f"{message} ({failure_message})"
        raise Exception(message)

    # Retrieve the output from S3
    listing = s3_client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_prefix)

    for entry in listing.get('Contents', []):
        if entry['Key'].endswith('output.json'):
            output_object = s3_client.get_object(Bucket=s3_bucket, Key=entry['Key'])
            content = output_object['Body'].read().decode('utf-8')
            return json.loads(content).get("data", [])

    raise Exception("No output.json found in S3 prefix")


# Create video embedding
def create_video_embedding(video_s3_uri: str, video_id: str) -> list:
    """
    Start a Marengo embedding job on Bedrock for one video and wait for it.

    Args:
        video_s3_uri (str): The S3 URI of the video to embed
        video_id (str): Unique id of the video; used as the name of the
            output folder under the embeddings prefix

    Returns:
        list: The embedding data, or None when waiting for the output fails
    """
    # Results are written below <embeddings>/<videos>/<video_id>
    s3_output_prefix = f'{S3_EMBEDDINGS_PATH}/{S3_VIDEOS_PATH}/{video_id}'

    model_input = {
        "inputType": "video",
        "mediaSource": {
            "s3Location": {
                "uri": video_s3_uri,
                "bucketOwner": aws_account_id
            }
        }
    }
    output_config = {
        "s3OutputDataConfig": {
            "s3Uri": f's3://{S3_BUCKET_NAME}/{s3_output_prefix}'
        }
    }

    job = bedrock_client.start_async_invoke(
        modelId=MARENGO_MODEL_ID,
        modelInput=model_input,
        outputDataConfig=output_config
    )

    invocation_arn = job["invocationArn"]
    print(f"Video embedding task started: {invocation_arn}")

    # Block until the job is done, then hand back what it wrote to S3
    try:
        return wait_for_embedding_output(S3_BUCKET_NAME, s3_output_prefix, invocation_arn)
    except Exception as e:
        print(f"Error waiting for embedding output: {e}")
        return None


def check_existing_embedding(video_id: str) -> "list | None":
    """
    Look in S3 for an embedding already created for this video, to avoid re-inference.

    Args:
        video_id (str): Id of the video; its embeddings are expected under
            ``<S3_EMBEDDINGS_PATH>/<S3_VIDEOS_PATH>/<video_id>``

    Returns:
        The "data" list of the first output.json found under that prefix,
        or None when there is none.

    Raises:
        botocore.exceptions.ClientError: For any S3 error other than a 404
    """

    s3_output_prefix = f'{S3_EMBEDDINGS_PATH}/{S3_VIDEOS_PATH}/{video_id}'
    print(s3_output_prefix)

    try:
        # Check if any files exist at this prefix
        response = s3_client.list_objects_v2(Bucket=S3_BUCKET_NAME, Prefix=s3_output_prefix)

        output_key = next(
            (entry['Key'] for entry in response.get('Contents', []) if entry['Key'].endswith('output.json')),
            None,
        )
        if output_key is None:
            # Report the id passed in rather than relying on a global loop variable
            print(f"No existing embedding found for {video_id}.")
            return None

        print(f"Video {video_id} already has an embedding. Skipping embedding creation.")
        # Read the cached output and parse it as JSON
        cached_object = s3_client.get_object(Bucket=S3_BUCKET_NAME, Key=output_key)
        content = cached_object['Body'].read().decode('utf-8')
        return json.loads(content).get("data", [])
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == '404':
            # Nothing cached in S3; the caller should create the embedding
            print("Did not find embedding in s3")
            return None
        # Some other error occurred
        raise

def create_s3_uri(bucket_name: str, key: str)-> str:
    """Return the ``s3://<bucket_name>/<key>`` URI for an object."""
    return "s3://{}/{}".format(bucket_name, key)



## Create embeddings one video at a time; S3 acts as a cache so no video is embedded twice
for video_object in tqdm.tqdm(video_objects, desc="Processing videos"):
    s3_key = video_object.get_s3_key()
    video_id = video_object.get_video_id()
    video_uri = create_s3_uri(S3_BUCKET_NAME, s3_key)

    retrieved_embeddings = check_existing_embedding(video_id)
    if not retrieved_embeddings:
        # Nothing usable in the cache: run the embedding job
        video_embedding_data = create_video_embedding(video_uri, video_id)
        video_object.set_embeddings_list(video_embedding_data)
        continue

    video_object.set_embeddings_list(retrieved_embeddings)


Processing videos:   0%|          | 0/5 [00:00<?, ?it/s]

embeddings/videos/VWqJifMMgZE
Embedding youtube_VWqJifMMgZE already has an embedding. Skipping embedding creation.


Processing videos:  40%|████      | 2/5 [00:00<00:00,  5.13it/s]

embeddings/videos/Ox8ZLF6cGM0
Embedding youtube_Ox8ZLF6cGM0 already has an embedding. Skipping embedding creation.
embeddings/videos/jan5CFWs9ic
Embedding youtube_jan5CFWs9ic already has an embedding. Skipping embedding creation.


Processing videos:  80%|████████  | 4/5 [00:00<00:00,  6.22it/s]

embeddings/videos/qpoBjOg5RHU
Embedding youtube_qpoBjOg5RHU already has an embedding. Skipping embedding creation.
embeddings/videos/22w7z_lT6YM
Embedding youtube_22w7z_lT6YM already has an embedding. Skipping embedding creation.


Processing videos: 100%|██████████| 5/5 [00:00<00:00,  5.64it/s]


In [10]:
video_embedding_data = video_objects[0].get_embeddings_list()

## Preview the first three entries; print the vector's length instead of its values
for i, embedding in enumerate(video_embedding_data[:3]):
    print(f"{i}")
    for key, value in embedding.items():
        detail = f"len {len(value)}" if key == "embedding" else value
        print(f"\t{key}: {detail}")

0
	embedding: len 1024
	embeddingOption: visual-text
	startSec: 0.0
	endSec: 6.199999809265137
1
	embedding: len 1024
	embeddingOption: visual-text
	startSec: 6.199999809265137
	endSec: 10.399999618530273
2
	embedding: len 1024
	embeddingOption: visual-text
	startSec: 10.399999618530273
	endSec: 17.299999237060547


In [11]:


# Connect to Elasticsearch with the configured endpoint and API key
es = Elasticsearch(
    hosts=[ELASTICSEARCH_ENDPOINT],
    api_key=ELASTICSEARCH_API_KEY
)

es_detail = es.info().body
if "version" in es_detail:
    version_info = es_detail['version']
    # Prefer the build flavor; fall back to the version number when it is absent
    identifier = version_info.get('build_flavor') or version_info.get('number')
    print(f"✅ Successfully connected to Elasticsearch: {identifier}")


✅ Successfully connected to Elasticsearch: serverless


In [12]:
# One document per "visual-image" embedding segment of every video
docs = []

for video_object in video_objects:

    persist_object = video_object.get_video_object()
    # The embeddings list is None when embedding creation failed for a video;
    # treat that as "no segments" instead of crashing on iteration
    embeddings = video_object.get_embeddings_list() or []
    if not embeddings:
        print(f"No embeddings available for {video_object.get_video_string()}. Skipping.")

    for embedding in embeddings:
        if embedding["embeddingOption"] != "visual-image":
            continue

        # Copy the shared video fields and add the segment's embedding details
        doc = copy.deepcopy(persist_object)
        doc["embedding"] = embedding["embedding"]
        doc["start_sec"] = embedding["startSec"]
        doc["end_sec"] = embedding["endSec"]

        docs.append(doc)

        ### Documents should be of format
        # {
        #     "url": "https://www.youtube.com/watch?v=VWqJifMMgZE",
        #     "platform": "youtube",
        #     "video_id": "VWqJifMMgZE",
        #     "title": "Lilo & Stitch | Official Trailer | In Theaters May 23",
        #     "embedding": [
        #         0.049530029296875,
        #         -0.0153350830078125,
        #         0.04205322265625,
        #         ... <1024 dimensions total>
        #         0.01041412353515625
        #     ],
        #     "start_sec": 0.0,
        #     "end_sec": 6.199999809265137
        # }


In [13]:

# One index is created per dense_vector index_options type listed here
index_varieties = ["flat", "hnsw", "int8_hnsw", "bbq_hnsw", "bbq_flat"]

for index_variety in index_varieties:
    index_name = f"twelvelabs-movie-trailer-{index_variety}"

    # Vector field definition; only the index_options type differs per index
    embedding_field = {
        "type": "dense_vector",
        "dims": 1024,
        "similarity": "cosine",
        "index_options": {
            "type": index_variety
        }
    }
    mappings = {
        "properties": {
            "url": {"type": "keyword"},
            "platform": {"type": "keyword"},
            "video_id": {"type": "keyword"},
            "title": {"type": "text", "analyzer": "standard"},
            "embedding": embedding_field,
            "start_sec": {"type": "float"},
            "end_sec": {"type": "float"}
        }
    }

    # Start from a clean index: drop any existing one before creating it
    if es.indices.exists(index=index_name):
        print(f"Deleting Index '{index_name}' and then sleeping for 2 seconds")
        es.indices.delete(index=index_name)
        sleep(2)

    es.indices.create(index=index_name, mappings=mappings)
    print(f"Index '{index_name}' created successfully")

# Load the same set of documents into every index variety
for index_variety in index_varieties:
    index_name = f"twelvelabs-movie-trailer-{index_variety}"

    print(f"Indexing {len(docs)} documents into {index_name}...")

    # One bulk action per document, all targeting the current index
    actions = [{"_index": index_name, "_source": doc} for doc in docs]

    # Bulk-index; report errors without stopping the remaining indices
    try:
        success, failed = bulk(
            es,
            actions,
            chunk_size=100,
            max_retries=3,
            initial_backoff=2,
            max_backoff=60,
        )
        print(f"\tSuccessfully indexed {success} documents into {index_name}")
        if failed:
            print(f"\tFailed to index {len(failed)} documents")
    except Exception as e:
        print(f"Error during bulk indexing: {e}")

    print(f"Completed indexing documents into {index_name}")


Deleting Index 'twelvelabs-movie-trailer-flat' and then sleeping for 2 seconds
Index 'twelvelabs-movie-trailer-flat' created successfully
Deleting Index 'twelvelabs-movie-trailer-hnsw' and then sleeping for 2 seconds
Index 'twelvelabs-movie-trailer-hnsw' created successfully
Deleting Index 'twelvelabs-movie-trailer-int8_hnsw' and then sleeping for 2 seconds
Index 'twelvelabs-movie-trailer-int8_hnsw' created successfully
Deleting Index 'twelvelabs-movie-trailer-bbq_hnsw' and then sleeping for 2 seconds
Index 'twelvelabs-movie-trailer-bbq_hnsw' created successfully
Deleting Index 'twelvelabs-movie-trailer-bbq_flat' and then sleeping for 2 seconds
Index 'twelvelabs-movie-trailer-bbq_flat' created successfully
Indexing 155 documents into twelvelabs-movie-trailer-flat...
	Successfully indexed 155 documents into twelvelabs-movie-trailer-flat
Completed indexing documents into twelvelabs-movie-trailer-flat
Indexing 155 documents into twelvelabs-movie-trailer-hnsw...
	Successfully indexed 155 d

In [14]:
# Create text embedding
def create_text_embedding(text_query: str) -> "list | None":
    """
    Create an embedding for text using Marengo on Bedrock.

    Results are cached in S3 under a prefix derived from a UUIDv5 of the
    query text, so repeating a query reuses the stored output.

    Args:
        text_query (str): The text query to create an embedding for

    Returns:
        The "embedding" value of the first entry in the output data, or None
        when no embedding data could be obtained.
    """

    # Deterministic id for the query text, used as the S3 cache folder name
    query_hash = str(uuid.uuid5(uuid.NAMESPACE_DNS, text_query))
    s3_output_prefix = f'{S3_EMBEDDINGS_PATH}/text/{query_hash}'

    ## see if we have already computed this text embedding
    embedding_data = None
    response = s3_client.list_objects_v2(Bucket=S3_BUCKET_NAME, Prefix=s3_output_prefix)
    output_keys = [
        entry['Key']
        for entry in response.get('Contents', [])
        if entry['Key'].endswith('output.json')
    ]
    if output_keys:
        print(f"Text query {text_query!r} already has an embedding. Skipping embedding creation.")
        # When several outputs exist, read only the last one listed
        cached_object = s3_client.get_object(Bucket=S3_BUCKET_NAME, Key=output_keys[-1])
        content = cached_object['Body'].read().decode('utf-8')
        embedding_data = json.loads(content).get("data", [])

    ## if the embedding was not found in S3 compute it
    if not embedding_data:
        response = bedrock_client.start_async_invoke(
            modelId=MARENGO_MODEL_ID,
            modelInput={
                "inputType": "text",
                "inputText": text_query
            },
            outputDataConfig={
                "s3OutputDataConfig": {
                    "s3Uri": f's3://{S3_BUCKET_NAME}/{s3_output_prefix}'
                }
            }
        )
        invocation_arn = response["invocationArn"]
        print(f"Text embedding task started: {invocation_arn}")

        # Wait for completion and get results
        try:
            embedding_data = wait_for_embedding_output(S3_BUCKET_NAME, s3_output_prefix, invocation_arn)
        except Exception as e:
            print(f"Error waiting for embedding output: {e}")
            return None

    return embedding_data[0]["embedding"] if embedding_data else None
    


def vector_query(index_name: str, text_query: str, k: int = 10, num_candidates: int = 25) -> dict:
    """
    Run a kNN search for a text query against one index.

    Args:
        index_name (str): Index to search
        text_query (str): Query text; embedded with create_text_embedding
        k (int): Number of nearest neighbours and of hits to return
        num_candidates (int): Value sent as the retriever's ``num_candidates``

    Returns:
        dict: The body of the Elasticsearch search response

    Raises:
        ValueError: If no embedding could be created for the query text
    """
    query_embedding = create_text_embedding(text_query)
    if query_embedding is None:
        # Fail with a clear message instead of sending a null vector to Elasticsearch
        raise ValueError(f"Could not create an embedding for query: {text_query!r}")

    query = {
        "retriever": {
            "knn": {
                "field": "embedding",
                "query_vector": query_embedding,
                "k": k,
                # Sent as a number, not a string
                "num_candidates": int(num_candidates)
            }
        },
        "size": k,
        "_source": False,
        "fields": ["title", "video_id", "start_sec"]
    }
    return es.search(index=index_name, body=query).body


# Smoke-test the search path: embed a sample query and print the raw
# search response from the "flat" index
text_query = "Show me scenes with dinosaurs"
print (vector_query("twelvelabs-movie-trailer-flat", text_query))






Embedding youtube_22w7z_lT6YM already has an embedding. Skipping embedding creation.
{'took': 3, 'timed_out': False, '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 10, 'relation': 'eq'}, 'max_score': 0.6417193, 'hits': [{'_index': 'twelvelabs-movie-trailer-flat', '_id': 'kBqvKpkBnp_cvWPLqCon', '_score': 0.6417193, 'fields': {'title': ['Jurassic World Rebirth | Official Trailer'], 'start_sec': [134.5], 'video_id': ['jan5CFWs9ic']}}, {'_index': 'twelvelabs-movie-trailer-flat', '_id': 'vhqvKpkBnp_cvWPLqioN', '_score': 0.6409597, 'fields': {'title': ['How To Train Your Dragon | Official Trailer'], 'start_sec': [59.0], 'video_id': ['22w7z_lT6YM']}}, {'_index': 'twelvelabs-movie-trailer-flat', '_id': 'uRqvKpkBnp_cvWPLqioN', '_score': 0.64061135, 'fields': {'title': ['How To Train Your Dragon | Official Trailer'], 'start_sec': [34.6], 'video_id': ['22w7z_lT6YM']}}, {'_index': 'twelvelabs-movie-trailer-flat', '_id': 'vBqvKpkBnp_cvWPLqioN', '_sc

In [15]:
from ipywidgets import widgets, HTML as WHTML, HBox, Layout
from IPython.display import display

def display_search_results_html(query, index_name="twelvelabs-movie-trailer-flat"):
    """
    Search the trailer index and render the hits as an HTML list.

    Args:
        query: The text query to search for.
        index_name: Index to search; defaults to the "flat" trailer index.

    Returns:
        An HTML string: a heading plus one list item per hit linking to the
        video at the hit's start second, or a "No results found" paragraph.
    """
    from html import escape
    from urllib.parse import quote

    results = vector_query(index_name, query)
    hits = results.get('hits', {}).get('hits', [])

    if not hits:
        return "<p>No results found</p>"

    items = []
    for hit in hits:
        fields = hit.get('fields', {})
        title = fields.get('title', ['No Title'])[0]
        score = hit.get('_score', 0)
        video_id = fields.get('video_id', [''])[0]
        start_sec = fields.get('start_sec', [0])[0]

        # Titles and ids come from indexed metadata: URL-encode the id and
        # HTML-escape everything placed into the markup
        encoded_id = quote(str(video_id), safe='')
        url = f"https://www.youtube.com/watch?v={encoded_id}&t={int(start_sec)}s"
        items.append(
            f'<li><a href="{escape(url, quote=True)}" target="_blank">'
            f'{escape(str(title))} (Start: {float(start_sec):.1f}s)</a> '
            f'<span>Score: {score}</span></li>'
        )

    return "<h3>Search Results:</h3><ul>" + "\n".join(items) + "</ul>"

def search_videos():
    """
    Build and display the search UI: a text box, a button and a results area.

    Clicking the button runs display_search_results_html on the entered text
    and shows its HTML in the results area. An empty query or a failed search
    is reported in the same area.
    """
    from html import escape

    search_input = widgets.Text(
        value='',
        placeholder='Enter your search query…',
        description='Search:',
        layout=Layout(width='70%')
    )

    search_button = widgets.Button(
        description='Search Videos',
        button_style='primary',
        layout=Layout(width='20%')
    )

    # Use a single HTML widget for output; update its .value to avoid double-rendering
    results_box = WHTML(value="")

    def on_button_click(_):
        q = search_input.value.strip()
        if not q:
            results_box.value = "<p>Please enter a search query</p>"
            return
        results_box.value = "<p>Searching…</p>"
        try:
            results_box.value = display_search_results_html(q)
        except Exception as e:
            # Show the failure in the UI so it does not go unnoticed
            results_box.value = f"<p>Search failed: {escape(str(e))}</p>"

    # The button is created fresh on every call, so it has no earlier
    # handlers that would need clearing
    search_button.on_click(on_button_click)

    display(HBox([search_input, search_button]))
    display(results_box)

# Call this to create the UI: displays the search box, the button and the
# results area in this cell's output
search_videos()


HBox(children=(Text(value='', description='Search:', layout=Layout(width='70%'), placeholder='Enter your searc…

HTML(value='')

---

#### Screenshot of UI when done:

![Screenshot](./images/marengo2.jpg) 