In [1]:
import os
import json
import pandas as pd
import urllib.request
import boto3
import base64
import requests

from requests.auth import HTTPBasicAuth
from botocore.config import Config

In [2]:
json_files = [pos_json for pos_json in os.listdir('movielens/') if pos_json.endswith('.json')]
json_data = []
for index, js in enumerate(json_files):
    with open(os.path.join('movielens/', js)) as json_file:
        json_text = json.load(json_file).get('data').get('searchResults')
        json_data.extend(json_text)

In [3]:
df = pd.DataFrame([json_data[i]['movie'] for i in range(len(json_data))])

In [4]:

def download_image(url, file_path, file_name):
    full_path = file_path + file_name
    urllib.request.urlretrieve(url, full_path)

In [6]:
for index, row in df.iterrows():
    url = 'https://image.tmdb.org/t/p/w500/' + row['posterPath']
    download_image(url, 'images/', row['posterPath'])

In [7]:
my_config = Config(
    region_name = 'us-east-1',
    signature_version = 'v4',
    retries = {
        'max_attempts': 10,
        'mode': 'standard'
    }
)

bedrock = boto3.client(service_name="bedrock", config=my_config)
bedrock_runtime = boto3.client(service_name="bedrock-runtime", config=my_config)

In [8]:
def get_embedding_for_poster(image_path):
    with open(image_path, "rb") as image_file:
        input_image = base64.b64encode(image_file.read()).decode('utf8')

    body = json.dumps(
        {
            "inputImage": input_image
        }
    )

    response = bedrock_runtime.invoke_model(
        body=body, 
        modelId="amazon.titan-embed-image-v1", 
        accept="application/json", 
        contentType="application/json"       
    )

    vector_json = json.loads(response['body'].read().decode('utf8'))
    image_name = image_path.split("/")[-1].split(".")[0]

    return vector_json, image_name

def get_embedding_for_poster_and_title(image_path, title):
    with open(image_path, "rb") as image_file:
        input_image = base64.b64encode(image_file.read()).decode('utf8')

    body = json.dumps(
        {
            "inputImage": input_image,
            "inputText": title
        }
    )

    response = bedrock_runtime.invoke_model(
        body=body, 
        modelId="amazon.titan-embed-image-v1", 
        accept="application/json", 
        contentType="application/json"       
    )

    vector_json = json.loads(response['body'].read().decode('utf8'))
    image_name = image_path.split("/")[-1].split(".")[0]

    return vector_json, image_name


def get_embedding_for_text(text):
    body = json.dumps(
        {
            "inputText": text
        }
    )

    response = bedrock_runtime.invoke_model(
        body=body, 
        modelId="amazon.titan-embed-image-v1", 
        accept="application/json", 
        contentType="application/json"       
    )

    vector_json = json.loads(response['body'].read().decode('utf8'))

    return vector_json, text

In [10]:
for index, row in df.iterrows():
    image_path = 'images/' + row['posterPath']
    vector_json, image_name = get_embedding_for_poster(image_path)
    with open('embeddings/' + image_name + '.json', 'w') as f:
        json.dump(vector_json, f)

In [11]:
for index, row in df.iterrows():
    image_path = 'images/' + row['posterPath']
    vector_json, image_name = get_embedding_for_poster_and_title(image_path, row['title'])
    with open('embeddings/' + 'with_title_' + image_name + '.json', 'w') as f:
        json.dump(vector_json, f)

In [12]:
df = df.drop(columns=['dvdReleaseDate', 'backdropPaths', 'youtubeTrailerIds', 'numRatings', 'avgRating'])

In [23]:
!pip3 install opensearch-py  pip install requests-aws4auth



Defaulting to user installation because normal site-packages is not writeable
Collecting install
  Downloading install-1.3.5-py3-none-any.whl (3.2 kB)
Collecting requests-aws4auth
  Downloading requests_aws4auth-1.2.3-py2.py3-none-any.whl (24 kB)
Installing collected packages: requests-aws4auth, install
Successfully installed install-1.3.5 requests-aws4auth-1.2.3
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [35]:
import boto3
from requests_aws4auth import AWS4Auth
from opensearchpy import OpenSearch, RequestsHttpConnection

# The region where you're running the OpenSearch cluster
region = 'us-east-1'

# Create a Boto3 session which will use the IAM role's credentials
session = boto3.Session()

# Get credentials from the session
credentials = session.get_credentials()

# AWS4Auth instance to sign our requests
awsauth = AWS4Auth(
    credentials.access_key,
    credentials.secret_key,
    region,
    'aoss',
    session_token=credentials.token  # Note: session_token is only needed when using temporary credentials.
)

# OpenSearch client configuration
host = 'ou37o53xx1kq6ddquv1c.us-east-1.aoss.amazonaws.com'  # For example: 'search-mydomain.us-west-1.es.amazonaws.com'
port = 443  # OpenSearch uses port 443 for HTTPS
use_ssl = True

# Create the OpenSearch client using the Boto3 session credentials
search = OpenSearch(
    hosts=[{'host': host, 'port': port}],
    http_auth=awsauth,
    use_ssl=use_ssl,
    verify_certs=True,
    connection_class=RequestsHttpConnection
)

# Example: Get information about the OpenSearch cluster
# response = search_client.info()

# print(response)


In [40]:
# Delete the index
index_name = "multi-modal-embedding-index"
response = search.indices.delete(index=index_name)
print(response)

NotFoundError: NotFoundError(404, 'index_not_found_exception', 'no such index [multi-modal-embedding-index]')

In [38]:
mapping = {
    "settings": {
        "index.knn": True,
    },
    "mappings": {
        "properties": {
            "titan_multimodal_embedding": {
                "type": "knn_vector",
                "dimension": 1024
            },
            "title": { 
                "type": "text"            
            },
            "plotSummary": { 
                "type": "text"            
            },
            "movieId": { 
                "type": "keyword"            
            },
            "imdbMovieId": { 
                "type": "keyword"            
            },
            "posterPath": { 
                "type": "text"            
            },
        }
    }
}

In [41]:
# Create the index
index_name = "multi-modal-embedding-index"
response = search.indices.create(index=index_name, body=mapping)

In [42]:
def create_document_from_row(row):

    embedding_file = 'embeddings/with_title_' + row['posterPath'].split("/")[-1].split(".")[0] + '.json'
    with open(embedding_file) as json_file:
        data = json.load(json_file)

    document = {
        "titan_multimodal_embedding": data['embedding'],
        "title": row['title'],
        "plotSummary": row['plotSummary'],
        "movieId": row['movieId'],
        "imdbMovieId": row['imdbMovieId'],
        "posterPath": row['posterPath']        
    }
    return document

In [43]:
for index, row in df.iterrows():
    document = create_document_from_row(row)
    response = search.index(index="multi-modal-embedding-index", body=document)
    print(response)

{'_index': 'multi-modal-embedding-index', '_id': '1%3A0%3AjcRfV4wBrGqo3jb5pGd5', '_version': 1, 'result': 'created', '_shards': {'total': 0, 'successful': 0, 'failed': 0}, '_seq_no': 0, '_primary_term': 0}
{'_index': 'multi-modal-embedding-index', '_id': '1%3A0%3A8-1fV4wB9kdO_KBxrUa6', '_version': 1, 'result': 'created', '_shards': {'total': 0, 'successful': 0, 'failed': 0}, '_seq_no': 0, '_primary_term': 0}
{'_index': 'multi-modal-embedding-index', '_id': '1%3A0%3AjsRfV4wBrGqo3jb5sGeE', '_version': 1, 'result': 'created', '_shards': {'total': 0, 'successful': 0, 'failed': 0}, '_seq_no': 0, '_primary_term': 0}
{'_index': 'multi-modal-embedding-index', '_id': '1%3A0%3A9O1fV4wB9kdO_KBxskZ6', '_version': 1, 'result': 'created', '_shards': {'total': 0, 'successful': 0, 'failed': 0}, '_seq_no': 0, '_primary_term': 0}
{'_index': 'multi-modal-embedding-index', '_id': '1%3A0%3Aj8RfV4wBrGqo3jb5tWeL', '_version': 1, 'result': 'created', '_shards': {'total': 0, 'successful': 0, 'failed': 0}, '_se