In [1]:
import os
import json
import pandas as pd
import urllib.request
import boto3
import base64
import requests

from requests.auth import HTTPBasicAuth
from botocore.config import Config

In [2]:
# Load every MovieLens search-result page found in movielens/*.json and flatten
# the per-file result lists into one list (json_data).
# sorted(): os.listdir returns entries in arbitrary order, so sort the file
# names to keep the row order (and everything derived from it) reproducible.
json_files = sorted(
    pos_json for pos_json in os.listdir('movielens/') if pos_json.endswith('.json')
)
json_data = []
for js in json_files:
    with open(os.path.join('movielens/', js), encoding='utf-8') as json_file:
        # Index the keys directly: a file without 'data' or 'searchResults'
        # then fails with a KeyError naming the missing key instead of an
        # opaque AttributeError/TypeError on None.
        json_text = json.load(json_file)['data']['searchResults']
    json_data.extend(json_text)

In [3]:
# One DataFrame row per search result, built from each result's 'movie' record.
df = pd.DataFrame([result['movie'] for result in json_data])

In [5]:

def download_image(url, file_path, file_name):
    """Download ``url`` and save it as ``file_path + file_name``.

    Args:
        url: Address of the resource to fetch.
        file_path: Destination directory prefix. It is concatenated with
            ``file_name`` as-is, so it should end with a path separator.
        file_name: Name of the file to write below ``file_path``.

    Raises:
        urllib.error.URLError: (or a subclass such as ``HTTPError``) if the
            resource cannot be retrieved.
    """
    full_path = file_path + file_name
    # urlretrieve does not create missing directories, so make sure the target
    # folder exists first (e.g. a fresh checkout without the output folder).
    parent_dir = os.path.dirname(full_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    urllib.request.urlretrieve(url, full_path)

In [7]:
# Fetch the w500 rendition of every movie poster into images/.
# dropna(): rows without a poster (missing posterPath), if any, are skipped;
# concatenating a missing value to the base URL would raise a TypeError.
for poster_path in df['posterPath'].dropna():
    # posterPath values carry their own leading '/', so strip it before
    # appending; otherwise the request goes to '.../w500//<file>'.
    url = 'https://image.tmdb.org/t/p/w500/' + poster_path.lstrip('/')
    download_image(url, 'images/', poster_path)

In [9]:
from dotenv import load_dotenv

# Make the settings from a local .env file (e.g. 'profile') visible in os.environ.
load_dotenv()

# Shared botocore settings for the AWS clients created below: fixed region,
# SigV4 signing and up to 10 attempts per call in 'standard' retry mode.
boto_config = Config(
    region_name='us-west-2',
    signature_version='v4',
    retries={'max_attempts': 10, 'mode': 'standard'},
)

# The AWS profile name is read from the 'profile' environment variable; when it
# is unset, None is passed through to boto3.
session = boto3.Session(profile_name=os.getenv('profile'))

bedrock = session.client(service_name="bedrock", config=boto_config)
bedrock_runtime = session.client(service_name="bedrock-runtime", config=boto_config)


In [10]:
# Model used for every embedding request in this notebook.
_TITAN_MODEL_ID = "amazon.titan-embed-image-v1"


def _read_image_as_base64(image_path):
    """Return the bytes of the file at ``image_path`` as a base64 string."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf8')


def _image_name_from_path(image_path):
    """Return the last '/'-separated component of ``image_path`` up to its first '.'."""
    # Kept as plain string splitting on purpose: the same rule is used elsewhere
    # in the notebook to locate the saved embedding files, so both must agree.
    return image_path.split("/")[-1].split(".")[0]


def _invoke_titan_embedding(payload):
    """Send ``payload`` as JSON to the embedding model and return the parsed JSON reply."""
    response = bedrock_runtime.invoke_model(
        body=json.dumps(payload),
        modelId=_TITAN_MODEL_ID,
        accept="application/json",
        contentType="application/json",
    )
    return json.loads(response['body'].read().decode('utf8'))


def get_embedding_for_poster(image_path):
    """Embed a poster image on its own.

    Args:
        image_path: Path of the image file to embed.

    Returns:
        A ``(vector_json, image_name)`` tuple: the parsed model response (the
        notebook reads its ``'embedding'`` key) and the image file name up to
        its first ``'.'``.
    """
    vector_json = _invoke_titan_embedding(
        {"inputImage": _read_image_as_base64(image_path)}
    )
    return vector_json, _image_name_from_path(image_path)


def get_embedding_for_poster_and_title(image_path, title):
    """Embed a poster image together with the movie title in a single request.

    Args:
        image_path: Path of the image file to embed.
        title: Text sent alongside the image as ``inputText``.

    Returns:
        A ``(vector_json, image_name)`` tuple: the parsed model response and
        the image file name up to its first ``'.'``.
    """
    vector_json = _invoke_titan_embedding(
        {
            "inputImage": _read_image_as_base64(image_path),
            "inputText": title,
        }
    )
    return vector_json, _image_name_from_path(image_path)


def get_embedding_for_text(text):
    """Embed a piece of text with the same model used for the posters.

    Args:
        text: Text to embed.

    Returns:
        A ``(vector_json, text)`` tuple: the parsed model response and the
        input text, unchanged.
    """
    vector_json = _invoke_titan_embedding({"inputText": text})
    return vector_json, text

In [12]:
# Embed every poster image on its own and store the raw model response as
# embeddings/<image name>.json.
for poster_path in df['posterPath']:
    image_path = 'images/' + poster_path
    vector_json, image_name = get_embedding_for_poster(image_path)
    embedding_file = 'embeddings/' + image_name + '.json'
    with open(embedding_file, 'w') as f:
        json.dump(vector_json, f)

In [13]:
# Embed every poster together with its title and store the raw model response
# as embeddings/with_title_<image name>.json.
for poster_path, title in zip(df['posterPath'], df['title']):
    image_path = 'images/' + poster_path
    vector_json, image_name = get_embedding_for_poster_and_title(image_path, title)
    embedding_file = 'embeddings/' + 'with_title_' + image_name + '.json'
    with open(embedding_file, 'w') as f:
        json.dump(vector_json, f)

In [14]:
# Drop the columns that the indexing step does not read.
# errors='ignore' keeps this cell re-runnable: df is reassigned here, so on a
# second run the columns are already gone and a plain drop() raises KeyError.
df = df.drop(
    columns=['dvdReleaseDate', 'backdropPaths', 'youtubeTrailerIds', 'numRatings', 'avgRating'],
    errors='ignore',
)

In [15]:
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth, helpers

session = boto3.Session(profile_name=os.environ.get('profile'))

# boto3 client for the OpenSearch Serverless service API. It has its own name
# so that it is not silently overwritten by the OpenSearch search client
# below; previously both were bound to `client` and this one was lost.
aoss_client = session.client("opensearchserverless", config=boto_config)

# Collection endpoint and SigV4 signing parameters for the search client.
host = "uhyhuh4pzdqdx2goan62.us-west-2.aoss.amazonaws.com"
region = "us-west-2"
service = "aoss"
credentials = session.get_credentials()
auth = AWSV4SignerAuth(credentials, region, service)

# Search client used by the rest of the notebook for index and search calls.
client = OpenSearch(
    hosts = [{"host": host, "port": 443}],
    http_auth = auth,
    use_ssl = True,
    verify_certs = True,
    connection_class = RequestsHttpConnection,
    pool_maxsize = 20
)

index_name = "multi-modal-embedding-index"

In [16]:
from opensearchpy.exceptions import RequestError

# Index definition: a 1024-dimensional k-NN vector field for the embedding plus
# the movie metadata fields that are returned with search hits.
index_body = {
    "mappings": {
        "properties": {
            "titan_multimodal_embedding": {
                "type": "knn_vector",
                "dimension": 1024
            },
            "title": {
                "type": "text"
            },
            "plotSummary": {
                "type": "text"
            },
            "movieId": {
                "type": "keyword"
            },
            "imdbMovieId": {
                "type": "keyword"
            },
            "posterPath": {
                "type": "text"
            },
        }
    },
    "settings": {
        "index": {
            "number_of_shards": 2,
            "knn.algo_param": {"ef_search": 512},
            "knn": True,
        }
    },
}

# Create the index. Only the "index already exists" case is tolerated so the
# cell can be re-run; any other failure (authentication, invalid mapping,
# network) is raised instead of being printed and ignored, because the later
# cells must not run against a missing or wrongly mapped index.
# To rebuild from scratch, call client.indices.delete(index=index_name) first.
try:
    response = client.indices.create(index=index_name, body=index_body)
    print(json.dumps(response, indent=2))
except RequestError as ex:
    if ex.error != 'resource_already_exists_exception':
        raise
    print(ex)

{
  "acknowledged": true,
  "shards_acknowledged": true,
  "index": "multi-modal-embedding-index"
}


In [17]:
def create_document_from_row(row):
    """Build the document to index for one movie row.

    The embedding is read from ``embeddings/with_title_<name>.json``, where
    ``<name>`` is the file name in ``row['posterPath']`` up to its first '.'.

    Args:
        row: Mapping-like movie record providing 'posterPath', 'title',
            'plotSummary', 'movieId' and 'imdbMovieId'.

    Returns:
        dict with the stored embedding under 'titan_multimodal_embedding'
        followed by the metadata fields copied from ``row``.
    """
    poster_file = row['posterPath'].split("/")[-1]
    poster_stem = poster_file.split(".")[0]

    with open('embeddings/with_title_' + poster_stem + '.json') as embedding_fh:
        stored = json.load(embedding_fh)

    document = {"titan_multimodal_embedding": stored['embedding']}
    for field in ("title", "plotSummary", "movieId", "imdbMovieId", "posterPath"):
        document[field] = row[field]
    return document

In [25]:
# Index the movies one request at a time and echo every response.
for _, movie_row in df.iterrows():
    document = create_document_from_row(movie_row)
    response = client.index(index=index_name, body=document, timeout=60)
    print('\nAdding document:')
    print(response)


Adding document:
{'_index': 'multi-modal-embedding-index', '_id': '1%3A0%3AeHGqAY0BrUVuMHllvs1b', '_version': 1, 'result': 'created', '_shards': {'total': 0, 'successful': 0, 'failed': 0}, '_seq_no': 0, '_primary_term': 0}

Adding document:
{'_index': 'multi-modal-embedding-index', '_id': '1%3A0%3AeLmqAY0BuHU76EbXwNU4', '_version': 1, 'result': 'created', '_shards': {'total': 0, 'successful': 0, 'failed': 0}, '_seq_no': 0, '_primary_term': 0}

Adding document:
{'_index': 'multi-modal-embedding-index', '_id': '1%3A0%3AeXGqAY0BrUVuMHllws0C', '_version': 1, 'result': 'created', '_shards': {'total': 0, 'successful': 0, 'failed': 0}, '_seq_no': 0, '_primary_term': 0}

Adding document:
{'_index': 'multi-modal-embedding-index', '_id': '1%3A0%3AebmqAY0BuHU76EbXw9Wi', '_version': 1, 'result': 'created', '_shards': {'total': 0, 'successful': 0, 'failed': 0}, '_seq_no': 0, '_primary_term': 0}

Adding document:
{'_index': 'multi-modal-embedding-index', '_id': '1%3A0%3AenGqAY0BrUVuMHllxc1E', '_ver

In [26]:
def get_embedding_for_text(text):
    """Embed ``text`` with the amazon.titan-embed-image-v1 model.

    Args:
        text: Text to embed.

    Returns:
        A ``(vector_json, text)`` tuple: the parsed JSON response of the model
        and the input text, unchanged.
    """
    # NOTE(review): an earlier cell already defines get_embedding_for_text with
    # the same behavior; this re-definition shadows it.
    request_body = json.dumps({"inputText": text})

    raw_response = bedrock_runtime.invoke_model(
        modelId="amazon.titan-embed-image-v1",
        contentType="application/json",
        accept="application/json",
        body=request_body,
    )

    response_text = raw_response['body'].read().decode('utf8')
    return json.loads(response_text), text

In [27]:
def query(text, n=5):
    """Search the index for the movies closest to ``text``.

    The text is embedded with ``get_embedding_for_text`` and the resulting
    vector is used in a k-NN query on the 'titan_multimodal_embedding' field.

    Args:
        text: Free-text search string.
        n: Used both as the k-NN ``k`` and as the result ``size``.

    Returns:
        The response of ``client.search`` for the k-NN query.
    """
    embedding_response = get_embedding_for_text(text)[0]

    knn_clause = {
        "vector": embedding_response['embedding'],
        "k": n,
    }
    search_body = {
        "size": n,
        "query": {"knn": {"titan_multimodal_embedding": knn_clause}},
        "_source": ["movieId", "title", "imdbMovieId", "posterPath", "plotSummary"],
    }

    return client.search(body=search_body, index=index_name, timeout=60)


In [28]:
# Example search: free-text query against the multimodal index.
query_str = 'Nuclear physicist'
response = query(query_str)

# Show only the matched documents from the search response.
response['hits']['hits']

{'took': 24,
 'timed_out': False,
 '_shards': {'total': 0, 'successful': 0, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 5, 'relation': 'eq'},
  'max_score': 0.5717137,
  'hits': [{'_index': 'multi-modal-embedding-index',
    '_id': '1%3A0%3AjnGrAY0BrUVuMHllB81S',
    '_score': 0.5717137,
    '_source': {'imdbMovieId': '15398776',
     'plotSummary': "The story of J. Robert Oppenheimer's role in the development of the atomic bomb during World War II.",
     'movieId': 287699,
     'title': 'Oppenheimer',
     'posterPath': '/8Gxv8gSFCU0XGDykEGv7zR1n2ua.jpg'}},
   {'_index': 'multi-modal-embedding-index',
    '_id': '1%3A0%3Ae3GqAY0BrUVuMHllyM3O',
    '_score': 0.5489049,
    '_source': {'imdbMovieId': '5884796',
     'plotSummary': 'After a heroic job of successfully landing his storm-damaged aircraft in a war zone, a fearless pilot finds himself between the agendas of multiple militias planning to take the plane and its passengers hostage.',
     'movieId': 282671,
     't