In [19]:
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth, helpers
import boto3
from botocore.config import Config
import json
from dotenv import load_dotenv
import os

load_dotenv()

boto_config = Config(
    region_name = 'us-west-2',
    signature_version = 'v4',
    retries = {
        'max_attempts': 10,
        'mode': 'standard'
    }
)

session = boto3.Session(profile_name=os.environ.get('profile'))

client = session.client("opensearchserverless", config=boto_config)
 
host = "uhyhuh4pzdqdx2goan62.us-west-2.aoss.amazonaws.com"
region = "us-west-2"
service = "aoss"
credentials = session.get_credentials()
auth = AWSV4SignerAuth(credentials, region, service)
 
client = OpenSearch(
    hosts = [{"host": host, "port": 443}],
    http_auth = auth,
    use_ssl = True,
    verify_certs = True,
    connection_class = RequestsHttpConnection,
    pool_maxsize = 20
)
 
index_name = "oscars-index"
index_body = {
    "mappings": {
        "properties": {
            "nominee_text": {"type": "text"},
            "nominee_vector": {
                "type": "knn_vector",
                "dimension": 1536,
                "method": {
                    "engine": "nmslib",
                    "space_type": "cosinesimil",
                    "name": "hnsw",
                    "parameters": {"ef_construction": 512, "m": 16},
                },
            },
        }
    },
    "settings": {
        "index": {
            "number_of_shards": 2,
            "knn.algo_param": {"ef_search": 512},
            "knn": True,
        }
    },
}
 
try:
    # response = client.indices.delete(index_name)
    # print(json.dumps(response, indent=2))
    response = client.indices.create(index_name, body=index_body)
    print(json.dumps(response, indent=2))
except Exception as ex:
    print(ex)

{
  "acknowledged": true
}
{
  "acknowledged": true,
  "shards_acknowledged": true,
  "index": "oscars-index"
}


In [12]:
import pandas as pd
df=pd.read_csv('./data/oscars.csv')
df.head()
df=df.loc[df['year_ceremony'] == 2023]
df=df.dropna(subset=['film'])
df['category'] = df['category'].str.lower()
df.head()
df['text'] = df['name'] + ' got nominated under the category, ' + df['category'] + ', for the film ' + df['film'] + ' to win the award'
df.loc[df['winner'] == False, 'text'] = df['name'] + ' got nominated under the category, ' + df['category'] + ', for the film ' + df['film'] + ' but did not win'
df.head()['text']

10639    Austin Butler got nominated under the category...
10640    Colin Farrell got nominated under the category...
10641    Brendan Fraser got nominated under the categor...
10642    Paul Mescal got nominated under the category, ...
10643    Bill Nighy got nominated under the category, a...
Name: text, dtype: object

In [15]:
bedrock = session.client('bedrock-runtime')

def text_embedding(text):
    body=json.dumps({"inputText": text})
    response = bedrock.invoke_model(body=body, modelId='amazon.titan-embed-text-v1', accept='application/json', contentType='application/json')
    response_body = json.loads(response.get('body').read())
    embedding = response_body.get('embedding')
    return embedding
df=df.assign(embedding=(df["text"].apply(lambda x : text_embedding(x))))


In [14]:

# bedrock_static = session.client('bedrock')
# response = bedrock_static.list_foundation_models(
#     byOutputModality='EMBEDDING'
# )
# print(response)

{'ResponseMetadata': {'RequestId': 'e39d8e71-0e39-4eb8-9543-04f9ab182d1a', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Fri, 12 Jan 2024 11:55:37 GMT', 'content-type': 'application/json', 'content-length': '2806', 'connection': 'keep-alive', 'x-amzn-requestid': 'e39d8e71-0e39-4eb8-9543-04f9ab182d1a'}, 'RetryAttempts': 0}, 'modelSummaries': [{'modelArn': 'arn:aws:bedrock:us-east-1::foundation-model/amazon.titan-embed-g1-text-02', 'modelId': 'amazon.titan-embed-g1-text-02', 'modelName': 'Titan Text Embeddings v2', 'providerName': 'Amazon', 'inputModalities': ['TEXT'], 'outputModalities': ['EMBEDDING'], 'customizationsSupported': [], 'inferenceTypesSupported': ['ON_DEMAND'], 'modelLifecycle': {'status': 'ACTIVE'}}, {'modelArn': 'arn:aws:bedrock:us-east-1::foundation-model/amazon.titan-embed-text-v1:2:8k', 'modelId': 'amazon.titan-embed-text-v1:2:8k', 'modelName': 'Titan Embeddings G1 - Text', 'providerName': 'Amazon', 'inputModalities': ['TEXT'], 'outputModalities': ['EMBEDDING'], 'res

In [17]:
# df.head()

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner,text,embedding
10639,2022,2023,95,actor in a leading role,Austin Butler,Elvis,False,Austin Butler got nominated under the category...,"[-0.109375, -0.16601562, 0.25390625, -0.621093..."
10640,2022,2023,95,actor in a leading role,Colin Farrell,The Banshees of Inisherin,False,Colin Farrell got nominated under the category...,"[-0.46484375, -0.055664062, 0.23828125, -0.761..."
10641,2022,2023,95,actor in a leading role,Brendan Fraser,The Whale,True,Brendan Fraser got nominated under the categor...,"[0.26757812, -0.56640625, 0.22265625, -0.40429..."
10642,2022,2023,95,actor in a leading role,Paul Mescal,Aftersun,False,"Paul Mescal got nominated under the category, ...","[-0.43359375, 0.032714844, 0.28125, -0.5390625..."
10643,2022,2023,95,actor in a leading role,Bill Nighy,Living,False,"Bill Nighy got nominated under the category, a...","[-0.43554688, -0.21289062, 0.18359375, -0.4335..."


In [23]:

def add_document(vector,text):
    document = {
      "nominee_vector": vector,
      "nominee_text": text
    }
    # print(document)
    response = client.index(
        index = 'oscars-index',
        body = document,
        timeout = 60
    )
    print('\nAdding document:')
    print(response) 
df.apply(lambda row: add_document(row['embedding'], row['text']), axis=1)



Adding document:
{'_index': 'oscars-index', '_id': '1%3A0%3AH3GV_YwBrUVuMHll6s3G', '_version': 1, 'result': 'created', '_shards': {'total': 0, 'successful': 0, 'failed': 0}, '_seq_no': 0, '_primary_term': 0}

Adding document:
{'_index': 'oscars-index', '_id': '1%3A0%3AILmV_YwBuHU76EbX7dUK', '_version': 1, 'result': 'created', '_shards': {'total': 0, 'successful': 0, 'failed': 0}, '_seq_no': 0, '_primary_term': 0}

Adding document:
{'_index': 'oscars-index', '_id': '1%3A0%3AIHGV_YwBrUVuMHll7s3a', '_version': 1, 'result': 'created', '_shards': {'total': 0, 'successful': 0, 'failed': 0}, '_seq_no': 0, '_primary_term': 0}

Adding document:
{'_index': 'oscars-index', '_id': '1%3A0%3AIbmV_YwBuHU76EbX8NWl', '_version': 1, 'result': 'created', '_shards': {'total': 0, 'successful': 0, 'failed': 0}, '_seq_no': 0, '_primary_term': 0}

Adding document:
{'_index': 'oscars-index', '_id': '1%3A0%3AIXGV_YwBrUVuMHll8s2t', '_version': 1, 'result': 'created', '_shards': {'total': 0, 'successful': 0, 'fa

10639    None
10640    None
10641    None
10642    None
10643    None
         ... 
10755    None
10756    None
10757    None
10758    None
10764    None
Length: 121, dtype: object

In [24]:
def search_index(vector):
    document = {
        "size": 15,
        "_source": {"excludes": ["nominee_vector"]},
        "query": {
            "knn": {
                 "nominee_vector": {
                     "vector": vector,
                     "k":15
                 }
            }
        }
    }
    response = client.search(
        body = document,
        index = "oscars-index",
        timeout = 60
    )
    return response

query='who won the award for best music?'
vector=text_embedding(query)

response=search_index(vector)
response['hits']['hits']

[{'_index': 'oscars-index',
  '_id': '1%3A0%3AQLmW_YwBuHU76EbXXNX_',
  '_score': 0.7270102,
  '_source': {'nominee_text': 'Volker Bertelmann got nominated under the category, music (original score), for the film All Quiet on the Western Front to win the award'}},
 {'_index': 'oscars-index',
  '_id': '1%3A0%3AQHGW_YwBrUVuMHllXs3G',
  '_score': 0.7232194,
  '_source': {'nominee_text': 'Justin Hurwitz got nominated under the category, music (original score), for the film Babylon but did not win'}},
 {'_index': 'oscars-index',
  '_id': '1%3A0%3ARLmW_YwBuHU76EbXa9Wu',
  '_score': 0.7190479,
  '_source': {'nominee_text': 'Music by M.M. Keeravaani; Lyric by Chandrabose got nominated under the category, music (original song), for the film RRR to win the award'}},
 {'_index': 'oscars-index',
  '_id': '1%3A0%3AQ3GW_YwBrUVuMHllac3c',
  '_score': 0.7148727,
  '_source': {'nominee_text': 'Music by Tems, Rihanna, Ryan Coogler and Ludwig Goransson; Lyric by Tems and Ryan Coogler got nominated under t

In [30]:
query='which film has its name related to animals?'
vector=text_embedding(query)

response=search_index(vector)
response['hits']['hits']

[{'_index': 'oscars-index',
  '_id': '1%3A0%3AK7mW_YwBuHU76EbXFNW2',
  '_score': 0.6087976,
  '_source': {'nominee_text': 'Chris Williams and Jed Schlanger got nominated under the category, animated feature film, for the film The Sea Beast but did not win'}},
 {'_index': 'oscars-index',
  '_id': '1%3A0%3AKrmW_YwBuHU76EbXEdVO',
  '_score': 0.5999809,
  '_source': {'nominee_text': 'Dean Fleischer Camp, Elisabeth Holm, Andrew Goldman, Caroline Kaplan and Paul Mezey got nominated under the category, animated feature film, for the film Marcel the Shell with Shoes On but did not win'}},
 {'_index': 'oscars-index',
  '_id': '1%3A0%3AKnGW_YwBrUVuMHllEs3z',
  '_score': 0.5870316,
  '_source': {'nominee_text': 'Joel Crawford and Mark Swift got nominated under the category, animated feature film, for the film Puss in Boots: The Last Wish but did not win'}},
 {'_index': 'oscars-index',
  '_id': '1%3A0%3ATHGW_YwBrUVuMHlliM3-',
  '_score': 0.5794037,
  '_source': {'nominee_text': 'Charlie Mackesy an