### OpenSearch Serverless

In [1]:
!pip install opensearch-py

Collecting opensearch-py
  Downloading opensearch_py-2.6.0-py2.py3-none-any.whl.metadata (7.0 kB)
Collecting Events (from opensearch-py)
  Downloading Events-0.5-py3-none-any.whl.metadata (3.9 kB)
Downloading opensearch_py-2.6.0-py2.py3-none-any.whl (311 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Events-0.5-py3-none-any.whl (6.8 kB)
Installing collected packages: Events, opensearch-py
Successfully installed Events-0.5 opensearch-py-2.6.0


In [None]:
# https://github.com/janakiramm/rag-bedrock-titan/blob/main/Part-1.ipynb
# https://docs.aws.amazon.com/pdfs/bedrock/latest/userguide/bedrock-ug.pdf#page=2&zoom=auto,-88,766
# https://opensearch.org/docs/latest/clients/python-low-level/
# https://opensearch.org/docs/latest/api-reference/index-apis/create-index/

In [114]:
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth, helpers
import boto3
from botocore.config import Config
import json

### Initialize and configure Boto Client for Bedrock

In [133]:
bedrock = boto3.client(
 service_name='bedrock-runtime',
 region_name='us-east-1',
 endpoint_url='https://bedrock.us-east-1.amazonaws.com'
)

### Initialize and configure Boto Client for Amazon Open Search Serverless

In [134]:
boto_config = Config(
    region_name = 'us-east-1',
    signature_version = 'v4',
    retries = {
        'max_attempts': 10,
        'mode': 'standard'
    }
)

In [135]:
client = boto3.client("opensearchserverless",config=boto_config)

In [94]:
#credentials = boto3.Session().get_credentials()

In [1]:
#print(credentials.access_key)
# ASIAS3VMPIBXGF73SVEX

In [2]:
#print(credentials.secret_key)
# aTegcp5MqVrzJPz4wZkFqfK1DfcSVjyeW67ezKvK

### Define encryption policy

In [119]:
policy = """{
                "Rules": [
                    {
                        "Resource": [
                            "collection/oscars-collection"
                        ],
                        "ResourceType": "collection"
                    }
                ],
                "AWSOwnedKey": true
            }"""

try:
    response = client.create_security_policy(
        description="oscars collection encryption security policy",
        name="oscars-encryption-policy",
        policy=policy,
        type="encryption",
    )
    print(json.dumps(response, indent=2))
except Exception as ex:
    print(ex)

An error occurred (ConflictException) when calling the CreateSecurityPolicy operation: Policy with name oscars-encryption-policy and type encryption already exists


In [None]:
#An error occurred (ServiceQuotaExceededException) when calling the CreateSecurityPolicy operation: Your request exceeds the limit for encryption policies in this AWS account. The limit is 50.

### Define network policy

In [13]:
policy = """[
                {
                    "Rules": [
                        {
                            "Resource": [
                                "collection/oscars-collection"
                            ],
                            "ResourceType": "collection"
                        }
                    ],
                    "AllowFromPublic": true
                }
            ]"""

try:
    response = client.create_security_policy(
        description="oscars collection network security policy",
        name="oscars-network-policy",
        policy=policy,
        type="network",
    )
    print(json.dumps(response, indent=2))
except Exception as ex:
    print(ex)

{
  "securityPolicyDetail": {
    "createdDate": 1722891509216,
    "description": "oscars collection network security policy",
    "lastModifiedDate": 1722891509216,
    "name": "oscars-network-policy",
    "policy": [
      {
        "Rules": [
          {
            "Resource": [
              "collection/oscars-collection"
            ],
            "ResourceType": "collection"
          }
        ],
        "AllowFromPublic": true
      }
    ],
    "policyVersion": "MTcyMjg5MTUwOTIxNl8x",
    "type": "network"
  },
  "ResponseMetadata": {
    "RequestId": "b8e9a6e8-d9ce-4874-bf8b-9dc1b15a2610",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "x-amzn-requestid": "b8e9a6e8-d9ce-4874-bf8b-9dc1b15a2610",
      "date": "Mon, 05 Aug 2024 20:58:29 GMT",
      "content-type": "application/x-amz-json-1.0",
      "content-length": "352",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


### Define data access policy

In [14]:
#replace the IAM principle with your own
policy = """[
                {
                    "Rules": [
                        {
                            "Resource": [
                                "collection/*"
                            ],
                            "Permission": [
                                "aoss:*"
                            ],
                            "ResourceType": "collection"
                        },
                        {
                            "Resource": [
                                "index/*/*"
                            ],
                            "Permission": [
                                "aoss:*"
                            ],
                            "ResourceType": "index"
                        }
                    ],
                    "Principal": [
                        "arn:aws:iam::196856463470:role/SandboxLoginRole" 
                    ],
                    "Description": "Rule 1"
                }
            ]"""

try:
    response = client.create_access_policy(
        description="oscars collection data access policy",
        name="oscars-data-access-policy",
        policy=policy,
        type="data"
    )
    print(json.dumps(response, indent=2))
except Exception as ex:
    print(ex)


{
  "accessPolicyDetail": {
    "createdDate": 1722891808340,
    "description": "oscars collection data access policy",
    "lastModifiedDate": 1722891808340,
    "name": "oscars-data-access-policy",
    "policy": [
      {
        "Rules": [
          {
            "Resource": [
              "collection/*"
            ],
            "Permission": [
              "aoss:*"
            ],
            "ResourceType": "collection"
          },
          {
            "Resource": [
              "index/*/*"
            ],
            "Permission": [
              "aoss:*"
            ],
            "ResourceType": "index"
          }
        ],
        "Principal": [
          "arn:aws:iam::196856463470:role/SandboxLoginRole"
        ],
        "Description": "Rule 1"
      }
    ],
    "policyVersion": "MTcyMjg5MTgwODM0MF8x",
    "type": "data"
  },
  "ResponseMetadata": {
    "RequestId": "665fbbe5-9496-4003-8704-e9c868b64640",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "x-am

In [136]:
#replace the IAM principle with your own
# Create a second Data Access Policy with SandboxServiceRole/Sagemaker
policy2 = """[
                {
                    "Rules": [
                        {
                            "Resource": [
                                "collection/*"
                            ],
                            "Permission": [
                                "aoss:*"
                            ],
                            "ResourceType": "collection"
                        },
                        {
                            "Resource": [
                                "index/*/*"
                            ],
                            "Permission": [
                                "aoss:*"
                            ],
                            "ResourceType": "index"
                        }
                    ],
                    "Principal": [
                        "arn:aws:sts::196856463470:assumed-role/SandboxServiceRole/SageMaker" 
                    ],
                    "Description": "Rule 1"
                }
            ]"""

try:
    response = client.create_access_policy(
        description="oscars collection data access policy",
        name="oscars-data-access-policy2",
        policy=policy2,
        type="data"
    )
    print(json.dumps(response, indent=2))
except Exception as ex:
    print(ex)


{
  "accessPolicyDetail": {
    "createdDate": 1723056064645,
    "description": "oscars collection data access policy",
    "lastModifiedDate": 1723056064645,
    "name": "oscars-data-access-policy2",
    "policy": [
      {
        "Rules": [
          {
            "Resource": [
              "collection/*"
            ],
            "Permission": [
              "aoss:*"
            ],
            "ResourceType": "collection"
          },
          {
            "Resource": [
              "index/*/*"
            ],
            "Permission": [
              "aoss:*"
            ],
            "ResourceType": "index"
          }
        ],
        "Principal": [
          "arn:aws:sts::196856463470:assumed-role/SandboxServiceRole/SageMaker"
        ],
        "Description": "Rule 1"
      }
    ],
    "policyVersion": "MTcyMzA1NjA2NDY0NV8x",
    "type": "data"
  },
  "ResponseMetadata": {
    "RequestId": "25e23cb8-f0ce-4fa0-9adc-caf85fc20add",
    "HTTPStatusCode": 200,
    "HTTPHe

### Create the collection

In [15]:
# create vector search collection
try:
    response = client.create_collection(
        description="oscars collection",
        name="oscars-collection",
        type="VECTORSEARCH"
    )
    print(json.dumps(response, indent=2))
except Exception as ex:
    print(ex)

{
  "createCollectionDetail": {
    "arn": "arn:aws:aoss:us-east-1:196856463470:collection/h729aiqjy4d3al1tih1i",
    "createdDate": 1722891852459,
    "description": "oscars collection",
    "id": "h729aiqjy4d3al1tih1i",
    "kmsKeyArn": "auto",
    "lastModifiedDate": 1722891852459,
    "name": "oscars-collection",
    "standbyReplicas": "ENABLED",
    "status": "CREATING",
    "type": "VECTORSEARCH"
  },
  "ResponseMetadata": {
    "RequestId": "0d965217-9b24-4b82-93b3-1b5147f12821",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "x-amzn-requestid": "0d965217-9b24-4b82-93b3-1b5147f12821",
      "date": "Mon, 05 Aug 2024 21:04:12 GMT",
      "content-type": "application/x-amz-json-1.0",
      "content-length": "343",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


### Initialize and configure OpenSearch client

In [None]:
#https://h729aiqjy4d3al1tih1i.us-east-1.aoss.amazonaws.com

In [108]:
!pip install requests-aws4auth

Collecting requests-aws4auth
  Downloading requests_aws4auth-1.3.1-py3-none-any.whl.metadata (18 kB)
Downloading requests_aws4auth-1.3.1-py3-none-any.whl (24 kB)
Installing collected packages: requests-aws4auth
Successfully installed requests-aws4auth-1.3.1


In [120]:
from requests_aws4auth import AWS4Auth

In [137]:
host = "h729aiqjy4d3al1tih1i.us-east-1.aoss.amazonaws.com" #replace this with the value from the AWS Management Console
#https://h729aiqjy4d3al1tih1i.us-east-1.aoss.amazonaws.com
region = "us-east-1"
service = "aoss"
credentials = boto3.Session().get_credentials()
auth = AWSV4SignerAuth(credentials, region, service)


In [143]:
boto3.Session().client("sts").get_caller_identity()

{'UserId': 'AROAS3VMPIBXEXNOJSQFN:SageMaker',
 'Account': '196856463470',
 'Arn': 'arn:aws:sts::196856463470:assumed-role/SandboxServiceRole/SageMaker',
 'ResponseMetadata': {'RequestId': '42f7c4a1-eb8e-46fd-a3a2-dc68b31a2818',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '42f7c4a1-eb8e-46fd-a3a2-dc68b31a2818',
   'content-type': 'text/xml',
   'content-length': '443',
   'date': 'Wed, 07 Aug 2024 18:56:06 GMT'},
  'RetryAttempts': 0}}

In [131]:
#response = boto3.Session().client("sts").assume_role(RoleArn= "arn:aws:iam::196856463470:role/SandboxLoginRole", RoleSessionName= "oscar")

ClientError: An error occurred (AccessDenied) when calling the AssumeRole operation: User: arn:aws:sts::196856463470:assumed-role/SandboxServiceRole/SageMaker is not authorized to perform: sts:AssumeRole on resource: arn:aws:iam::196856463470:role/SandboxLoginRole

In [None]:
# ClientError: An error occurred (AccessDenied) when calling the AssumeRole operation: User: arn:aws:sts::196856463470:assumed-role/SandboxServiceRole/SageMaker is not authorized to perform: sts:AssumeRole on resource: arn:aws:iam::196856463470:role/SandboxLoginRole

In [3]:
credentials.access_key, credentials.secret_key
# ('ASIAS3VMPIBXEQOSQOND', 'mPcsif9wdqdvRtdi0jH7VwjLy7N4IzIAiqwmOUWj')

NameError: name 'credentials' is not defined

In [110]:
# aws4auth = AWS4Auth(credentials.access_key, credentials.secret_key,
#                     region, service, session_token = credentials.token)

In [111]:
# ops_client = OpenSearch(
#     hosts = [{'host': host, 'port': 443}],
#     http_auth = aws4auth,
#     use_ssl = True,
#     verify_certs = True,
#     connection_class = RequestsHttpConnection,
#     timeout = 300
# )

In [139]:
client_os = OpenSearch(
    hosts = [{"host": host, "port": 443}],
    http_auth = auth,
    use_ssl = True,
    verify_certs = True,
    connection_class = RequestsHttpConnection,
    pool_maxsize = 20
)

In [140]:
client_os

<OpenSearch([{'host': 'h729aiqjy4d3al1tih1i.us-east-1.aoss.amazonaws.com', 'port': 443}])>

In [None]:
# https://aws.plainenglish.io/bedrock-unveiled-indexing-own-data-to-opensearch-serverless-via-a-lambda-3dcdb8a1a4fb

In [112]:
# def get_vector_embedding(text, bedrock_client):
  
#     response = bedrock_client.invoke_model(
#         body=json.dumps({ "inputText": text }), 
#         modelId="amazon.titan-embed-text-v1", 
#         accept="application/json", 
#         contentType="application/json"
#     )
    
#     response_body = json.loads(response.get("body").read())
  
#     return response_body.get("embedding")

In [125]:
# def create_index_if_not_present(index) :
#     print(f'In create index')
#     if not ops_client.indices.exists(index):
 
#     # Define the settings and mappings for creating an OpenSearch index. 
#     # It includes settings related to KNN (k-nearest neighbors) 
#     # and defines two fields: "text" with the type "text" and "vector_field" 
#     # with the type "knn_vector" and a specified dimension.
#         settings = {
#             "settings": {
#                 "index": {
#                     "knn": True,
#                 }
#             },
#             "mappings": {
#                 "properties": {
#                     "text": {"type": "text"},
#                     "vector_field": {
#                         "type": "knn_vector",
#                         "dimension": 1600,
#                     },
#                 }
#             },
#         }
#         res = ops_client.indices.create(index, body=settings, ignore=[400])
#         print(res)



In [None]:
# def indexEmbedding(embedding, content) :
#   doc = {
#     'vector_field' : embedding,
#     'text': content
#   }
      
#   # Index the document in OpenSearch Serverless vector database
#   return ops_client.index(index, body=doc)

### Create an index

In [141]:
index_name = "oscars-index"
index_body = {
    "mappings": {
        "properties": {
            "nominee_text": {"type": "text"},
            "nominee_vector": {
                "type": "knn_vector",
                "dimension": 4096,
                "method": {
                    "engine": "nmslib",
                    "space_type": "cosinesimil",
                    "name": "hnsw",
                    "parameters": {"ef_construction": 512, "m": 16},
                },
            },
        }
    },
    "settings": {
        "index": {
            "number_of_shards": 2,
            "knn.algo_param": {"ef_search": 512},
            "knn": True,
        }
    },
}



In [142]:
client_os.indices.create(index_name, body=index_body)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'oscars-index'}

In [None]:
try:
    response = client_os.indices.create(index_name, body=index_body)
    print(json.dumps(response, indent=2))
except Exception as ex:
    print(ex)

### Verify the index

In [145]:
try:
    response = client_os.indices.get("oscars-index")
    print(json.dumps(response, indent=2))
except Exception as ex:
    print(ex)

{
  "oscars-index": {
    "aliases": {},
    "mappings": {
      "properties": {
        "nominee_text": {
          "type": "text"
        },
        "nominee_vector": {
          "type": "knn_vector",
          "dimension": 4096,
          "method": {
            "engine": "nmslib",
            "space_type": "cosinesimil",
            "name": "hnsw",
            "parameters": {
              "ef_construction": 512,
              "m": 16
            }
          }
        }
      }
    },
    "settings": {
      "index": {
        "number_of_shards": "2",
        "knn.algo_param": {
          "ef_search": "512"
        },
        "provided_name": "oscars-index",
        "knn": "true",
        "creation_date": "1723056332798",
        "number_of_replicas": "0",
        "uuid": "oIspLpEB8_nU2huZH3ah",
        "version": {
          "created": "135217827"
        }
      }
    }
  }
}


In [None]:
# Create a new index

In [160]:
index_name2 = "oscars-index2"
index_body2 = {
    "mappings": {
        "properties": {
            "nominee_text": {"type": "text"},
            "nominee_vector": {
                "type": "knn_vector",
                "dimension": 1024,
                "method": {
                    "engine": "nmslib",
                    "space_type": "cosinesimil",
                    "name": "hnsw",
                    "parameters": {"ef_construction": 512, "m": 16},
                },
            },
        }
    },
    "settings": {
        "index": {
            "number_of_shards": 2,
            "knn.algo_param": {"ef_search": 512},
            "knn": True,
        }
    },
}



In [161]:
try:
    response = client_os.indices.create(index_name2, body=index_body2)
    print(json.dumps(response, indent=2))
except Exception as ex:
    print(ex)

{
  "acknowledged": true,
  "shards_acknowledged": true,
  "index": "oscars-index2"
}


In [162]:
try:
    response = client_os.indices.get("oscars-index2")
    print(json.dumps(response, indent=2))
except Exception as ex:
    print(ex)

{
  "oscars-index2": {
    "aliases": {},
    "mappings": {
      "properties": {
        "nominee_text": {
          "type": "text"
        },
        "nominee_vector": {
          "type": "knn_vector",
          "dimension": 1024,
          "method": {
            "engine": "nmslib",
            "space_type": "cosinesimil",
            "name": "hnsw",
            "parameters": {
              "ef_construction": 512,
              "m": 16
            }
          }
        }
      }
    },
    "settings": {
      "index": {
        "number_of_shards": "2",
        "knn.algo_param": {
          "ef_search": "512"
        },
        "provided_name": "oscars-index2",
        "knn": "true",
        "creation_date": "1723058283628",
        "number_of_replicas": "0",
        "uuid": "BaJGLpEBdYM265si4_T5",
        "version": {
          "created": "135217827"
        }
      }
    }
  }
}


### Process the dataset

In [22]:
pwd

'/home/sagemaker-user'

In [24]:
import pandas as pd

In [51]:
#del df

In [163]:
df=pd.read_csv('./data/the_oscar_award.csv')

In [164]:
df.head()

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner
0,1927,1928,1,ACTOR,Richard Barthelmess,The Noose,False
1,1927,1928,1,ACTOR,Emil Jannings,The Last Command,True
2,1927,1928,1,ACTRESS,Louise Dresser,A Ship Comes In,False
3,1927,1928,1,ACTRESS,Janet Gaynor,7th Heaven,True
4,1927,1928,1,ACTRESS,Gloria Swanson,Sadie Thompson,False


In [165]:
df=df.loc[df['year_ceremony'] == 2023]
df=df.dropna(subset=['film'])
df['category'] = df['category'].str.lower()

In [166]:
df.head()

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner
10639,2022,2023,95,actor in a leading role,Austin Butler,Elvis,False
10640,2022,2023,95,actor in a leading role,Colin Farrell,The Banshees of Inisherin,False
10641,2022,2023,95,actor in a leading role,Brendan Fraser,The Whale,True
10642,2022,2023,95,actor in a leading role,Paul Mescal,Aftersun,False
10643,2022,2023,95,actor in a leading role,Bill Nighy,Living,False


### Concatenate columns to create a new text colummn

In [167]:
# Create the column for all rows first
df['text'] = df['name'] + ' got nominated under the category, ' + df['category'] + ', for the film ' + df['film'] + ' to win the award'


# Find the rows where 'winner' is False and replace the 'text' for those rows
df.loc[df['winner'] == False, 'text'] = df['name'] + ' got nominated under the category, ' + df['category'] + ', for the film ' + df['film'] + ' but did not win'

In [168]:
df.head()

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner,text
10639,2022,2023,95,actor in a leading role,Austin Butler,Elvis,False,Austin Butler got nominated under the category...
10640,2022,2023,95,actor in a leading role,Colin Farrell,The Banshees of Inisherin,False,Colin Farrell got nominated under the category...
10641,2022,2023,95,actor in a leading role,Brendan Fraser,The Whale,True,Brendan Fraser got nominated under the categor...
10642,2022,2023,95,actor in a leading role,Paul Mescal,Aftersun,False,"Paul Mescal got nominated under the category, ..."
10643,2022,2023,95,actor in a leading role,Bill Nighy,Living,False,"Bill Nighy got nominated under the category, a..."


### Generate embeddings for the text column from Titan

In [None]:
bedrock=boto3.client(service_name="bedrock-runtime")

In [None]:
response=bedrock.invoke_model(
    body=body,
    modelId=model_id,
    accept= "application/json",
    contentType="application/json"
    
)

response_body=json.loads(response.get("body").read())
response_text=response_body['generation']
print(response_text)

In [169]:
#Create the connection to Bedrock
bedrock = boto3.client(
    service_name='bedrock',
    region_name='us-east-1', 
    
)
 
bedrock_runtime = boto3.client(
    service_name='bedrock-runtime',
    region_name='us-east-1', 
    
)

In [170]:
# Let's see all available Amazon Models
available_models = bedrock.list_foundation_models()
#print(available_models)

In [66]:

 
for model in available_models['modelSummaries']:
  if 'amazon' in model['modelId']:
    print(model)

{'modelArn': 'arn:aws:bedrock:us-east-1::foundation-model/amazon.titan-tg1-large', 'modelId': 'amazon.titan-tg1-large', 'modelName': 'Titan Text Large', 'providerName': 'Amazon', 'inputModalities': ['TEXT'], 'outputModalities': ['TEXT'], 'responseStreamingSupported': True, 'customizationsSupported': [], 'inferenceTypesSupported': ['ON_DEMAND'], 'modelLifecycle': {'status': 'ACTIVE'}}
{'modelArn': 'arn:aws:bedrock:us-east-1::foundation-model/amazon.titan-image-generator-v1:0', 'modelId': 'amazon.titan-image-generator-v1:0', 'modelName': 'Titan Image Generator G1', 'providerName': 'Amazon', 'inputModalities': ['TEXT', 'IMAGE'], 'outputModalities': ['IMAGE'], 'customizationsSupported': ['FINE_TUNING'], 'inferenceTypesSupported': ['PROVISIONED'], 'modelLifecycle': {'status': 'ACTIVE'}}
{'modelArn': 'arn:aws:bedrock:us-east-1::foundation-model/amazon.titan-image-generator-v1', 'modelId': 'amazon.titan-image-generator-v1', 'modelName': 'Titan Image Generator G1', 'providerName': 'Amazon', 'i

In [171]:
# Define prompt and model parameters
prompt_data = """Write me a poem about apples"""
 
body = json.dumps({
    "inputText": prompt_data,
})
 
#model_id = 'amazon.titan-embed-text-v1' #look for embeddings in the modelID
model_id = 'amazon.titan-embed-text-v2:0'
accept = 'application/json' 
content_type = 'application/json'

In [172]:
# Invoke model 
response = bedrock_runtime.invoke_model(
    body=body, 
    modelId=model_id, 
    accept=accept, 
    contentType=content_type
)

In [173]:
# Print response
response_body = json.loads(response['body'].read())
embedding = response_body.get('embedding')
 
#Print the Embedding
 
#print(embedding)

In [174]:
print(len(embedding))
# 1024 dimensions for amazon.titan-embed-text-v2:0

1024


In [None]:
# amazon.titan-embed-text-v2:0
# Model ID – amazon.titan-embed-text-v2:0

In [175]:
df2 = df.copy()

In [176]:
df.shape

(121, 8)

In [177]:
df2.shape

(121, 8)

In [191]:
#model_id = 'amazon.titan-embed-text-v1'
model_id2 = 'amazon.titan-embed-text-v2:0' # 1024 dimension

In [None]:
#response_body = json.loads(response['body'].read())

In [192]:
def text_embedding(text):
    body=json.dumps({"inputText": text})
    response = bedrock_runtime.invoke_model(body=body, modelId=model_id2, accept='application/json', contentType='application/json')
    response_body = json.loads(response['body'].read())
    embedding = response_body.get('embedding')
    return embedding

In [71]:
#del df_new

In [180]:
df_new=df.assign(embedding=(df["text"].apply(lambda x : text_embedding(x))))

In [181]:
df_new.head()

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner,text,embedding
10639,2022,2023,95,actor in a leading role,Austin Butler,Elvis,False,Austin Butler got nominated under the category...,"[-0.05737971, 0.005850166, -0.012581864, 0.028..."
10640,2022,2023,95,actor in a leading role,Colin Farrell,The Banshees of Inisherin,False,Colin Farrell got nominated under the category...,"[0.019480161, 0.042364623, -0.036879916, -0.00..."
10641,2022,2023,95,actor in a leading role,Brendan Fraser,The Whale,True,Brendan Fraser got nominated under the categor...,"[-0.03609801, 0.022561258, 0.013797076, -0.049..."
10642,2022,2023,95,actor in a leading role,Paul Mescal,Aftersun,False,"Paul Mescal got nominated under the category, ...","[-0.072570145, 0.045575455, -0.014373797, -0.0..."
10643,2022,2023,95,actor in a leading role,Bill Nighy,Living,False,"Bill Nighy got nominated under the category, a...","[-0.083715715, 0.03935888, -0.0033579997, 0.00..."


In [182]:
df_new.shape

(121, 9)

In [194]:
len(df_new.iloc[0][8]) # dimension length - 1024

  len(df_new.iloc[0][8])


1024

### Ingest the text and embeddings into AWS OpenSearch Serverless

In [183]:
df_new2 = df_new.copy()

In [184]:
def add_document(vector,text):
    document = {
      "nominee_vector": vector,
      "nominee_text": text
    }
    
    response = client_os.index(
        index = 'oscars-index2',
        body = document
    )
    print('\nAdding document:')
    print(response) 

In [185]:
client_os

<OpenSearch([{'host': 'h729aiqjy4d3al1tih1i.us-east-1.aoss.amazonaws.com', 'port': 443}])>

In [186]:
df_new.apply(lambda row: add_document(row['embedding'], row['text']), axis=1)


Adding document:
{'_index': 'oscars-index2', '_id': '1%3A0%3A7ZVNLpEBBduygMq_xFCL', '_version': 1, 'result': 'created', '_shards': {'total': 0, 'successful': 0, 'failed': 0}, '_seq_no': 0, '_primary_term': 0}

Adding document:
{'_index': 'oscars-index2', '_id': '1%3A0%3AybpNLpEBQYqvp9uXxg7v', '_version': 1, 'result': 'created', '_shards': {'total': 0, 'successful': 0, 'failed': 0}, '_seq_no': 0, '_primary_term': 0}

Adding document:
{'_index': 'oscars-index2', '_id': '1%3A0%3A7pVNLpEBBduygMq_yFC7', '_version': 1, 'result': 'created', '_shards': {'total': 0, 'successful': 0, 'failed': 0}, '_seq_no': 0, '_primary_term': 0}

Adding document:
{'_index': 'oscars-index2', '_id': '1%3A0%3AyrpNLpEBQYqvp9uXyg6m', '_version': 1, 'result': 'created', '_shards': {'total': 0, 'successful': 0, 'failed': 0}, '_seq_no': 0, '_primary_term': 0}

Adding document:
{'_index': 'oscars-index2', '_id': '1%3A0%3A75VNLpEBBduygMq_zVCv', '_version': 1, 'result': 'created', '_shards': {'total': 0, 'successful': 0

10639    None
10640    None
10641    None
10642    None
10643    None
         ... 
10755    None
10756    None
10757    None
10758    None
10764    None
Length: 121, dtype: object

### Build Context to Perform semantic search

In [187]:
def search_index(vector):
    document = {
        "size": 15,
        "_source": {"excludes": ["nominee_vector"]},
        "query": {
            "knn": {
                 "nominee_vector": {
                     "vector": vector,
                     "k":15
                 }
            }
        }
    }
    response = client_os.search(
    body = document,
    index = "oscars-index2"
    )
    return response

In [195]:
text_embedding

<function __main__.text_embedding(text)>

In [196]:
query='who won the award for best music?'
vector=text_embedding(query)

In [197]:
len(vector)

1024

In [198]:
response=search_index(vector)
data=response['hits']['hits']

In [199]:
data

[{'_index': 'oscars-index2',
  '_id': '1%3A0%3AEJVOLpEBBduygMq_FVGF',
  '_score': 0.6093757,
  '_source': {'nominee_text': 'Music and Lyric by Diane Warren got nominated under the category, music (original song), for the film Tell It like a Woman but did not win'}},
 {'_index': 'oscars-index2',
  '_id': '1%3A0%3A7LpOLpEBQYqvp9uXFg5S',
  '_score': 0.5948621,
  '_source': {'nominee_text': 'Music and Lyric by Lady Gaga and BloodPop got nominated under the category, music (original song), for the film Top Gun: Maverick but did not win'}},
 {'_index': 'oscars-index2',
  '_id': '1%3A0%3A6bpOLpEBQYqvp9uXEA5k',
  '_score': 0.5942098,
  '_source': {'nominee_text': 'Volker Bertelmann got nominated under the category, music (original score), for the film All Quiet on the Western Front to win the award'}},
 {'_index': 'oscars-index2',
  '_id': '1%3A0%3AIZVOLpEBBduygMq_OFGJ',
  '_score': 0.586697,
  '_source': {'nominee_text': 'Mark Weingarten, James H. Mather, Al Nelson, Chris Burdon and Mark Tayl

#### Create the prompt

In [200]:
prompt='Who won the Best Actor award in a supporting role?'

In [209]:
vector=text_embedding(prompt)

In [210]:
response=search_index(vector)
data=response['hits']['hits']

In [211]:
data

[{'_index': 'oscars-index2',
  '_id': '1%3A0%3AzbpNLpEBQYqvp9uX0w5N',
  '_score': 0.60076416,
  '_source': {'nominee_text': 'Ke Huy Quan got nominated under the category, actor in a supporting role, for the film Everything Everywhere All at Once to win the award'}},
 {'_index': 'oscars-index2',
  '_id': '1%3A0%3A8ZVNLpEBBduygMq_0lB6',
  '_score': 0.5921046,
  '_source': {'nominee_text': 'Barry Keoghan got nominated under the category, actor in a supporting role, for the film The Banshees of Inisherin but did not win'}},
 {'_index': 'oscars-index2',
  '_id': '1%3A0%3A8JVNLpEBBduygMq_z1Cm',
  '_score': 0.5896583,
  '_source': {'nominee_text': 'Brian Tyree Henry got nominated under the category, actor in a supporting role, for the film Causeway but did not win'}},
 {'_index': 'oscars-index2',
  '_id': '1%3A0%3AyrpNLpEBQYqvp9uXyg6m',
  '_score': 0.5848743,
  '_source': {'nominee_text': 'Paul Mescal got nominated under the category, actor in a leading role, for the film Aftersun but did not

In [212]:
context = ''
for item in data:
    context += item['_source']['nominee_text'] + '\n'
print(context)

Ke Huy Quan got nominated under the category, actor in a supporting role, for the film Everything Everywhere All at Once to win the award
Barry Keoghan got nominated under the category, actor in a supporting role, for the film The Banshees of Inisherin but did not win
Brian Tyree Henry got nominated under the category, actor in a supporting role, for the film Causeway but did not win
Paul Mescal got nominated under the category, actor in a leading role, for the film Aftersun but did not win
James Friend got nominated under the category, cinematography, for the film All Quiet on the Western Front to win the award
Brendan Gleeson got nominated under the category, actor in a supporting role, for the film The Banshees of Inisherin but did not win
Jamie Lee Curtis got nominated under the category, actress in a supporting role, for the film Everything Everywhere All at Once to win the award
Brendan Fraser got nominated under the category, actor in a leading role, for the film The Whale to wi

In [213]:
augmented_prompt=f'Context - {context}\nBased on the above context, answer this question - {prompt}'

In [214]:
augmented_prompt

'Context - Ke Huy Quan got nominated under the category, actor in a supporting role, for the film Everything Everywhere All at Once to win the award\nBarry Keoghan got nominated under the category, actor in a supporting role, for the film The Banshees of Inisherin but did not win\nBrian Tyree Henry got nominated under the category, actor in a supporting role, for the film Causeway but did not win\nPaul Mescal got nominated under the category, actor in a leading role, for the film Aftersun but did not win\nJames Friend got nominated under the category, cinematography, for the film All Quiet on the Western Front to win the award\nBrendan Gleeson got nominated under the category, actor in a supporting role, for the film The Banshees of Inisherin but did not win\nJamie Lee Curtis got nominated under the category, actress in a supporting role, for the film Everything Everywhere All at Once to win the award\nBrendan Fraser got nominated under the category, actor in a leading role, for the fi

In [None]:
# context length should be within the accepted context length of the LLM Endpoint - else create chunks of data using Langchain/Llamaindex

In [None]:
# Augmented prompt has the context and the prompt

### Invoke Titan LLM Endpoint

In [219]:
config={
      "maxTokenCount": 1000,
      "stopSequences": [],
      "temperature":0.1,
      "topP":1
}

body = json.dumps({'inputText': augmented_prompt,'textGenerationConfig':config})

In [220]:
response = bedrock_runtime.invoke_model( 
 modelId='amazon.titan-tg1-large', 
 body=body
)

In [221]:
response_body = json.loads(response.get('body').read())
print(response_body.get('results')[0].get('outputText'))


Brendan Fraser won the Best Actor award in a supporting role.


In [None]:
# arn:aws:iam::196856463470:role/SandboxLoginRole

In [88]:
# import boto3
# import requests
# from requests.auth import HTTPBasicAuth
# import json

# # Replace these with your OpenSearch domain details
# opensearch_endpoint = 'https://h729aiqjy4d3al1tih1i.us-east-1.aoss.amazonaws.com'
# index_name = 'oscars-index'  # Name of the index (collection)
# region = 'us-east-1'  # Your AWS region

# # AWS credentials (use IAM roles or other secure methods for production)
# aws_access_key_id = credentials.access_key #'YOUR_ACCESS_KEY_ID'
# aws_secret_access_key = credentials.secret_key #'YOUR_SECRET_ACCESS_KEY'

# # Basic Authentication (use IAM roles for more secure authentication in production)
# auth = HTTPBasicAuth(aws_access_key_id, aws_secret_access_key)

# # HTTP headers
# headers = {
#     "Content-Type": "application/json"
# }

# # Index settings and mappings
# index_settings = {
#     "settings": {
#         "index": {
#             "number_of_shards": 1,
#             "number_of_replicas": 1
#         }
#     },
#     "mappings": {
#         "properties": {
#             "field1": {"type": "text"},
#             "field2": {"type": "keyword"},
#             "field3": {"type": "date"},
#             "field4": {"type": "integer"}
#         }
#     }
# }

# # Function to create the index
# def create_index():
#     url = f"{opensearch_endpoint}/{index_name}"
#     response = requests.put(url, auth=auth, headers=headers, data=json.dumps(index_settings))
    
#     if response.status_code == 200:
#         print(f"Index '{index_name}' created successfully.")
#     elif response.status_code == 400 and 'resource_already_exists_exception' in response.text:
#         print(f"Index '{index_name}' already exists.")
#     else:
#         print(f"Failed to create index. Status code: {response.status_code}, Response: {response.text}")




In [89]:
# # Run the function
# create_index()

Failed to create index. Status code: 403, Response: {"status":403,"request-id":"5212cdcb-a448-9e88-bbf6-3df21d3e3534","error":{"reason":"403 Forbidden","type":"Forbidden"}}

