In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# test bedrock embedding models with boto3

In [4]:
import json
import boto3

In [5]:
def get_bedrock_client(region_name: str, service_name: str = "bedrock-runtime") -> boto3.client:
    """get bedrock client"""
    return boto3.client(
        service_name=service_name,
        region_name=region_name,
    )

In [6]:
boto_client = get_bedrock_client(
    region_name='us-east-1'
)

In [14]:
texts = [
    "The Trojans, after a seven years’ voyage, set sail for Italy, but are overtaken by a dreadful storm, which Aeolus raises at the request of Juno.",
    "트로이아군은 7년간의 항해 끝에 이탈리아로 항해를 떠났지만 주노의 요청으로 무서운 폭풍에 추월당합니다.",
    "enus complains to Jupiter of her son’s misfortunes. Jupiter comforts her, and sends Mercury to procure him a kind reception among the Carthaginians.",
    "에누스는 주피터에게 아들의 불행에 대해 불평합니다. 주피터는 그녀를 위로하고 머큐리를 보내 카르타고 인들 사이에서 아들을 친절하게 맞이합니다.",
]

In [15]:
# amazon bedrock cohere

# cohere.embed-english-v3, cohere.embed-multilingual-v3
model_id = "cohere.embed-multilingual-v3"
# search_document, search_query, classification, clustering
input_type = "clustering" 
# float, int8, uint8, binary, ubinary
embedding_type = "float"

accept = '*/*'
content_type = 'application/json'

body = json.dumps({
    "texts": texts,
    "input_type": input_type,
    "embedding_types": [embedding_type]}
)

response = boto_client.invoke_model(
    body=body,
    modelId=model_id,
    accept=accept,
    contentType=content_type
)

response_body = json.loads(response.get('body').read())
embeddings_dict: dict = response_body.get('embeddings')
embeddings = embeddings_dict.get(embedding_type)
print(type(embeddings))
print(cosine_similarity(embeddings, embeddings))

<class 'list'>
[[1.         0.74069583 0.62030095 0.52303496]
 [0.74069583 1.         0.50378705 0.59600523]
 [0.62030095 0.50378705 1.         0.73347226]
 [0.52303496 0.59600523 0.73347226 1.        ]]


In [10]:
# amazon bedrock titan v2

model_id = "amazon.titan-embed-text-v2:0"
# 1024, 512, 256 (matryoshka embeddings?)
dimensions = 1024
# normalize, duh
normalize = True
# float or binary
embedding_type = "float"

accept = '*/*'
content_type = 'application/json'

def embed_titan_one(text):
    body = json.dumps({
        "inputText": text,
        "dimensions": dimensions,
        "normalize": normalize,
        "embeddingTypes": [embedding_type]}
    )

response = boto_client.invoke_model(
    body=body,
    modelId=model_id,
    accept=accept,
    contentType=content_type
)

response_body = json.loads(response.get('body').read())
embeddings_dict: dict = response_body.get('embeddings')
embeddings = embeddings_dict.get(embedding_type)
print(type(embeddings))
print(cosine_similarity(embeddings, embeddings))

list