# Step 3-0. Bedrock Titan 임베딩으로 벡터 데이터 업로드
Amazon Bedrock Titan 텍스트 임베딩 모델을 사용하여 벡터를 생성하고 OpenSearch에 업로드합니다.

## Step 2와의 차이점
- **Step 2**: 로컬 PC에 모델을 다운로드하여 임베딩 생성 (sentence-transformers, 384차원)
- **Step 3**: AWS Bedrock 클라우드 API를 호출하여 임베딩 생성 (Titan V2, 1024차원)

## AWS Bedrock이란?
AWS에서 제공하는 **완전 관리형 생성형 AI 서비스**로, 다양한 AI 모델(Claude, Titan 등)을 API로 사용할 수 있습니다.

## OpenSearch Serverless 최적화
- **faiss 엔진**: OpenSearch Serverless에 더 최적화된 벡터 검색 엔진
- **innerproduct**: cosinesimil 대신 사용하여 성능 향상 (Titan V2는 기본적으로 정규화된 벡터를 반환하므로, 내적 기준 순위는 코사인 유사도 기준 순위와 동일합니다)

In [None]:
!pip install -q boto3==1.38.46 opensearch-py==2.8.0

## 1. 설정 (Configuration)

In [2]:
import os, json

# Load the settings that the Step 0 notebook saved to config.json.
try:
    # Read explicitly as UTF-8 so parsing does not depend on the OS locale
    # default encoding.
    with open("../config.json", encoding="utf-8") as f:
        _config = json.load(f)
except FileNotFoundError as err:
    raise FileNotFoundError("❌ config.json을 찾을 수 없습니다. Step 0 노트북을 먼저 실행해주세요.") from err
else:
    # Only report success once the file has been opened and parsed.
    print("✅ config.json 로드 완료")

# The OpenSearch host has no fallback value, so fail fast when it is missing.
HOST = _config.get("OPENSEARCH_HOST")
if not HOST:
    raise ValueError("❌ config.json에 OPENSEARCH_HOST 값이 없습니다. Step 0 노트북을 먼저 실행해주세요.")
# The remaining settings fall back to defaults when absent from config.json.
DEFAULT_REGION = _config.get("DEFAULT_REGION", "ap-northeast-2")
BEDROCK_REGION = _config.get("BEDROCK_REGION", "us-east-1")
PROFILE = _config.get("PROFILE", "skku-opensearch-session")

# Bedrock settings
embedding_model_id = 'amazon.titan-embed-text-v2:0'
vector_dimension = 1024  # Titan V2 produces 1024-dimensional vectors
index_name = 'bedrock-test'

✅ config.json 로드 완료


## 2. OpenSearch 클라이언트 생성

In [3]:
import boto3
from opensearchpy import OpenSearch, AWSV4SignerAuth, RequestsHttpConnection

# Build SigV4 auth for the 'aoss' service from the configured AWS profile.
service = 'aoss'
credentials = boto3.Session(profile_name=PROFILE).get_credentials()
auth = AWSV4SignerAuth(credentials, DEFAULT_REGION, service)

# Connection options are collected first, then handed to the client in one go.
_client_options = dict(
    hosts=[{'host': HOST, 'port': 443}],
    http_auth=auth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
    timeout=300,
)
client = OpenSearch(**_client_options)

print("OpenSearch 클라이언트 생성 완료")

OpenSearch 클라이언트 생성 완료


## 3. Bedrock 클라이언트 생성 및 임베딩 함수 정의

In [4]:
import json

# Session bound to the configured AWS profile; the Bedrock client is derived from it.
print(f"Creating a boto3 session with profile '{PROFILE}'...")
session = boto3.Session(profile_name=PROFILE)

# The Bedrock runtime client uses its own region setting (BEDROCK_REGION).
print(f"Creating a Bedrock client in region: {BEDROCK_REGION}")
bedrock_client = session.client('bedrock-runtime', region_name=BEDROCK_REGION)
print("Bedrock client created successfully.")

def get_embedding_from_bedrock(text, model_id):
    """Return the vector embedding that Bedrock produces for the given text.

    Args:
        text: Text to embed.
        model_id: Bedrock model identifier, passed through as ``modelId``.

    Returns:
        The value of the ``embedding`` field in the model response, or
        ``None`` when that field is absent. ``None`` is also returned,
        without calling the API, when ``text`` is ``None``, empty, or
        whitespace-only.
    """
    # There is nothing to embed; returning None lets callers skip the item
    # instead of sending an empty request to the service.
    if text is None or not text.strip():
        return None

    body = json.dumps({"inputText": text})
    response = bedrock_client.invoke_model(
        body=body,
        modelId=model_id,
        accept="application/json",
        contentType="application/json",
    )
    # The response body is a stream; read it fully before decoding the JSON.
    response_body = json.loads(response.get("body").read())
    return response_body.get("embedding")

Creating a boto3 session with profile 'skku-opensearch-session'...
Creating a Bedrock client in region: us-east-1
Bedrock client created successfully.


## 4. 벡터 인덱스 생성
OpenSearch Serverless에 최적화된 faiss 엔진과 innerproduct space type을 사용합니다.

In [5]:
# Custom analyzer for the text fields: nori tokenizer plus nori/lowercase filters.
_nori_analyzer = {
    "type": "custom",
    "tokenizer": "nori_tokenizer",
    "filter": ["nori_part_of_speech", "nori_readingform", "lowercase"],
}

# k-NN vector field: HNSW on the faiss engine, scored by inner product.
_content_vector_field = {
    "type": "knn_vector",
    "dimension": vector_dimension,
    "method": {
        "name": "hnsw",
        "space_type": "innerproduct",
        "engine": "faiss",
    },
}

# Field mappings: one vector field, analyzed text fields, and exact-match metadata.
_field_mappings = {
    "content_vector": _content_vector_field,
    "post_id": {"type": "integer"},
    "title": {"type": "text", "analyzer": "korean_nori_analyzer"},
    "content": {"type": "text", "analyzer": "korean_nori_analyzer"},
    "author": {"type": "keyword"},
    "category": {"type": "keyword"},
    "tags": {"type": "keyword"},
    "created_at": {"type": "date"},
}

# Full index definition: k-NN settings + analyzer registration + field mappings.
index_body = {
    "settings": {
        "index": {
            "knn": True,
            "knn.algo_param.ef_search": 100,
            "analysis": {
                "analyzer": {"korean_nori_analyzer": _nori_analyzer},
            },
        },
    },
    "mappings": {"properties": _field_mappings},
}

# Start from a clean slate: drop any existing index with the same name first.
_index_already_exists = client.indices.exists(index=index_name)
if _index_already_exists:
    print(f"Deleting existing index '{index_name}'...")
    client.indices.delete(index=index_name)

# Create the index with the settings and mappings defined in index_body.
print(f"Creating new index '{index_name}'...")
client.indices.create(index=index_name, body=index_body)
print(f"Index '{index_name}' created successfully.")

Creating new index 'bedrock-test'...
Index 'bedrock-test' created successfully.


## 5. 데이터 임베딩 및 업로드

In [6]:
from opensearchpy import helpers

json_file_path = '../../data/json_data.json'

# Read the file as UTF-8 text and parse it into Python objects in one step.
with open(json_file_path, mode='r', encoding='utf-8') as source_file:
    documents = json.loads(source_file.read())
print(f"Successfully loaded {len(documents)} documents.")

def generate_bulk_actions(docs, idx_name):
    """Yield one bulk-index action for each document that could be embedded.

    Each document's title and content are joined with a newline, embedded via
    ``get_embedding_from_bedrock`` using ``embedding_model_id``, and emitted
    as an action targeting ``idx_name``. Documents for which no embedding is
    returned are reported and skipped.

    Args:
        docs: Sized sequence of dict-like documents (read with ``.get``).
        idx_name: Name of the index the actions target.

    Yields:
        Dicts with ``_index`` and ``_source`` keys, where ``_source`` holds
        the vector plus the document's metadata fields.
    """
    total = len(docs)
    for index, doc in enumerate(docs):
        # `or ''` also covers keys that are present but null, which would
        # otherwise be embedded as the literal text "None".
        title = doc.get('title') or ''
        content = doc.get('content') or ''
        text_to_embed = f"{title}\n{content}"
        vector = get_embedding_from_bedrock(text_to_embed, embedding_model_id)

        if not vector:
            print(f"Warning: Could not generate embedding for doc_id {doc.get('post_id')}. Skipping.")
            continue
        print(f"[{index+1}/{total}] embedding | vector = {vector}")

        source_data = {
            "content_vector": vector,
            "post_id": doc.get("post_id"),
            "title": doc.get("title"),
            "content": doc.get("content"),
            "author": doc.get("author"),
            "category": doc.get("category"),
            "tags": doc.get("tags"),
            "created_at": doc.get("created_at"),
        }

        yield {
            "_index": idx_name,
            "_source": source_data,
        }

print("Starting data embedding and uploading via Bedrock...")
# By default helpers.bulk raises on the first chunk that contains a failed
# item, which would make the failure report below unreachable. Collect the
# errors instead so they can be summarized.
success, failed = helpers.bulk(
    client,
    generate_bulk_actions(documents, index_name),
    raise_on_error=False,
)

print(f"Successfully indexed {success} documents.")
if failed:
    print(f"Failed to index {len(failed)} documents.")
    # Show only the first few failures to keep the output readable.
    for i, item in enumerate(failed[:5]):
        print(f"Failed item {i+1}: {item}")

Successfully loaded 50 documents.
Starting data embedding and uploading via Bedrock...
[1/50] embedding | vector = [-0.005434136372059584, 0.07403641939163208, 0.0230681411921978, -0.023335998877882957, 0.03473561629652977, 0.0999637320637703, 0.05330042168498039, -0.007063794415444136, -0.0642470121383667, -0.07031751424074173, 0.020754382014274597, 0.03339824080467224, 0.016232969239354134, -0.04826871305704117, -0.017610685899853706, 0.02339061163365841, 0.016206007450819016, -0.0011479641543701291, -0.01834903471171856, 0.01745382510125637, -0.03955727815628052, 0.08236084133386612, -0.06713840365409851, -0.01563367433845997, -0.02840747870504856, -0.006627670954912901, 0.10175446420907974, -0.05478774756193161, -0.04530826583504677, -0.07774503529071808, -0.027736598625779152, 0.02459724061191082, 0.033015619963407516, -0.02093384973704815, 0.029585309326648712, -0.009286775253713131, 0.0589219331741333, 0.012213843874633312, -0.026962604373693466, 0.021573008969426155, -0.0095313