# Bedrock Knowledge Base with S3 Vectors
Set up a cost-effective knowledge base using S3 Vectors for equipment spec sheets.

In [None]:
import boto3
import json
import time
from pathlib import Path

# Deployment settings shared by the rest of the notebook.
REGION = 'us-east-1'
PROJECT_NAME = 'equipment-specs-kb'

# Resolve the account ID from whatever credentials boto3 picks up.
caller_identity = boto3.client('sts').get_caller_identity()
ACCOUNT_ID = caller_identity['Account']

# Service clients; the IAM client is created without an explicit region.
s3 = boto3.client('s3', region_name=REGION)
s3vectors = boto3.client('s3vectors', region_name=REGION)
iam = boto3.client('iam')
bedrock = boto3.client('bedrock-agent', region_name=REGION)

print(f"Account: {ACCOUNT_ID}, Region: {REGION}")

## 1. Create S3 Bucket and Upload Documents

In [None]:
DOCS_BUCKET = f"{PROJECT_NAME}-docs-{ACCOUNT_ID}"

# S3 CreateBucket needs an explicit LocationConstraint in every region except
# us-east-1, where the constraint must be omitted. Building the arguments this
# way keeps the cell working if REGION is changed.
create_bucket_args = {'Bucket': DOCS_BUCKET}
if REGION != 'us-east-1':
    create_bucket_args['CreateBucketConfiguration'] = {'LocationConstraint': REGION}

try:
    s3.create_bucket(**create_bucket_args)
    print(f"✓ Created bucket: {DOCS_BUCKET}")
except s3.exceptions.BucketAlreadyOwnedByYou:
    print(f"✓ Bucket exists: {DOCS_BUCKET}")

# Upload spec sheets
docs_path = Path('../Application/pre-requisites/documents/spec-sheets')
uploaded = []

# sorted() makes the upload order (and the printed log) deterministic.
for pdf_file in sorted(docs_path.glob('*.pdf')):
    key = f"spec-sheets/{pdf_file.name}"
    s3.upload_file(str(pdf_file), DOCS_BUCKET, key)
    uploaded.append(key)
    print(f"✓ Uploaded {pdf_file.name}")

print(f"\nTotal files uploaded: {len(uploaded)}")

# A wrong working directory yields an empty glob rather than an error, which
# would otherwise only surface later as an empty knowledge base.
if not uploaded:
    print(f"⚠ No PDF files found in {docs_path.resolve()} - check the path before continuing")

## 2. Create S3 Vector Bucket and Index

In [None]:
VECTOR_BUCKET = f"{PROJECT_NAME}-vectors-{ACCOUNT_ID}"
VECTOR_INDEX = 'equipment-specs-index'
EMBEDDING_DIMENSION = 1024  # Titan Text Embeddings V2

# Create vector bucket.
# S3 Vectors reports "already exists" as ConflictException, so that is the
# exception to catch to keep this cell re-runnable.
try:
    s3vectors.create_vector_bucket(vectorBucketName=VECTOR_BUCKET)
    print(f"✓ Created vector bucket: {VECTOR_BUCKET}")
except s3vectors.exceptions.ConflictException:
    print(f"✓ Vector bucket exists: {VECTOR_BUCKET}")

# Create vector index
try:
    s3vectors.create_index(
        vectorBucketName=VECTOR_BUCKET,
        indexName=VECTOR_INDEX,
        dimension=EMBEDDING_DIMENSION,
        distanceMetric='cosine',
        dataType='float32',
        metadataConfiguration={
            'nonFilterableMetadataKeys': ['AMAZON_BEDROCK_TEXT']
        }
    )
    print(f"✓ Created vector index: {VECTOR_INDEX}")
except s3vectors.exceptions.ConflictException:
    print(f"✓ Vector index exists: {VECTOR_INDEX}")

# Read the ARN back through GetIndex on both paths so the lookup does not
# depend on the shape of the CreateIndex response. GetIndex nests the index
# description under the 'index' key.
index_description = s3vectors.get_index(
    vectorBucketName=VECTOR_BUCKET,
    indexName=VECTOR_INDEX
)
VECTOR_INDEX_ARN = index_description['index']['indexArn']

print(f"\nIndex ARN: {VECTOR_INDEX_ARN}")

## 3. Create IAM Role for Knowledge Base

In [None]:
KB_ROLE_NAME = f"{PROJECT_NAME}-kb-role"

# Lets the Bedrock service assume the role.
trust_policy = {
    "Version": "2012-10-17",
    "Statement": [{
        "Effect": "Allow",
        "Principal": {"Service": "bedrock.amazonaws.com"},
        "Action": "sts:AssumeRole"
    }]
}

# Permissions for the role: invoke the embedding model, read the source
# documents, and read/write vectors in the index.
kb_policy = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": "bedrock:InvokeModel",
            "Resource": f"arn:aws:bedrock:{REGION}::foundation-model/amazon.titan-embed-text-v2:0"
        },
        {
            "Effect": "Allow",
            "Action": ["s3:ListBucket", "s3:GetObject"],
            "Resource": [
                f"arn:aws:s3:::{DOCS_BUCKET}",
                f"arn:aws:s3:::{DOCS_BUCKET}/*"
            ]
        },
        {
            "Effect": "Allow",
            "Action": [
                "s3vectors:GetIndex",
                "s3vectors:QueryVectors",
                "s3vectors:PutVectors",
                "s3vectors:GetVectors",
                "s3vectors:DeleteVectors"
            ],
            "Resource": VECTOR_INDEX_ARN
        }
    ]
}

try:
    response = iam.create_role(
        RoleName=KB_ROLE_NAME,
        AssumeRolePolicyDocument=json.dumps(trust_policy)
    )
    KB_ROLE_ARN = response['Role']['Arn']
    role_created = True
    print("✓ Created KB role")
except iam.exceptions.EntityAlreadyExistsException:
    KB_ROLE_ARN = iam.get_role(RoleName=KB_ROLE_NAME)['Role']['Arn']
    role_created = False
    print("✓ KB role exists")

# Attach the inline policy on both paths: put_role_policy overwrites a policy
# of the same name, so a role left over from an earlier run picks up the
# current bucket and index ARNs instead of keeping stale ones.
iam.put_role_policy(
    RoleName=KB_ROLE_NAME,
    PolicyName='kb-policy',
    PolicyDocument=json.dumps(kb_policy)
)

if role_created:
    # Give the newly created role time to propagate before it is used.
    time.sleep(10)

print(f"Role ARN: {KB_ROLE_ARN}")

## 4. Create Knowledge Base

In [None]:
KB_NAME = 'equipment-specs-kb'

try:
    response = bedrock.create_knowledge_base(
        name=KB_NAME,
        description='Equipment specification sheets knowledge base',
        roleArn=KB_ROLE_ARN,
        knowledgeBaseConfiguration={
            'type': 'VECTOR',
            'vectorKnowledgeBaseConfiguration': {
                'embeddingModelArn': f'arn:aws:bedrock:{REGION}::foundation-model/amazon.titan-embed-text-v2:0',
                'embeddingModelConfiguration': {
                    'bedrockEmbeddingModelConfiguration': {
                        'dimensions': EMBEDDING_DIMENSION,
                        'embeddingDataType': 'FLOAT32'
                    }
                }
            }
        },
        storageConfiguration={
            'type': 'S3_VECTORS',
            's3VectorsConfiguration': {
                'indexArn': VECTOR_INDEX_ARN
            }
        }
    )
    KB_ID = response['knowledgeBase']['knowledgeBaseId']
    print(f"✓ Created knowledge base: {KB_ID}")
except bedrock.exceptions.ConflictException:
    # A knowledge base with this name already exists. The listing is
    # paginated, so walk every page rather than only the first one.
    kb_pages = bedrock.get_paginator('list_knowledge_bases').paginate()
    KB_ID = next(
        (
            kb['knowledgeBaseId']
            for page in kb_pages
            for kb in page['knowledgeBaseSummaries']
            if kb['name'] == KB_NAME
        ),
        None,
    )
    if KB_ID is None:
        raise RuntimeError(
            f"Knowledge base '{KB_NAME}' was reported as existing but was not found in the listing"
        )
    print(f"✓ Knowledge base exists: {KB_ID}")

# Wait until the knowledge base reports ACTIVE before later cells attach a
# data source to it. Polls for at most about five minutes.
for _ in range(60):
    kb_status = bedrock.get_knowledge_base(knowledgeBaseId=KB_ID)['knowledgeBase']['status']
    if kb_status == 'ACTIVE':
        break
    if kb_status in ('FAILED', 'DELETING', 'DELETE_UNSUCCESSFUL'):
        raise RuntimeError(f"Knowledge base {KB_ID} is in unusable state: {kb_status}")
    time.sleep(5)
else:
    raise TimeoutError(f"Knowledge base {KB_ID} did not become ACTIVE in time")

## 5. Create Data Source

In [None]:
# Named once so that creation and the "already exists" lookup stay in sync.
DATA_SOURCE_NAME = 'spec-sheets-source'

try:
    response = bedrock.create_data_source(
        knowledgeBaseId=KB_ID,
        name=DATA_SOURCE_NAME,
        dataSourceConfiguration={
            'type': 'S3',
            's3Configuration': {
                'bucketArn': f'arn:aws:s3:::{DOCS_BUCKET}',
                'inclusionPrefixes': ['spec-sheets/']
            }
        },
        vectorIngestionConfiguration={
            'chunkingConfiguration': {
                'chunkingStrategy': 'FIXED_SIZE',
                'fixedSizeChunkingConfiguration': {
                    'maxTokens': 300,
                    'overlapPercentage': 20
                }
            }
        }
    )
    DATA_SOURCE_ID = response['dataSource']['dataSourceId']
    print(f"✓ Created data source: {DATA_SOURCE_ID}")
except bedrock.exceptions.ConflictException:
    # Look the existing data source up by name; taking the first entry of the
    # listing could select an unrelated data source of the same knowledge base.
    source_pages = bedrock.get_paginator('list_data_sources').paginate(knowledgeBaseId=KB_ID)
    DATA_SOURCE_ID = next(
        (
            source['dataSourceId']
            for page in source_pages
            for source in page['dataSourceSummaries']
            if source['name'] == DATA_SOURCE_NAME
        ),
        None,
    )
    if DATA_SOURCE_ID is None:
        raise RuntimeError(
            f"Data source '{DATA_SOURCE_NAME}' was reported as existing but was not found in the listing"
        )
    print(f"✓ Data source exists: {DATA_SOURCE_ID}")

## 6. Sync Data Source (Ingest Documents)

In [None]:
response = bedrock.start_ingestion_job(
    knowledgeBaseId=KB_ID,
    dataSourceId=DATA_SOURCE_ID
)

job_id = response['ingestionJob']['ingestionJobId']
print(f"Started ingestion job: {job_id}")

# Poll every 10 seconds until the job reaches a terminal state.
while True:
    status_response = bedrock.get_ingestion_job(
        knowledgeBaseId=KB_ID,
        dataSourceId=DATA_SOURCE_ID,
        ingestionJobId=job_id
    )

    job = status_response['ingestionJob']
    status = job['status']
    print(f"Status: {status}")

    if status == 'COMPLETE':
        stats = job.get('statistics', {})
        print(f"\n✓ Ingestion complete!")
        print(f"  Documents scanned: {stats.get('numberOfDocumentsScanned', 0)}")
        # This statistic counts documents, not chunks, so label it as such.
        print(f"  New documents indexed: {stats.get('numberOfNewDocumentsIndexed', 0)}")
        print(f"  Documents failed: {stats.get('numberOfDocumentsFailed', 0)}")
        break
    elif status in ('FAILED', 'STOPPED'):
        # STOPPED is terminal as well; without this branch the loop would
        # poll forever after a job is stopped.
        print("✗ Ingestion failed" if status == 'FAILED' else "✗ Ingestion stopped")
        for reason in job.get('failureReasons', []):
            print(f"  Reason: {reason}")
        break

    time.sleep(10)

## 7. Test Knowledge Base - Retrieve Only

In [None]:
bedrock_runtime = boto3.client('bedrock-agent-runtime', region_name=REGION)

def retrieve(query, num_results=3):
    """Run a semantic search against the knowledge base and summarise the hits.

    Args:
        query: Text to search for.
        num_results: Number of results requested from the vector search.

    Returns:
        A list of dicts, one per retrieval result, with keys ``score``,
        ``content`` (at most the first 200 characters of the chunk text,
        followed by ``...`` only when the text was actually cut) and
        ``source`` (the S3 URI of the originating document, or ``None`` when
        the result carries no S3 location).
    """
    response = bedrock_runtime.retrieve(
        knowledgeBaseId=KB_ID,
        retrievalQuery={'text': query},
        retrievalConfiguration={
            'vectorSearchConfiguration': {
                'numberOfResults': num_results,
                'overrideSearchType': 'SEMANTIC'
            }
        }
    )

    preview_chars = 200
    results = []
    for result in response['retrievalResults']:
        text = result['content']['text']
        # Mark truncation only when something was cut off.
        if len(text) > preview_chars:
            preview = text[:preview_chars] + '...'
        else:
            preview = text
        # Tolerate results without an S3 location instead of raising KeyError.
        source = result.get('location', {}).get('s3Location', {}).get('uri')
        results.append({
            'score': result['score'],
            'content': preview,
            'source': source
        })
    return results

# Run one sample question through retrieve() and show every hit.
query = "What is the lifting capacity of the mobile crane?"
results = retrieve(query)

print(f"Query: {query}\n")
for i, result in enumerate(results, start=1):
    # One print call per hit; the empty last argument yields the blank
    # separator line between hits.
    print(
        f"Result {i} (Score: {result['score']:.3f})",
        f"Content: {result['content']}",
        f"Source: {result['source']}",
        "",
        sep="\n",
    )

## 8. Test Knowledge Base - Retrieve and Generate

In [None]:
def ask(question, num_results=5):
    """Answer a question with retrieve-and-generate over the knowledge base.

    Args:
        question: Question text sent as the request input.
        num_results: Number of retrieved results requested from the vector
            search. Defaults to 5, the value that was previously hard-coded.

    Returns:
        The generated answer text from the response.
    """
    response = bedrock_runtime.retrieve_and_generate(
        input={'text': question},
        retrieveAndGenerateConfiguration={
            'type': 'KNOWLEDGE_BASE',
            'knowledgeBaseConfiguration': {
                'knowledgeBaseId': KB_ID,
                'modelArn': f'arn:aws:bedrock:{REGION}::foundation-model/amazon.nova-pro-v1:0',
                'retrievalConfiguration': {
                    'vectorSearchConfiguration': {
                        'numberOfResults': num_results
                    }
                }
            }
        }
    )

    return response['output']['text']

# Sample questions sent through ask(), one after another.
questions = [
    "What equipment do we have and what are their key specifications?",
    "Compare the lifting capacities of our equipment",
    "What is the fuel capacity of the excavator?",
]

separator = "-" * 80

for question in questions:
    # Print the question before calling ask() so it shows while the answer
    # is still being generated.
    print(f"Q: {question}")
    answer = ask(question)
    print(f"A: {answer}\n")
    print(f"{separator}\n")

## 9. Query with Metadata Filters (Optional)

In [None]:
# Metadata-filter example: only return chunks whose source URI equals the
# mobile crane spec sheet.
source_uri = f's3://{DOCS_BUCKET}/spec-sheets/mobile-crane-mc750-spec-sheet.pdf'
source_filter = {
    'equals': {
        'key': 'x-amz-bedrock-kb-source-uri',
        'value': source_uri,
    }
}
vector_search = {'numberOfResults': 3, 'filter': source_filter}

response = bedrock_runtime.retrieve(
    knowledgeBaseId=KB_ID,
    retrievalQuery={'text': 'equipment specifications'},
    retrievalConfiguration={'vectorSearchConfiguration': vector_search},
)

print("Filtered results (mobile crane only):")
for result in response['retrievalResults']:
    preview = result['content']['text'][:150]
    print(f"Score: {result['score']:.3f}")
    print(f"Content: {preview}...\n")

## 10. Save Configuration

In [None]:
# Gather the identifiers created by this notebook.
config = dict(
    knowledge_base_id=KB_ID,
    data_source_id=DATA_SOURCE_ID,
    vector_bucket=VECTOR_BUCKET,
    vector_index=VECTOR_INDEX,
    vector_index_arn=VECTOR_INDEX_ARN,
    docs_bucket=DOCS_BUCKET,
    role_arn=KB_ROLE_ARN,
    region=REGION,
)

# Serialise once and reuse the same text for the file and the console echo.
config_json = json.dumps(config, indent=2)
Path('kb_config.json').write_text(config_json)

print("✓ Configuration saved to kb_config.json")
print(config_json)

## Summary

### Created Resources:

1. **S3 Bucket** - Document storage
2. **S3 Vector Bucket** - Vector embeddings storage
3. **S3 Vector Index** - Similarity search index
4. **IAM Role** - Permissions for KB
5. **Knowledge Base** - Bedrock KB with S3 Vectors
6. **Data Source** - S3 documents connection

### Cost Benefits:

- **Up to 90% cheaper** than traditional vector databases (actual savings depend on workload)
- **Pay-as-you-go** pricing
- **No infrastructure** to manage

### Next Steps:

1. Add more documents to S3 bucket
2. Re-sync data source
3. Integrate with applications
4. Add metadata for filtering
5. Monitor costs and usage