###  Create an Amazon Kendra S3 data source with access control list

In [None]:
# Import necessary libraries and load environment variables

from dotenv import load_dotenv, find_dotenv, set_key
import dotenv
import os

# Load environment variables stored in a local dotenv file. The file name is
# kept in a module-level variable because later cells write new keys back to it.
local_env_filename = 'dev.env'
load_dotenv(find_dotenv(local_env_filename), override=True)

# Every setting this notebook reads. They are validated together so that a
# missing entry is reported by name; copying os.getenv(name) back into
# os.environ would instead fail with "TypeError: str expected, not NoneType".
_REQUIRED_ENV_VARS = (
    'REGION',
    'KENDRA_INDEX',
    'KENDRA_ROLE',
    'CUSTOM_DATA_SOURCE_ID_1',
    'CUSTOM_DATA_SOURCE_ID_2',
    'AMAZON_Q_APP_ID',
    'Q_CUSTOM_DATA_SOURCE_ID_1',
    'Q_CUSTOM_DATA_SOURCE_ID_2',
    'DEMO_S3_BUCKET',
    'DEMO_S3_KEY',
    'CLOUDFRONT_URL',
)
_missing_env_vars = [name for name in _REQUIRED_ENV_VARS if os.getenv(name) is None]
if _missing_env_vars:
    raise EnvironmentError(
        f"Missing required environment variables (expected in {local_env_filename}): "
        f"{', '.join(_missing_env_vars)}"
    )

REGION = os.environ['REGION']
KENDRA_INDEX = os.environ['KENDRA_INDEX']
KENDRA_ROLE = os.environ['KENDRA_ROLE']
CUSTOM_DATA_SOURCE_ID_1 = os.environ['CUSTOM_DATA_SOURCE_ID_1']
CUSTOM_DATA_SOURCE_ID_2 = os.environ['CUSTOM_DATA_SOURCE_ID_2']
AMAZON_Q_APP_ID = os.environ['AMAZON_Q_APP_ID']
Q_CUSTOM_DATA_SOURCE_ID_1 = os.environ['Q_CUSTOM_DATA_SOURCE_ID_1']
Q_CUSTOM_DATA_SOURCE_ID_2 = os.environ['Q_CUSTOM_DATA_SOURCE_ID_2']
DEMO_S3_BUCKET = os.environ['DEMO_S3_BUCKET']
DEMO_S3_KEY = os.environ['DEMO_S3_KEY']
CLOUDFRONT_URL = os.environ['CLOUDFRONT_URL']

In [None]:
# Create Kendra Amazon S3 data source with ACL
# see also https://aws.amazon.com/blogs/machine-learning/secure-your-amazon-kendra-indexes-with-the-acl-using-a-jwt-shared-secret-key/
# and https://aws.amazon.com/blogs/machine-learning/building-a-secure-search-application-with-access-controls-using-amazon-kendra/
# and https://docs.aws.amazon.com/kendra/latest/dg/create-index-access-control.html

import boto3
import json
import os

def create_acl_file(bucket_name, bucket_key, group_name='SA'):
    """Upload a Kendra access control list (ACL) file for an S3 prefix.

    Writes the object ``<bucket_key>acl.json`` to ``bucket_name``. The file
    contains a single entry that gives ``group_name`` ALLOW access to the key
    prefix ``s3://<bucket_name>/<bucket_key>``.

    Args:
        bucket_name: Name of the S3 bucket that receives the ACL file.
        bucket_key: Key prefix the ACL entry applies to. It is concatenated
            directly with ``acl.json``, so a prefix without a trailing ``/``
            yields a key such as ``docsacl.json``.
        group_name: Name of the group that is granted access. Defaults to
            ``'SA'``, the value that used to be hard-coded.

    Raises:
        Any exception raised by the boto3 S3 client is propagated unchanged.
    """
    s3_client = boto3.client('s3')

    acl_entries = [{
        "keyPrefix": f"s3://{bucket_name}/{bucket_key}",
        "aclEntries": [
            {
                "Name": group_name,
                "Type": "GROUP",
                "Access": "ALLOW"
            }]
    }]

    acl_object_key = f"{bucket_key}acl.json"
    s3_client.put_object(
        Bucket=bucket_name,
        Key=acl_object_key,
        Body=json.dumps(acl_entries),
        ContentType='application/json'
    )

def create_metadata_files(bucket_name, bucket_key, meta_folder='meta'):
    """Upload a Kendra metadata file for each document under an S3 prefix.

    Lists every object in ``bucket_name`` whose key starts with ``bucket_key``
    and, for each key ending in a supported document extension, writes
    ``<meta_folder>/<document key>.metadata.json`` to the same bucket. Every
    metadata file carries the same ``DocumentType`` attribute and the same
    access control list.

    Args:
        bucket_name: Name of the S3 bucket holding the documents.
        bucket_key: Key prefix used to select the documents.
        meta_folder: Key prefix under which the metadata files are stored,
            mirroring the document keys below it.

    Raises:
        Any exception raised by the boto3 S3 client is propagated unchanged.
    """
    s3_client = boto3.client('s3')

    document_suffixes = ('.txt', '.pdf', '.doc', '.docx')  # Add or remove file types as needed

    # The metadata does not depend on the document, so serialize it once
    # instead of rebuilding it for every object.
    metadata_body = json.dumps({
        "Attributes": {
            "DocumentType": "llm-papers"  # You can customize this based on your needs
        },
        "AccessControlList": [
            { "Access": "DENY", "Name": "huthmac@amazon.com", "Type": "USER" }
            # Add more access control entries as needed
        ]
    })

    # List objects in the specified bucket and prefix
    paginator = s3_client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket_name, Prefix=bucket_key):
        for obj in page.get('Contents', []):
            document_key = obj['Key']
            if not document_key.endswith(document_suffixes):
                continue

            # Kendra matches metadata to documents by name and expects
            # "<document>.<extension>.metadata.json"; a plain ".json" suffix
            # is not recognized as a metadata file.
            metadata_file_name = f"{meta_folder}/{document_key}.metadata.json"

            # Upload metadata file to S3
            s3_client.put_object(
                Bucket=bucket_name,
                Key=metadata_file_name,
                Body=metadata_body,
                ContentType='application/json'
            )

            print(f"Created metadata file: {metadata_file_name}")

def create_s3_data_source(index_id, data_source_name, bucket_name, bucket_key, iam_role_arn):
    """Create an S3 data source with an ACL file on a Kendra index.

    Uploads the ACL file for ``bucket_key`` and then registers a data source
    of type ``S3`` that includes only that prefix and points at the ACL file.

    Args:
        index_id: ID of the Kendra index the data source is attached to.
        data_source_name: Name given to the new data source.
        bucket_name: S3 bucket that holds the documents.
        bucket_key: Key prefix of the documents to include.
        iam_role_arn: ARN of the IAM role passed to Kendra as ``RoleArn``.

    Returns:
        The ID of the new data source, or ``None`` when the create call
        fails (the error is printed rather than raised).
    """
    kendra = boto3.client('kendra', region_name=REGION)

    # Upload the ACL file that the configuration below refers to.
    create_acl_file(bucket_name, bucket_key)

    # Per-document metadata is not used at the moment. To enable it, run
    # create_metadata_files(bucket_name, bucket_key) first and add
    # "DocumentsMetadataConfiguration": {"S3Prefix": "meta/"} to the
    # S3 configuration.
    acl_key_path = f"s3://{bucket_name}/{bucket_key}acl.json"
    s3_settings = {
        "AccessControlListConfiguration": {"KeyPath": acl_key_path},
        "BucketName": bucket_name,
        "InclusionPrefixes": [bucket_key],
    }

    request_arguments = {
        "IndexId": index_id,
        "Name": data_source_name,
        "Type": 'S3',
        "Configuration": {"S3Configuration": s3_settings},
        "RoleArn": iam_role_arn,
        "LanguageCode": 'en',
        "Description": 'Amazon S3 data source for Kendra index',
    }

    try:
        created = kendra.create_data_source(**request_arguments)
        new_data_source_id = created['Id']
        print(f"Data source created successfully. Data source ID: {new_data_source_id}")
        return new_data_source_id
    except Exception as error:
        print(f"Error creating S3 data source: {str(error)}")
        return None

# Usage: create the S3 data source and persist its ID for later cells.
# NOTE(review): DEMO_S3_BUCKET and DEMO_S3_KEY are loaded from the env file but
# the bucket and prefix below are literals - confirm which one is intended.
index_id = KENDRA_INDEX  # Use your existing Kendra index ID
data_source_name = 's3-llmpaperstest'
bucket_name = 'felixh-kendra-demo'  # Use your S3 bucket name
bucket_key = 'llmpapers/'
iam_role_arn = KENDRA_ROLE  # IAM role ARN with necessary permissions

s3_data_source_id = create_s3_data_source(index_id, data_source_name, bucket_name, bucket_key, iam_role_arn)

if s3_data_source_id:
    print(f"S3 data source created with ID: {s3_data_source_id}")
    # Save the data source ID to your environment variables
    os.environ['S3_DATA_SOURCE_ID'] = s3_data_source_id
    # Write to the same file the settings were loaded from. find_dotenv() also
    # searches parent directories, so the bare file name is not necessarily
    # that file; fall back to it only when no existing file is found.
    env_file_path = find_dotenv(local_env_filename) or local_env_filename
    dotenv.set_key(env_file_path, "S3_DATA_SOURCE_ID", os.environ["S3_DATA_SOURCE_ID"])
else:
    print("Failed to create S3 data source")

In [None]:
# run Kendra Amazon S3 data source sync job
import boto3
import time

def start_s3_sync_job(index_id, data_source_id):
    """Start a sync job for a Kendra data source.

    Args:
        index_id: ID of the Kendra index that owns the data source.
        data_source_id: ID of the data source to synchronize.

    Returns:
        The execution ID of the started job, or ``None`` when the job could
        not be started (the error is printed rather than raised).
    """
    # Use the same region the data source is created in; relying on the
    # session's default region can address a different region than REGION.
    kendra = boto3.client('kendra', region_name=REGION)

    try:
        response = kendra.start_data_source_sync_job(
            Id=data_source_id,
            IndexId=index_id
        )
        execution_id = response['ExecutionId']
        print(f"Sync job started. Execution ID: {execution_id}")
        return execution_id
    except Exception as e:
        print(f"Error starting sync job: {str(e)}")
        return None

def monitor_sync_job(index_id, data_source_id, execution_id, poll_interval=60):
    """Poll a Kendra sync job until it reaches a final status.

    Repeatedly lists the sync jobs of the data source, looks for the entry
    whose ``ExecutionId`` matches, prints its status and waits
    ``poll_interval`` seconds between checks. If the execution ID is not in
    the returned history, polling simply continues.

    Args:
        index_id: ID of the Kendra index that owns the data source.
        data_source_id: ID of the data source being synchronized.
        execution_id: Execution ID of the sync job to watch.
        poll_interval: Seconds to wait between status checks. Defaults to 60.

    Returns:
        The final status string (``'SUCCEEDED'``, ``'FAILED'``,
        ``'INCOMPLETE'`` or ``'ABORTED'``), or ``None`` when listing the
        jobs raises an error (the error is printed rather than raised).
    """
    # Use the same region the data source is created in; relying on the
    # session's default region can address a different region than REGION.
    kendra = boto3.client('kendra', region_name=REGION)

    # INCOMPLETE and ABORTED are final states as well. Waiting only for
    # FAILED or SUCCEEDED would keep polling forever once a job ends in
    # one of the other two.
    final_statuses = ('SUCCEEDED', 'FAILED', 'INCOMPLETE', 'ABORTED')

    while True:
        try:
            response = kendra.list_data_source_sync_jobs(
                Id=data_source_id,
                IndexId=index_id
            )

            for job in response['History']:
                if job['ExecutionId'] == execution_id:
                    status = job['Status']
                    print(f"Sync job status: {status}")

                    if status in final_statuses:
                        if status == 'FAILED':
                            print(f"Sync job failed. Error message: {job.get('ErrorMessage', 'No error message provided')}")
                        elif status != 'SUCCEEDED':
                            print(f"Sync job ended without succeeding. Error message: {job.get('ErrorMessage', 'No error message provided')}")
                        return status

                    break

            time.sleep(poll_interval)  # Wait before checking again
        except Exception as e:
            print(f"Error monitoring sync job: {str(e)}")
            return None

# Usage: start a sync of the S3 data source and wait for its final status
index_id = KENDRA_INDEX  # Use your existing Kendra index ID
data_source_id = os.environ['S3_DATA_SOURCE_ID']  # Use the S3 data source ID we just created

execution_id = start_s3_sync_job(index_id, data_source_id)

if not execution_id:
    # start_s3_sync_job already printed the underlying error
    print("Failed to start S3 data source sync job")
else:
    final_status = monitor_sync_job(index_id, data_source_id, execution_id)
    if final_status != 'SUCCEEDED':
        print("S3 data source sync failed or was interrupted")
    else:
        print("S3 data source sync completed successfully")