In [None]:
import os
import boto3
import json
import gzip

from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
import opensearchpy

# --- Chunking / embedding settings ------------------------------------------
# NOTE(review): chunk_max_words and model_name are not referenced by the cells
# visible in this notebook; presumably they describe how the chunks were
# produced upstream -- confirm before removing.
chunk_max_words = 500
model_name = "distiluse-base-multilingual-cased-v2"

# Directory that is walked for the gzipped JSONL chunk files.
chunks_dir = "../data/split_chunks"

# --- OpenSearch target (all read from the environment; None when unset) ------
service = 'aoss'  # signing service name handed to AWSV4SignerAuth
index_name = os.getenv('AWS_OPENSEARCH_INDEX_NAME')
host = os.getenv('AWS_OPENSEARCH_ENDPOINT')
region = os.getenv('AWS_REGION')

# --- AWS credentials from the environment ------------------------------------
# Never print or log these values.
access_key = os.getenv('AWS_ACCESS_KEY_ID')
secret_key = os.getenv('AWS_SECRET_ACCESS_KEY')


In [7]:
# Initialize OpenSearch client
#
# Build a signer from the boto3 session's credentials so that every request
# sent to `host` is signed for `region` / `service`.
session = boto3.Session()
credentials = session.get_credentials()
awsauth = AWSV4SignerAuth(credentials, region, service)

# Show which IAM identity the session resolves to; useful when a request is
# rejected for permission reasons.
sts = session.client("sts")
identity = sts.get_caller_identity()
print("Running as IAM identity:", identity["Arn"])

aoss_client = OpenSearch(
    hosts=[{'host': host, 'port': 443}],
    http_compress=True,
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
    pool_maxsize=20,
    timeout=36000,  # Request timeout in seconds (36000 s = 10 hours)
)

# Check if index exists
#
# Only the network call is guarded: a failure to reach or query the cluster is
# reported as "Error checking index", while a successful check that finds no
# index is reported separately below. Both raise RuntimeError -- raising a
# plain dict is not valid and would surface as
# "TypeError: exceptions must derive from BaseException".
try:
    exists = aoss_client.indices.exists(index=index_name)
except Exception as e:
    print(f"Error checking index: {str(e)}")
    raise RuntimeError(f"Error checking index: {str(e)}") from e

print(f"Index {index_name} exists: {exists}")
if not exists:
    raise RuntimeError(f"Index {index_name} does not exist")



Running as IAM identity: arn:aws:iam::211125557955:user/Christophe
Index fed-court-chunks-index exists: True


OpenSearch index:

### Index definition used to construct `fed-court-chunks-index`
"""
{
  "settings": {
    "index": {
      "knn": true
    }
  },
  "mappings": {
    "properties": {
      "embedding": {
        "type": "knn_vector",
        "dimension": 512,
        "method": {
          "name": "hnsw",
          "engine": "faiss",
          "space_type": "l2",
          "parameters": {
            "ef_construction": 128,
            "m": 16
          }
        }
      },
      "doc_id": {
        "type": "keyword"
      },
      "chunk_id": {
        "type": "keyword"
      }
    }
  }
}
"""

In [None]:
# First chunk file to ingest; files whose path sorts before this one are skipped
# (used to resume an interrupted run).
from_file = "../data/split_chunks/part_550.jsonl.gz"


def generate_docs(source_dir=None, start_from=None, target_index=None):
    """Yield one bulk-index action per JSON line of the gzipped chunk files.

    Walks ``source_dir`` with ``os.walk``, visits the ``*.jsonl.gz`` files of
    each directory in sorted order, and turns every non-blank line into an
    action dict of the form
    ``{"_index": ..., "_source": {"embedding", "doc_id", "chunk_id"}}``.
    Progress is printed when a file is started and when it is finished.

    Args:
        source_dir: Directory to walk. Defaults to the global ``chunks_dir``.
        start_from: Files whose joined path compares lower than this string
            are skipped. Defaults to the global ``from_file``.
        target_index: Value placed under ``"_index"``. Defaults to the global
            ``index_name``.

    Yields:
        dict: One action per non-blank input line.

    Raises:
        ValueError: If a line is not valid JSON or lacks one of the required
            keys. The message names the file and the 1-based line number, and
            the original exception is chained as ``__cause__``.
    """
    # Resolve defaults at iteration time so the zero-argument call keeps
    # reading the notebook-level settings.
    source_dir = chunks_dir if source_dir is None else source_dir
    start_from = from_file if start_from is None else start_from
    target_index = index_name if target_index is None else target_index

    for root, _, files in os.walk(source_dir):
        for file in sorted(files):
            if not file.endswith(".jsonl.gz"):
                continue

            file_path = os.path.join(root, file)
            # Plain string comparison on the full path: ordering is
            # lexicographic (not numeric), matching the sorted() order above.
            if file_path < start_from:
                continue

            print(f"Processing file: {file_path}")

            # Read the JSON lines from the gzipped file
            with gzip.open(file_path, 'rt', encoding='utf-8') as f:
                for line_number, line in enumerate(f, start=1):
                    # A blank line (e.g. a trailing newline) carries no document.
                    if not line.strip():
                        continue

                    try:
                        data = json.loads(line)
                        source = {
                            "embedding": data["embedding"],
                            "doc_id": data["doc_id"],
                            "chunk_id": data["chunk_id"],
                        }
                    except (ValueError, KeyError, TypeError) as e:
                        # Raise a real exception (a dict cannot be raised) and
                        # say exactly where the bad record is.
                        raise ValueError(
                            f"Error processing line {line_number} of {file_path}: {e!r}"
                        ) from e

                    # Yield outside the try so errors raised by the consumer
                    # are never mistaken for parse errors.
                    yield {"_index": target_index, "_source": source}

            print(f"Finished processing file: {file_path}")

# Bulk ingest in chunks of 1000
# The generator is consumed lazily by the helper, so documents are streamed
# from disk rather than loaded up front.
bulk_options = {
    "chunk_size": 1000,
    "raise_on_error": True,
    "stats_only": False,
}
success, failed = opensearchpy.helpers.bulk(
    aoss_client,
    generate_docs(),
    **bulk_options,
)

print(f"Successfully inserted: {success}")
print(f"Failures: {failed}")

Processing file: ../data/split_chunks/part_550.jsonl.gz
Finished processing file: ../data/split_chunks/part_550.jsonl.gz
Processing file: ../data/split_chunks/part_551.jsonl.gz
Finished processing file: ../data/split_chunks/part_551.jsonl.gz
Processing file: ../data/split_chunks/part_552.jsonl.gz
Finished processing file: ../data/split_chunks/part_552.jsonl.gz
Processing file: ../data/split_chunks/part_553.jsonl.gz
Finished processing file: ../data/split_chunks/part_553.jsonl.gz
Processing file: ../data/split_chunks/part_554.jsonl.gz
Finished processing file: ../data/split_chunks/part_554.jsonl.gz
Processing file: ../data/split_chunks/part_555.jsonl.gz
Finished processing file: ../data/split_chunks/part_555.jsonl.gz
Processing file: ../data/split_chunks/part_556.jsonl.gz
Finished processing file: ../data/split_chunks/part_556.jsonl.gz
Processing file: ../data/split_chunks/part_557.jsonl.gz
Finished processing file: ../data/split_chunks/part_557.jsonl.gz
Processing file: ../data/split_c

In [26]:
# Tally the number of lines across every gzipped JSONL file under chunks_dir.
lines = 0
for root, _, files in os.walk(chunks_dir):
    for file in sorted(files):
        if not file.endswith(".jsonl.gz"):
            continue

        file_path = os.path.join(root, file)

        # Stream the decompressed text and count its lines without keeping
        # the content in memory.
        with gzip.open(file_path, 'rt', encoding='utf-8') as handle:
            line_count = sum(1 for _ in handle)
        lines += line_count
print(f"Total lines in all files: {lines}")

Total lines in all files: 574828
