In [5]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import nltk
from opensearchpy import OpenSearch
import os

# --- Input corpus -----------------------------------------------------------
df = pd.read_parquet('../data/bger-2024-3-text.parquet')

# --- Embedding model --------------------------------------------------------
# Alternative model configuration (disabled):
#model_dims = 384
#model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
model_name = "distiluse-base-multilingual-cased-v2"
model_dims = 512        # vector size; later used as the knn_vector "dimension"
chunk_max_words = 500   # default word budget per chunk for chunk_by_sentences
model = SentenceTransformer(f"sentence-transformers/{model_name}")

# --- NLTK sentence tokenizer data --------------------------------------------
# Download into a project-local directory, then register that directory so
# nltk can find the resources.
nltk_data_dir = '../data/nltk_data'
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource, download_dir=nltk_data_dir)
nltk.data.path.append(nltk_data_dir)

# --- Targets ----------------------------------------------------------------
index_name = "fed-court-chunks-index"
# The file name encodes the chunk size and model so different runs do not collide.
output_path = f"../data/chunked_embeddings_{chunk_max_words}_{model_name}.jsonl"

# --- AWS settings (each is None when the environment variable is not set) ----
region, access_key, secret_key = (
    os.environ.get(variable)
    for variable in ('AWS_REGION', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')
)


[nltk_data] Downloading package punkt to ../data/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to ../data/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
from nltk.tokenize import sent_tokenize
import json
import os

# Refuse to run when the embeddings file is already present, so an existing
# result is never overwritten by re-running this cell.
output_already_present = os.path.exists(output_path)
if output_already_present:
    guard_message = f"File {output_path} already exists. Aborting to prevent overwrite."
    raise FileExistsError(guard_message)


def chunk_by_sentences(text, max_words=chunk_max_words):
    """Group the sentences of ``text`` into chunks of at most ``max_words`` words.

    Sentences are produced by ``sent_tokenize`` and their length is measured
    as the number of whitespace-separated tokens. Sentences are accumulated
    in order; when adding the next sentence would push the running word count
    above ``max_words``, the accumulated sentences are emitted as one chunk
    and the next sentence starts a new one. Sentences are never split, so a
    single sentence longer than ``max_words`` becomes its own oversized chunk.

    Args:
        text: The document text to split.
        max_words: Word budget per chunk. The default is bound to the value
            ``chunk_max_words`` had when this function was defined.

    Returns:
        A list of chunk strings, each made of whole sentences joined by a
        single space. The list is empty when ``sent_tokenize`` yields no
        sentences, and it never contains an empty string.
    """
    chunks, current_chunk = [], []
    current_length = 0
    for sentence in sent_tokenize(text):
        word_count = len(sentence.split())
        # Flush only a chunk that actually holds sentences. Without the
        # `current_chunk` check, a first sentence longer than max_words
        # would emit "" (the join of an empty list) as a bogus chunk.
        if current_chunk and current_length + word_count > max_words:
            chunks.append(" ".join(current_chunk))
            current_chunk, current_length = [], 0
        current_chunk.append(sentence)
        current_length += word_count
    # Emit whatever is left after the last sentence.
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

# Embed every document chunk by chunk and stream the results to a JSONL file,
# one JSON object per chunk.
#
# The records are written to a ".partial" sibling file and only renamed to
# output_path once the whole loop has finished. If the run fails part-way,
# output_path is therefore never left holding a truncated result that would
# both trip the "already exists" guard and look complete to later cells.
partial_output_path = f"{output_path}.partial"
with open(partial_output_path, "w", encoding="utf-8") as f_out:
    for _, row in df.iterrows():
        # str() is applied so that non-string cell values still reach the tokenizer as text.
        chunks = chunk_by_sentences(str(row["text"]))
        embeddings = model.encode(chunks, batch_size=32, show_progress_bar=False)
        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
            data = {
                "doc_id": row["docref"],
                "chunk_id": i,  # position of the chunk within its document
                "text": chunk,
                "url": row["url"],
                "date": row["date"],
                "language": row["language"],
                "embedding": embedding.tolist()  # converted to a plain list for json.dumps
            }
            f_out.write(json.dumps(data) + "\n")

# Publish the finished file under its final name.
os.replace(partial_output_path, output_path)


FileExistsError: File ../data/chunked_embeddings_500_distiluse-base-multilingual-cased-v2.jsonl already exists. Aborting to prevent overwrite.

In [None]:
import boto3
import gzip
import os

# Bucket that receives the gzipped JSONL parts
bucket_name = "manual-embedding-chunks"

# S3 client built from the notebook-level credential and region variables
s3_client = boto3.client(
    service_name="s3",
    region_name=region,
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key,
)

# Function to split JSONL file
def split_jsonl_file(input_path, output_dir, lines_per_file=1000):
    """Split a JSONL file into gzip-compressed parts of ``lines_per_file`` lines.

    Parts are written into ``output_dir`` (created if it does not exist) as
    ``part_<n>.jsonl.gz`` with ``n`` counting up from 0. The final part holds
    whatever lines remain and may be shorter. Lines are copied unchanged.
    Files already present in ``output_dir`` are not removed; a part with the
    same name is overwritten.

    Args:
        input_path: Path of the JSONL file to split.
        output_dir: Directory that receives the part files.
        lines_per_file: Number of lines after which a part is written.

    Returns:
        None. A progress message is printed for every part written.
    """
    os.makedirs(output_dir, exist_ok=True)

    def _write_part(part_index, lines, label):
        # Write one gzip part; `label` only feeds the progress message.
        output_file = os.path.join(output_dir, f"part_{part_index}.jsonl.gz")
        with gzip.open(output_file, "wt", encoding="utf-8") as outfile:
            print(f"Writing {label} lines to {output_file}")
            outfile.writelines(lines)

    file_count = 0
    current_lines = []
    # Read with the same explicit encoding the parts are written with, so the
    # result does not depend on the platform's default text encoding.
    with open(input_path, "r", encoding="utf-8") as infile:
        for line in infile:
            current_lines.append(line)
            if len(current_lines) >= lines_per_file:
                _write_part(file_count, current_lines, len(current_lines))
                current_lines = []
                file_count += 1
    # Flush the trailing lines that did not fill a whole part.
    if current_lines:
        _write_part(file_count, current_lines, "remaining")

# Function to upload files to S3
def upload_to_s3(directory, bucket_name):
    """Upload every ``.jsonl.gz`` file found directly in ``directory`` to S3.

    Each matching file is sent through the module-level ``s3_client`` to
    ``bucket_name``, using the bare file name as the object key. Entries with
    any other suffix are skipped. A confirmation line is printed per upload.
    """
    for entry in os.listdir(directory):
        if not entry.endswith(".jsonl.gz"):
            continue
        s3_client.upload_file(os.path.join(directory, entry), bucket_name, entry)
        print(f"Uploaded {entry} to S3 bucket {bucket_name}")

# Cut the embeddings file into gzipped parts, then push those parts to S3
output_dir = "../data/split_chunks"
split_jsonl_file(input_path=output_path, output_dir=output_dir)
upload_to_s3(directory=output_dir, bucket_name=bucket_name)

Uploaded part_206.jsonl.gz to S3 bucket manual-embedding-chunks
Uploaded part_122.jsonl.gz to S3 bucket manual-embedding-chunks
Uploaded part_76.jsonl.gz to S3 bucket manual-embedding-chunks
Uploaded part_378.jsonl.gz to S3 bucket manual-embedding-chunks
Uploaded part_376.jsonl.gz to S3 bucket manual-embedding-chunks
Uploaded part_498.jsonl.gz to S3 bucket manual-embedding-chunks
Uploaded part_139.jsonl.gz to S3 bucket manual-embedding-chunks
Uploaded part_406.jsonl.gz to S3 bucket manual-embedding-chunks
Uploaded part_75.jsonl.gz to S3 bucket manual-embedding-chunks
Uploaded part_196.jsonl.gz to S3 bucket manual-embedding-chunks
Uploaded part_69.jsonl.gz to S3 bucket manual-embedding-chunks
Uploaded part_516.jsonl.gz to S3 bucket manual-embedding-chunks
Uploaded part_199.jsonl.gz to S3 bucket manual-embedding-chunks
Uploaded part_558.jsonl.gz to S3 bucket manual-embedding-chunks
Uploaded part_255.jsonl.gz to S3 bucket manual-embedding-chunks
Uploaded part_3.jsonl.gz to S3 bucket manua

In [15]:
import boto3
import json

# Config
bucket_name = "manual-embedding-chunks"
lambda_function_name = "manual-import-embeddings"

# AWS clients. Both are built from the same explicit credentials and region,
# matching how the other AWS clients in this notebook are configured.
aws_client_options = {
    "aws_access_key_id": access_key,
    "aws_secret_access_key": secret_key,
    "region_name": region,
}
s3 = boto3.client("s3", **aws_client_options)
lambda_client = boto3.client("lambda", **aws_client_options)

# List every object in the bucket (no prefix filter is applied)
paginator = s3.get_paginator("list_objects_v2")
pages = paginator.paginate(Bucket=bucket_name)

for page in pages:
    # A page without a "Contents" entry is treated as having no objects.
    for obj in page.get("Contents", []):
        key = obj["Key"]
        # Only the gzipped JSONL parts are handed to the Lambda.
        if not key.endswith(".jsonl.gz"):
            continue
        s3_uri = f"s3://{bucket_name}/{key}"

        print(f"Invoking Lambda with S3 URI: {s3_uri}")

        payload = {
            "s3_uri": s3_uri
        }

        response = lambda_client.invoke(
            FunctionName=lambda_function_name,
            InvocationType='Event',  # async; use 'RequestResponse' for sync
            Payload=json.dumps(payload)
        )
        # Kept for switching to a synchronous invocation:
        #result = json.load(response['Payload'])
        print("Lambda result:", response)



Invoking Lambda with S3 URI: s3://manual-embedding-chunks/part_0.jsonl.gz
Lambda result: {'ResponseMetadata': {'RequestId': '37e3b98e-0419-4fcd-9d86-e6af63859be8', 'HTTPStatusCode': 202, 'HTTPHeaders': {'date': 'Fri, 18 Apr 2025 10:53:45 GMT', 'content-length': '0', 'connection': 'keep-alive', 'x-amzn-requestid': '37e3b98e-0419-4fcd-9d86-e6af63859be8', 'x-amzn-remapped-content-length': '0', 'x-amzn-trace-id': 'root=1-68022f39-13bf838e0012615c351bffe3;parent=23e5a2801537fbaf;sampled=0'}, 'RetryAttempts': 0}, 'StatusCode': 202, 'Payload': <botocore.response.StreamingBody object at 0x7d5bf42c80d0>}
Invoking Lambda with S3 URI: s3://manual-embedding-chunks/part_1.jsonl.gz
Lambda result: {'ResponseMetadata': {'RequestId': '7afbcf51-4da6-4cfa-a5de-ef0244775f07', 'HTTPStatusCode': 202, 'HTTPHeaders': {'date': 'Fri, 18 Apr 2025 10:53:45 GMT', 'content-length': '0', 'connection': 'keep-alive', 'x-amzn-requestid': '7afbcf51-4da6-4cfa-a5de-ef0244775f07', 'x-amzn-remapped-content-length': '0', 'x-

In [16]:
from opensearchpy import OpenSearch, RequestsHttpConnection
from requests_aws4auth import AWS4Auth
import boto3
import os
import json
from opensearchpy.helpers import bulk


# --- Target collection --------------------------------------------------------
service = 'aoss'  # OpenSearch Serverless service identifier
host = os.environ.get('AWS_OPENSEARCH_ENDPOINT')
index_name = 'fed-court-chunks-index'

# --- AWS settings, re-read from the environment -------------------------------
region = os.environ.get('AWS_REGION')
access_key = os.environ.get('AWS_ACCESS_KEY_ID')
secret_key = os.environ.get('AWS_SECRET_ACCESS_KEY')

# Resolve credentials through a boto3 session
session = boto3.Session(
    region_name=region,
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key,
)
credentials = session.get_credentials()

# Request signer for the service named above; the session token is passed along as well
awsauth = AWS4Auth(
    credentials.access_key,
    credentials.secret_key,
    region,
    service,
    session_token=credentials.token,
)

# Initialize OpenSearch client (port 443, SSL with certificate verification, compressed requests)
aoss_client = OpenSearch(
    hosts=[dict(host=host, port=443)],
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    http_compress=True,
    connection_class=RequestsHttpConnection,
)

### Reference only: settings and mappings for constructing fed-court-chunks-index (inert string; nothing in this cell sends it to OpenSearch)
"""
{
  "settings": {
    "index": {
      "knn": true
    }
  },
  "mappings": {
    "properties": {
      "embedding": {
        "type": "knn_vector",
        "dimension": 512,
        "method": {
          "name": "hnsw",
          "engine": "faiss",
          "space_type": "l2",
          "parameters": {
            "ef_construction": 128,
            "m": 16
          }
        }
      },
      "doc_id": {
        "type": "keyword"
      },
      "chunk_id": {
        "type": "keyword"
      }
    }
  }
}
"""

# Sanity check: print name, document count and store size of every index
indices = aoss_client.cat.indices(format="json")  # compact JSON format
for entry in indices:
    name, doc_count, size = entry['index'], entry['docs.count'], entry['store.size']
    print(f"{name:30} | Docs: {doc_count:10} | Size: {size}")

# Generator to yield actions from the JSONL file
def generate_actions(jsonl_path):
    """Lazily turn each line of a JSONL file into one bulk-index action.

    Every line is parsed as JSON. The yielded action targets the module-level
    ``index_name`` and its ``_source`` carries only the record's
    ``embedding`` and ``doc_id`` values.
    """
    source_fields = ("embedding", "doc_id")
    with open(jsonl_path, "r") as jsonl_file:
        for raw_line in jsonl_file:
            record = json.loads(raw_line)
            yield {
                "_index": index_name,
                "_source": {field: record[field] for field in source_fields},
            }

# Use bulk helper with the generator
#bulk(aoss_client,
#     generate_actions(output_path),
#    chunk_size=10,            # 💡 try 20 or even 10
#    request_timeout=7200
#)

fed-court-chunks-index         | Docs: 488571     | Size: 5.6gb


In [None]:
import json
from opensearchpy.helpers import bulk


# NOTE: this rebinds index_name, which an earlier cell set to a different value.
index_name = "fed-court-chunks"

# Connect to your cluster
opensearch_client = OpenSearch(
    hosts=[dict(host="opensearch-dev", port=9200)],  # adapt to your setup
    http_compress=False,
)

# Overview of the existing indices: name, document count, store size
indices = opensearch_client.cat.indices(format="json")  # compact JSON format
for entry in indices:
    name, doc_count, size = entry['index'], entry['docs.count'], entry['store.size']
    print(f"{name:30} | Docs: {doc_count:10} | Size: {size}")

health = opensearch_client.cluster.health()
print(health)

# Create the index only when it is not there yet
index_missing = not opensearch_client.indices.exists(index=index_name)
if index_missing:
    index_settings = {"index": {"knn": True}}
    field_mappings = {
        # vector field sized by model_dims
        "embedding": {"type": "knn_vector", "dimension": model_dims},
        "text": {"type": "text"},
        "doc_id": {"type": "keyword"},
        "url": {"type": "text"},
        "date": {"type": "text"},
        "chunk_id": {"type": "integer"},
    }
    opensearch_client.indices.create(
        index=index_name,
        body={
            "settings": index_settings,
            "mappings": {"properties": field_mappings},
        },
    )

# Generator to yield actions from the JSONL file
def generate_actions(jsonl_path):
    """Lazily turn each line of a JSONL file into one bulk-index action.

    Every line is parsed as JSON. The yielded action targets the module-level
    ``index_name``, uses ``"<doc_id>-<chunk_id>"`` as the document ``_id``,
    and copies the embedding, text, doc_id, url, date and chunk_id values of
    the record into ``_source``.
    """
    source_fields = ("embedding", "text", "doc_id", "url", "date", "chunk_id")
    with open(jsonl_path, "r") as jsonl_file:
        for raw_line in jsonl_file:
            record = json.loads(raw_line)
            document_id = f"{record['doc_id']}-{record['chunk_id']}"
            yield {
                "_index": index_name,
                "_id": document_id,
                "_source": {field: record[field] for field in source_fields},
            }


# Stream the generated actions into the index through the bulk helper
bulk_options = {
    "chunk_size": 10,            # 💡 try 20 or even 10
    "request_timeout": 3600,
}
bulk(opensearch_client, generate_actions(output_path), **bulk_options)


(574828, [])