In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import nltk
from opensearchpy import OpenSearch


# Load the source data frame from the parquet export.
df = pd.read_parquet('../data/bger-2024-3-text.parquet')

# Sentence-embedding model used later to encode the text chunks.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Download the NLTK tokenizer resources into a project-local directory and
# register that directory on NLTK's search path.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource, download_dir='../data/nltk_data')
nltk.data.path.append('../data/nltk_data')

# OpenSearch connection (request compression enabled).
client = OpenSearch(
    http_compress=True,
    hosts=[{"host": "opensearch-dev", "port": 9200}],  # adapt to your setup
)

# Target index: one document per text chunk, with a k-NN vector field.
index_name = "fed-court-chunks"

index_body = {
    "mappings": {
        "properties": {
            "doc_id": {"type": "keyword"},
            "chunk_id": {"type": "integer"},
            "text": {"type": "text"},
            # The dimension must equal the length of the vectors written to
            # this field -- presumably 384 for the model above; TODO confirm.
            "embedding": {"type": "knn_vector", "dimension": 384},
        }
    },
    "settings": {"index": {"knn": True}},
}

# Create the index only on first run; an existing index is left untouched.
if not client.indices.exists(index=index_name):
    client.indices.create(index=index_name, body=index_body)

In [None]:
from nltk.tokenize import sent_tokenize

def chunk_by_sentences(text, max_words=300):
    """Group the sentences of ``text`` into chunks of at most ``max_words`` words.

    Sentences are obtained with ``sent_tokenize`` and are never split: a
    single sentence longer than ``max_words`` becomes a chunk on its own
    (and therefore exceeds the limit). Words are counted by whitespace
    splitting.

    Args:
        text: The string to split.
        max_words: Word budget per chunk.

    Returns:
        A list of chunk strings, each the space-joined sentences of one
        group. Empty when ``sent_tokenize`` yields no sentences. No chunk
        is ever an empty string.
    """
    chunks = []
    current_chunk = []
    current_length = 0
    for sentence in sent_tokenize(text):
        word_count = len(sentence.split())
        # Flush only when there is something buffered. Without this guard an
        # over-long first sentence would emit an empty "" chunk ahead of it.
        if current_chunk and current_length + word_count > max_words:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(sentence)
        current_length += word_count
    # Emit whatever is left in the buffer after the last sentence.
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

# Flatten the corpus into one record per chunk. ``doc_id`` is the row's
# position in df['text']; ``chunk_id`` is the chunk's position within its row.
all_chunks = []
for doc_id, text in enumerate(df['text'].to_list()):
    chunks = chunk_by_sentences(str(text))
    all_chunks.extend(
        {"doc_id": doc_id, "chunk_id": i, "text": chunk}
        for i, chunk in enumerate(chunks)
    )


In [None]:
# Keep the texts in all_chunks order so that embeddings[i] belongs to all_chunks[i].
texts_to_embed = [record["text"] for record in all_chunks]
embeddings = model.encode(
    texts_to_embed,
    batch_size=32,
    show_progress_bar=True,
)

Batches:   0%|          | 0/29218 [00:00<?, ?it/s]

In [None]:
import json

output_path = "../data/chunked_embeddings.jsonl"

# Each chunk is paired with the embedding at the same position; refuse to
# write a file if the two sequences have drifted apart.
if len(embeddings) != len(all_chunks):
    raise ValueError(
        f"Expected one embedding per chunk, got {len(embeddings)} embeddings "
        f"for {len(all_chunks)} chunks"
    )

# Write one JSON object per line: the chunk's fields plus its embedding.
with open(output_path, "w", encoding="utf-8") as f_out:
    for chunk, vector in zip(all_chunks, embeddings):
        # Build a fresh record per line instead of attaching the list to the
        # chunk dict: mutating all_chunks would keep every embedding alive in
        # memory as a Python list in addition to the `embeddings` array.
        record = {**chunk, "embedding": vector.tolist()}
        f_out.write(json.dumps(record) + "\n")

In [8]:
from opensearchpy import OpenSearch
import json
from opensearchpy.helpers import bulk

# Generator to yield actions from the JSONL file
def generate_actions(jsonl_path):
    """Yield one bulk-index action per JSON line of ``jsonl_path``.

    Each non-blank line must be a JSON object with the keys ``doc_id``,
    ``chunk_id``, ``text`` and ``embedding``. The file is read lazily, line
    by line, so it is never held in memory as a whole.

    The target index is taken from the module-level ``index_name``, which is
    looked up while iterating (not when the generator is created).

    Args:
        jsonl_path: Path of the JSONL file to read (decoded as UTF-8).

    Yields:
        Dicts with ``_index``, ``_id`` (``"<doc_id>-<chunk_id>"``) and
        ``_source`` holding the four fields listed above.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required keys.
    """
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing in json.loads.
            if not line.strip():
                continue
            chunk = json.loads(line)
            yield {
                "_index": index_name,
                "_id": f"{chunk['doc_id']}-{chunk['chunk_id']}",
                "_source": {
                    "doc_id": chunk["doc_id"],
                    "chunk_id": chunk["chunk_id"],
                    "text": chunk["text"],
                    "embedding": chunk["embedding"]
                }
            }


# Name of the index the generated actions are written to.
index_name = "fed-court-chunks"


# OpenSearch connection for the upload (request compression disabled here).
client = OpenSearch(
    http_compress=False,
    hosts=[{"host": "opensearch-dev", "port": 9200}],  # adapt to your setup
)

# Stream the actions from the JSONL file into the bulk helper; the helper's
# return value is the cell output.
actions = generate_actions("../data/chunked_embeddings.jsonl")
bulk(client, actions)

(934965, [])

In [11]:

# Per-index overview: one line with name, document count and store size.
indices = client.cat.indices(format="json")  # compact JSON format
for idx in indices:
    summary_row = f"{idx['index']:30} | Docs: {idx['docs.count']:10} | Size: {idx['store.size']}"
    print(summary_row)

# Cluster-level health report, printed as returned by the client.
health = client.cluster.health()
print(health)

fed-court-chunks               | Docs: 928000     | Size: 13.3gb
{'cluster_name': 'docker-cluster', 'status': 'yellow', 'timed_out': False, 'number_of_nodes': 1, 'number_of_data_nodes': 1, 'discovered_master': True, 'discovered_cluster_manager': True, 'active_primary_shards': 1, 'active_shards': 1, 'relocating_shards': 0, 'initializing_shards': 0, 'unassigned_shards': 1, 'delayed_unassigned_shards': 0, 'number_of_pending_tasks': 0, 'number_of_in_flight_fetch': 0, 'task_max_waiting_in_queue_millis': 0, 'active_shards_percent_as_number': 50.0}


{'acknowledged': True}