In [12]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import nltk
from opensearchpy import OpenSearch


# Load the source documents from the parquet export.
df = pd.read_parquet('../data/bger-2024-3-text.parquet')

# Embedding model. The vector size is read from the loaded model instead of
# being hard-coded, so switching to another model (for example
# "sentence-transformers/distiluse-base-multilingual-cased-v2") cannot leave
# `model_dims` out of sync with the vectors that are actually produced.
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
model_dims = model.get_sentence_embedding_dimension()

# Download the punkt tokenizer data into a project-local directory and make
# sure NLTK searches that directory. The membership check keeps repeated runs
# of this cell from appending the same path over and over.
nltk.download('punkt', download_dir='../data/nltk_data')
nltk.download('punkt_tab', download_dir='../data/nltk_data')
if '../data/nltk_data' not in nltk.data.path:
    nltk.data.path.append('../data/nltk_data')

# Connect to your cluster
opensearch_client = OpenSearch(
    hosts=[{"host": "opensearch-dev", "port": 9200}],  # adapt to your setup
    http_compress=True
)

# Name of the index that stores the chunk embeddings
index_name = "fed-court-chunks"


[nltk_data] Downloading package punkt to ../data/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to ../data/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
from nltk.tokenize import sent_tokenize
import json

# TODO: add overlap between consecutive chunks

def chunk_by_sentences(text, max_words=500, tokenizer=None):
    """Group the sentences of ``text`` into chunks of at most ``max_words`` words.

    Sentences are never split: a chunk is closed as soon as adding the next
    sentence would push its whitespace-separated word count above
    ``max_words``. A single sentence that is longer than ``max_words`` on its
    own is therefore emitted as one oversized chunk.

    Args:
        text: The text to split.
        max_words: Upper bound on the number of words per chunk.
        tokenizer: Optional callable mapping a string to a list of sentences.
            Defaults to ``sent_tokenize``; pass a custom callable to use a
            language-specific splitter.

    Returns:
        A list of chunk strings (sentences joined by a single space). The list
        is empty when the tokenizer finds no sentences, and it never contains
        an empty chunk.
    """
    if tokenizer is None:
        tokenizer = sent_tokenize
    chunks, current_chunk = [], []
    current_length = 0
    for sentence in tokenizer(text):
        word_count = len(sentence.split())
        # Only flush a chunk that actually holds sentences. Without this check
        # an over-long first sentence would flush the still-empty chunk and
        # emit "" as a chunk.
        if current_chunk and current_length + word_count > max_words:
            chunks.append(" ".join(current_chunk))
            current_chunk, current_length = [], 0
        current_chunk.append(sentence)
        current_length += word_count
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

output_path = "../data/chunked_embeddings.jsonl"

# Chunk every document, embed its chunks, and write one JSON object per chunk
# (one per line) to `output_path`. The file is overwritten on every run.
with open(output_path, "w", encoding="utf-8") as f_out:
    for _, row in df.iterrows():
        text = row["text"]
        # A missing text would otherwise be stringified to "None"/"nan" and
        # end up in the output as if it were document content.
        if text is None or (not isinstance(text, str) and pd.isna(text)):
            continue
        chunks = chunk_by_sentences(str(text))
        if not chunks:
            # Nothing to embed for this document; skip the encoder call.
            continue
        embeddings = model.encode(chunks, batch_size=32, show_progress_bar=False)
        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
            data = {
                "doc_id": row["docref"],
                "chunk_id": i,  # position of the chunk within its document
                "text": chunk,
                "language": row["language"],
                "embedding": embedding.tolist()  # converted so json can serialize it
            }
            f_out.write(json.dumps(data) + "\n")


In [None]:
from opensearchpy import OpenSearch
import json
from opensearchpy.helpers import bulk


index_name = "fed-court-chunks"

# Create the index once, with k-NN enabled and an explicit mapping for the
# fields written per chunk. An existing index is left untouched.
index_already_present = opensearch_client.indices.exists(index=index_name)
if not index_already_present:
    knn_settings = {"index": {"knn": True}}
    field_mappings = {
        # Vector field; its dimension has to match the embedding model.
        "embedding": {"type": "knn_vector", "dimension": model_dims},
        "text": {"type": "text"},
        "doc_id": {"type": "keyword"},
        "chunk_id": {"type": "integer"},
    }
    opensearch_client.indices.create(
        index=index_name,
        body={
            "settings": knn_settings,
            "mappings": {"properties": field_mappings},
        },
    )

# Generator to yield actions from the JSONL file
def generate_actions(jsonl_path, index=None):
    """Yield one bulk-index action per non-blank line of a JSONL chunk file.

    Each line is parsed as a JSON object with the keys ``doc_id``,
    ``chunk_id``, ``text`` and ``embedding``; any other keys are ignored.
    The file is read lazily, one line at a time.

    Args:
        jsonl_path: Path of the JSONL file to read.
        index: Name of the target index. Defaults to the module-level
            ``index_name`` when omitted.

    Yields:
        A dict with ``_index``, ``_id`` (``"<doc_id>-<chunk_id>"``) and
        ``_source`` holding the four fields listed above.
    """
    target_index = index_name if index is None else index
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a trailing newline) are not valid JSON; skip
            # them instead of failing in json.loads.
            if not line.strip():
                continue
            chunk = json.loads(line)
            yield {
                "_index": target_index,
                "_id": f"{chunk['doc_id']}-{chunk['chunk_id']}",
                "_source": {
                    "embedding": chunk["embedding"],
                    "text": chunk["text"],
                    "doc_id": chunk["doc_id"],
                    "chunk_id": chunk["chunk_id"]
                }
            }

# Client used for the bulk upload; HTTP compression is switched off here.
cluster_hosts = [{"host": "opensearch-dev", "port": 9200}]  # adapt to your setup
client = OpenSearch(hosts=cluster_hosts, http_compress=False)

# Stream the JSONL file into the index in small batches. The call is the last
# expression of the cell, so its return value is shown as the cell output.
chunk_actions = generate_actions("../data/chunked_embeddings.jsonl")
bulk(
    client,
    chunk_actions,
    chunk_size=10,
    request_timeout=3600,
)


(934965, [])

In [36]:

# One summary line per index: name, document count and store size.
indices = client.cat.indices(format="json")
for entry in indices:
    name, doc_count, size = entry['index'], entry['docs.count'], entry['store.size']
    print(f"{name:30} | Docs: {doc_count:10} | Size: {size}")

# Overall cluster health as returned by the cluster API.
health = client.cluster.health()
print(health)

fed-court-chunks               | Docs: 934965     | Size: 9.9gb
{'cluster_name': 'docker-cluster', 'status': 'yellow', 'timed_out': False, 'number_of_nodes': 1, 'number_of_data_nodes': 1, 'discovered_master': True, 'discovered_cluster_manager': True, 'active_primary_shards': 1, 'active_shards': 1, 'relocating_shards': 0, 'initializing_shards': 0, 'unassigned_shards': 1, 'delayed_unassigned_shards': 0, 'number_of_pending_tasks': 0, 'number_of_in_flight_fetch': 0, 'task_max_waiting_in_queue_millis': 0, 'active_shards_percent_as_number': 50.0}


In [6]:
# Destructive cleanup: deletes the whole index named by `index_name` from the
# cluster. Run this cell only when the index should be rebuilt from scratch.
client.indices.delete(index=index_name, request_timeout=3600)
#df.columns


{'acknowledged': True}