In [1]:
import pandas
from sentence_transformers import SentenceTransformer
import nltk
import os

# Load the court-decision corpus from parquet. The chunking cell below reads the
# "text", "docref", "url", "date" and "language" columns from this frame.
df = pandas.read_parquet('../data/bger-2024-3-text.parquet')

# Alternative embedding model kept for reference; it is paired with model_dims = 384.
#model_dims = 384
#model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

# Default upper bound, in whitespace-separated words, for one chunk (see chunk_by_sentences).
chunk_max_words = 500
# Embedding width recorded for the selected model. Not referenced in the cells
# visible here -- presumably used when the search index is defined; confirm.
model_dims = 512
model_name = "distiluse-base-multilingual-cased-v2"
model = SentenceTransformer(f"sentence-transformers/{model_name}")
# Persist a local copy of the loaded model under ../data/model.
model.save("../data/model")

# Fetch the NLTK tokenizer data into a project-local directory and add that
# directory to NLTK's search path (presumably required by sent_tokenize, which
# the next cell uses -- confirm against the NLTK version in use).
nltk.download('punkt', download_dir='../data/nltk_data')
nltk.download('punkt_tab', download_dir='../data/nltk_data')
nltk.data.path.append('../data/nltk_data')

# NOTE(review): not used in the cells visible here; looks like the name of the
# target search index for a later upload step -- confirm.
index_name = "fed-court-chunks-index"

# The export file name encodes the chunk size and the model name, so each
# (chunk size, model) combination gets its own file.
output_path = f"../data/chunked_embeddings_{chunk_max_words}_{model_name}.jsonl"

# AWS settings are read from the environment; os.environ.get returns None for
# any variable that is not set. They are not used in the cells visible here.
region = os.environ.get('AWS_REGION')
access_key = os.environ.get('AWS_ACCESS_KEY_ID')
secret_key = os.environ.get('AWS_SECRET_ACCESS_KEY')


[nltk_data] Downloading package punkt to ../data/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to ../data/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
from nltk.tokenize import sent_tokenize
import json
import os

# Safety stop: never recompute embeddings over an export that is already on
# disk. Delete or rename the existing file to force a fresh run.
if os.path.exists(output_path):
    raise FileExistsError(
        f"File {output_path} already exists. Aborting to prevent overwrite."
    )


def chunk_by_sentences(text, max_words=chunk_max_words):
    """Split ``text`` into chunks of whole sentences, each at most ``max_words`` words.

    Sentences come from ``sent_tokenize`` and are never split: consecutive
    sentences are packed greedily into a chunk until adding the next one would
    push the chunk's whitespace-separated word count above ``max_words``.

    Args:
        text: The document text to split.
        max_words: Word budget per chunk. Defaults to the notebook-level
            ``chunk_max_words`` as it was when this function was defined.

    Returns:
        A list of chunk strings (sentences joined by a single space), in
        document order. The list is empty when the tokenizer yields no
        sentences. A single sentence longer than ``max_words`` is kept intact
        as its own oversized chunk.
    """
    chunks = []
    current_chunk = []
    current_length = 0
    for sentence in sent_tokenize(text):
        word_count = len(sentence.split())
        # Flush only when something has been buffered. Without the
        # `current_chunk` check, a first sentence that already exceeds the
        # budget would flush an empty buffer and emit a spurious "" chunk.
        if current_chunk and current_length + word_count > max_words:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(sentence)
        current_length += word_count
    # Emit whatever is still buffered after the last sentence.
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

# Chunk and embed every document, writing one JSON object per chunk (JSON Lines).
#
# The export is written to a sibling ".partial" file and renamed to output_path
# only after the whole corpus has been processed. If the run crashes or is
# interrupted, output_path is never created, so the "already exists" guard
# above cannot mistake a truncated file for a finished export; the next run
# simply overwrites the leftover ".partial" file.
partial_output_path = f"{output_path}.partial"
with open(partial_output_path, "w", encoding="utf-8") as f_out:
    for _, row in df.iterrows():
        chunks = chunk_by_sentences(str(row["text"]))
        if not chunks:
            # Nothing to embed or write for this document; skip the encoder
            # call instead of passing it an empty batch.
            continue
        embeddings = model.encode(chunks, batch_size=32, show_progress_bar=False)
        # chunk_id is the chunk's position within its document.
        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
            data = {
                "doc_id": row["docref"],
                "chunk_id": i,
                "text": chunk,
                "url": row["url"],
                "date": row["date"],
                "language": row["language"],
                # .tolist() turns the embedding row into plain Python floats for JSON.
                "embedding": embedding.tolist()
            }
            f_out.write(json.dumps(data) + "\n")
# Publish the finished export under its final name (runs after the file is closed).
os.replace(partial_output_path, output_path)


FileExistsError: File ../data/chunked_embeddings_500_distiluse-base-multilingual-cased-v2.jsonl already exists. Aborting to prevent overwrite.