In [None]:
import os
import json
import gzip
import psycopg2
from psycopg2.extras import execute_values

# Directory that holds the gzipped JSONL chunk files, and the number of rows
# collected before a batch is handed to the database.
chunks_dir = "../data/split_chunks"
BATCH_SIZE = 1000

# All settings come from the environment so no credentials live in the notebook.
# region / access_key / secret_key are not used by the cells visible here; they
# are kept in case other cells rely on them.
region = os.environ.get('AWS_REGION')
access_key = os.environ.get('AWS_ACCESS_KEY_ID')
secret_key = os.environ.get('AWS_SECRET_ACCESS_KEY')
aurora_user = os.environ.get('AWS_AURORA_VECTORS_USERNAME')
aurora_password = os.environ.get('AWS_AURORA_VECTORS_PASSWORD')

# The host argument was left blank in the notebook (a syntax error).
# NOTE(review): the variable name below is an assumption modelled on the
# username/password variables — confirm or rename to match the deployment.
aurora_host = os.environ.get('AWS_AURORA_VECTORS_HOST')
if not aurora_host:
    # Fail early with a clear message rather than attempting a connection
    # without a host.
    raise RuntimeError(
        "AWS_AURORA_VECTORS_HOST is not set; export the Aurora writer endpoint before running this cell."
    )

conn = psycopg2.connect(
    host=aurora_host,  # writer endpoint
    port=5432,
    dbname="fedcourtdecisions",
    user=aurora_user,
    password=aurora_password
)

# With autocommit on, every statement is committed as soon as it executes, so a
# failed batch does not roll back batches that were already inserted.
conn.autocommit = True

Connecting from this notebook requires the Aurora DB to be remotely accessible, so update the infrastructure accordingly.

In [4]:
# Resume point: chunk files whose path sorts before this one are skipped.
from_file = "../data/split_chunks/part_0.jsonl.gz"

def generate_batches(source_dir=None, batch_size=None, start_from=None):
    """Yield lists of (doc_id, chunk_id, embedding) tuples read from gzipped JSONL files.

    Walks ``source_dir`` recursively, visiting directories and files in sorted
    order, and reads every ``*.jsonl.gz`` file whose path compares ``>=``
    ``start_from``. The comparison is a plain string comparison, so
    ``part_10`` sorts before ``part_2``; this matches the sorted visiting order.

    Args:
        source_dir: Directory to scan. Defaults to the module-level ``chunks_dir``.
        batch_size: Maximum number of records per yielded batch. Defaults to
            the module-level ``BATCH_SIZE``.
        start_from: Path of the first file to process. Defaults to the
            module-level ``from_file``.

    Yields:
        Lists of at most ``batch_size`` tuples. A partial batch is flushed at
        the end of each file, so a batch never mixes records from two files.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    source_dir = chunks_dir if source_dir is None else source_dir
    batch_size = BATCH_SIZE if batch_size is None else batch_size
    start_from = from_file if start_from is None else start_from
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    buffer = []
    for root, dirs, files in os.walk(source_dir):
        # Sorting in place makes os.walk descend into subdirectories in a
        # deterministic order, which the resume filter below depends on.
        dirs.sort()
        for file in sorted(files):
            file_path = os.path.join(root, file)
            if file_path < start_from or not file.endswith(".jsonl.gz"):
                continue
            print(f"Processing {file_path}")
            with gzip.open(file_path, "rt", encoding="utf-8") as f:
                for line in f:
                    # Only parsing and field extraction are guarded; a bad
                    # line is reported and skipped without touching the buffer.
                    try:
                        data = json.loads(line)
                        record = (data["doc_id"], data["chunk_id"], data["embedding"])
                    except Exception as e:
                        print(f"Error parsing: {e}")
                        continue
                    buffer.append(record)
                    if len(buffer) >= batch_size:
                        yield buffer
                        buffer = []
            # Flush whatever is left from this file before moving on.
            if buffer:
                yield buffer
                buffer = []

# Bulk-load every batch into the embeddings table. Rows that hit a conflict are
# skipped by the statement itself, and a failing batch is reported without
# stopping the remaining batches.
insert_sql = "INSERT INTO embeddings (doc_id, chunk_id, embedding) VALUES %s ON CONFLICT DO NOTHING"

with conn.cursor() as cur:
    for batch in generate_batches():
        try:
            execute_values(cur, insert_sql, batch)
        except Exception as e:
            print(f"Batch insert failed: {e}")
        else:
            # Reports the batch size that was sent, not the number of rows stored.
            print(f"Inserted {len(batch)} records.")


Processing ../data/split_chunks/part_0.jsonl.gz
Inserted 1000 records.
Processing ../data/split_chunks/part_1.jsonl.gz
Inserted 1000 records.
Processing ../data/split_chunks/part_10.jsonl.gz
Inserted 1000 records.
Processing ../data/split_chunks/part_100.jsonl.gz
Inserted 1000 records.
Processing ../data/split_chunks/part_101.jsonl.gz
Inserted 1000 records.
Processing ../data/split_chunks/part_102.jsonl.gz
Inserted 1000 records.
Processing ../data/split_chunks/part_103.jsonl.gz
Inserted 1000 records.
Processing ../data/split_chunks/part_104.jsonl.gz
Inserted 1000 records.
Processing ../data/split_chunks/part_105.jsonl.gz
Inserted 1000 records.
Processing ../data/split_chunks/part_106.jsonl.gz
Inserted 1000 records.
Processing ../data/split_chunks/part_107.jsonl.gz
Inserted 1000 records.
Processing ../data/split_chunks/part_108.jsonl.gz
Inserted 1000 records.
Processing ../data/split_chunks/part_109.jsonl.gz
Inserted 1000 records.
Processing ../data/split_chunks/part_11.jsonl.gz
Inserte

In [None]:
# Count the lines across every gzipped JSONL chunk file under chunks_dir.
lines = 0
for root, _, files in os.walk(chunks_dir):
    chunk_names = [name for name in sorted(files) if name.endswith(".jsonl.gz")]
    for name in chunk_names:
        # Stream the decompressed text and count its lines without loading the file.
        with gzip.open(os.path.join(root, name), 'rt', encoding='utf-8') as f:
            lines += sum(1 for _ in f)
print(f"Total lines in all files: {lines}")

Total lines in all files: 574828
