In [1]:
import os
from dotenv import load_dotenv

load_dotenv()

True

In [4]:
import os
from pathlib import Path
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Qdrant
from qdrant_client import QdrantClient
from tqdm import tqdm
from qdrant_client.models import VectorParams, Distance

from text_parsers.unified_parser import parse_file


# --- Settings ---
COLLECTION_NAME = 'amatol_docs'
ROOT_DIR = './amatol-test'  # Modify this as needed

# --- Step 1: Recursively find all .txt files ---
def find_txt_files(root_dir: str) -> list[Path]:
    return [p for p in Path(root_dir).rglob('*.txt')]

# --- Step 2: Load ONE file and attach metadata ---
# from journal_parser import parse_journal_article

def load_one_document(path: Path, root_dir: str):
    parsed = parse_file(str(path))  # unified output
    return [Document(page_content=parsed["page_content"], metadata=parsed["metadata"])]

# --- Step 3: Chunk helper ---
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.schema import Document

def adaptive_chunk_documents(docs: list[Document], model: str = 'text-embedding-3-small') -> list[Document]:
    """Take a list of Documents, split adaptively, return list of Documents."""
    out_docs = []
    import tiktoken
    enc = tiktoken.encoding_for_model(model)

    for doc in docs:
        text = doc.page_content
        token_count = len(enc.encode(text))

        if token_count < 500:
            out_docs.append(doc)  # keep whole
        elif token_count < 1500:
            splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
                model_name=model, chunk_size=500, chunk_overlap=80
            )
            out_docs.extend(splitter.split_documents([doc]))
        else:
            splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
                model_name=model, chunk_size=800, chunk_overlap=100
            )
            out_docs.extend(splitter.split_documents([doc]))

    return out_docs


def _embedding_dim(embeddings) -> int:
    return len(embeddings.embed_query('dim?'))

def _ensure_collection(client: QdrantClient, name: str, embeddings) -> None:
    if not client.collection_exists(name):
        dim = _embedding_dim(embeddings)
        client.create_collection(
            collection_name=name,
            vectors_config=VectorParams(size=dim, distance=Distance.COSINE),
        )

# --- Step 4: Process files ONE AT A TIME ---
def embed_directory_one_file_at_a_time(root_dir: str, collection_name: str) -> None:
    txt_paths = find_txt_files(root_dir)
    if not txt_paths:
        print(f'No .txt files found under {root_dir}')
        return

    embeddings = OpenAIEmbeddings(model='text-embedding-3-small')
    client = QdrantClient(host='localhost', port=6333)
    _ensure_collection(client, collection_name, embeddings)
    vs = Qdrant(client=client, collection_name=collection_name, embeddings=embeddings)

    print(f'Indexing {len(txt_paths)} files from {root_dir} …')
    for path in tqdm(txt_paths, desc='Indexing files'):
        docs = load_one_document(path, root_dir)
        chunks = adaptive_chunk_documents(docs)
        vs.add_documents(chunks)
        print(f'  Added {len(chunks)} chunks from {path.relative_to(root_dir)}')


if __name__ == '__main__':
    embed_directory_one_file_at_a_time(ROOT_DIR, COLLECTION_NAME)


Indexing 10 files from ./amatol-test …


Indexing files:  10%|█         | 1/10 [00:01<00:15,  1.69s/it]

  Added 1 chunks from books/iron_age/p483.txt


Indexing files:  20%|██        | 2/10 [00:02<00:09,  1.23s/it]

  Added 4 chunks from books/amatol_book/p014-018.txt


Indexing files:  30%|███       | 3/10 [00:03<00:07,  1.09s/it]

  Added 2 chunks from books/amatol_book/p225.txt


Indexing files:  40%|████      | 4/10 [00:04<00:06,  1.01s/it]

  Added 3 chunks from books/amatol_book/p135-136.txt


Indexing files:  50%|█████     | 5/10 [00:05<00:04,  1.13it/s]

  Added 3 chunks from books/amatol_book/p025-027.txt


Indexing files:  60%|██████    | 6/10 [00:05<00:03,  1.20it/s]

  Added 3 chunks from books/amatol_book/p061-062.txt


Indexing files:  70%|███████   | 7/10 [00:06<00:01,  1.51it/s]

  Added 3 chunks from books/amatol_book/p253-254.txt


Indexing files:  80%|████████  | 8/10 [00:06<00:01,  1.87it/s]

  Added 2 chunks from books/amatol_book/p007.txt


Indexing files:  90%|█████████ | 9/10 [00:06<00:00,  2.00it/s]

  Added 1 chunks from books/amatol_book/p179.txt


Indexing files: 100%|██████████| 10/10 [00:07<00:00,  1.41it/s]

  Added 3 chunks from books/amatol_book/p181-182.txt



