In [2]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI API key used by the
# embedding calls in later cells comes from — confirm.
load_dotenv()

True

In [19]:
import os
from pathlib import Path
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Qdrant
from qdrant_client import QdrantClient
from tqdm import tqdm
from qdrant_client.models import VectorParams, Distance

from text_parsers.unified_parser import parse_file
from db import *

# --- Settings ---
COLLECTION_NAME = 'amatol_docs'  # Qdrant collection that chunks are written to and read from
ROOT_DIR = './amatol-test'  # Directory scanned recursively for .txt files; modify as needed
BATCH_SIZE = 12  # Staged-chunk count that triggers a flush to Qdrant; tune for performance

# --- Step 1: Recursively find all .txt files ---
def find_txt_files(root_dir: str) -> list[Path]:
    """Return every ``.txt`` file found anywhere beneath ``root_dir``.

    Args:
        root_dir: Directory to search recursively.

    Returns:
        A sorted list of paths, each prefixed with ``root_dir``. Only regular
        files are returned; a directory whose name happens to end in ``.txt``
        is skipped so it is never handed to the file parser.
    """
    # rglob yields entries in filesystem order, which varies between machines;
    # sorting makes repeated runs process files in the same order.
    return sorted(p for p in Path(root_dir).rglob('*.txt') if p.is_file())

# --- Step 2: Load ONE file and attach metadata ---
from langchain.schema import Document

def load_one_document(path: Path, root_dir: str):
    """Parse a single file and wrap the result in a one-element Document list.

    Args:
        path: File to parse; it is passed to ``parse_file`` as a string.
        root_dir: Accepted for interface compatibility; not used here.

    Returns:
        A list holding one ``Document`` built from the parser's
        ``"page_content"`` and ``"metadata"`` entries.
    """
    parsed_output = parse_file(str(path))  # unified output
    content = parsed_output["page_content"]
    meta = parsed_output["metadata"]
    document = Document(page_content=content, metadata=meta)
    return [document]

# --- Step 3: Chunk helper ---
def adaptive_chunk_documents(docs: list[Document], model: str = 'text-embedding-3-small') -> list[Document]:
    """Split documents into chunks whose size depends on each document's length.

    Token counts are measured with the tiktoken encoding for ``model``:

    * fewer than 500 tokens   -> the document is kept whole;
    * fewer than 1500 tokens  -> split into 500-token chunks, 80-token overlap;
    * otherwise               -> split into 800-token chunks, 100-token overlap.

    Args:
        docs: Documents to chunk.
        model: Model name used both to count tokens and to configure the
            token-based splitters.

    Returns:
        A new list of Documents in input order; short documents appear as the
        original objects, longer ones are replaced by their chunks.
    """
    import tiktoken

    SMALL_DOC_TOKENS = 500     # below this a document is not split at all
    MEDIUM_DOC_TOKENS = 1500   # below this the finer-grained splitter is used
    MEDIUM_SPLIT = (500, 80)   # (chunk_size, chunk_overlap) for medium documents
    LARGE_SPLIT = (800, 100)   # (chunk_size, chunk_overlap) for large documents

    enc = tiktoken.encoding_for_model(model)

    # Splitters are identical for every document in a size tier, so each one is
    # built at most once (and only if some document actually needs it) instead
    # of being reconstructed inside the loop.
    splitters = {}

    def _splitter_for(config):
        if config not in splitters:
            chunk_size, chunk_overlap = config
            splitters[config] = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
                model_name=model, chunk_size=chunk_size, chunk_overlap=chunk_overlap
            )
        return splitters[config]

    out_docs = []
    for doc in docs:
        token_count = len(enc.encode(doc.page_content))

        if token_count < SMALL_DOC_TOKENS:
            out_docs.append(doc)  # keep whole
            continue

        config = MEDIUM_SPLIT if token_count < MEDIUM_DOC_TOKENS else LARGE_SPLIT
        out_docs.extend(_splitter_for(config).split_documents([doc]))

    return out_docs


def _embedding_dim(embeddings) -> int:
    """Return the length of the vector the embedding object produces.

    The size is discovered by embedding a short probe string with
    ``embeddings.embed_query`` and measuring the result.
    """
    probe_vector = embeddings.embed_query('dim?')
    return len(probe_vector)

def _ensure_collection(client: QdrantClient, name: str, embeddings) -> None:
    """Create the Qdrant collection ``name`` if the client reports it missing.

    An existing collection is left untouched. A new one is created with
    cosine distance and a vector size obtained from ``_embedding_dim``.
    """
    if client.collection_exists(name):
        return

    vector_config = VectorParams(
        size=_embedding_dim(embeddings),
        distance=Distance.COSINE,
    )
    client.create_collection(collection_name=name, vectors_config=vector_config)

# --- Step 4: Batching Helpers ---
def flush_batch(buffer, vs, con):
    """Write the staged chunks to the vector store, commit, and empty the buffer.

    Does nothing when ``buffer`` is empty. Otherwise the chunks are handed to
    ``vs.add_documents``, ``con.commit()`` is called, a summary line is
    printed, and ``buffer`` is cleared in place so the caller can keep
    reusing the same list.
    """
    if buffer:
        vs.add_documents(buffer)
        con.commit()
        flushed = len(buffer)
        print(f'  Flushed {flushed} chunks → Qdrant')
        buffer.clear()

# --- Step 5: Main Ingestion ---
def embed_directory_batched(root_dir: str, collection_name: str, batch_size: int = BATCH_SIZE) -> None:
    """Index every new ``.txt`` file under ``root_dir`` into Qdrant in batches.

    For each file: compute its hash, skip it if the tracking database already
    knows that hash, otherwise parse it, chunk it, tag every chunk with the
    hash as ``metadata['doc_id']``, record a document row, and stage the
    chunks. Staged chunks are flushed (embedded, written to Qdrant, database
    committed) whenever at least ``batch_size`` chunks are waiting, and once
    more after the last file.

    Args:
        root_dir: Directory scanned recursively for ``.txt`` files.
        collection_name: Qdrant collection to write to; created if missing.
        batch_size: Number of staged chunks that triggers a flush.

    Raises:
        Any exception from parsing, embedding, Qdrant or the database is
        re-raised after the pending database transaction has been rolled back.
    """
    txt_paths = find_txt_files(root_dir)
    if not txt_paths:
        print(f'No .txt files found under {root_dir}')
        return

    # Initialize DB
    con = ensure_db()

    # Initialize embeddings + vectorstore
    embeddings = OpenAIEmbeddings(model='text-embedding-3-small')
    client = QdrantClient(host='localhost', port=6333)
    _ensure_collection(client, collection_name, embeddings)
    vs = Qdrant(client=client, collection_name=collection_name, embeddings=embeddings)

    staging_buffer = []  # holds chunks before flushing

    print(f'Indexing {len(txt_paths)} files from {root_dir} …')
    try:
        for path in tqdm(txt_paths, desc='Indexing files'):
            # Step 1: hash check
            h = file_sha256(path)
            if document_exists(con, h):
                print(f'  SKIP (already indexed): {path.relative_to(root_dir)}')
                continue

            # Step 2: parse the file
            parsed = parse_file(str(path))

            # Step 3: chunk and attach doc_id
            docs = [Document(page_content=parsed['page_content'], metadata=parsed['metadata'])]
            chunks = adaptive_chunk_documents(docs)
            for ch in chunks:
                ch.metadata['doc_id'] = h

            # The row is inserted now but only committed by flush_batch, after
            # the chunks have been written to Qdrant.
            insert_document(con, path, parsed, h, len(chunks))

            # Step 4: stage chunks
            staging_buffer.extend(chunks)
            print(f'  Staged {len(chunks)} chunks from {path.relative_to(root_dir)}')

            # Step 5: flush if buffer full
            if len(staging_buffer) >= batch_size:
                flush_batch(staging_buffer, vs, con)

        # Final flush
        flush_batch(staging_buffer, vs, con)
    except BaseException:
        # Includes KeyboardInterrupt (kernel interrupt). Discard document rows
        # whose chunks never reached Qdrant; otherwise a later commit on this
        # connection would mark those files as indexed and every future run
        # would skip them.
        # NOTE(review): assumes `con` is a DB-API connection (sqlite3) with
        # rollback(), and that insert_document does not commit on its own —
        # confirm against the db module.
        con.rollback()
        raise


# Run the ingestion with the module-level settings. Inside a notebook cell
# __name__ is '__main__', so this executes every time the cell is run.
if __name__ == '__main__':
    embed_directory_batched(ROOT_DIR, COLLECTION_NAME, BATCH_SIZE)


Indexing 3 files from ./amatol-test …


Indexing files: 100%|██████████| 3/3 [00:00<00:00, 38.97it/s]

  Staged 1 chunks from newspapers/1918/1918-12-25__philadelphia-inquirer__p2__first-wedding.txt
  Staged 1 chunks from newspapers/1918/1918-06-05__philadelphia-inquirer__p13__new-train-service-and-population-size.txt
  SKIP (already indexed): books/amatol_book/p007__introduction.txt





  Flushed 2 chunks → Qdrant


In [9]:
# Connect to the Qdrant server on localhost:6333 (the same host/port the ingestion code uses).
client = QdrantClient(host="localhost", port=6333)

def view_vector_store(client, collection_name, limit=10):
    """Print the id and payload of the first points in a Qdrant collection.

    Args:
        client: Qdrant client whose ``scroll`` method is used to read points.
        collection_name: Collection to inspect. This argument is what gets
            queried, not the module-level ``COLLECTION_NAME``.
        limit: Maximum number of points to fetch and print (default 10).

    Returns:
        None. For each point, one line with its id and payload keys is
        printed, followed by the full payload.
    """
    # scroll returns the page of points together with a second value that is
    # not needed here.
    points, _ = client.scroll(
        collection_name=collection_name,
        limit=limit,
    )
    for pt in points:
        # A point stored without a payload may carry None; avoid crashing on .keys().
        payload = pt.payload or {}
        print(pt.id, payload.keys())
        print(pt.payload)

# Inspect what is currently stored in the collection.
view_vector_store(client, COLLECTION_NAME)

410f32bd-7da1-41a1-94a0-17de09cf0c2e dict_keys(['page_content', 'metadata'])
{'page_content': 'Introduction\n\nINTRODUCTION\nBY JAMES BLAINE WALKER\nSecretary Public Service Commission, 1st Dist. New York; Author of "Fifty Years of Rapid Transit," etc.\n\nFOR MANY YEARS travelers on the Pennsylvania Railroad, between Philadelphia and Atlantic City, have viewed from the car windows apparently interminable stretches of flat, sandy country covered with a thick growth of scrub oak, small pine trees and a dense undergrowth of brush. Many have speculated as to what practical use this great stretch of land, useless in its present state for agricultural purposes and yet most advantageously located, could be devoted. Quite unexpectedly the necessities of war brought a large portion of it into practical use. The Atlantic Loading Company, a corporation formed in New York City, entered into a contract with the Government to build and operate a shell loading plant as the agent of the Government. Th

In [16]:
from qdrant_client.models import Filter, FieldCondition, MatchValue

def delete_document_from_store(con, client, collection_name: str, doc_id: str) -> None:
    """Remove every trace of one document, identified by ``doc_id``.

    The Qdrant points whose payload field ``metadata.doc_id`` equals
    ``str(doc_id)`` are deleted first (waiting for completion), then
    ``delete_document(con, doc_id)`` removes the database record, and a
    confirmation line is printed.
    """
    # Qdrant side: select points by the doc_id nested under "metadata".
    doc_id_condition = FieldCondition(
        key="metadata.doc_id",  # always nested
        match=MatchValue(value=str(doc_id)),
    )
    selector = Filter(must=[doc_id_condition])
    client.delete(
        collection_name=collection_name,
        wait=True,
        points_selector=selector,
    )

    # Database side.
    delete_document(con, doc_id)
    print(f"Deleted document {doc_id} from DB and Qdrant.")

# Open the tracking database and a Qdrant connection for the deletion below.
con = ensure_db()
# embeddings = OpenAIEmbeddings(model='text-embedding-3-small')
client = QdrantClient(host="localhost", port=6333)

# Example: delete using the hash.
# Ingestion tags each chunk with file_sha256(path) as its doc_id, so hashing
# the same file again gives the id to delete.
path = Path("./amatol-test/newspapers/1918/1918-06-05__philadelphia-inquirer__p13__new-train-service-and-population-size.txt")
h = file_sha256(path)
delete_document_from_store(con, client, COLLECTION_NAME, h)

Deleted document dcb25bd09c839a3073c91933c4db673487498843981dc9e3dbf5b12d1f9aa535 from DB and Qdrant.


In [21]:
# Show the collection's current contents again.
view_vector_store(client, COLLECTION_NAME)

In [22]:
# NOTE(review): list_all_documents is not defined in this notebook (presumably
# provided by `from db import *`); it is called on a connection from ensure_db().
list_all_documents(ensure_db())

[]