In [1]:
import os
from dotenv import load_dotenv

load_dotenv()

True

In [3]:
import os
from pathlib import Path
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Qdrant
from qdrant_client import QdrantClient
from tqdm import tqdm
from qdrant_client.models import VectorParams, Distance

from text_parsers.unified_parser import parse_file
from db import *

# --- Settings ---
PROJECT_NAME = 'amatol_project'          # project-specific folder
COLLECTION_NAME = 'amatol_docs'
ROOT_DIR = './amatol-test'               # source documents
BATCH_SIZE = 12                          # tune for performance

def get_qdrant_client():
    project_dir = Path("projects") / PROJECT_NAME
    qdrant_path = project_dir / "qdrant"
    qdrant_path.mkdir(parents=True, exist_ok=True)
    return QdrantClient(path=str(qdrant_path))


In [2]:
import os
from pathlib import Path
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Qdrant
from qdrant_client import QdrantClient
from tqdm import tqdm
from qdrant_client.models import VectorParams, Distance

from text_parsers.unified_parser import parse_file
from db import *

# --- Settings ---
PROJECT_NAME = 'amatol_project'          # project-specific folder
COLLECTION_NAME = 'amatol_docs'
ROOT_DIR = './amatol-test'               # source documents
BATCH_SIZE = 12                          # tune for performance

# --- Step 1: Recursively find all .txt files ---
from langchain.schema import Document

def find_txt_files(root_dir: str) -> list[Path]:
    return [p for p in Path(root_dir).rglob('*.txt')]

# --- Step 2: Load ONE file and attach metadata ---
def load_one_document(path: Path, root_dir: str):
    parsed = parse_file(str(path))  # unified output
    return [Document(page_content=parsed["page_content"], metadata=parsed["metadata"])]

# --- Step 3: Chunk helper ---
def adaptive_chunk_documents(docs: list[Document], model: str = 'text-embedding-3-small') -> list[Document]:
    """Take a list of Documents, split adaptively, return list of Documents."""
    out_docs = []
    import tiktoken
    enc = tiktoken.encoding_for_model(model)

    for doc in docs:
        text = doc.page_content
        token_count = len(enc.encode(text))

        if token_count < 500:
            out_docs.append(doc)  # keep whole
        elif token_count < 1500:
            splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
                model_name=model, chunk_size=500, chunk_overlap=80
            )
            out_docs.extend(splitter.split_documents([doc]))
        else:
            splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
                model_name=model, chunk_size=800, chunk_overlap=100
            )
            out_docs.extend(splitter.split_documents([doc]))

    return out_docs


def _embedding_dim(embeddings) -> int:
    return len(embeddings.embed_query('dim?'))

def _ensure_collection(client: QdrantClient, name: str, embeddings) -> None:
    if not client.collection_exists(name):
        dim = _embedding_dim(embeddings)
        client.create_collection(
            collection_name=name,
            vectors_config=VectorParams(size=dim, distance=Distance.COSINE),
        )

# --- Step 4: Batching Helpers ---
def flush_batch(buffer, vs, con):
    """Embed and insert a batch of documents into Qdrant + commit SQLite."""
    if not buffer:
        return

    vs.add_documents(buffer)
    con.commit()
    print(f'  Flushed {len(buffer)} chunks → Qdrant')
    buffer.clear()

# --- Step 5: Main Ingestion ---
def embed_directory_batched(root_dir: str, collection_name: str, batch_size: int = BATCH_SIZE) -> None:
    txt_paths = find_txt_files(root_dir)
    if not txt_paths:
        print(f'No .txt files found under {root_dir}')
        return

    # Initialize DB
    con = ensure_db()

    # Initialize embeddings + vectorstore
    embeddings = OpenAIEmbeddings(model='text-embedding-3-small')

    # --- Embedded Qdrant instead of Docker ---
    project_dir = Path("projects") / PROJECT_NAME
    qdrant_path = project_dir / "qdrant"
    qdrant_path.mkdir(parents=True, exist_ok=True)

    client = QdrantClient(path=str(qdrant_path))

    _ensure_collection(client, collection_name, embeddings)
    vs = Qdrant(client=client, collection_name=collection_name, embeddings=embeddings)

    staging_buffer = []  # holds chunks before flushing

    print(f'Indexing {len(txt_paths)} files from {root_dir} …')
    for path in tqdm(txt_paths, desc='Indexing files'):
        # Step 1: hash check
        h = file_sha256(path)
        if document_exists(con, h):
            print(f'  SKIP (already indexed): {path.relative_to(root_dir)}')
            continue

        # Step 2: parse and insert doc row
        parsed = parse_file(str(path))

        # Step 3: chunk and attach doc_id
        docs = [Document(page_content=parsed['page_content'], metadata=parsed['metadata'])]
        chunks = adaptive_chunk_documents(docs)
        for ch in chunks:
            ch.metadata['doc_id'] = h

        insert_document(con, path, parsed, h, len(chunks))

        # Step 4: stage chunks
        staging_buffer.extend(chunks)
        print(f'  Staged {len(chunks)} chunks from {path.relative_to(root_dir)}')

        # Step 5: flush if buffer full
        if len(staging_buffer) >= batch_size:
            flush_batch(staging_buffer, vs, con)

    # Final flush
    flush_batch(staging_buffer, vs, con)
    client.close()   # releases the file lock




In [3]:
if __name__ == '__main__':
    embed_directory_batched(ROOT_DIR, COLLECTION_NAME, BATCH_SIZE)


  embeddings = OpenAIEmbeddings(model='text-embedding-3-small')
  vs = Qdrant(client=client, collection_name=collection_name, embeddings=embeddings)


Indexing 3 files from ./amatol-test …


Indexing files: 100%|██████████| 3/3 [00:00<00:00, 480.83it/s]


  Staged 1 chunks from reports/new-jersey-department-of-health-report-on-amatol-population-1918-1920.txt
  Staged 1 chunks from newspapers/1918/1918-12-25__philadelphia-inquirer__p2__first-wedding.txt
  Staged 1 chunks from newspapers/1918/1918-06-05__philadelphia-inquirer__p13__new-train-service-and-population-size.txt
  Flushed 3 chunks → Qdrant


In [6]:
# project_dir = Path("projects") / PROJECT_NAME
# qdrant_path = project_dir / "qdrant"
# qdrant_path.mkdir(parents=True, exist_ok=True)


def view_vector_store(collection_name):
    client = get_qdrant_client()
    points, _ = client.scroll(
        collection_name=COLLECTION_NAME,
        limit=20,
    )
    for pt in points:
        print(pt.id, pt.payload.keys())
        print(pt.payload)
    
    client.close()

view_vector_store(COLLECTION_NAME)

14c475bbf7a84b65af99bd85bbc17b7f dict_keys(['page_content', 'metadata'])
{'page_content': 'Philadelphia Actress Marries\n1918-12-24\nAnnouncement was made today of the marriage of Miss Agnes Belcher, an actress of Philadelphia, and Peter James Reilly, of New York City, at Amatol. The wedding was the first solemnized at the town which was created by the large munition plants in the locality.\n\nThe groom, who was employed as a guard, recently fell heir to a large fortune by the will of a deceased uncle in the West. Rev. Thomas M. Sparks, of Hammonton, performed the ceremony yesterday afternoon.', 'metadata': {'source_type': 'newspaper', 'source_id': 'philadelphia-inquirer', 'source_name': 'Philadelphia Inquirer', 'date': '1918-12-25', 'page': 'p2', 'title': 'Philadelphia Actress Marries', 'subtitles': [], 'attribution': 'Special to The Inquirer.', 'city_date': 'ATLANTIC CITY, N. J., Dec. 24.', 'file_path': 'amatol-test/newspapers/1918/1918-12-25__philadelphia-inquirer__p2__first-wedding

In [5]:
from qdrant_client.models import Filter, FieldCondition, MatchValue

def delete_document_from_store(con, client, collection_name: str, doc_id: str) -> None:
    """Delete a document from SQLite and Qdrant using its doc_id."""
    # 1. Delete from Qdrant
    client.delete(
        collection_name=collection_name,
        wait=True,
        points_selector=Filter(
            must=[
                FieldCondition(
                    key="metadata.doc_id",  # always nested
                    match=MatchValue(value=str(doc_id))
                )
            ]
        ),
    )

    # 2. Delete from SQLite
    delete_document(con, doc_id)
    print(f"Deleted document {doc_id} from DB and Qdrant.")



In [7]:
con = ensure_db()
# embeddings = OpenAIEmbeddings(model='text-embedding-3-small')
client = QdrantClient(path=str(qdrant_path))

# Example: delete using the hash
path = Path("./amatol-test/newspapers/1918/1918-06-05__philadelphia-inquirer__p13__new-train-service-and-population-size.txt")
h = file_sha256(path)
delete_document_from_store(con, client, COLLECTION_NAME, h)
client.close()

Deleted document dcb25bd09c839a3073c91933c4db673487498843981dc9e3dbf5b12d1f9aa535 from DB and Qdrant.


In [22]:
view_vector_store(client, COLLECTION_NAME)

0f8b656183764b2f9b688a428fe23a5b dict_keys(['page_content', 'metadata'])
{'page_content': 'COL. HAWKINS. Six thousand acres which they had bought from a land company; we selected the site. They had two or three sites, and we approved the site after two or three other sites had been taken up and considered. We originally expected to build it near Camp Dix, and the commanding officer of Camp Dix told us he did not want it near his soldiers, so we moved.\nMR. GRAHAM. Did they do the building then?\nCOL. HAWKINS. Yes, sir.\nMR. GRAHAM. Had a contract for building it?\nCOL. HAWKINS. That was part of the main contract they had, the construction and operation of a loading plant as the agent of the United States.\nMR. GRAHAM. At the time you made the selection did they at that time own this 6,000 acres?\nCOL. HAWKINS. No, sir; they bought it from some land company.\nMR. GRAHAM. They had it at that time leased at $8 an acre, didn’t they? Well, on July 17, 1918, according to your book, Ordnance—

In [24]:
client.close()

In [6]:
from db import *

con = ensure_db()
list_all_documents(con)

[(3,
  'amatol-test/newspapers/1918/1918-06-05__philadelphia-inquirer__p13__new-train-service-and-population-size.txt',
  'Philadelphia Inquirer, 1918-06-05, p13, "Accommodations For Loaders"',
  'newspaper',
  'philadelphia-inquirer',
  '1918-06-05',
  1,
  'dcb25bd09c839a3073c91933c4db673487498843981dc9e3dbf5b12d1f9aa535',
  '2025-08-24T20:50:28.941904'),
 (2,
  'amatol-test/newspapers/1918/1918-12-25__philadelphia-inquirer__p2__first-wedding.txt',
  'Philadelphia Inquirer, 1918-12-25, p2, "Philadelphia Actress Marries"',
  'newspaper',
  'philadelphia-inquirer',
  '1918-12-25',
  1,
  '7fe21d906be1dc9daa423c98bad7e4d1781ab59e4230da3b752b497643512a20',
  '2025-08-24T20:50:28.940408'),
 (1,
  'amatol-test/reports/new-jersey-department-of-health-report-on-amatol-population-1918-1920.txt',
  'Forty-Third Annual Report of the Department of Health of the State of New Jersey, 1920, p. 175, "Mullica River Section"',
  'report',
  'new-jersey-department-of-health-report-on-amatol-population-