In [1]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the API key that the OpenAI embeddings
# used later in the notebook rely on — confirm against the .env file.
load_dotenv()

True

In [4]:
# journal_parser.py
import json
from pathlib import Path
from langchain.schema import Document

# --- Read metadata.json a single time, when this cell/module is first executed ---
METADATA = json.loads(
    Path("amatol/journals/metadata.json").read_text(encoding="utf-8")
)


def parse_journal_article(file_path: str) -> Document:
    """
    Build a LangChain Document from one journal article text file.

    The file's stem (its name without the .txt suffix) is looked up under the
    "journals" key of the module-level METADATA. The article text, stripped of
    leading/trailing whitespace, becomes the page content, and the bibliographic
    fields of the matching entry become the document metadata.

    Raises:
        ValueError: if the stem has no entry under METADATA["journals"].
        KeyError: if the entry lacks one of the expected bibliographic fields.
    """
    article_path = Path(file_path)
    stem = article_path.stem  # filename without its extension
    journals = METADATA.get("journals", {})

    # Guard clause: refuse files that metadata.json does not describe.
    if stem not in journals:
        raise ValueError(f"File {stem} not found in metadata.json")
    entry = journals[stem]

    body = article_path.read_text(encoding="utf-8").strip()

    # Fields copied verbatim from the metadata entry, in output order.
    copied_fields = ("journal", "volume", "issue", "season", "pages", "title", "citation")
    metadata = {"source_type": "journal"}
    for field in copied_fields:
        metadata[field] = entry[field]
    metadata["file_path"] = str(file_path)

    return Document(page_content=body, metadata=metadata)


# --- Demo run ---
if __name__ == "__main__":
    # Walk every .txt file below the journals folder and show what the parser yields.
    journals_dir = Path("amatol/journals")

    for txt_path in journals_dir.rglob("*.txt"):
        parsed = parse_journal_article(txt_path)
        preview = parsed.page_content[:200]  # first 200 characters only

        print("\n=== File:", txt_path, "===")
        print("Content preview:", preview, "…")
        print("Metadata:")
        for k, v in parsed.metadata.items():
            print(f"  {k}: {v}")



=== File: amatol/journals/2019-02-15__sojourn__p45-54__all-aboard-for-amatol-new-jersey.txt ===
Content preview: All Aboard for Amatol, NJ
As a result of America’s entry into World War I, Atlantic County received a great expansion of its industrial economic base. The largest result of this expansion was the cons …
Metadata:
  source_type: journal
  journal: SoJourn
  volume: 3
  issue: 2
  season: Winter 2018/19
  pages: 45-54
  title: All Aboard for Amatol, New Jersey
  citation: SoJourn 3.2, Winter 2018/19, pp. 45–54, "All Aboard for Amatol, New Jersey"
  file_path: amatol/journals/2019-02-15__sojourn__p45-54__all-aboard-for-amatol-new-jersey.txt


In [5]:
import os
from pathlib import Path
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Qdrant
from qdrant_client import QdrantClient
from tqdm import tqdm
from qdrant_client.models import VectorParams, Distance

# --- Settings ---
# Name of the Qdrant collection the chunks are written to (created on first run if absent).
COLLECTION_NAME = 'amatol_docs'
# Directory that is searched recursively for .txt files to index.
ROOT_DIR = './amatol-test'  # Modify this as needed

# --- Step 1: Recursively find all .txt files ---
def find_txt_files(root_dir: str) -> list[Path]:
    """Return every regular ``.txt`` file below ``root_dir``, in sorted order.

    Args:
        root_dir: Directory to search recursively.

    Returns:
        Sorted list of paths to files whose name ends in ``.txt``. Directories
        that happen to match the pattern are excluded, since they cannot be
        loaded as text. Sorting makes the processing order reproducible, as
        ``rglob`` itself yields entries in filesystem order.
    """
    return sorted(p for p in Path(root_dir).rglob('*.txt') if p.is_file())

# --- Step 2: Load ONE file and attach metadata ---
# from journal_parser import parse_journal_article

def load_one_document(path: Path, root_dir: str):
    """Load a single text file as a list of LangChain Documents.

    Files that live inside a directory named ``journals`` are parsed with
    ``parse_journal_article`` so they carry bibliographic metadata. Every other
    file is read with ``TextLoader`` and tagged with a ``source`` metadata
    entry holding its path relative to ``root_dir``.

    Args:
        path: Path of the file to load.
        root_dir: Root of the indexed tree, used to compute the relative source.

    Returns:
        List of Documents produced for this file.
    """
    # Match a directory component named exactly "journals". A substring test on
    # the whole path would also catch unrelated names such as
    # "journals-index.txt" or "old_journals/", sending them to the journal parser.
    parent_dirs = Path(path).parts[:-1]
    if "journals" in parent_dirs:
        return [parse_journal_article(path)]

    loader = TextLoader(str(path), encoding="utf-8")
    docs = loader.load()
    for doc in docs:
        doc.metadata["source"] = str(path.relative_to(root_dir))
    return docs

# --- Step 3: Chunk helper ---
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.schema import Document

def adaptive_chunk_documents(docs: list[Document], model: str = 'text-embedding-3-small') -> list[Document]:
    """Split documents into chunks whose size depends on each document's length.

    Length is measured in tokens of the tiktoken encoding for ``model``:

    * fewer than 500 tokens: the document is kept whole (the same object is returned);
    * fewer than 1500 tokens: split with chunk_size=500, chunk_overlap=80;
    * otherwise: split with chunk_size=800, chunk_overlap=100.

    Args:
        docs: Documents to chunk.
        model: Model name used both for token counting and by the splitters.

    Returns:
        Flat list of Documents, in input order, with long inputs replaced by
        their chunks.
    """
    import tiktoken  # local import: this cell does not import tiktoken at the top

    enc = tiktoken.encoding_for_model(model)

    # Splitters depend only on (chunk_size, chunk_overlap), so build each
    # configuration at most once per call instead of once per document.
    splitters = {}

    def _splitter_for(chunk_size: int, chunk_overlap: int):
        key = (chunk_size, chunk_overlap)
        if key not in splitters:
            splitters[key] = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
                model_name=model, chunk_size=chunk_size, chunk_overlap=chunk_overlap
            )
        return splitters[key]

    out_docs = []
    for doc in docs:
        token_count = len(enc.encode(doc.page_content))

        if token_count < 500:
            out_docs.append(doc)  # short enough to embed as a single chunk
        elif token_count < 1500:
            out_docs.extend(_splitter_for(500, 80).split_documents([doc]))
        else:
            out_docs.extend(_splitter_for(800, 100).split_documents([doc]))

    return out_docs


def _embedding_dim(embeddings) -> int:
    return len(embeddings.embed_query('dim?'))

def _ensure_collection(client: QdrantClient, name: str, embeddings) -> None:
    """Create the collection ``name`` on ``client`` unless it already exists.

    A new collection is configured for cosine distance, with the vector size
    obtained from ``_embedding_dim(embeddings)``. An existing collection is
    left untouched.
    """
    if client.collection_exists(name):
        return  # nothing to do

    vector_settings = VectorParams(
        size=_embedding_dim(embeddings),
        distance=Distance.COSINE,
    )
    client.create_collection(collection_name=name, vectors_config=vector_settings)

# --- Step 4: Process files ONE AT A TIME ---
def embed_directory_one_file_at_a_time(root_dir: str, collection_name: str) -> None:
    """Index every ``.txt`` file under ``root_dir`` into a Qdrant collection.

    Files are loaded, chunked and uploaded one at a time, so only one file's
    chunks are held in memory at once. The collection is created first if it
    does not exist yet.

    Args:
        root_dir: Directory searched recursively for ``.txt`` files.
        collection_name: Name of the Qdrant collection to write to.

    Returns:
        None. Progress is reported on standard output.
    """
    txt_paths = find_txt_files(root_dir)
    if not txt_paths:
        print(f'No .txt files found under {root_dir}')
        return

    # One model name for both embedding and token-based chunking, so the two
    # cannot drift apart.
    embedding_model = 'text-embedding-3-small'

    # Create shared clients once
    embeddings = OpenAIEmbeddings(model=embedding_model)
    client = QdrantClient(host='localhost', port=6333)
    _ensure_collection(client, collection_name, embeddings)
    vs = Qdrant(client=client, collection_name=collection_name, embeddings=embeddings)

    print(f'Indexing {len(txt_paths)} files from {root_dir} …')
    for path in tqdm(txt_paths, desc='Indexing files'):
        rel_path = path.relative_to(root_dir)
        # 1) load only THIS file
        docs = load_one_document(path, root_dir)
        # 2) chunk THIS file, dropping chunks that contain no text
        #    (an empty file would otherwise be sent to the embedder as a blank string)
        chunks = [
            chunk
            for chunk in adaptive_chunk_documents(docs, model=embedding_model)
            if chunk.page_content.strip()
        ]
        if not chunks:
            tqdm.write(f'  Skipped {rel_path}: no text to index')
            continue
        # 3) upload JUST these chunks
        vs.add_documents(chunks)
        # tqdm.write keeps the message from being interleaved with the progress bar
        tqdm.write(f'  Added {len(chunks)} chunks from {rel_path}')

if __name__ == "__main__":
    # Index the configured directory into the configured collection.
    embed_directory_one_file_at_a_time(
        root_dir=ROOT_DIR,
        collection_name=COLLECTION_NAME,
    )


Indexing 1 files from ./amatol-test …


Indexing files: 100%|██████████| 1/1 [00:01<00:00,  1.06s/it]

  Added 3 chunks from journals/2019-02-15__sojourn__p45-54__all-aboard-for-amatol-new-jersey.txt



