In [1]:
import os
from dotenv import load_dotenv

# Load variables from a .env file into the process environment before any client is created.
# NOTE(review): presumably this supplies the OpenAI credentials that OpenAIEmbeddings
# relies on (no key is passed explicitly anywhere in this notebook) — confirm.
load_dotenv()

True

In [8]:
import os
from pathlib import Path
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Qdrant
from qdrant_client import QdrantClient
from tqdm import tqdm
from qdrant_client.models import VectorParams, Distance

# --- Configuration ---
# Directory that is searched recursively for .txt files to index; adjust as needed.
ROOT_DIR = './data-3'
# Name of the Qdrant collection the chunks are written to.
COLLECTION_NAME = 'amatol_docs'

# --- Step 1: Recursively find all .txt files ---
def find_txt_files(root_dir: str) -> list[Path]:
    """Return every regular ``.txt`` file below ``root_dir``, in sorted order.

    Args:
        root_dir: Directory to search recursively.

    Returns:
        Sorted list of paths (each prefixed with ``root_dir``) whose name matches
        ``*.txt``. Directories that happen to match the pattern are excluded,
        because they cannot be loaded as text. An empty list is returned when
        nothing matches.
    """
    # Sorting makes the indexing order reproducible; rglob yields entries in
    # whatever order the filesystem reports them.
    return sorted(p for p in Path(root_dir).rglob('*.txt') if p.is_file())

# --- Step 2: Load ONE file and attach metadata ---
def load_one_document(path: Path, root_dir: str) -> list:
    """Load a single text file and tag each resulting document with its source.

    Args:
        path: File to read; it is decoded as UTF-8.
        root_dir: Directory that ``path`` is made relative to for the metadata.

    Returns:
        The documents produced by ``TextLoader.load()``, each with
        ``metadata['source']`` set to the path relative to ``root_dir``.
    """
    loaded = TextLoader(str(path), encoding='utf-8').load()
    for document in loaded:
        # Only the relative path is recorded here; the original note says a
        # deletion-friendly ID is meant to be added in a later step.
        document.metadata['source'] = str(path.relative_to(root_dir))
    return loaded

# --- Step 3: Chunk helper ---
def chunk_documents(docs: list, chunk_size: int = 550, chunk_overlap: int = 50) -> list:
    """Split documents into overlapping chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        docs: Documents to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``. Defaults to 550,
            the value that used to be hard-coded.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``. Defaults
            to 50, the value that used to be hard-coded.

    Returns:
        Whatever ``split_documents`` returns for ``docs``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(docs)

def _embedding_dim(embeddings) -> int:
    """Return the length of the vector ``embeddings`` produces for a probe query.

    The probe text ``'dim?'`` is embedded via ``embeddings.embed_query`` and the
    length of the result is returned.
    """
    probe_vector = embeddings.embed_query('dim?')
    return len(probe_vector)

def _ensure_collection(client: QdrantClient, name: str, embeddings) -> None:
    """Create the collection ``name`` unless the client reports that it exists.

    Args:
        client: Qdrant client used for the existence check and the creation.
        name: Collection name.
        embeddings: Embedding object; only consulted (via ``_embedding_dim``)
            when the collection has to be created, to size its vectors.
    """
    if client.collection_exists(name):
        return

    # New collections use cosine distance and the embedding model's vector size.
    vector_config = VectorParams(
        size=_embedding_dim(embeddings),
        distance=Distance.COSINE,
    )
    client.create_collection(collection_name=name, vectors_config=vector_config)

# --- Step 4: Process files ONE AT A TIME ---
def embed_directory_one_file_at_a_time(
    root_dir: str,
    collection_name: str,
    *,
    embedding_model: str = 'text-embedding-3-small',
    qdrant_host: str = 'localhost',
    qdrant_port: int = 6333,
) -> None:
    """Index every .txt file under ``root_dir`` into a Qdrant collection, file by file.

    Each file is loaded, chunked and uploaded before the next one is touched,
    so only one file's documents are held at a time.

    Args:
        root_dir: Directory searched recursively for ``.txt`` files.
        collection_name: Target Qdrant collection; created if it does not exist.
        embedding_model: Model name passed to ``OpenAIEmbeddings``.
        qdrant_host: Host passed to ``QdrantClient``.
        qdrant_port: Port passed to ``QdrantClient``.

    Returns:
        None. Progress is reported on stdout and through a tqdm progress bar.
        If no files are found, a message is printed and nothing else happens.

    NOTE(review): ``add_documents`` is called without explicit ids, so running
    this again over the same files presumably appends duplicate points —
    confirm against the vector store's behaviour.
    """
    txt_paths = find_txt_files(root_dir)
    if not txt_paths:
        print(f'No .txt files found under {root_dir}')
        return

    # Create shared clients once
    embeddings = OpenAIEmbeddings(model=embedding_model)
    client = QdrantClient(host=qdrant_host, port=qdrant_port)
    _ensure_collection(client, collection_name, embeddings)
    vs = Qdrant(client=client, collection_name=collection_name, embeddings=embeddings)

    print(f'Indexing {len(txt_paths)} files from {root_dir} …')
    for path in tqdm(txt_paths, desc='Indexing files'):
        # Load and chunk only THIS file.
        chunks = chunk_documents(load_one_document(path, root_dir))
        # Upload just these chunks; a file that yields no chunks has nothing to send.
        if chunks:
            vs.add_documents(chunks)
        # tqdm.write prints without breaking the active progress bar,
        # unlike a bare print() inside the loop.
        tqdm.write(f'  Added {len(chunks)} chunks from {path.relative_to(root_dir)}')

# Run the indexing with the configured settings when executed as the main module.
if __name__ == '__main__':
    embed_directory_one_file_at_a_time(
        root_dir=ROOT_DIR,
        collection_name=COLLECTION_NAME,
    )


Indexing 2 files from ./data-3 …


Indexing files:  50%|█████     | 1/2 [00:00<00:00,  1.71it/s]

  Added 4 chunks from 1918-11-12, 11, Philadelphia Inquirer, War is over.txt


Indexing files: 100%|██████████| 2/2 [00:00<00:00,  2.38it/s]

  Added 1 chunks from 1918-12-05, 4, Philadelphia Inquirer, operations halt.txt



