In [None]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the OpenAI API key used by the
# embedding client in the next cell — confirm against your .env.
load_dotenv()

In [None]:
import os
from pathlib import Path
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Qdrant
from qdrant_client import QdrantClient
from tqdm import tqdm
from qdrant_client.models import VectorParams, Distance

# --- Settings ---
# Name of the Qdrant collection that embed_and_store() writes to.
COLLECTION_NAME = "amatol_docs"
# Directory searched recursively for .txt files; it is also the base used to
# compute each document's relative "source" metadata path.
ROOT_DIR = "./data-test-2" # Modify this as needed

# --- Step 1: Recursively find all .txt files ---
def find_txt_files(root_dir: str) -> list[Path]:
    """Return every regular ``.txt`` file under ``root_dir``, sorted by path.

    Args:
        root_dir: Directory to search recursively.

    Returns:
        A sorted list of paths (each prefixed with ``root_dir``) whose name
        matches ``*.txt`` and which are files.
    """
    # rglob matches on name only, so a directory called "foo.txt" would be
    # returned as well; skip anything that is not a file so the loader never
    # receives a directory. Sorting makes the ingestion order reproducible
    # instead of depending on filesystem enumeration order.
    return sorted(p for p in Path(root_dir).rglob("*.txt") if p.is_file())

# --- Step 2: Load files and attach metadata ---
def load_documents(paths: list[Path], root_dir: str) -> list:
    """Read each file as UTF-8 text and tag the resulting documents.

    Every document produced by the loader gets its ``"source"`` metadata
    overwritten with the file's path relative to ``root_dir``.

    Args:
        paths: Files to load.
        root_dir: Base directory used to compute the relative source path.

    Returns:
        A flat list with the documents of all files, in the order of ``paths``.
    """
    collected = []
    for txt_path in tqdm(paths, desc="Loading files"):
        loaded = TextLoader(str(txt_path), encoding="utf-8").load()
        for document in loaded:
            # Store a root-relative path rather than the path given to the loader.
            document.metadata["source"] = str(txt_path.relative_to(root_dir))
        collected += loaded
    return collected

# --- Step 3: Split documents into overlapping chunks ---
def chunk_documents(docs: list, chunk_size: int = 550, chunk_overlap: int = 50) -> list:
    """Split documents into overlapping chunks.

    Args:
        docs: Documents to split.
        chunk_size: Value passed to the splitter as ``chunk_size``.
            Defaults to 550, the value this notebook has always used.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.
            Defaults to 50.

    Returns:
        The list of chunk documents produced by the splitter.
    """
    # The sizes are parameters (with the original values as defaults) so that
    # experiments with different chunking do not require editing this function.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(docs)


def _embedding_dim(embeddings) -> int:
    # cheap probe to get the dimension; for text-embedding-3-small it's 1536
    return len(embeddings.embed_query("dim?"))

def _ensure_collection(client: QdrantClient, name: str, embeddings) -> None:
    """Create the collection ``name`` if the client reports it is missing.

    When the collection already exists nothing is done. Otherwise the vector
    size is probed from ``embeddings`` and the collection is created with
    cosine distance.
    """
    if client.collection_exists(name):
        return

    vector_config = VectorParams(
        size=_embedding_dim(embeddings),
        distance=Distance.COSINE,
    )
    client.create_collection(collection_name=name, vectors_config=vector_config)

# --- Step 4: Embed and upload to Qdrant ---
def embed_and_store(chunks: list, collection_name: str):
    """Embed ``chunks`` with OpenAI and upsert them into a Qdrant collection.

    Connects to Qdrant on ``localhost:6333``, creates ``collection_name`` if
    it is missing, and writes the chunks using deterministic point ids so
    that re-running the notebook on the same files overwrites the existing
    points instead of appending duplicates.

    Args:
        chunks: Chunk documents (objects with ``page_content`` and
            ``metadata``) to embed and store.
        collection_name: Target Qdrant collection.

    Returns:
        None. A one-line summary is printed.
    """
    # Nothing to index: skip the embedding probe and the Qdrant round-trip
    # rather than creating an empty collection.
    if not chunks:
        print(f" No chunks to add to '{collection_name}'")
        return

    embeddings = OpenAIEmbeddings(model='text-embedding-3-small')
    client = QdrantClient(host="localhost", port=6333)

    # 1) Make sure the collection exists (create-once)
    _ensure_collection(client, collection_name, embeddings)

    # 2) Reuse the same wrapper; explicit ids turn the write into an
    #    idempotent upsert (without them every run gets fresh random ids).
    vs = Qdrant(client=client, collection_name=collection_name, embeddings=embeddings)
    vs.add_documents(chunks, ids=_stable_chunk_ids(chunks))

    print(f" Added {len(chunks)} chunks to '{collection_name}'")


def _stable_chunk_ids(chunks: list) -> list[str]:
    """Derive one deterministic UUID string per chunk.

    The id is a UUIDv5 of the chunk's ``"source"`` metadata, its position
    among the chunks sharing that source, and its text. Identical input
    therefore always maps to the same id, while the position keeps repeated
    text within one source from colliding.
    """
    import uuid  # local import: the notebook's import cell does not bring in uuid

    position_by_source: dict[str, int] = {}
    ids = []
    for chunk in chunks:
        source = str(chunk.metadata.get("source", ""))
        position = position_by_source.get(source, 0)
        position_by_source[source] = position + 1
        # NUL-separated so that field boundaries cannot be confused.
        key = "\x00".join((source, str(position), chunk.page_content))
        ids.append(str(uuid.uuid5(uuid.NAMESPACE_URL, key)))
    return ids

# Run the ingestion pipeline end to end: discover the .txt files under
# ROOT_DIR, load them, split them into chunks, then embed and store the chunks.
# NOTE: this executes whenever the cell runs; the last step constructs the
# OpenAI embedding client and talks to the Qdrant server on localhost:6333.
txt_paths = find_txt_files(ROOT_DIR)
raw_docs = load_documents(txt_paths, ROOT_DIR)
chunks = chunk_documents(raw_docs)
embed_and_store(chunks, COLLECTION_NAME)



In [None]:
# import uuid
# from langchain.embeddings import OpenAIEmbeddings
# from langchain.vectorstores import Qdrant
# from qdrant_client import QdrantClient
# from langchain.document_loaders import TextLoader
# from langchain.text_splitter import RecursiveCharacterTextSplitter

# COLLECTION_NAME = 'historical_docs'

# def ensure_metadata(docs, document_id: str, filename: str):
#     for i, d in enumerate(docs):
#         d.metadata.setdefault('document_id', document_id)
#         d.metadata.setdefault('filename', filename)
#         d.metadata.setdefault('chunk_index', i)
#     return docs

# def upsert_documents(chunks, collection_name=COLLECTION_NAME):
#     embeddings = OpenAIEmbeddings(model='text-embedding-3-small')
#     client = QdrantClient(host='localhost', port=6333)

#     # Create the collection on first run; otherwise append
#     if client.collection_exists(collection_name):
#         vs = Qdrant(client=client, collection_name=collection_name, embeddings=embeddings)
#         vs.add_documents(chunks)
#     else:
#         Qdrant.from_documents(
#             documents=chunks,
#             embedding=embeddings,
#             client=client,                 # using the existing client
#             collection_name=collection_name,
#         )

# # Example use per upload:
# # 1) load the raw doc -> split into chunks
# # 2) add per-doc metadata (document_id, filename)
# def index_one_file(path: Path, root_dir: str):
#     loader = TextLoader(str(path), encoding='utf-8')
#     docs = loader.load()
#     splitter = RecursiveCharacterTextSplitter(chunk_size=550, chunk_overlap=50)
#     chunks = splitter.split_documents(docs)

#     document_id = str(uuid.uuid4())                # random id for this upload (a new value on every run, not stable)
#     filename = str(path.relative_to(root_dir))
#     chunks = ensure_metadata(chunks, document_id, filename)

#     upsert_documents(chunks, collection_name=COLLECTION_NAME)
#     print(f' Added {len(chunks)} chunks from {filename} (document_id={document_id})')