In [None]:
import os, json
from glob import glob
from langchain_community.document_loaders import TextLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# --- Paths: where the raw TXT corpus lives and where the index is persisted ---
DATA_DIR = "./../../final_train/"
VECTOR_DB = "/app/vector_stores/bhavana_db/BAAI_db"

# Ensure the persistence directory exists before the store is opened on it.
os.makedirs(VECTOR_DB, exist_ok=True)

# --- Sentence-embedding model used to vectorise every chunk ---
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en")

# --- Chroma collection persisted under VECTOR_DB ---
vector_store = Chroma(
    persist_directory=VECTOR_DB,
    embedding_function=embeddings,
    collection_name="bhavana_collection",
)

# --- Chunker: chunk_size 512 with an overlap of 50; add_start_index asks the
# splitter to record where each chunk starts in its source text ---
splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    chunk_overlap=50,
    chunk_size=512,
)

# === Loader for partial TXT ===
def load_partial_txt(path, max_chars=1000):
    """Build a single Document from the beginning of a TXT file.

    The file is opened as UTF-8 text and at most ``max_chars`` characters are
    read from its start. The result is a one-element list so it can be handed
    directly to a document splitter; the file's base name is stored under the
    ``"source"`` metadata key.
    """
    with open(path, mode="r", encoding="utf-8") as handle:
        head = handle.read(max_chars)  # read stops after max_chars characters

    source_name = os.path.basename(path)
    document = Document(page_content=head, metadata={"source": source_name})
    return [document]

# === Dispatcher (TXT only) ===
def process_and_add_txt(file_paths, max_chars=1000):
    """Chunk the head of each TXT file and index the chunks in the vector store.

    For every path, the first ``max_chars`` characters are loaded with
    ``load_partial_txt``, split with the module-level ``splitter`` and added
    to the module-level ``vector_store``. A progress line is printed per file.

    Files that cannot be opened or decoded, and files that produce no chunks
    (for example empty files), are reported and skipped so that a single bad
    file does not abort the whole batch.

    Args:
        file_paths: Iterable of paths to TXT files.
        max_chars: Maximum number of characters to read from each file.
    """
    for path in file_paths:
        name = os.path.basename(path)

        try:
            docs = load_partial_txt(path, max_chars=max_chars)
        except (OSError, UnicodeDecodeError) as err:
            # Unreadable or non-UTF-8 file: report it and keep going.
            print(f"⚠️ Skipping {name}: could not read file ({err})\n")
            continue

        # split
        split_docs = splitter.split_documents(docs)

        # just count, no chunk previews
        print(f"📦 {name} → {len(split_docs)} chunks")

        if not split_docs:
            # Nothing to embed. NOTE(review): the vector store is expected to
            # reject an empty batch, so it is not called at all in this case.
            print(f"⚠️ Skipping {name}: no chunks to index\n")
            continue

        # add to vector DB
        vector_store.add_documents(split_docs)
        print(f"✅ Indexed {len(split_docs)} chunks from {name}\n")


# === Gather every .txt file under DATA_DIR, at any depth ===
txt_pattern = os.path.join(DATA_DIR, "**", "*.txt")
txt_files = glob(txt_pattern, recursive=True)

print(f"🔎 Found {len(txt_files)} TXT files to process")

# === Index the leading characters of each file ===
# max_chars caps how much of every file is read; adjust it as needed.
process_and_add_txt(txt_files, max_chars=2000)

print("🎉 TXT files processed & vector DB stored at:", VECTOR_DB)


🔎 Found 97958 TXT files to process
📦 19393327__WQ__E-2008-4260__EN.txt → 1 chunks
✅ Indexed 1 chunks from 19393327__WQ__E-2008-4260__EN.txt

📦 22860129__WQA__E-2009-3395__EN.txt → 2 chunks
✅ Indexed 2 chunks from 22860129__WQA__E-2009-3395__EN.txt

📦 14701632__QT__H-2007-0537__EN.txt → 1 chunks
✅ Indexed 1 chunks from 14701632__QT__H-2007-0537__EN.txt

📦 23299635__WQ__E-2009-4626__EN.txt → 1 chunks
✅ Indexed 1 chunks from 23299635__WQ__E-2009-4626__EN.txt

📦 15616943__WQ__E-2007-2963__EN.txt → 4 chunks
✅ Indexed 4 chunks from 15616943__WQ__E-2007-2963__EN.txt

📦 18305684__IM-PRESS__20080423-IPR-27459__EN_8ae3dc.txt → 5 chunks
✅ Indexed 5 chunks from 18305684__IM-PRESS__20080423-IPR-27459__EN_8ae3dc.txt

📦 30733054__WQA__E-2011-002789__EN.txt → 4 chunks
✅ Indexed 4 chunks from 30733054__WQA__E-2011-002789__EN.txt

📦 32901198__WQ__E-2011-011977__EN.txt → 3 chunks
✅ Indexed 3 chunks from 32901198__WQ__E-2011-011977__EN.txt

📦 29875515__WQ__E-2011-003389__EN.txt → 2 chunks
✅ Indexed 2 chun