# In [None]:
import os, json
from glob import glob
from tqdm.notebook import tqdm
from langchain_community.document_loaders import TextLoader, CSVLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# === Paths ===
# DATA_DIR is the corpus root that gets scanned for input files; VECTOR_DB is
# the directory handed to Chroma as its persistence location.
DATA_DIR = "./../final_train/"
VECTOR_DB = "/app/vector_stores/bhavana_db/BAAI_db"

# Make sure the persistence directory exists (no error if it already does).
os.makedirs(VECTOR_DB, exist_ok=True)

# === Embedding model ===
# Small BGE checkpoint: lightweight and fast, chosen for testing.
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en")

# === Vector store ===
# Chroma collection bound to the embedding model and persisted in VECTOR_DB.
vector_store = Chroma(
    persist_directory=VECTOR_DB,
    embedding_function=embeddings,
    collection_name="bhavana_collection",
)

# === Splitter ===
# Chunks of size 512 with an overlap of 50; add_start_index is switched on so
# the splitter is asked to track where each chunk starts.
splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    chunk_overlap=50,
    chunk_size=512,
)

# === Loaders ===
def load_txt(path):
    """Load a UTF-8 text file with ``TextLoader`` and return its documents."""
    text_loader = TextLoader(path, encoding="utf-8")
    return text_loader.load()

def load_tsv(path):
    """Load a UTF-8 tab-separated file with ``CSVLoader`` and return its documents."""
    # The csv reader is told to split on tabs instead of commas.
    tab_separated = {"delimiter": "\t"}
    tsv_loader = CSVLoader(path, encoding="utf-8", csv_args=tab_separated)
    return tsv_loader.load()

def load_custom_json(path):
    """Build one ``Document`` per entry of the ``"context"`` list in a JSON file.

    The file is expected to hold a JSON object whose ``"context"`` value is a
    list of two-item entries ``[title, sentences]``. ``sentences`` is normally
    a list of strings, which are joined with single spaces to form the page
    content; a plain string is used as-is.

    Malformed input is skipped rather than raised, in keeping with the
    best-effort handling of entries that do not have exactly two items:
    a non-object top level, a missing or null ``"context"``, entries that are
    not two-item lists, and ``sentences`` values that are neither a string nor
    a list all contribute no documents.

    Args:
        path: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        A list of ``Document`` objects whose metadata carries the entry
        ``title`` and the ``source`` path of the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        raw = json.load(f)

    # Only a JSON object can carry a "context" key; anything else has nothing
    # to index.
    if not isinstance(raw, dict):
        return []

    docs = []
    # `or []` also covers an explicit `"context": null`.
    for entry in raw.get("context") or []:
        if not isinstance(entry, (list, tuple)) or len(entry) != 2:
            continue
        title, sentences = entry

        if isinstance(sentences, str):
            # Joining a bare string would interleave spaces between its
            # characters, so keep it intact.
            content = sentences
        elif isinstance(sentences, (list, tuple)):
            content = " ".join(str(sentence) for sentence in sentences)
        else:
            continue

        # Record the originating file so these chunks carry a "source" entry
        # like the ones produced by the text and TSV loaders.
        docs.append(
            Document(
                page_content=content,
                metadata={"title": title, "source": path},
            )
        )
    return docs

# === Dispatcher ===
def process_and_add_files(file_paths, *, batch_size=1000):
    """Load, split and index each supported file into the vector store.

    The loader is picked from the lower-cased file extension: ``.txt``,
    ``.tsv`` and ``.json`` are handled; any other extension is reported and
    skipped. Loaded documents are split with the module-level ``splitter`` and
    the resulting chunks are added to the module-level ``vector_store``.

    Args:
        file_paths: Iterable of file paths to index.
        batch_size: Maximum number of chunks sent to the vector store in a
            single ``add_documents`` call. Keyword-only.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    for path in file_paths:
        ext = os.path.splitext(path)[1].lower()
        if ext == ".txt":
            docs = load_txt(path)
        elif ext == ".tsv":
            docs = load_tsv(path)
        elif ext == ".json":
            docs = load_custom_json(path)
        else:
            print(f"⚠️ Skipping unsupported file: {path}")
            continue

        split_docs = splitter.split_documents(docs)
        file_name = os.path.basename(path)

        # just count, no chunk previews
        print(f"📦 {file_name} → {len(split_docs)} chunks")

        if not split_docs:
            # An empty file (or a JSON file without usable context) produces
            # no chunks. Handing an empty list to the store is rejected as an
            # empty upsert and would abort the remaining files, so skip it.
            print(f"⚠️ Nothing to index in {file_name}, skipping\n")
            continue

        # Add in bounded slices so a very large file does not go to the store
        # as one oversized request (Chroma caps the size of a single batch).
        for start in range(0, len(split_docs), batch_size):
            vector_store.add_documents(split_docs[start:start + batch_size])
        print(f"✅ Indexed {len(split_docs)} chunks from {file_name}\n")


# === Collect files ===
# Recursively gather inputs under DATA_DIR, one pattern at a time so the
# resulting order is: all .txt, then all .tsv, then all context_*.json files.
all_files = [
    matched_path
    for pattern in ("*.txt", "*.tsv", "context_*.json")
    for matched_path in glob(os.path.join(DATA_DIR, "**", pattern), recursive=True)
]

print(f"🔎 Found {len(all_files)} files to process")

# === Run indexing ===
# Load, split and index everything that was collected above.
process_and_add_files(all_files)

print("🎉 All files processed & vector DB stored at:", VECTOR_DB)
