In [13]:
!uv add langchain langchain-huggingface langchain-chroma chromadb tiktoken ijson pandas torch transformers


[2K[2mResolved [1m229 packages[0m [2min 36ms[0m[0m                                        [0m
[2K[2mAudited [1m222 packages[0m [2min 0.02ms[0m[0m                                       [0m


In [31]:
import os, json
from glob import glob
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader, JSONLoader, CSVLoader
from langchain_core.documents import Document

In [22]:
# Root folder of the input corpus, given relative to the notebook's working directory.
# NOTE: the indexing cell further down assigns DATA_DIR and VECTOR_DB again, so the
# values set here only hold until that cell runs.
DATA_DIR = "./../final_train/"
# Folder for the persisted vector DB: a local relative path rather than /app
# (the /app location only makes sense when running inside Docker).
VECTOR_DB = "./chroma_db"   # local folder, not /app (unless inside Docker)

In [23]:
# HuggingFace sentence-embedding model (all-MiniLM-L6-v2).
# NOTE: the indexing cell further down rebinds `embeddings` to a new instance of the
# same model, so running both cells loads the model more than once.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [None]:
import os, json
from glob import glob
from langchain_community.document_loaders import TextLoader, CSVLoader
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings

# === Embeddings (HuggingFace) ===
# Build the embedding model exactly once; constructing it a second time only
# repeats the (slow) model load and holds a duplicate copy in memory.
print("⏳ Loading embedding model...")
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
print("✅ Embedding model loaded")

# === Directories ===
DATA_DIR = "./../final_train/"
# Default is the Docker path; set the VECTOR_DB environment variable to point the
# store somewhere writable when running outside the container. An unset or empty
# variable falls back to the default.
VECTOR_DB = os.environ.get("VECTOR_DB") or "/app/vector_stores/bhavana_db"
os.makedirs(VECTOR_DB, exist_ok=True)

# === Vector Store ===
# Chroma collection persisted under VECTOR_DB; texts are embedded with `embeddings`.
vector_store = Chroma(
    collection_name="bhavana_collection",
    embedding_function=embeddings,
    persist_directory=VECTOR_DB
)

# === Splitter ===
# add_start_index=True records each chunk's offset in its source document as the
# `start_index` metadata field.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=50,
    add_start_index=True,
)

# === Loaders ===
def load_txt(path):
    """Load one plain-text file, decoded as UTF-8, via TextLoader.

    Args:
        path: Location of the ``.txt`` file.

    Returns:
        Whatever ``TextLoader.load()`` returns for that file.
    """
    text_loader = TextLoader(path, encoding='utf-8')
    return text_loader.load()

def load_tsv(path):
    """Load one tab-separated file, decoded as UTF-8, via CSVLoader.

    Args:
        path: Location of the ``.tsv`` file.

    Returns:
        Whatever ``CSVLoader.load()`` returns for that file.
    """
    # CSVLoader forwards csv_args to the csv reader; a tab delimiter makes it parse TSV.
    tab_separated = {'delimiter': '\t'}
    tsv_loader = CSVLoader(path, encoding='utf-8', csv_args=tab_separated)
    return tsv_loader.load()

def load_custom_json(path):
    """Turn the ``"context"`` entries of a JSON file into Documents.

    Each usable entry is a two-item sequence ``[title, sentences]``. The sentences
    are joined with single spaces to form the page content, and the title is kept
    in the metadata.

    Args:
        path: Location of the JSON file (read as UTF-8).

    Returns:
        A list with one Document per usable entry. The list is empty when the
        top-level JSON value is not an object or has no usable ``"context"``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        raw = json.load(f)

    # Only a JSON object can carry a "context" key; anything else has nothing to index.
    if not isinstance(raw, dict):
        return []

    docs = []
    # `or []` also covers an explicit `"context": null`.
    for entry in raw.get("context") or []:
        # Skip anything that is not a [title, sentences] pair.
        if not isinstance(entry, (list, tuple)) or len(entry) != 2:
            continue
        title, sentences = entry
        if isinstance(sentences, str):
            # Joining a bare string would insert a space between every character.
            content = sentences
        elif isinstance(sentences, (list, tuple)):
            content = " ".join(str(sentence) for sentence in sentences)
        else:
            continue
        # Record the originating file under "source" so these chunks can be traced
        # back to a file, matching the metadata key on the .txt loader's output.
        docs.append(
            Document(page_content=content, metadata={"title": title, "source": path})
        )
    return docs

# === Dispatcher ===
def process_and_add_files(file_paths, batch_size=1000):
    """Load, split and index every supported file into the vector store.

    Files are dispatched on their lower-cased extension (``.txt``, ``.tsv``,
    ``.json``); any other extension is reported and skipped. A file that fails
    to load, or that yields no chunks, is reported and skipped so that one bad
    input does not abort the whole run.

    Args:
        file_paths: Iterable of file paths to index.
        batch_size: Maximum number of chunks sent to the vector store per
            ``add_documents`` call. Must be at least 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    for path in file_paths:
        ext = os.path.splitext(path)[1].lower()
        if ext == ".txt":
            loader = load_txt
        elif ext == ".tsv":
            loader = load_tsv
        elif ext == ".json":
            loader = load_custom_json
        else:
            print(f"⚠️ Skipping unsupported file: {path}")
            continue

        # Loading is the step most likely to fail (bad encoding, malformed JSON/TSV,
        # unreadable file). Report the failure and move on instead of losing the run.
        try:
            docs = loader(path)
        except Exception as exc:
            print(f"⚠️ Failed to load {path}: {type(exc).__name__}: {exc}")
            continue

        # split
        split_docs = splitter.split_documents(docs)

        # Empty files (or JSON without usable context) produce no chunks; there is
        # nothing to preview or send to the store.
        if not split_docs:
            print(f"⚠️ No chunks produced from {os.path.basename(path)}; nothing indexed")
            continue

        # ✅ Preview first 2 chunks for sanity
        print(f"\n🔎 Preview chunks from {os.path.basename(path)}:")
        for i, d in enumerate(split_docs[:2]):
            print(f"--- Chunk {i+1} ---")
            print("Meta:", d.metadata)
            print("Text:", d.page_content[:200], "...\n")

        # add to vector DB in bounded slices so a very large file is not sent in
        # one oversized request
        for start in range(0, len(split_docs), batch_size):
            vector_store.add_documents(split_docs[start:start + batch_size])
        print(f"📚 Indexed {len(split_docs)} chunks from {os.path.basename(path)}")

# === Collect files ===
# Search DATA_DIR recursively, one glob per supported pattern; results are
# concatenated in pattern order (.txt, then .tsv, then context_*.json).
all_files = [
    found
    for pattern in ("*.txt", "*.tsv", "context_*.json")
    for found in glob(os.path.join(DATA_DIR, "**", pattern), recursive=True)
]

print(f"🔎 Found {len(all_files)} files to process")

# === Run indexing ===
# Load, split and store every collected file.
process_and_add_files(all_files)

print(f"✅ All documents indexed into {VECTOR_DB}")


⏳ Loading embedding model...
✅ Embedding model loaded
🔎 Found 125328 files to process

🔎 Preview chunks from 19393327__WQ__E-2008-4260__EN.txt:
--- Chunk 1 ---
Meta: {'source': './../final_train/19393327__WQ__E-2008-4260__EN.txt', 'start_index': 0}
Text: WRITTEN QUESTION E-4260/08
by Robert Kilroy-Silk (NI)
to the Commission
(24 July 2008)
Subject: Subsidising Spanish bullfighting
Will the Commission provide full details of the direct and indirect sub ...

📚 Indexed 1 chunks from 19393327__WQ__E-2008-4260__EN.txt

🔎 Preview chunks from 22860129__WQA__E-2009-3395__EN.txt:
--- Chunk 1 ---
Meta: {'source': './../final_train/22860129__WQA__E-2009-3395__EN.txt', 'start_index': 0}
Text: Answer given by Mr Barrot on behalf of the Commission
(22 June 2009)
The Commission fully acknowledges the utmost importance of the values the Honourable Member is referring to in his question. These  ...

--- Chunk 2 ---
Meta: {'source': './../final_train/22860129__WQA__E-2009-3395__EN.txt', 'start_index': 3