# In [2]:
import os
from docx import Document as DocxDocument
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Directory that is scanned for the .docx Q&A source files.
# A relative path is resolved against the current working directory.
DOCS_DIR = "Documents"

# Load Q&A pairs from DOCX
def load_qa_from_docx(doc_path):
    """Extract question/answer pairs from a .docx file as LangChain documents.

    A paragraph starting with "Q:" or "ጥ:" sets the pending question; a
    paragraph starting with "A:" or "መ:" supplies an answer.  A pair is
    emitted only when both parts are non-empty, after which the pending
    question is cleared.  Blank and unmarked paragraphs are ignored.

    The language tag is "amharic" when that word occurs anywhere in
    ``doc_path`` (case-insensitive) and "english" otherwise.  The tag, not
    the marker found in the paragraph, decides which prefixes are written
    into the stored text.

    Returns a list of ``Document`` objects whose metadata carries the source
    path ("source") and the language tag ("lang").
    """
    language = "amharic" if "amharic" in doc_path.lower() else "english"
    question_markers = ("Q:", "ጥ:")
    answer_markers = ("A:", "መ:")

    pairs = []
    pending_question = ""
    for paragraph in DocxDocument(doc_path).paragraphs:
        line = paragraph.text.strip()

        if line.startswith(question_markers):
            # Every marker is two characters long, so one slice strips any of them.
            pending_question = line[2:].strip()
            continue
        if not line.startswith(answer_markers):
            continue  # blank or unmarked paragraph

        answer = line[2:].strip()
        if not (pending_question and answer):
            continue  # incomplete pair: keep whatever question is pending

        if language == "english":
            body = f"Q: {pending_question}\nA: {answer}"
        else:
            body = f"ጥ: {pending_question}\nመ: {answer}"
        pairs.append(
            Document(page_content=body, metadata={"source": doc_path, "lang": language})
        )
        pending_question = ""

    return pairs

# Build FAISS DB
def build_faiss_vector_db(
    docs_dir=None,
    db_path="faiss_bilingual_db",
    *,
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    chunk_size=300,
    chunk_overlap=20,
):
    """Index the Q&A pairs of every .docx file in a directory into FAISS.

    Each document is parsed with ``load_qa_from_docx``, the resulting pairs
    are split into chunks, embedded with a HuggingFace sentence-transformer
    model, and the FAISS store is written to ``db_path``.

    Args:
        docs_dir: Directory to scan.  ``None`` (the default) means the
            module-level ``DOCS_DIR``, looked up at call time.
        db_path: Folder the FAISS store is saved to.
        model_name: HuggingFace embedding model identifier.
        chunk_size: Maximum chunk length passed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks.

    Raises:
        FileNotFoundError: If ``docs_dir`` does not exist (from ``os.listdir``).
        ValueError: If no Q&A pair could be loaded, so there is nothing to index.
    """
    if docs_dir is None:
        docs_dir = DOCS_DIR

    all_docs = []
    # os.listdir returns names in arbitrary order; sorting keeps the index
    # contents in a reproducible order from one build to the next.
    for file in sorted(os.listdir(docs_dir)):
        # "~$<name>.docx" entries are the lock files Word creates while a
        # document is open; they are not readable .docx packages.
        if file.startswith("~$") or not file.lower().endswith(".docx"):
            continue
        all_docs.extend(load_qa_from_docx(os.path.join(docs_dir, file)))

    if not all_docs:
        # Building a FAISS index from zero documents fails deep inside the
        # library with an unhelpful error, so stop here with a clear message
        # (and before the embedding model is loaded).
        raise ValueError(
            f"No Q&A pairs found in any .docx file under {docs_dir!r}; nothing to index."
        )

    # Split documents into chunks
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunks = splitter.split_documents(all_docs)

    # Load embedding model
    embeddings = HuggingFaceEmbeddings(model_name=model_name)

    # Create FAISS vector DB
    vectorstore = FAISS.from_documents(chunks, embeddings)
    vectorstore.save_local(db_path)

    print(f"✅ {len(chunks)} Q&A chunks indexed and saved in FAISS.")

# Run
if __name__ == "__main__":
    # Build the index only once; an existing store is left untouched.
    if os.path.exists("faiss_bilingual_db"):
        print("✅ FAISS DB already exists. Delete it if you want to rebuild.")
    else:
        build_faiss_vector_db()


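# Recorded cell output, presumably from a run made before the "~$" lock-file
# filter was added to build_faiss_vector_db (that filter skips this file):
# PackageNotFoundError: Package not found at 'Documents\~$faqs.docx'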