In [3]:
import os
from PIL import Image
import pytesseract
from langchain.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document

# ---- Constants ----
# Folder that is scanned for the input documents.
DOCS_DIR = "Documents"
# Path the FAISS index is saved to; the run step below also checks whether
# this path exists to decide if processing is needed.
FAISS_DB_PATH = "faiss_faqs_db1"

# ---- Embedding Model (Supports Amharic & English) ----
# Multilingual sentence-transformers model used to embed the text chunks
# when the FAISS index is built.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)

# ---- OCR Image Reader ----
def extract_text_from_image(image_path, lang=None):
    """Run OCR on an image file and wrap the recognised text in a Document.

    Args:
        image_path: Path of the image file to read.
        lang: Optional Tesseract language code(s) forwarded to
            ``pytesseract.image_to_string`` (for example ``"amh"`` or
            ``"eng+amh"``). ``None`` keeps pytesseract's default language.
            The matching Tesseract language data must be installed.

    Returns:
        A ``Document`` whose ``page_content`` is the OCR text and whose
        metadata stores ``image_path`` under ``"source"``, or ``None`` when
        opening the image or running OCR raised an exception (the error is
        printed rather than propagated).
    """
    try:
        # The context manager closes the underlying file handle as soon as
        # OCR has finished instead of leaving it open until garbage collection.
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image, lang=lang)
        return Document(page_content=text, metadata={"source": image_path})
    except Exception as e:
        # Best-effort by design: report the failure and let the caller skip
        # this file instead of aborting the whole run.
        print(f"❌ Error reading image {image_path}: {e}")
        return None

# ---- Load and Process Two Files ----
def process_documents():
    """Build and save a FAISS index from the two documents in ``DOCS_DIR``.

    The folder must contain exactly two supported files (PDF, DOCX or an
    image). PDFs and Word files are read through their LangChain loaders,
    images through ``extract_text_from_image``. The extracted text is split
    into 500-character chunks with a 50-character overlap, embedded with
    ``embedding_model`` and saved to ``FAISS_DB_PATH``.

    Returns:
        A human-readable status string: a warning when the folder is missing
        or does not hold exactly two supported files, an error when no text
        could be extracted, or a success message with the number of chunks
        stored.
    """
    image_extensions = (".jpg", ".jpeg", ".png", ".tif")
    supported_extensions = (".pdf", ".docx") + image_extensions
    two_docs_warning = "⚠️ Please make sure exactly two documents (one English, one Amharic) are in the 'Documents/' folder."

    # A missing folder is reported like any other "wrong input" situation
    # instead of surfacing as an exception from os.listdir.
    if not os.path.isdir(DOCS_DIR):
        return two_docs_warning

    # Sorted so the order of chunks in the index does not depend on the
    # arbitrary order in which os.listdir returns entries.
    files = sorted(
        f for f in os.listdir(DOCS_DIR) if f.lower().endswith(supported_extensions)
    )

    if len(files) != 2:
        return two_docs_warning

    docs = []
    for filename in files:
        filepath = os.path.join(DOCS_DIR, filename)
        # Dispatch on the lower-cased name, matching the case-insensitive
        # filter above; otherwise "Report.PDF" would be counted as one of the
        # two documents and then silently skipped.
        lowered = filename.lower()

        if lowered.endswith(".pdf"):
            docs.extend(PyPDFLoader(filepath).load())

        elif lowered.endswith(".docx"):
            docs.extend(UnstructuredWordDocumentLoader(filepath).load())

        elif lowered.endswith(image_extensions):
            doc = extract_text_from_image(filepath)
            # None means OCR failed; the error has already been printed.
            if doc is not None:
                docs.append(doc)

    if not docs:
        return "❌ No text could be extracted from the documents."

    # Split text
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = text_splitter.split_documents(docs)

    # Documents with blank content (e.g. an image with no recognisable text)
    # yield no chunks, and building a FAISS index from nothing fails.
    if not chunks:
        return "❌ No text could be extracted from the documents."

    # Build and save FAISS index. from_documents stores each chunk's metadata
    # (such as its source file) alongside the text instead of discarding it.
    vector_store = FAISS.from_documents(chunks, embedding_model)
    vector_store.save_local(FAISS_DB_PATH)

    return f"✅ Successfully processed and stored {len(chunks)} chunks from two documents."

# ---- Run ----
# Reuse a previously saved index when one is present at FAISS_DB_PATH;
# otherwise build it now and print the resulting status message.
if os.path.exists(FAISS_DB_PATH):
    print("✅ FAISS index already exists.")
else:
    print(process_documents())


✅ FAISS index already exists.
