In [6]:
import os
import torch
from PIL import Image
import pytesseract
from langchain.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document

# Folder that is scanned for PDF, DOCX, and image inputs
DOCS_DIR = "Documents"

# On Windows, Tesseract may need an explicit executable path (uncomment if needed)
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Multilingual sentence-embedding model, chosen so that Amharic text is supported
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
)

# OCR text extraction from image
def extract_text_from_image(image_path, lang="amh+eng"):
    """Run Tesseract OCR on an image file and wrap the text in a Document.

    Args:
        image_path: Path of the image file to read.
        lang: Language specification forwarded to
            ``pytesseract.image_to_string``. Defaults to ``"amh+eng"``
            (Amharic combined with English).

    Returns:
        A ``Document`` whose ``page_content`` is the recognised text and whose
        metadata stores ``image_path`` under the ``"source"`` key, or ``None``
        if opening the image or running OCR raised an exception (the error is
        printed rather than propagated).
    """
    try:
        # Image.open keeps the underlying file open; the context manager
        # closes it as soon as OCR has finished instead of leaking the handle.
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image, lang=lang)
            return Document(page_content=text, metadata={"source": image_path})
    except Exception as e:
        # Deliberately broad: any failure is reported and signalled with None
        # so the caller can decide to skip this image.
        print(f"❌ Error reading image {image_path}: {e}")
        return None

# Document processing function
def process_documents(docs_dir=None, index_path="faiss_faqs_db_Amh"):
    """Load the supported files in a folder, chunk them, and save a FAISS index.

    PDF files are read with ``PyPDFLoader``, DOCX files with
    ``UnstructuredWordDocumentLoader``, and images (PNG/JPG/JPEG/TIF/TIFF) via
    ``extract_text_from_image``. File extensions are matched
    case-insensitively. Files whose name starts with ``~$`` are skipped as
    temporary files, and any other file type is skipped as unsupported. A file
    that fails to load is reported and does not stop the remaining files.

    Args:
        docs_dir: Folder to scan. ``None`` (the default) uses the module-level
            ``DOCS_DIR``.
        index_path: Folder the FAISS index is written to with ``save_local``.

    Returns:
        A human-readable status string: a warning when the folder is missing,
        when no document could be loaded, or when no non-blank text was
        extracted; otherwise a success message with the number of stored
        chunks.
    """
    if docs_dir is None:
        docs_dir = DOCS_DIR

    try:
        # Sorted so the order of chunks in the index does not depend on the
        # arbitrary order in which os.listdir returns entries.
        filenames = sorted(os.listdir(docs_dir))
    except (FileNotFoundError, NotADirectoryError):
        return f"⚠️ Documents directory not found: {docs_dir}"

    image_suffixes = (".png", ".jpg", ".jpeg", ".tif", ".tiff")
    docs = []

    for filename in filenames:
        if filename.startswith("~$"):
            print(f"⚠️ Skipping temporary file: {filename}")
            continue

        filepath = os.path.join(docs_dir, filename)
        # Compare against a lower-cased name so "REPORT.PDF" is treated the
        # same as "report.pdf" for every supported type.
        lowered = filename.lower()

        try:
            if lowered.endswith(".pdf"):
                docs.extend(PyPDFLoader(filepath).load())
            elif lowered.endswith(".docx"):
                docs.extend(UnstructuredWordDocumentLoader(filepath).load())
            elif lowered.endswith(image_suffixes):
                doc = extract_text_from_image(filepath)
                if doc:
                    docs.append(doc)
            else:
                print(f"⏩ Skipping unsupported file: {filename}")
        except Exception as e:
            # Best-effort batch: report the failure and continue with the
            # next file.
            print(f"❌ Failed to load {filename}: {e}")

    if not docs:
        return "⚠️ No valid documents found to process."

    # Text chunking; chunks that contain only whitespace are dropped.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = [
        chunk
        for chunk in text_splitter.split_documents(docs)
        if chunk.page_content.strip()
    ]

    if not chunks:
        return "⚠️ No text extracted from the documents."

    # Build the FAISS vector store from Document objects (not bare strings)
    # so each chunk keeps the metadata attached by its loader.
    vector_store = FAISS.from_documents(chunks, embedding_model)
    vector_store.save_local(index_path)

    return f"✅ {len(chunks)} text chunks processed and stored in FAISS!"

# Build the index only when no saved FAISS index is present on disk
if os.path.exists("faiss_faqs_db_Amh"):
    print("✅ FAISS index already exists. Skipping processing.")
else:
    result = process_documents()
    print(result)


✅ FAISS index already exists. Skipping processing.
