In [1]:
import os
from PIL import Image
import pytesseract
from langchain.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from langdetect import detect, DetectorFactory

# Ensure consistent language detection results
DetectorFactory.seed = 0

DOCS_DIR = "Documents"  # Folder containing PDFs, DOCX, and image files

embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

def extract_text_from_image(image_path):
    try:
        image = Image.open(image_path)
        text = pytesseract.image_to_string(image)
        lang = detect(text) if text.strip() else "unknown"
        return Document(page_content=text, metadata={"source": image_path, "language": lang})
    except Exception as e:
        print(f"❌ Error reading image {image_path}: {e}")
        return None

def process_documents():
    docs = []

    for filename in os.listdir(DOCS_DIR):
        filepath = os.path.join(DOCS_DIR, filename)

        if filename.endswith(".pdf"):
            loader = PyPDFLoader(filepath)
            loaded_docs = loader.load()
            # Add language detection metadata
            for doc in loaded_docs:
                lang = detect(doc.page_content) if doc.page_content.strip() else "unknown"
                doc.metadata["language"] = lang
            docs.extend(loaded_docs)

        elif filename.endswith(".docx"):
            loader = UnstructuredWordDocumentLoader(filepath)
            loaded_docs = loader.load()
            for doc in loaded_docs:
                lang = detect(doc.page_content) if doc.page_content.strip() else "unknown"
                doc.metadata["language"] = lang
            docs.extend(loaded_docs)

        elif filename.endswith((".png", ".jpg", ".jpeg", ".tif")):
            doc = extract_text_from_image(filepath)
            if doc:
                docs.append(doc)
        else:
            continue  # skip unsupported formats

    # Split text into chunks (this will preserve metadata)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = text_splitter.split_documents(docs)

    # Debug: show language distribution
    lang_counts = {}
    for c in chunks:
        lang = c.metadata.get("language", "unknown")
        lang_counts[lang] = lang_counts.get(lang, 0) + 1
    print(f"Language counts in chunks: {lang_counts}")

    # Extract texts for embedding
    texts = [chunk.page_content for chunk in chunks]

    # Create FAISS index with all language data
    vector_store = FAISS.from_documents(chunks, embedding_model)
    vector_store.save_local("faiss_faqs_db")

    return f"✅ {len(texts)} chunks processed and stored in FAISS!"

# Run processing if FAISS DB doesn't exist
if not os.path.exists("faiss_faqs_db"):
    print(process_documents())
else:
    print("✅ FAISS index already exists. Skipping processing.")


  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")





ValueError: not a ZIP archive (so not a DOCX file): 'Documents\\~$faqs.docx'