# In [2]:
import os
import torch
from PIL import Image
import pytesseract
from langchain.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document

# ✅ Optional: LLaMA 2 generation import (if needed later)
# from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Directory that process_documents() scans for input files (PDF, DOCX and images).
DOCS_DIR = "Documents"  # Folder containing PDFs, DOCX, and image files

# Set the Tesseract executable path explicitly if it is not found automatically
# (example below is a typical Windows install location).
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Embedding model created once at import time; process_documents() hands it to
# FAISS when building the index.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Helper function to handle image OCR
def extract_text_from_image(image_path):
    """Run Tesseract OCR on an image and wrap the result in a ``Document``.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A ``Document`` whose ``page_content`` is the recognised text and whose
        ``metadata["source"]`` is ``image_path``, or ``None`` when the image
        cannot be opened or OCR fails. Failures are printed, never raised, so
        one bad image does not abort a whole batch.
    """
    try:
        # Use the image as a context manager so its file handle is released
        # as soon as OCR finishes instead of lingering until garbage collection.
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image)
        return Document(page_content=text, metadata={"source": image_path})
    except Exception as e:
        # Deliberately broad: this is a best-effort helper and the caller
        # treats ``None`` as "skip this file".
        print(f"❌ Error reading image {image_path}: {e}")
        return None

# Function to load and process documents including images
def process_documents():
    """Load every supported file in ``DOCS_DIR`` and persist a FAISS index.

    PDFs and DOCX files are read with their LangChain loaders; PNG/JPG/JPEG/TIF
    images are OCR'd via ``extract_text_from_image``. All resulting documents
    are split into 500-character chunks with a 50-character overlap, embedded
    with ``embedding_model`` and saved to the local folder ``faiss_faqs_db1``.

    Returns:
        A human-readable status string: the number of chunks stored, or a
        warning when nothing could be extracted (no index is written then).

    Raises:
        OSError: If ``DOCS_DIR`` cannot be listed (e.g. it does not exist).
    """
    docs = []

    # os.listdir() order is arbitrary; sorting keeps the index reproducible.
    for filename in sorted(os.listdir(DOCS_DIR)):
        # Open the file under its real name (lower-casing the path breaks
        # lookups on case-sensitive filesystems) and lower-case only the copy
        # used for extension matching so "SCAN.PDF" is recognised too.
        filepath = os.path.join(DOCS_DIR, filename)
        lowered = filename.lower()

        if lowered.endswith(".pdf"):
            docs.extend(PyPDFLoader(filepath).load())
        elif lowered.endswith(".docx"):
            docs.extend(UnstructuredWordDocumentLoader(filepath).load())
        elif lowered.endswith((".png", ".jpg", ".jpeg", ".tif")):
            doc = extract_text_from_image(filepath)
            if doc is not None:  # None means OCR failed; skip the file
                docs.append(doc)
        # Any other extension is an unsupported format and is skipped.

    # Split text into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = text_splitter.split_documents(docs)

    # Building an index needs at least one chunk to embed; report the empty
    # case instead of passing an empty list to FAISS.
    if not chunks:
        return "⚠️ No text could be extracted from the documents; FAISS index not created."

    # Index the chunk Documents themselves (not just their text) so each stored
    # vector keeps its metadata, e.g. the source file it came from.
    vector_store = FAISS.from_documents(chunks, embedding_model)
    vector_store.save_local("faiss_faqs_db1")

    return f"✅ {len(chunks)} chunks processed and stored in FAISS!"

# Build the index only on first run; reuse the saved FAISS folder afterwards.
if os.path.exists("faiss_faqs_db1"):
    print("✅ FAISS index already exists. Skipping processing.")
else:
    print(process_documents())


# Output: ✅ 14 chunks processed and stored in FAISS!
