In [22]:
# loading the libraries
from dotenv import load_dotenv
import pymupdf
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_core.documents import Document
from langchain.chains.retrieval import create_retrieval_chain
from langchain import hub
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_openai import ChatOpenAI

In [9]:
# Load variables from a .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI / Pinecone API keys come from — confirm.
load_dotenv()

# Module-level embedding model; documents_storing_in_vectorDB reads this global
# when it writes documents to the vector store.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [2]:
# defining a class for extracting text from documents
class PdfExtractors:
    """Text extraction helpers for a PDF file and for standalone images.

    Failure contract: no method raises. On any failure the method *returns*
    a human-readable message that starts with ``"Error"``, so callers must
    check for that prefix before treating the result as document text.
    """

    def __init__(self, pdf_path: str):
        """Remember the path of the PDF the PDF-based extractors operate on."""
        self.pdf_path = pdf_path

    def extract_text_from_pdf(self) -> str:
        """Extract text directly from PDF pages.

        Returns:
            The text of every page concatenated in page order (no separator),
            or an ``"Error..."`` message if the file cannot be opened or read.
        """
        try:
            # The context manager closes the document on every exit path; the
            # explicit close() used before was skipped when a page raised.
            with pymupdf.open(self.pdf_path) as doc:
                return "".join(page.get_text() for page in doc)
        except FileNotFoundError:
            return f"Error: PDF file '{self.pdf_path}' not found."
        except Exception as e:
            return f"Error extracting text from PDF: {str(e)}"

    def extract_text_from_images(self) -> str:
        """Extract text from images in a PDF using OCR.

        The PDF is converted to images with ``convert_from_path`` and each
        image is passed to ``pytesseract.image_to_string``.

        Returns:
            The OCR text of each image joined with newlines, or an
            ``"Error..."`` message on failure.
        """
        try:
            page_images = convert_from_path(self.pdf_path)
            return "\n".join(pytesseract.image_to_string(img) for img in page_images)
        except FileNotFoundError:
            return f"Error: PDF file '{self.pdf_path}' not found."
        except Exception as e:
            return f"Error extracting text from images: {str(e)}"

    def extract_from_jpg(self, image_path: str) -> str:
        """Extract text from a single image file using OCR.

        Args:
            image_path: Path of the image to read; ``self.pdf_path`` is not used.

        Returns:
            The OCR text, or an ``"Error..."`` message on failure.
        """
        try:
            # Close the image even when OCR fails (previously leaked on error).
            with Image.open(image_path) as img:
                return pytesseract.image_to_string(img)
        except FileNotFoundError:
            return f"Error: Image file '{image_path}' not found."
        except Exception as e:
            return f"Error extracting text from image: {str(e)}"

In [None]:
# chunking and storing extracted text

# Prefixes of the messages the PdfExtractors methods return instead of raising.
# They are matched exactly so that genuine document text that merely begins
# with the word "Error" is not discarded.
_EXTRACTION_ERROR_PREFIXES = (
    "Error: PDF file '",
    "Error: Image file '",
    "Error extracting text from ",
)


def _is_usable_text(text):
    """Return True when ``text`` is non-blank and is not an extractor error message."""
    return bool(text and text.strip()) and not text.startswith(_EXTRACTION_ERROR_PREFIXES)


def _split_text(text, chunk_size, chunk_overlap):
    """Split ``text`` into windows of ``chunk_size`` chars that overlap by ``chunk_overlap``."""
    pieces = []
    for start in range(0, len(text), chunk_size - chunk_overlap):
        pieces.append(text[start:start + chunk_size])
        # Once a window reaches the end of the text, any further window would
        # only repeat the overlap region of this one, so stop here.
        if start + chunk_size >= len(text):
            break
    return pieces


def load_and_chunk_pdfs_and_images(pdf_paths, images_paths, chunk_size=500, chunk_overlap=50):
    """Extract text from PDFs and images and split it into overlapping chunks.

    For each PDF the embedded text is tried first; if that is blank or the
    extractor reported an error, OCR is used as a fallback. Inputs that yield
    no usable text are skipped (with a printed notice) instead of being
    indexed as empty or error-message documents.

    Args:
        pdf_paths: Iterable of PDF file paths.
        images_paths: Iterable of image file paths to OCR.
        chunk_size: Maximum number of characters per chunk (must be > 0).
        chunk_overlap: Characters shared by consecutive chunks
            (must satisfy ``0 <= chunk_overlap < chunk_size``).

    Returns:
        A list of ``Document`` chunks. Each chunk carries its own copy of the
        metadata ``{"source": <path>, "type": "pdf" | "jpg"}``.

    Raises:
        ValueError: If ``chunk_size`` / ``chunk_overlap`` are inconsistent.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= chunk_overlap < chunk_size:
        # A zero or negative step would crash range() or silently yield no chunks.
        raise ValueError("chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size")

    documents = []

    # process PDFs
    for pdf_path in pdf_paths:
        try:
            pdf_extractor = PdfExtractors(pdf_path)
            pdf_text = pdf_extractor.extract_text_from_pdf()
            if not _is_usable_text(pdf_text):
                # No usable text layer (e.g. a scanned PDF): fall back to OCR.
                pdf_text = pdf_extractor.extract_text_from_images()
            if _is_usable_text(pdf_text):
                documents.append(Document(page_content=pdf_text, metadata={"source": pdf_path, "type": "pdf"}))
            else:
                print(f"Skipping PDF {pdf_path}: {pdf_text.strip() or 'no text could be extracted'}")
        except Exception as e:
            print(f"Error processing PDF {pdf_path}: {e}")

    # Process JPG images
    for image_path in images_paths:
        try:
            # Dummy path since extract_from_jpg doesn't use pdf_path
            image_text = PdfExtractors("").extract_from_jpg(image_path)
            if _is_usable_text(image_text):
                documents.append(Document(page_content=image_text, metadata={"source": image_path, "type": "jpg"}))
            else:
                print(f"Skipping image {image_path}: {image_text.strip() or 'no text could be extracted'}")
        except Exception as e:
            print(f"Error processing image {image_path}: {e}")

    # chunk the documents into manageable pieces
    chunked_documents = []
    for doc in documents:
        for piece in _split_text(doc.page_content, chunk_size, chunk_overlap):
            # Copy the metadata so that chunks do not alias one shared dict.
            chunked_documents.append(Document(page_content=piece, metadata=dict(doc.metadata)))

    return chunked_documents

def documents_storing_in_vectorDB(documents, index_name="ocr-rag"):
    """Store documents in a Pinecone index using the module-level ``embeddings``.

    Best-effort: the outcome is reported with ``print`` and failures are not
    re-raised, so a failed upload does not abort the notebook.

    Args:
        documents: Documents to store; when empty, nothing is sent.
        index_name: Name of the target Pinecone index.

    Returns:
        None.
    """
    if not documents:
        # Nothing to embed; skip the remote call entirely.
        print("No documents to store")
        return

    try:
        # setting the vector store
        PineconeVectorStore.from_documents(
            documents,
            embeddings,
            index_name=index_name,
        )
        print("Documents stored successfully")
    except Exception as e:
        print(f"Documents storing failed: {e}")


# examples: absolute local paths to the sample inputs (adjust for your environment)
pdf_paths = [
    "C:/Research Folder/LLM research/LLM projects/ocr_rag/Data/credit.pdf",
    "C:/Research Folder/LLM research/LLM projects/ocr_rag/Data/rent_receipt.pdf",
    "C:/Research Folder/LLM research/LLM projects/ocr_rag/Data/docu-tracking-ai-in-10-charts.pdf",
]
images_paths = [
    "C:/Research Folder/LLM research/LLM projects/ocr_rag/Data/image_test1.png",
    "C:/Research Folder/LLM research/LLM projects/ocr_rag/Data/image_test2.png",
]

# extract the text of every input and split it into overlapping chunks
documents = load_and_chunk_pdfs_and_images(pdf_paths=pdf_paths, images_paths=images_paths)

In [31]:
# Echo the chunked documents as the cell output to inspect their content and metadata.
documents

[Document(metadata={'source': 'C:/Research Folder/LLM research/LLM projects/ocr_rag/Data/credit.pdf', 'type': 'pdf'}, page_content='  \n \n \nPage 1 of 2\nCREDIT CARD / ACH PAYMENT AUTHORIZATION \n \n \nCheck One (1) and Enter Your Details \n \n- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -  \n \n☐ - Recurring Charge - You authorize regularly scheduled charges to your credit card \nor bank account. You will be charged the amount indicated below each billing period. A \nreceipt for each payment will be provided to you and the charge will appear on your \ncredi'),
 Document(metadata={'source': 'C:/Research Folder/LLM research/LLM projects/ocr_rag/Data/credit.pdf', 'type': 'pdf'}, page_content='d to you and the charge will appear on your \ncredit card or bank statement. You agree that no prior notification will be provided \nunless the date or amount changes, in which case you will receive notice from us at \

In [33]:
# Store the chunked documents in the Pinecone vector store.
# The helper prints the outcome rather than raising, so check the message below.
documents_storing_in_vectorDB(documents=documents)

Documents stored successfully


In [34]:
INDEX_NAME = "ocr-rag"


def run_llms(query: str):
    """Answer ``query`` with a retrieval chain over the Pinecone index.

    Every call builds the vector store client, the chat model, the prompt
    pulled from the LangChain hub and the retrieval chain, then invokes the
    chain once.

    Args:
        query: The question, passed to the chain under the ``"input"`` key.

    Returns:
        Whatever the retrieval chain's ``invoke`` returns.
    """
    vector_store = PineconeVectorStore(
        index_name=INDEX_NAME,
        embedding=OpenAIEmbeddings(model="text-embedding-3-small"),
    )
    llm = ChatOpenAI(verbose=True, temperature=0, model="gpt-4o-mini-2024-07-18")

    # Chain that places the retrieved documents into the hub prompt for the LLM.
    answer_chain = create_stuff_documents_chain(
        llm,
        hub.pull("langchain-ai/retrieval-qa-chat"),
    )

    rag_chain = create_retrieval_chain(
        retriever=vector_store.as_retriever(),
        combine_docs_chain=answer_chain,
    )
    return rag_chain.invoke(input={"input": query})

In [35]:
# testing our Retrieval Augmented Generation
# The chain result is indexed by key; only its "answer" entry is printed here.
res = run_llms(query="what is the global revenue of artificial intelligence?")
print(res["answer"])

The global revenue of artificial intelligence is projected to be $98.4 billion by 2023.
