In [6]:
import os
import fitz  # PyMuPDF
import easyocr
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# Load environment variables
# Must run before the os.getenv() call below so that any value supplied by
# load_dotenv() is visible (presumably sourced from a local .env file).
load_dotenv()

# Initialize Gemini LLM
# os.getenv() returns None when GOOGLE_API_KEY is unset; that None is passed
# straight through to the client constructor without validation here.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
llm = ChatGoogleGenerativeAI(model="gemini-pro", google_api_key=GOOGLE_API_KEY)

# Define a prompt for summarization
# NOTE: despite the "summarization" naming, the instruction text asks the model
# to de-duplicate and format the input, not to shorten it. The template has a
# single placeholder, {content}, which callers fill with the raw text.
summarization_prompt = PromptTemplate.from_template(
    "Don't add or remove content,also eliminate duplicate content, and format the following content:\n\n{content}"
)
# verbose=True makes every chain call print the fully formatted prompt
# ("Entering new LLMChain chain..." / "Prompt after formatting:").
summarization_chain = LLMChain(
    llm=llm, prompt=summarization_prompt, verbose=True)

# Function to extract text and images from PDF and return Markdown


def process_document_with_gemini(pdf_path):
    """Convert a PDF into a Markdown report with LLM-formatted text and OCR.

    For each page the raw text is emitted under a ``## Page N`` heading; when
    the page has text, it is also passed through ``summarization_chain`` and
    the result is emitted under ``### Summary of Page N``. Every embedded
    image is written to the current working directory as
    ``page<N>_img<M>.<ext>`` and linked from the Markdown. Finally, the
    PNG/JPEG images written by *this call* are OCR'd with EasyOCR and any
    recognised text is passed through the same chain.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        The assembled Markdown document as a single string.
    """
    sections = []
    # Only files saved during this call are OCR'd. Scanning the working
    # directory for "page*" files would also pick up stale images from earlier
    # runs or unrelated files, and in arbitrary order.
    ocr_candidates = []
    ocr_suffixes = ("png", "jpg", "jpeg")

    # The context manager closes the document even if the LLM call or an
    # image write raises part-way through.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            # Extract text
            text = page.get_text().strip()
            sections.append(f"## Page {page_number}\n\n{text}\n\n")

            # Run the page text through Gemini; skip pages with no text so
            # no request is spent on an empty prompt.
            if text:
                summary = summarization_chain.run({"content": text})
                sections.append(
                    f"### Summary of Page {page_number}\n\n{summary}\n\n")

            # Extract images
            for img_index, img in enumerate(page.get_images(full=True), start=1):
                xref = img[0]  # first tuple item is the image's xref
                base_image = doc.extract_image(xref)
                img_ext = base_image["ext"]
                img_filename = f"page{page_number}_img{img_index}.{img_ext}"

                # Save image
                with open(img_filename, "wb") as img_file:
                    img_file.write(base_image["image"])
                sections.append(f"![Image {img_index}](./{img_filename})\n\n")

                if img_filename.endswith(ocr_suffixes):
                    ocr_candidates.append(img_filename)

    # Perform OCR on images
    if ocr_candidates:
        # Constructing the reader is deferred until there is something to
        # read, so image-free PDFs do not pay for it.
        reader = easyocr.Reader(['en'])
        sections.append("## OCR Results\n\n")
        for image_path in ocr_candidates:
            # detail=0 is joined below as a list of plain text strings.
            ocr_text_combined = ' '.join(reader.readtext(image_path, detail=0))

            # Run OCR output through Gemini only when something was recognised.
            if ocr_text_combined.strip():
                ocr_summary = summarization_chain.run(
                    {"content": ocr_text_combined})
                sections.append(
                    f"### OCR for {image_path}\n\n{ocr_summary}\n\n")

    return "".join(sections)


if __name__ == "__main__":
    # Input PDF and output Markdown locations; edit these for your own files.
    pdf_path = "file.pdf"  # Replace with your PDF file
    output_md = "output_with_gemini.md"

    # Build the Markdown report for the whole document.
    result = process_document_with_gemini(pdf_path)

    # Persist the report as UTF-8 text.
    with open(output_md, "w", encoding="utf-8") as md_handle:
        md_handle.write(result)

    print(f"Markdown content saved to {output_md}")

  summarization_chain = LLMChain(
  summary = summarization_chain.run({"content": text.strip()})




[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mDon't add or remove content,also eliminate duplicate content, and format the following content:

DEPARTMENT OF 
 COMPUTER SCIENCE AND ENGINEERING 
 
 
 
(AIML633P) 
Digital Image Processing 
 
B. Tech – Computer Science and Engineering  
(AIML) 
 
School of Engineering and Technology,  
CHRIST (Deemed to be University), 
Kumbalagodu, Bengaluru-560 074 
2024-25[0m

[1m> Finished chain.[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mDon't add or remove content,also eliminate duplicate content, and format the following content:

Certificate 
 
This is to certify that   George Bobby, 2262067    has successfully completed the record 
work for ( AIML633P -Digital Image Processing) in partial fulfillment for the award of Bachelor 
of Technology in Computer Science and Engineering (AIML) during the year 2024-2025. 
 
 
 
 
 
 
 
 
HEAD OF THE DEPARTMENT  
 
 
 
FACULTY- IN C

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.



[1m> Finished chain.[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mDon't add or remove content,also eliminate duplicate content, and format the following content:

CHRIST (DEEMED TO BE UNIVERSITY) B A N G A L 0 R E IN D [ A DEPARTMENT OF COMPUTER SCIENCE AND ENGINEERING (AIML633P) Digital Image Processing B. Tech Computer Science and Engineering (AIML) School o Engineering and Technology, CHRIST (Deemed to be University), Kumbalagodu; Bengaluru-560 074 2024-25[0m

[1m> Finished chain.[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mDon't add or remove content,also eliminate duplicate content, and format the following content:

CHRIST (DEEMED TO BE UNIVERSITY) B A N G A L 0 R E IN D [ A Certificate This certifi that George Bobby; 2262067 has successfialh - comnleted the Tecurd IOrkTur AIML6SSP -Digital Image Precessing parlial fulfillment for the award of Buchelor of Techology Cmnuer Sccuce ((nO Engineering 