## 2 Functions
1. `parse_for_ingestion`: Parsing for ingestion (more specific and intentional)
2. `parse_user_files`: Parsing for user routing (general, not as important)

In [3]:
import nest_asyncio
nest_asyncio.apply()
from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader
import fitz
import pdfplumber
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
import os
import cv2

SyntaxError: invalid syntax (exc.py, line 258)

### Helper Functions

In [5]:
import fitz  # PyMuPDF
import os
import io
import cv2
from pdf2image import convert_from_path

def extract_images_from_pdf(pdf_path, output_folder="extracted_images", min_contour_area=5000):
    """
    Extracts images from a rendered PDF file using OpenCV.

    Args:
        pdf_path (str): Path to the PDF file.
        output_folder (str): Directory to save extracted images.
        min_contour_area (int): Minimum area size to detect images.

    Returns:
        List[str]: List of saved image file paths.
    """
    os.makedirs(output_folder, exist_ok=True)  # Create folder if it doesn’t exist
    poppler_path = "/opt/homebrew/bin"  # Adjust this for your system

    images = convert_from_path(pdf_path, dpi=300, poppler_path=poppler_path)  # Render PDF pages to images
    extracted_images = []

    for page_num, image in enumerate(images):
        open_cv_image = np.array(image)  # Convert to NumPy array
        open_cv_image = cv2.cvtColor(open_cv_image, cv2.COLOR_RGB2BGR)  # Convert RGB to BGR

        # Convert to grayscale
        gray = cv2.cvtColor(open_cv_image, cv2.COLOR_BGR2GRAY)

        # Apply threshold to separate text from images
        _, thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)

        # Find contours
        contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        figure_count = 0  # Track number of extracted images per page
        for contour in contours:
            if cv2.contourArea(contour) < min_contour_area:  # Ignore small contours (noise)
                continue

            x, y, w, h = cv2.boundingRect(contour)  # Get bounding box
            cropped_image = open_cv_image[y:y + h, x:x + w]  # Crop detected figure

            # Save image
            image_filename = f"page_{page_num+1}_figure_{figure_count+1}.png"
            image_path = os.path.join(output_folder, image_filename)
            cv2.imwrite(image_path, cropped_image)  # Save image using OpenCV

            extracted_images.append(image_path)
            print(f"✔ Saved image: {image_path}")

            figure_count += 1

    return extracted_images

# Example usage:
pdf_file = "/Users/lishuyao/Documents/NUS/MODS/Y3S2/Capstone/ODPRT-chatbot/notebooks/1810.04805v2.pdf"
output_dir = "output_images"
extracted_images = extract_images_from_pdf(pdf_file, output_dir)

print(f"Extracted {len(extracted_images)} images.")


✔ Saved image: output_images/page_3_figure_1.png
✔ Saved image: output_images/page_3_figure_2.png
✔ Saved image: output_images/page_5_figure_1.png
✔ Saved image: output_images/page_5_figure_2.png
✔ Saved image: output_images/page_5_figure_3.png
✔ Saved image: output_images/page_5_figure_4.png
✔ Saved image: output_images/page_5_figure_5.png
✔ Saved image: output_images/page_5_figure_6.png
✔ Saved image: output_images/page_5_figure_7.png
✔ Saved image: output_images/page_5_figure_8.png
✔ Saved image: output_images/page_5_figure_9.png
✔ Saved image: output_images/page_5_figure_10.png
✔ Saved image: output_images/page_5_figure_11.png
✔ Saved image: output_images/page_5_figure_12.png
✔ Saved image: output_images/page_5_figure_13.png
✔ Saved image: output_images/page_5_figure_14.png
✔ Saved image: output_images/page_5_figure_15.png
✔ Saved image: output_images/page_5_figure_16.png
✔ Saved image: output_images/page_5_figure_17.png
✔ Saved image: output_images/page_5_figure_18.png
✔ Saved ima

In [1]:
import numpy as np
import cv2 as cv
test_image = np.zeros((100, 100, 3), dtype=np.uint8)
test_image_bgr = cv.cvtColor(test_image, cv.COLOR_RGB2BGR)

In [6]:
import os
import cv2
import numpy as np
from pdf2image import convert_from_path
import fitz  # PyMuPDF
from PIL import Image

# Define paths
pdf_path = "1810.04805v2.pdf"
# output_text_file = "/data/combined_text.txt"
output_figures_folder = "data/extracted_figures"

# Create output directories
# os.makedirs(os.path.dirname(output_text_file), exist_ok=True)
os.makedirs(output_figures_folder, exist_ok=True)

# Function to extract text from PDF
def extract_combined_text_from_pdf(pdf_path, output_file_path):
    doc = fitz.open(pdf_path)
    
    with open(output_file_path, "w", encoding="utf-8") as text_file:
        for page_num in range(len(doc)):
            text = doc[page_num].get_text("text")
            text_file.write(f"\n\n--- Page {page_num+1} ---\n\n")
            text_file.write(text)

    return output_file_path

# Function to extract figures from PDF
poppler_path = "/opt/homebrew/bin"  # Adjust this if needed
def extract_figures_from_pdf(pdf_path, output_folder, min_contour_area=5000):
    images = convert_from_path(pdf_path,  poppler_path = poppler_path)
    extracted_figures = {}

    for page_num, image in enumerate(images):
        # Convert image to OpenCV format
        open_cv_image = np.array(image)
        open_cv_image = cv2.cvtColor(open_cv_image, cv2.COLOR_RGB2BGR)

        # Convert to grayscale and apply edge detection
        gray = cv2.cvtColor(open_cv_image, cv2.COLOR_BGR2GRAY)
        edges = cv2.Canny(gray, 50, 150)

        # Find contours
        contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        figure_paths = []
        figure_count = 0

        for contour in contours:
            if cv2.contourArea(contour) < min_contour_area:
                continue

            x, y, w, h = cv2.boundingRect(contour)
            figure_image = image.crop((x, y, x + w, y + h))

            figure_path = os.path.join(output_folder, f"page_{page_num+1}_figure_{figure_count+1}.png")
            figure_image.save(figure_path, "PNG")
            figure_paths.append(figure_path)

            figure_count += 1

        extracted_figures[f"Page {page_num+1}"] = figure_paths

    return extracted_figures

# Extract text and figures
# text_data = extract_combined_text_from_pdf(pdf_path, output_text_file)
figure_data = extract_figures_from_pdf(pdf_path, output_figures_folder)

# Combine results in a dictionary
extracted_data = {
    # "text": text_data,
    "figures": figure_data
}

# Return the extracted paths
extracted_data

error: OpenCV(4.11.0) :-1: error: (-5:Bad argument) in function 'cvtColor'
> Overload resolution failed:
>  - src is not a numpy array, neither a scalar
>  - Expected Ptr<cv::UMat> for argument 'src'


In [9]:
import os
import cv2
import numpy as np
from pdf2image import convert_from_path
import fitz  # PyMuPDF
from PIL import Image

# Define paths
pdf_path = "1810.04805v2.pdf"
output_text_file = "data/combined_text.txt"
output_figures_folder = "data/extracted_figures"

# Create output directories
os.makedirs(os.path.dirname(output_text_file), exist_ok=True)
os.makedirs(output_figures_folder, exist_ok=True)

# # Function to extract text from PDF
# def extract_combined_text_from_pdf(pdf_path, output_file_path):
#     doc = fitz.open(pdf_path)
    
#     with open(output_file_path, "w", encoding="utf-8") as text_file:
#         for page_num in range(len(doc)):
#             text = doc[page_num].get_text("text")
#             text_file.write(f"\n\n--- Page {page_num+1} ---\n\n")
#             text_file.write(text)

#     return output_file_path

# Function to extract figures from PDF
poppler_path = "/opt/homebrew/bin"  # Adjust this if needed

def extract_figures_and_tables_from_pdf(pdf_path, output_folder, min_contour_area=5000, min_table_area=10000):
    images = convert_from_path(pdf_path, poppler_path=poppler_path)  # Convert PDF pages to images
    extracted_images = {}

    # Ensure output folder exists
    os.makedirs(output_folder, exist_ok=True)

    for page_num, image in enumerate(images):
        # Convert image to OpenCV format
        open_cv_image = np.array(image)
        open_cv_image = cv2.cvtColor(open_cv_image, cv2.COLOR_RGB2BGR)

        # Convert to grayscale
        gray = cv2.cvtColor(open_cv_image, cv2.COLOR_BGR2GRAY)

        # Apply adaptive thresholding to detect tables
        thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 15, 4)

        # Detect vertical and horizontal lines to identify tables
        kernel_h = np.ones((1, 10), np.uint8)
        kernel_v = np.ones((10, 1), np.uint8)
        horizontal_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel_h)
        vertical_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel_v)
        table_mask = cv2.add(horizontal_lines, vertical_lines)

        # Find contours for tables and figures
        contours, _ = cv2.findContours(table_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        figure_paths = []
        table_paths = []
        count_figures = 0
        count_tables = 0

        for contour in contours:
            x, y, w, h = cv2.boundingRect(contour)

            if cv2.contourArea(contour) > min_table_area:
                # Extract tables
                table_image = image.crop((x, y, x + w, y + h))
                table_path = os.path.join(output_folder, f"page_{page_num+1}_table_{count_tables+1}.png")
                table_image.save(table_path, "PNG")
                table_paths.append(table_path)
                count_tables += 1
            elif cv2.contourArea(contour) > min_contour_area:
                # Extract figures
                figure_image = image.crop((x, y, x + w, y + h))
                figure_path = os.path.join(output_folder, f"page_{page_num+1}_figure_{count_figures+1}.png")
                figure_image.save(figure_path, "PNG")
                figure_paths.append(figure_path)
                count_figures += 1

        extracted_images[f"Page {page_num+1}"] = {
            "figures": figure_paths,
            "tables": table_paths
        }

    return extracted_images

# Extract text and figures
# text_data = extract_combined_text_from_pdf(pdf_path, output_text_file)
figure_data = extract_figures_and_tables_from_pdf(pdf_path, output_figures_folder)

# Combine results in a dictionary
extracted_data = {
    # "text": text_data,
    "figures": figure_data
}

# Return the extracted paths
print(extracted_data)


error: OpenCV(4.11.0) :-1: error: (-5:Bad argument) in function 'cvtColor'
> Overload resolution failed:
>  - src is not a numpy array, neither a scalar
>  - Expected Ptr<cv::UMat> for argument 'src'


In [43]:
import os
import cv2
import numpy as np
import pdfplumber
from pdf2image import convert_from_path
from PIL import Image

def extract_clean_images_from_pdf(pdf_path, output_folder,
                                  min_contour_area=10000,  # Minimum size for valid content
                                  max_logo_area=60000,  # Max size for headers/logos
                                  header_threshold=100,  # Ignore anything at the top of the page
                                  signature_aspect_ratio=(4, 15),  # Aspect ratio to detect signatures
                                  signature_max_height=100):  # Max height for signatures
    """
    Extracts figures and tables from a PDF while removing headers, logos, and signatures.

    Args:
        pdf_path (str): Path to the PDF file.
        output_folder (str): Folder to save extracted images.
    
    Returns:
        dict: A dictionary containing extracted images per page.
    """

    images = convert_from_path(pdf_path, poppler_path="/opt/homebrew/bin")  # Convert PDF to images
    extracted_data = {}

    os.makedirs(output_folder, exist_ok=True)  # Ensure output directory exists

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, image in enumerate(images):
            page = pdf.pages[page_num]

            # Convert image to OpenCV format
            open_cv_image = np.array(image)
            open_cv_image = cv2.cvtColor(open_cv_image, cv2.COLOR_RGB2BGR)
            gray = cv2.cvtColor(open_cv_image, cv2.COLOR_BGR2GRAY)

            # **Detect Tables with pdfplumber**
            tables = page.extract_tables()
            table_paths = []

            if tables:
                for i, table in enumerate(tables):
                    table_path = os.path.join(output_folder, f"page_{page_num+1}_table_{i+1}.png")

                    # Convert table to an image using PIL
                    table_img = Image.new("RGB", (1000, 30 * len(table)), "white")
                    draw = ImageDraw.Draw(table_img)

                    for row_idx, row in enumerate(table):
                        row_text = " | ".join(str(cell) if cell is not None else "" for cell in row)
                        draw.text((10, 30 * row_idx), row_text, fill="black")

                    table_img.save(table_path, "PNG")
                    table_paths.append(table_path)

            # **Detect Figures with OpenCV**
            edges = cv2.Canny(gray, 50, 150)  # Detect edges
            contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

            figure_paths = []
            figure_count = 0

            for contour in contours:
                x, y, w, h = cv2.boundingRect(contour)
                area = w * h
                aspect_ratio = w / h if h > 0 else 0  # Prevent division by zero

                # **Ignore headers and logos** (small, high-up elements)
                if area < max_logo_area and y < header_threshold:
                    continue  

                # **Ignore signatures** (long, narrow elements at the bottom)
                if signature_aspect_ratio[0] < aspect_ratio < signature_aspect_ratio[1] and h < signature_max_height:
                    continue  

                if area > min_contour_area:  
                    figure_image = image.crop((x, y, x + w, y + h))
                    figure_path = os.path.join(output_folder, f"page_{page_num+1}_figure_{figure_count+1}.png")
                    figure_image.save(figure_path, "PNG")
                    figure_paths.append(figure_path)
                    figure_count += 1

            extracted_data[f"Page {page_num+1}"] = {
                "figures": figure_paths,
                "tables": table_paths
            }

    return extracted_data

# **Example Usage**
pdf_path = "/Users/lishuyao/Documents/NUS/MODS/Y3S2/Capstone/ODPRT-chatbot/processed_docs/emails_with_attachments/Agreement Type 01-01/processed_attachments/2018-1323 PA NUS-HDB (Fully Executed)_processed_processed.pdf"
output_folder = "data/extracted_content"

extracted_data = extract_clean_images_from_pdf(pdf_path, output_folder)

print("✅ Extracted files:", extracted_data)


✅ Extracted files: {'Page 1': {'figures': [], 'tables': []}, 'Page 2': {'figures': [], 'tables': []}, 'Page 3': {'figures': [], 'tables': []}, 'Page 4': {'figures': ['data/extracted_content/page_4_figure_1.png', 'data/extracted_content/page_4_figure_2.png', 'data/extracted_content/page_4_figure_3.png', 'data/extracted_content/page_4_figure_4.png'], 'tables': []}, 'Page 5': {'figures': ['data/extracted_content/page_5_figure_1.png'], 'tables': []}, 'Page 6': {'figures': ['data/extracted_content/page_6_figure_1.png', 'data/extracted_content/page_6_figure_2.png'], 'tables': []}, 'Page 7': {'figures': ['data/extracted_content/page_7_figure_1.png'], 'tables': []}, 'Page 8': {'figures': [], 'tables': []}, 'Page 9': {'figures': [], 'tables': []}, 'Page 10': {'figures': ['data/extracted_content/page_10_figure_1.png'], 'tables': []}, 'Page 11': {'figures': ['data/extracted_content/page_11_figure_1.png'], 'tables': []}, 'Page 12': {'figures': ['data/extracted_content/page_12_figure_1.png'], 'tabl

In [16]:
def parse_for_ingestion(files):
    # parser = LlamaParse(
    # api_key="(Your API key here)",
    # result_type="markdown"
    # )

    store = {}
    # documents = await parser.aload_data('/sample report')
    
    for fname in files:
        if fname.endswith('.pdf'):
            print("found pdf")
            text = extract_text_from_pdf(fname)
            tables = extract_tables_from_pdf(fname)
            images = extract_images_from_pdf(fname)
            store[fname] = {"text": text, "tables": tables, "images": images} 
    return store

parse_for_ingestion(["1810.04805v2.pdf"])

found pdf


NameError: name 'extract_images_from_pdf' is not defined

In [1]:
! pip install markitdown

Collecting markitdown
  Obtaining dependency information for markitdown from https://files.pythonhosted.org/packages/5f/24/f2de79bc50c82d63d243834b67af4ed3ae8b8bf71652aecc6118d4d1a306/markitdown-0.0.1a4-py3-none-any.whl.metadata
  Downloading markitdown-0.0.1a4-py3-none-any.whl.metadata (8.1 kB)
Collecting azure-ai-documentintelligence (from markitdown)
  Obtaining dependency information for azure-ai-documentintelligence from https://files.pythonhosted.org/packages/84/a8/c9c66d4d04b8aee06ebdc9a6077736b222b9b2fe92364fed6f9a1c08ece0/azure_ai_documentintelligence-1.0.0-py3-none-any.whl.metadata
  Downloading azure_ai_documentintelligence-1.0.0-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.3/51.3 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting azure-identity (from markitdown)
  Obtaining dependency information for azure-identity from https://files.pythonhosted.org/packages/de/aa/819513c1dbef990af690bb5eefb5e337f8698d75df

In [8]:
from markitdown import MarkItDown
# md = MarkItDown(docintel_endpoint="<document_intelligence_endpoint>")
# result = md.convert("1810.04805v2.pdf")
# print(result.text_content)
markdown_file = "output.md"
md = MarkItDown() # Set to True to enable plugins
result = md.convert("1810.04805v2.pdf")
with open(markdown_file, "w", encoding="utf-8") as md_file:
    md_file.write(result.text_content)

In [9]:
import mammoth

with open("1810.04805v2.pdf", "rb") as docx_file:

    result = mammoth.convert_to_markdown(docx_file)

    markdown_text = result["value"]

    print(markdown_text) 

BadZipFile: File is not a zip file

In [11]:
import pdfplumber
import pdfminer
from pdfminer.high_level import extract_text
from PIL import Image
import fitz  # PyMuPDF
import io
import os

def extract_text_from_pdf(pdf_path):
    """Extract text from PDF"""
    text = extract_text(pdf_path)
    return f"<text>\n{text.strip()}\n</text>\n\n"

def extract_tables_from_pdf(pdf_path):
    """Extract tables from PDF and convert them to Markdown"""
    tables_md = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            tables = page.extract_tables()
            for table_index, table in enumerate(tables, start=1):
                tables_md += f"<tbl>\n"
                for row in table:
                    row_md = "| " + " | ".join(str(cell) if cell else " " for cell in row) + " |"
                    tables_md += row_md + "\n"
                tables_md += "</tbl>\n\n"
    return tables_md

def extract_images_from_pdf(pdf_path, output_folder="extracted_images"):
    """Extract images from PDF and save them"""
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    images_md = ""
    doc = fitz.open(pdf_path)
    for page_num in range(len(doc)):
        for img_index, img in enumerate(doc[page_num].get_images(full=True), start=1):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            image_filename = f"{output_folder}/image_{page_num+1}_{img_index}.{image_ext}"
            
            with open(image_filename, "wb") as f:
                f.write(image_bytes)

            images_md += f"<img>{image_filename}</img>\n\n"

    return images_md

def convert_pdf_to_markdown(pdf_path, output_md):
    """Convert PDF to Markdown format with relevant tags"""
    markdown_content = ""

    # Extract text
    markdown_content += extract_text_from_pdf(pdf_path)

    # Extract tables
    markdown_content += extract_tables_from_pdf(pdf_path)

    markdown_content += extract_images_from_pdf(pdf_path)

    # Save to Markdown file
    with open(output_md, "w", encoding="utf-8") as md_file:
        md_file.write(markdown_content)

    print(f"Markdown file saved as {output_md}")

pdf_file = "1810.04805v2.pdf"  # Replace with your PDF file path
markdown_file = "output.md"
convert_pdf_to_markdown(pdf_file, markdown_file)

Markdown file saved as output.md


In [26]:
import pdfplumber
import fitz  # PyMuPDF for rendering pages as images
from pdf2image import convert_from_path
import os

def extract_text_from_pdf(pdf_path, output_text_file):
    """Extracts text from PDF and saves it to a text file."""
    extracted_text = []
    
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                extracted_text.append(text)

    text_content = "\n\n".join(extracted_text)
    
    with open(output_text_file, "w", encoding="utf-8") as file:
        file.write(text_content)

    return output_text_file

def extract_tables_and_figures(pdf_path, output_folder):
    """Takes screenshots of figures and tables and saves them as image files."""
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    pdf_document = fitz.open(pdf_path)
    image_paths = []

    for page_num in range(len(pdf_document)):
        page = pdf_document[page_num]
        pix = page.get_pixmap(dpi=300)  # High-resolution image
        img_path = os.path.join(output_folder, f"page_{page_num + 1}.png")
        
        pix.save(img_path)
        image_paths.append(img_path)

    return image_paths

def process_pdf(pdf_path):
    """Extracts text and captures tables/figures as images from the PDF."""
    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_folder = f"{base_name}_output"
    
    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)
    
    # Paths for extracted content
    text_file_path = os.path.join(output_folder, f"{base_name}_text.txt")
    image_output_folder = os.path.join(output_folder, "images")

    # Extract text
    text_file = extract_text_from_pdf(pdf_path, text_file_path)

    # Extract images (tables/figures)
    image_files = extract_tables_and_figures(pdf_path, image_output_folder)

    return {
        # "text_file": text_file,
        "image_files": image_files
    }

# Example Usage
pdf_path = "1810.04805v2.pdf"  # Replace with your PDF file path
result = process_pdf(pdf_path)

# Output
# print("Extracted Text File:", result["text_file"])
print("Extracted Images:", result["image_files"])


Extracted Images: ['1810.04805v2_output/images/page_1.png', '1810.04805v2_output/images/page_2.png', '1810.04805v2_output/images/page_3.png', '1810.04805v2_output/images/page_4.png', '1810.04805v2_output/images/page_5.png', '1810.04805v2_output/images/page_6.png', '1810.04805v2_output/images/page_7.png', '1810.04805v2_output/images/page_8.png', '1810.04805v2_output/images/page_9.png', '1810.04805v2_output/images/page_10.png', '1810.04805v2_output/images/page_11.png', '1810.04805v2_output/images/page_12.png', '1810.04805v2_output/images/page_13.png', '1810.04805v2_output/images/page_14.png', '1810.04805v2_output/images/page_15.png', '1810.04805v2_output/images/page_16.png']


In [13]:
from unstructured.partition.auto import partition

elements = partition(filename="1810.04805v2.pdf")
print("\n\n".join([str(el) for el in elements]))

ModuleNotFoundError: No module named 'pi_heif'

In [6]:
! pip install tesseract



In [3]:
import pytesseract
from PIL import Image
import cv2
import numpy as np
import pdfplumber

# Load the image
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
image_path = "1810.04805v2_output/images/page_9.png"
image = Image.open(image_path)

# Convert image to grayscale for better OCR accuracy
gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)

# Use OCR to extract text
extracted_text = pytesseract.image_to_string(gray)

# Use OCR to detect tables as well
extracted_tables = pytesseract.image_to_data(gray, output_type=pytesseract.Output.DICT)

def save_markdown(text, tables, output_md):
    """Save extracted text and tables in a Markdown file"""
    markdown_content = ""

    # Add text section
    markdown_content += f"<text>\n{text.strip()}\n</text>\n\n"

    # Add table section
    for table in tables:
        markdown_content += "<tbl>\n"
        for row in table:
            row_md = "| " + " | ".join(str(cell) if cell else " " for cell in row) + " |"
            markdown_content += row_md + "\n"
        markdown_content += "</tbl>\n\n"

    # Save to Markdown file
    with open(output_md, "w", encoding="utf-8") as md_file:
        md_file.write(markdown_content)

    print(f"Markdown file saved as {output_md}")

TesseractNotFoundError: /usr/bin/tesseract is not installed or it's not in your PATH. See README file for more information.

In [8]:
! pip install pytesseract



In [None]:
import pytesseract
from PIL import Image
import cv2
import numpy as np
import pdfplumber

# Load the image
pytesseract.pytesseract.tesseract_cmd = "/opt/homebrew/bin/tesseract"
image_path = "1810.04805v2_output/images/page_9.png"
image = Image.open(image_path)

# Convert image to grayscale for better OCR accuracy
gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)

# Use OCR to extract text
extracted_text = pytesseract.image_to_string(gray)

# Use OCR to detect tables as well
extracted_tables = pytesseract.image_to_data(gray, output_type=pytesseract.Output.DICT)

def save_markdown(text, tables, output_md):
    """Save extracted text and tables in a Markdown file"""
    markdown_content = ""

    # Add text section
    markdown_content += f"<text>\n{text.strip()}\n</text>\n\n"

    # Add table section
    for table in tables:
        markdown_content += "<tbl>\n"
        for row in table:
            row_md = "| " + " | ".join(str(cell) if cell else " " for cell in row) + " |"
            markdown_content += row_md + "\n"
        markdown_content += "</tbl>\n\n"

    # Save to Markdown file
    with open(output_md, "w", encoding="utf-8") as md_file:
        md_file.write(markdown_content)

    print(f"Markdown file saved as {output_md}")

output_markdown_file = "output.md"
save_markdown(extracted_text, extracted_tables,output_markdown_file)


{'level': [1,
  2,
  3,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  2,
  3,
  4,
  5,
  5,
  5,
  5,
  5,
  2,
  3,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
 

In [14]:
import pytesseract
from PIL import Image
import cv2
import numpy as np
import pdfplumber
import os

# Set Tesseract Path (Modify this if needed)
pytesseract.pytesseract.tesseract_cmd = "/opt/homebrew/bin/tesseract"

def extract_text_from_pdf(pdf_path):
    """Extracts text from all pages of a PDF using pdfplumber."""
    extracted_text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            text = page.extract_text()
            if text:
                extracted_text += f"\n<text>\nPage {i+1}\n{text.strip()}\n</text>\n\n"
    return extracted_text

def extract_tables_from_pdf(pdf_path):
    """Extracts tables from a PDF and converts them to Markdown."""
    tables_md = ""
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            tables = page.extract_tables()
            for table in tables:
                tables_md += f"\n<tbl>\nPage {i+1}\n"
                for row in table:
                    row_md = "| " + " | ".join(str(cell) if cell else " " for cell in row) + " |"
                    tables_md += row_md + "\n"
                tables_md += "</tbl>\n\n"
    return tables_md

def save_markdown(pdf_path, output_md):
    """Processes the entire PDF and saves the extracted text and tables in Markdown format."""
    markdown_content = ""

    # Extract text
    markdown_content += extract_text_from_pdf(pdf_path)

    # Extract tables
    markdown_content += extract_tables_from_pdf(pdf_path)

    # Save to Markdown file
    with open(output_md, "w", encoding="utf-8") as md_file:
        md_file.write(markdown_content)

    print(f"Markdown file saved as {output_md}")

# Example Usage
pdf_file = "1810.04805v2.pdf"  # Replace with your PDF file path
output_markdown_file = "output.md"
save_markdown(pdf_file, output_markdown_file)


Markdown file saved as output.md


In [None]:
from img2table.document import Image

# Instantiation of the image
img = Image(src="1810.04805v2_output/images/page_9.png")

# Table identification
imgage_tables = img.extract_tables()

# Result of table identification
imgage_tables

In [2]:
! pip install img2table



In [None]:
from img2table.document import PDF
from img2table.ocr import TesseractOCR

# Instantiation of the pdf
pdf = PDF(src="1810.04805v2.pdf")
print("building ocr")
# Instantiation of the OCR, Tesseract, which requires prior installation
ocr = TesseractOCR(lang="eng")
print("ocr built")
print("extracting tables")
# Table identification and extraction
pdf_tables = pdf.extract_tables(ocr=ocr)


pdf_tables



In [3]:
! pip install camelot

Collecting camelot
  Downloading Camelot-12.06.29.tar.gz (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting SQLAlchemy<0.8.0,>=0.7.7 (from camelot)
  Downloading SQLAlchemy-0.7.10.tar.gz (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting Elixir>=0.7.1 (from camelot)
  Downloading Elixir-0.7.1.tar.gz (47 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting sqlalchemy-migrate>=0.7.1 (from camelot)
  Downloading sqlalchemy_migrate-0.13.0-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting xlwt==0.7.2 (from camelot)
  Downloading xlwt-0.7.2.zip (131 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting xlrd==0.7.1 (from camelot)
  Downloadin

In [4]:
import camelot
tables = camelot.read_pdf('1810.04805v2.pdf')
tables[0].df

AttributeError: module 'camelot' has no attribute 'read_pdf'

In [2]:
def parse_user_files(fname):
    # handle different file types
    if fname.endswith('.csv'):
        pass
    elif fname.endswith('.xml'):
        pass
    elif fname.endswith('.pptx'):
        pass
    elif fname.endswith('.docx'):
        pass
    elif fname.endswith('.pdf'):
        pass
    elif fname.endswith('.txt'):
        pass

In [None]:
from IPython.display import display
import sys
import os

# Get the absolute path of the project root
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Add the root directory to sys.path
sys.path.append(project_root)

from chatbot.backend.document_parser.document_parser import DocumentParser
docParser = DocumentParser()
extracted_images = docParser.extract_images_from_pdf('/Users/lishuyao/Documents/NUS/MODS/Y3S2/Capstone/ODPRT-chatbot/notebooks/1810.04805v2.pdf')
print("image extracted")
for img in extracted_images:
    print("displaying image")
    display(img)  # Displays images inline in Jupyter Notebook

In [9]:
import pdfplumber
def extract_text_from_pdf(file_path: str):
    """Extracts text from a PDF file."""
    extracted_text = ""
    
    with pdfplumber.open(file_path) as pdf:
        for i, page in enumerate(pdf.pages):
            text = page.extract_text()
            if text:
                print(f"✅ Extracted Text from Page {i+1}:\n{text}\n")  # Debug print
                extracted_text += f"\nPage {i+1}\n{text.strip()}\n\n"
            else:
                print(f"❌ No text found on Page {i+1}")

    if not extracted_text:
        print("⚠️ No text extracted from the entire PDF. It might contain images instead of text.")
    
    return extracted_text

extract_text_from_pdf("/Users/lishuyao/Documents/NUS/MODS/Y3S2/Capstone/ODPRT-chatbot/processed_docs/emails_with_attachments/Agreement Type 01-01/processed_attachments/Extension and  Virement Request for JSA Polder project_NUS_processed_processed.pdf")

❌ No text found on Page 1
❌ No text found on Page 2
❌ No text found on Page 3
⚠️ No text extracted from the entire PDF. It might contain images instead of text.


''

In [10]:
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
from PIL import Image

def extract_text_from_scanned_pdf(file_path: str):
    """Extracts text from a scanned PDF using OCR."""
    extracted_text = ""

    # Convert PDF pages to images
    images = convert_from_path(file_path, poppler_path="/opt/homebrew/bin")  # Adjust poppler_path if needed

    for i, image in enumerate(images):
        text = pytesseract.image_to_string(image)
        extracted_text += f"\nPage {i+1}\n{text.strip()}\n\n"
        print(f"✅ Extracted OCR Text from Page {i+1}:\n{text}")

    return extracted_text

# Run OCR
extract_text_from_scanned_pdf("/Users/lishuyao/Documents/NUS/MODS/Y3S2/Capstone/ODPRT-chatbot/processed_docs/emails_with_attachments/Agreement Type 01-01/processed_attachments/Extension and  Virement Request for JSA Polder project_NUS_processed_processed.pdf")


✅ Extracted OCR Text from Page 1:
College of Design and Engineering
Dean’s Office

NUS

National University
of Singapore

14" Dec 2023

To: Mr. Tan Yeow Cheong
Director (Land Reclamation)
Housing & Development Board

Subject: Proposed Extension and Fund Virement of the JSA (Polder) Project

Dear Mr Tan,

lam writing to formally request an extension of the project ending date for the Joint Study on
Advanced Geohydrological and Geotechnical Instrumentations for the Construction of
Polder (JSA Polder Project, WBS no: A-0005466-01-00), which has been a vital joint-study
between Housing & Development Board (HDB) and National University of Singapore (NUS)
since 2019.

The letter will outline the reasons for extending the project beyond its original timeline, as
detailed below, as well as the reasons for the funding virement.

Original Project Timeline: 2"4 January 2019 to 2"! January 2024
Proposed new project end date: 315t December 2024

A. Justification for the Project Extension:

1. Testi

'\nPage 1\nCollege of Design and Engineering\nDean’s Office\n\nNUS\n\nNational University\nof Singapore\n\n14" Dec 2023\n\nTo: Mr. Tan Yeow Cheong\nDirector (Land Reclamation)\nHousing & Development Board\n\nSubject: Proposed Extension and Fund Virement of the JSA (Polder) Project\n\nDear Mr Tan,\n\nlam writing to formally request an extension of the project ending date for the Joint Study on\nAdvanced Geohydrological and Geotechnical Instrumentations for the Construction of\nPolder (JSA Polder Project, WBS no: A-0005466-01-00), which has been a vital joint-study\nbetween Housing & Development Board (HDB) and National University of Singapore (NUS)\nsince 2019.\n\nThe letter will outline the reasons for extending the project beyond its original timeline, as\ndetailed below, as well as the reasons for the funding virement.\n\nOriginal Project Timeline: 2"4 January 2019 to 2"! January 2024\nProposed new project end date: 315t December 2024\n\nA. Justification for the Project Extension:\n\

In [21]:
import os
import cv2
import numpy as np
from pdf2image import convert_from_path
from PIL import Image

def extract_useful_images_from_pdf(file_path: str, min_contour_area=10_000):
    """
    Extracts useful images from a PDF file, ignoring logos, headers, and small graphics.

    Args:
        file_path (str): Path to the PDF file.
        min_contour_area (int): Minimum contour area to consider an object as a useful figure.

    Returns:
        dict: A dictionary containing extracted figures per page.
    """
    save_directory = os.path.join(os.getcwd(), "extracted_images")
    os.makedirs(save_directory, exist_ok=True)

    poppler_path = "/opt/homebrew/bin"  # Adjust if needed
    images = convert_from_path(file_path, poppler_path=poppler_path)
    extracted_figures = {}

    for page_num, image in enumerate(images):
        try:
            if image is None:
                print(f"Skipping Page {page_num+1} in {file_path} (No image content).")
                continue  # Skip empty pages

            # Convert PIL image to NumPy array (ensure RGB format)
            open_cv_image = np.array(image.convert("RGB"))

            if open_cv_image is None or open_cv_image.size == 0:
                print(f"Skipping Page {page_num+1} in {file_path} (Invalid image data).")
                continue  # Skip invalid images

            # Convert to grayscale for better processing
            gray = cv2.cvtColor(open_cv_image, cv2.COLOR_RGB2GRAY)

            # Apply GaussianBlur to reduce noise
            blurred = cv2.GaussianBlur(gray, (5, 5), 0)

            # Apply adaptive thresholding for better edge detection
            thresh = cv2.adaptiveThreshold(
                blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
            )

            # Detect edges using Canny
            edges = cv2.Canny(thresh, 50, 150)

            # Find contours
            contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

            figure_paths = []
            figure_count = 0

            for contour in contours:
                x, y, w, h = cv2.boundingRect(contour)

                # **🚀 Filtering Step: Ignore small, thin, or too wide/tall images**
                aspect_ratio = w / float(h)  # Width/Height Ratio
                area = cv2.contourArea(contour)

                if area < min_contour_area:
                    continue  # Skip small objects

                if aspect_ratio > 4 or aspect_ratio < 0.2:
                    continue  # Skip banners, headers, and sidebars

                # Extract the figure
                figure_image = image.crop((x, y, x + w, y + h))

                # Save the image
                figure_path = os.path.join(save_directory, f"page_{page_num+1}_figure_{figure_count+1}.png")
                figure_image.save(figure_path, "PNG")
                figure_paths.append(figure_path)

                figure_count += 1

            if figure_paths:
                extracted_figures[f"Page {page_num+1}"] = figure_paths

        except Exception as e:
            print(f"⚠️ Error processing Page {page_num+1} of {file_path}: {e}")

    return extracted_figures

# **Example Usage**
pdf_path = "/Users/lishuyao/Documents/NUS/MODS/Y3S2/Capstone/ODPRT-chatbot/processed_docs/emails_with_attachments/Agreement Type 01-01/processed_attachments/Extension and  Virement Request for JSA Polder project_NUS_processed_processed.pdf"
extracted_images = extract_useful_images_from_pdf(pdf_path)
print(f"Extracted {len(extracted_images)} pages with images.")


Extracted 2 pages with images.


In [46]:
import os
import cv2
import numpy as np
from pdf2image import convert_from_path
from PIL import Image

def extract_meaningful_images_from_pdf(file_path: str, min_contour_area=15_000):
    """
    Extracts useful images from a PDF file, filtering out logos, headers, and small graphics.

    Args:
        file_path (str): Path to the PDF file.
        min_contour_area (int): Minimum contour area to consider an object as useful.

    Returns:
        dict: A dictionary containing extracted figures per page.
    """
    save_directory = os.path.join(os.getcwd(), "filtered_extracted_images")
    os.makedirs(save_directory, exist_ok=True)

    poppler_path = "/opt/homebrew/bin"  # Adjust if needed
    images = convert_from_path(file_path, poppler_path=poppler_path, dpi=300)
    extracted_figures = {}

    for page_num, image in enumerate(images):
        try:
            if image is None:
                print(f"Skipping Page {page_num+1} in {file_path} (No image content).")
                continue

            # Convert PIL image to NumPy array (RGB)
            open_cv_image = np.array(image.convert("RGB"))

            if open_cv_image is None or open_cv_image.size == 0:
                print(f"Skipping Page {page_num+1} in {file_path} (Invalid image data).")
                continue  # Skip invalid images

            # Convert to grayscale and blur for noise reduction
            gray = cv2.cvtColor(open_cv_image, cv2.COLOR_RGB2GRAY)
            blurred = cv2.GaussianBlur(gray, (5, 5), 0)

            # Adaptive thresholding to enhance edges
            thresh = cv2.adaptiveThreshold(
                blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
            )

            # Edge detection with Canny
            edges = cv2.Canny(thresh, 50, 150)

            # Find contours
            contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

            figure_paths = []
            figure_count = 0

            for contour in contours:
                x, y, w, h = cv2.boundingRect(contour)

                # 🚀 **Filtering Step: Ignore unwanted shapes**
                aspect_ratio = w / float(h)
                area = cv2.contourArea(contour)

                if area < min_contour_area:
                    continue  # Ignore small objects

                if aspect_ratio > 4 or aspect_ratio < 0.25:
                    continue  # Ignore banners, sidebars, and overly narrow/wide objects

                # Convert to color space to check for monochrome images (logos, watermarks)
                roi = open_cv_image[y:y + h, x:x + w]
                mean_color = np.mean(roi, axis=(0, 1))  # Average color value

                # If an image has very low color variance, it's likely a watermark/logo → Ignore it
                if np.std(mean_color) < 15:
                    continue

                # Extract the figure
                figure_image = image.crop((x, y, x + w, y + h))

                # Save the image
                figure_path = os.path.join(save_directory, f"page_{page_num+1}_figure_{figure_count+1}.png")
                figure_image.save(figure_path, "PNG")
                figure_paths.append(figure_path)

                figure_count += 1

            if figure_paths:
                extracted_figures[f"Page {page_num+1}"] = figure_paths

        except Exception as e:
            print(f"⚠️ Error processing Page {page_num+1} of {file_path}: {e}")

    return extracted_figures

# **Example Usage**
pdf_path = "/Users/lishuyao/Documents/NUS/MODS/Y3S2/Capstone/ODPRT-chatbot/processed_docs/emails_with_attachments/Agreement Type 01-01/processed_attachments/Extension and  Virement Request for JSA Polder project_NUS_processed_processed.pdf"
extracted_images = extract_meaningful_images_from_pdf(pdf_path)
print(f"✅ Extracted {len(extracted_images)} pages with useful images.")


✅ Extracted 2 pages with useful images.


In [None]:
"""USE THIS METHOD"""
import os
import cv2
import numpy as np
from pdf2image import convert_from_path
from PIL import Image

def extract_tables_and_figures_from_pdf(file_path: str, min_contour_area=10_000):
    """
    Extracts meaningful images and tables from a PDF file, treating tables as full images.

    Args:
        file_path (str): Path to the PDF file.
        min_contour_area (int): Minimum contour area to consider an object useful.

    Returns:
        dict: A dictionary containing extracted figures and tables per page.
    """
    save_directory = os.path.join(os.getcwd(), "extracted_images")
    os.makedirs(save_directory, exist_ok=True)

    poppler_path = "/opt/homebrew/bin"  # Adjust if needed
    images = convert_from_path(file_path, poppler_path=poppler_path, dpi=300)
    extracted_elements = {}

    for page_num, image in enumerate(images):
        try:
            if image is None:
                print(f"Skipping Page {page_num+1} in {file_path} (No image content).")
                continue

            # Convert PIL image to NumPy array (RGB)
            open_cv_image = np.array(image.convert("RGB"))

            if open_cv_image is None or open_cv_image.size == 0:
                print(f"Skipping Page {page_num+1} in {file_path} (Invalid image data).")
                continue

            # Convert to grayscale and apply Gaussian blur for noise reduction
            gray = cv2.cvtColor(open_cv_image, cv2.COLOR_RGB2GRAY)
            blurred = cv2.GaussianBlur(gray, (5, 5), 0)

            # Adaptive thresholding for better table & figure detection
            thresh = cv2.adaptiveThreshold(
                blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 15, 4
            )

            # Detect horizontal and vertical lines (to detect tables)
            kernel_horizontal = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 1))
            kernel_vertical = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 25))

            horizontal_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel_horizontal)
            vertical_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel_vertical)

            # Combine horizontal and vertical lines to detect full tables
            table_mask = cv2.add(horizontal_lines, vertical_lines)

            # Edge detection
            edges = cv2.Canny(thresh, 50, 150)

            # Find contours (for both tables and figures)
            contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
            figure_paths = []
            table_paths = []
            figure_count, table_count = 0, 0

            for contour in contours:
                x, y, w, h = cv2.boundingRect(contour)
                area = cv2.contourArea(contour)
                aspect_ratio = w / float(h)

                # 🚀 **Table Detection**: Looks for grid-like structures
                if cv2.countNonZero(table_mask[y:y+h, x:x+w]) > 0.5 * area:
                    if area < min_contour_area:
                        continue  # Ignore small detected tables
                    table_image = image.crop((x, y, x + w, y + h))
                    table_path = os.path.join(save_directory, f"page_{page_num+1}_table_{table_count+1}.png")
                    table_image.save(table_path, "PNG")
                    table_paths.append(table_path)
                    table_count += 1
                    continue  # Skip further processing for tables

                # 🚀 **Figure Detection**: Looks for large, meaningful images
                if area > min_contour_area and 0.3 < aspect_ratio < 3:
                    figure_image = image.crop((x, y, x + w, y + h))
                    
                    figure_path = os.path.join(save_directory, f"page_{page_num+1}_figure_{figure_count+1}.png")
                    figure_image.save(figure_path, "PNG")
                    figure_paths.append(figure_path)
                    figure_count += 1

            if figure_paths or table_paths:
                extracted_elements[f"Page {page_num+1}"] = {
                    "figures": figure_paths,
                    "tables": table_paths
                }

        except Exception as e:
            print(f"⚠️ Error processing Page {page_num+1} of {file_path}: {e}")

    return extracted_elements

# **Example Usage**
extracted_data = extract_tables_and_figures_from_pdf(pdf_path)
print(f"✅ Extracted {len(extracted_data)} pages with useful images and tables.")





✅ Extracted 3 pages with useful images and tables.


In [18]:
import os
import cv2
import numpy as np
from pdf2image import convert_from_path
from PIL import Image

def extract_tables_and_figures_from_pdf(file_path: str, min_contour_area=10_000, max_logo_area=50_000):
    """
    Extracts tables and figures from a PDF, treating tables as whole images and avoiding logos.

    Args:
        file_path (str): Path to the PDF file.
        min_contour_area (int): Minimum contour area to consider an object useful.
        max_logo_area (int): Maximum area for logos and small non-useful images.

    Returns:
        dict: A dictionary containing extracted tables and figures per page.
    """
    save_directory = os.path.join(os.getcwd(), "extracted_images")
    os.makedirs(save_directory, exist_ok=True)

    poppler_path = "/opt/homebrew/bin"  # Adjust if needed
    images = convert_from_path(file_path, poppler_path=poppler_path, dpi=300)
    extracted_elements = {}

    for page_num, image in enumerate(images):
        try:
            if image is None:
                print(f"Skipping Page {page_num+1} in {file_path} (No image content).")
                continue

            # Convert PIL image to NumPy array (RGB)
            open_cv_image = np.array(image.convert("RGB"))

            if open_cv_image is None or open_cv_image.size == 0:
                print(f"Skipping Page {page_num+1} in {file_path} (Invalid image data).")
                continue

            # Convert to grayscale
            gray = cv2.cvtColor(open_cv_image, cv2.COLOR_RGB2GRAY)

            # Adaptive thresholding to enhance table structure
            thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                           cv2.THRESH_BINARY_INV, 15, 4)

            # Morphological operations to detect lines (tables)
            kernel_horizontal = cv2.getStructuringElement(cv2.MORPH_RECT, (50, 1))
            kernel_vertical = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 50))
            horizontal_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel_horizontal)
            vertical_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel_vertical)
            table_mask = cv2.add(horizontal_lines, vertical_lines)

            # Edge detection
            edges = cv2.Canny(gray, 50, 150)

            # Find contours (tables and figures)
            contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
            figure_paths = []
            table_paths = []
            figure_count, table_count = 0, 0

            for contour in contours:
                x, y, w, h = cv2.boundingRect(contour)
                area = cv2.contourArea(contour)
                aspect_ratio = w / float(h)

                # **Skip small logos and noise**
                if area < max_logo_area and aspect_ratio < 2 and y < 100:
                    continue  # Likely a logo or header

                # **Table Detection**: Uses structured grid-like features
                if cv2.countNonZero(table_mask[y:y+h, x:x+w]) > 0.5 * area:
                    if area < min_contour_area:
                        continue  # Ignore small detected tables
                    table_image = image.crop((x, y, x + w, y + h))
                    table_path = os.path.join(save_directory, f"page_{page_num+1}_table_{table_count+1}.png")
                    table_image.save(table_path, "PNG")
                    table_paths.append(table_path)
                    table_count += 1
                    continue  # Skip further processing for tables

                # **Figure Detection**: Captures meaningful large figures
                if area > min_contour_area and 0.3 < aspect_ratio < 3:
                    figure_image = image.crop((x, y, x + w, y + h))
                    figure_path = os.path.join(save_directory, f"page_{page_num+1}_figure_{figure_count+1}.png")
                    figure_image.save(figure_path, "PNG")
                    figure_paths.append(figure_path)
                    figure_count += 1

            if figure_paths or table_paths:
                extracted_elements[f"Page {page_num+1}"] = {
                    "figures": figure_paths,
                    "tables": table_paths
                }

        except Exception as e:
            print(f"⚠️ Error processing Page {page_num+1} of {file_path}: {e}")

    return extracted_elements

# **Example Usage**
extracted_data = extract_tables_and_figures_from_pdf(pdf_path)
print(f"✅ Extracted {len(extracted_data)} pages with useful images and tables.")


✅ Extracted 3 pages with useful images and tables.


In [44]:
import os
import cv2
import numpy as np
from pdf2image import convert_from_path
from PIL import Image

def extract_tables_and_figures_from_pdf(file_path: str, min_table_area=20_000, min_figure_area=20_000, max_logo_area=60_000):
    """
    Extracts tables and figures from a PDF while avoiding logos.

    Args:
        file_path (str): Path to the PDF file.
        min_table_area (int): Minimum area for a table to be considered valid.
        min_figure_area (int): Minimum area for a figure to be considered valid.
        max_logo_area (int): Maximum area for logos (small decorations).

    Returns:
        dict: A dictionary containing extracted tables and figures per page.
    """
    save_directory = os.path.join(os.getcwd(), "extracted_images")
    os.makedirs(save_directory, exist_ok=True)

    poppler_path = "/opt/homebrew/bin"  # Adjust if needed
    images = convert_from_path(file_path, poppler_path=poppler_path, dpi=300)
    extracted_elements = {}

    for page_num, image in enumerate(images):
        try:
            if image is None:
                print(f"Skipping Page {page_num+1} (No image content).")
                continue

            # Convert PIL image to NumPy array (RGB)
            open_cv_image = np.array(image.convert("RGB"))

            if open_cv_image is None or open_cv_image.size == 0:
                print(f"Skipping Page {page_num+1} (Invalid image data).")
                continue

            # Convert to grayscale
            gray = cv2.cvtColor(open_cv_image, cv2.COLOR_RGB2GRAY)

            # **Step 1: Table Detection**
            # Apply adaptive thresholding to enhance table structure
            thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                           cv2.THRESH_BINARY_INV, 15, 4)

            # Use Hough Line Transform to detect grid structures (tables)
            edges = cv2.Canny(thresh, 50, 150)
            lines = cv2.HoughLinesP(edges, 1, np.pi / 180, threshold=100, minLineLength=100, maxLineGap=5)

            table_mask = np.zeros_like(gray)
            if lines is not None:
                for line in lines:
                    x1, y1, x2, y2 = line[0]
                    cv2.line(table_mask, (x1, y1), (x2, y2), 255, 2)  # Draw detected table lines

            # **Step 2: Contour Detection (Tables & Figures)**
            contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
            figure_paths = []
            table_paths = []
            figure_count, table_count = 0, 0

            for contour in contours:
                x, y, w, h = cv2.boundingRect(contour)
                area = cv2.contourArea(contour)
                aspect_ratio = w / float(h)

                # **Ignore logos & small decorative elements**
                if area < max_logo_area and y < 100:
                    continue  # Likely a logo/header

                # **Table Detection: Uses structured grid-like features**
                if cv2.countNonZero(table_mask[y:y+h, x:x+w]) > 0.4 * area:
                    if area < min_table_area:
                        continue  # Ignore small detected tables
                    table_image = image.crop((x, y, x + w, y + h))
                    table_path = os.path.join(save_directory, f"page_{page_num+1}_table_{table_count+1}.png")
                    table_image.save(table_path, "PNG")
                    table_paths.append(table_path)
                    table_count += 1
                    continue  # Skip further processing for tables

                # **Figure Detection: Captures meaningful figures**
                if area > min_figure_area and 0.3 < aspect_ratio < 3:
                    figure_image = image.crop((x, y, x + w, y + h))
                    figure_path = os.path.join(save_directory, f"page_{page_num+1}_figure_{figure_count+1}.png")
                    figure_image.save(figure_path, "PNG")
                    figure_paths.append(figure_path)
                    figure_count += 1

            if figure_paths or table_paths:
                extracted_elements[f"Page {page_num+1}"] = {
                    "figures": figure_paths,
                    "tables": table_paths
                }

        except Exception as e:
            print(f"⚠️ Error processing Page {page_num+1}: {e}")

    return extracted_elements

# **Example Usage**
extracted_data = extract_tables_and_figures_from_pdf(pdf_path)
print(f"✅ Extracted {len(extracted_data)} pages with useful images and tables.")


✅ Extracted 1 pages with useful images and tables.


In [25]:
import os
import pdfplumber
from PIL import Image

def extract_tables_and_figures_pdfimage(file_path: str, min_figure_area=20_000, max_logo_area=60_000):
    """
    Extracts tables and figures as embedded images from a PDF while avoiding logos.

    Args:
        file_path (str): Path to the PDF file.
        min_figure_area (int): Minimum area for a figure to be considered valid.
        max_logo_area (int): Maximum area for logos (small decorations).

    Returns:
        dict: A dictionary containing extracted tables and figures per page.
    """
    save_directory = os.path.join(os.getcwd(), "extracted_images")
    os.makedirs(save_directory, exist_ok=True)

    extracted_elements = {}

    with pdfplumber.open(file_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            figures = []
            tables = []
            
            # Extract all images embedded in the PDF
            for img_index, img in enumerate(page.images):
                x, y, w, h = img["x0"], img["top"], img["x1"] - img["x0"], img["bottom"] - img["top"]
                area = w * h

                # **Ignore logos & small decorative elements**
                if area < max_logo_area and y < 100:
                    continue  # Likely a logo/header

                # **Convert `PageImage` to a PIL Image**
                image_obj = page.to_image().image  # Extracts PIL image
                cropped_img = image_obj.crop((x, y, x + w, y + h))  # Crop to bounding box

                # **Save the extracted image**
                image_path = os.path.join(save_directory, f"page_{page_num+1}_img_{img_index+1}.png")
                cropped_img.save(image_path, "PNG")

                # **Classify as figure or table**
                if area > min_figure_area:
                    figures.append(image_path)
                else:
                    tables.append(image_path)

            # Store extracted tables and figures
            if figures or tables:
                extracted_elements[f"Page {page_num+1}"] = {"figures": figures, "tables": tables}

    return extracted_elements

# **Example Usage**
extracted_data = extract_tables_and_figures_pdfimage(pdf_path)
print(f"✅ Extracted {len(extracted_data)} pages with useful images and tables.")


AttributeError: 'PageImage' object has no attribute 'image'