## 2 Functions
1. `parse_for_ingestion`: Parsing for ingestion (more specific and intentional)
2. `parse_user_files`: Parsing for user routing (general, not as important)

In [12]:
import nest_asyncio
nest_asyncio.apply()
from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader
import fitz
import pdfplumber
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
import os
import cv2

### Helper Functions

In [13]:
# for pdfs
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text("text") + "\n"
    return text

def extract_tables_from_pdf(pdf_path):
    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            tables.extend(page.extract_tables())
    return tables



def extract_images_from_pdf(pdf_path, output_dir='./extracted_images/'):
    """
    Extracts images from a PDF and saves them to the specified directory.
    Returns a list of paths to the extracted images.
    """
    os.makedirs(output_dir, exist_ok=True)
    image_paths = []
    pdf_document = fitz.open(pdf_path)
    
    for page_num in range(len(pdf_document)):
        page = pdf_document.load_page(page_num)
        image_list = page.get_images(full=True)
        
        for img_index, img_info in enumerate(image_list):
            xref = img_info[0]  # Image cross-reference ID
            base_image = pdf_document.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            
            # Generate a unique filename
            image_filename = f"image_page{page_num + 1}_index{img_index}.{image_ext}"
            image_path = os.path.join(output_dir, image_filename)
            
            # Save the image
            with open(image_path, "wb") as img_file:
                img_file.write(image_bytes)
            
            image_paths.append(image_path)
    
    pdf_document.close()
    return image_paths

# def extract_tables_as_images(pdf_path, output_dir='./extracted_tables/'):
#     """
#     Extracts tables from a PDF, renders them as images, and saves them.
#     Returns a list of paths to the saved table images.
#     """
#     os.makedirs(output_dir, exist_ok=True)
#     table_image_paths = []
    
#     with pdfplumber.open(pdf_path) as pdf:
#         for page_num, page in enumerate(pdf.pages):
#             tables = page.extract_tables()
            
#             if tables:
#                 for table_index, table in enumerate(tables):
#                     # Render the table as an image
#                     table_image = page.to_image().debug_tablefinder()
                    
#                     # Save the table as an image
#                     table_filename = f"table_page{page_num + 1}_index{table_index}.png"
#                     table_path = os.path.join(output_dir, table_filename)
#                     table_image.save(table_path)
                    
#                     table_image_paths.append(table_path)
    
#     return table_image_paths

# def extract_images_and_tables(pdf_path, image_output_dir='./extracted_images/', table_output_dir='./extracted_tables/'):
#     """
#     Extracts both images and tables from a PDF and saves them as images.
#     Returns a dictionary with paths to the extracted images and tables.
#     """
#     images = extract_images_from_pdf(pdf_path, image_output_dir)
#     tables = extract_tables_as_images(pdf_path, table_output_dir)
    
#     return {
#         "images": images,
#         "tables": tables
#     }


def extract_figures_from_pdf(pdf_path, output_folder="extracted_figures", min_contour_area=5000):
    # Create output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Convert PDF pages to images
    images = convert_from_path(pdf_path)
    extracted_image_paths = []

    for page_num, image in enumerate(images):
        # Convert PIL image to OpenCV format
        open_cv_image = np.array(image)
        open_cv_image = cv2.cvtColor(open_cv_image, cv2.COLOR_RGB2BGR)
        
        # Convert to grayscale
        gray = cv2.cvtColor(open_cv_image, cv2.COLOR_BGR2GRAY)
        
        # Apply edge detection
        edges = cv2.Canny(gray, 50, 150)
        
        # Find contours
        contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        
        figure_count = 0
        for contour in contours:
            # Ignore small contours
            if cv2.contourArea(contour) < min_contour_area:
                continue
            
            # Get bounding box of the figure
            x, y, w, h = cv2.boundingRect(contour)
            
            # Crop the detected figure
            figure_image = image.crop((x, y, x + w, y + h))
            
            # Save the extracted figure
            figure_path = os.path.join(output_folder, f"page_{page_num+1}_figure_{figure_count+1}.png")
            figure_image.save(figure_path, "PNG")
            extracted_image_paths.append(figure_path)
            
            figure_count += 1

    return extracted_image_paths

In [25]:
import os
import cv2
import numpy as np
from pdf2image import convert_from_path
import fitz  # PyMuPDF
from PIL import Image

# Define paths
pdf_path = "1810.04805v2.pdf"
output_text_file = "/data/combined_text.txt"
output_figures_folder = "data/extracted_figures"

# Create output directories
os.makedirs(os.path.dirname(output_text_file), exist_ok=True)
os.makedirs(output_figures_folder, exist_ok=True)

# Function to extract text from PDF
def extract_combined_text_from_pdf(pdf_path, output_file_path):
    doc = fitz.open(pdf_path)
    
    with open(output_file_path, "w", encoding="utf-8") as text_file:
        for page_num in range(len(doc)):
            text = doc[page_num].get_text("text")
            text_file.write(f"\n\n--- Page {page_num+1} ---\n\n")
            text_file.write(text)

    return output_file_path

# Function to extract figures from PDF
poppler_path = "/opt/homebrew/bin"  # Adjust this if needed
def extract_figures_from_pdf(pdf_path, output_folder, min_contour_area=5000):
    images = convert_from_path(pdf_path,  poppler_path = poppler_path)
    extracted_figures = {}

    for page_num, image in enumerate(images):
        # Convert image to OpenCV format
        open_cv_image = np.array(image)
        open_cv_image = cv2.cvtColor(open_cv_image, cv2.COLOR_RGB2BGR)

        # Convert to grayscale and apply edge detection
        gray = cv2.cvtColor(open_cv_image, cv2.COLOR_BGR2GRAY)
        edges = cv2.Canny(gray, 50, 150)

        # Find contours
        contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        figure_paths = []
        figure_count = 0

        for contour in contours:
            if cv2.contourArea(contour) < min_contour_area:
                continue

            x, y, w, h = cv2.boundingRect(contour)
            figure_image = image.crop((x, y, x + w, y + h))

            figure_path = os.path.join(output_folder, f"page_{page_num+1}_figure_{figure_count+1}.png")
            figure_image.save(figure_path, "PNG")
            figure_paths.append(figure_path)

            figure_count += 1

        extracted_figures[f"Page {page_num+1}"] = figure_paths

    return extracted_figures

# Extract text and figures
text_data = extract_combined_text_from_pdf(pdf_path, output_text_file)
figure_data = extract_figures_from_pdf(pdf_path, output_figures_folder)

# Combine results in a dictionary
extracted_data = {
    "text": text_data,
    "figures": figure_data
}

# Return the extracted paths
extracted_data

OSError: [Errno 30] Read-only file system: '/data'

In [30]:
import os
import cv2
import numpy as np
from pdf2image import convert_from_path
import fitz  # PyMuPDF
from PIL import Image

# Define paths
pdf_path = "1810.04805v2.pdf"
output_text_file = "data/combined_text.txt"
output_figures_folder = "data/extracted_figures"

# Create output directories
os.makedirs(os.path.dirname(output_text_file), exist_ok=True)
os.makedirs(output_figures_folder, exist_ok=True)

# Function to extract text from PDF
def extract_combined_text_from_pdf(pdf_path, output_file_path):
    doc = fitz.open(pdf_path)
    
    with open(output_file_path, "w", encoding="utf-8") as text_file:
        for page_num in range(len(doc)):
            text = doc[page_num].get_text("text")
            text_file.write(f"\n\n--- Page {page_num+1} ---\n\n")
            text_file.write(text)

    return output_file_path

# Function to extract figures from PDF
poppler_path = "/opt/homebrew/bin"  # Adjust this if needed

def extract_figures_and_tables_from_pdf(pdf_path, output_folder, min_contour_area=5000, min_table_area=10000):
    images = convert_from_path(pdf_path, poppler_path=poppler_path)  # Convert PDF pages to images
    extracted_images = {}

    # Ensure output folder exists
    os.makedirs(output_folder, exist_ok=True)

    for page_num, image in enumerate(images):
        # Convert image to OpenCV format
        open_cv_image = np.array(image)
        open_cv_image = cv2.cvtColor(open_cv_image, cv2.COLOR_RGB2BGR)

        # Convert to grayscale
        gray = cv2.cvtColor(open_cv_image, cv2.COLOR_BGR2GRAY)

        # Apply adaptive thresholding to detect tables
        thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 15, 4)

        # Detect vertical and horizontal lines to identify tables
        kernel_h = np.ones((1, 10), np.uint8)
        kernel_v = np.ones((10, 1), np.uint8)
        horizontal_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel_h)
        vertical_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel_v)
        table_mask = cv2.add(horizontal_lines, vertical_lines)

        # Find contours for tables and figures
        contours, _ = cv2.findContours(table_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        figure_paths = []
        table_paths = []
        count_figures = 0
        count_tables = 0

        for contour in contours:
            x, y, w, h = cv2.boundingRect(contour)

            if cv2.contourArea(contour) > min_table_area:
                # Extract tables
                table_image = image.crop((x, y, x + w, y + h))
                table_path = os.path.join(output_folder, f"page_{page_num+1}_table_{count_tables+1}.png")
                table_image.save(table_path, "PNG")
                table_paths.append(table_path)
                count_tables += 1
            elif cv2.contourArea(contour) > min_contour_area:
                # Extract figures
                figure_image = image.crop((x, y, x + w, y + h))
                figure_path = os.path.join(output_folder, f"page_{page_num+1}_figure_{count_figures+1}.png")
                figure_image.save(figure_path, "PNG")
                figure_paths.append(figure_path)
                count_figures += 1

        extracted_images[f"Page {page_num+1}"] = {
            "figures": figure_paths,
            "tables": table_paths
        }

    return extracted_images

# Extract text and figures
text_data = extract_combined_text_from_pdf(pdf_path, output_text_file)
figure_data = extract_figures_and_tables_from_pdf(pdf_path, output_figures_folder)

# Combine results in a dictionary
extracted_data = {
    "text": text_data,
    "figures": figure_data
}

# Return the extracted paths
print(extracted_data)


{'text': 'data/combined_text.txt', 'figures': {'Page 1': {'figures': [], 'tables': []}, 'Page 2': {'figures': [], 'tables': []}, 'Page 3': {'figures': [], 'tables': ['data/extracted_figures/page_3_table_1.png', 'data/extracted_figures/page_3_table_2.png']}, 'Page 4': {'figures': [], 'tables': []}, 'Page 5': {'figures': [], 'tables': []}, 'Page 6': {'figures': [], 'tables': []}, 'Page 7': {'figures': [], 'tables': []}, 'Page 8': {'figures': [], 'tables': []}, 'Page 9': {'figures': [], 'tables': []}, 'Page 10': {'figures': [], 'tables': []}, 'Page 11': {'figures': [], 'tables': []}, 'Page 12': {'figures': [], 'tables': []}, 'Page 13': {'figures': [], 'tables': ['data/extracted_figures/page_13_table_1.png', 'data/extracted_figures/page_13_table_2.png', 'data/extracted_figures/page_13_table_3.png']}, 'Page 14': {'figures': [], 'tables': []}, 'Page 15': {'figures': [], 'tables': []}, 'Page 16': {'figures': [], 'tables': ['data/extracted_figures/page_16_table_1.png']}}}


In [17]:
import os
import cv2
import numpy as np
import pdfplumber
from pdf2image import convert_from_path
from PIL import Image

def extract_figures_and_tables_from_pdf(pdf_path, output_folder, min_contour_area=5000):
    images = convert_from_path(pdf_path, poppler_path="/opt/homebrew/bin")  # Convert PDF pages to images
    extracted_data = {}

    # Ensure output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Use pdfplumber for table detection
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, image in enumerate(images):
            page = pdf.pages[page_num]

            # Convert image to OpenCV format
            open_cv_image = np.array(image)
            open_cv_image = cv2.cvtColor(open_cv_image, cv2.COLOR_RGB2BGR)

            # Convert to grayscale
            gray = cv2.cvtColor(open_cv_image, cv2.COLOR_BGR2GRAY)

            # Apply adaptive thresholding for better contrast
            adaptive_thresh = cv2.adaptiveThreshold(
                gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 15, 4
            )

            # **Enhanced Table Detection Using pdfplumber**
            tables = page.extract_tables()
            table_paths = []
            if not tables:
                # **Fallback: Extract words inside possible table areas**
                words = page.extract_words()
                if words:
                    table_text = "\n".join([f"{word['text']}" for word in words])
                    table_path = os.path.join(output_folder, f"page_{page_num+1}_fallback_table.txt")
                    with open(table_path, "w", encoding="utf-8") as f:
                        f.write(table_text)
                    table_paths.append(table_path)
            else:
                for i, table in enumerate(tables):
                    table_path = os.path.join(output_folder, f"page_{page_num+1}_table_{i+1}.txt")
                    with open(table_path, "w", encoding="utf-8") as f:
                        for row in table:
                            f.write("\t".join(str(cell) if cell is not None else "" for cell in row) + "\n")
                    table_paths.append(table_path)

            # **Improved Figure Detection Using OpenCV**
            edges = cv2.Canny(gray, 50, 150)  # Detect edges
            contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

            figure_paths = []
            figure_count = 0

            for contour in contours:
                x, y, w, h = cv2.boundingRect(contour)
                if cv2.contourArea(contour) > min_contour_area:
                    # Extract figure
                    figure_image = image.crop((x, y, x + w, y + h))
                    figure_path = os.path.join(output_folder, f"page_{page_num+1}_figure_{figure_count+1}.png")
                    figure_image.save(figure_path, "PNG")
                    figure_paths.append(figure_path)
                    figure_count += 1

            extracted_data[f"Page {page_num+1}"] = {
                "figures": figure_paths,
                "tables": table_paths
            }

    return extracted_data

# Define paths
pdf_path = "1810.04805v2.pdf"
output_folder = "data/extracted_content"

# Run the function
extracted_data = extract_figures_and_tables_from_pdf(pdf_path, output_folder)

# Print extracted file paths
print(extracted_data)


{'Page 1': {'figures': [], 'tables': ['data/extracted_content/page_1_fallback_table.txt']}, 'Page 2': {'figures': [], 'tables': ['data/extracted_content/page_2_fallback_table.txt']}, 'Page 3': {'figures': ['data/extracted_content/page_3_figure_1.png', 'data/extracted_content/page_3_figure_2.png'], 'tables': ['data/extracted_content/page_3_table_1.txt', 'data/extracted_content/page_3_table_2.txt', 'data/extracted_content/page_3_table_3.txt', 'data/extracted_content/page_3_table_4.txt', 'data/extracted_content/page_3_table_5.txt']}, 'Page 4': {'figures': [], 'tables': ['data/extracted_content/page_4_fallback_table.txt']}, 'Page 5': {'figures': [], 'tables': ['data/extracted_content/page_5_table_1.txt', 'data/extracted_content/page_5_table_2.txt', 'data/extracted_content/page_5_table_3.txt', 'data/extracted_content/page_5_table_4.txt', 'data/extracted_content/page_5_table_5.txt', 'data/extracted_content/page_5_table_6.txt', 'data/extracted_content/page_5_table_7.txt', 'data/extracted_cont

In [15]:
import os
import cv2
import numpy as np
import pdfplumber
from pdf2image import convert_from_path
from PIL import Image

def extract_tables_and_figures_as_images(pdf_path, output_folder, min_contour_area=5000):
    images = convert_from_path(pdf_path, poppler_path="/opt/homebrew/bin")  # Convert PDF pages to images
    extracted_data = {}

    # Ensure output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Use pdfplumber for table detection
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, image in enumerate(images):
            page = pdf.pages[page_num]

            # Convert image to OpenCV format
            open_cv_image = np.array(image)
            open_cv_image = cv2.cvtColor(open_cv_image, cv2.COLOR_RGB2BGR)

            # Convert to grayscale
            gray = cv2.cvtColor(open_cv_image, cv2.COLOR_BGR2GRAY)

            # Apply adaptive thresholding for better contrast
            adaptive_thresh = cv2.adaptiveThreshold(
                gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 15, 4
            )

            table_paths = []
            figure_paths = []

            # **Extract Tables as Images**
            tables = page.extract_tables()
            if tables:
                for i, table in enumerate(tables):
                    # Get table bounding box
                    bbox = page.bbox
                    table_img = image.crop(bbox)

                    table_path = os.path.join(output_folder, f"page_{page_num+1}_table_{i+1}.png")
                    table_img.save(table_path, "PNG")
                    table_paths.append(table_path)
            else:
                # **Fallback: Crop words that may be part of a table**
                words = page.extract_words()
                if words:
                    x0 = min([word["x0"] for word in words])
                    y0 = min([word["top"] for word in words])
                    x1 = max([word["x1"] for word in words])
                    y1 = max([word["bottom"] for word in words])
                    
                    table_img = image.crop((x0, y0, x1, y1))
                    table_path = os.path.join(output_folder, f"page_{page_num+1}_fallback_table.png")
                    table_img.save(table_path, "PNG")
                    table_paths.append(table_path)

            # **Extract Figures as Images**
            edges = cv2.Canny(gray, 50, 150)  # Detect edges
            contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

            figure_count = 0
            for contour in contours:
                x, y, w, h = cv2.boundingRect(contour)
                if cv2.contourArea(contour) > min_contour_area:
                    # Extract figure
                    figure_image = image.crop((x, y, x + w, y + h))
                    figure_path = os.path.join(output_folder, f"page_{page_num+1}_figure_{figure_count+1}.png")
                    figure_image.save(figure_path, "PNG")
                    figure_paths.append(figure_path)
                    figure_count += 1

            extracted_data[f"Page {page_num+1}"] = {
                "figures": figure_paths,
                "tables": table_paths
            }

    return extracted_data

# Define paths
pdf_path = "1810.04805v2.pdf"
output_folder = "data/extracted_images"

# Run the function
extracted_data = extract_tables_and_figures_as_images(pdf_path, output_folder)

# Print extracted file paths
print(extracted_data)


{'Page 1': {'figures': [], 'tables': ['data/extracted_images/page_1_fallback_table.png']}, 'Page 2': {'figures': [], 'tables': ['data/extracted_images/page_2_fallback_table.png']}, 'Page 3': {'figures': ['data/extracted_images/page_3_figure_1.png', 'data/extracted_images/page_3_figure_2.png'], 'tables': ['data/extracted_images/page_3_table_1.png', 'data/extracted_images/page_3_table_2.png', 'data/extracted_images/page_3_table_3.png', 'data/extracted_images/page_3_table_4.png', 'data/extracted_images/page_3_table_5.png']}, 'Page 4': {'figures': [], 'tables': ['data/extracted_images/page_4_fallback_table.png']}, 'Page 5': {'figures': [], 'tables': ['data/extracted_images/page_5_table_1.png', 'data/extracted_images/page_5_table_2.png', 'data/extracted_images/page_5_table_3.png', 'data/extracted_images/page_5_table_4.png', 'data/extracted_images/page_5_table_5.png', 'data/extracted_images/page_5_table_6.png', 'data/extracted_images/page_5_table_7.png', 'data/extracted_images/page_5_table_8

In [16]:
def parse_for_ingestion(files):
    # parser = LlamaParse(
    # api_key="(Your API key here)",
    # result_type="markdown"
    # )

    store = {}
    # documents = await parser.aload_data('/sample report')
    
    for fname in files:
        if fname.endswith('.pdf'):
            print("found pdf")
            text = extract_text_from_pdf(fname)
            tables = extract_tables_from_pdf(fname)
            images = extract_images_from_pdf(fname)
            store[fname] = {"text": text, "tables": tables, "images": images} 
    return store

parse_for_ingestion(["1810.04805v2.pdf"])

found pdf


NameError: name 'extract_images_from_pdf' is not defined

In [1]:
! pip install markitdown

Collecting markitdown
  Obtaining dependency information for markitdown from https://files.pythonhosted.org/packages/5f/24/f2de79bc50c82d63d243834b67af4ed3ae8b8bf71652aecc6118d4d1a306/markitdown-0.0.1a4-py3-none-any.whl.metadata
  Downloading markitdown-0.0.1a4-py3-none-any.whl.metadata (8.1 kB)
Collecting azure-ai-documentintelligence (from markitdown)
  Obtaining dependency information for azure-ai-documentintelligence from https://files.pythonhosted.org/packages/84/a8/c9c66d4d04b8aee06ebdc9a6077736b222b9b2fe92364fed6f9a1c08ece0/azure_ai_documentintelligence-1.0.0-py3-none-any.whl.metadata
  Downloading azure_ai_documentintelligence-1.0.0-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.3/51.3 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting azure-identity (from markitdown)
  Obtaining dependency information for azure-identity from https://files.pythonhosted.org/packages/de/aa/819513c1dbef990af690bb5eefb5e337f8698d75df

In [8]:
from markitdown import MarkItDown
# md = MarkItDown(docintel_endpoint="<document_intelligence_endpoint>")
# result = md.convert("1810.04805v2.pdf")
# print(result.text_content)
markdown_file = "output.md"
md = MarkItDown() # Set to True to enable plugins
result = md.convert("1810.04805v2.pdf")
with open(markdown_file, "w", encoding="utf-8") as md_file:
    md_file.write(result.text_content)

In [9]:
import mammoth

with open("1810.04805v2.pdf", "rb") as docx_file:

    result = mammoth.convert_to_markdown(docx_file)

    markdown_text = result["value"]

    print(markdown_text) 

BadZipFile: File is not a zip file

In [11]:
import pdfplumber
import pdfminer
from pdfminer.high_level import extract_text
from PIL import Image
import fitz  # PyMuPDF
import io
import os

def extract_text_from_pdf(pdf_path):
    """Extract text from PDF"""
    text = extract_text(pdf_path)
    return f"<text>\n{text.strip()}\n</text>\n\n"

def extract_tables_from_pdf(pdf_path):
    """Extract tables from PDF and convert them to Markdown"""
    tables_md = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            tables = page.extract_tables()
            for table_index, table in enumerate(tables, start=1):
                tables_md += f"<tbl>\n"
                for row in table:
                    row_md = "| " + " | ".join(str(cell) if cell else " " for cell in row) + " |"
                    tables_md += row_md + "\n"
                tables_md += "</tbl>\n\n"
    return tables_md

def extract_images_from_pdf(pdf_path, output_folder="extracted_images"):
    """Extract images from PDF and save them"""
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    images_md = ""
    doc = fitz.open(pdf_path)
    for page_num in range(len(doc)):
        for img_index, img in enumerate(doc[page_num].get_images(full=True), start=1):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            image_filename = f"{output_folder}/image_{page_num+1}_{img_index}.{image_ext}"
            
            with open(image_filename, "wb") as f:
                f.write(image_bytes)

            images_md += f"<img>{image_filename}</img>\n\n"

    return images_md

def convert_pdf_to_markdown(pdf_path, output_md):
    """Convert PDF to Markdown format with relevant tags"""
    markdown_content = ""

    # Extract text
    markdown_content += extract_text_from_pdf(pdf_path)

    # Extract tables
    markdown_content += extract_tables_from_pdf(pdf_path)

    markdown_content += extract_images_from_pdf(pdf_path)

    # Save to Markdown file
    with open(output_md, "w", encoding="utf-8") as md_file:
        md_file.write(markdown_content)

    print(f"Markdown file saved as {output_md}")

pdf_file = "1810.04805v2.pdf"  # Replace with your PDF file path
markdown_file = "output.md"
convert_pdf_to_markdown(pdf_file, markdown_file)

Markdown file saved as output.md


In [2]:
import pdfplumber
import fitz  # PyMuPDF for rendering pages as images
from pdf2image import convert_from_path
import os

def extract_text_from_pdf(pdf_path, output_text_file):
    """Extracts text from PDF and saves it to a text file."""
    extracted_text = []
    
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                extracted_text.append(text)

    text_content = "\n\n".join(extracted_text)
    
    with open(output_text_file, "w", encoding="utf-8") as file:
        file.write(text_content)

    return output_text_file

def extract_tables_and_figures(pdf_path, output_folder):
    """Takes screenshots of figures and tables and saves them as image files."""
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    pdf_document = fitz.open(pdf_path)
    image_paths = []

    for page_num in range(len(pdf_document)):
        page = pdf_document[page_num]
        pix = page.get_pixmap(dpi=300)  # High-resolution image
        img_path = os.path.join(output_folder, f"page_{page_num + 1}.png")
        
        pix.save(img_path)
        image_paths.append(img_path)

    return image_paths

def process_pdf(pdf_path):
    """Extracts text and captures tables/figures as images from the PDF."""
    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_folder = f"{base_name}_output"
    
    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)
    
    # Paths for extracted content
    text_file_path = os.path.join(output_folder, f"{base_name}_text.txt")
    image_output_folder = os.path.join(output_folder, "images")

    # Extract text
    text_file = extract_text_from_pdf(pdf_path, text_file_path)

    # Extract images (tables/figures)
    image_files = extract_tables_and_figures(pdf_path, image_output_folder)

    return {
        "text_file": text_file,
        "image_files": image_files
    }

# Example Usage
pdf_path = "1810.04805v2.pdf"  # Replace with your PDF file path
result = process_pdf(pdf_path)

# Output
print("Extracted Text File:", result["text_file"])
print("Extracted Images:", result["image_files"])


Extracted Text File: 1810.04805v2_output/1810.04805v2_text.txt
Extracted Images: ['1810.04805v2_output/images/page_1.png', '1810.04805v2_output/images/page_2.png', '1810.04805v2_output/images/page_3.png', '1810.04805v2_output/images/page_4.png', '1810.04805v2_output/images/page_5.png', '1810.04805v2_output/images/page_6.png', '1810.04805v2_output/images/page_7.png', '1810.04805v2_output/images/page_8.png', '1810.04805v2_output/images/page_9.png', '1810.04805v2_output/images/page_10.png', '1810.04805v2_output/images/page_11.png', '1810.04805v2_output/images/page_12.png', '1810.04805v2_output/images/page_13.png', '1810.04805v2_output/images/page_14.png', '1810.04805v2_output/images/page_15.png', '1810.04805v2_output/images/page_16.png']


In [13]:
from unstructured.partition.auto import partition

elements = partition(filename="1810.04805v2.pdf")
print("\n\n".join([str(el) for el in elements]))

ModuleNotFoundError: No module named 'pi_heif'

In [6]:
! pip install tesseract



In [3]:
import pytesseract
from PIL import Image
import cv2
import numpy as np
import pdfplumber

# Load the image
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
image_path = "1810.04805v2_output/images/page_9.png"
image = Image.open(image_path)

# Convert image to grayscale for better OCR accuracy
gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)

# Use OCR to extract text
extracted_text = pytesseract.image_to_string(gray)

# Use OCR to detect tables as well
extracted_tables = pytesseract.image_to_data(gray, output_type=pytesseract.Output.DICT)

def save_markdown(text, tables, output_md):
    """Save extracted text and tables in a Markdown file"""
    markdown_content = ""

    # Add text section
    markdown_content += f"<text>\n{text.strip()}\n</text>\n\n"

    # Add table section
    for table in tables:
        markdown_content += "<tbl>\n"
        for row in table:
            row_md = "| " + " | ".join(str(cell) if cell else " " for cell in row) + " |"
            markdown_content += row_md + "\n"
        markdown_content += "</tbl>\n\n"

    # Save to Markdown file
    with open(output_md, "w", encoding="utf-8") as md_file:
        md_file.write(markdown_content)

    print(f"Markdown file saved as {output_md}")

TesseractNotFoundError: /usr/bin/tesseract is not installed or it's not in your PATH. See README file for more information.

In [8]:
! pip install pytesseract



In [None]:
import pytesseract
from PIL import Image
import cv2
import numpy as np
import pdfplumber

# Load the image
pytesseract.pytesseract.tesseract_cmd = "/opt/homebrew/bin/tesseract"
image_path = "1810.04805v2_output/images/page_9.png"
image = Image.open(image_path)

# Convert image to grayscale for better OCR accuracy
gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)

# Use OCR to extract text
extracted_text = pytesseract.image_to_string(gray)

# Use OCR to detect tables as well
extracted_tables = pytesseract.image_to_data(gray, output_type=pytesseract.Output.DICT)

def save_markdown(text, tables, output_md):
    """Save extracted text and tables in a Markdown file"""
    markdown_content = ""

    # Add text section
    markdown_content += f"<text>\n{text.strip()}\n</text>\n\n"

    # Add table section
    for table in tables:
        markdown_content += "<tbl>\n"
        for row in table:
            row_md = "| " + " | ".join(str(cell) if cell else " " for cell in row) + " |"
            markdown_content += row_md + "\n"
        markdown_content += "</tbl>\n\n"

    # Save to Markdown file
    with open(output_md, "w", encoding="utf-8") as md_file:
        md_file.write(markdown_content)

    print(f"Markdown file saved as {output_md}")

output_markdown_file = "output.md"
save_markdown(extracted_text, extracted_tables,output_markdown_file)


{'level': [1,
  2,
  3,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  2,
  3,
  4,
  5,
  5,
  5,
  5,
  5,
  2,
  3,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  4,
  5,
 

In [14]:
import pytesseract
from PIL import Image
import cv2
import numpy as np
import pdfplumber
import os

# Set Tesseract Path (Modify this if needed)
pytesseract.pytesseract.tesseract_cmd = "/opt/homebrew/bin/tesseract"

def extract_text_from_pdf(pdf_path):
    """Extracts text from all pages of a PDF using pdfplumber."""
    extracted_text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            text = page.extract_text()
            if text:
                extracted_text += f"\n<text>\nPage {i+1}\n{text.strip()}\n</text>\n\n"
    return extracted_text

def extract_tables_from_pdf(pdf_path):
    """Extracts tables from a PDF and converts them to Markdown."""
    tables_md = ""
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            tables = page.extract_tables()
            for table in tables:
                tables_md += f"\n<tbl>\nPage {i+1}\n"
                for row in table:
                    row_md = "| " + " | ".join(str(cell) if cell else " " for cell in row) + " |"
                    tables_md += row_md + "\n"
                tables_md += "</tbl>\n\n"
    return tables_md

def save_markdown(pdf_path, output_md):
    """Processes the entire PDF and saves the extracted text and tables in Markdown format."""
    markdown_content = ""

    # Extract text
    markdown_content += extract_text_from_pdf(pdf_path)

    # Extract tables
    markdown_content += extract_tables_from_pdf(pdf_path)

    # Save to Markdown file
    with open(output_md, "w", encoding="utf-8") as md_file:
        md_file.write(markdown_content)

    print(f"Markdown file saved as {output_md}")

# Example Usage
pdf_file = "1810.04805v2.pdf"  # Replace with your PDF file path
output_markdown_file = "output.md"
save_markdown(pdf_file, output_markdown_file)


Markdown file saved as output.md


In [None]:
from img2table.document import Image

# Instantiation of the image
img = Image(src="1810.04805v2_output/images/page_9.png")

# Table identification
imgage_tables = img.extract_tables()

# Result of table identification
imgage_tables

In [2]:
! pip install img2table



In [None]:
from img2table.document import PDF
from img2table.ocr import TesseractOCR

# Instantiation of the pdf
pdf = PDF(src="1810.04805v2.pdf")
print("building ocr")
# Instantiation of the OCR, Tesseract, which requires prior installation
ocr = TesseractOCR(lang="eng")
print("ocr built")
print("extracting tables")
# Table identification and extraction
pdf_tables = pdf.extract_tables(ocr=ocr)


pdf_tables



In [3]:
! pip install camelot

Collecting camelot
  Downloading Camelot-12.06.29.tar.gz (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting SQLAlchemy<0.8.0,>=0.7.7 (from camelot)
  Downloading SQLAlchemy-0.7.10.tar.gz (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting Elixir>=0.7.1 (from camelot)
  Downloading Elixir-0.7.1.tar.gz (47 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting sqlalchemy-migrate>=0.7.1 (from camelot)
  Downloading sqlalchemy_migrate-0.13.0-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting xlwt==0.7.2 (from camelot)
  Downloading xlwt-0.7.2.zip (131 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting xlrd==0.7.1 (from camelot)
  Downloadin

In [4]:
import camelot
tables = camelot.read_pdf('1810.04805v2.pdf')
tables[0].df

AttributeError: module 'camelot' has no attribute 'read_pdf'

In [2]:
def parse_user_files(fname):
    # handle different file types
    if fname.endswith('.csv'):
        pass
    elif fname.endswith('.xml'):
        pass
    elif fname.endswith('.pptx'):
        pass
    elif fname.endswith('.docx'):
        pass
    elif fname.endswith('.pdf'):
        pass
    elif fname.endswith('.txt'):
        pass

In [None]:
from IPython.display import display
import sys
import os

# Get the absolute path of the project root
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Add the root directory to sys.path
sys.path.append(project_root)

from chatbot.backend.document_parser.document_parser import DocumentParser
docParser = DocumentParser()
extracted_images = docParser.extract_images_from_pdf('/Users/lishuyao/Documents/NUS/MODS/Y3S2/Capstone/ODPRT-chatbot/notebooks/1810.04805v2.pdf')
print("image extracted")
for img in extracted_images:
    print("displaying image")
    display(img)  # Displays images inline in Jupyter Notebook