In [None]:
from pdf2docx import parse
import fitz  # PyMuPDF
import io
from PIL import Image
import os
import subprocess
import xml.etree.ElementTree as ET
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
import tempfile

In [None]:
# Input datasheet plus the two artifacts the notebook writes next to it.
pdf_file = "../data/Infineon-ICE5xSAG-DataSheet-v02_30-EN.pdf"  # PDF that is read
docx_file = "../data/Infineon-ICE5xSAG-DataSheet-v02_30-EN.docx"  # DOCX conversion target
output_pdf = "../data/restructure.pdf"  # Rebuilt PDF target

In [None]:
# Convert the source PDF into a DOCX file using pdf2docx's `parse`.
# NOTE(review): start=0 with end=None presumably means "from the first page
# through the last page" — confirm against the pdf2docx documentation.
parse(pdf_file, docx_file, start=0, end=None)

In [None]:
def extract_content_and_reassemble(input_pdf_path, output_pdf_path):
    """
    Rebuild a PDF page by page from the text and images PyMuPDF extracts.

    Every input page gets an output page of the same size. The page's plain
    text is written starting at the top-left corner, and the page's images are
    stacked vertically at their pixel size, beginning 100 units from the top
    with a 10 unit gap between consecutive images. The original layout is not
    reproduced.

    Args:
        input_pdf_path: Path of the PDF to read.
        output_pdf_path: Path the rebuilt PDF is saved to.
    """
    font_size = 12
    image_stack_top = 100  # Y offset at which the first image is placed
    image_gap = 10  # Vertical padding between stacked images

    source_doc = fitz.open(input_pdf_path)
    try:
        rebuilt_doc = fitz.open()
        try:
            for page in source_doc:
                bounds = page.rect
                new_page = rebuilt_doc.new_page(
                    width=bounds.width, height=bounds.height
                )

                # insert_text anchors the baseline of the first line at the
                # given point, so y=0 would draw that line above the page edge.
                # Starting one font size down keeps the first line visible.
                new_page.insert_text(
                    (0, font_size), page.get_text("text"), fontsize=font_size
                )

                # Track the running offset instead of multiplying the index by
                # the current image's height, which misplaces images whenever
                # their heights differ.
                next_top = image_stack_top
                for image_info in page.get_images(full=True):
                    xref = image_info[0]
                    image_bytes = source_doc.extract_image(xref)["image"]

                    # One decode serves both the size lookup and the insertion.
                    pixmap = fitz.Pixmap(image_bytes)
                    new_page.insert_image(
                        fitz.Rect(
                            0, next_top, pixmap.width, next_top + pixmap.height
                        ),
                        pixmap=pixmap,
                    )
                    next_top += pixmap.height + image_gap

            rebuilt_doc.save(output_pdf_path)
        finally:
            rebuilt_doc.close()
    finally:
        # Release the input file handle even if extraction or saving failed.
        source_doc.close()

In [None]:
# Example usage: rebuild the datasheet into output_pdf.
# Note: the process_pdf(...) call further down writes to the same output_pdf
# path, so whichever of the two runs last determines the file's contents.
extract_content_and_reassemble(pdf_file, output_pdf)

In [None]:
def extract_pdf_to_xml(input_pdf, output_xml):
    """
    Convert a PDF to XML with an external command-line converter.

    `pdftohtml -xml` is tried first; `pdf2xml` is attempted only when the
    `pdftohtml` executable cannot be launched. A converter that starts but
    exits with a non-zero status ends the attempt immediately.

    Args:
        input_pdf: Path of the PDF to convert.
        output_xml: Path handed to the converter as its output argument.

    Returns:
        True if a converter ran successfully, False otherwise. Failures are
        printed rather than raised.
    """
    candidate_commands = (
        ["pdftohtml", "-xml", input_pdf, output_xml],
        ["pdf2xml", input_pdf, output_xml],
    )

    for command in candidate_commands:
        try:
            subprocess.run(command, check=True)
        except FileNotFoundError:
            # This executable could not be launched; try the next candidate.
            continue
        except subprocess.CalledProcessError as e:
            # The tool exists but the conversion failed. Reporting that as
            # "not found" would send the user off installing software they
            # already have, so surface the real error instead.
            print(f"Error converting PDF to XML: {e}")
            return False
        print(f"Successfully converted {input_pdf} to {output_xml}")
        return True

    # Neither converter could be launched.
    print(
        "Error: pdf2xml/pdftohtml not found. Please install poppler-utils or pdf2xml."
    )
    print("For Linux: sudo apt-get install poppler-utils")
    print("For Mac: brew install poppler")
    print("For Windows: Download and install pdf2xml or poppler")
    return False


def parse_xml_content(xml_file):
    """
    Parse converter XML into per-page text runs with position and font details.

    Text wrapped in inline markup (for example <b> or <i> inside <text>) is
    included in the run's content. When a <text> element's `font` attribute
    matches the id of a <fontspec> element, the font family, size and color
    are taken from that fontspec; explicit `size` / `color` attributes on the
    <text> element take precedence, and the previous defaults ("Helvetica",
    12, "#000000") apply when nothing else is available.

    Args:
        xml_file: Path of the XML file to read.

    Returns:
        A dict with "dimensions" (width, height of the first page, or the
        letter page size when the file has no pages) and "pages", a list of
        dicts holding "width", "height" and "texts". Each text entry has the
        keys "content", "x", "y", "width", "height", "font", "size", "color".
        Returns None, after printing the error, if the file cannot be parsed.
    """
    try:
        root = ET.parse(xml_file).getroot()

        # pdftohtml -xml declares each font once as
        # <fontspec id=".." size=".." family=".." color=".."/> and <text>
        # refers to it by id. A page can reference a fontspec declared on an
        # earlier page, so the lookup table is built for the whole document.
        fontspecs = {
            spec.get("id"): spec.attrib
            for spec in root.iter("fontspec")
            if spec.get("id") is not None
        }

        pages = []
        for page_elem in root.findall(".//page"):
            page_data = {
                "width": float(page_elem.get("width")),
                "height": float(page_elem.get("height")),
                "texts": [],
            }

            for text_elem in page_elem.findall(".//text"):
                spec = fontspecs.get(text_elem.get("font"), {})
                text_data = {
                    # itertext() also collects text nested in child elements,
                    # which `.text` alone would silently drop.
                    "content": "".join(text_elem.itertext()),
                    "x": float(text_elem.get("left", 0)),
                    "y": float(text_elem.get("top", 0)),
                    "width": float(text_elem.get("width", 0)),
                    "height": float(text_elem.get("height", 0)),
                    "font": spec.get("family") or text_elem.get("font", "Helvetica"),
                    "size": float(text_elem.get("size") or spec.get("size") or 12),
                    "color": text_elem.get("color") or spec.get("color") or "#000000",
                }
                page_data["texts"].append(text_data)

            pages.append(page_data)

        if pages:
            dimensions = (pages[0]["width"], pages[0]["height"])
        else:
            # Default to letter size if the document has no pages
            dimensions = tuple(letter)

        return {"dimensions": dimensions, "pages": pages}

    except ET.ParseError as e:
        print(f"Error parsing XML file: {e}")
        return None
    except Exception as e:
        print(f"Unexpected error: {e}")
        return None


def create_pdf_from_xml_content(content_data, output_pdf):
    """
    Write a new PDF that redraws the parsed text runs with ReportLab.

    content_data supplies "dimensions" (the initial canvas page size) and
    "pages"; each page supplies "width", "height" and a "texts" list whose
    entries provide "content", "x", "y", "height", "font" and "size".

    Returns True once the file has been saved. Any exception is printed and
    turned into a False return value.
    """
    try:
        pdf = canvas.Canvas(output_pdf, pagesize=content_data["dimensions"])

        # Font names accepted from the input, mapped to the name passed to
        # setFont; anything not listed falls back to Helvetica.
        font_lookup = {
            "Helvetica": "Helvetica",
            "Times": "Times-Roman",
            "Courier": "Courier",
        }

        on_first_page = True
        for page in content_data["pages"]:
            # Every page after the first needs the previous one finished.
            if not on_first_page:
                pdf.showPage()
            on_first_page = False

            page_width, page_height = page["width"], page["height"]
            pdf.setPageSize((page_width, page_height))

            for run in page["texts"]:
                content = run["content"]
                if content:  # empty runs are not drawn
                    pdf.setFont(font_lookup.get(run["font"], "Helvetica"), run["size"])

                    # The input measures Y downward from the top of the page,
                    # ReportLab upward from the bottom, so flip the coordinate.
                    flipped_y = page_height - run["y"] - run["height"]
                    pdf.drawString(run["x"], flipped_y, content)

        pdf.save()
        print(f"Successfully created PDF: {output_pdf}")
        return True

    except Exception as e:
        print(f"Error creating PDF: {e}")
        return False


def process_pdf(input_pdf, output_pdf):
    """
    Round-trip a PDF: convert it to XML, parse the XML, write a new PDF.

    The intermediate XML is kept in a temporary directory that is removed
    when the function returns. Returns True only when all three stages
    succeed; the first stage that fails stops the pipeline and yields False.
    """
    with tempfile.TemporaryDirectory() as work_dir:
        xml_path = os.path.join(work_dir, "temp_output.xml")

        # Stage 1: PDF -> XML
        if extract_pdf_to_xml(input_pdf, xml_path):
            # Stage 2: XML -> structured content
            parsed = parse_xml_content(xml_path)
            if parsed:
                # Stage 3: structured content -> PDF
                return bool(create_pdf_from_xml_content(parsed, output_pdf))

        return False

In [None]:
# Run the PDF -> XML -> PDF pipeline on the datasheet. This writes to
# output_pdf, the same path the earlier extract_content_and_reassemble(...)
# call used, so it replaces that file if both cells are run.
process_pdf(pdf_file, output_pdf)