# Settings

In [11]:
from PyPDF2 import PdfReader, PdfWriter

# Library showcase

## Read PDF file & metadata

In [10]:
# Sample document for this cell (path is relative to the notebook)
pdf_path = '../../data/GLUE_Software_Developer_2024.pdf'

# Parse the PDF, then print the metadata it exposes
reader = PdfReader(pdf_path)
document_info = reader.metadata
print(document_info)

{'/Author': 'Holly Gibbs', '/Creator': 'Microsoft Word', '/CreationDate': "D:20240215214715+00'00'", '/ModDate': "D:20240215214715+00'00'"}


## Extract text from the pdf file

In [23]:
# Re-open the document and collect the whitespace-stripped text of every page,
# keyed as 'page_1', 'page_2', ... in page order.
reader = PdfReader(pdf_path)
pages_content = {
    f'page_{page_number}': pdf_page.extract_text().strip()
    for page_number, pdf_page in enumerate(reader.pages, start=1)
}

## Split PDF into individual pages

In [None]:
from PyPDF2 import PdfReader, PdfWriter

# Placeholder input: point this at the PDF you want to split
pdf_path = 'path_to_your_pdf_file.pdf'

# Load the source document
reader = PdfReader(pdf_path)

# Emit one single-page PDF per source page, numbered from 1
for page_number, source_page in enumerate(reader.pages, start=1):
    # Each output file is named after the position of its page
    output_filename = f'page_{page_number}.pdf'

    # A fresh writer per iteration keeps exactly one page in each output file
    writer = PdfWriter()
    writer.add_page(source_page)

    with open(output_filename, 'wb') as single_page_file:
        writer.write(single_page_file)

    print(f'Created: {output_filename}')


## Merging pdf files

In [None]:
# Input documents, in the order their pages should appear in the merged file
source_paths = ["document1.pdf", "document2.pdf"]

# A single writer accumulates the pages of every input document
writer = PdfWriter()

# Append each document's pages in turn; merging another file only requires
# extending source_paths instead of copying one more loop
for source_path in source_paths:
    for page in PdfReader(source_path).pages:
        writer.add_page(page)

# Write out the merged PDF
with open("merged_document.pdf", "wb") as out:
    writer.write(out)

## Rotate pages

In [None]:
# Specify the path to your PDF file
pdf_path = 'path_to_your_pdf_file.pdf'

# Clockwise rotation applied to every page (PDF rotations are multiples of 90 degrees)
rotation_degrees = 90

# Open the PDF file
reader = PdfReader(pdf_path)

# Create a PDF writer object for the output PDF
writer = PdfWriter()

# Loop through each page in the PDF
for page in reader.pages:
    # Use PageObject.rotate(): the older rotate_clockwise() spelling is
    # deprecated in PyPDF2 2.x and removed in 3.0, where calling it raises
    page.rotate(rotation_degrees)
    # Add the rotated page to the writer object
    writer.add_page(page)

# Specify the output filename
output_pdf_path = 'rotated_pdf_file.pdf'

# Write the output PDF to a file
with open(output_pdf_path, 'wb') as output_pdf:
    writer.write(output_pdf)

print(f'Rotated PDF saved as: {output_pdf_path}')

## Cropping Pages

In [13]:
# Path to the PDF to crop; replace with your own file as needed
pdf_path = '../../data/GLUE_Software_Developer_2024.pdf'

# Open the PDF file
reader = PdfReader(pdf_path)

# Create a PDF writer object for the output PDF
writer = PdfWriter()

# Define the coordinates of the lower left and upper right corners of the new crop box
# These values should be in points (1 point = 1/72 inch)
x0, y0, x1, y1 = 50, 50, 400, 600

# Loop through each page in the PDF
for page in reader.pages:
    # Update the page's crop box (the same box is applied to every page)
    page.cropbox.lower_left = (x0, y0)
    page.cropbox.upper_right = (x1, y1)
    # Add the cropped page to the writer object
    writer.add_page(page)

# Specify the output filename
output_pdf_path = 'cropped_pdf_file.pdf'

# Write the output PDF to a file
with open(output_pdf_path, 'wb') as output_pdf:
    writer.write(output_pdf)

print(f'Cropped PDF saved as: {output_pdf_path}')


Cropped PDF saved as: cropped_pdf_file.pdf


## Add bounding boxes to the pdf

In [28]:
from reportlab.pdfgen import canvas

def create_bounding_box_pdf(output_path, box_coords, page_size):
    """
    Creates a PDF with a red rectangular bounding box drawn on it.

    Args:
    - output_path: Path to save the overlay PDF.
    - box_coords: A tuple of (x0, y0, x1, y1) giving two opposite corners
      of the bounding box.
    - page_size: A tuple of (width, height) for the page size.
    """
    x0, y0, x1, y1 = box_coords

    c = canvas.Canvas(output_path, pagesize=page_size)
    c.setStrokeColorRGB(1, 0, 0)  # Set the bounding box color to red
    c.setLineWidth(2)  # Set the bounding box line width
    # Canvas.rect() expects (x, y, width, height), not two corners, so the
    # corner pair is converted; passing the tuple straight through would draw
    # a box of the wrong size whenever (x0, y0) is not the origin.
    c.rect(x0, y0, x1 - x0, y1 - y0)
    c.save()

# Example usage: the page size here is 8.5 x 11 inches expressed in points,
# and the box coordinates span that whole page.
create_bounding_box_pdf(
    output_path='bounding_box_overlay.pdf',
    box_coords=(0, 0, 612, 792),
    page_size=(612, 792),
)


In [29]:
# Input document (placeholder path) and the overlay PDF to merge onto it
original_pdf_path = 'path_to_your_original_pdf.pdf'
overlay_pdf_path = 'bounding_box_overlay.pdf'

# Load both documents
original_pdf = PdfReader(original_pdf_path)
overlay_pdf = PdfReader(overlay_pdf_path)

# Collects the pages of the resulting document
writer = PdfWriter()

# Only the first page receives the bounding box: merge the overlay's first
# page onto it, then queue the modified page for output
page = original_pdf.pages[0]
overlay_page = overlay_pdf.pages[0]
page.merge_page(overlay_page)
writer.add_page(page)

# Every remaining page of the original is carried over as-is
for untouched_page in list(original_pdf.pages)[1:]:
    writer.add_page(untouched_page)

# Save the combined result to a new file
output_pdf_path = 'output_with_bounding_box.pdf'
with open(output_pdf_path, 'wb') as destination:
    writer.write(destination)

print(f'Output PDF with bounding box saved as: {output_pdf_path}')

Output PDF with bounding box saved as: output_with_bounding_box.pdf


## Add watermark

In [None]:
from PyPDF2 import PdfReader, PdfWriter

def add_watermark(input_pdf_path, watermark_pdf_path, output_pdf_path):
    """
    Merge the first page of a watermark PDF onto every page of a document.

    Args:
    - input_pdf_path: Path of the PDF whose pages receive the watermark.
    - watermark_pdf_path: Path of the PDF whose first page is used as the watermark.
    - output_pdf_path: Path the watermarked PDF is written to.
    """
    # Open the document first, then take the watermark from page one of its PDF
    source_document = PdfReader(input_pdf_path)
    stamp = PdfReader(watermark_pdf_path).pages[0]

    # Merge the watermark onto each page and queue the page for output
    result = PdfWriter()
    for source_page in source_document.pages:
        source_page.merge_page(stamp)
        result.add_page(source_page)

    # Persist the watermarked document
    with open(output_pdf_path, 'wb') as destination:
        result.write(destination)

# Files involved: the document to watermark, the watermark source, and the result
input_pdf_path = 'path_to_your_document.pdf'
watermark_pdf_path = 'watermark.pdf'
output_pdf_path = 'document_with_watermark.pdf'

# Run the watermarking step, then report completion
add_watermark(
    input_pdf_path=input_pdf_path,
    watermark_pdf_path=watermark_pdf_path,
    output_pdf_path=output_pdf_path,
)
print("Watermark added successfully.")