# Download PDFs from website

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from bs4 import BeautifulSoup
import os
from time import sleep
import requests

I tested this out with the ESMATS (European Space Mechanisms and Tribology Symposium) papers which are available: https://www.esmats.eu/esmatspapers/index.php

In [None]:
def get_esmats_papers(url_base,year,folder):
    url=url_base+year
    # Set up the WebDriver, requires chrome to be installed
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode (without opening a browser window)
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=chrome_options) # requires a chrome browser to be installed

    driver.get(url) # # Open the webpage
    time.sleep(5)  # Wait for the page to load, adjust this as needed

    page_source = driver.page_source    # Get page source
    driver.quit()   # Close the browser

    soup = BeautifulSoup(page_source, 'html.parser')    # Parse the page source with BeautifulSoup
    print(f"Got soup for {year}!")
 
    # Find all anchor tags with href attribute ending with .pdf
    pdf_links = soup.find_all('a', href=lambda href: href and href.endswith('.pdf'))

    # Directory where PDFs will be saved
    os.makedirs(folder, exist_ok=True)

    # Iterate over all found PDF links and download each PDF
    for link in pdf_links:
        pdf_url = link['href']
        # Full URL if the link is relative
        if not pdf_url.startswith('http'):
            pdf_url = 'https://www.esmats.eu/esmatspapers/' + pdf_url
        print(f'Downloading {pdf_url}')
        # Download the PDF file
        response = requests.get(pdf_url)
        # Get the file name from the URL
        file_name = pdf_url.split('/')[-1]
        # Save the PDF file
        with open(os.path.join(folder,year+"_"+file_name), 'wb') as file:
            file.write(response.content)

    print(f"Download completed for {year}!")

In [None]:

# years = ["2023", "2021", "2019", "2017", "2015", "2013", "2011", "2009", "2007", "2005", "2003", "2001", "1999"]
years = ["2023", "2021", "2019", "2017", "2015", "2013", "2011", "2009", "2007", "2005", "2003", "2001", "1999"]
url="https://www.esmats.eu/esmatspapers/completelist.php?whichYear="
folder=os.path.join('..','data','ESMAT')

for year in years:
    get_esmats_papers(url,year,folder)

# Google Document AI, reapplying OCR back to the PDFs

In [1]:
from google.cloud import documentai_v1 as documentai
from google.cloud.documentai_toolbox import document
import fitz
import os
from tqdm import tqdm
from PIL import Image
import ocrmypdf

In [2]:
def remove_text_by_rasterizing(input_pdf, output_pdf):
    """
    Removes all text content from a PDF by rasterizing it into images
    and recreating the PDF with just the images.
    """
    doc = fitz.open(input_pdf)
    image_pages = []

    for page_number in range(len(doc)):
        # Render each page as an image
        page = doc[page_number]
        pix = page.get_pixmap(dpi=150)  # High resolution for better quality
        image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        image_pages.append(image)

    # Save the images as a new PDF
    if image_pages:
        image_pages[0].save(output_pdf, save_all=True, append_images=image_pages[1:])
    print(f"PDF without text saved to: {output_pdf}")

def process_pdf_with_document_ai(project_id, location, processor_id, pdf_in):
    """Extracts text from a PDF using Google Document AI."""
    # Instantiate the Document AI client
    client = documentai.DocumentProcessorServiceClient()

    # Set processor details
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

    # Read the PDF file
    with open(pdf_in, "rb") as file:
        pdf_content = file.read()

    # Create the request
    raw_document = documentai.RawDocument(content=pdf_content, mime_type="application/pdf")
    request = documentai.ProcessRequest(name=name, raw_document=raw_document)

    # Process the document: https://cloud.google.com/document-ai/docs/handle-response#text_layout_and_quality_scores 
    result = client.process_document(request=request)

    # Convert to hOCR format
    wrapped_document = document.Document.from_documentai_document(result.document)
    hocr_string = wrapped_document.export_hocr_str(title=os.path.basename(pdf_in))
    
    return hocr_string

def save_hocr_to_file(hocr_string, output_path):
    """Saves the hOCR string to a file."""
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(hocr_string)
    print(f"hOCR file saved to: {output_path}")

def convert_hocr_to_pdf(hocr_path, image_pdf_path, output_pdf_path):
    """Converts hOCR file to searchable PDF using ocrmypdf."""
    try:
        print(f"Converting hOCR to PDF: {output_pdf_path}")
        ocrmypdf.ocr(
            input_file=image_pdf_path,
            output_file=output_pdf_path,
            sidecar=hocr_path,
            skip_text=True,
            optimize=3,
            progress_bar=False
        )
        print(f"PDF with searchable text created at: {output_pdf_path}")
    except ocrmypdf.exceptions.PdfMergeFailedError as e:
        print(f"Error converting hOCR to PDF: {e}")

In [3]:
# FIXME need to split into <20Mb chunks for Google Document AI

# Google Document AI parameters
project_id = "ai-aerospace"
location = "us"
processor_id = "baa26d1093093c7e"   # get from google cloud console

# Define the path to the folder containing the PDFs
# pdf_folder_path = "/Users/danmueller/Library/CloudStorage/GoogleDrive-dsm@danmueller.pro/My Drive/AI Aerospace/Documents/Aerospace Mechanisms/ESMATS/process_queue"
pdf_folder_path = "data/"

# List all PDF files in the folder
pdf_files = [
    f for f in os.listdir(pdf_folder_path)
    if f.endswith('.pdf') and '_reocr' not in os.path.basename(f) and '_without_text' not in os.path.basename(f)
]
print(f"Processing PDFs: {pdf_files}")

# Iterate over each PDF file and process it with a progress bar
for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
    print(f"Processing: {pdf_file}")
    input_pdf_path = os.path.join(pdf_folder_path, pdf_file)
    base_name = os.path.basename(input_pdf_path)
    name, ext = os.path.splitext(base_name)
    pdf_without_text_path = os.path.join(pdf_folder_path, f"{name}_without_text{ext}")
    output_pdf_path = os.path.join(pdf_folder_path, f"{name}_reocr{ext}")

    # Step 1: Remove the existing OCR text layer
    print("Removing existing OCR text layer...")
    remove_text_by_rasterizing(input_pdf_path, pdf_without_text_path)

    # Step 2: Extract text with Google Document AI
    print("Extracting text using Google Document AI...")
    hocr_output = process_pdf_with_document_ai(project_id, location, processor_id, pdf_without_text_path)
    hocr_output_path = os.path.join(pdf_folder_path, f"{name}_ocr.hocr")
    print(f"Saving hOCR to file: {hocr_output_path}")
    save_hocr_to_file(hocr_output, hocr_output_path)

    # Convert hOCR to PDF
    print(f"Converting hOCR to PDF: {output_pdf_path}")
    convert_hocr_to_pdf(hocr_output_path, pdf_without_text_path, output_pdf_path)


Processing PDFs: ['1999_breguet.pdf', '1999_cacho.pdf', 'AMS_1998.pdf', '1999_carre.pdf', '1999_barillot.pdf']


Processing PDFs:   0%|          | 0/5 [00:00<?, ?it/s]

Processing: 1999_breguet.pdf
Removing existing OCR text layer...
PDF without text saved to: data/1999_breguet_without_text.pdf
Extracting text using Google Document AI...
Saving hOCR to file: data/1999_breguet_ocr.hocr
hOCR file saved to: data/1999_breguet_ocr.hocr
Converting hOCR to PDF: data/1999_breguet_reocr.pdf


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Processing PDFs:  20%|██        | 1/5 [00:13<00:54, 13.57s/it]

PDF with searchable text created at: data/1999_breguet_reocr.pdf
Processing: 1999_cacho.pdf
Removing existing OCR text layer...
PDF without text saved to: data/1999_cacho_without_text.pdf
Extracting text using Google Document AI...
Saving hOCR to file: data/1999_cacho_ocr.hocr
hOCR file saved to: data/1999_cacho_ocr.hocr
Converting hOCR to PDF: data/1999_cacho_reocr.pdf


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Processing PDFs:  40%|████      | 2/5 [00:26<00:39, 13.15s/it]

PDF with searchable text created at: data/1999_cacho_reocr.pdf
Processing: AMS_1998.pdf
Removing existing OCR text layer...
PDF without text saved to: data/AMS_1998_without_text.pdf
Extracting text using Google Document AI...


Processing PDFs:  40%|████      | 2/5 [00:50<01:15, 25.23s/it]


InvalidArgument: 400 Request contains an invalid argument. [field_violations {
  field: "raw_document.content"
  description: "Document size (105867223) exceeds the limit: 20971520."
}
]