# Download PDFs from website

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from bs4 import BeautifulSoup
import os
from time import sleep
import requests

I tested this out with the ESMATS (European Space Mechanisms and Tribology Symposium) papers, which are available at https://www.esmats.eu/esmatspapers/index.php

In [None]:
def get_esmats_papers(url_base,year,folder):
    """Download every PDF linked from one yearly listing page.

    The page at ``url_base + year`` is rendered in headless Chrome, its final
    HTML is parsed, and each ``<a>`` whose ``href`` ends in ``.pdf`` is fetched
    and written to ``folder`` as ``<year>_<original file name>``.

    Args:
        url_base: Listing URL up to (and including) the year query parameter.
        year: Year as a string; appended to ``url_base`` and used as the
            prefix of every saved file name.
        folder: Destination directory; created if it does not exist.

    Returns:
        None. Progress and skipped downloads are reported with ``print``.

    Note:
        A download that fails (connection error, timeout, or HTTP error
        status) is reported and skipped, so an error page is never written
        to disk under a ``.pdf`` name.
    """
    # Local import: the notebook's import cell does not provide urljoin.
    from urllib.parse import urljoin

    url = url_base + year

    # Set up the WebDriver; requires a Chrome browser to be installed.
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # no visible browser window
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=chrome_options)

    # try/finally guarantees the browser process is shut down even when the
    # page load fails; otherwise each failed year leaks a Chrome instance.
    try:
        driver.get(url)
        time.sleep(5)  # Wait for the page to load, adjust this as needed
        page_source = driver.page_source
    finally:
        driver.quit()

    soup = BeautifulSoup(page_source, 'html.parser')
    print(f"Got soup for {year}!")

    # All anchors whose href ends with .pdf (anchors without href are ignored).
    pdf_links = soup.find_all('a', href=lambda href: href and href.endswith('.pdf'))

    os.makedirs(folder, exist_ok=True)

    for link in pdf_links:
        # Resolve relative links against the page that was requested rather
        # than a hard-coded host; absolute links pass through unchanged.
        pdf_url = urljoin(url, link['href'])
        print(f'Downloading {pdf_url}')
        try:
            # The timeout keeps one stalled connection from hanging the run.
            response = requests.get(pdf_url, timeout=60)
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f'Skipping {pdf_url}: {exc}')
            continue

        # File name is the last path segment of the resolved URL.
        file_name = pdf_url.split('/')[-1]
        with open(os.path.join(folder, year + "_" + file_name), 'wb') as file:
            file.write(response.content)

    print(f"Download completed for {year}!")

In [None]:

# Listing endpoint; the year string is appended to it to form each page URL.
url="https://www.esmats.eu/esmatspapers/completelist.php?whichYear="
# Every year's PDFs are saved into this one folder.
folder=os.path.join('..','data','ESMAT')
# Years to fetch, newest first ("2023" was part of an earlier version of this list).
years = [
    "2021", "2019", "2017", "2015", "2013", "2011",
    "2009", "2007", "2005", "2003", "2001", "1999",
]

for year in years:
    get_esmats_papers(url_base=url, year=year, folder=folder)

# Install ocrmypdf and run it from the command line

In [None]:
# !brew install ocrmypdf

In [None]:
# !ocrmypdf --tesseract-timeout 0 --force-ocr AMS_2000.pdf AMS_2000_stripped.pdf
# !ocrmypdf --sidecar AMS_2000_redo_out.txt AMS_2000_stripped.pdf AMS_2000_strip_redo.pdf

# Batch process documents

In [3]:
import os
from tqdm.notebook import tqdm

In [7]:
# Selection A: the yearly AMS proceedings files, named explicitly
# (this list has no entry for 1989).
directory = os.path.join('..', 'data', 'AMS')
documents = [
    'AMS_1980.pdf', 'AMS_1981.pdf', 'AMS_1982.pdf', 'AMS_1983.pdf',
    'AMS_1984.pdf', 'AMS_1985.pdf', 'AMS_1986.pdf', 'AMS_1987.pdf',
    'AMS_1988.pdf', 'AMS_1990.pdf', 'AMS_1991.pdf', 'AMS_1992.pdf',
    'AMS_1993.pdf', 'AMS_1994.pdf', 'AMS_1995.pdf', 'AMS_1996.pdf',
    'AMS_1997.pdf', 'AMS_1998.pdf', 'AMS_1999.pdf',
]

# Selection B: rebinds both names, so this is the selection that is actually
# in effect after the cell runs. It picks every PDF in the AMS "reocr" folder
# whose file name contains 1980, 1981 or 1982.
directory = os.path.join('..', 'data', 'AMS', 'reocr')
documents = [
    name
    for name in os.listdir(directory)
    if name.endswith('.pdf') and any(tag in name for tag in ['1980', '1981', '1982'])
]

In [None]:
import subprocess  # imported in this cell because the cells above do not provide it

# Run two independent ocrmypdf passes over every selected document. Both
# passes read the original file and write next to it:
#   1. <doc>_stripped.pdf  (--tesseract-timeout 0 --force-ocr)
#   2. <doc>_reocr.pdf plus the sidecar text file <doc>_reocr.txt (--redo-ocr)
# An alternative second pass that re-OCRs the stripped output instead
# (writing <doc>_strip_reocr.pdf / .txt) was tried earlier and is not used.
#
# Commands are passed as argument lists rather than a shell string so that
# paths containing spaces or shell metacharacters are handled safely, and the
# exit status is checked so failures are actually reported (os.system only
# returns a status and never raises, so a surrounding try/except sees nothing).
for doc in tqdm(documents, desc='Document Processing'):
    print(f"Processing {doc}")
    source_pdf = f'{directory}/{doc}'
    ocr_steps = [
        ['ocrmypdf', '--tesseract-timeout', '0',
         '--continue-on-soft-render-error', '--force-ocr',
         source_pdf, f'{source_pdf}_stripped.pdf'],
        ['ocrmypdf', '--sidecar', f'{source_pdf}_reocr.txt',
         '--continue-on-soft-render-error', '--redo-ocr',
         source_pdf, f'{source_pdf}_reocr.pdf'],
    ]
    for command in tqdm(ocr_steps, desc=f"Processing {doc}", leave=False):
        try:
            completed = subprocess.run(command, check=False)
        except OSError as exc:
            # Raised when the ocrmypdf executable cannot be launched at all.
            print(f'Error processing {doc}: could not launch ocrmypdf ({exc})')
            continue
        if completed.returncode != 0:
            # Keep going: the two passes do not depend on each other.
            print(f'Error processing {doc}: ocrmypdf exited with status {completed.returncode}')