# Download PDFs from website

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from bs4 import BeautifulSoup
import os
from time import sleep
import requests

In [None]:
def get_esmats_papers(url_base,year,folder):
    url=url_base+year
    # Set up the WebDriver, requires chrome to be installed
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode (without opening a browser window)
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=chrome_options) # requires a chrome browser to be installed

    driver.get(url) # # Open the webpage
    time.sleep(5)  # Wait for the page to load, adjust this as needed

    page_source = driver.page_source    # Get page source
    driver.quit()   # Close the browser

    soup = BeautifulSoup(page_source, 'html.parser')    # Parse the page source with BeautifulSoup
    print(f"Got soup for {year}!")
 
    # Find all anchor tags with href attribute ending with .pdf
    pdf_links = soup.find_all('a', href=lambda href: href and href.endswith('.pdf'))

    # Directory where PDFs will be saved
    os.makedirs(folder, exist_ok=True)

    # Iterate over all found PDF links and download each PDF
    for link in pdf_links:
        pdf_url = link['href']
        # Full URL if the link is relative
        if not pdf_url.startswith('http'):
            pdf_url = 'https://www.esmats.eu/esmatspapers/' + pdf_url
        print(f'Downloading {pdf_url}')
        # Download the PDF file
        response = requests.get(pdf_url)
        # Get the file name from the URL
        file_name = pdf_url.split('/')[-1]
        # Save the PDF file
        with open(os.path.join(folder,year+"_"+file_name), 'wb') as file:
            file.write(response.content)

    print(f"Download completed for {year}!")

In [None]:

# years = ["2023", "2021", "2019", "2017", "2015", "2013", "2011", "2009", "2007", "2005", "2003", "2001", "1999"]
years = ["2021", "2019", "2017", "2015", "2013", "2011", "2009", "2007", "2005", "2003", "2001", "1999"]
url="https://www.esmats.eu/esmatspapers/completelist.php?whichYear="
folder=os.path.join('..','data','ESMAT')

for year in years:
    get_esmats_papers(url,year,folder)

# Install and command line myocrpdf

In [None]:
# !brew install ocrmypdf

In [None]:
# !ocrmypdf --tesseract-timeout 0 --force-ocr AMS_2000.pdf AMS_2000_stripped.pdf
# !ocrmypdf --sidecar AMS_2000_redo_out.txt AMS_2000_stripped.pdf AMS_2000_strip_redo.pdf

# Batch process documents

In [1]:
import os
from tqdm.notebook import tqdm

In [2]:
# Re-OCR AMS docs
# directory=os.path.join('..','data','AMS')
# documents = ['AMS_2000.pdf', 
#              'AMS_2001.pdf',
#              'AMS_2002.pdf',
#              'AMS_2004.pdf',
#              'AMS_2006.pdf',
#              'AMS_2008.pdf',
#              'AMS_2010.pdf',
#              'AMS_2012.pdf',
#              'AMS_2014.pdf',
#              'AMS_2016.pdf',
#              'AMS_2018.pdf',
#              'AMS_2020.pdf']

# Re-OCR ESMAT docs from 1999-2003, which are probably pretty outdated OCRs.
directory=os.path.join('..','data','ESMAT')
documents = [file for file in os.listdir(directory) if file.endswith('.pdf') and file.startswith(('1999', '2001', '2003'))]

In [3]:
for doc in tqdm(documents,desc='Document Processing'):
    print(f"Processing {doc}")
    try:
        for i in tqdm(range(3), desc=f"Processing {doc}", leave=False):
            if i == 0:
                os.system(f'ocrmypdf --tesseract-timeout 0 --continue-on-soft-render-error --force-ocr {directory}/{doc} {directory}/{doc}_stripped.pdf')   # Stripped pdf
            elif i == 1:    
                os.system(f'ocrmypdf --sidecar {directory}/{doc}_strip_reocr.txt --continue-on-soft-render-error {directory}/{doc}_stripped.pdf {directory}/{doc}_strip_reocr.pdf') # Apply OCR, output file
            elif i == 2:
                os.system(f'ocrmypdf --sidecar {directory}/{doc}_reocr.txt --continue-on-soft-render-error --redo-ocr {directory}/{doc} {directory}/{doc}_reocr.pdf') # Apply OCR, output file
    except:
        print(f'Error processing {doc}')
        pass

Document Processing:   0%|          | 0/165 [00:00<?, ?it/s]

Processing 2001_wood.pdf


Processing 2001_wood.pdf:   0%|          | 0/3 [00:00<?, ?it/s]

Start processing 8 pages concurrently
    1 page already has text! - rasterizing text and running OCR anyway
    2 page already has text! - rasterizing text and running OCR anyway
    3 page already has text! - rasterizing text and running OCR anyway
    4 page already has text! - rasterizing text and running OCR anyway
    5 page already has text! - rasterizing text and running OCR anyway
    6 page already has text! - rasterizing text and running OCR anyway
    7 page already has text! - rasterizing text and running OCR anyway
    8 page already has text! - rasterizing text and running OCR anyway
    9 page already has text! - rasterizing text and running OCR anyway
   10 page already has text! - rasterizing text and running OCR anyway
   11 page already has text! - rasterizing text and running OCR anyway
Postprocessing...
[2KLinearizing           [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [35m100%[0m [32m100/100[0m [36m0:00:00[0m0m
[?25hImage optimization ratio: 1.23 sav

Processing 2001_andion.pdf


Processing 2001_andion.pdf:   0%|          | 0/3 [00:00<?, ?it/s]

Start processing 8 pages concurrently
    1 page already has text! - rasterizing text and running OCR anyway
    2 page already has text! - rasterizing text and running OCR anyway
    3 page already has text! - rasterizing text and running OCR anyway
    4 page already has text! - rasterizing text and running OCR anyway
    5 page already has text! - rasterizing text and running OCR anyway
    6 page already has text! - rasterizing text and running OCR anyway
    7 page already has text! - rasterizing text and running OCR anyway
    8 page already has text! - rasterizing text and running OCR anyway
Postprocessing...
[2KLinearizing           [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [35m100%[0m [32m100/100[0m [36m0:00:00[0m0m
[?25hImage optimization ratio: 1.23 savings: 19.0%
Total file size ratio: 0.35 savings: -188.7%
Output file is a PDF/A-2B (as expected)
The output file size is 2.89× larger than the input file.
Possible reasons for this include:
--force-ocr was issued,

Processing 2003_cadiergues.pdf


Processing 2003_cadiergues.pdf:   0%|          | 0/3 [00:00<?, ?it/s]

EncryptedPdfError: Input PDF is encrypted. The encryption must be removed to
perform OCR.

For information about this PDF's security use
    qpdf --show-encryption infilename

You can remove the encryption using
    qpdf --decrypt [--password=[password]] infilename

InputFileError: File not found - ../data/ESMAT/2003_cadiergues.pdf_stripped.pdf


Processing 2003_thiel.pdf


EncryptedPdfError: Input PDF is encrypted. The encryption must be removed to
perform OCR.

For information about this PDF's security use
    qpdf --show-encryption infilename

You can remove the encryption using
    qpdf --decrypt [--password=[password]] infilename



Processing 2003_thiel.pdf:   0%|          | 0/3 [00:00<?, ?it/s]

EncryptedPdfError: Input PDF is encrypted. The encryption must be removed to
perform OCR.

For information about this PDF's security use
    qpdf --show-encryption infilename

You can remove the encryption using
    qpdf --decrypt [--password=[password]] infilename

InputFileError: File not found - ../data/ESMAT/2003_thiel.pdf_stripped.pdf


Processing 2001_gradt.pdf


EncryptedPdfError: Input PDF is encrypted. The encryption must be removed to
perform OCR.

For information about this PDF's security use
    qpdf --show-encryption infilename

You can remove the encryption using
    qpdf --decrypt [--password=[password]] infilename



Processing 2001_gradt.pdf:   0%|          | 0/3 [00:00<?, ?it/s]

Start processing 4 pages concurrently
    1 page already has text! - rasterizing text and running OCR anyway
    2 page already has text! - rasterizing text and running OCR anyway
    3 page already has text! - rasterizing text and running OCR anyway
    4 page already has text! - rasterizing text and running OCR anyway
Postprocessing...
[2KLinearizing           [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [35m100%[0m [32m100/100[0m [36m0:00:00[0m0m
[?25hImage optimization ratio: 1.11 savings: 9.9%
Total file size ratio: 0.29 savings: -245.5%
Output file is a PDF/A-2B (as expected)
The output file size is 3.46× larger than the input file.
Possible reasons for this include:
--force-ocr was issued, causing transcoding.
PDF/A conversion was enabled. (Try `--output-type pdf`.)

Start processing 4 pages concurrently
Postprocessing...
[?25lLinearizing           [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [35m  0%[0m [32m  0/100[0m [36m-:--:--[0mSome input metadata could n

Processing 1999_blais.pdf


Processing 1999_blais.pdf:   0%|          | 0/3 [00:00<?, ?it/s]

Start processing 8 pages concurrently
    1 page already has text! - rasterizing text and running OCR anyway
    2 page already has text! - rasterizing text and running OCR anyway
    3 page already has text! - rasterizing text and running OCR anyway
    4 page already has text! - rasterizing text and running OCR anyway
    5 page already has text! - rasterizing text and running OCR anyway
    6 page already has text! - rasterizing text and running OCR anyway
    7 page already has text! - rasterizing text and running OCR anyway
    8 page already has text! - rasterizing text and running OCR anyway
Postprocessing...
[2KLinearizing           [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [35m100%[0m [32m100/100[0m [36m0:00:00[0m0m
[?25hImage optimization ratio: 1.10 savings: 9.3%
Total file size ratio: 0.05 savings: -2064.9%
Output file is a PDF/A-2B (as expected)
The output file size is 21.65× larger than the input file.
Possible reasons for this include:
--force-ocr was issued