# 1. Convert pdfs to XML and txt
Convert the pdfs (e.g. all the pdfs in a specified directory) to:
- XML files generated by the Cermine library (we can use the XML files to retrieve things like title and publication year)
- txt files where we can access all the text, e.g. when the XML contents are not complete enough

In [1]:
import sys
import os
import re
import urllib.request
import subprocess
import time

# own files
from pysci import convertpdf as pdf
from pysci import docutils as du

### Get Cermine
Scientific-article PDFs are converted to structured XML files with the Java library Cermine: https://github.com/CeON/CERMINE

Install it on your machine, or use the code below to fetch the jar (assuming you can run Java jar files).

In [2]:
# Cermine 1.13 "jar with dependencies" on the CeON Maven repository, and the local
# file name it is saved under (relative to the current working directory).
cermine_url = r'https://maven.ceon.pl/artifactory/kdd-releases/pl/edu/icm/cermine/cermine-impl/1.13/cermine-impl-1.13-jar-with-dependencies.jar'
cermine_jar_local_path = 'cermine-impl-1.13-jar-with-dependencies.jar'

# Download the Cermine jar to the current dir, unless it is already there
# (re-running this cell should not fetch the jar again).
if os.path.isfile(cermine_jar_local_path):
    # keep the same two names defined as after a real download; no HTTP headers exist here
    filename, headers = cermine_jar_local_path, None
else:
    # Fetch under a temporary name first, so an interrupted download never leaves a
    # truncated jar at the final path (which the existence check above would accept).
    partial_download_path = cermine_jar_local_path + '.part'
    filename, headers = urllib.request.urlretrieve(cermine_url, partial_download_path)
    os.replace(partial_download_path, cermine_jar_local_path)
    filename = cermine_jar_local_path

### PDF to XML via Cermine

In [3]:
# Directory containing the PDFs to process. This is a relative path, so it is resolved
# against the current working directory. It is used for the Cermine (PDF -> XML) step
# and is walked recursively with os.walk for the pdfminer (PDF -> TXT) step.
articles_dir = 'pdfs'

def convert_pdf_dir_to_xml(pdf_dir, verbose=True):
    """Run Cermine's ContentExtractor on a directory of PDFs.

    Launches ``java -cp <jar> pl.edu.icm.cermine.ContentExtractor -path <pdf_dir>``
    with the jar found at the module-level ``cermine_jar_local_path`` and waits
    for the Java process to finish.

    Args:
        pdf_dir: directory handed to Cermine through its ``-path`` option.
        verbose: when True, print every line Cermine wrote to stdout.

    Returns:
        list of str: Cermine's stdout, split into lines without line endings.

    Note:
        Only stdout is captured, and the exit status of the Java process is
        not checked.
    """
    # Pass the caller's directory; the previous version ignored ``pdf_dir`` and
    # always used the global ``articles_dir``.
    command = ['java', '-cp', cermine_jar_local_path,
               'pl.edu.icm.cermine.ContentExtractor',
               '-path', pdf_dir]
    completed = subprocess.run(command, stdout=subprocess.PIPE)
    # undecodable bytes are dropped rather than raising
    result_string = completed.stdout.decode('utf-8', 'ignore')
    # splitlines() handles '\r\n' and '\n' alike and leaves no empty string at the end
    result_output = result_string.splitlines()
    if verbose:
        for line in result_output:
            print(line)
    return result_output

print("Processing pdfs in %s using Cermine..." % articles_dir)

# Time the whole Cermine run; per-line output is suppressed (verbose=False),
# the captured lines stay available in `output`.
time1 = time.time()
output = convert_pdf_dir_to_xml(articles_dir, verbose=False)
time2 = time.time()

print("Done.")
elapsed_minutes = (time2 - time1) / 60.0
print("Conversion of all files in %s took %0.1fmin" % (articles_dir, elapsed_minutes))

Processing pdfs in pdfs using Cermine...
Done.
Conversion of all files in pdfs took 0.5min


In [4]:
# optionally: delete Cermine jar
# Guarded so the cell can be re-run, and so it does nothing when no jar exists at
# cermine_jar_local_path (e.g. Cermine was installed separately instead of downloaded).
if os.path.isfile(cermine_jar_local_path):
    os.remove(cermine_jar_local_path)

### PDF to TXT using pdfminer

In [5]:
# Counters for the summary line: pdfs seen vs. txt files newly written in this run
# (pdfs whose txt file already exists are counted in files_in only).
files_in = 0
files_converted = 0

print("Processing pdfs in %s using pdfminer..." %articles_dir)

time1 = time.time()

for root, dirs, files in os.walk(articles_dir):
    # ignore files that aren't pdf
    # (the extension is lower-cased before comparing, so du.PDF_extension is presumably
    # a lower-case extension including the dot - verify against pysci.docutils)
    pdf_filenames = [f for f in files if os.path.splitext(f)[1].lower() == du.PDF_extension]
    for filename_pdf in pdf_filenames:
        files_in += 1
        filepath_pdf = os.path.join(root, filename_pdf)
        # the txt file is written next to the pdf, same base name
        filename_raw = du.remove_extension(filename_pdf)
        filename_txt = filename_raw + du.TXT_extension
        filepath_txt = os.path.join(root, filename_txt)
        if os.path.isfile(filepath_txt):
            # already converted in an earlier run
            continue
        success = pdf.convert_pdf_to_text(filepath_pdf, filepath_txt, verbose=False)
        if success:
            print("Converted file %s" %filepath_txt[:90])
            files_converted += 1
        else:
            # Remove whatever a failed conversion may have left behind, so the isfile()
            # check above does not skip this pdf on the next run.
            try:
                os.remove(filepath_txt)
            except FileNotFoundError:
                # the failed conversion wrote no txt file: nothing to clean up,
                # and one bad pdf must not abort the rest of the batch
                pass
            except PermissionError:
                print("PermissionError while trying to delete file %s" %filepath_txt[:90])

print("We read in %s pdf files and output %s text files." %(files_in, files_converted))

time2 = time.time()
print('Conversion of all files in %s took %0.1fmin' %(articles_dir, (time2-time1)/60.0))

Processing pdfs in pdfs using pdfminer...
Converted file pdfs\Liu_et_al-2015-Insect_Conservation_and_Diversity.txt
Converted file pdfs\Russo_et_al-2013-Ecology_and_Evolution.txt
We read in 2 pdf files and output 2 text files.
Conversion of all files in pdfs took 0.6min
