# Evaluating PDF extractors

Caution: The code throughout this notebook is quite bad and largely undocumented, as it was only used for quick-and-dirty experiments.

TLDR: PyMuPDF best among the tested PDF extraction libraries. Detection of footnotes and their resolution works decent.

For a more extensive interpretation of the results, please see the thesis.

## Setup

In [2]:
import time
import numpy as np
import pandas as pd

In [3]:
!pip3 install -U PyMuPDF
import fitz

!pip3 install -U pdfminer.six
from pdfminer.high_level import extract_text as pdfminer_extract_text
from pdfminer.high_level import extract_pages as pdfminer_extract_pages
from pdfminer.layout import LTTextContainer as pdfminer_LTTextContainer

!pip3 install -U PyPDF2
import PyPDF2

!pip3 install -U pdfplumber
import pdfplumber

!pip3 install -U borb
from borb.pdf import PDF as BorbPDF
from borb.toolkit.text.simple_text_extraction import (
    SimpleTextExtraction as BorbTextExtraction,
)

!pip3 install -U tika
from tika import parser as tika_parser
from io import StringIO
from bs4 import BeautifulSoup

In [4]:
feedbacks = pd.read_csv(
    "../24212003_requirements_for_artificial_intelligence/patched_feedbacks.csv"
)
attachments = pd.read_csv(
    "../24212003_requirements_for_artificial_intelligence/attachments.csv"
)

attachments = pd.merge(feedbacks, attachments)[["id", "user_type", "filename"]]
attachments["filename"] = (
    "../24212003_requirements_for_artificial_intelligence/" + attachments["filename"]
)
attachments["n_pages"] = attachments["filename"].map(lambda f: fitz.open(f).page_count)

attachments.head(3)

Unnamed: 0,id,user_type,filename,n_pages
0,2665651,ngo,../24212003_requirements_for_artificial_intell...,7
1,2665650,ngo,../24212003_requirements_for_artificial_intell...,2
2,2665649,ngo,../24212003_requirements_for_artificial_intell...,39


In [5]:
def rng(analysis: int):
    """Return a NumPy random number generator - make code cells deterministic
    independent of how often or in which order they are executed.
    """
    return np.random.default_rng(seed=123 * analysis)


def stratified_sample(df: pd.DataFrame, by: str, n: int) -> pd.DataFrame:
    """Perform a stratified sample: sample n elements for each 'by' stratum."""
    grouped = df.groupby(by, group_keys=False)
    return grouped.apply(lambda x: x.sample(min(len(x), n), random_state=12345))

## 1. Read errors & time

### Experimental setup

##### A common way of executing extractors

In [5]:
def run_experiment(extractor, extractor_name, attachments, timeout=5 * 60):
    successes, failures = 0, 0
    start = time.time()

    for pdf in attachments["filename"]:
        if time.time() - start > timeout:
            break
        try:
            text = extractor(pdf)

            # Should have extracted at least some text
            assert type(text) in (str, list)
            if type(text) == str:
                assert len(text) > 100
            elif type(text) == list:
                assert sum(len(x) for x in text) > 100

            successes += 1
        except:
            print(f"Failure to read {pdf}")
            failures += 1

    runtime = time.time() - start
    runtime = runtime if runtime < timeout else timeout

    return {
        "name": extractor_name,
        "successes": successes,
        "failures": failures,
        "runtime": runtime,
    }


results_1 = []

##### Defining extractor methods for each library

In [6]:
def fitz_extractor(pdf):
    doc = fitz.open(pdf)
    return [page.get_text() for page in doc]


results_1.append(run_experiment(fitz_extractor, "fitz", attachments))

Failure to read ../24212003_requirements_for_artificial_intelligence/attachments/2665436.pdf


In [7]:
def pdfminer_extractor(pdf):
    return pdfminer_extract_text(pdf)


results_1.append(run_experiment(pdfminer_extractor, "pdfminer", attachments))

Failure to read ../24212003_requirements_for_artificial_intelligence/attachments/2665436.pdf


In [8]:
def pypdf2_extractor(pdf):
    reader = PyPDF2.PdfReader(pdf)
    return [page.extract_text() for page in reader.pages]


results_1.append(run_experiment(pypdf2_extractor, "PyPDF2", attachments))

Failure to read ../24212003_requirements_for_artificial_intelligence/attachments/2665436.pdf


In [9]:
def pdfplumber_extractor(pdf):
    with pdfplumber.open(pdf) as doc:
        return [page.extract_text() for page in doc.pages]


results_1.append(run_experiment(pdfplumber_extractor, "pdfplumber", attachments))

Failure to read ../24212003_requirements_for_artificial_intelligence/attachments/2665436.pdf


In [10]:
def borb_extractor(pdf):
    ste = BorbTextExtraction()
    with open(pdf, "rb") as in_file_handle:
        doc = BorbPDF.loads(in_file_handle, [ste])
        return list(ste.__dict__["_text_per_page"].values())


results_1.append(run_experiment(borb_extractor, "borb", attachments))

Unable to process XMP meta-data
Unable to process XMP meta-data
Unable to process XMP meta-data
Unable to process XMP meta-data
Unable to process XMP meta-data
Unable to process XMP meta-data
Unable to process XMP meta-data
Unable to process XMP meta-data
Unable to process XMP meta-data
Unable to process XMP meta-data
Unable to process XMP meta-data
Unable to process XMP meta-data


In [11]:
def tika_extractor(pdf):
    return tika_parser.from_file(pdf)["content"]


results_1.append(run_experiment(tika_extractor, "tika", attachments))

2022-10-25 14:25:00,628 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/1.24/tika-server-1.24.jar to /tmp/tika-server.jar.
2022-10-25 14:25:04,182 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/1.24/tika-server-1.24.jar.md5 to /tmp/tika-server.jar.md5.
2022-10-25 14:25:04,821 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...


##### Collecting the results

In [19]:
results_1_df = pd.DataFrame(results_1).sort_values(
    ["runtime", "successes"], ascending=[True, False]
)

In [20]:
file = "../24212003_requirements_for_artificial_intelligence/attachments/2665436.pdf"
tika_extractor(file)

'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nTÜV-Association Position Paper on batteries and waste batteries\n\n\n\n\n \n\n \n\n \n\n \n\n \n\nhttp://www.tuev-verband.de/\n\n\n \n\n \n\nhttp://www.tuev-verband.de/\n\n\n \n\nhttp://www.tuev-verband.de/\n\n\n \n\nhttp://www.tuev-verband.de/\n\n\n \n\nhttp://www.tuev-verband.de/\n\n\nhttp://www.tuev-verband.de/\n\n\nhttp://www.tuev-verband.de/\n\n'

In [21]:
# As tika also fails to properly read 2665436's attachment, update df
results_1_df.loc[results_1_df["name"] == "tika", "successes"] -= 1
results_1_df.loc[results_1_df["name"] == "tika", "failures"] += 1

results_1_df.to_csv("results/results_1_runtime_errors.csv", index=False)

### Results

In [22]:
pd.read_csv("results/results_1_runtime_errors.csv")

Unnamed: 0,name,successes,failures,runtime
0,fitz,255,1,7.079093
1,tika,255,1,35.953465
2,PyPDF2,255,1,93.199966
3,pdfminer,255,1,230.098631
4,pdfplumber,255,1,292.154443
5,borb,22,0,300.0


## 2. Reading order

### Experimental setup

##### Selecting a sample

In [88]:
sample_2 = stratified_sample(attachments, "user_type", 5)
rng_2 = rng(1)
sample_2["page"] = sample_2["n_pages"].map(lambda n: rng_2.choice(n) + 1)
sample_2["source_id"] = sample_2["id"]
sample_2["source"] = (
    "attachments/"
    + sample_2["id"].astype(str)
    + ".pdf#page="
    + sample_2["page"].astype(str)
)
sample_2.head(3)

Unnamed: 0,id,user_type,filename,n_pages,page,source_id,source
3,2665648,academic_research_institution,../24212003_requirements_for_artificial_intell...,8,1,2665648,attachments/2665648.pdf#page=1
196,2663366,academic_research_institution,../24212003_requirements_for_artificial_intell...,13,9,2663366,attachments/2663366.pdf#page=9
106,2665480,academic_research_institution,../24212003_requirements_for_artificial_intell...,64,38,2665480,attachments/2665480.pdf#page=38


##### Defining extractors for each method

In [109]:
def fitz_extractor(pdf, page):
    doc = fitz.open(pdf)
    return doc[page - 1].get_text()


def pypdf2_extractor(pdf, page):
    reader = PyPDF2.PdfReader(pdf)
    return reader.pages[page - 1].extract_text()


def pdfminer_extractor(pdf, page):
    texts = [
        elem.get_text()
        for elem in list(pdfminer_extract_pages(pdf))[page - 1]
        if isinstance(elem, pdfminer_LTTextContainer)
    ]
    return "".join(texts)


def tika_extractor(pdf, page):
    """Adapted from: https://github.com/chrismattmann/tika-python/issues/191#issuecomment-612544137"""
    pages_txt = []
    data = tika_parser.from_file(pdf, xmlContent=True)
    xhtml_data = BeautifulSoup(data["content"])
    for i, content in enumerate(xhtml_data.find_all("div", attrs={"class": "page"})):
        _buffer = StringIO()
        _buffer.write(str(content))
        parsed_content = tika_parser.from_buffer(_buffer.getvalue())
        text = parsed_content["content"].strip()
        pages_txt.append(text)
    return pages_txt[page - 1]

##### Creating an evaluation

In [111]:
extractors = [
    ("fitz", fitz_extractor),
    ("PyPDF2", pypdf2_extractor),
    ("pdfminer", pdfminer_extractor),
    ("tika", tika_extractor),
]

dfs = []
for method_name, method_extractor in extractors:
    new_df = sample_2.copy()
    new_df["id"] = (
        method_name + "_" + new_df["id"].astype(str) + "_" + new_df["page"].astype(str)
    )
    new_df["task"] = "Is the line order correct?"
    new_df["content"] = [
        "<pre>" + method_extractor(row["filename"], row["page"]) + "</pre>"
        for _, row in new_df.iterrows()
    ]
    new_df["result"] = None
    dfs.append(new_df)
sample_2_out = pd.concat(dfs)
sample_2_out = sample_2_out.sample(len(sample_2_out))
sample_2_out.to_csv(
    "../02_eval_tool/evaluations/pdf_extractors_line_order.csv", index=False
)

### Results

- Exclude page 1 of X, recurring header/footer contents, and similar things
- Missing body text lines are counted as negative findings

In [18]:
results_2 = pd.read_csv("results/results_2_line_order.csv")
grouped = results_2.groupby(results_2["id"].str.split("_").map(lambda x: x[0]))
pd.concat(
    (
        grouped["result"].value_counts().to_frame("absolute"),
        grouped["result"].value_counts(normalize=True).to_frame("relative"),
    ),
    axis=1,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,absolute,relative
id,result,Unnamed: 2_level_1,Unnamed: 3_level_1
PyPDF2,POSITIVE,32,0.744186
PyPDF2,NEUTRAL,7,0.162791
PyPDF2,NEGATIVE,4,0.093023
fitz,POSITIVE,39,0.906977
fitz,NEUTRAL,4,0.093023
pdfminer,POSITIVE,33,0.767442
pdfminer,NEGATIVE,8,0.186047
pdfminer,NEUTRAL,2,0.046512
tika,POSITIVE,36,0.837209
tika,NEUTRAL,5,0.116279


## 3. Word spelling (& spurious newlines)

### Experimental setup

##### Setting up a sample

In [44]:
sample_3 = stratified_sample(attachments, "user_type", 7)
rng_3 = rng(3)
sample_3["page"] = sample_3["n_pages"].map(lambda n: rng_3.choice(n) + 1)
sample_3["line"] = sample_3["id"].map(lambda _: rng_3.choice(10) + 1)
sample_3["source_id"] = sample_3["id"]
sample_3["source"] = (
    "attachments/"
    + sample_3["id"].astype(str)
    + ".pdf#page="
    + sample_3["page"].astype(str)
)
sample_3.head(3)

Unnamed: 0,id,user_type,filename,n_pages,page,line,source_id,source
3,2665648,academic_research_institution,../24212003_requirements_for_artificial_intell...,8,7,9,2665648,attachments/2665648.pdf#page=7
196,2663366,academic_research_institution,../24212003_requirements_for_artificial_intell...,13,11,2,2663366,attachments/2663366.pdf#page=11
106,2665480,academic_research_institution,../24212003_requirements_for_artificial_intell...,64,64,8,2665480,attachments/2665480.pdf#page=64


##### Defining extractors for the remaining libraries

In [47]:
def fitz_extractor(pdf, page, line):
    doc = fitz.open(pdf)
    text = doc[int(page) - 1].get_text()
    lines = [l for l in text.splitlines() if len(l) > 5]
    return lines[min(len(lines), line) - 1]


def tika_extractor(pdf, page, line):
    """Adapted from: https://github.com/chrismattmann/tika-python/issues/191#issuecomment-612544137"""
    pages_txt = []
    data = tika_parser.from_file(pdf, xmlContent=True)
    xhtml_data = BeautifulSoup(data["content"])
    for i, content in enumerate(xhtml_data.find_all("div", attrs={"class": "page"})):
        _buffer = StringIO()
        _buffer.write(str(content))
        parsed_content = tika_parser.from_buffer(_buffer.getvalue())
        text = parsed_content["content"].strip()
        pages_txt.append(text)
    text = pages_txt[page - 1]
    lines = [l for l in text.splitlines() if len(l) > 5]
    return lines[min(len(lines), line) - 1]

##### Creating an evaluation

In [48]:
extractors = [
    ("fitz", fitz_extractor),
    ("tika", tika_extractor),
]

dfs = []
for method_name, method_extractor in extractors:
    new_df = sample_3.copy()

    # Add id, task, and result columns
    new_df["id"] = (
        method_name + "_" + new_df["id"].astype(str) + "_" + new_df["page"].astype(str)
    )
    new_df["task"] = "Is the highlighted line correct?"
    new_df["result"] = None

    # Add content column
    contents = []
    for _, row in new_df.iterrows():
        # Tell the user the line number
        content = f"<span>Line {row['line']}:</span>"

        # If row is not the first line on the page, add previous line as context
        if row["line"] > 1:
            prev_line = method_extractor(row["filename"], row["page"], row["line"] - 1)
            content += "<pre>" + prev_line + "</pre>"

        # Add the line in bold font
        line = method_extractor(row["filename"], row["page"], row["line"])
        content += '<pre style="font-weight: 700;">' + line + "</pre>"

        # Add following line as context
        next_line = method_extractor(row["filename"], row["page"], row["line"] + 1)
        if next_line != line:
            content += "<pre>" + next_line + "</pre>"

        contents.append(content)
    new_df["content"] = contents

    dfs.append(new_df)
sample_3_out = pd.concat(dfs)
sample_3_out = sample_3_out.sample(len(sample_3_out))
sample_3_out.to_csv("../02_eval_tool/evaluations/pdf_words.csv", index=False)

### Results

In [17]:
results_3 = pd.read_csv("results/results_3_words.csv")
grouped = results_3.groupby(results_3["id"].str.split("_").map(lambda x: x[0]))
pd.concat(
    (
        grouped["result"].value_counts().to_frame("absolute"),
        grouped["result"].value_counts(normalize=True).to_frame("relative"),
    ),
    axis=1,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,absolute,relative
id,result,Unnamed: 2_level_1,Unnamed: 3_level_1
fitz,POSITIVE,58,0.983051
fitz,NEGATIVE,1,0.016949
tika,POSITIVE,58,0.983051
tika,NEGATIVE,1,0.016949


In both cases, the error was not being able to read the ligature 'ff' in "sign-off".

## 4. Footnote references

### Experimental setup

##### Selecting a sample

In [217]:
sample_4 = stratified_sample(attachments.query("id != 2665436"), "user_type", 20)
rng_4 = rng(4)
sample_4["page"] = sample_4["n_pages"].map(
    lambda n: rng_3.choice(n, min(n, 2), replace=False) + 1
)
sample_4 = sample_4.explode("page")
sample_4["source_id"] = sample_4["id"]
sample_4["source"] = (
    "attachments/"
    + sample_4["id"].astype(str)
    + ".pdf#page="
    + sample_4["page"].astype(str)
)
sample_4.head(3)

Unnamed: 0,id,user_type,filename,n_pages,page,source_id,source
3,2665648,academic_research_institution,../24212003_requirements_for_artificial_intell...,8,1,2665648,attachments/2665648.pdf#page=1
3,2665648,academic_research_institution,../24212003_requirements_for_artificial_intell...,8,6,2665648,attachments/2665648.pdf#page=6
196,2663366,academic_research_institution,../24212003_requirements_for_artificial_intell...,13,11,2663366,attachments/2663366.pdf#page=11


##### Define a footnote reference extractor using PyMuPDF

In [8]:
SUP = 1
import re


def wrap(s, high):
    if (
        high
        # Positive match to Arabic numbers or Roman numerals
        and re.match(r"\s*[\divx][\s\divx,]*$", s) is not None
        # Negative match to common false positives
        and re.match(r"\s+$|\s*rd\s*|\s*st\s*|\s*th\s*|\s*\.\s*", s) is None
    ):
        return f'<code style="font-weight: 1000; background-color: #FF0000;">{s}</code>'
    return f"<code>{s}</code>"


def fitz_extractor(pdf, page):
    doc = fitz.open(pdf)
    text = doc[int(page) - 1].get_text("dict", flags=16 + 2)
    lines = [l for b in text["blocks"] for l in b["lines"]]
    lines = [
        "".join([wrap(s["text"], s["flags"] & SUP) for s in l["spans"]]) for l in lines
    ]
    return "<br>".join(lines)

##### Create an evaluation

In [219]:
sample_4_out = sample_4.copy()

sample_4_out["id"] = (
    "fitz_" + sample_4_out["id"].astype(str) + "_" + sample_4_out["page"].astype(str)
)
sample_4_out[
    "task"
] = "Footnotes in doc + all footnotes highlighted: 👍; something other than footnote or not all footnotes highlighted: 👎; no footnote in doc: 👌"
sample_4_out["content"] = sample_4_out.apply(
    lambda row: fitz_extractor(row["filename"], row["page"]), axis=1
)
sample_4_out["result"] = None

sample_4_out = sample_4_out.sample(len(sample_4_out))
sample_4_out.to_csv("../02_eval_tool/evaluations/pdf_footnotes.csv", index=False)

### Results

In [15]:
results_4 = pd.read_csv("results/results_4_footnotes.csv")
pd.concat(
    (
        results_4["result"].value_counts().to_frame("absolute"),
        results_4["result"].value_counts(normalize=True).to_frame("relative"),
    ),
    axis=1,
)

Unnamed: 0,absolute,relative
NO_FOOTNOTES_ON_PAGE,181,0.721116
POSITIVE,61,0.243028
FALSE_NEGATIVE,9,0.035857


In [16]:
counts = results_4["result"].value_counts()
tp, fn = counts["POSITIVE"], counts["FALSE_NEGATIVE"]
fp = len(results_4) - tp - fn - counts["NO_FOOTNOTES_ON_PAGE"]
print("Precision:", tp / (tp + fp))
print("Recall:", round(tp / (tp + fn), 2))
print("Pages containing footnotes:", round((tp + fn) / len(results_4), 2))

Precision: 1.0
Recall: 0.87
Pages containing footnotes: 0.28


## 5. Footnote resolution

### Experimental setup

##### (Experimentally) resolving footnotes

In [1]:
from footnote_resolution_v1 import extract_pdf, line_to_text

In [6]:
results = {}
for _, row in attachments.iterrows():
    pages = extract_pdf(row["filename"])
    for page, (lines, footnotes, replacements) in enumerate(pages):
        if len(footnotes) == 0:
            continue

        results[(row["id"], page + 1)] = (lines, footnotes, replacements)

  lines = pd.Series(lines).value_counts()


##### Selecting a sample

In [7]:
pdfs = list(set(map(lambda x: x[0], results.keys())))

my_rng = rng(5)
pdfs = my_rng.choice(pdfs, 128, shuffle=True, replace=False)

sample_5 = []
for pdf in pdfs:
    pages = list(map(lambda x: x[1], filter(lambda x: x[0] == pdf, results.keys())))
    page = my_rng.choice(pages)
    sample_5.append(dict(pdf=pdf, page=page, results=results[(pdf, page)]))

##### Creating an evaluation

In [8]:
sample_5_out = []

for sample in sample_5:
    content = ""
    lines, footnotes, replacements = sample["results"]
    one_resolved = False
    for footnote in footnotes:
        content += (
            f"<code style=\"font-weight: 1000;\">Foonote: {footnote['text']}</code><br>"
        )
        repl = list(filter(lambda x: x["footnote"] == footnote, replacements))
        if len(repl) > 0:
            line = repl[0]["replacement"][0]
            first_repl = line_to_text(lines[line], clean=True)[
                len(line_to_text([footnote], clean=True)) :
            ]
            content += f"<code>{first_repl.replace('<', '〈')}</code><br>"

            for line in repl[0]["replacement"][1:]:
                content += f"    <code>{line_to_text(lines[line], clean=True).replace('<', '〈')}</code><br>"

            one_resolved = True
        else:
            content += '    <code style="font-weight: 1000; background-color: #FF0000;">No resolution</code>'
        content += "<br><br>"
    sample_5_out.append(
        dict(
            id=sample["pdf"],
            page=sample["page"],
            content=content,
            result=None
            if "#FF0000" not in content
            else "SOME_UNRESOLVED"
            if one_resolved
            else "NONE_RESOLVED",
        )
    )

sample_5_out = pd.DataFrame(sample_5_out)
sample_5_out["source"] = (
    "attachments/"
    + sample_5_out["id"].astype(str)
    + ".pdf#page="
    + sample_5_out["page"].astype(str)
)
sample_5_out["task"] = (
    "Are footnotes correctly resolved? ("
    + "Anything wrong, including another footnote's text inside a footnotes resolution? 👎 "
    + "Some footnotes are missing but the one's that are there are good? 👌 "
    + "All footnotes resolved correctly? 👍) "
)

sample_5_out.to_csv("../02_eval_tool/evaluations/pdf_footnote_parsing.csv", index=False)

### Results

In [14]:
pd.read_csv("results/results_5_footnote_parsing.csv")["result"].value_counts(
    normalize=True
).round(3)

POSITIVE               0.750
NONE_RESOLVED          0.141
WRONG_RESOLUTION       0.055
SOME_UNRESOLVED        0.039
FOOTNOTE_UNDETECTED    0.016
Name: result, dtype: float64