Add PDF text extraction (#109)
tanaysoni committed Jun 8, 2020
1 parent 479fcb1 commit ef9e4f4
Showing 19 changed files with 424 additions and 65 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
@@ -1,5 +1,5 @@
language: python
sudo: false
sudo: true
cache: pip
python:
- "3.7"
7 changes: 6 additions & 1 deletion README.rst
@@ -228,6 +228,11 @@ You will find the Swagger API documentation at http://127.0.0.1:80/docs
.. image:: https://raw.githubusercontent.com/deepset-ai/haystack/master/docs/img/annotation_tool.png


7. Development
7. Indexing PDF files
______________________

Haystack has a customizable PDF text extraction pipeline with cleaning functions for headers, footers, and tables. It supports complex document layouts with multi-column text.
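
A minimal usage sketch of the new converter (the file path and parameter values here are illustrative)::

    from haystack.indexing.file_converters.pdftotext import PDFToTextConverter

    converter = PDFToTextConverter(remove_numeric_tables=True, remove_header_footer=True)
    pages = converter.extract_pages("sample.pdf")  # one string of text per page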

8. Development
-------------------
* Unit tests can be executed by running :code:`tox`.
Empty file.
44 changes: 44 additions & 0 deletions haystack/indexing/file_converters/base.py
@@ -0,0 +1,44 @@
from abc import abstractmethod
from pathlib import Path
from typing import List


class BaseConverter:
"""
Base class for implementing file converters that transform input documents into text for indexing in a database.
"""

def __init__(
self,
remove_numeric_tables: bool = None,
remove_header_footer: bool = None,
remove_whitespace: bool = None,
remove_empty_lines: bool = None,
valid_languages: List[str] = None,
):
"""
:param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
The tabular structures in documents might be noise for the reader model if it
does not have table parsing capability for finding answers. However, tables
may also have long strings that could be possible candidates for answers.
The rows containing strings are therefore retained when this option is enabled.
:param remove_header_footer: use heuristic to remove footers and headers across different pages by searching
for the longest common string. This heuristic uses exact matches and therefore
works well for footers like "Copyright 2019 by XXX", but won't detect "Page 3 of 4"
or similar.
:param remove_whitespace: strip whitespace before and after each line in the text.
:param remove_empty_lines: collapse multiple consecutive empty lines in the text into a single empty line.
:param valid_languages: validate languages from a list of languages specified in the ISO 639-1
(https://en.wikipedia.org/wiki/ISO_639-1) format.
This option can be used to add a test for encoding errors. If the extracted text is
not in one of the valid languages, it is likely the result of an encoding error that
produced garbled text.
"""
self.remove_numeric_tables = remove_numeric_tables
self.remove_header_footer = remove_header_footer
self.remove_whitespace = remove_whitespace
self.remove_empty_lines = remove_empty_lines
self.valid_languages = valid_languages

@abstractmethod
def extract_pages(self, file_path: Path) -> List[str]:
pass
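
As an illustration of the interface, a hypothetical converter subclassing BaseConverter might look like this (the TextFileConverter name and its form-feed page splitting are illustrative, not part of this commit):

from pathlib import Path
from typing import List

from haystack.indexing.file_converters.base import BaseConverter


class TextFileConverter(BaseConverter):
    # Hypothetical converter: reads a plain-text file and treats form feeds ("\f") as page breaks.
    def extract_pages(self, file_path: Path) -> List[str]:
        text = Path(file_path).read_text()
        pages = text.split("\f")
        if self.remove_whitespace:
            pages = ["\n".join(line.strip() for line in page.splitlines()) for page in pages]
        return pages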
230 changes: 230 additions & 0 deletions haystack/indexing/file_converters/pdftotext.py
@@ -0,0 +1,230 @@
import logging
import re
import subprocess
from functools import partial, reduce
from itertools import chain
from pathlib import Path
from typing import List

import fitz
import langdetect

from haystack.indexing.file_converters.base import BaseConverter

logger = logging.getLogger(__name__)


class PDFToTextConverter(BaseConverter):
def __init__(
self,
remove_numeric_tables: bool = False,
remove_whitespace: bool = None,
remove_empty_lines: bool = None,
remove_header_footer: bool = None,
valid_languages: List[str] = None,
):
"""
:param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
The tabular structures in documents might be noise for the reader model if it
does not have table parsing capability for finding answers. However, tables
may also have long strings that could be possible candidates for answers.
The rows containing strings are therefore retained when this option is enabled.
:param remove_whitespace: strip whitespace before and after each line in the text.
:param remove_empty_lines: collapse multiple consecutive empty lines in the text into a single empty line.
:param remove_header_footer: use heuristic to remove footers and headers across different pages by searching
for the longest common string. This heuristic uses exact matches and therefore
works well for footers like "Copyright 2019 by XXX", but won't detect "Page 3 of 4"
or similar.
:param valid_languages: validate languages from a list of languages specified in the ISO 639-1
(https://en.wikipedia.org/wiki/ISO_639-1) format.
This option can be used to add a test for encoding errors. If the extracted text is
not in one of the valid languages, it is likely the result of an encoding error that
produced garbled text.
"""
verify_installation = subprocess.run("pdftotext -v", shell=True)
if verify_installation.returncode == 127:
raise Exception(
"""pdftotext is not installed. It is part of xpdf or poppler-utils software suite.
Installation on Linux:
wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.02.tar.gz &&
tar -xvf xpdf-tools-linux-4.02.tar.gz && sudo cp xpdf-tools-linux-4.02/bin64/pdftotext /usr/local/bin
Installation on MacOS:
brew install xpdf
You can find more details here: https://www.xpdfreader.com
"""
)

super().__init__(
remove_numeric_tables=remove_numeric_tables,
remove_whitespace=remove_whitespace,
remove_empty_lines=remove_empty_lines,
remove_header_footer=remove_header_footer,
valid_languages=valid_languages,
)

def extract_pages(self, file_path: Path) -> List[str]:

page_count = fitz.open(file_path).pageCount

pages = []
for page_number in range(1, page_count + 1):
# pdftotext tool provides an option to retain the original physical layout of a PDF page. This behaviour
# can be toggled by using the layout param.
# layout=True
# + table structures get retained better
# - multi-column pages (e.g., research papers) get extracted with text from multiple columns on the same line
# layout=False
# + keeps strings in content stream order, hence multi column layout works well
# - cells of tables get split across lines
#
# Here, as a "safe" default, layout is turned off.
page = self._extract_page(file_path, page_number, layout=False)
lines = page.splitlines()
cleaned_lines = []
for line in lines:
words = line.split()
digits = [word for word in words if any(i.isdigit() for i in word)]

# remove lines having > 40% of words as digits AND not ending with a period(.)
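# e.g. "87 462 1295" is dropped, while "The results are summarized in Table 3." is kept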
if self.remove_numeric_tables:
if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
logger.debug(f"Removing line '{line}' from {file_path}")
continue

if self.remove_whitespace:
line = line.strip()

cleaned_lines.append(line)

page = "\n".join(cleaned_lines)

if self.remove_empty_lines:
page = re.sub(r"\n\n+", "\n\n", page)

pages.append(page)

if self.valid_languages:
document_text = "".join(pages)
if not self._validate_language(document_text):
logger.warning(
f"The language for {file_path} is not one of {self.valid_languages}. The file may not have "
f"been decoded in the correct text format."
)

if self.remove_header_footer:
pages, header, footer = self.find_and_remove_header_footer(
pages, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
)
logger.info(f"Removed header '{header}' and footer {footer} in {file_path}")

return pages

def _extract_page(self, file_path: Path, page_number: int, layout: bool):
"""
Extract a page from the pdf file at file_path.
:param file_path: path of the pdf file
:param page_number: page number to extract(starting from 1)
:param layout: whether to retain the original physical layout for a page. If disabled, PDF pages are read in
the content stream order.
"""
if layout:
command = ["pdftotext", "-layout", "-f", str(page_number), "-l", str(page_number), file_path, "-"]
else:
command = ["pdftotext", "-f", str(page_number), "-l", str(page_number), file_path, "-"]
output_page = subprocess.run(command, capture_output=True, shell=False)
page = output_page.stdout.decode(errors="ignore")
return page

def _validate_language(self, text: str):
"""
Validate whether the language of the text is one of the valid languages.
"""
try:
lang = langdetect.detect(text)
except langdetect.lang_detect_exception.LangDetectException:
lang = None

if lang in self.valid_languages:
return True
else:
return False

def _ngram(self, seq: str, n: int):
"""
Return n-grams of tokens (currently split by whitespace).
:param seq: str, string from which the ngram shall be created
:param n: int, n of ngram
:return: generator of n-gram strings
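Example: _ngram("Page 3 of 4", n=2) yields "Page 3", "3 of", "of 4".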
"""

# In order to maintain the original whitespace, but still consider \n and \t for n-gram tokenization,
# we add a space here and remove it again after the n-grams are created (see below)
seq = seq.replace("\n", " \n")
seq = seq.replace("\t", " \t")

seq = seq.split(" ")
ngrams = (
" ".join(seq[i : i + n]).replace(" \n", "\n").replace(" \t", "\t") for i in range(0, len(seq) - n + 1)
)

return ngrams

def _allngram(self, seq: str, min_ngram: int, max_ngram: int):
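"""
Return the set of all n-grams of seq (as produced by _ngram) for n in range(min_ngram, max_ngram),
or in range(min_ngram, len(seq)) if max_ngram is not given.
"""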
lengths = range(min_ngram, max_ngram) if max_ngram else range(min_ngram, len(seq))
ngrams = map(partial(self._ngram, seq), lengths)
res = set(chain.from_iterable(ngrams))
return res

def find_longest_common_ngram(self, sequences: List[str], max_ngram: int = 30, min_ngram: int = 3):
"""
Find the longest common ngram across different text sequences (e.g. start of pages).
All n-grams within the specified length range are considered. Helpful for finding footers, headers, etc.
:param sequences: list[str], list of strings that shall be searched for common n_grams
:param max_ngram: int, maximum length of ngram to consider
:param min_ngram: minimum length of ngram to consider
:return: str, common string of all sections
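Example: if every page ends with the footer "Copyright 2019 by XXX" (and the surrounding text differs), that footer is returned.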
"""

seqs_ngrams = map(partial(self._allngram, min_ngram=min_ngram, max_ngram=max_ngram), sequences)
intersection = reduce(set.intersection, seqs_ngrams)

try:
longest = max(intersection, key=len)
except ValueError:
# no common sequence found
longest = ""
return longest if longest.strip() else None

def find_and_remove_header_footer(
self, pages: List[str], n_chars: int, n_first_pages_to_ignore: int, n_last_pages_to_ignore: int
):
"""
Heuristic to find footers and headers across different pages by searching for the longest common string.
For headers we only search in the first n_chars characters (for footer: last n_chars).
Note: This heuristic uses exact matches and therefore works well for footers like "Copyright 2019 by XXX",
but won't detect "Page 3 of 4" or similar.
:param pages: list of strings, one string per page
:param n_chars: number of first/last characters where the header/footer shall be searched in
:param n_first_pages_to_ignore: number of first pages to ignore (e.g. TOCs often don't contain footer/header)
:param n_last_pages_to_ignore: number of last pages to ignore
:return: (cleaned pages, found_header_str, found_footer_str)
"""

# header
start_of_pages = [p[:n_chars] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
found_header = self.find_longest_common_ngram(start_of_pages)
if found_header:
pages = [page.replace(found_header, "") for page in pages]

# footer
end_of_pages = [p[-n_chars:] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
found_footer = self.find_longest_common_ngram(end_of_pages)
if found_footer:
pages = [page.replace(found_footer, "") for page in pages]
return pages, found_header, found_footer
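
A short sketch of the language-validation option (the file path and language codes are illustrative); if the extracted text is not detected as one of the given languages, the converter logs a warning about possible encoding problems:

from pathlib import Path

from haystack.indexing.file_converters.pdftotext import PDFToTextConverter

converter = PDFToTextConverter(valid_languages=["en", "de"])
pages = converter.extract_pages(Path("report.pdf"))  # warns if the text looks mis-encoded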
79 changes: 38 additions & 41 deletions haystack/indexing/io.py → haystack/indexing/utils.py
@@ -4,57 +4,53 @@
import tempfile
import tarfile
import zipfile
from typing import Callable, List
from haystack.indexing.file_converters.pdftotext import PDFToTextConverter

logger = logging.getLogger(__name__)


def write_documents_to_db(document_store, document_dir, clean_func=None, only_empty_db=False, split_paragraphs=False):
def convert_files_to_dicts(dir_path: str, clean_func: Callable = None, split_paragraphs: bool = False) -> List[dict]:
"""
Write all text files(.txt) in the sub-directories of the given path to the connected database.
Convert all files (.txt, .pdf) in the sub-directories of the given path to Python dicts that can be written to a
Document Store.
:param document_dir: path for the documents to be written to the database
:param dir_path: path of the directory containing the files to convert
:param clean_func: a custom cleaning function that gets applied to each doc (input: str, output:str)
:param only_empty_db: If true, docs will only be written if db is completely empty.
Useful to avoid indexing the same initial docs again and again.
:param split_paragraphs: split text into paragraphs.
:return: None
"""
file_paths = Path(document_dir).glob("**/*.txt")

# check if db has already docs
if only_empty_db:
n_docs = document_store.get_document_count()
if n_docs > 0:
logger.info(f"Skip writing documents since DB already contains {n_docs} docs ... "
"(Disable `only_empty_db`, if you want to add docs anyway.)")
return None

# read and add docs
docs_to_index = []

file_paths = [p for p in Path(dir_path).glob("**/*") if p.is_file()]
if ".pdf" in [p.suffix.lower() for p in file_paths]:
pdf_converter = PDFToTextConverter()
else:
pdf_converter = None

documents = []
for path in file_paths:
with open(path) as doc:
text = doc.read()
if clean_func:
text = clean_func(text)

if split_paragraphs:
for para in text.split("\n\n"):
if not para.strip(): # skip empty paragraphs
continue
docs_to_index.append(
{
"name": path.name,
"text": para
}
)
else:
docs_to_index.append(
{
"name": path.name,
"text": text
}
)
document_store.write_documents(docs_to_index)
logger.info(f"Wrote {len(docs_to_index)} docs to DB")
if path.suffix.lower() == ".txt":
with open(path) as doc:
text = doc.read()
elif path.suffix.lower() == ".pdf":
pages = pdf_converter.extract_pages(path)
text = "\n".join(pages)
else:
raise Exception(f"Indexing of {path.suffix} files is not currently supported.")

if clean_func:
text = clean_func(text)

if split_paragraphs:
for para in text.split("\n\n"):
if not para.strip(): # skip empty paragraphs
continue
documents.append({"name": path.name, "text": para})
else:
documents.append({"name": path.name, "text": text})

return documents
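
# A sketch of how the returned dicts might be passed on to a document store (the
# `document_store` object and directory path below are placeholders, not part of this module):
#
#   dicts = convert_files_to_dicts(dir_path="data/my_docs", split_paragraphs=True)
#   # each dict has the shape {"name": <file name>, "text": <paragraph or full text>}
#   document_store.write_documents(dicts)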


def fetch_archive_from_http(url, output_dir, proxies=None):
@@ -97,3 +93,4 @@ def fetch_archive_from_http(url, output_dir, proxies=None):
archive.extractall(output_dir)
# temp_file gets deleted here
return True
