Add PDF parser for indexing #109

Merged · 32 commits · Jun 8, 2020
Commits
f41accf
Add PDF parser for indexing
tanaysoni May 13, 2020
adb5458
Add param to remove headers in PDF
tanaysoni May 13, 2020
f81f23b
Add xpdf conversion and header/footer cleaning
tanaysoni May 25, 2020
239f3d0
Add a module for file converters
tanaysoni May 28, 2020
ae08859
Remove unused requirements
tanaysoni May 28, 2020
fbe7115
Simplify pdftotext conversion pipeline
tanaysoni Jun 2, 2020
0198747
Add tests for PDF conversion
tanaysoni Jun 2, 2020
b00512e
Add sample pdf file for tests
tanaysoni Jun 2, 2020
5035a27
Remove xpdf install
tanaysoni Jun 2, 2020
86948d9
Update tests
tanaysoni Jun 2, 2020
465058c
Add PyMuPDF in requirements
tanaysoni Jun 2, 2020
6ce49d9
Add langdetect in requirements
tanaysoni Jun 2, 2020
8523de3
Add xpdf install in CI
tanaysoni Jun 2, 2020
5c57657
Add a pytest fixture for xpdf
tanaysoni Jun 3, 2020
e30281a
Remove xpdf install from CI
tanaysoni Jun 3, 2020
aac72a7
Raise privileges for CI
tanaysoni Jun 3, 2020
145aada
Remove type-cast to Document schema
tanaysoni Jun 3, 2020
d8680d3
Add sudo for pdftotext fixture
tanaysoni Jun 3, 2020
7a347b1
Make file extension case insensitive
tanaysoni Jun 4, 2020
6dc120f
Update xpdf download instructions
tanaysoni Jun 4, 2020
d45e72b
Simplify glob pattern for reading input files
tanaysoni Jun 4, 2020
b255372
Add check if pdftotext is already installed
tanaysoni Jun 5, 2020
b3e9bef
Add test for header/footer removal
tanaysoni Jun 5, 2020
7d4c6cf
Rename header removal params
tanaysoni Jun 5, 2020
83bd686
Add check if string is None for header/footer removal
tanaysoni Jun 5, 2020
23584cf
fix header/footer heuristic
tholor Jun 5, 2020
b7ed431
merge latest changes
tholor Jun 5, 2020
6624114
remove caplog from test
tholor Jun 5, 2020
9ce2a5c
Add docstrings and code formatting
tanaysoni Jun 5, 2020
4372f41
Rename indexing utils
tanaysoni Jun 5, 2020
3f765ea
Add PDF in README
tanaysoni Jun 8, 2020
fd1a203
Rename utils func in tests
tanaysoni Jun 8, 2020
2 changes: 1 addition & 1 deletion .travis.yml
@@ -1,5 +1,5 @@
language: python
sudo: false
sudo: true
cache: pip
python:
- "3.7"
Expand Down
7 changes: 6 additions & 1 deletion README.rst
@@ -228,6 +228,11 @@ You will find the Swagger API documentation at http://127.0.0.1:80/docs
.. image:: https://raw.githubusercontent.com/deepset-ai/haystack/master/docs/img/annotation_tool.png


7. Development
7. Indexing PDF files
______________________

Haystack has a customizable PDF text extraction pipeline with cleaning functions for headers, footers, and tables. It supports complex document layouts with multi-column text.
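
A minimal usage sketch of the converter added in this PR (the file path is hypothetical; the pdftotext binary from xpdf/poppler-utils must be installed)::

    from pathlib import Path
    from haystack.indexing.file_converters.pdftotext import PDFToTextConverter

    converter = PDFToTextConverter(remove_header_footer=True, valid_languages=["en"])
    pages = converter.extract_pages(Path("sample.pdf"))  # one string per PDF page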

8. Development
-------------------
* Unit tests can be executed by running :code:`tox`.
Empty file.
44 changes: 44 additions & 0 deletions haystack/indexing/file_converters/base.py
@@ -0,0 +1,44 @@
from abc import abstractmethod
from pathlib import Path
from typing import List, Optional


class BaseConverter:
"""
Base class for implementing file converters that transform input documents to text for indexing in a database.
"""

def __init__(
self,
remove_numeric_tables: Optional[bool] = None,
remove_header_footer: Optional[bool] = None,
remove_whitespace: Optional[bool] = None,
remove_empty_lines: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
):
"""
:param remove_numeric_tables: This option uses heuristics to remove numeric rows from tables.
Tabular structures in documents can be noise for a reader model that has no
table parsing capability for finding answers. However, tables may also contain
long strings that are possible candidates for answers.
Rows containing strings are therefore retained with this option.
:param remove_header_footer: use a heuristic to remove footers and headers across different pages by searching
for the longest common string. This heuristic uses exact matches and therefore
works well for footers like "Copyright 2019 by XXX", but won't detect "Page 3 of 4"
or similar.
:param remove_whitespace: strip leading and trailing whitespace from each line in the text.
:param remove_empty_lines: collapse runs of empty lines so that at most one empty line remains.
:param valid_languages: validate languages from a list of languages specified in the ISO 639-1
(https://en.wikipedia.org/wiki/ISO_639-1) format.
This option can be used to check for encoding errors. If the extracted text is
not in one of the valid languages, it is likely the result of an encoding error
that produced garbled text.
"""
self.remove_numeric_tables = remove_numeric_tables
self.remove_header_footer = remove_header_footer
self.remove_whitespace = remove_whitespace
self.remove_empty_lines = remove_empty_lines
self.valid_languages = valid_languages

@abstractmethod
def extract_pages(self, file_path: Path) -> List[str]:
pass
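
The remove_numeric_tables option documented above reduces to a per-line digit-ratio test, applied by the concrete converter below. A standalone sketch of that heuristic on a toy line (illustrative, not part of the PR):

line = "2019 45.3 12.8 9.1"
words = line.split()
digits = [word for word in words if any(char.isdigit() for char in word)]
# a table-like row: more than 40% of its words contain digits and it lacks a trailing period
should_drop = bool(words) and len(digits) / len(words) > 0.4 and not line.strip().endswith(".")
print(should_drop)  # True, so this row would be removed
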
230 changes: 230 additions & 0 deletions haystack/indexing/file_converters/pdftotext.py
@@ -0,0 +1,230 @@
import logging
import re
import subprocess
from functools import partial, reduce
from itertools import chain
from pathlib import Path
from typing import List, Optional

import fitz
import langdetect

from haystack.indexing.file_converters.base import BaseConverter

logger = logging.getLogger(__name__)


class PDFToTextConverter(BaseConverter):
def __init__(
self,
remove_numeric_tables: bool = False,
remove_whitespace: Optional[bool] = None,
remove_empty_lines: Optional[bool] = None,
remove_header_footer: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
):
"""
:param remove_numeric_tables: This option uses heuristics to remove numeric rows from tables.
Tabular structures in documents can be noise for a reader model that has no
table parsing capability for finding answers. However, tables may also contain
long strings that are possible candidates for answers.
Rows containing strings are therefore retained with this option.
:param remove_whitespace: strip leading and trailing whitespace from each line in the text.
:param remove_empty_lines: collapse runs of empty lines so that at most one empty line remains.
:param remove_header_footer: use a heuristic to remove footers and headers across different pages by searching
for the longest common string. This heuristic uses exact matches and therefore
works well for footers like "Copyright 2019 by XXX", but won't detect "Page 3 of 4"
or similar.
:param valid_languages: validate languages from a list of languages specified in the ISO 639-1
(https://en.wikipedia.org/wiki/ISO_639-1) format.
This option can be used to check for encoding errors. If the extracted text is
not in one of the valid languages, it is likely the result of an encoding error
that produced garbled text.
"""
verify_installation = subprocess.run("pdftotext -v", shell=True)
if verify_installation.returncode == 127:
raise Exception(
"""pdftotext is not installed. It is part of xpdf or poppler-utils software suite.

Installation on Linux:
wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.02.tar.gz &&
tar -xvf xpdf-tools-linux-4.02.tar.gz && sudo cp xpdf-tools-linux-4.02/bin64/pdftotext /usr/local/bin

Installation on MacOS:
brew install xpdf

You can find more details here: https://www.xpdfreader.com
"""
)

super().__init__(
remove_numeric_tables=remove_numeric_tables,
remove_whitespace=remove_whitespace,
remove_empty_lines=remove_empty_lines,
remove_header_footer=remove_header_footer,
valid_languages=valid_languages,
)

def extract_pages(self, file_path: Path) -> List[str]:

page_count = fitz.open(file_path).pageCount

pages = []
for page_number in range(1, page_count + 1):
# pdftotext provides an option to retain the original physical layout of a PDF page. This behaviour
# can be toggled with the layout param.
# layout=True
# + table structures are retained better
# - on multi-column pages (e.g., research papers), text from multiple columns ends up on the same line
# layout=False
# + keeps strings in content stream order, so multi-column layouts work well
# - table cells get split across lines
#
# Here, as a "safe" default, layout is turned off.
page = self._extract_page(file_path, page_number, layout=False)
lines = page.splitlines()
cleaned_lines = []
for line in lines:
words = line.split()
digits = [word for word in words if any(i.isdigit() for i in word)]

# remove lines having > 40% of words as digits AND not ending with a period (.)
if self.remove_numeric_tables:
if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
logger.debug(f"Removing line '{line}' from {file_path}")
continue

if self.remove_whitespace:
line = line.strip()

cleaned_lines.append(line)

page = "\n".join(cleaned_lines)

if self.remove_empty_lines:
page = re.sub(r"\n\n+", "\n\n", page)

pages.append(page)

if self.valid_languages:
document_text = "".join(pages)
if not self._validate_language(document_text):
logger.warning(
f"The language for {file_path} is not one of {self.valid_languages}. The file may not have "
f"been decoded in the correct text format."
)

if self.remove_header_footer:
pages, header, footer = self.find_and_remove_header_footer(
pages, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
)
logger.info(f"Removed header '{header}' and footer {footer} in {file_path}")

return pages

def _extract_page(self, file_path: Path, page_number: int, layout: bool):
"""
Extract a page from the pdf file at file_path.

:param file_path: path of the pdf file
:param page_number: page number to extract (starting from 1)
:param layout: whether to retain the original physical layout for a page. If disabled, PDF pages are read in
the content stream order.
"""
if layout:
command = ["pdftotext", "-layout", "-f", str(page_number), "-l", str(page_number), file_path, "-"]
else:
command = ["pdftotext", "-f", str(page_number), "-l", str(page_number), file_path, "-"]
output_page = subprocess.run(command, capture_output=True, shell=False)
page = output_page.stdout.decode(errors="ignore")
return page

def _validate_language(self, text: str):
"""
Validate whether the language of the text is one of the valid languages.
"""
try:
lang = langdetect.detect(text)
except langdetect.lang_detect_exception.LangDetectException:
lang = None

return lang in self.valid_languages

def _ngram(self, seq: str, n: int):
"""
Return ngram (of tokens - currently split by whitespace)
:param seq: str, string from which the ngram shall be created
:param n: int, n of ngram
:return: str, ngram as string
"""

# In order to maintain the original whitespace, but still consider \n and \t for n-gram tokenization,
# we add a space here and remove it after creation of the ngrams again (see below)
seq = seq.replace("\n", " \n")
seq = seq.replace("\t", " \t")

seq = seq.split(" ")
ngrams = (
" ".join(seq[i : i + n]).replace(" \n", "\n").replace(" \t", "\t") for i in range(0, len(seq) - n + 1)
)

return ngrams

def _allngram(self, seq: str, min_ngram: int, max_ngram: int):
lengths = range(min_ngram, max_ngram) if max_ngram else range(min_ngram, len(seq))
ngrams = map(partial(self._ngram, seq), lengths)
res = set(chain.from_iterable(ngrams))
return res

def find_longest_common_ngram(self, sequences: List[str], max_ngram: int = 30, min_ngram: int = 3):
"""
Find the longest common ngram across different text sequences (e.g., start of pages).
Considers all ngrams in the specified range. Helpful for finding footers, headers, etc.

:param sequences: list[str], list of strings that shall be searched for common n_grams
:param max_ngram: int, maximum length of ngram to consider
:param min_ngram: minimum length of ngram to consider
:return: str, common string of all sections
"""

seqs_ngrams = map(partial(self._allngram, min_ngram=min_ngram, max_ngram=max_ngram), sequences)
intersection = reduce(set.intersection, seqs_ngrams)

try:
longest = max(intersection, key=len)
except ValueError:
# no common sequence found
longest = ""
return longest if longest.strip() else None

def find_and_remove_header_footer(
self, pages: List[str], n_chars: int, n_first_pages_to_ignore: int, n_last_pages_to_ignore: int
):
"""
Heuristic to find footers and headers across different pages by searching for the longest common string.
For headers we only search in the first n_chars characters (for footer: last n_chars).
Note: This heuristic uses exact matches and therefore works well for footers like "Copyright 2019 by XXX",
but won't detect "Page 3 of 4" or similar.

:param pages: list of strings, one string per page
:param n_chars: number of first/last characters where the header/footer shall be searched in
:param n_first_pages_to_ignore: number of first pages to ignore (e.g. TOCs often don't contain footer/header)
:param n_last_pages_to_ignore: number of last pages to ignore
:return: (cleaned pages, found_header_str, found_footer_str)
"""

# header
start_of_pages = [p[:n_chars] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
found_header = self.find_longest_common_ngram(start_of_pages)
if found_header:
pages = [page.replace(found_header, "") for page in pages]

# footer
end_of_pages = [p[-n_chars:] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
found_footer = self.find_longest_common_ngram(end_of_pages)
if found_footer:
pages = [page.replace(found_footer, "") for page in pages]
return pages, found_header, found_footer
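
To make the header detection concrete, here is a toy run of find_longest_common_ngram (illustrative strings, not from the PR; instantiating the converter assumes pdftotext is installed, since the constructor checks for it):

converter = PDFToTextConverter()
pages = [
"ACME Corp Annual Report\nRevenue grew in 2019.",
"ACME Corp Annual Report\nCosts were stable.",
]
print(converter.find_longest_common_ngram(pages))  # "ACME Corp Annual Report"

find_and_remove_header_footer then strips that common string from every page.
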
79 changes: 38 additions & 41 deletions haystack/indexing/io.py → haystack/indexing/utils.py
@@ -4,57 +4,53 @@
import tempfile
import tarfile
import zipfile
from typing import Callable, List
from haystack.indexing.file_converters.pdftotext import PDFToTextConverter

logger = logging.getLogger(__name__)


def write_documents_to_db(document_store, document_dir, clean_func=None, only_empty_db=False, split_paragraphs=False):
def convert_files_to_dicts(dir_path: str, clean_func: Callable = None, split_paragraphs: bool = False) -> List[dict]:
"""
Write all text files(.txt) in the sub-directories of the given path to the connected database.
Convert all files (.txt, .pdf) in the sub-directories of the given path to Python dicts that can be written to a
Document Store.

:param document_dir: path for the documents to be written to the database
:param dir_path: path of the directory containing the documents to convert
:param clean_func: a custom cleaning function that gets applied to each doc (input: str, output:str)
:param only_empty_db: If true, docs will only be written if db is completely empty.
Useful to avoid indexing the same initial docs again and again.
:param split_paragraphs: split text into paragraphs.

:return: None
"""
file_paths = Path(document_dir).glob("**/*.txt")

# check if db has already docs
if only_empty_db:
n_docs = document_store.get_document_count()
if n_docs > 0:
logger.info(f"Skip writing documents since DB already contains {n_docs} docs ... "
"(Disable `only_empty_db`, if you want to add docs anyway.)")
return None

# read and add docs
docs_to_index = []

file_paths = [p for p in Path(dir_path).glob("**/*") if p.is_file()]  # skip sub-directories
if ".pdf" in [p.suffix.lower() for p in file_paths]:
pdf_converter = PDFToTextConverter()
else:
pdf_converter = None

documents = []
for path in file_paths:
with open(path) as doc:
text = doc.read()
if clean_func:
text = clean_func(text)

if split_paragraphs:
for para in text.split("\n\n"):
if not para.strip(): # skip empty paragraphs
continue
docs_to_index.append(
{
"name": path.name,
"text": para
}
)
else:
docs_to_index.append(
{
"name": path.name,
"text": text
}
)
document_store.write_documents(docs_to_index)
logger.info(f"Wrote {len(docs_to_index)} docs to DB")
if path.suffix.lower() == ".txt":
with open(path) as doc:
text = doc.read()
elif path.suffix.lower() == ".pdf":
pages = pdf_converter.extract_pages(path)
text = "\n".join(pages)
else:
raise Exception(f"Indexing of {path.suffix} files is not currently supported.")

if clean_func:
text = clean_func(text)

if split_paragraphs:
for para in text.split("\n\n"):
if not para.strip(): # skip empty paragraphs
continue
documents.append({"name": path.name, "text": para})
else:
documents.append({"name": path.name, "text": text})

return documents
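
An end-to-end sketch of the new indexing flow (the directory path is hypothetical, and document_store stands for any Haystack document store instance, e.g. an ElasticsearchDocumentStore):

from haystack.indexing.utils import convert_files_to_dicts

dicts = convert_files_to_dicts(dir_path="data/docs", split_paragraphs=True)
# each dict has the shape {"name": "sample.pdf", "text": "..."}
document_store.write_documents(dicts)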


def fetch_archive_from_http(url, output_dir, proxies=None):
@@ -97,3 +93,4 @@ def fetch_archive_from_http(url, output_dir, proxies=None):
archive.extractall(output_dir)
# temp_file gets deleted here
return True