Skip to content

Commit

Permalink
added pdf support
Browse files Browse the repository at this point in the history
  • Loading branch information
Dean Malmgren committed Jul 7, 2014
1 parent e9860b8 commit 184b313
Show file tree
Hide file tree
Showing 13 changed files with 70 additions and 10 deletions.
6 changes: 5 additions & 1 deletion bin/textract
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,18 @@ parser.add_argument(
'-o', '--output', type=argparse.FileType('w'), default='-',
help='Put raw text in here.',
)
parser.add_argument(
'-m', '--method', default='',
help='Specify a particular method of extraction here.',
)

# enable autocompletion with argcomplete
argcomplete.autocomplete(parser)

# extract text
args = parser.parse_args()
try:
output = process(args.filename)
output = process(**vars(args))
except CommandLineError, e:
sys.stderr.write(red(e) + '\n')
else:
Expand Down
6 changes: 5 additions & 1 deletion docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,14 @@ or the :ref:`python package <python-package>`
Currently supporting
--------------------

* ``.doc`` via `antiword <http://www.winfield.demon.nl/>`__

* ``.docx`` via `python-docx <https://python-docx.readthedocs.org/en/latest/>`__

* ``.pptx`` via `python-pptx <https://python-pptx.readthedocs.org/en/latest/>`__

* ``.pdf`` via `pdftotext <http://poppler.freedesktop.org/>`__ (default) or `pdfminer <https://euske.github.io/pdfminer/>`__

Installation
------------

Expand All @@ -50,7 +54,7 @@ you will need to run:

.. code-block:: bash
apt-get install python-dev libxml2-dev libxslt1-dev antiword
apt-get install python-dev libxml2-dev libxslt1-dev antiword poppler-utils
before running:

Expand Down
7 changes: 7 additions & 0 deletions docs/python_package.rst
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,13 @@ textract.parsers.docx
:members:


textract.parsers.pdf
---------------------

.. automodule:: textract.parsers.pdf
:members:


textract.parsers.pptx
---------------------

Expand Down
3 changes: 3 additions & 0 deletions requirements/debian
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,6 @@ libxslt1-dev

# parse word documents
antiword

# parse pdfs
poppler-utils
1 change: 1 addition & 0 deletions requirements/python
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
argcomplete
python-pptx
python-docx
pdfminer
Binary file added tests/pdf/i_heart_pdfs.pdf
Binary file not shown.
11 changes: 8 additions & 3 deletions tests/run_functional_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,15 @@ update_status () {
# function for running test on a specific example to validate that the
# checksum of results is consistent
validate_example () {
example=$1
test_checksum=$2
# manipulate the list of arguments passed to this function via
# http://stackoverflow.com/a/10308353/564709
args=("$@")
test_checksum=${args[-1]}
unset args[${#args[@]}-1]

# run textract on an example document and make sure the md5sum is
# the same as what we expect
textract $example > dummy.txt
textract "${args[@]}" > dummy.txt
update_status $? ''
local_checksum=$(md5sum dummy.txt | awk '{print $1}')
rm -f dummy.txt
Expand All @@ -65,6 +68,8 @@ validate_example () {
validate_example ${BASEDIR}/docx/i_heart_word.docx 35b515d5e9d68af496f9233eb81547be
validate_example ${BASEDIR}/pptx/i_love_powerpoint.pptx a5bc9cbe9284d4c81c1106a8137e4a4d
validate_example ${BASEDIR}/doc/i_heart_word.doc 95720710c2eac172e1e05e86e02964f0
validate_example ${BASEDIR}/pdf/i_heart_pdfs.pdf 06719d714211174a3851ac4cee880fe1
validate_example -m pdfminer ${BASEDIR}/pdf/i_heart_pdfs.pdf d4377783e5fbde756d3a195bfd103be0

# exit with the sum of the status
exit ${EXIT_CODE}
10 changes: 10 additions & 0 deletions textract/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,16 @@ def __str__(self):
))


class UnknownMethod(CommandLineError):
    """Error for an extraction method that a parser does not recognise.

    The offending method name is kept on the instance as ``method``.
    """

    def __init__(self, method):
        # NOTE(review): presumably ``render`` fills the ``%(method)s``
        # placeholder from this attribute -- confirm against the base class.
        self.method = method

    def __str__(self):
        template = 'The method "%(method)s" can not be found for this filetype.'
        return self.render(template)


class ShellError(CommandLineError):
def __init__(self, exit_code):
self.exit_code = exit_code
Expand Down
4 changes: 2 additions & 2 deletions textract/parsers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from .. import exceptions


def process(filename):
def process(filename, **kwargs):
"""This is the core function used for parsing. It routes the filename
to the appropriate parser and returns the result.
"""
Expand All @@ -23,4 +23,4 @@ def process(filename):
except ImportError, e:
raise exceptions.ExtensionNotSupported(ext)

return filetype_module.extract(filename)
return filetype_module.extract(filename, **kwargs)
2 changes: 1 addition & 1 deletion textract/parsers/doc.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from ..shell import run


def extract(filename):
def extract(filename, **kwargs):
"""Extract text from doc files using antiword.
"""
pipe = run('antiword %(filename)s' % locals())
Expand Down
2 changes: 1 addition & 1 deletion textract/parsers/docx.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from ..utils import non_local_import


def extract(filename):
def extract(filename, **kwargs):
"""Extract text from docx file using python-docx.
"""
docx = non_local_import('docx')
Expand Down
26 changes: 26 additions & 0 deletions textract/parsers/pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from ..shell import run
from ..exceptions import UnknownMethod


def extract(filename, method=None, **kwargs):
    """Extract text from the pdf at ``filename`` with the chosen backend.

    ``method`` selects the backend: ``'pdftotext'`` (also used when
    ``method`` is ``None`` or otherwise falsy, e.g. an empty string) or
    ``'pdfminer'``. Any other value raises ``UnknownMethod``. Additional
    keyword arguments are accepted and ignored.
    """
    chosen = method if method else 'pdftotext'

    if chosen == 'pdftotext':
        return extract_pdftotext(filename)
    if chosen == 'pdfminer':
        return extract_pdfminer(filename)

    # nothing matched: report the name that was actually requested
    raise UnknownMethod(chosen)


def extract_pdftotext(filename):
    """Extract text from pdfs using the pdftotext command line utility.

    ``filename`` is shell-quoted before it is interpolated into the
    command line, so a path containing spaces or shell metacharacters is
    handed to ``pdftotext`` as one literal argument instead of being
    split or interpreted. The trailing ``-`` asks ``pdftotext`` to write
    the text to standard output, which is read and returned.
    """
    try:
        from shlex import quote  # Python 3.3+
    except ImportError:
        from pipes import quote  # Python 2

    # NOTE(review): assumes ``run`` passes this string to a shell (or a
    # shlex-style splitter) -- confirm against ``..shell.run``.
    command = 'pdftotext %s -' % quote(filename)
    pipe = run(command)
    return pipe.stdout.read()


def extract_pdfminer(filename):
    """Extract text from pdfs using pdfminer's ``pdf2txt.py`` script.

    ``filename`` is shell-quoted before it is interpolated into the
    command line, so a path containing spaces or shell metacharacters is
    handed to ``pdf2txt.py`` as one literal argument instead of being
    split or interpreted. The command's standard output is read and
    returned.
    """
    try:
        from shlex import quote  # Python 3.3+
    except ImportError:
        from pipes import quote  # Python 2

    # NOTE(review): assumes ``run`` passes this string to a shell (or a
    # shlex-style splitter) -- confirm against ``..shell.run``.
    command = 'pdf2txt.py %s' % quote(filename)
    pipe = run(command)
    return pipe.stdout.read()
2 changes: 1 addition & 1 deletion textract/parsers/pptx.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from ..utils import non_local_import


def extract(filename):
def extract(filename, **kwargs):
"""Extract text from pptx file using python-pptx
"""
pptx = non_local_import('pptx')
Expand Down

0 comments on commit 184b313

Please sign in to comment.