added pdf support

deanmalmgren · Jul 7, 2014 · 184b313 · 184b313
1 parent e9860b8
commit 184b313
Show file tree

Hide file tree

Showing 13 changed files with 70 additions and 10 deletions.
diff --git a/bin/textract b/bin/textract
@@ -23,14 +23,18 @@ parser.add_argument(
     '-o', '--output', type=argparse.FileType('w'), default='-',
     help='Put raw text in here.',
 )
+parser.add_argument(
+    '-m', '--method', default='',
+    help='Specify a particular method of extraction here.',
+)
 
 # enable autocompletion with argcomplete
 argcomplete.autocomplete(parser)
 
 # extract text
 args = parser.parse_args()
 try:
-    output = process(args.filename)
+    output = process(**vars(args))
 except CommandLineError, e:
     sys.stderr.write(red(e) + '\n')
 else:

diff --git a/docs/index.rst b/docs/index.rst
@@ -35,10 +35,14 @@ or the :ref:`python package <python-package>`
 Currently supporting
 --------------------
 
+* ``.doc`` via `antiword <http://www.winfield.demon.nl/>`__
+
 * ``.docx`` via `python-docx <https://python-docx.readthedocs.org/en/latest/>`__
 
 * ``.pptx`` via `python-pptx <https://python-pptx.readthedocs.org/en/latest/>`__
 
+* ``.pdf`` via `pdftotext <http://poppler.freedesktop.org/>`__ (default) or `pdfminer <https://euske.github.io/pdfminer/>`__
+
 Installation
 ------------
 
@@ -50,7 +54,7 @@ you will need to run:
 
 .. code-block:: bash
 
-    apt-get install python-dev libxml2-dev libxslt1-dev antiword
+    apt-get install python-dev libxml2-dev libxslt1-dev antiword poppler-utils
 
 before running:
 

diff --git a/docs/python_package.rst b/docs/python_package.rst
@@ -30,6 +30,13 @@ textract.parsers.docx
    :members:
 
 
+textract.parsers.pdf
+---------------------
+
+.. automodule:: textract.parsers.pdf
+   :members:
+
+
 textract.parsers.pptx
 ---------------------
 

diff --git a/requirements/debian b/requirements/debian
@@ -6,3 +6,6 @@ libxslt1-dev
 
 # parse word documents
 antiword
+
+# parse pdfs
+poppler-utils
diff --git a/requirements/python b/requirements/python
@@ -3,3 +3,4 @@
 argcomplete
 python-pptx
 python-docx
+pdfminer
diff --git a/tests/pdf/i_heart_pdfs.pdf b/tests/pdf/i_heart_pdfs.pdf
diff --git a/tests/run_functional_tests.sh b/tests/run_functional_tests.sh
@@ -37,12 +37,15 @@ update_status () {
 # function for running test on a specific example to validate that the
 # checksum of results is consistent
 validate_example () {
-    example=$1
-    test_checksum=$2
+    # the last argument is the expected checksum; all preceding arguments
+    # are passed to textract (see http://stackoverflow.com/a/10308353/564709)
+    args=("$@")
+    test_checksum=${args[-1]}
+    unset args[${#args[@]}-1]
 
     # run textract on an example document and make sure the md5sum is
     # the same as what we expect
-    textract $example > dummy.txt
+    textract "${args[@]}" > dummy.txt
     update_status $? ''
     local_checksum=$(md5sum dummy.txt | awk '{print $1}')
     rm -f dummy.txt
@@ -65,6 +68,8 @@ validate_example () {
 validate_example ${BASEDIR}/docx/i_heart_word.docx 35b515d5e9d68af496f9233eb81547be
 validate_example ${BASEDIR}/pptx/i_love_powerpoint.pptx a5bc9cbe9284d4c81c1106a8137e4a4d
 validate_example ${BASEDIR}/doc/i_heart_word.doc 95720710c2eac172e1e05e86e02964f0
+validate_example ${BASEDIR}/pdf/i_heart_pdfs.pdf 06719d714211174a3851ac4cee880fe1
+validate_example -m pdfminer ${BASEDIR}/pdf/i_heart_pdfs.pdf d4377783e5fbde756d3a195bfd103be0
 
 # exit with the sum of the status
 exit ${EXIT_CODE}
diff --git a/textract/exceptions.py b/textract/exceptions.py
@@ -31,6 +31,16 @@ def __str__(self):
         ))
 
 
+class UnknownMethod(CommandLineError):
+    def __init__(self, method):
+        self.method = method
+
+    def __str__(self):
+        return self.render((
+            'The method "%(method)s" can not be found for this filetype.'
+        ))
+
+
 class ShellError(CommandLineError):
     def __init__(self, exit_code):
         self.exit_code = exit_code

diff --git a/textract/parsers/__init__.py b/textract/parsers/__init__.py
@@ -4,7 +4,7 @@
 from .. import exceptions
 
 
-def process(filename):
+def process(filename, **kwargs):
     """This is the core function used for parsing. It routes the filename
     to the appropriate parser and returns the result.
     """
@@ -23,4 +23,4 @@ def process(filename):
     except ImportError, e:
         raise exceptions.ExtensionNotSupported(ext)
 
-    return filetype_module.extract(filename)
+    return filetype_module.extract(filename, **kwargs)
diff --git a/textract/parsers/doc.py b/textract/parsers/doc.py
@@ -1,7 +1,7 @@
 from ..shell import run
 
 
-def extract(filename):
+def extract(filename, **kwargs):
     """Extract text from doc files using antiword.
     """
     pipe = run('antiword %(filename)s' % locals())

diff --git a/textract/parsers/docx.py b/textract/parsers/docx.py
@@ -1,7 +1,7 @@
 from ..utils import non_local_import
 
 
-def extract(filename):
+def extract(filename, **kwargs):
     """Extract text from docx file using python-docx.
     """
     docx = non_local_import('docx')

diff --git a/textract/parsers/pdf.py b/textract/parsers/pdf.py
@@ -0,0 +1,26 @@
+from ..shell import run
+from ..exceptions import UnknownMethod
+
+
+def extract(filename, method=None, **kwargs):
+    """Extract text from pdf files using ``method``.
+    """
+    method = method or 'pdftotext'
+    if method == 'pdftotext':
+        return extract_pdftotext(filename)
+    elif method == 'pdfminer':
+        return extract_pdfminer(filename)
+    else:
+        raise UnknownMethod(method)
+
+
+def extract_pdftotext(filename):
+    """Extract text from pdfs using the pdftotext command line utility."""
+    pipe = run('pdftotext %(filename)s -' % locals())
+    return pipe.stdout.read()
+
+
+def extract_pdfminer(filename):
+    """Extract text from pdfs using pdfminer."""
+    pipe = run('pdf2txt.py %(filename)s' % locals())
+    return pipe.stdout.read()
diff --git a/textract/parsers/pptx.py b/textract/parsers/pptx.py
@@ -1,7 +1,7 @@
 from ..utils import non_local_import
 
 
-def extract(filename):
+def extract(filename, **kwargs):
     """Extract text from pptx file using python-pptx
     """
     pptx = non_local_import('pptx')