Skip to content

Commit

Permalink
merge
Browse files Browse the repository at this point in the history
  • Loading branch information
Dean Malmgren committed Mar 31, 2017
2 parents f39bfef + c48d5f6 commit dfb2f64
Show file tree
Hide file tree
Showing 8 changed files with 60 additions and 5 deletions.
9 changes: 7 additions & 2 deletions docs/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ latest changes in development for next release

.. THANKS FOR CONTRIBUTING; MENTION WHAT YOU DID IN THIS SECTION HERE!
* Let the user provide file extension as an argument when the file name has no
extension (`#148`_ by `@motazsaad`_)

* Added ability to parse audio with ``pocketsphinx`` (`#122`_ by `@barrust`_)

* several bug fixes, including:
Expand Down Expand Up @@ -238,7 +241,7 @@ latest changes in development for next release


.. list of contributors that are linked to above. putting links here
to make the text above relatively clean
.. to make the text above relatively clean
.. _@akoumjian: https://github.com/akoumjian
.. _@anthonygarvan: https://github.com/anthonygarvan
Expand All @@ -254,6 +257,7 @@ latest changes in development for next release
.. _@jsmith-mploir: https://github.com/jsmith-mploir
.. _@kokxx: https://github.com/Kokxx
.. _@levivm: https://github.com/levivm
.. _@motazsaad: https://github.com/motazsaad
.. _@onionradish: https://github.com/onionradish
.. _@pierre-ernst: https://github.com/pierre-ernst
.. _@pudo: https://github.com/pudo
Expand All @@ -263,7 +267,7 @@ latest changes in development for next release


.. list of issues that have been resolved. putting links here to make
the text above relatively clean
.. the text above relatively clean
.. _#2: https://github.com/deanmalmgren/textract/issues/2
.. _#3: https://github.com/deanmalmgren/textract/issues/3
Expand Down Expand Up @@ -323,3 +327,4 @@ latest changes in development for next release
.. _#136: https://github.com/deanmalmgren/textract/issues/136
.. _#139: https://github.com/deanmalmgren/textract/issues/139
.. _#147: https://github.com/deanmalmgren/textract/issues/147
.. _#148: https://github.com/deanmalmgren/textract/issues/148
6 changes: 6 additions & 0 deletions docs/python_package.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,12 @@ inferred using `chardet <https://github.com/chardet/chardet>`_)::
import textract
text = textract.process('path/to/file.extension', encoding='ascii')

When the file name has no extension, you can specify the file's extension as an
argument to ``textract.process`` like this::

import textract
text = textract.process('path/to/file', extension='docx')

.. _additional-options:

Additional options
Expand Down
Binary file added tests/no_ext/docx_paragraphs_and_tables
Binary file not shown.
Binary file added tests/no_ext/msg_standardized_text
Binary file not shown.
Binary file added tests/no_ext/pdf_standardized_text
Binary file not shown.
27 changes: 27 additions & 0 deletions tests/test_no_ext.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import unittest
import os
import textract

class No_Ext_TestCase(unittest.TestCase):
    """Check that ``textract.process`` can parse files whose names carry
    no extension when the extension is passed explicitly as an argument.
    """

    @staticmethod
    def _fixture_path(basename):
        """Return the absolute path of ``basename`` inside the ``no_ext``
        fixture directory that sits next to this test module.
        """
        tests_dir = os.path.dirname(os.path.abspath(__file__))
        return os.path.join(tests_dir, "no_ext", basename)

    def _assert_extracts_text(self, basename, extension):
        """Process the extension-less fixture ``basename`` while passing
        ``extension`` explicitly, and fail if nothing was extracted.
        """
        text = textract.process(
            self._fixture_path(basename), extension=extension
        )
        # a bare call only proves that no exception was raised; also
        # require that some non-whitespace content actually came back
        self.assertTrue(text.strip())

    def test_docx(self):
        # extension given without the leading dot
        self._assert_extracts_text("docx_paragraphs_and_tables", "docx")

    def test_msg(self):
        # extension given without the leading dot
        self._assert_extracts_text("msg_standardized_text", "msg")

    def test_pdf(self):
        # extension given *with* the leading dot, which is also accepted
        self._assert_extracts_text("pdf_standardized_text", ".pdf")

7 changes: 7 additions & 0 deletions textract/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,13 @@ def get_parser():
choices=_get_available_encodings(),
help='Specify the encoding of the output.',
)
# deliberately no ``choices`` restriction here: valid values are file
# extensions, not encodings, so limiting them to
# _get_available_encodings() would make argparse reject e.g. "docx"
parser.add_argument(
    '--extension', type=str, default=None,
    help='Specify the extension of the file (e.g., docx or pdf). '
    'Extension can be also passed with the '
    'leading . (e.g., .docx or .pdf).',
)
parser.add_argument(
'-m', '--method', default='',
help='Specify a method of extraction for formats that support it',
Expand Down
16 changes: 13 additions & 3 deletions textract/parsers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
DEFAULT_ENCODING = 'utf_8'


def process(filename, encoding=DEFAULT_ENCODING, **kwargs):
def process(filename, encoding=DEFAULT_ENCODING, extension=None, **kwargs):
"""This is the core function used for extracting text. It routes the
``filename`` to the appropriate parser and returns the extracted
text as a byte-string encoded with ``encoding``.
Expand All @@ -36,8 +36,18 @@ def process(filename, encoding=DEFAULT_ENCODING, **kwargs):
# get the filename extension, which is something like .docx for
# example, and import the module dynamically using importlib. This
# is a relative import so the name of the package is necessary
_, ext = os.path.splitext(filename)
ext = ext.lower()
# normally, the file extension is extracted from the file name.
# if the user passes an extension as an argument (e.g. because the
# file name has none), that extension is used instead
if extension:
ext = extension
# check if the extension has the leading .
if not ext.startswith('.'):
ext = '.' + ext
ext = ext.lower()
else:
_, ext = os.path.splitext(filename)
ext = ext.lower()

# check the EXTENSION_SYNONYMS dictionary
ext = EXTENSION_SYNONYMS.get(ext, ext)
Expand Down

0 comments on commit dfb2f64

Please sign in to comment.