Skip to content

Commit

Permalink
Merge branch 'extension-autocomplete'
Browse files Browse the repository at this point in the history
  • Loading branch information
Dean Malmgren committed Apr 3, 2017
2 parents 4d71473 + 0369c7d commit 448bf00
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 8 deletions.
2 changes: 2 additions & 0 deletions docs/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ latest changes in development for next release

* avoid unicode decode error with html parser (`#147`_ by `@suned`_)

* enabling autocomplete and improving error handling (`#149`_)

1.5.0
-----
Expand Down Expand Up @@ -335,3 +336,4 @@ latest changes in development for next release
.. _#146: https://github.com/deanmalmgren/textract/issues/146
.. _#147: https://github.com/deanmalmgren/textract/issues/147
.. _#148: https://github.com/deanmalmgren/textract/issues/148
.. _#149: https://github.com/deanmalmgren/textract/issues/149
10 changes: 5 additions & 5 deletions textract/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,13 @@
import pkgutil
import sys
import six
import re
import glob

import argcomplete

from . import VERSION
from .parsers import DEFAULT_ENCODING
from .parsers import DEFAULT_ENCODING, _get_available_extensions


class AddToNamespaceAction(argparse.Action):
Expand Down Expand Up @@ -62,10 +64,8 @@ def get_parser():
)
parser.add_argument(
'--extension', type=str, default=None,
choices=_get_available_encodings(),
help='Specify the extension of the file (e.g., docx or pdf). '
'Extension can be also passed with the '
'leading . (e.g., .docx or .pdf).',
choices=_get_available_extensions(),
help='Specify the extension of the file.',
)
parser.add_argument(
'-m', '--method', default='',
Expand Down
10 changes: 9 additions & 1 deletion textract/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,19 @@ class ExtensionNotSupported(CommandLineError):
def __init__(self, ext):
    """Record the unsupported extension and the supported alternatives.

    :param ext: the filename extension for which no parser was found.
    """
    self.ext = ext

    # imported inside the method rather than at module level; the
    # parsers package imports this exceptions module, so a top-level
    # import here would presumably be circular -- TODO confirm
    from .parsers import _get_available_extensions

    # keep only the spellings that carry a leading dot so each
    # extension is listed once in the rendered message
    dotted = [
        candidate for candidate in _get_available_extensions()
        if candidate.startswith('.')
    ]
    self.available_extensions_str = ', '.join(dotted)

def __str__(self):
    """Return the user-facing message for an unsupported extension.

    The template names the offending extension, points to the issue
    tracker, and lists the extensions gathered in ``__init__``. The
    ``%(...)s`` placeholders are handed to ``self.render``, which is
    defined outside this view and presumably fills them from instance
    attributes -- TODO confirm.
    """
    # the issue-tracker URL appears exactly once; the superseded line
    # without the trailing blank line has been dropped so the URL is
    # not repeated in the message
    return self.render((
        'The filename extension %(ext)s is not yet supported by\n'
        'textract. Please suggest this filename extension here:\n\n'
        '    https://github.com/deanmalmgren/textract/issues\n\n'
        'Available extensions include: %(available_extensions_str)s\n'
    ))


Expand Down
35 changes: 33 additions & 2 deletions textract/parsers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

import os
import importlib
import glob
import re

from .. import exceptions

Expand All @@ -22,6 +24,9 @@
# the command line interface
DEFAULT_ENCODING = 'utf_8'

# suffix appended to a file extension to form the name of its parser
# module; used both when importing a parser and when globbing the
# parsers directory for the available ``*_parser.py`` files
_FILENAME_SUFFIX = '_parser'


def process(filename, encoding=DEFAULT_ENCODING, extension=None, **kwargs):
"""This is the core function used for extracting text. It routes the
Expand Down Expand Up @@ -55,17 +60,43 @@ def process(filename, encoding=DEFAULT_ENCODING, extension=None, **kwargs):
# to avoid conflicts with packages that are installed globally
# (e.g. python's json module), all extension parser modules have
# the _parser extension
rel_module = ext + '_parser'
rel_module = ext + _FILENAME_SUFFIX

# If we can't import the module, the file extension isn't currently
# supported
try:
filetype_module = importlib.import_module(
rel_module, 'textract.parsers')
rel_module, 'textract.parsers'
)
except ImportError:
raise exceptions.ExtensionNotSupported(ext)

# do the extraction

parser = filetype_module.Parser()
return parser.process(filename, encoding, **kwargs)


def _get_available_extensions():
    """Get a list of available file extensions to make it easy for
    tab-completion and exception handling.

    Extensions are collected from two places: the parser modules that
    sit next to this file (named ``<ext>`` + ``_FILENAME_SUFFIX`` +
    ``.py``) and the non-empty keys of ``EXTENSION_SYNONYMS``. Every
    extension is listed both with and without its leading dot.

    :returns: a sorted list of unique extension strings.
    """
    extensions = set()

    # from filenames: strip the fixed "<suffix>.py" tail off each
    # module's basename. Matching a regex built from the directory path
    # is fragile because the path is not escaped (Windows separators
    # and regex metacharacters in the install location break it) and a
    # failed match would raise AttributeError on None.
    parsers_dir = os.path.dirname(__file__)
    tail = _FILENAME_SUFFIX + ".py"
    for filename in glob.glob(os.path.join(parsers_dir, "*" + tail)):
        ext = os.path.basename(filename)[:-len(tail)]
        if not ext:
            # a bare "_parser.py" carries no extension name
            continue
        extensions.add(ext)
        extensions.add('.' + ext)

    # from relevant synonyms (don't use the '' synonym)
    for ext in EXTENSION_SYNONYMS:
        if ext:
            extensions.add(ext)
            extensions.add(ext.replace('.', '', 1))

    return sorted(extensions)

0 comments on commit 448bf00

Please sign in to comment.