Skip to content

Commit

Permalink
specify input file encoding manually
Browse files Browse the repository at this point in the history
  • Loading branch information
Johannes Weytjens committed Nov 4, 2019
1 parent 4e38e92 commit ae0834d
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 8 deletions.
6 changes: 3 additions & 3 deletions textract/parsers/__init__.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,13 @@
# default encoding that is returned by the process method. specify it
# here so the default is used on both the process function and also by
# the command line interface
DEFAULT_ENCODING = 'utf_8'
DEFAULT_OUTPUT_ENCODING = 'utf_8'

# filename format
_FILENAME_SUFFIX = '_parser'


def process(filename, encoding=DEFAULT_ENCODING, extension=None, **kwargs):
def process(filename, input_encoding=None, output_encoding=DEFAULT_OUTPUT_ENCODING, extension=None, **kwargs):
"""This is the core function used for extracting text. It routes the
``filename`` to the appropriate parser and returns the extracted
text as a byte-string encoded with ``output_encoding``.
Expand Down Expand Up @@ -74,7 +74,7 @@ def process(filename, encoding=DEFAULT_ENCODING, extension=None, **kwargs):
# do the extraction

parser = filetype_module.Parser()
return parser.process(filename, encoding, **kwargs)
return parser.process(filename, input_encoding, output_encoding, **kwargs)


def _get_available_extensions():
Expand Down
14 changes: 9 additions & 5 deletions textract/parsers/utils.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def encode(self, text, encoding):
"""
return text.encode(encoding, 'ignore')

def process(self, filename, encoding, **kwargs):
def process(self, filename, input_encoding, output_encoding="utf8", **kwargs):
"""Process ``filename`` and encode byte-string with ``encoding``. This
method is called by :func:`textract.parsers.process` and wraps
the :meth:`.BaseParser.extract` method in `a delicious unicode
Expand All @@ -44,10 +44,10 @@ def process(self, filename, encoding, **kwargs):
# output encoding
# http://nedbatchelder.com/text/unipain/unipain.html#35
byte_string = self.extract(filename, **kwargs)
unicode_string = self.decode(byte_string)
return self.encode(unicode_string, encoding)
unicode_string = self.decode(byte_string, input_encoding)
return self.encode(unicode_string, output_encoding)

def decode(self, text):
def decode(self, text, input_encoding=None):
"""Decode ``text`` using the `chardet
<https://github.com/chardet/chardet>`_ package, unless ``input_encoding`` is given, in which case that encoding is used directly.
"""
Expand All @@ -60,7 +60,11 @@ def decode(self, text):
if not text:
return u''

# use chardet to automatically detect the encoding text
# use the provided encoding
if input_encoding:
return text.decode(input_encoding)

# use chardet to automatically detect the encoding of the text if no encoding is provided
result = chardet.detect(text)
return text.decode(result['encoding'])

Expand Down

0 comments on commit ae0834d

Please sign in to comment.