merge

deanmalmgren · Mar 31, 2017 · dfb2f64 · dfb2f64
2 parents f39bfef + c48d5f6
commit dfb2f64
Show file tree

Hide file tree

Showing 8 changed files with 60 additions and 5 deletions.
diff --git a/docs/changelog.rst b/docs/changelog.rst
@@ -11,6 +11,9 @@ latest changes in development for next release
 
 .. THANKS FOR CONTRIBUTING; MENTION WHAT YOU DID IN THIS SECTION HERE!
 
+* Let the user provide the file extension as an argument when the file name has
+  no extension (`#148`_ by `@motazsaad`_)
+
 * Added ability to parse audio with ``pocketsphinx`` (`#122`_ by `@barrust`_)
 
 * several bug fixes, including:
@@ -238,7 +241,7 @@ latest changes in development for next release
 
 
 .. list of contributors that are linked to above. putting links here
-   to make the text above relatively clean
+.. to make the text above relatively clean
 
 .. _@akoumjian: https://github.com/akoumjian
 .. _@anthonygarvan: https://github.com/anthonygarvan
@@ -254,6 +257,7 @@ latest changes in development for next release
 .. _@jsmith-mploir: https://github.com/jsmith-mploir
 .. _@kokxx: https://github.com/Kokxx
 .. _@levivm: https://github.com/levivm
+.. _@motazsaad: https://github.com/motazsaad
 .. _@onionradish: https://github.com/onionradish
 .. _@pierre-ernst: https://github.com/pierre-ernst
 .. _@pudo: https://github.com/pudo
@@ -263,7 +267,7 @@ latest changes in development for next release
 
 
 .. list of issues that have been resolved. putting links here to make
-   the text above relatively clean
+.. the text above relatively clean
 
 .. _#2: https://github.com/deanmalmgren/textract/issues/2
 .. _#3: https://github.com/deanmalmgren/textract/issues/3
@@ -323,3 +327,4 @@ latest changes in development for next release
 .. _#136: https://github.com/deanmalmgren/textract/issues/136
 .. _#139: https://github.com/deanmalmgren/textract/issues/139
 .. _#147: https://github.com/deanmalmgren/textract/issues/147
+.. _#148: https://github.com/deanmalmgren/textract/issues/148
diff --git a/docs/python_package.rst b/docs/python_package.rst
@@ -24,6 +24,12 @@ inferred using `chardet <https://github.com/chardet/chardet>`_)::
     import textract
     text = textract.process('path/to/file.extension', encoding='ascii')
 
+When the file name has no extension, you can specify the file's extension as an
+argument to ``textract.process`` like this::
+
+    import textract
+    text = textract.process('path/to/file', extension='docx')
+
 .. _additional-options:
 
 Additional options

diff --git a/tests/no_ext/docx_paragraphs_and_tables b/tests/no_ext/docx_paragraphs_and_tables
diff --git a/tests/no_ext/msg_standardized_text b/tests/no_ext/msg_standardized_text
diff --git a/tests/no_ext/pdf_standardized_text b/tests/no_ext/pdf_standardized_text
diff --git a/tests/test_no_ext.py b/tests/test_no_ext.py
@@ -0,0 +1,27 @@
+import unittest
+import os
+import textract
+
+class No_Ext_TestCase(unittest.TestCase):
+    """Check that ``textract.process`` honours an explicit ``extension``.
+
+    The fixtures under ``tests/no_ext`` are stored without a file
+    extension, so the parser has to be chosen from the ``extension``
+    keyword argument.
+    """
+
+    def _process_fixture(self, name, extension):
+        """Run textract on the ``no_ext`` fixture ``name`` and return the text.
+
+        The fixture path is resolved relative to this test module, so the
+        test does not depend on the current working directory.
+        """
+        fixture_dir = os.path.join(
+            os.path.dirname(os.path.abspath(__file__)), "no_ext"
+        )
+        text = textract.process(
+            os.path.join(fixture_dir, name), extension=extension
+        )
+        # fail if nothing was extracted instead of passing silently
+        self.assertTrue(text.strip())
+        return text
+
+    def test_docx(self):
+        """The extension is given without the leading dot."""
+        self._process_fixture("docx_paragraphs_and_tables", "docx")
+
+    def test_msg(self):
+        """The extension is given without the leading dot."""
+        self._process_fixture("msg_standardized_text", "msg")
+
+    def test_pdf(self):
+        """The extension is given with the leading dot."""
+        self._process_fixture("pdf_standardized_text", ".pdf")
+
diff --git a/textract/cli.py b/textract/cli.py
@@ -60,6 +60,13 @@ def get_parser():
         choices=_get_available_encodings(),
         help='Specify the encoding of the output.',
     )
+    parser.add_argument(
+        '--extension', type=str, default=None,
+        # no ``choices`` here: valid values are file extensions, not encodings
+        help='Specify the extension of the file (e.g., docx or pdf). '
+             'Extension can be also passed with the '
+             'leading . (e.g., .docx or .pdf).',
+    )
     parser.add_argument(
         '-m', '--method', default='',
         help='Specify a method of extraction for formats that support it',

diff --git a/textract/parsers/__init__.py b/textract/parsers/__init__.py
@@ -23,7 +23,7 @@
 DEFAULT_ENCODING = 'utf_8'
 
 
-def process(filename, encoding=DEFAULT_ENCODING, **kwargs):
+def process(filename, encoding=DEFAULT_ENCODING, extension=None, **kwargs):
     """This is the core function used for extracting text. It routes the
     ``filename`` to the appropriate parser and returns the extracted
     text as a byte-string encoded with ``encoding``.
@@ -36,8 +36,18 @@ def process(filename, encoding=DEFAULT_ENCODING, **kwargs):
     # get the filename extension, which is something like .docx for
     # example, and import the module dynamically using importlib. This
     # is a relative import so the name of the package is necessary
-    _, ext = os.path.splitext(filename)
-    ext = ext.lower()
+    # normally, file extension will be extracted from the file name
+    # if the file name has no extension, then the user can pass the
+    # extension as an argument
+    if extension:
+        ext = extension
+        # check if the extension has the leading .
+        if not ext.startswith('.'):
+            ext = '.' + ext
+        ext = ext.lower()
+    else:
+        _, ext = os.path.splitext(filename)
+        ext = ext.lower()
 
     # check the EXTENSION_SYNONYMS dictionary
     ext = EXTENSION_SYNONYMS.get(ext, ext)