switch from python-docx to python-docx2txt

deanmalmgren · Nov 9, 2015 · 6ecc86d
1 parent 2d598ad
commit 6ecc86d
Show file tree

Hide file tree

Showing 2 changed files with 3 additions and 30 deletions.
diff --git a/requirements/python b/requirements/python
@@ -3,7 +3,7 @@
 argcomplete
 chardet
 python-pptx>=0.5.1
-python-docx
+docx2txt
 pdfminer==20140328
 beautifulsoup4
 xlrd

diff --git a/textract/parsers/docx_parser.py b/textract/parsers/docx_parser.py
@@ -1,4 +1,4 @@
-import docx
+import docx2txt
 
 from .utils import BaseParser
 
@@ -8,31 +8,4 @@ class Parser(BaseParser):
     """
 
     def extract(self, filename, **kwargs):
-        text = ""
-        document = docx.Document(filename)
-
-        # Extract text from root paragraphs.
-        text += '\n\n'.join([
-            paragraph.text for paragraph in document.paragraphs
-        ])
-
-        # Recursively extract text from root tables.
-        for table in document.tables:
-            text += '\n\n' + self._parse_table(table)
-
-        return text
-
-    def _parse_table(self, table):
-        text = ''
-        for row in table.rows:
-            for cell in row.cells:
-                # For every cell in every row of the table, extract text from
-                # child paragraphs.
-                for paragraph in cell.paragraphs:
-                    text += '\n\n' + paragraph.text
-
-                # Then recursively extract text from child tables.
-                for table in cell.tables:
-                    text += self._parse_table(table)
-
-        return text
+        return docx2txt.process(filename)