Skip to content

Commit

Permalink
switch from python-docx to python-docx2txt
Browse files Browse the repository at this point in the history
  • Loading branch information
ankushshah89 committed Nov 9, 2015
1 parent 2d598ad commit 6ecc86d
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 30 deletions.
2 changes: 1 addition & 1 deletion requirements/python
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
argcomplete
chardet
python-pptx>=0.5.1
-python-docx
+docx2txt
pdfminer==20140328
beautifulsoup4
xlrd
Expand Down
31 changes: 2 additions & 29 deletions textract/parsers/docx_parser.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
-import docx
+import docx2txt

from .utils import BaseParser

Expand All @@ -8,31 +8,4 @@ class Parser(BaseParser):
"""

def extract(self, filename, **kwargs):
-text = ""
-document = docx.Document(filename)
-
-# Extract text from root paragraphs.
-text += '\n\n'.join([
-paragraph.text for paragraph in document.paragraphs
-])
-
-# Recursively extract text from root tables.
-for table in document.tables:
-text += '\n\n' + self._parse_table(table)
-
-return text
-
-def _parse_table(self, table):
-text = ''
-for row in table.rows:
-for cell in row.cells:
-# For every cell in every row of the table, extract text from
-# child paragraphs.
-for paragraph in cell.paragraphs:
-text += '\n\n' + paragraph.text
-
-# Then recursively extract text from child tables.
-for table in cell.tables:
-text += self._parse_table(table)
-
-return text
+return docx2txt.process(filename)

0 comments on commit 6ecc86d

Please sign in to comment.