Skip to content

Commit

Permalink
Merge pull request #79 from evfredericksen/odt-tabs
Browse files Browse the repository at this point in the history
Odt parser tabs and multiple spaces
  • Loading branch information
Dean Malmgren committed Nov 22, 2014
2 parents 6d5e148 + b438016 commit fa65746
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 13 deletions.
Binary file modified tests/odt/raw_text.odt
Binary file not shown.
2 changes: 1 addition & 1 deletion tests/odt/raw_text.txt
@@ -1 +1 @@
Sample OppenOffice Writer file
Sample OppenOffice Writer file with tabs and multiple spaces
45 changes: 33 additions & 12 deletions textract/parsers/odt_parser.py
@@ -1,5 +1,5 @@
import zipfile
import xml.dom.minidom
import xml.etree.ElementTree as ET
import StringIO

from .utils import BaseParser
Expand All @@ -14,24 +14,45 @@ def extract(self, filename, **kwargs):
# https://github.com/odoo/odoo/blob/master/addons/document/odt2txt.py
with open(filename) as stream:
zip_stream = zipfile.ZipFile(stream)
self.content = xml.dom.minidom.parseString(
zip_stream.read("content.xml")
)

self.content = ET.fromstring(zip_stream.read("content.xml"))
return self.to_string()

def to_string(self):
    """Convert the parsed document to a single string.

    Walks every element under ``self.content`` (an ElementTree element)
    in document order and renders each ``text:p`` (paragraph) and
    ``text:h`` (heading) element with ``text_to_string``.

    Returns:
        The rendered elements joined by newlines, with no trailing
        newline; an empty string when no matching element is found.
    """
    # Resolve the qualified tag names once rather than for every element.
    block_tags = (self.qn('text:p'), self.qn('text:h'))
    # Joining with "\n" is equivalent to appending "\n" after each piece
    # and then dropping the final newline.
    return u"\n".join(
        self.text_to_string(child)
        for child in self.content.iter()
        if child.tag in block_tags
    )

def text_to_string(self, element):
    """Render an ElementTree element and its descendants as text.

    Args:
        element: The ElementTree element to render.

    Returns:
        The element's leading text, followed by the rendering of each
        child, followed by the element's own tail text. ``text:tab``
        children become a tab character and ``text:s`` children become
        as many spaces as their ``text:c`` attribute says (one space
        when the attribute is absent); any other child is rendered
        recursively.
    """
    # Resolve the qualified names once instead of on every child.
    tab_tag = self.qn('text:tab')
    space_tag = self.qn('text:s')
    count_attr = self.qn('text:c')

    parts = []
    if element.text is not None:
        parts.append(element.text)
    for child in element:
        if child.tag == tab_tag:
            parts.append(u"\t")
        elif child.tag == space_tag:
            count = child.get(count_attr)
            # Always emit at least one space, even for a count below 1.
            parts.append(u" " * max(int(count), 1) if count is not None else u" ")
        else:
            # The recursive call appends the child's tail itself.
            parts.append(self.text_to_string(child))
            continue
        # Tab and space elements are not recursed into, so the text that
        # follows them (their tail) has to be picked up here.
        if child.tail is not None:
            parts.append(child.tail)
    # NOTE: the element's own tail is included too, which also applies to
    # the top-level element handed in by the caller.
    if element.tail is not None:
        parts.append(element.tail)
    return u"".join(parts)

def qn(self, namespace):
    """Expand a prefixed tag into ElementTree's ``{uri}local`` form.

    Args:
        namespace: A prefixed name such as ``'text:p'`` (despite the
            parameter name, this is the whole ``prefix:local`` tag).

    Returns:
        The name with the prefix replaced by its namespace URI in
        braces, e.g. ``'{urn:...:text:1.0}p'``.

    Raises:
        KeyError: If the prefix is not present in the namespace map.
    """
    nsmap = {
        'text': 'urn:oasis:names:tc:opendocument:xmlns:text:1.0',
    }
    # Split on the first colon only so the local part is kept whole.
    spl = namespace.split(':', 1)
    return '{{{}}}{}'.format(nsmap[spl[0]], spl[1])

0 comments on commit fa65746

Please sign in to comment.