Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Improve break handling (frozensets disable normalization across spans with frozenset styles)
  • Loading branch information
chbrown committed Nov 8, 2013
1 parent 8a6c386 commit 1d23310
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 13 deletions.
10 changes: 6 additions & 4 deletions xdoc/dom.py
Expand Up @@ -37,7 +37,8 @@ def normalize(self):
outer_spans = []
current_styles = set()
for span in self.spans:
if span.empty():
# we only want spans that are empty / only whitespace, and have mutable styles
if span.empty() and not isinstance(span.styles, frozenset):
outer_spans.append(span)
elif span.styles == current_styles:
# a non-empty span with identical styles triggers:
Expand All @@ -63,13 +64,15 @@ def normalize(self):

def __unicode__(self):
return u'Document(metadata=%s, references=%s, spans=%s)' % (
self.metadata, self.references, u''.join(self.spans))
self.metadata, self.references, u', '.join(map(unicode, self.spans)))


class Span(object_ustr):
'''
The primary building block of a Document
A (Paragraph)Break is just a Span, but with a value of styles that is an empty frozenset.
A Hyperlink is just a Span, except that 'hyperlink' should always be an element of its `styles` set,
and it also has a `url` field that designates the target url (the `text` field is the display text).
Expand All @@ -86,8 +89,7 @@ def __unicode__(self):
return u'Span(%r, styles=%s, attrs=%s)' % (self.text, self.styles, self.attrs)

def empty(self):
# the 'break' style check is kind of arbitrary
return 'break' not in self.styles and (self.text == '' or self.text.isspace())
return self.text == '' or self.text.isspace()

@classmethod
def merge(cls, spans):
Expand Down
8 changes: 5 additions & 3 deletions xdoc/formats/docx/reader.py
Expand Up @@ -98,8 +98,10 @@ def read_r(r, p_styles, p_attrs):
elif child_tag == 'sym':
char = child.get(w_('char'))
# symbol_map maps from ascii to unicode
replacement = symbol_map.get(char, 'XXX: MISSING SYMBOL %r' % char)
logger.debug('Reading sym=%s: "%s"', char, replacement)
replacement = symbol_map.get(char)
if replacement is None:
logger.critical('Could not find symbol in map: %r' % char)
logger.silly('Reading sym=%s: "%s"', char, replacement)
yield Span(replacement, r_styles | p_styles, **p_attrs)
elif child_tag == 't':
yield Span(unicode(child.text), r_styles | p_styles, **p_attrs)
Expand Down Expand Up @@ -197,7 +199,7 @@ def read_docx_document(document_fp):
yield span

# Do we really want this spacer? or just more structure inside the document?
yield Span(u'', set(['break']))
yield Span(u'\n\n', frozenset())


def parse_docx(docx_fp):
Expand Down
10 changes: 5 additions & 5 deletions xdoc/formats/tex/__init__.py
Expand Up @@ -60,18 +60,18 @@ def serialize_document(document):
# current_styles tracks what styles are currently applied
current_styles = set()
for span in document.spans:
logger.silly('escaping: %r', span.text)

pop_styles, push_styles, current_styles = set_diff(current_styles, span.styles)
# logger.debug('popping styles: %r', pop_styles)
for style in pop_styles:
yield '}'

logger.silly('escaping: %r (pop ) (>>', span.text)

# logger.debug('pushing styles: %r', push_styles)
for style in push_styles:
# handle the more common styles first:
if style in simple_commands:
yield simple_commands[style]
elif style == 'break':
yield u'\n\n'
elif style == 'hyperlink':
yield r'\href{%s}' % span.attrs['url']
elif style == 'counter':
Expand Down Expand Up @@ -103,7 +103,7 @@ def write(tex_fp, bib_fp, document):
tex_string = auto.references(tex_string)
tex_string = auto.spaces(tex_string)

# embed the document body in template
# embed the document body into our template
timestamp = datetime.today().strftime('%Y-%m-%d %H:%M:%S')
tex_string = document_template % dict(body=tex_string, timestamp=timestamp)

Expand Down
2 changes: 1 addition & 1 deletion xdoc/formats/tex/characters.py
Expand Up @@ -103,5 +103,5 @@


def escape(string):
logger.debug('Escaping: %r', string)
logger.debug('Escaping: "%s"', string.encode('utf8'))
return string.translate(escape_translations)

0 comments on commit 1d23310

Please sign in to comment.