Skip to content
Permalink
Browse files

Fix #2570 -- new deduplicate_ids filter

Signed-off-by: Chris Warrick <kwpolska@gmail.com>
  • Loading branch information...
Kwpolska committed May 14, 2017
1 parent 3b748b0 commit c393e225fc1d37cb910b2475f76641883b78c8c4
Showing with 41 additions and 0 deletions.
  1. +2 −0 CHANGES.txt
  2. +5 −0 docs/manual.txt
  3. +34 −0 nikola/filters.py
@@ -4,6 +4,8 @@ New in master
Features
--------

* New ``deduplicate_ids`` filter, for preventing duplication of HTML id
attributes (Issue #2570)
* New ``add_header_permalinks`` filter, for Sphinx-style header links
(Issue #2636)

@@ -1928,6 +1928,11 @@ add_header_permalinks
# Include *every* header (not recommended):
# HEADER_PERMALINKS_XPATH_LIST = ['*//{hx}']

deduplicate_ids
Prevent duplicated IDs in HTML output. An incrementing counter is appended to
offending IDs. If used alongside ``add_header_permalinks``, it will fix
those links (it must run **after** that filter).

You can apply filters to specific posts or pages by using the ``filters`` metadata field:

.. code:: restructuredtext
@@ -436,3 +436,37 @@ def add_header_permalinks(data, xpath_list=None):
new_node = lxml.html.fragment_fromstring('<a href="#{0}" class="headerlink" title="Permalink to this heading">¶</a>'.format(hid))
node.append(new_node)
return lxml.html.tostring(doc, encoding="unicode")

@apply_to_text_file
def deduplicate_ids(data):
    """Post-process HTML via lxml to deduplicate IDs.

    The first element (in document order) carrying a given ``id`` keeps it.
    Every later element with the same ``id`` is renamed to ``<id>-<n>``,
    where ``n`` counts up from 2 and skips any value that would collide with
    an ID already used in the document.  Elements with the ``headerlink``
    class that ``find_class`` returns for a renamed element, and whose
    ``href`` still points at the old ID, are retargeted to the new ID.

    :param data: HTML source to process.
    :return: the re-serialized document if at least one ID was changed,
        otherwise ``data`` itself, unmodified.
    """
    if not data.strip():
        # lxml refuses to parse an empty document, and there is nothing to
        # deduplicate anyway.
        return data

    doc = lxml.html.document_fromstring(data)

    # Group elements by ID in a single pass.  XPath results are in document
    # order, so each list starts with the element that keeps its ID.  Grouping
    # here (instead of building one XPath query per ID) also avoids quoting
    # problems with IDs that contain a double quote.
    elements_by_id = {}
    for element in doc.xpath('//*[@id]'):
        elements_by_id.setdefault(element.attrib['id'], []).append(element)

    duplicated_ids = [i for i in elements_by_id if len(elements_by_id[i]) > 1]
    if not duplicated_ids:
        # Nothing to fix: hand back the input untouched rather than a
        # re-serialized copy.
        return data

    # Every ID present in the document, plus the ones generated below, so a
    # new ID never clashes with an existing one (e.g. "foo", "foo", "foo-2").
    used_ids = set(elements_by_id)

    for old_id in duplicated_ids:
        old_href = '#' + old_id
        counter = 2
        for element in elements_by_id[old_id][1:]:
            new_id = '{0}-{1}'.format(old_id, counter)
            while new_id in used_ids:
                counter += 1
                new_id = '{0}-{1}'.format(old_id, counter)
            used_ids.add(new_id)
            counter += 1
            element.attrib['id'] = new_id

            # Find headerlinks that we can fix.  find_class may also return
            # headerlinks belonging to nested elements, so only those still
            # pointing at the old ID are rewritten; .get() tolerates a
            # headerlink-classed element without an href.
            for hl in element.find_class('headerlink'):
                if hl.get('href') == old_href:
                    hl.set('href', '#' + new_id)

    return lxml.html.tostring(doc, encoding='unicode')

0 comments on commit c393e22

Please sign in to comment.
You can’t perform that action at this time.