Skip to content
Permalink
Browse files

Unescaping HTML in WordPress tags and categories.

  • Loading branch information
felixfontein committed Dec 4, 2016
1 parent eb1ebfb commit 8b193c72eee6375207005b4a71732854c071f310
Showing with 22 additions and 5 deletions.
  1. +4 −4 nikola/plugins/command/import_wordpress.py
  2. +18 −1 nikola/utils.py
@@ -339,7 +339,7 @@ def _prepare(self, channel):
# cat_id = get_text_tag(cat, '{{{0}}}term_id'.format(wordpress_namespace), None)
cat_slug = get_text_tag(cat, '{{{0}}}category_nicename'.format(wordpress_namespace), None)
cat_parent_slug = get_text_tag(cat, '{{{0}}}category_parent'.format(wordpress_namespace), None)
cat_name = get_text_tag(cat, '{{{0}}}cat_name'.format(wordpress_namespace), None)
cat_name = utils.html_unescape(get_text_tag(cat, '{{{0}}}cat_name'.format(wordpress_namespace), None))
cat_path = [cat_name]
if cat_parent_slug in cat_map:
cat_path = cat_map[cat_parent_slug] + cat_path
@@ -824,16 +824,16 @@ def _create_metadata(self, status, excerpt, tags, categories, post_name=None):
if text in self._category_paths:
cats.append(self._category_paths[text])
else:
cats.append(utils.join_hierarchical_category_path([text]))
cats.append(utils.join_hierarchical_category_path([utils.html_unescape(text)]))
other_meta['categories'] = ','.join(cats)
if len(cats) > 0:
other_meta['category'] = cats[0]
if len(cats) > 1:
LOGGER.warn(('Post "{0}" has more than one category! ' +
'Will only use the first one.').format(post_name))
tags_cats = tags
tags_cats = [utils.html_unescape(tag) for tag in tags]
else:
tags_cats = tags + categories
tags_cats = [utils.html_unescape(tag) for tag in tags + categories]
return tags_cats, other_meta

_tag_sanitize_map = {True: {}, False: {}}
@@ -94,7 +94,7 @@
'NikolaPygmentsHTML', 'create_redirect', 'TreeNode',
'flatten_tree_structure', 'parse_escaped_hierarchical_category_name',
'join_hierarchical_category_path', 'clean_before_deployment', 'indent',
'load_data')
'load_data', 'html_unescape')

# Are you looking for 'generic_rss_renderer'?
# It's defined in nikola.nikola.Nikola (the site object).
@@ -1942,3 +1942,20 @@ def load_data(path):
return
with io.open(path, 'r', encoding='utf8') as inf:
return loader.load(inf)


# http://stackoverflow.com/a/2087433
try:
    # Detect the feature rather than comparing version numbers:
    # html.unescape is available on Python 3.4 and newer.
    import html

    _unescape_entities = html.unescape
except (ImportError, AttributeError):
    # ImportError: Python 2 has no ``html`` module.
    # AttributeError: Python 3.0 to 3.3 have ``html`` but no ``html.unescape``.
    try:
        from HTMLParser import HTMLParser  # Python 2.6 and 2.7
    except ImportError:
        from html.parser import HTMLParser  # Python 3.0 to 3.3

    def _unescape_entities(s):
        """Unescape character references via the legacy HTMLParser method."""
        return HTMLParser().unescape(s)


def html_unescape(s):
    """Convert all named and numeric character references in the string s to the corresponding unicode characters.

    ``None`` is returned unchanged, so callers that fetch optional text with
    a ``None`` default can pass the result straight through without an
    explicit check.
    """
    if s is None:
        return None
    return _unescape_entities(s)

0 comments on commit 8b193c7

Please sign in to comment.
You can’t perform that action at this time.