Skip to content

Commit

Permalink
Strip tags from post titles
Browse files Browse the repository at this point in the history
WordPress allows markup in post titles, and we want to get rid of it.
  • Loading branch information
hvelarde committed Aug 14, 2015
1 parent a443112 commit 1ffc9c0
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 1 deletion.
3 changes: 2 additions & 1 deletion transmogrify/wordpress/blueprints/csvsource.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from collective.transmogrifier.interfaces import ISectionBlueprint
from DateTime import DateTime
from OFS.ObjectManager import bad_id
from transmogrify.wordpress.utils import strip_tags
from transmogrify.wordpress.blueprints.csvutils import get_display_names
from transmogrify.wordpress.blueprints.csvutils import get_taxonomies
from transmogrify.wordpress.logger import logger
Expand Down Expand Up @@ -140,7 +141,7 @@ def __iter__(self):
item_id = row['post_name']
# Zope ids need to be ASCII
item_id = unquote_plus(item_id).decode('utf-8').encode('ascii', 'ignore')
item['title'] = row['post_title']
item['title'] = strip_tags(row['post_title'])
else:
# for attachments we need to parse the guid
# and use the file name as title
Expand Down
27 changes: 27 additions & 0 deletions transmogrify/wordpress/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-
from HTMLParser import HTMLParser


class MLStripper(HTMLParser):

    """Collect the text content of an HTML fragment, discarding the tags.

    Taken from: http://stackoverflow.com/a/925630/644075
    Why don't you use RegEx? http://stackoverflow.com/a/1732454/644075

    Character and entity references (``&amp;``, ``&#8217;``) are kept in
    the output as references; they are neither decoded nor dropped.
    """

    def __init__(self):
        # Run the base initializer rather than only reset(): newer versions
        # of the parser set up state there that feed() relies on.
        HTMLParser.__init__(self)
        # Where the parser supports it, turn off automatic decoding of
        # references so they reach the handlers below on every version.
        self.convert_charrefs = False
        self.fed = []

    def handle_data(self, d):
        """Record a run of plain text found between tags."""
        self.fed.append(d)

    def handle_entityref(self, name):
        """Keep a named reference such as ``&amp;`` in the output."""
        self.fed.append('&%s;' % name)

    def handle_charref(self, name):
        """Keep a numeric reference such as ``&#8217;`` in the output."""
        self.fed.append('&#%s;' % name)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return ''.join(self.fed)


def strip_tags(html):
    """Return the text of ``html`` with the markup removed.

    The string is run through an ``MLStripper`` and whatever text the
    parser collected is returned.
    """
    stripper = MLStripper()
    stripper.feed(html)
    text = stripper.get_data()
    return text

0 comments on commit 1ffc9c0

Please sign in to comment.