From 172677d2ad334fb6faca5d3b6c2f0bfc6e7f63fb Mon Sep 17 00:00:00 2001 From: tobes Date: Tue, 2 Apr 2013 13:17:21 +0100 Subject: [PATCH 1/7] [#708] Fix render_markdown() helper function --- ckan/lib/helpers.py | 70 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 67 insertions(+), 3 deletions(-) diff --git a/ckan/lib/helpers.py b/ckan/lib/helpers.py index ab521abeb3c..3e536ccc72c 100644 --- a/ckan/lib/helpers.py +++ b/ckan/lib/helpers.py @@ -960,7 +960,7 @@ def related_item_link(related_item_dict): def tag_link(tag): url = url_for(controller='tag', action='read', id=tag['name']) - return link_to(tag['name'], url) + return link_to(tag.get('title', tag['name']), url) def group_link(group): @@ -1371,12 +1371,76 @@ def get_request_param(parameter_name, default=None): return request.params.get(parameter_name, default) -def render_markdown(data): +# find all inner text of html eg `moo` gets `moo` but not of tags +# as this would lead to linkifying links if they are urls. +RE_MD_GET_INNER_HTML = re.compile( + r'(^|(?:<(?!a\b)[^>]*>))([^<]+)(?=<|$)', + flags=re.UNICODE +) + +# find all `internal links` eg. tag:moo, dataset:1234, tag:"my tag" +RE_MD_INTERNAL_LINK = re.compile( + r'\b(tag|package|dataset|group):((")?(?(3)[ \w\-.]+|[\w\-.]+)(?(3)"))', + flags=re.UNICODE +) + +# find external links eg http://foo.com, https:/bar.org/foobar.html +RE_MD_EXTERNAL_LINK = re.compile(r'(\bhttps?:\/\/[\w\-\.,@?^=%&;:\/~\\+#]*)', + flags=re.UNICODE +) + +# find all tags but ignore < in the strings so that we can use it correctly +# in markdown +RE_MD_HTML_TAGS = re.compile('<[^><]*>') + +def html_auto_link(data): + '''Linkifies HTML + + tag:... converted to a tag link + dataset:... converted to a dataset link + group:... converted to a group link + http://... converted to a link + ''' + def makelink(matchobj): + obj = matchobj.group(1) + name = matchobj.group(2) + title = '%s:%s' % (obj, name) + if obj == 'tag': + return tag_link({'name': name.strip('"'), + 'title': title}) + elif obj == 'group': + return group_link({'name': name, + 'title': title}) + elif obj in ['dataset', 'package']: + return dataset_link({'name': name, + 'title': title}) + + def link(matchobj): + return '%s' \ + % (matchobj.group(1), matchobj.group(1)) + + def process(matchobj): + data = matchobj.group(2) + data = RE_MD_INTERNAL_LINK.sub(makelink, data) + data = RE_MD_EXTERNAL_LINK.sub(link, data) + return matchobj.group(1) + data + + data = RE_MD_GET_INNER_HTML.sub(process, data) + return data + + +def render_markdown(data, auto_link=True): ''' returns the data as rendered markdown ''' # cope with data == None if not data: return '' - return literal(ckan.misc.MarkdownFormat().to_html(data)) + data = RE_MD_HTML_TAGS.sub('', data.strip()) + data = markdown(data, safe_mode=True) + # tags can be added by tag:... or tag:"...." and a link will be made + # from it + if auto_link: + data = html_auto_link(data) + return literal(data) def format_resource_items(items): From 6924bbd41a7ccc77a5e11d8a6012e5ed75c1f510 Mon Sep 17 00:00:00 2001 From: tobes Date: Tue, 2 Apr 2013 13:19:34 +0100 Subject: [PATCH 2/7] [#708] Switch over to use markdown_extract() not misc.py --- ckan/controllers/api.py | 2 +- ckan/controllers/group.py | 9 +- ckan/controllers/user.py | 14 +-- ckan/lib/dictization/model_dictize.py | 3 +- ckan/lib/package_saver.py | 9 +- ckan/misc.py | 95 -------------------- ckan/model/package.py | 4 +- ckan/templates_legacy/package/read_core.html | 2 +- 8 files changed, 9 insertions(+), 129 deletions(-) delete mode 100644 ckan/misc.py diff --git a/ckan/controllers/api.py b/ckan/controllers/api.py index 09b0c14368e..18e8d2bc063 100644 --- a/ckan/controllers/api.py +++ b/ckan/controllers/api.py @@ -566,7 +566,7 @@ def _get_search_params(cls, request_params): def markdown(self, ver=None): raw_markdown = request.params.get('q', '') - results = ckan.misc.MarkdownFormat().to_html(raw_markdown) + results = h.render_markdown(raw_markdown) return self._finish_ok(results) diff --git a/ckan/controllers/group.py b/ckan/controllers/group.py index f3f8508c6f4..505df2fade9 100644 --- a/ckan/controllers/group.py +++ b/ckan/controllers/group.py @@ -201,14 +201,7 @@ def _read(self, id, limit): else: q += ' groups: "%s"' % c.group_dict.get('name') - try: - description_formatted = ckan.misc.MarkdownFormat().to_html( - c.group_dict.get('description', '')) - c.description_formatted = genshi.HTML(description_formatted) - except Exception, e: - error_msg = "%s" %\ - _("Cannot render description") - c.description_formatted = genshi.HTML(error_msg) + c.description_formatted = h.render_markdown(c.group_dict.get('description')) context['return_query'] = True diff --git a/ckan/controllers/user.py b/ckan/controllers/user.py index 638bd7a7231..1ca75663154 100644 --- a/ckan/controllers/user.py +++ b/ckan/controllers/user.py @@ -3,11 +3,9 @@ from pylons import session, c, g, request, config from pylons.i18n import _ -import genshi import ckan.lib.i18n as i18n import ckan.lib.base as base -import ckan.misc as misc import ckan.model as model import ckan.lib.helpers as h import ckan.new_authz as new_authz @@ -73,7 +71,7 @@ def _setup_template_variables(self, context, data_dict): abort(401, _('Not authorized to see this page')) c.user_dict = user_dict c.is_myself = user_dict['name'] == c.user - c.about_formatted = self._format_about(user_dict['about']) + c.about_formatted = h.render_markdown(user_dict['about']) ## end hooks @@ -620,13 +618,3 @@ def unfollow(self, id): or e.error_dict) h.flash_error(error_message) h.redirect_to(controller='user', action='read', id=id) - - def _format_about(self, about): - about_formatted = misc.MarkdownFormat().to_html(about) - try: - html = genshi.HTML(about_formatted) - except genshi.ParseError, e: - log.error('Could not print "about" field Field: %r Error: %r', - about, e) - html = _('Error: Could not parse About text') - return html diff --git a/ckan/lib/dictization/model_dictize.py b/ckan/lib/dictization/model_dictize.py index d61c68e73b1..d2a74071923 100644 --- a/ckan/lib/dictization/model_dictize.py +++ b/ckan/lib/dictization/model_dictize.py @@ -4,7 +4,6 @@ from pylons import config from sqlalchemy.sql import select -import ckan.misc as misc import ckan.logic as logic import ckan.plugins as plugins import ckan.lib.helpers as h @@ -530,7 +529,7 @@ def package_to_api(pkg, context): dictized['license'] = pkg.license.title if pkg.license else None dictized['ratings_average'] = pkg.get_average_rating() dictized['ratings_count'] = len(pkg.ratings) - dictized['notes_rendered'] = misc.MarkdownFormat().to_html(pkg.notes) + dictized['notes_rendered'] = h.render_markdown(pkg.notes) site_url = config.get('ckan.site_url', None) if site_url: diff --git a/ckan/lib/package_saver.py b/ckan/lib/package_saver.py index 1a1a9a8e7f6..5b3f876151f 100644 --- a/ckan/lib/package_saver.py +++ b/ckan/lib/package_saver.py @@ -1,4 +1,3 @@ -import genshi from sqlalchemy import orm import ckan.lib.helpers as h from ckan.lib.base import * @@ -22,12 +21,8 @@ def render_package(cls, pkg, context): render. Note that the actual calling of render('package/read') is left to the caller.''' - try: - notes_formatted = ckan.misc.MarkdownFormat().to_html(pkg.get('notes','')) - c.pkg_notes_formatted = genshi.HTML(notes_formatted) - except Exception, e: - error_msg = "%s" % _("Cannot render package description") - c.pkg_notes_formatted = genshi.HTML(error_msg) + c.pkg_notes_formatted = h.render_markdown(pkg.get('notes')) + c.current_rating, c.num_ratings = ckan.rating.get_rating(context['package']) url = pkg.get('url', '') c.pkg_url_link = h.link_to(url, url, rel='foaf:homepage', target='_blank') \ diff --git a/ckan/misc.py b/ckan/misc.py deleted file mode 100644 index 29f3ae6447b..00000000000 --- a/ckan/misc.py +++ /dev/null @@ -1,95 +0,0 @@ -import re -import logging -import urllib -import webhelpers.markdown - -from pylons.i18n import _ - -log = logging.getLogger(__name__) - -class TextFormat(object): - - def to_html(self, instr): - raise NotImplementedError() - - -class MarkdownFormat(TextFormat): - internal_link = re.compile('(dataset|package|group):([a-z0-9\-_]+)') - - # tag names are allowed more characters, including spaces. So are - # treated specially. - internal_tag_link = re.compile(\ - r"""(tag): # group 1 - ( # capture name (inc. quotes) (group 2) - (")? # optional quotes for multi-word name (group 3) - ( # begin capture of the name w/o quotes (group 4) - (?(3) # if the quotes matched in group 3 - [ \w\-.] # then capture spaces (as well as other things) - | # else - [\w\-.] # don't capture spaces - ) # end - +) # end capture of the name w/o quotes (group 4) - (?(3)") # close opening quote if necessary - ) # end capture of the name with quotes (group 2) - """, re.VERBOSE|re.UNICODE) - normal_link = re.compile('<(http:[^>]+)>') - - html_whitelist = 'b center li ol p table td tr ul'.split(' ') - whitelist_elem = re.compile(r'<(\/?((%s)(\s[^>]*)?))>' % "|".join(html_whitelist), re.IGNORECASE) - whitelist_escp = re.compile(r'\\xfc\\xfd(\/?((%s)(\s[^>]*?)?))\\xfd\\xfc' % "|".join(html_whitelist), re.IGNORECASE) - normal_link = re.compile(r']*?href="([^"]*?)"[^>]*?>', re.IGNORECASE) - abbrev_link = re.compile(r'<(http://[^>]*)>', re.IGNORECASE) - any_link = re.compile(r']*?>', re.IGNORECASE) - close_link = re.compile(r'<(\/a[^>]*)>', re.IGNORECASE) - link_escp = re.compile(r'\\xfc\\xfd(\/?(%s)[^>]*?)\\xfd\\xfc' % "|".join(['a']), re.IGNORECASE) - web_address = re.compile(r'(\s|

)((http|https):\/\/[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?)', re.IGNORECASE) - - def to_html(self, text): - if text is None: - return '' - # Encode whitelist elements. - text = self.whitelist_elem.sub(r'\\\\xfc\\\\xfd\1\\\\xfd\\\\xfc', text) - - # Encode links only in an acceptable format (guard against spammers) - text = self.normal_link.sub(r'\\\\xfc\\\\xfda href="\1" target="_blank" rel="nofollow"\\\\xfd\\\\xfc', text) - text = self.abbrev_link.sub(r'\\\\xfc\\\\xfda href="\1" target="_blank" rel="nofollow"\\\\xfd\\\\xfc\1', text) - text = self.any_link.sub(r'\\\\xfc\\\\xfda href="TAG MALFORMED" target="_blank" rel="nofollow"\\\\xfd\\\\xfc', text) - text = self.close_link.sub(r'\\\\xfc\\\\xfd\1\\\\xfd\\\\xfc', text) - - # Convert internal tag links - text = self.internal_tag_link.sub(self._create_tag_link, text) - - # Convert internal links. - text = self.internal_link.sub(r'[\1:\2] (/\1/\2)', text) - - # Convert to markdown format. - text = self.normal_link.sub(r'[\1] (\1)', text) - - # Convert to markdown format. - text = self.normal_link.sub(r'[\1] (\1)', text) - - # Markdown to HTML. - text = webhelpers.markdown.markdown(text, safe_mode=True) - - # Remaining unlinked web addresses to become addresses - text = self.web_address.sub(r'\1\2', text) - - # Decode whitelist elements. - text = self.whitelist_escp.sub(r'<\1>', text) - text = self.link_escp.sub(r'<\1>', text) - - return text - - def _create_tag_link(self, match_object): - """ - A callback used to create the internal tag link. - - The reason for this is that webhelpers.markdown does not percent-escape - spaces, nor does it encode unicode characters correctly. - - This is only applied to the tag substitution since only tags may - have spaces or unicode characters. - """ - g = match_object.group - url = urllib.quote(g(4).encode('utf8')) - return r'[%s:%s] (/%s/%s)' % (g(1), g(2), g(1), url) diff --git a/ckan/model/package.py b/ckan/model/package.py index ec49bbab019..bce320bde98 100644 --- a/ckan/model/package.py +++ b/ckan/model/package.py @@ -17,7 +17,6 @@ import activity import extension -import ckan.misc import ckan.lib.dictization __all__ = ['Package', 'package_table', 'package_revision_table', @@ -216,7 +215,8 @@ def as_dict(self, ref_package_by='name', ref_group_by='name'): if self.metadata_modified else None _dict['metadata_created'] = self.metadata_created.isoformat() \ if self.metadata_created else None - _dict['notes_rendered'] = ckan.misc.MarkdownFormat().to_html(self.notes) + import ckan.lib.helpers as h + _dict['notes_rendered'] = h.render_markdown(self.notes) _dict['type'] = self.type or u'dataset' #tracking import ckan.model as model diff --git a/ckan/templates_legacy/package/read_core.html b/ckan/templates_legacy/package/read_core.html index 9868845a8ec..427775a8d42 100644 --- a/ckan/templates_legacy/package/read_core.html +++ b/ckan/templates_legacy/package/read_core.html @@ -10,7 +10,7 @@

-
+
${c.pkg_notes_formatted}
From 2c8977e0bedce4ebc11a066f2ad11aa82e124726 Mon Sep 17 00:00:00 2001 From: tobes Date: Tue, 2 Apr 2013 13:20:12 +0100 Subject: [PATCH 3/7] [#708] Fix tests --- ckan/lib/create_test_data.py | 2 +- ckan/tests/functional/test_package.py | 20 ++--- ckan/tests/functional/test_user.py | 44 ----------- ckan/tests/lib/test_dictization.py | 2 +- ckan/tests/logic/test_action.py | 4 +- ckan/tests/misc/test_format_text.py | 102 +++++++++----------------- ckan/tests/models/test_package.py | 2 +- 7 files changed, 47 insertions(+), 129 deletions(-) diff --git a/ckan/lib/create_test_data.py b/ckan/lib/create_test_data.py index e2aba3b242f..6826e254c63 100644 --- a/ckan/lib/create_test_data.py +++ b/ckan/lib/create_test_data.py @@ -464,7 +464,7 @@ def create(cls, auth_profile="", package_type=None): model.Session.add_all([ model.User(name=u'tester', apikey=u'tester', password=u'tester'), model.User(name=u'joeadmin', password=u'joeadmin'), - model.User(name=u'annafan', about=u'I love reading Annakarenina. My site: anna.com', password=u'annafan'), + model.User(name=u'annafan', about=u'I love reading Annakarenina. My site: http://anna.com', password=u'annafan'), model.User(name=u'russianfan', password=u'russianfan'), sysadmin, ]) diff --git a/ckan/tests/functional/test_package.py b/ckan/tests/functional/test_package.py index 4adabcc6c78..4c2adeb3de4 100644 --- a/ckan/tests/functional/test_package.py +++ b/ckan/tests/functional/test_package.py @@ -309,10 +309,6 @@ def test_read(self): assert anna.version in res assert anna.url in res assert 'Some test notes' in res - self.check_named_element(res, 'a', - 'http://ckan.net/', - 'target="_blank"', - 'rel="nofollow"') assert 'Some bolded text.' in res self.check_tag_and_data(res, 'left arrow', '<') self.check_tag_and_data(res, 'umlaut', u'\xfc') @@ -350,7 +346,7 @@ def test_read_internal_links(self): pkg_name = u'link-test', CreateTestData.create_arbitrary([ {'name':pkg_name, - 'notes':'Decoy link here: decoy:decoy, real links here: package:pkg-1, ' \ + 'notes':'Decoy link here: decoy:decoy, real links here: dataset:pkg-1, ' \ 'tag:tag_1 group:test-group-1 and a multi-word tag: tag:"multi word with punctuation."', } ]) @@ -358,9 +354,9 @@ def test_read_internal_links(self): res = self.app.get(offset) def check_link(res, controller, id): id_in_uri = id.strip('"').replace(' ', '%20') # remove quotes and percent-encode spaces - self.check_tag_and_data(res, 'a ', '/%s/%s' % (controller, id_in_uri), - '%s:%s' % (controller, id)) - check_link(res, 'package', 'pkg-1') + self.check_tag_and_data(res, 'a ', '%s/%s' % (controller, id_in_uri), + '%s:%s' % (controller, id.replace('"', '"'))) + check_link(res, 'dataset', 'pkg-1') check_link(res, 'tag', 'tag_1') check_link(res, 'tag', '"multi word with punctuation."') check_link(res, 'group', 'test-group-1') @@ -1557,10 +1553,10 @@ def teardown(self): def test_markdown_html_whitelist(self): self.body = str(self.res) - self.assert_fragment('') - self.assert_fragment('') - self.assert_fragment('subcategory.txt') - self.assert_fragment('') + self.fail_if_fragment('
Description
--
') + self.fail_if_fragment('') + self.fail_if_fragment('subcategory.txt') + self.fail_if_fragment('') self.fail_if_fragment('
Description
--