diff --git a/ckan/controllers/api.py b/ckan/controllers/api.py
index 4455e204dae..30dc0beb8a5 100644
--- a/ckan/controllers/api.py
+++ b/ckan/controllers/api.py
@@ -571,7 +571,7 @@ def _get_search_params(cls, request_params):
def markdown(self, ver=None):
raw_markdown = request.params.get('q', '')
- results = ckan.misc.MarkdownFormat().to_html(raw_markdown)
+ results = h.render_markdown(raw_markdown)
return self._finish_ok(results)
diff --git a/ckan/controllers/group.py b/ckan/controllers/group.py
index 9adf26af9aa..140dd1d4eba 100644
--- a/ckan/controllers/group.py
+++ b/ckan/controllers/group.py
@@ -201,14 +201,7 @@ def _read(self, id, limit):
else:
q += ' groups:"%s"' % c.group_dict.get('name')
- try:
- description_formatted = ckan.misc.MarkdownFormat().to_html(
- c.group_dict.get('description', ''))
- c.description_formatted = genshi.HTML(description_formatted)
- except Exception, e:
- error_msg = "%s" %\
- _("Cannot render description")
- c.description_formatted = genshi.HTML(error_msg)
+ c.description_formatted = h.render_markdown(c.group_dict.get('description'))
context['return_query'] = True
diff --git a/ckan/controllers/user.py b/ckan/controllers/user.py
index 5d957b4e026..10cb0e9a78d 100644
--- a/ckan/controllers/user.py
+++ b/ckan/controllers/user.py
@@ -2,11 +2,9 @@
from urllib import quote
from pylons import config
-import genshi
import ckan.lib.i18n as i18n
import ckan.lib.base as base
-import ckan.misc as misc
import ckan.model as model
import ckan.lib.helpers as h
import ckan.new_authz as new_authz
@@ -73,7 +71,7 @@ def _setup_template_variables(self, context, data_dict):
abort(401, _('Not authorized to see this page'))
c.user_dict = user_dict
c.is_myself = user_dict['name'] == c.user
- c.about_formatted = self._format_about(user_dict['about'])
+ c.about_formatted = h.render_markdown(user_dict['about'])
## end hooks
@@ -622,13 +620,3 @@ def unfollow(self, id):
or e.error_dict)
h.flash_error(error_message)
h.redirect_to(controller='user', action='read', id=id)
-
- def _format_about(self, about):
- about_formatted = misc.MarkdownFormat().to_html(about)
- try:
- html = genshi.HTML(about_formatted)
- except genshi.ParseError, e:
- log.error('Could not print "about" field Field: %r Error: %r',
- about, e)
- html = _('Error: Could not parse About text')
- return html
diff --git a/ckan/lib/create_test_data.py b/ckan/lib/create_test_data.py
index 6720bbb9c79..3edf865c7c7 100644
--- a/ckan/lib/create_test_data.py
+++ b/ckan/lib/create_test_data.py
@@ -464,7 +464,7 @@ def create(cls, auth_profile="", package_type=None):
model.Session.add_all([
model.User(name=u'tester', apikey=u'tester', password=u'tester'),
model.User(name=u'joeadmin', password=u'joeadmin'),
- model.User(name=u'annafan', about=u'I love reading Annakarenina. My site: anna.com', password=u'annafan'),
+ model.User(name=u'annafan', about=u'I love reading Annakarenina. My site: http://anna.com', password=u'annafan'),
model.User(name=u'russianfan', password=u'russianfan'),
sysadmin,
])
diff --git a/ckan/lib/dictization/model_dictize.py b/ckan/lib/dictization/model_dictize.py
index d61c68e73b1..d2a74071923 100644
--- a/ckan/lib/dictization/model_dictize.py
+++ b/ckan/lib/dictization/model_dictize.py
@@ -4,7 +4,6 @@
from pylons import config
from sqlalchemy.sql import select
-import ckan.misc as misc
import ckan.logic as logic
import ckan.plugins as plugins
import ckan.lib.helpers as h
@@ -530,7 +529,7 @@ def package_to_api(pkg, context):
dictized['license'] = pkg.license.title if pkg.license else None
dictized['ratings_average'] = pkg.get_average_rating()
dictized['ratings_count'] = len(pkg.ratings)
- dictized['notes_rendered'] = misc.MarkdownFormat().to_html(pkg.notes)
+ dictized['notes_rendered'] = h.render_markdown(pkg.notes)
site_url = config.get('ckan.site_url', None)
if site_url:
diff --git a/ckan/lib/helpers.py b/ckan/lib/helpers.py
index db48123a758..04960d6c94c 100644
--- a/ckan/lib/helpers.py
+++ b/ckan/lib/helpers.py
@@ -636,7 +636,7 @@ def markdown_extract(text, extract_length=190):
will not be truncated.'''
if (text is None) or (text.strip() == ''):
return ''
- plain = re.sub(r'<.*?>', '', markdown(text))
+ plain = RE_MD_HTML_TAGS.sub('', markdown(text))
if not extract_length or len(plain) < extract_length:
return literal(plain)
return literal(unicode(truncate(plain, length=extract_length, indicator='...', whole_word=True)))
@@ -956,7 +956,7 @@ def related_item_link(related_item_dict):
def tag_link(tag):
url = url_for(controller='tag', action='read', id=tag['name'])
- return link_to(tag['name'], url)
+ return link_to(tag.get('title', tag['name']), url)
def group_link(group):
@@ -1367,12 +1367,77 @@ def get_request_param(parameter_name, default=None):
return request.params.get(parameter_name, default)
-def render_markdown(data):
+# find all inner text of html eg `moo` gets `moo` but not of tags
+# as this would lead to linkifying links if they are urls.
+RE_MD_GET_INNER_HTML = re.compile(
+ r'(^|(?:<(?!a\b)[^>]*>))([^<]+)(?=<|$)',
+ flags=re.UNICODE
+)
+
+# find all `internal links` eg. tag:moo, dataset:1234, tag:"my tag"
+RE_MD_INTERNAL_LINK = re.compile(
+ r'\b(tag|package|dataset|group):((")?(?(3)[ \w\-.]+|[\w\-.]+)(?(3)"))',
+ flags=re.UNICODE
+)
+
+# find external links eg http://foo.com, https://bar.org/foobar.html
+RE_MD_EXTERNAL_LINK = re.compile(
+ r'(\bhttps?:\/\/[\w\-\.,@?^=%&;:\/~\\+#]*)',
+ flags=re.UNICODE
+)
+
+# find all tags but ignore < in the strings so that we can use it correctly
+# in markdown
+RE_MD_HTML_TAGS = re.compile('<[^><]*>')
+
+
+def html_auto_link(data):
+ '''Linkifies HTML
+
+ tag:... converted to a tag link
+ dataset:... converted to a dataset link
+ group:... converted to a group link
+ http://... converted to a link
+ '''
+
+ LINK_FNS = {
+ 'tag': tag_link,
+ 'group': group_link,
+ 'dataset': dataset_link,
+ 'package': dataset_link,
+ }
+
+ def makelink(matchobj):
+ obj = matchobj.group(1)
+ name = matchobj.group(2)
+ title = '%s:%s' % (obj, name)
+ return LINK_FNS[obj]({'name': name.strip('"'), 'title': title})
+
+ def link(matchobj):
+ return '%s' \
+ % (matchobj.group(1), matchobj.group(1))
+
+ def process(matchobj):
+ data = matchobj.group(2)
+ data = RE_MD_INTERNAL_LINK.sub(makelink, data)
+ data = RE_MD_EXTERNAL_LINK.sub(link, data)
+ return matchobj.group(1) + data
+
+ data = RE_MD_GET_INNER_HTML.sub(process, data)
+ return data
+
+
+def render_markdown(data, auto_link=True):
''' returns the data as rendered markdown '''
- # cope with data == None
if not data:
return ''
- return literal(ckan.misc.MarkdownFormat().to_html(data))
+ data = RE_MD_HTML_TAGS.sub('', data.strip())
+ data = markdown(data, safe_mode=True)
+ # tags can be added by tag:... or tag:"...." and a link will be made
+ # from it
+ if auto_link:
+ data = html_auto_link(data)
+ return literal(data)
def format_resource_items(items):
diff --git a/ckan/lib/package_saver.py b/ckan/lib/package_saver.py
index 1a1a9a8e7f6..5b3f876151f 100644
--- a/ckan/lib/package_saver.py
+++ b/ckan/lib/package_saver.py
@@ -1,4 +1,3 @@
-import genshi
from sqlalchemy import orm
import ckan.lib.helpers as h
from ckan.lib.base import *
@@ -22,12 +21,8 @@ def render_package(cls, pkg, context):
render.
Note that the actual calling of render('package/read') is left
to the caller.'''
- try:
- notes_formatted = ckan.misc.MarkdownFormat().to_html(pkg.get('notes',''))
- c.pkg_notes_formatted = genshi.HTML(notes_formatted)
- except Exception, e:
- error_msg = "%s" % _("Cannot render package description")
- c.pkg_notes_formatted = genshi.HTML(error_msg)
+ c.pkg_notes_formatted = h.render_markdown(pkg.get('notes'))
+
c.current_rating, c.num_ratings = ckan.rating.get_rating(context['package'])
url = pkg.get('url', '')
c.pkg_url_link = h.link_to(url, url, rel='foaf:homepage', target='_blank') \
diff --git a/ckan/misc.py b/ckan/misc.py
deleted file mode 100644
index 29f3ae6447b..00000000000
--- a/ckan/misc.py
+++ /dev/null
@@ -1,95 +0,0 @@
-import re
-import logging
-import urllib
-import webhelpers.markdown
-
-from pylons.i18n import _
-
-log = logging.getLogger(__name__)
-
-class TextFormat(object):
-
- def to_html(self, instr):
- raise NotImplementedError()
-
-
-class MarkdownFormat(TextFormat):
- internal_link = re.compile('(dataset|package|group):([a-z0-9\-_]+)')
-
- # tag names are allowed more characters, including spaces. So are
- # treated specially.
- internal_tag_link = re.compile(\
- r"""(tag): # group 1
- ( # capture name (inc. quotes) (group 2)
- (")? # optional quotes for multi-word name (group 3)
- ( # begin capture of the name w/o quotes (group 4)
- (?(3) # if the quotes matched in group 3
- [ \w\-.] # then capture spaces (as well as other things)
- | # else
- [\w\-.] # don't capture spaces
- ) # end
- +) # end capture of the name w/o quotes (group 4)
- (?(3)") # close opening quote if necessary
- ) # end capture of the name with quotes (group 2)
- """, re.VERBOSE|re.UNICODE)
- normal_link = re.compile('<(http:[^>]+)>')
-
- html_whitelist = 'b center li ol p table td tr ul'.split(' ')
- whitelist_elem = re.compile(r'<(\/?((%s)(\s[^>]*)?))>' % "|".join(html_whitelist), re.IGNORECASE)
- whitelist_escp = re.compile(r'\\xfc\\xfd(\/?((%s)(\s[^>]*?)?))\\xfd\\xfc' % "|".join(html_whitelist), re.IGNORECASE)
- normal_link = re.compile(r']*?href="([^"]*?)"[^>]*?>', re.IGNORECASE)
- abbrev_link = re.compile(r'<(http://[^>]*)>', re.IGNORECASE)
- any_link = re.compile(r']*?>', re.IGNORECASE)
- close_link = re.compile(r'<(\/a[^>]*)>', re.IGNORECASE)
- link_escp = re.compile(r'\\xfc\\xfd(\/?(%s)[^>]*?)\\xfd\\xfc' % "|".join(['a']), re.IGNORECASE)
- web_address = re.compile(r'(\s|)((http|https):\/\/[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?)', re.IGNORECASE)
-
- def to_html(self, text):
- if text is None:
- return ''
- # Encode whitelist elements.
- text = self.whitelist_elem.sub(r'\\\\xfc\\\\xfd\1\\\\xfd\\\\xfc', text)
-
- # Encode links only in an acceptable format (guard against spammers)
- text = self.normal_link.sub(r'\\\\xfc\\\\xfda href="\1" target="_blank" rel="nofollow"\\\\xfd\\\\xfc', text)
- text = self.abbrev_link.sub(r'\\\\xfc\\\\xfda href="\1" target="_blank" rel="nofollow"\\\\xfd\\\\xfc\1
', text)
- text = self.any_link.sub(r'\\\\xfc\\\\xfda href="TAG MALFORMED" target="_blank" rel="nofollow"\\\\xfd\\\\xfc', text)
- text = self.close_link.sub(r'\\\\xfc\\\\xfd\1\\\\xfd\\\\xfc', text)
-
- # Convert internal tag links
- text = self.internal_tag_link.sub(self._create_tag_link, text)
-
- # Convert internal links.
- text = self.internal_link.sub(r'[\1:\2] (/\1/\2)', text)
-
- # Convert to markdown format.
- text = self.normal_link.sub(r'[\1] (\1)', text)
-
- # Convert to markdown format.
- text = self.normal_link.sub(r'[\1] (\1)', text)
-
- # Markdown to HTML.
- text = webhelpers.markdown.markdown(text, safe_mode=True)
-
- # Remaining unlinked web addresses to become addresses
- text = self.web_address.sub(r'\1\2', text)
-
- # Decode whitelist elements.
- text = self.whitelist_escp.sub(r'<\1>', text)
- text = self.link_escp.sub(r'<\1>', text)
-
- return text
-
- def _create_tag_link(self, match_object):
- """
- A callback used to create the internal tag link.
-
- The reason for this is that webhelpers.markdown does not percent-escape
- spaces, nor does it encode unicode characters correctly.
-
- This is only applied to the tag substitution since only tags may
- have spaces or unicode characters.
- """
- g = match_object.group
- url = urllib.quote(g(4).encode('utf8'))
- return r'[%s:%s] (/%s/%s)' % (g(1), g(2), g(1), url)
diff --git a/ckan/model/package.py b/ckan/model/package.py
index ec49bbab019..bce320bde98 100644
--- a/ckan/model/package.py
+++ b/ckan/model/package.py
@@ -17,7 +17,6 @@
import activity
import extension
-import ckan.misc
import ckan.lib.dictization
__all__ = ['Package', 'package_table', 'package_revision_table',
@@ -216,7 +215,8 @@ def as_dict(self, ref_package_by='name', ref_group_by='name'):
if self.metadata_modified else None
_dict['metadata_created'] = self.metadata_created.isoformat() \
if self.metadata_created else None
- _dict['notes_rendered'] = ckan.misc.MarkdownFormat().to_html(self.notes)
+ import ckan.lib.helpers as h
+ _dict['notes_rendered'] = h.render_markdown(self.notes)
_dict['type'] = self.type or u'dataset'
#tracking
import ckan.model as model
diff --git a/ckan/templates_legacy/package/read_core.html b/ckan/templates_legacy/package/read_core.html
index 9868845a8ec..427775a8d42 100644
--- a/ckan/templates_legacy/package/read_core.html
+++ b/ckan/templates_legacy/package/read_core.html
@@ -10,7 +10,7 @@
-
+
diff --git a/ckan/tests/functional/test_package.py b/ckan/tests/functional/test_package.py
index 4adabcc6c78..4c2adeb3de4 100644
--- a/ckan/tests/functional/test_package.py
+++ b/ckan/tests/functional/test_package.py
@@ -309,10 +309,6 @@ def test_read(self):
assert anna.version in res
assert anna.url in res
assert 'Some test notes' in res
- self.check_named_element(res, 'a',
- 'http://ckan.net/',
- 'target="_blank"',
- 'rel="nofollow"')
assert '
Some bolded text.' in res
self.check_tag_and_data(res, 'left arrow', '<')
self.check_tag_and_data(res, 'umlaut', u'\xfc')
@@ -350,7 +346,7 @@ def test_read_internal_links(self):
pkg_name = u'link-test',
CreateTestData.create_arbitrary([
{'name':pkg_name,
- 'notes':'Decoy link here: decoy:decoy, real links here: package:pkg-1, ' \
+ 'notes':'Decoy link here: decoy:decoy, real links here: dataset:pkg-1, ' \
'tag:tag_1 group:test-group-1 and a multi-word tag: tag:"multi word with punctuation."',
}
])
@@ -358,9 +354,9 @@ def test_read_internal_links(self):
res = self.app.get(offset)
def check_link(res, controller, id):
id_in_uri = id.strip('"').replace(' ', '%20') # remove quotes and percent-encode spaces
- self.check_tag_and_data(res, 'a ', '/%s/%s' % (controller, id_in_uri),
- '%s:%s' % (controller, id))
- check_link(res, 'package', 'pkg-1')
+ self.check_tag_and_data(res, 'a ', '%s/%s' % (controller, id_in_uri),
+ '%s:%s' % (controller, id.replace('"', '"')))
+ check_link(res, 'dataset', 'pkg-1')
check_link(res, 'tag', 'tag_1')
check_link(res, 'tag', '"multi word with punctuation."')
check_link(res, 'group', 'test-group-1')
@@ -1557,10 +1553,10 @@ def teardown(self):
def test_markdown_html_whitelist(self):
self.body = str(self.res)
- self.assert_fragment('
')
- self.assert_fragment('Description | ')
- self.assert_fragment('subcategory.txt')
- self.assert_fragment('-- | ')
+ self.fail_if_fragment('')
+ self.fail_if_fragment('Description | ')
+ self.fail_if_fragment('subcategory.txt')
+ self.fail_if_fragment('-- | ')
self.fail_if_fragment('