From 172677d2ad334fb6faca5d3b6c2f0bfc6e7f63fb Mon Sep 17 00:00:00 2001
From: tobes <toby.junk@gmail.com>
Date: Tue, 2 Apr 2013 13:17:21 +0100
Subject: [PATCH 1/7] [#708] Fix render_markdown() helper function

---
 ckan/lib/helpers.py | 70 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 67 insertions(+), 3 deletions(-)
diff --git a/ckan/lib/helpers.py b/ckan/lib/helpers.py
index ab521abeb3c..3e536ccc72c 100644
--- a/ckan/lib/helpers.py
+++ b/ckan/lib/helpers.py
@@ -960,7 +960,7 @@ def related_item_link(related_item_dict):
 
 def tag_link(tag):
     url = url_for(controller='tag', action='read', id=tag['name'])
-    return link_to(tag['name'], url)
+    return link_to(tag.get('title', tag['name']), url)
 
 
 def group_link(group):
@@ -1371,12 +1371,76 @@ def get_request_param(parameter_name, default=None):
     return request.params.get(parameter_name, default)
 
 
-def render_markdown(data):
+# find all inner text of html eg `<b>moo</b>` gets `moo` but not of <a> tags
+# as this would lead to linkifying links if they are urls.
+RE_MD_GET_INNER_HTML = re.compile(
+    r'(^|(?:<(?!a\b)[^>]*>))([^<]+)(?=<|$)',
+    flags=re.UNICODE
+)
+
+# find all `internal links` eg. tag:moo, dataset:1234, tag:"my tag"
+RE_MD_INTERNAL_LINK = re.compile(
+    r'\b(tag|package|dataset|group):((")?(?(3)[ \w\-.]+|[\w\-.]+)(?(3)"))',
+    flags=re.UNICODE
+)
+
+# find external links eg http://foo.com, https:/bar.org/foobar.html
+RE_MD_EXTERNAL_LINK = re.compile(r'(\bhttps?:\/\/[\w\-\.,@?^=%&;:\/~\\+#]*)',
+    flags=re.UNICODE
+)
+
+# find all tags but ignore < in the strings so that we can use it correctly
+# in markdown
+RE_MD_HTML_TAGS = re.compile('<[^><]*>')
+
+def html_auto_link(data):
+    '''Linkifies HTML
+
+    tag:... converted to a tag link
+    dataset:... converted to a dataset link
+    group:... converted to a group link
+    http://... converted to a link
+    '''
+    def makelink(matchobj):
+        obj = matchobj.group(1)
+        name = matchobj.group(2)
+        title = '%s:%s' % (obj, name)
+        if obj == 'tag':
+            return tag_link({'name': name.strip('"'),
+                             'title': title})
+        elif obj  == 'group':
+            return group_link({'name': name,
+                               'title': title})
+        elif obj in ['dataset', 'package']:
+            return dataset_link({'name': name,
+                                 'title': title})
+
+    def link(matchobj):
+        return '<a href="%s" target="_blank" rel="nofollow">%s</a>' \
+            % (matchobj.group(1), matchobj.group(1))
+
+    def process(matchobj):
+        data = matchobj.group(2)
+        data = RE_MD_INTERNAL_LINK.sub(makelink, data)
+        data = RE_MD_EXTERNAL_LINK.sub(link, data)
+        return matchobj.group(1) + data
+
+    data = RE_MD_GET_INNER_HTML.sub(process, data)
+    return data
+
+
+def render_markdown(data, auto_link=True):
     ''' returns the data as rendered markdown '''
     # cope with data == None
     if not data:
         return ''
-    return literal(ckan.misc.MarkdownFormat().to_html(data))
+    data = RE_MD_HTML_TAGS.sub('', data.strip())
+    data = markdown(data, safe_mode=True)
+    # tags can be added by tag:... or tag:"...." and a link will be made
+    # from it
+    if auto_link:
+        data = html_auto_link(data)
+    return literal(data)
 
 
 def format_resource_items(items):

From 6924bbd41a7ccc77a5e11d8a6012e5ed75c1f510 Mon Sep 17 00:00:00 2001
From: tobes <toby.junk@gmail.com>
Date: Tue, 2 Apr 2013 13:19:34 +0100
Subject: [PATCH 2/7] [#708] Switch over to use markdown_extract() not misc.py

---
 ckan/controllers/api.py                      |  2 +-
 ckan/controllers/group.py                    |  9 +-
 ckan/controllers/user.py                     | 14 +--
 ckan/lib/dictization/model_dictize.py        |  3 +-
 ckan/lib/package_saver.py                    |  9 +-
 ckan/misc.py                                 | 95 --------------------
 ckan/model/package.py                        |  4 +-
 ckan/templates_legacy/package/read_core.html |  2 +-
 8 files changed, 9 insertions(+), 129 deletions(-)
 delete mode 100644 ckan/misc.py

diff --git a/ckan/controllers/api.py b/ckan/controllers/api.py
index 09b0c14368e..18e8d2bc063 100644
--- a/ckan/controllers/api.py
+++ b/ckan/controllers/api.py
@@ -566,7 +566,7 @@ def _get_search_params(cls, request_params):
 
     def markdown(self, ver=None):
         raw_markdown = request.params.get('q', '')
-        results = ckan.misc.MarkdownFormat().to_html(raw_markdown)
+        results = h.render_markdown(raw_markdown)
 
         return self._finish_ok(results)
 
diff --git a/ckan/controllers/group.py b/ckan/controllers/group.py
index f3f8508c6f4..505df2fade9 100644
--- a/ckan/controllers/group.py
+++ b/ckan/controllers/group.py
@@ -201,14 +201,7 @@ def _read(self, id, limit):
         else:
             q += ' groups: "%s"' % c.group_dict.get('name')
 
-        try:
-            description_formatted = ckan.misc.MarkdownFormat().to_html(
-                c.group_dict.get('description', ''))
-            c.description_formatted = genshi.HTML(description_formatted)
-        except Exception, e:
-            error_msg = "<span class='inline-warning'>%s</span>" %\
-                        _("Cannot render description")
-            c.description_formatted = genshi.HTML(error_msg)
+        c.description_formatted = h.render_markdown(c.group_dict.get('description'))
 
         context['return_query'] = True
 
diff --git a/ckan/controllers/user.py b/ckan/controllers/user.py
index 638bd7a7231..1ca75663154 100644
--- a/ckan/controllers/user.py
+++ b/ckan/controllers/user.py
@@ -3,11 +3,9 @@
 
 from pylons import session, c, g, request, config
 from pylons.i18n import _
-import genshi
 
 import ckan.lib.i18n as i18n
 import ckan.lib.base as base
-import ckan.misc as misc
 import ckan.model as model
 import ckan.lib.helpers as h
 import ckan.new_authz as new_authz
@@ -73,7 +71,7 @@ def _setup_template_variables(self, context, data_dict):
             abort(401, _('Not authorized to see this page'))
         c.user_dict = user_dict
         c.is_myself = user_dict['name'] == c.user
-        c.about_formatted = self._format_about(user_dict['about'])
+        c.about_formatted = h.render_markdown(user_dict['about'])
 
     ## end hooks
 
@@ -620,13 +618,3 @@ def unfollow(self, id):
                              or e.error_dict)
             h.flash_error(error_message)
         h.redirect_to(controller='user', action='read', id=id)
-
-    def _format_about(self, about):
-        about_formatted = misc.MarkdownFormat().to_html(about)
-        try:
-            html = genshi.HTML(about_formatted)
-        except genshi.ParseError, e:
-            log.error('Could not print "about" field Field: %r Error: %r',
-                      about, e)
-            html = _('Error: Could not parse About text')
-        return html
diff --git a/ckan/lib/dictization/model_dictize.py b/ckan/lib/dictization/model_dictize.py
index d61c68e73b1..d2a74071923 100644
--- a/ckan/lib/dictization/model_dictize.py
+++ b/ckan/lib/dictization/model_dictize.py
@@ -4,7 +4,6 @@
 from pylons import config
 from sqlalchemy.sql import select
 
-import ckan.misc as misc
 import ckan.logic as logic
 import ckan.plugins as plugins
 import ckan.lib.helpers as h
@@ -530,7 +529,7 @@ def package_to_api(pkg, context):
     dictized['license'] = pkg.license.title if pkg.license else None
     dictized['ratings_average'] = pkg.get_average_rating()
     dictized['ratings_count'] = len(pkg.ratings)
-    dictized['notes_rendered'] = misc.MarkdownFormat().to_html(pkg.notes)
+    dictized['notes_rendered'] = h.render_markdown(pkg.notes)
 
     site_url = config.get('ckan.site_url', None)
     if site_url:
diff --git a/ckan/lib/package_saver.py b/ckan/lib/package_saver.py
index 1a1a9a8e7f6..5b3f876151f 100644
--- a/ckan/lib/package_saver.py
+++ b/ckan/lib/package_saver.py
@@ -1,4 +1,3 @@
-import genshi
 from sqlalchemy import orm
 import ckan.lib.helpers as h
 from ckan.lib.base import *
@@ -22,12 +21,8 @@ def render_package(cls, pkg, context):
         render. 
         Note that the actual calling of render('package/read') is left
         to the caller.'''
-        try:
-            notes_formatted = ckan.misc.MarkdownFormat().to_html(pkg.get('notes',''))
-            c.pkg_notes_formatted = genshi.HTML(notes_formatted)
-        except Exception, e:
-            error_msg = "<span class='inline-warning'>%s</span>" % _("Cannot render package description")
-            c.pkg_notes_formatted = genshi.HTML(error_msg)
+        c.pkg_notes_formatted = h.render_markdown(pkg.get('notes'))
+
         c.current_rating, c.num_ratings = ckan.rating.get_rating(context['package'])
         url = pkg.get('url', '')
         c.pkg_url_link = h.link_to(url, url, rel='foaf:homepage', target='_blank') \
diff --git a/ckan/misc.py b/ckan/misc.py
deleted file mode 100644
index 29f3ae6447b..00000000000
--- a/ckan/misc.py
+++ /dev/null
@@ -1,95 +0,0 @@
-import re
-import logging
-import urllib
-import webhelpers.markdown
-
-from pylons.i18n import _
-
-log = logging.getLogger(__name__)
-
-class TextFormat(object):
-
-    def to_html(self, instr):
-        raise NotImplementedError()
-
-
-class MarkdownFormat(TextFormat):
-    internal_link = re.compile('(dataset|package|group):([a-z0-9\-_]+)')
-
-    # tag names are allowed more characters, including spaces.  So are
-    # treated specially.
-    internal_tag_link = re.compile(\
-        r"""(tag):                               # group 1
-            (                                    # capture name (inc. quotes) (group 2)
-            (")?                                 # optional quotes for multi-word name (group 3)
-            (                                    # begin capture of the name w/o quotes (group 4)
-            (?(3)                                # if the quotes matched in group 3
-                [ \w\-.]                         #     then capture spaces (as well as other things)
-                |                                # else
-                [\w\-.]                          #     don't capture spaces
-            )                                    # end
-            +)                                   # end capture of the name w/o quotes (group 4)
-            (?(3)")                              # close opening quote if necessary
-            )                                    # end capture of the name with quotes (group 2)
-        """, re.VERBOSE|re.UNICODE)
-    normal_link = re.compile('<(http:[^>]+)>')
-
-    html_whitelist = 'b center li ol p table td tr ul'.split(' ')
-    whitelist_elem = re.compile(r'<(\/?((%s)(\s[^>]*)?))>' % "|".join(html_whitelist), re.IGNORECASE)
-    whitelist_escp = re.compile(r'\\xfc\\xfd(\/?((%s)(\s[^>]*?)?))\\xfd\\xfc' % "|".join(html_whitelist), re.IGNORECASE)
-    normal_link = re.compile(r'<a[^>]*?href="([^"]*?)"[^>]*?>', re.IGNORECASE)
-    abbrev_link = re.compile(r'<(http://[^>]*)>', re.IGNORECASE)
-    any_link = re.compile(r'<a[^>]*?>', re.IGNORECASE)
-    close_link = re.compile(r'<(\/a[^>]*)>', re.IGNORECASE)
-    link_escp = re.compile(r'\\xfc\\xfd(\/?(%s)[^>]*?)\\xfd\\xfc' % "|".join(['a']), re.IGNORECASE)
-    web_address = re.compile(r'(\s|<p>)((http|https):\/\/[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&amp;:/~\+#]*[\w\-\@?^=%&amp;/~\+#])?)', re.IGNORECASE)
-    
-    def to_html(self, text):
-        if text is None:
-            return ''
-        # Encode whitelist elements.
-        text = self.whitelist_elem.sub(r'\\\\xfc\\\\xfd\1\\\\xfd\\\\xfc', text)
-
-        # Encode links only in an acceptable format (guard against spammers)
-        text = self.normal_link.sub(r'\\\\xfc\\\\xfda href="\1" target="_blank" rel="nofollow"\\\\xfd\\\\xfc', text)
-        text = self.abbrev_link.sub(r'\\\\xfc\\\\xfda href="\1" target="_blank" rel="nofollow"\\\\xfd\\\\xfc\1</a>', text)
-        text = self.any_link.sub(r'\\\\xfc\\\\xfda href="TAG MALFORMED" target="_blank" rel="nofollow"\\\\xfd\\\\xfc', text)
-        text = self.close_link.sub(r'\\\\xfc\\\\xfd\1\\\\xfd\\\\xfc', text)
-
-        # Convert internal tag links
-        text = self.internal_tag_link.sub(self._create_tag_link, text)
-
-        # Convert internal links.
-        text = self.internal_link.sub(r'[\1:\2] (/\1/\2)', text)
-
-        # Convert <link> to markdown format.
-        text = self.normal_link.sub(r'[\1] (\1)', text)
-
-        # Convert <link> to markdown format.
-        text = self.normal_link.sub(r'[\1] (\1)', text)
-
-        # Markdown to HTML.
-        text = webhelpers.markdown.markdown(text, safe_mode=True)
-
-        # Remaining unlinked web addresses to become addresses
-        text = self.web_address.sub(r'\1<a href="\2" target="_blank" rel="nofollow">\2</a>', text)
-
-        # Decode whitelist elements.
-        text = self.whitelist_escp.sub(r'<\1>', text)
-        text = self.link_escp.sub(r'<\1>', text)
-
-        return text
-
-    def _create_tag_link(self, match_object):
-        """
-        A callback used to create the internal tag link.
-
-        The reason for this is that webhelpers.markdown does not percent-escape
-        spaces, nor does it encode unicode characters correctly.
-
-        This is only applied to the tag substitution since only tags may
-        have spaces or unicode characters.
-        """
-        g = match_object.group
-        url = urllib.quote(g(4).encode('utf8'))
-        return r'[%s:%s] (/%s/%s)' % (g(1), g(2), g(1), url)
diff --git a/ckan/model/package.py b/ckan/model/package.py
index ec49bbab019..bce320bde98 100644
--- a/ckan/model/package.py
+++ b/ckan/model/package.py
@@ -17,7 +17,6 @@
 import activity
 import extension
 
-import ckan.misc
 import ckan.lib.dictization
 
 __all__ = ['Package', 'package_table', 'package_revision_table',
@@ -216,7 +215,8 @@ def as_dict(self, ref_package_by='name', ref_group_by='name'):
             if self.metadata_modified else None
         _dict['metadata_created'] = self.metadata_created.isoformat() \
             if self.metadata_created else None
-        _dict['notes_rendered'] = ckan.misc.MarkdownFormat().to_html(self.notes)
+        import ckan.lib.helpers as h
+        _dict['notes_rendered'] = h.render_markdown(self.notes)
         _dict['type'] = self.type or u'dataset'
         #tracking
         import ckan.model as model
diff --git a/ckan/templates_legacy/package/read_core.html b/ckan/templates_legacy/package/read_core.html
index 9868845a8ec..427775a8d42 100644
--- a/ckan/templates_legacy/package/read_core.html
+++ b/ckan/templates_legacy/package/read_core.html
@@ -10,7 +10,7 @@
     <div id="dataset-overview">
 
       <!-- Description -->
-      <div class="notes" py:if="str(c.pkg_notes_formatted).strip()">
+      <div class="notes" py:if="c.pkg_notes_formatted">
         <div id="notes-extract">
           ${c.pkg_notes_formatted}
         </div>

From 2c8977e0bedce4ebc11a066f2ad11aa82e124726 Mon Sep 17 00:00:00 2001
From: tobes <toby.junk@gmail.com>
Date: Tue, 2 Apr 2013 13:20:12 +0100
Subject: [PATCH 3/7] [#708] Fix tests

---
 ckan/lib/create_test_data.py          |   2 +-
 ckan/tests/functional/test_package.py |  20 ++---
 ckan/tests/functional/test_user.py    |  44 -----------
 ckan/tests/lib/test_dictization.py    |   2 +-
 ckan/tests/logic/test_action.py       |   4 +-
 ckan/tests/misc/test_format_text.py   | 102 +++++++++-----------------
 ckan/tests/models/test_package.py     |   2 +-
 7 files changed, 47 insertions(+), 129 deletions(-)

diff --git a/ckan/lib/create_test_data.py b/ckan/lib/create_test_data.py
index e2aba3b242f..6826e254c63 100644
--- a/ckan/lib/create_test_data.py
+++ b/ckan/lib/create_test_data.py
@@ -464,7 +464,7 @@ def create(cls, auth_profile="", package_type=None):
         model.Session.add_all([
             model.User(name=u'tester', apikey=u'tester', password=u'tester'),
             model.User(name=u'joeadmin', password=u'joeadmin'),
-            model.User(name=u'annafan', about=u'I love reading Annakarenina. My site: <a href="http://anna.com">anna.com</a>', password=u'annafan'),
+            model.User(name=u'annafan', about=u'I love reading Annakarenina. My site: http://anna.com', password=u'annafan'),
             model.User(name=u'russianfan', password=u'russianfan'),
             sysadmin,
             ])
diff --git a/ckan/tests/functional/test_package.py b/ckan/tests/functional/test_package.py
index 4adabcc6c78..4c2adeb3de4 100644
--- a/ckan/tests/functional/test_package.py
+++ b/ckan/tests/functional/test_package.py
@@ -309,10 +309,6 @@ def test_read(self):
         assert anna.version in res
         assert anna.url in res
         assert 'Some test notes' in res
-        self.check_named_element(res, 'a',
-                                 'http://ckan.net/',
-                                 'target="_blank"',
-                                 'rel="nofollow"')
         assert '<strong>Some bolded text.</strong>' in res
         self.check_tag_and_data(res, 'left arrow', '&lt;')
         self.check_tag_and_data(res, 'umlaut', u'\xfc')
@@ -350,7 +346,7 @@ def test_read_internal_links(self):
         pkg_name = u'link-test',
         CreateTestData.create_arbitrary([
             {'name':pkg_name,
-             'notes':'Decoy link here: decoy:decoy, real links here: package:pkg-1, ' \
+             'notes':'Decoy link here: decoy:decoy, real links here: dataset:pkg-1, ' \
                    'tag:tag_1 group:test-group-1 and a multi-word tag: tag:"multi word with punctuation."',
              }
             ])
@@ -358,9 +354,9 @@ def test_read_internal_links(self):
         res = self.app.get(offset)
         def check_link(res, controller, id):
             id_in_uri = id.strip('"').replace(' ', '%20') # remove quotes and percent-encode spaces
-            self.check_tag_and_data(res, 'a ', '/%s/%s' % (controller, id_in_uri),
-                                    '%s:%s' % (controller, id))
-        check_link(res, 'package', 'pkg-1')
+            self.check_tag_and_data(res, 'a ', '%s/%s' % (controller, id_in_uri),
+                                    '%s:%s' % (controller, id.replace('"', '&#34;')))
+        check_link(res, 'dataset', 'pkg-1')
         check_link(res, 'tag', 'tag_1')
         check_link(res, 'tag', '"multi word with punctuation."')
         check_link(res, 'group', 'test-group-1')
@@ -1557,10 +1553,10 @@ def teardown(self):
 
     def test_markdown_html_whitelist(self):
         self.body = str(self.res)
-        self.assert_fragment('<table width="100%" border="1">')
-        self.assert_fragment('<td rowspan="2"><b>Description</b></td>')
-        self.assert_fragment('<a href="http://www.nber.org/patents/subcategories.txt" target="_blank" rel="nofollow">subcategory.txt</a>')
-        self.assert_fragment('<td colspan="2"><center>--</center></td>')
+        self.fail_if_fragment('<table width="100%" border="1">')
+        self.fail_if_fragment('<td rowspan="2"><b>Description</b></td>')
+        self.fail_if_fragment('<a href="http://www.nber.org/patents/subcategories.txt" target="_blank" rel="nofollow">subcategory.txt</a>')
+        self.fail_if_fragment('<td colspan="2"><center>--</center></td>')
         self.fail_if_fragment('<script>')
 
     def assert_fragment(self, fragment):
diff --git a/ckan/tests/functional/test_user.py b/ckan/tests/functional/test_user.py
index 571e100ca39..e0ef996d4d9 100644
--- a/ckan/tests/functional/test_user.py
+++ b/ckan/tests/functional/test_user.py
@@ -89,50 +89,6 @@ def test_user_read_logged_in(self):
         assert 'checkpoint:is-myself' in main_res, main_res
         assert 'Edit Profile' in main_res, main_res
 
-    def test_user_read_about_unfinished(self):
-        user = model.User.by_name(u'unfinisher')
-        offset = '/user/%s' % user.id
-        res = self.app.get(offset, status=200)
-        main_res = self.main_div(res)
-        assert 'unfinisher' in res, res
-        assert '&lt;a href="http://unfinished.tag' in main_res, main_res
-
-    def test_user_read_about_unclosed(self):
-        user = model.User.by_name(u'uncloser')
-        offset = '/user/%s' % user.id
-        res = self.app.get(offset, status=200)
-        main_res = self.main_div(res)
-        assert 'unclosed' in res, res
-        # tag gets closed by genshi
-        assert '<a href="http://unclosed.tag" target="_blank" rel="nofollow">\n</a>' in main_res, main_res
-
-    def test_user_read_about_spam(self):
-        user = model.User.by_name(u'spammer')
-        offset = '/user/%s' % user.id
-        res = self.app.get(offset, status=200)
-        main_res = self.main_div(res)
-        assert 'spammer' in res, res
-        self.check_named_element(res, 'a',
-                                 'href="http://mysite"',
-                                 'target="_blank"',
-                                 'rel="nofollow"')
-
-        self.check_named_element(res, 'a',
-                                 'href="TAG MALFORMED"',
-                                 'target="_blank"',
-                                 'rel="nofollow"')
-
-    def test_user_read_about_spam2(self):
-        user = model.User.by_name(u'spammer2')
-        offset = '/user/%s' % user.id
-        res = self.app.get(offset, status=200)
-        main_res = self.main_div(res)
-        assert 'spammer2' in res, res
-        assert 'spamsite2' not in res, res
-        # some computers have the Genshi exception and some insert "malformed"
-        # but either are fine
-        assert 'Error: Could not parse About text' in res or\
-               'malformed' in res, res
 
     def test_user_login_page(self):
         offset = url_for(controller='user', action='login', id=None)
diff --git a/ckan/tests/lib/test_dictization.py b/ckan/tests/lib/test_dictization.py
index 19f89aeba2f..50ca5e97062 100644
--- a/ckan/tests/lib/test_dictization.py
+++ b/ckan/tests/lib/test_dictization.py
@@ -928,7 +928,7 @@ def test_16_group_dictized(self):
                                'title': u'simple',
                                'type': u'organization',
                                'approval_status': u'approved'}],
-                    'users': [{'about': u'I love reading Annakarenina. My site: <a href="http://anna.com">anna.com</a>',
+                    'users': [{'about': u'I love reading Annakarenina. My site: http://anna.com',
                               'display_name': u'annafan',
                               'capacity' : 'public',
                               'sysadmin': False,
diff --git a/ckan/tests/logic/test_action.py b/ckan/tests/logic/test_action.py
index b63e0ca6067..082e5225f07 100644
--- a/ckan/tests/logic/test_action.py
+++ b/ckan/tests/logic/test_action.py
@@ -292,7 +292,7 @@ def test_04_user_list(self):
         assert res_obj['success'] == True
         assert len(res_obj['result']) == 7
         assert res_obj['result'][0]['name'] == 'annafan'
-        assert res_obj['result'][0]['about'] == 'I love reading Annakarenina. My site: <a href="http://anna.com">anna.com</a>'
+        assert res_obj['result'][0]['about'] == 'I love reading Annakarenina. My site: http://anna.com'
         assert not 'apikey' in res_obj['result'][0]
 
     def test_05_user_show(self):
@@ -304,7 +304,7 @@ def test_05_user_show(self):
         assert res_obj['success'] == True
         result = res_obj['result']
         assert result['name'] == 'annafan'
-        assert result['about'] == 'I love reading Annakarenina. My site: <a href="http://anna.com">anna.com</a>'
+        assert result['about'] == 'I love reading Annakarenina. My site: http://anna.com'
         assert 'activity' in result
         assert 'created' in result
         assert 'display_name' in result
diff --git a/ckan/tests/misc/test_format_text.py b/ckan/tests/misc/test_format_text.py
index 934b06c5600..e4e2030e18d 100644
--- a/ckan/tests/misc/test_format_text.py
+++ b/ckan/tests/misc/test_format_text.py
@@ -1,4 +1,4 @@
-from ckan.misc import MarkdownFormat
+import ckan.lib.helpers as h
 
 class TestFormatText:
 
@@ -14,92 +14,79 @@ def test_markdown(self):
 </p>
 <p><em>Some italicized text.</em>
 </p>'''
-        format = MarkdownFormat()
-        out = format.to_html(instr)
+        out = h.render_markdown(instr)
         assert out == exp
 
     def test_markdown_blank(self):
         instr = None
-        format = MarkdownFormat()
-        out = format.to_html(instr)
+        out = h.render_markdown(instr)
         assert out == ''
 
     def test_evil_markdown(self):
         instr = 'Evil <script src="http://evilserver.net/evil.js";>'
-        exp = '''<p>Evil [HTML_REMOVED]
-</p>'''
-        format = MarkdownFormat()
-        out = format.to_html(instr)
+        exp = '''<p>Evil \n</p>'''
+        out = h.render_markdown(instr)
         assert out == exp, out
-        
+
     def test_internal_link(self):
-        instr = 'package:test-_pkg'
-        exp = '<a href="/package/test-_pkg">package:test-_pkg</a>'
-        format = MarkdownFormat()
-        out = format.to_html(instr)
+        instr = 'dataset:test-_pkg'
+        exp = '<p><a href="/dataset/test-_pkg">dataset:test-_pkg</a>\n</p>'
+        out = h.render_markdown(instr)
         assert exp in out, '\nGot: %s\nWanted: %s' % (out, exp)
 
     def test_internal_tag_link(self):
         """Asserts links like 'tag:test-tag' work"""
         instr = 'tag:test-tag foobar'
         exp = '<a href="/tag/test-tag">tag:test-tag</a> foobar'
-        format = MarkdownFormat()
-        out = format.to_html(instr)
+        out = h.render_markdown(instr)
         assert exp in out, '\nGot: %s\nWanted: %s' % (out, exp)
 
     def test_internal_tag_linked_with_quotes(self):
         """Asserts links like 'tag:"test-tag"' work"""
         instr = 'tag:"test-tag" foobar'
-        exp = '<a href="/tag/test-tag">tag:"test-tag"</a> foobar'
-        format = MarkdownFormat()
-        out = format.to_html(instr)
+        exp = '<p><a href="/tag/test-tag">tag:&#34;test-tag&#34;</a> foobar\n</p>'
+        out = h.render_markdown(instr)
         assert exp in out, '\nGot: %s\nWanted: %s' % (out, exp)
 
     def test_internal_tag_linked_with_quotes_and_space(self):
         """Asserts links like 'tag:"test tag"' work"""
         instr = 'tag:"test tag" foobar'
-        exp = '<a href="/tag/test%20tag">tag:"test tag"</a> foobar'
-        format = MarkdownFormat()
-        out = format.to_html(instr)
+        exp = '<p><a href="/tag/test%20tag">tag:&#34;test tag&#34;</a> foobar\n</p>'
+        out = h.render_markdown(instr)
         assert exp in out, '\nGot: %s\nWanted: %s' % (out, exp)
 
     def test_internal_tag_with_no_opening_quote_only_matches_single_word(self):
         """Asserts that without an opening quote only one word is matched"""
         instr = 'tag:test tag" foobar' # should match 'tag:test'
         exp = '<a href="/tag/test">tag:test</a> tag" foobar'
-        format = MarkdownFormat()
-        out = format.to_html(instr)
+        out = h.render_markdown(instr)
         assert exp in out, '\nGot: %s\nWanted: %s' % (out, exp)
 
     def test_internal_tag_with_no_opening_quote_wont_match_the_closing_quote(self):
         """Asserts that 'tag:test" tag' is matched, but to 'tag:test'"""
         instr = 'tag:test" foobar' # should match 'tag:test'
         exp = '<a href="/tag/test">tag:test</a>" foobar'
-        format = MarkdownFormat()
-        out = format.to_html(instr)
+        out = h.render_markdown(instr)
         assert exp in out, '\nGot: %s\nWanted: %s' % (out, exp)
 
     def test_internal_tag_with_no_closing_quote_does_not_match(self):
         """Asserts that without an opening quote only one word is matched"""
         instr = 'tag:"test tag foobar'
-        format = MarkdownFormat()
-        out = format.to_html(instr)
+        out = h.render_markdown(instr)
         assert "<a href" not in out
 
     def test_tag_names_match_simple_punctuation(self):
         """Asserts punctuation and capital letters are matched in the tag name"""
         instr = 'tag:"Test- _." foobar'
-        exp = '<a href="/tag/Test-%20_.">tag:"Test- _."</a> foobar'
-        format = MarkdownFormat()
-        out = format.to_html(instr)
+        exp = '<p><a href="/tag/Test-%20_.">tag:&#34;Test- _.&#34;</a> foobar\n</p>'
+        out = h.render_markdown(instr)
         assert exp in out, '\nGot: %s\nWanted: %s' % (out, exp)
 
     def test_tag_names_do_not_match_commas(self):
         """Asserts commas don't get matched as part of a tag name"""
         instr = 'tag:Test,tag foobar'
         exp = '<a href="/tag/Test">tag:Test</a>,tag foobar'
-        format = MarkdownFormat()
-        out = format.to_html(instr)
+        out = h.render_markdown(instr)
         assert exp in out, '\nGot: %s\nWanted: %s' % (out, exp)
 
     def test_tag_names_dont_match_non_space_whitespace(self):
@@ -108,59 +95,38 @@ def test_tag_names_dont_match_non_space_whitespace(self):
         for ch in whitespace_characters:
             instr = 'tag:Bad' + ch + 'space'
             exp = '<a href="/tag/Bad">tag:Bad</a>'
-            format = MarkdownFormat()
-            out = format.to_html(instr)
+            out = h.render_markdown(instr)
             assert exp in out, '\nGot: %s\nWanted: %s' % (out, exp)
-    
+
     def test_tag_names_with_unicode_alphanumeric(self):
         """Asserts that unicode alphanumeric characters are captured"""
         instr = u'tag:"Japanese katakana \u30a1" blah'
-        exp = u'<a href="/tag/Japanese%20katakana%20%E3%82%A1">tag:"Japanese katakana \u30a1"</a>'
-        format = MarkdownFormat()
-        out = format.to_html(instr)
+        exp = u'<p><a href="/tag/Japanese%20katakana%20%E3%82%A1">tag:&#34;Japanese katakana \u30a1&#34;</a> blah\n</p>'
+        out = h.render_markdown(instr)
         assert exp in out, u'\nGot: %s\nWanted: %s' % (out, exp)
 
     def test_normal_link(self):
-        instr = '<http://somelink/>'
+        instr = 'http://somelink/'
         exp = '<a href="http://somelink/" target="_blank" rel="nofollow">http://somelink/</a>'
-        format = MarkdownFormat()
-        out = format.to_html(instr)
+        out = h.render_markdown(instr)
         assert exp in out, '\nGot: %s\nWanted: %s' % (out, exp)
 
     def test_auto_link(self):
         instr = 'http://somelink.com'
         exp = '<a href="http://somelink.com" target="_blank" rel="nofollow">http://somelink.com</a>'
-        format = MarkdownFormat()
-        out = format.to_html(instr)
+        out = h.render_markdown(instr)
         assert exp in out, '\nGot: %s\nWanted: %s' % (out, exp)
 
     def test_auto_link_after_whitespace(self):
         instr = 'go to http://somelink.com'
         exp = 'go to <a href="http://somelink.com" target="_blank" rel="nofollow">http://somelink.com</a>'
-        format = MarkdownFormat()
-        out = format.to_html(instr)
+        out = h.render_markdown(instr)
         assert exp in out, '\nGot: %s\nWanted: %s' % (out, exp)
 
     def test_malformed_link_1(self):
         instr = u'<a href=\u201dsomelink\u201d>somelink</a>'
-        exp = '<a href="TAG MALFORMED" target="_blank" rel="nofollow">somelink</a>'
-        format = MarkdownFormat()
-        out = format.to_html(instr)
-        assert exp in out, '\nGot: %s\nWanted: %s' % (out, exp)
-
-    def test_malformed_link_2(self):
-        instr = u'<a href="http://url.com> url >'
-        exp = '<a href="TAG MALFORMED" target="_blank" rel="nofollow"> url &gt;'
-        format = MarkdownFormat()
-        out = format.to_html(instr)
-        assert exp in out, '\nGot: %s\nWanted: %s' % (out, exp)
-
-    def test_malformed_link_3(self):
-        instr = u'<a href="http://url.com"> url'
-        exp = '<a href="http://url.com" target="_blank" rel="nofollow"> url'
-        # NB when this is put into Genshi, it will close the tag for you.
-        format = MarkdownFormat()
-        out = format.to_html(instr)
+        exp = '<p>somelink\n</p>'
+        out = h.render_markdown(instr)
         assert exp in out, '\nGot: %s\nWanted: %s' % (out, exp)
 
     def test_multiline_links(self):
@@ -170,9 +136,9 @@ def test_multiline_links(self):
   [google]: http://google.com/        "Google"
   [yahoo]:  http://search.yahoo.com/  "Yahoo Search"
   [msn]:    http://search.msn.com/    "MSN Search"'''
-        exp = '''I get 10 times more traffic from <a href="http://google.com/" title="Google">Google</a> than from
-   <a href="http://search.yahoo.com/" title="Yahoo Search">Yahoo</a> or <a href="http://search.msn.com/" title="MSN Search">MSN</a>.'''
+        exp = '''<p>I get 10 times more traffic from <a href="http://google.com/" title="Google">Google</a> than from
+   <a href="http://search.yahoo.com/" title="Yahoo Search">Yahoo</a> or <a href="http://search.msn.com/" title="MSN Search">MSN</a>.
+</p>'''
         # NB when this is put into Genshi, it will close the tag for you.
-        format = MarkdownFormat()
-        out = format.to_html(instr)
+        out = h.render_markdown(instr)
         assert exp in out, '\nGot: %s\nWanted: %s' % (out, exp)
diff --git a/ckan/tests/models/test_package.py b/ckan/tests/models/test_package.py
index 7515b73f7df..4681a01d2a3 100644
--- a/ckan/tests/models/test_package.py
+++ b/ckan/tests/models/test_package.py
@@ -109,7 +109,7 @@ def test_as_dict(self):
         assert out['metadata_modified'] == pkg.metadata_modified.isoformat()
         assert out['metadata_created'] == pkg.metadata_created.isoformat()
         assert_equal(out['notes'], pkg.notes)
-        assert_equal(out['notes_rendered'], '<p>A <b>great</b> package [HTML_REMOVED] like <a href="/package/pollution_stats">package:pollution_stats</a>\n</p>')
+        assert_equal(out['notes_rendered'], '<p>A great package  like <a href="/dataset/pollution_stats">package:pollution_stats</a>\n</p>')
 
     def test_metadata_created_and_modified(self):
         # create a new package

From f647c6946edc50aa6b4dc5bab98872ba91879b96 Mon Sep 17 00:00:00 2001
From: tobes <toby.junk@gmail.com>
Date: Tue, 2 Apr 2013 13:52:01 +0100
Subject: [PATCH 4/7] [#708] Get markdown_extract() to use RE_MD_HTML_TAGS

---
 ckan/lib/helpers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ckan/lib/helpers.py b/ckan/lib/helpers.py
index 3e536ccc72c..93f0cb68ac2 100644
--- a/ckan/lib/helpers.py
+++ b/ckan/lib/helpers.py
@@ -636,7 +636,7 @@ def markdown_extract(text, extract_length=190):
     will not be truncated.'''
     if (text is None) or (text.strip() == ''):
         return ''
-    plain = re.sub(r'<.*?>', '', markdown(text))
+    plain = RE_MD_HTML_TAGS.sub('', markdown(text))
     if not extract_length or len(plain) < extract_length:
         return literal(plain)
     return literal(unicode(truncate(plain, length=extract_length, indicator='...', whole_word=True)))

From b5640954c1a3e7844db7f20ab3d223efcb5a48ca Mon Sep 17 00:00:00 2001
From: tobes <toby.junk@gmail.com>
Date: Tue, 2 Apr 2013 15:03:09 +0100
Subject: [PATCH 5/7] [#708] Minor refactor

---
 ckan/lib/helpers.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/ckan/lib/helpers.py b/ckan/lib/helpers.py
index 93f0cb68ac2..ad6ec7fd04e 100644
--- a/ckan/lib/helpers.py
+++ b/ckan/lib/helpers.py
@@ -1401,19 +1401,19 @@ def html_auto_link(data):
     group:... converted to a group link
     http://... converted to a link
     '''
+
+    LINK_FNS = {
+        'tag': tag_link,
+        'group': group_link,
+        'dataset': dataset_link,
+        'package': dataset_link,
+    }
+
     def makelink(matchobj):
         obj = matchobj.group(1)
         name = matchobj.group(2)
         title = '%s:%s' % (obj, name)
-        if obj == 'tag':
-            return tag_link({'name': name.strip('"'),
-                             'title': title})
-        elif obj  == 'group':
-            return group_link({'name': name,
-                               'title': title})
-        elif obj in ['dataset', 'package']:
-            return dataset_link({'name': name,
-                                 'title': title})
+        return LINK_FNS[obj]({'name': name.strip('"'), 'title': title})
 
     def link(matchobj):
         return '<a href="%s" target="_blank" rel="nofollow">%s</a>' \

From 36c6143b601c6b2cdec74ac17ca9d895ce6664a8 Mon Sep 17 00:00:00 2001
From: Dominik Moritz <domoritz@gmail.com>
Date: Tue, 16 Apr 2013 22:38:55 +0200
Subject: [PATCH 6/7] [#708] Clean up code, fix comment, remove obvious comment

---
 ckan/lib/helpers.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/ckan/lib/helpers.py b/ckan/lib/helpers.py
index ad6ec7fd04e..6fdbb2932ca 100644
--- a/ckan/lib/helpers.py
+++ b/ckan/lib/helpers.py
@@ -1384,8 +1384,9 @@ def get_request_param(parameter_name, default=None):
     flags=re.UNICODE
 )
 
-# find external links eg http://foo.com, https:/bar.org/foobar.html
-RE_MD_EXTERNAL_LINK = re.compile(r'(\bhttps?:\/\/[\w\-\.,@?^=%&;:\/~\\+#]*)',
+# find external links eg http://foo.com, https://bar.org/foobar.html
+RE_MD_EXTERNAL_LINK = re.compile(
+    r'(\bhttps?:\/\/[\w\-\.,@?^=%&;:\/~\\+#]*)',
     flags=re.UNICODE
 )
 
@@ -1393,6 +1394,7 @@ def get_request_param(parameter_name, default=None):
 # in markdown
 RE_MD_HTML_TAGS = re.compile('<[^><]*>')
 
+
 def html_auto_link(data):
     '''Linkifies HTML
 
@@ -1431,7 +1433,6 @@ def process(matchobj):
 
 def render_markdown(data, auto_link=True):
     ''' returns the data as rendered markdown '''
-    # cope with data == None
     if not data:
         return ''
     data = RE_MD_HTML_TAGS.sub('', data.strip())

From ab31c4e0cdf84bc03d19787b2a727850f0ba9aea Mon Sep 17 00:00:00 2001
From: Dominik Moritz <domoritz@gmail.com>
Date: Tue, 16 Apr 2013 22:39:51 +0200
Subject: [PATCH 7/7] [#708] Add more tests to link detection tests

---
 ckan/tests/misc/test_format_text.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/ckan/tests/misc/test_format_text.py b/ckan/tests/misc/test_format_text.py
index e4e2030e18d..9590e3b01be 100644
--- a/ckan/tests/misc/test_format_text.py
+++ b/ckan/tests/misc/test_format_text.py
@@ -57,14 +57,14 @@ def test_internal_tag_linked_with_quotes_and_space(self):
 
     def test_internal_tag_with_no_opening_quote_only_matches_single_word(self):
         """Asserts that without an opening quote only one word is matched"""
-        instr = 'tag:test tag" foobar' # should match 'tag:test'
+        instr = 'tag:test tag" foobar'  # should match 'tag:test'
         exp = '<a href="/tag/test">tag:test</a> tag" foobar'
         out = h.render_markdown(instr)
         assert exp in out, '\nGot: %s\nWanted: %s' % (out, exp)
 
     def test_internal_tag_with_no_opening_quote_wont_match_the_closing_quote(self):
         """Asserts that 'tag:test" tag' is matched, but to 'tag:test'"""
-        instr = 'tag:test" foobar' # should match 'tag:test'
+        instr = 'tag:test" foobar'  # should match 'tag:test'
         exp = '<a href="/tag/test">tag:test</a>" foobar'
         out = h.render_markdown(instr)
         assert exp in out, '\nGot: %s\nWanted: %s' % (out, exp)
@@ -111,6 +111,16 @@ def test_normal_link(self):
         out = h.render_markdown(instr)
         assert exp in out, '\nGot: %s\nWanted: %s' % (out, exp)
 
+        instr = 'http://somelink.com/#anchor'
+        exp = '<a href="http://somelink.com/#anchor" target="_blank" rel="nofollow">http://somelink.com/#anchor</a>'
+        out = h.render_markdown(instr)
+        assert exp in out, '\nGot: %s\nWanted: %s' % (out, exp)
+
+        instr = 'http://www.somelink.com/#anchor'
+        exp = '<a href="http://www.somelink.com/#anchor" target="_blank" rel="nofollow">http://www.somelink.com/#anchor</a>'
+        out = h.render_markdown(instr)
+        assert exp in out, '\nGot: %s\nWanted: %s' % (out, exp)
+
     def test_auto_link(self):
         instr = 'http://somelink.com'
         exp = '<a href="http://somelink.com" target="_blank" rel="nofollow">http://somelink.com</a>'