# encoding: utf-8
#
# Provenance (metadata of the git patch this module was taken from):
#   commit:  2950d87a952b641c1050ee3f1f89c025d94b2c92
#   author:  Konstantin Sivakov
#   date:    Thu, 12 Oct 2017 11:28:43 +0200
#   subject: [PATCH] add feeds blueprint
#   file:    ckan/views/feeds.py (new file, index 00000000000..e21b895a2aa)
"""Flask blueprint serving Atom feeds of datasets.

Feeds are available for all datasets, for a tag, a group, an organization
and for a custom search query.
"""

import logging
import urlparse

from flask import Blueprint
from werkzeug.contrib.atom import AtomFeed

from ckan.common import _, config, g, request
import ckan.lib.helpers as h
import ckan.lib.base as base
import ckan.model as model
import ckan.logic as logic
import ckan.plugins as plugins

log = logging.getLogger(__name__)

feeds = Blueprint(u'feeds', __name__, url_prefix=u'/feeds')

# The limit is used in paging arithmetic below, so coerce it to an int in
# case the configured value arrives as a string.
ITEMS_LIMIT = int(config.get(u'ckan.feeds.limit', 20))
BASE_URL = config.get(u'ckan.site_url')
SITE_TITLE = config.get(u'ckan.site_title', u'CKAN')


def _action_context():
    """Return the context dict passed to logic actions for this request."""
    return {
        u'model': model,
        u'session': model.Session,
        u'user': g.user,
        u'auth_user_obj': g.userobj
    }


def _package_search(data_dict):
    """
    Helper method that wraps the package_search action.

     * unless overridden, sorts results by metadata_modified date
     * unless overridden, sets a default item limit

    The defaults are written into ``data_dict`` itself; callers read
    ``data_dict['rows']`` afterwards.

    Returns a ``(count, results)`` tuple taken from the action's result.
    """
    if not data_dict.get(u'sort'):
        data_dict[u'sort'] = u'metadata_modified desc'

    if not data_dict.get(u'rows'):
        data_dict[u'rows'] = ITEMS_LIMIT

    # package_search action modifies the data_dict, so keep our copy intact.
    query = logic.get_action(u'package_search')(_action_context(),
                                                data_dict.copy())

    return query['count'], query['results']


def output_feed(results, feed_title, feed_description, feed_link, feed_url,
                navigation_urls, feed_guid):
    """
    Build the feed for ``results`` and return its response object.

    results
      Iterable of dataset dicts, one feed entry is added for each.

    feed_title, feed_description
      Title and short description of the feed.

    feed_link
      URL of the human readable page the feed is an alternate of.

    feed_url
      URL of the feed itself.

    navigation_urls
      List of ``{'rel': ..., 'href': ...}`` dicts as produced by
      ``_navigation_urls``.

    feed_guid
      The atom id of the feed.

    The feed class is ``_FixedAtomFeed`` unless an ``IFeed`` plugin provides
    one through ``get_feed_class``; when several plugins do, the last one
    wins.
    """
    author_name = config.get(u'ckan.feeds.author_name', u'').strip() or \
        config.get(u'ckan.site_id', u'').strip()

    # TODO: language
    feed_class = None
    for plugin in plugins.PluginImplementations(plugins.IFeed):
        if hasattr(plugin, u'get_feed_class'):
            feed_class = plugin.get_feed_class()

    if not feed_class:
        feed_class = _FixedAtomFeed

    feed = feed_class(
        title=feed_title,
        # Previously the description was accepted but silently dropped.
        subtitle=feed_description,
        url=feed_link,
        language=u'en',
        author={u'name': author_name,
                u'uri': BASE_URL},
        id=feed_guid,
        feed_url=feed_url,
        links=navigation_urls,
        generator=(None, None, None))

    for pkg in results:
        additional_fields = {}

        # When several plugins provide additional fields the last one wins.
        for plugin in plugins.PluginImplementations(plugins.IFeed):
            if hasattr(plugin, u'get_item_additional_fields'):
                additional_fields = plugin.get_item_additional_fields(pkg)

        # NOTE(review): ``description`` and ``unique_id`` look like keyword
        # names from the previous feed generator; confirm that the feed
        # class in use actually consumes them.
        feed.add(
            title=pkg.get(u'title', u''),
            url=h.url_for(
                controller=u'package',
                action=u'read',
                id=pkg['id'],
                _external=True),
            description=pkg.get(u'notes', u''),
            updated=h.date_str_to_datetime(pkg.get(u'metadata_modified')),
            published=h.date_str_to_datetime(pkg.get(u'metadata_created')),
            unique_id=_create_atom_id(u'/dataset/%s' % pkg['id']),
            author=pkg.get(u'author', u''),
            # Atom categories are keyed by ``term``; a dataset without a
            # ``tags`` entry yields no categories instead of a TypeError.
            categories=[{
                u'term': t['name']
            } for t in pkg.get(u'tags') or []],
            **additional_fields)

    return feed.get_response()


def group(id):
    """Feed of the datasets in the group ``id``; 404 if there is none."""
    try:
        group_dict = logic.get_action(u'group_show')(_action_context(),
                                                     {u'id': id})
    except logic.NotFound:
        base.abort(404, _(u'Group not found'))

    return group_or_organization(group_dict, is_org=False)


def organization(id):
    """Feed of the datasets owned by organization ``id``; 404 if none."""
    try:
        group_dict = logic.get_action(u'organization_show')(
            _action_context(), {u'id': id})
    except logic.NotFound:
        base.abort(404, _(u'Organization not found'))

    return group_or_organization(group_dict, is_org=True)


def tag(id):
    """Feed of the datasets carrying the tag ``id``."""
    data_dict, params = _parse_url_params()
    data_dict['fq'] = u'tags: "%s"' % id

    item_count, results = _package_search(data_dict)

    navigation_urls = _navigation_urls(
        params,
        item_count=item_count,
        limit=data_dict['rows'],
        controller=u'feeds',
        action=u'tag',
        id=id)

    feed_url = _feed_url(params, controller=u'feeds', action=u'tag', id=id)

    alternate_url = _alternate_url(params, tags=id)

    title = u'%s - Tag: "%s"' % (SITE_TITLE, id)
    desc = u'Recently created or updated datasets on %s by tag: "%s"' % \
        (SITE_TITLE, id)
    guid = _create_atom_id(u'/feeds/tag/%s.atom' % id)

    return output_feed(
        results,
        feed_title=title,
        feed_description=desc,
        feed_link=alternate_url,
        feed_guid=guid,
        feed_url=feed_url,
        navigation_urls=navigation_urls)


def group_or_organization(obj_dict, is_org):
    """
    Shared implementation of the group and organization feeds.

    obj_dict
      The group/organization dict; ``id``, ``name`` and ``title`` are read.

    is_org
      True to filter datasets on ``owner_org`` (organization feed), False
      to filter on ``groups`` (group feed).
    """
    data_dict, params = _parse_url_params()
    name = obj_dict['name']
    if is_org:
        key = u'owner_org'
        value = obj_dict['id']
        group_type = u'organization'
        label = u'Organization'
        alternate_filter = {u'organization': name}
    else:
        key = u'groups'
        value = name
        group_type = u'group'
        label = u'Group'
        alternate_filter = {u'groups': name}

    data_dict['fq'] = u'{0}: "{1}"'.format(key, value)
    item_count, results = _package_search(data_dict)

    # The blueprint is registered as ``feeds``; the other views in this
    # module use that name as well.
    navigation_urls = _navigation_urls(
        params,
        item_count=item_count,
        limit=data_dict['rows'],
        controller=u'feeds',
        action=group_type,
        id=name)
    feed_url = _feed_url(
        params, controller=u'feeds', action=group_type, id=name)

    # Leading slash keeps the id consistent with the tag and general feeds
    # and keeps the site_url fallback in _create_atom_id well formed.
    guid = _create_atom_id(u'/feeds/%s/%s.atom' % (group_type, name))
    alternate_url = _alternate_url(params, **alternate_filter)
    desc = u'Recently created or updated datasets on %s by %s: "%s"' % (
        SITE_TITLE, group_type, obj_dict['title'])
    title = u'%s - %s: "%s"' % (SITE_TITLE, label, obj_dict['title'])

    return output_feed(
        results,
        feed_title=title,
        feed_description=desc,
        feed_link=alternate_url,
        feed_guid=guid,
        feed_url=feed_url,
        navigation_urls=navigation_urls)


def _parse_url_params():
    """
    Constructs a search-query dict from the URL query parameters.

    Returns the constructed search-query dict, and the valid URL
    query parameters.
    """
    page = h.get_page_number(request.params)

    limit = ITEMS_LIMIT
    data_dict = {u'start': (page - 1) * limit, u'rows': limit}

    # Filter ignored query parameters
    valid_params = [u'page']
    params = dict((p, request.params.get(p)) for p in valid_params
                  if p in request.params)
    return data_dict, params


def general():
    """Feed of all datasets."""
    data_dict, params = _parse_url_params()
    data_dict['q'] = u'*:*'

    item_count, results = _package_search(data_dict)

    navigation_urls = _navigation_urls(
        params,
        item_count=item_count,
        limit=data_dict['rows'],
        controller=u'feeds',
        action=u'general')

    feed_url = _feed_url(params, controller=u'feeds', action=u'general')

    alternate_url = _alternate_url(params)

    guid = _create_atom_id(u'/feeds/dataset.atom')

    desc = u'Recently created or updated datasets on %s' % SITE_TITLE

    return output_feed(
        results,
        feed_title=SITE_TITLE,
        feed_description=desc,
        feed_link=alternate_url,
        feed_guid=guid,
        feed_url=feed_url,
        navigation_urls=navigation_urls)


def custom():
    """
    Custom atom feed

    ``q`` and ``sort`` are passed to the search as they are. Every other
    non-empty query parameter, except ``page`` and those starting with an
    underscore, becomes a ``name:"value"`` filter clause.
    """
    q = request.params.get(u'q', u'')
    search_params = {}
    filter_clauses = []
    for (param, value) in request.params.items():
        if param in (u'q', u'page', u'sort') or param.startswith(u'_'):
            continue
        if not value:
            continue
        search_params[param] = value
        filter_clauses.append(u'%s:"%s"' % (param, value))
    # Separate the clauses; concatenating them directly ran one clause into
    # the next whenever more than one filter was given.
    fq = u' '.join(filter_clauses)

    page = h.get_page_number(request.params)

    limit = ITEMS_LIMIT
    data_dict = {
        u'q': q,
        u'fq': fq,
        u'start': (page - 1) * limit,
        u'rows': limit,
        u'sort': request.params.get(u'sort', None)
    }

    item_count, results = _package_search(data_dict)

    navigation_urls = _navigation_urls(
        request.params,
        item_count=item_count,
        limit=data_dict['rows'],
        controller=u'feeds',
        action=u'custom')

    feed_url = _feed_url(request.params, controller=u'feeds', action=u'custom')

    atom_url = h._url_with_params(u'/feeds/custom.atom', search_params.items())

    alternate_url = _alternate_url(request.params)

    return output_feed(
        results,
        feed_title=u'%s - Custom query' % SITE_TITLE,
        feed_description=u'Recently created or updated'
        ' datasets on %s. Custom query: \'%s\'' % (SITE_TITLE, q),
        feed_link=alternate_url,
        feed_guid=_create_atom_id(atom_url),
        feed_url=feed_url,
        navigation_urls=navigation_urls)


def _alternate_url(params, **kwargs):
    """
    Returns the URL of the dataset search page the feed is an alternate of.

    ``kwargs`` are added to ``params`` as extra query parameters.
    """
    search_params = params.copy()
    search_params.update(kwargs)

    # Can't count on the page sizes being the same on the search results
    # view.  So provide an alternate link to the first page, regardless
    # of the page we're looking at in the feed.
    search_params.pop(u'page', None)
    return _feed_url(search_params, controller=u'package', action=u'search')


def _feed_url(query, controller, action, **kwargs):
    """
    Constructs the url for the given action.  Encoding the query
    parameters.

    ``query`` is a dict-like object of query parameters; ``kwargs`` are
    passed on to ``h.url_for``.
    """
    path = h.url_for(controller=controller, action=action, **kwargs)
    # Append every query parameter. The earlier loop stored a single
    # (key, value) tuple under kwargs['query'], overwriting it on each
    # iteration, so the parameters never formed a query string.
    return h._url_with_params(path, list(query.items()))


def _navigation_urls(query, controller, action, item_count, limit, **kwargs):
    """
    Constructs and returns first, last, prev and next links for paging

    The result is a list of ``{'rel': ..., 'href': ...}`` dicts; ``href`` is
    None for a previous/next link that does not exist.
    """
    urls = []

    page = int(query.get(u'page', 1))

    # first: remove any page parameter
    first_query = query.copy()
    first_query.pop(u'page', None)
    href = _feed_url(first_query, controller, action, **kwargs)
    urls.append({u'rel': u'first', u'href': href})

    # last: add last page parameter
    # Round up, and never go below page 1 so that an empty result set does
    # not produce a link to page 0.
    full_pages, remainder = divmod(item_count, limit)
    last_page = max(1, full_pages + (1 if remainder else 0))
    last_query = query.copy()
    last_query['page'] = last_page
    href = _feed_url(last_query, controller, action, **kwargs)
    urls.append({u'rel': u'last', u'href': href})

    # previous
    if page > 1:
        previous_query = query.copy()
        previous_query['page'] = page - 1
        href = _feed_url(previous_query, controller, action, **kwargs)
    else:
        href = None
    urls.append({u'rel': u'previous', u'href': href})

    # next
    if page < last_page:
        next_query = query.copy()
        next_query['page'] = page + 1
        href = _feed_url(next_query, controller, action, **kwargs)
    else:
        href = None
    urls.append({u'rel': u'next', u'href': href})

    return urls


def _create_atom_id(resource_path, authority_name=None, date_string=None):
    """
    Helper method that creates an atom id for a feed or entry.

    An id must be unique, and must not change over time.  ie - once published,
    it represents an atom feed or entry uniquely, and forever.  See [4]:

        When an Atom Document is relocated, migrated, syndicated,
        republished, exported, or imported, the content of its atom:id
        element MUST NOT change.  Put another way, an atom:id element
        pertains to all instantiations of a particular Atom entry or feed;
        revisions retain the same content in their atom:id elements.  It is
        suggested that the atom:id element be stored along with the
        associated resource.

    resource_path
        The resource path that uniquely identifies the feed or element.  This
        mustn't be something that changes over time for a given entry or feed.
        And does not necessarily need to be resolvable.

        e.g. ``"/group/933f3857-79fd-4beb-a835-c0349e31ce76"`` could represent
        the feed of datasets belonging to the identified group.

    authority_name
        The domain name or email address of the publisher of the feed.  See [3]
        for more details.  If ``None`` then the domain name is taken from the
        config file.  First trying ``ckan.feeds.authority_name``, and failing
        that, it uses ``ckan.site_url``.  Again, this should not change over
        time.

    date_string
        A string representing a date on which the authority_name is owned by
        the publisher of the feed.

        e.g. ``"2012-03-22"``

        Again, this should not change over time.

        If date_string is None, then an attempt is made to read the config
        option ``ckan.feeds.date``.  If that's not available,
        then the date_string is not used in the generation of the atom id.

    Following the methods outlined in [1], [2] and [3], this function produces
    tagURIs like:
    ``"tag:thedatahub.org,2012:/group/933f3857-79fd-4beb-a835-c0349e31ce76"``.

    If not enough information is provided to produce a valid tagURI, then only
    the resource_path is used, e.g.: ::

        "http://thedatahub.org/group/933f3857-79fd-4beb-a835-c0349e31ce76"

    or

        "/group/933f3857-79fd-4beb-a835-c0349e31ce76"

    The latter of which is only used if no site_url is available.  And it
    should be noted will result in an invalid feed.

    [1] http://web.archive.org/web/20110514113830/http://diveintomark.org/\
    archives/2004/05/28/howto-atom-id
    [2] http://www.taguri.org/
    [3] http://tools.ietf.org/html/rfc4151#section-2.1
    [4] http://www.ietf.org/rfc/rfc4287
    """
    if authority_name is None:
        authority_name = config.get(u'ckan.feeds.authority_name', u'').strip()
        if not authority_name:
            site_url = config.get(u'ckan.site_url', u'').strip()
            authority_name = urlparse.urlparse(site_url).netloc

    if not authority_name:
        log.warning(u'No authority_name available for feed generation.  '
                    'Generated feed will be invalid.')

    if date_string is None:
        date_string = config.get(u'ckan.feeds.date', u'')

    if not date_string:
        log.warning(u'No date_string available for feed generation.  '
                    'Please set the "ckan.feeds.date" config value.')

        # Don't generate a tagURI without a date as it wouldn't be valid.
        # This is best we can do, and if the site_url is not set, then
        # this still results in an invalid feed.
        site_url = config.get(u'ckan.site_url', u'')
        return u''.join([site_url, resource_path])

    tagging_entity = u','.join([authority_name, date_string])
    return u':'.join(['tag', tagging_entity, resource_path])


class _FixedAtomFeed(AtomFeed):
    """Default feed class, used when no IFeed plugin supplies one."""

    def add(self, *args, **kwargs):
        """
        Drop the pubdate field from the new item.

        ``generator`` is dropped as well, and ``updated`` and ``published``
        default to None when they are not given.
        """
        kwargs.pop(u'pubdate', None)
        kwargs.pop(u'generator', None)
        defaults = {u'updated': None, u'published': None}
        defaults.update(kwargs)
        super(_FixedAtomFeed, self).add(*args, **defaults)

    def latest_post_date(self):
        """
        Calculates the latest post date from the 'updated' fields,
        rather than the 'pubdate' fields.
        """
        # NOTE(review): this indexes entries like dicts and relies on the
        # parent class defining latest_post_date; confirm both hold for
        # werkzeug's AtomFeed, otherwise this looks like a leftover from
        # the previous feed generator.
        updates = [
            item['updated'] for item in self.entries
            if item['updated'] is not None
        ]
        if not updates:  # delegate to parent for default behaviour
            return super(_FixedAtomFeed, self).latest_post_date()
        return max(updates)


# Routing
# The tag, group and organization views take an ``id`` argument, so their
# rules must capture it from the URL.
feeds.add_url_rule(u'/dataset.atom', methods=[u'GET'], view_func=general)
feeds.add_url_rule(u'/custom.atom', methods=[u'GET'], view_func=custom)
feeds.add_url_rule(u'/tag/<string:id>.atom', methods=[u'GET'], view_func=tag)
feeds.add_url_rule(
    u'/group/<string:id>.atom', methods=[u'GET'], view_func=group)
feeds.add_url_rule(
    u'/organization/<string:id>.atom',
    methods=[u'GET'],
    view_func=organization)