
Change build process, new feed library

1 parent 20bee3e · commit f02498b731904de97383c7b06b40045fc4a3e3af · @beberlei committed Mar 24, 2013
@@ -12,21 +12,29 @@
</exec>
</target>
- <target name="build">
+ <target name="regenerate-projects-yml">
<exec executable="php" dir=".">
<arg value="bin/build-projects.php" />
</exec>
+ </target>
+
+ <target name="build-docs">
<exec executable="make" dir="pages/">
<arg value="html" />
</exec>
<copy file="favicon.ico" tofile="pages/build/html/favicon.ico" />
+ </target>
+
+ <target name="build-api">
<mkdir dir="pages/build/api" />
<exec executable="php" dir=".">
<arg value="bin/build-apidocs.php" />
<arg value="pages/build/api" />
</exec>
</target>
+ <target name="build" depends="regenerate-projects-yml,build-docs,build-api" />
+
<target name="clean">
<delete dir="pages/build" />
</target>
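
The monolithic build target is split into three independent targets, with build itself reduced to a dependency list. Assuming the usual Ant setup, each step can now be run in isolation (ant build-docs to rebuild only the Sphinx HTML, ant build-api for the API docs, ant regenerate-projects-yml for the projects file), while ant build still runs all three in order.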
@@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+"""
+ sphinxcontrib
+ ~~~~~~~~~~~~~
+
+ This package is a namespace package that contains all extensions
+ distributed in the ``sphinx-contrib`` distribution.
+
+ :copyright: Copyright 2007-2009 by the Sphinx team, see AUTHORS.
+ :license: BSD, see LICENSE for details.
+"""
+
+__import__('pkg_resources').declare_namespace(__name__)
+
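
This is the standard setuptools namespace-package shim: it lets separately installed distributions contribute subpackages under the one shared sphinxcontrib import prefix. A minimal sketch of what that enables, assuming the feed module below ships as sphinxcontrib.feed (the actual file paths are not shown in this extract):

    # hypothetical import, valid once a distribution providing the
    # feed module is installed under the sphinxcontrib namespace
    import sphinxcontrib.feed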
@@ -0,0 +1,145 @@
+from fsdict import FSDict
+import feedgenerator
+from urllib import quote_plus
+import os.path
+import directives
+
+#global
+feed_entries = None
+
+#constant unlikely to occur in a docname and legal as a filename
+MAGIC_SEPARATOR = '---###---'
+
+def parse_date(datestring):
+ try:
+ parser = parse_date.parser
+ except AttributeError:
+ import dateutil.parser
+ parser = dateutil.parser.parser()
+ parse_date.parser = parser
+ return parser.parse(datestring)
+
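parse_date builds a single dateutil parser on first use, caches it as an attribute on the function itself, and reuses it for every later call. A quick sketch of the behaviour (python-dateutil required):

    >>> parse_date('2013-03-24')
    datetime.datetime(2013, 3, 24, 0, 0)
    >>> parse_date('March 24, 2013 10:15')
    datetime.datetime(2013, 3, 24, 10, 15)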
+def setup(app):
+ """
+ see: http://sphinx.pocoo.org/ext/appapi.html
+ this is the primary extension point for Sphinx
+ """
+ from sphinx.application import Sphinx
+ if not isinstance(app, Sphinx): return
+ app.add_config_value('feed_title', '', 'html')
+ app.add_config_value('feed_base_url', '', 'html')
+ app.add_config_value('feed_description', '', 'html')
+ app.add_config_value('feed_filename', 'rss.xml', 'html')
+
+ app.connect('html-page-context', create_feed_item)
+ app.connect('build-finished', emit_feed)
+ app.connect('builder-inited', create_feed_container)
+ app.connect('env-purge-doc', remove_dead_feed_item)
+
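setup() is the standard Sphinx extension entry point: it registers four config values and hooks the feed machinery into the build lifecycle. A hedged sketch of the corresponding conf.py settings (the extension's import path is not shown in this diff, so 'sphinxcontrib.feed' is an assumption):

    # conf.py -- hypothetical values throughout
    extensions = ['sphinxcontrib.feed']
    feed_title = 'Example Blog'               # falls back to `project` if empty
    feed_base_url = 'http://www.example.com'  # no trailing slash; paths are appended with '/'
    feed_description = 'Posts from the example blog'
    feed_filename = 'rss.xml'                 # the default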
+def create_feed_container(app):
+ """
+ create lazy filesystem stash for keeping RSS entry fragments, since we
+ don't want to store the entire site in the environment (in fact, even if
+ we did, it wasn't persisting for some reason.)
+ """
+ global feed_entries
+ rss_fragment_path = os.path.realpath(os.path.join(app.outdir, '..', 'rss_entry_fragments'))
+ feed_entries = FSDict(work_dir=rss_fragment_path)
+ app.builder.env.feed_url = app.config.feed_base_url + '/' + \
+ app.config.feed_filename
+
+def create_feed_item(app, pagename, templatename, ctx, doctree):
+ """
+ Here we have access to nice HTML fragments to use in, say, an RSS feed.
+ We serialize them to disk so that we get them preserved across builds.
+
+ We also inject useful metadata into the context here.
+ """
+ global feed_entries
+ from absolutify_urls import absolutify
+ metadata = app.builder.env.metadata.get(pagename, {})
+
+ if 'date' not in metadata:
+ return #don't index dateless articles
+ try:
+ pub_date = parse_date(metadata['date'])
+ except ValueError, exc:
+ #probably a nonsensical date
+ app.builder.warn('date parse error: ' + str(exc) + ' in ' + pagename)
+ return
+
+ # RSS item attributes, w/defaults:
+ # title, link, description, author_email=None,
+ # author_name=None, author_link=None, pubdate=None, comments=None,
+ # unique_id=None, enclosure=None, categories=(), item_copyright=None,
+ # ttl=None,
+ link = app.config.feed_base_url + '/' + ctx['current_page_name'] + ctx['file_suffix']
+ item = {
+ 'title': ctx.get('title'),
+ 'link': link,
+ 'unique_id': link,
+ 'description': absolutify(ctx.get('body'), link),
+ 'pubdate': pub_date
+ }
+ if 'author' in metadata:
+ item['author'] = metadata['author']
+ feed_entries[nice_name(pagename, pub_date)] = item
+
+ #Now, useful variables to keep in context
+ ctx['rss_link'] = app.builder.env.feed_url
+ ctx['pub_date'] = pub_date
+
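Note that only pages whose reStructuredText docinfo carries a date field (e.g. a ":date: 2013-03-24" line near the top of the source file) are indexed; dateless pages are skipped, and an unparseable date triggers a build warning rather than a feed entry.
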
+def remove_dead_feed_item(app, env, docname):
+ """
+ TODO:
+ purge unwanted crap
+ """
+ global feed_entries
+    # stored keys are quote_plus-encoded in full (see nice_name), so the
+    # separator must be encoded too or endswith() can never match
+    munged_name = quote_plus(MAGIC_SEPARATOR + docname)
+ for name in feed_entries:
+ if name.endswith(munged_name):
+ del(feed_entries[name])
+
+def emit_feed(app, exc):
+ global feed_entries
+
+ title = app.config.feed_title
+ if not title:
+ title = app.config.project
+
+ feed_dict = {
+ 'title': title,
+ 'link': app.config.feed_base_url,
+ 'feed_url': app.config.feed_base_url,
+ 'description': app.config.feed_description
+ }
+ if app.config.language:
+ feed_dict['language'] = app.config.language
+ if app.config.copyright:
+ feed_dict['feed_copyright'] = app.config.copyright
+ feed = feedgenerator.Rss201rev2Feed(**feed_dict)
+ app.builder.env.feed_feed = feed
+ ordered_keys = feed_entries.keys()
+ ordered_keys.sort(reverse=True)
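+    # emit only the ten newest entries; the ISO-dated keys sort chronologically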
+ num = 0
+ for key in ordered_keys:
+ num = num+1
+ if num <= 10:
+ feed.add_item(**feed_entries[key])
+ else:
+ break
+ outfilename = os.path.join(app.builder.outdir,
+ app.config.feed_filename)
+ fp = open(outfilename, 'w')
+ feed.write(fp, 'utf-8')
+ fp.close()
+
+def nice_name(docname, date):
+ """
+ we need convenient filenames which incorporate dates for ease of sorting and
+ guid for uniqueness, plus will work in the FS without inconvenient
+ characters. NB, at the moment, hour of publication is ignored.
+ """
+ return quote_plus(MAGIC_SEPARATOR.join([date.isoformat(), docname]))
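
Because quote_plus percent-encodes character by character, these keys still sort chronologically as plain strings, which is what emit_feed's reverse sort relies on. For example:

    >>> import datetime
    >>> nice_name('blog/my-post', datetime.datetime(2013, 3, 24))
    '2013-03-24T00%3A00%3A00---%23%23%23---blog%2Fmy-post'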
@@ -0,0 +1,96 @@
+# By Gareth Rees
+# http://gareth-rees.livejournal.com/27148.html
+
+import html5lib
+import html5lib.serializer
+import html5lib.treewalkers
+import urlparse
+
+# List of (ELEMENT, ATTRIBUTE) for HTML5 attributes which contain URLs.
+# Based on the list at http://www.feedparser.org/docs/resolving-relative-links.html
+url_attributes = [
+ ('a', 'href'),
+ ('applet', 'codebase'),
+ ('area', 'href'),
+ ('blockquote', 'cite'),
+ ('body', 'background'),
+ ('del', 'cite'),
+ ('form', 'action'),
+ ('frame', 'longdesc'),
+ ('frame', 'src'),
+ ('iframe', 'longdesc'),
+ ('iframe', 'src'),
+ ('head', 'profile'),
+ ('img', 'longdesc'),
+ ('img', 'src'),
+ ('img', 'usemap'),
+ ('input', 'src'),
+ ('input', 'usemap'),
+ ('ins', 'cite'),
+ ('link', 'href'),
+ ('object', 'classid'),
+ ('object', 'codebase'),
+ ('object', 'data'),
+ ('object', 'usemap'),
+ ('q', 'cite'),
+ ('script', 'src')]
+
+def absolutify(src, base_url):
+ """absolutify(SRC, BASE_URL): Resolve relative URLs in SRC.
+SRC is a string containing HTML. All URLs in SRC are resolved relative
+to BASE_URL. Return the body of the result as HTML."""
+
+ # Parse SRC as HTML.
+ tree_builder = html5lib.treebuilders.getTreeBuilder('dom')
+ parser = html5lib.html5parser.HTMLParser(tree = tree_builder)
+ dom = parser.parse(src)
+
+ # Handle <BASE> if any.
+ head = dom.getElementsByTagName('head')[0]
+ for b in head.getElementsByTagName('base'):
+ u = b.getAttribute('href')
+ if u:
+ base_url = urlparse.urljoin(base_url, u)
+ # HTML5 4.2.3 "if there are multiple base elements with href
+ # attributes, all but the first are ignored."
+ break
+
+ # Change all relative URLs to absolute URLs by resolving them
+ # relative to BASE_URL. Note that we need to do this even for URLs
+ # that consist only of a fragment identifier, because Google Reader
+ # changes href=#foo to href=http://site/#foo
+ for tag, attr in url_attributes:
+ for e in dom.getElementsByTagName(tag):
+ u = e.getAttribute(attr)
+ if u:
+ e.setAttribute(attr, urlparse.urljoin(base_url, u))
+
+ # Return the HTML5 serialization of the <BODY> of the result (we don't
+ # want the <HEAD>: this breaks feed readers).
+ body = dom.getElementsByTagName('body')[0]
+ tree_walker = html5lib.treewalkers.getTreeWalker('dom')
+ html_serializer = html5lib.serializer.htmlserializer.HTMLSerializer()
+ return u''.join(html_serializer.serialize(tree_walker(body)))
+
+
+# Alternative option, from http://stackoverflow.com/questions/589833/how-to-find-a-relative-url-and-translate-it-to-an-absolute-url-in-python/589939#589939
+#
+# import re, urlparse
+#
+# find_re = re.compile(r'\bhref\s*=\s*("[^"]*"|\'[^\']*\'|[^"\'<>=\s]+)')
+#
+# def fix_urls(document, base_url):
+# ret = []
+# last_end = 0
+# for match in find_re.finditer(document):
+# url = match.group(1)
+# if url[0] in "\"'":
+# url = url.strip(url[0])
+# parsed = urlparse.urlparse(url)
+# if parsed.scheme == parsed.netloc == '': #relative to domain
+# url = urlparse.urljoin(base_url, url)
+# ret.append(document[last_end:match.start(1)])
+# ret.append('"%s"' % (url,))
+# last_end = match.end(1)
+# ret.append(document[last_end:])
+# return ''.join(ret)
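
A quick usage sketch of absolutify above (html5lib must be installed; the exact attribute quoting in the output depends on html5lib's serializer defaults):

    html = '<p><a href="/about">About</a> <a href="#top">Top</a></p>'
    print absolutify(html, 'http://example.com/blog/post.html')
    # both hrefs come back absolute:
    #   /about -> http://example.com/about
    #   #top   -> http://example.com/blog/post.html#top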