Permalink
Browse files

Initial xpathfetchpage module (requires lxml and html5lib)

  • Loading branch information...
1 parent 9b0bc63 commit a8b87ffabaa781e4d25e22d12a33b81299285642 Greg committed Feb 9, 2013
Showing with 122 additions and 20 deletions.
  1. +14 −4 README.rst
  2. +1 −0 modules/__init__.py
  3. +90 −0 modules/pipexpathfetchpage.py
  4. +1 −1 pipes.wpr
  5. +1 −1 setup.py
  6. +15 −14 util.py
View
@@ -24,15 +24,25 @@ we could use queues to plumb them together.
Setting up the environment
==========================
-Put the source code in a package directory named `pipe2py`, say, `pipeline/pipe2py`.
+Install the package::
-Make the package directory available to Python, e.g.
+ python setup.py install
- export PYTHONPATH=pipeline
Dependencies
------------
-If using a Python version before 2.6 then simplejson is needed:
+If you need the 'XPath Fetch Page' module, lxml (http://lxml.de/) is
+required, e.g.::
+
+ pip install lxml
+
+If you use the html5 parser option for the 'XPath Fetch Page' module,
+html5lib (http://code.google.com/p/html5lib/) is also required, e.g.::
+
+ pip install html5lib
+
+
+If using a Python version before 2.6 then simplejson is needed::
* http://pypi.python.org/pypi/simplejson
View
@@ -12,6 +12,7 @@
'pipecsv',
'pipefeedautodiscovery',
'pipefetchsitefeed',
+ 'pipexpathfetchpage',
'pipedatebuilder',
'pipeurlbuilder',
@@ -0,0 +1,90 @@
+# pipexpathfetchpage.py
+#
+
+import urllib2
+import re
+from pipe2py import util
+
+
+def pipe_xpathfetchpage(context, _INPUT, conf, **kwargs):
+ """XPath Fetch Page module
+
+ _INPUT -- not used since this does not have inputs.
+
+ conf:
+ URL -- url object contain the URL to download
+ xpath -- xpath to extract
+ html5 -- use html5 parser?
+ useAsString -- emit items as string?
+
+ Description: http://pipes.yahoo.com/pipes/docs?doc=sources#XPathFetchPage
+
+ TODOS:
+ - don't retrieve pages larger than 1.5MB
+ - don't retrieve if page is not indexable.
+ """
+ urls = conf['URL']
+ if not isinstance(urls, list):
+ urls = [urls]
+
+ for item in _INPUT:
+ for item_url in urls:
+ url = util.get_value(item_url, item, **kwargs)
+ if context.verbose:
+ print "XPathFetchPage: Preparing to download:",url
+
+ try:
+ request = urllib2.Request(url)
+ request.add_header('User-Agent','Yahoo Pipes 1.0')
+ request = urllib2.build_opener().open(request)
+ content = unicode(request.read(),
+ request.headers['content-type'].split('charset=')[-1])
+
+ # TODO it seems that Yahoo! converts relative links to absolute
+ # TODO this needs to be done on the content but seems to be a non-trival
+ # TODO task python?
+
+ xpath = util.get_value(conf["xpath"], _INPUT, **kwargs)
+ html5 = False
+ useAsString = False
+ if "html5" in conf:
+ html5 = util.get_value(conf["html5"], _INPUT, **kwargs) == "true"
+ if "useAsString" in conf:
+ useAsString = util.get_value(conf["useAsString"], _INPUT, **kwargs) == "true"
+
+
+ if html5:
+ from lxml.html import html5parser
+ root = html5parser.fromstring(content)
+ else:
+ from lxml import etree
+ root = etree.HTML(content)
+ res_items = root.xpath(xpath)
+
+ if context.verbose:
+ print "XPathFetchPage: found count items:",len(res_items)
+
+ for res_item in res_items:
+ i = util.etree_to_pipes(res_item) #TODO xml_to_dict(res_item)
+ if context.verbose:
+ print "--------------item data --------------------"
+ print i
+ print "--------------EOF item data ----------------"
+ if useAsString:
+ yield { "content" : unicode(i) }
+ else:
+ yield i
+
+ except Exception, e:
+ if context.verbose:
+ print "XPathFetchPage: failed to retrieve from:", url
+
+ print "----------------- XPathFetchPage -----------------"
+ import traceback
+ traceback.print_exc()
+ print "----------------- XPathFetchPage -----------------"
+ raise
+
+ if item == True: #i.e. this is being fed forever, i.e. not in a loop, so we just yield our item once
+ break
+
View
@@ -1,5 +1,5 @@
#!wing
-#!version=3.0
+#!version=4.0
##################################################################
# Wing IDE project file #
##################################################################
View
@@ -8,7 +8,7 @@ def read(fname):
return open(os.path.join(os.path.dirname(__file__), fname)).read()
setup(name='pipe2py',
- version='0.9.4',
+ version='0.9.5',
description=('A project to compile Yahoo! Pipes into Python. '
'The pipe2py package can compile a Yahoo! Pipe into pure Python source code, '
'or it can interpret the pipe on-the-fly. It supports embedded pipes too.'
View
@@ -20,10 +20,10 @@ def pythonise(id):
"""Return a Python-friendly id"""
if id:
id = id.replace("-", "_").replace(":", "_")
-
+
if id[0] in string.digits:
id = "_" + id
-
+
return id.encode('ascii')
def xml_to_dict(element):
@@ -42,7 +42,7 @@ def xml_to_dict(element):
else:
if element.text and element.text.strip():
i['content'] = element.text
-
+
return i
def etree_to_pipes(element):
@@ -55,7 +55,7 @@ def etree_to_pipes(element):
if len(element): # if element has child elements
if element.text and element.text.strip(): # if element has text
i['content'] = element.text
-
+
for child in element:
tag = child.tag.split('}', 1)[-1]
@@ -88,7 +88,7 @@ def etree_to_pipes(element):
else: # element has attributes
if element.text and element.text.strip(): # if element has text
i['content'] = element.text
-
+
return i
def get_subkey(subkey, item):
@@ -112,7 +112,7 @@ def get_subkey(subkey, item):
#unless 'value' or 'utime' is the part in which case we return the parent
#(to cope with y:id.value -> y:id and item.endtime.utime -> item.endtime)
return subtree
-
+
def get_value(_item, _loop_item=None, **kwargs):
"""Return either:
a literal value
@@ -138,11 +138,11 @@ def del_value(item, key):
Note: keys use dot notation and we map onto nested dictionaries, e.g. 'a.content' -> ['a']['content']
"""
del reduce(lambda i,k:i.get(k), [item] + key.split('.')[:-1])[key.split('.')[-1]]
-
-
+
+
def multikeysort(items, columns):
"""Sorts a list of items by the columns
-
+
 (columns preceded with a '-' will sort descending)
"""
comparers = [ ((itemgetter(col[1:].strip()), -1) if col.startswith('-') else (itemgetter(col.strip()), 1)) for col in columns]
@@ -164,14 +164,14 @@ def comparer(left, right):
def get_input(context, conf):
"""Gets a user parameter, either from the console or from an outer submodule/system
-
+
Assumes conf has name, default, prompt and debug
"""
name = conf['name']['value']
default = conf['default']['value']
prompt = conf['prompt']['value']
debug = conf['debug']['value']
-
+
value = None
if context.submodule:
value = context.inputs.get(name, default)
@@ -183,7 +183,7 @@ def get_input(context, conf):
value = default
else:
value = context.inputs.get(name, default)
-
+
return value
def rreplace(s, find, replace, count=None):
@@ -196,5 +196,6 @@ def url_quote(url):
return urllib2.quote(url, safe=URL_SAFE)
except KeyError:
return urllib2.quote(url.encode('utf-8'), safe=URL_SAFE)
-
-
+
def recursive_dict(element):
    """Convert an element tree node into a (tag, value) pair.

    element -- an ElementTree-style node: has .tag and .text, and iterating
               it yields its child elements

    Returns a 2-tuple of the element's tag and either a dict mapping each
    child's tag to its converted value (when that dict is non-empty) or the
    element's own text. Children sharing a tag collapse to the last one,
    because they are keyed by tag in a plain dict.
    """
    children = dict(recursive_dict(child) for child in element)
    if children:
        return element.tag, children
    # No children: fall back to the node's text (may be None)
    return element.tag, element.text

0 comments on commit a8b87ff

Please sign in to comment.