scrape

#!/usr/bin/env python2

import sys     # for handling arguments (argv)
import os      # to help resolve location of sanitize.xsl stylesheet
import libxml2
import libxslt

# Directory holding this script (symlinks resolved); the default
# sanitize.xsl stylesheet is looked up beside it.
__location__ = os.path.dirname(os.path.realpath(__file__))
g_sanitize = os.path.join(__location__, "sanitize.xsl")

# Names of the stylesheet and HTML input; main() fills these in from the
# command line before any processing starts.
g_stylesheet_name = None
g_html_name = None

# Options handed to the save call in transform().  fix_dumpopts() may add
# XML_SAVE_NO_DECL to this value.  (An earlier variant used
# XML_SAVE_FORMAT together with XML_SAVE_NO_XHTML instead.)
g_dumpopts = (
    libxml2.XML_SAVE_FORMAT
    | libxml2.XML_SAVE_XHTML
    | libxml2.XML_SAVE_AS_XML
)

def show_usage():
    """Print command-line usage help for scrape to standard output.

    The help text is written with one print of newline-joined lines.  The
    parenthesised single-argument form is used so the call produces the same
    output under the Python 2 print statement and the Python 3 print
    function.
    """
    usage_lines = (
        "Usage:",
        "scrape [xsl stylesheet] <html file>",
        "examples:",
        "",
        "Extract data from people.html using scrape_people.xsl stylesheet:",
        "scrape scrape_people.xsl people.html",
        "",
        "Pass only the HTML filename (omit the XSL stylesheet) to use the",
        "supplied sanitize.xsl stylesheet to convert a message HTML file",
        "into a valid XHTML document:",
        "scrape people.html",
        "",
        "Load the HTML page from STDIN:",
        "wget -O- www.dustyfeet.com | scrape -",
        "",
        "Remember that you can save the output to a file by using the '>'",
        "operator:",
        # The download step is wget, as in the STDIN example above; scrape
        # itself takes no -O option.
        "wget -O- www.dustyfeet.com | scrape - > dustyfeet.html",
    )
    print("\n".join(usage_lines))


def xpathNSEval(doc, expr):
    """Evaluate an XPath expression on an XSLT document with "xsl" bound.

    A new XPath context is created from ``doc``, ``doc`` is set as its
    context node, and the prefix ``xsl`` is registered for the namespace
    http://www.w3.org/1999/XSL/Transform before ``expr`` is evaluated.

    Returns the value produced by the context's ``xpathEval(expr)``.

    Any exception raised while preparing or evaluating the expression
    propagates to the caller; the context is freed first in every case.
    """
    ctxt = doc.xpathNewContext()
    try:
        ctxt.setContextNode(doc)
        ctxt.xpathRegisterNs("xsl", "http://www.w3.org/1999/XSL/Transform")
        return ctxt.xpathEval(expr)
    finally:
        # Runs on both the success and the failure path, so a failing
        # evaluation no longer leaves the context unfreed.
        ctxt.xpathFreeContext()

def fix_dumpopts(xsldoc):
    """Prepare g_dumpopts for output without the XML declaration.

    Evaluates ``xsl:stylesheet/xsl:output/@omit-xml-declaration`` on
    ``xsldoc``.  When at least one node matches and the content of the first
    match is "yes", ``libxml2.XML_SAVE_NO_DECL`` is OR-ed into the
    module-level ``g_dumpopts``.

    A ``libxml2.xpathError`` raised by the lookup is reported on stderr and
    the options are left unchanged.
    """
    global g_dumpopts
    xpath = "xsl:stylesheet/xsl:output/@omit-xml-declaration"
    try:
        matches = xpathNSEval(xsldoc, xpath)
    except libxml2.xpathError as e:
        # Diagnostics belong on stderr: transform() writes the transformed
        # document to stdout, and a message there would end up inside it.
        sys.stderr.write("xpathEval failed \"%s\"\n" % e)
        return

    if len(matches) > 0 and matches[0].get_content() == "yes":
        # OR rather than add, so a repeated call cannot corrupt the flags.
        g_dumpopts |= libxml2.XML_SAVE_NO_DECL
        
def init_xslproc(f_use_ss, f_transform):
    """Load and compile the stylesheet, then pass it to ``f_use_ss``.

    The stylesheet file named by the module-level ``g_stylesheet_name`` is
    read, ``fix_dumpopts()`` is run on the parsed document, and the document
    is compiled with ``libxslt.parseStylesheetDoc``.

    Parameters:
        f_use_ss: callable invoked as ``f_use_ss(style, f_transform)`` with
            the compiled stylesheet.
        f_transform: passed through unchanged as the second argument of
            ``f_use_ss``.

    The compiled stylesheet is freed after ``f_use_ss`` returns or raises.
    Nothing is called when the stylesheet cannot be compiled.
    """
    # No parse relaxing: an XSL stylesheet should be valid XML.
    xsldoc = libxml2.readFile(g_stylesheet_name, None, 0)
    if not xsldoc:
        return

    fix_dumpopts(xsldoc)
    style = libxslt.parseStylesheetDoc(xsldoc)
    if not style:
        # NOTE(review): this assumes parseStylesheetDoc() leaves the document
        # unfreed when it fails; if it frees it, this is a double free.
        # Confirm against the libxslt documentation.
        xsldoc.freeDoc()
        return

    try:
        f_use_ss(style, f_transform)
    finally:
        # Free the stylesheet even when the callback raises.  The document is
        # not freed separately on this path; presumably the stylesheet owns
        # it once compiled -- confirm against the libxslt documentation.
        style.freeStylesheet()

def get_xmldoc(xsl_ss, f_transform):
    """Parse the HTML input and pass the document to ``f_transform``.

    The input is named by the module-level ``g_html_name``; the value '-'
    selects file descriptor 0 (standard input) instead of a file.

    Parameters:
        xsl_ss: compiled stylesheet, forwarded untouched to ``f_transform``.
        f_transform: callable invoked as ``f_transform(xsl_ss, xmldoc)``.

    The parsed document is freed after ``f_transform`` returns or raises.
    """
    readopts = (libxml2.HTML_PARSE_RECOVER |
                libxml2.HTML_PARSE_NODEFDTD |
                libxml2.HTML_PARSE_NOERROR |
                libxml2.HTML_PARSE_NOWARNING |
                libxml2.HTML_PARSE_NONET)

    if g_html_name == '-':
        # htmlReadFd is the module-level reader taking (fd, URL, encoding,
        # options).  htmlCtxtReadFd needs a parser context as its first
        # argument and is not a module-level function in the bindings.
        xmldoc = libxml2.htmlReadFd(0, None, None, readopts)
    else:
        xmldoc = libxml2.htmlReadFile(g_html_name, None, readopts)

    if not xmldoc:
        return

    try:
        f_transform(xsl_ss, xmldoc)
    finally:
        # Release the parsed input even if the transform raises.
        xmldoc.freeDoc()

def transform(xsl_ss, xmldoc):
    """Apply the stylesheet to ``xmldoc`` and write the result to stdout.

    Parameters:
        xsl_ss: compiled stylesheet; its ``applyStylesheet(xmldoc, None)``
            produces the result document.
        xmldoc: parsed input document.

    The root element of the result is saved to ``sys.stdout`` using the
    module-level ``g_dumpopts``.  A ``libxml2.treeError`` raised while
    fetching or saving the root element is reported on stderr together with
    the stylesheet and HTML names.  The result document is always freed.
    Nothing is written when ``applyStylesheet`` yields no result.
    """
    result = xsl_ss.applyStylesheet(xmldoc, None)
    if not result:
        return

    try:
        root = result.getRootElement()
        root.saveTo(sys.stdout, None, g_dumpopts)
    except libxml2.treeError as e:
        # Use the names resolved by main() instead of raw sys.argv entries:
        # with a single command-line argument sys.argv[2] does not exist and
        # indexing it here raised IndexError from inside the error handler.
        sys.stderr.write("\"%s\" failure for file \"%s\", \"%s\"\n"
                         % (e, g_stylesheet_name, g_html_name))
    finally:
        result.freeDoc()

def main():
    """Interpret the command line and run the transformation.

    Argument forms (after the program name):
        none              -- print the usage text and return.
        <html>            -- use the bundled sanitize.xsl (``g_sanitize``).
        <xsl> <html>      -- use the given stylesheet.
        anything longer   -- report the error on stderr, print the usage
                             text, and exit with status 2.

    The chosen names are stored in the module-level ``g_stylesheet_name``
    and ``g_html_name`` before ``init_xslproc`` is started.
    """
    global g_stylesheet_name, g_html_name

    args = sys.argv[1:]

    if not args:
        show_usage()
        return

    if len(args) == 1:
        g_stylesheet_name = g_sanitize
        g_html_name = args[0]
    elif len(args) == 2:
        g_stylesheet_name, g_html_name = args
    else:
        # Previously this case fell through with both names still None and
        # init_xslproc() was asked to read a stylesheet named None.
        sys.stderr.write("scrape: too many arguments\n")
        show_usage()
        sys.exit(2)

    init_xslproc(get_xmldoc, transform)
    

# Run main() only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()