-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape
executable file
·123 lines (103 loc) · 3.95 KB
/
scrape
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env python2
import sys # for handling arguments (argv)
import os # to help resolve location of sanitize.xsl stylesheet
import libxml2
import libxslt
# Directory holding this script; the bundled sanitize.xsl is looked up
# next to it.
__location__ = os.path.dirname(os.path.realpath(__file__))
g_sanitize = os.path.join(__location__, "sanitize.xsl")

# Input names; both stay None until main() fills them in from sys.argv.
g_stylesheet_name = None
g_html_name = None

# Options passed to saveTo() when the transformed document is written.
# Alternative kept for reference:
# g_dumpopts = (libxml2.XML_SAVE_FORMAT | libxml2.XML_SAVE_NO_XHTML)
g_dumpopts = (
    libxml2.XML_SAVE_FORMAT
    | libxml2.XML_SAVE_XHTML
    | libxml2.XML_SAVE_AS_XML
)
def show_usage():
    """Write the command-line usage text and examples to standard output."""
    usage_lines = (
        "Usage:",
        "scrape [xsl stylesheet] <html file>",
        "examples:",
        "",
        "Extract data from people.html using scrape_people.xsl stylesheet:",
        "scrape scrape_people.xsl people.html",
        "",
        "Pass only the HTML filename (omit the XSL stylesheet) to use the",
        "supplied sanitize.xsl stylesheet to convert a message HTML file",
        "into a valid XHTML document:",
        "scrape people.html",
        "",
        "Load the HTML page from STDIN:",
        "wget -O- www.dustyfeet.com | scrape -",
        "",
        "Remember that you can save the output to a file by using the '>'",
        "operator:",
        # The page is fetched by wget and piped into scrape, exactly as in
        # the STDIN example above; only the redirection is new here.
        "wget -O- www.dustyfeet.com | scrape - > dustyfeet.html",
    )
    sys.stdout.write("\n".join(usage_lines) + "\n")
def xpathNSEval(doc, expr):
    """Evaluate an XPath expression on an XSLT document, namespace-aware.

    A fresh XPath context is created from ``doc``, ``doc`` is set as the
    context node, and the ``xsl`` prefix is bound to the XSLT namespace so
    that ``expr`` can address ``xsl:*`` elements.

    doc  -- document object providing ``xpathNewContext()``.
    expr -- XPath expression string.

    Returns whatever ``xpathEval(expr)`` returns.  Exceptions raised during
    evaluation propagate to the caller; the context is released either way.
    """
    ctxt = doc.xpathNewContext()
    try:
        ctxt.setContextNode(doc)
        ctxt.xpathRegisterNs("xsl", "http://www.w3.org/1999/XSL/Transform")
        return ctxt.xpathEval(expr)
    finally:
        # Free the context even when evaluation fails, so it is not leaked.
        ctxt.xpathFreeContext()
def fix_dumpopts(xsldoc):
    """Honor ``xsl:output/@omit-xml-declaration="yes"`` when printing.

    Looks up the ``omit-xml-declaration`` attribute of the stylesheet's
    ``xsl:output`` element and, when the first match has the value "yes",
    sets ``libxml2.XML_SAVE_NO_DECL`` in the global ``g_dumpopts``.

    xsldoc -- parsed stylesheet document, as accepted by xpathNSEval().

    An XPath failure is reported on standard error and leaves ``g_dumpopts``
    unchanged.
    """
    global g_dumpopts
    # xsl:transform is the XSLT synonym for xsl:stylesheet; accept either
    # as the document element.
    xpath = "(xsl:stylesheet|xsl:transform)/xsl:output/@omit-xml-declaration"
    try:
        output_list = xpathNSEval(xsldoc, xpath)
    except libxml2.xpathError as e:
        # Standard output carries the transformed document, so diagnostics
        # must go to standard error.
        sys.stderr.write("xpathEval failed \"%s\"\n" % e)
        return
    if output_list and output_list[0].get_content() == "yes":
        # OR the flag in (rather than adding it) so that a repeated call
        # cannot corrupt the option bits.
        g_dumpopts |= libxml2.XML_SAVE_NO_DECL
def init_xslproc(f_use_ss, f_transform):
    """Load the stylesheet named by ``g_stylesheet_name`` and hand it on.

    The stylesheet file is read as XML, ``fix_dumpopts()`` is applied to the
    parsed document, and the document is compiled with
    ``libxslt.parseStylesheetDoc()``.  On success ``f_use_ss(style,
    f_transform)`` is called and the stylesheet is freed afterwards.

    f_use_ss    -- callable taking (stylesheet, f_transform).
    f_transform -- passed through to ``f_use_ss`` untouched.

    Exceptions raised by ``f_use_ss`` propagate after the stylesheet has
    been released.
    """
    # No parse relaxing: an XSL stylesheet should be valid XML.
    xsldoc = libxml2.readFile(g_stylesheet_name, None, 0)
    if not xsldoc:
        return

    fix_dumpopts(xsldoc)
    style = libxslt.parseStylesheetDoc(xsldoc)
    if not style:
        # NOTE(review): this assumes parseStylesheetDoc() leaves the document
        # with the caller when it fails; if it released the document itself
        # this would be a double free.  Confirm against the libxslt docs.
        xsldoc.freeDoc()
        return

    try:
        f_use_ss(style, f_transform)
    finally:
        # Release the stylesheet even if the callback raises.
        style.freeStylesheet()
def get_xmldoc(xsl_ss, f_transform):
    """Parse the HTML input named by ``g_html_name`` and transform it.

    The input is read with the HTML parser using recovering, quiet,
    no-network options.  A name of '-' reads from file descriptor 0
    (standard input); anything else is treated as a file name.  When a
    document is obtained, ``f_transform(xsl_ss, xmldoc)`` is called and the
    document is freed afterwards.

    xsl_ss      -- stylesheet object, passed through to ``f_transform``.
    f_transform -- callable taking (stylesheet, document).

    Exceptions raised by ``f_transform`` propagate after the document has
    been released.
    """
    readopts = (libxml2.HTML_PARSE_RECOVER |
                libxml2.HTML_PARSE_NODEFDTD |
                libxml2.HTML_PARSE_NOERROR |
                libxml2.HTML_PARSE_NOWARNING |
                libxml2.HTML_PARSE_NONET)
    if g_html_name == '-':
        # htmlReadFd is the module-level reader taking (fd, URL, encoding,
        # options).  htmlCtxtReadFd additionally needs a parser context and
        # is presumably exposed as a method of the context object rather
        # than on the module -- TODO confirm against the installed binding.
        xmldoc = libxml2.htmlReadFd(0, None, None, readopts)
    else:
        xmldoc = libxml2.htmlReadFile(g_html_name, None, readopts)
    if not xmldoc:
        return

    try:
        f_transform(xsl_ss, xmldoc)
    finally:
        # Release the parsed document even if the transform raises.
        xmldoc.freeDoc()
def transform(xsl_ss, xmldoc):
    """Apply the stylesheet to the document and write the result to stdout.

    The root element of the transformation result is saved to
    ``sys.stdout`` using the global ``g_dumpopts``.  A ``libxml2.treeError``
    raised while fetching or saving the root is reported on standard error
    together with the stylesheet and HTML names in use.

    xsl_ss -- stylesheet object providing ``applyStylesheet()``.
    xmldoc -- parsed input document.

    The result document is freed whether or not saving succeeds.
    """
    result = xsl_ss.applyStylesheet(xmldoc, None)
    if not result:
        return

    try:
        root = result.getRootElement()
        root.saveTo(sys.stdout, None, g_dumpopts)
    except libxml2.treeError as e:
        # Report the resolved names instead of indexing sys.argv: with a
        # single command-line argument sys.argv[2] does not exist.
        sys.stderr.write("\"%s\" failure for file \"%s\", \"%s\"\n"
                         % (e, g_stylesheet_name, g_html_name))
    finally:
        result.freeDoc()
def main():
    """Entry point: pick stylesheet and HTML names from argv and run.

    With one argument, the bundled sanitize stylesheet (``g_sanitize``) is
    used and the argument names the HTML input.  With two arguments, the
    first names the stylesheet and the second the HTML input.  With no
    arguments, or with more than two, the usage text is printed and nothing
    is processed.
    """
    global g_stylesheet_name, g_html_name
    args = sys.argv[1:]
    if len(args) == 1:
        g_stylesheet_name = g_sanitize
        g_html_name = args[0]
    elif len(args) == 2:
        g_stylesheet_name, g_html_name = args
    else:
        # Covers both "no arguments" and "too many arguments"; in the
        # latter case the names would otherwise still be None.
        show_usage()
        return
    init_xslproc(get_xmldoc, transform)


if __name__ == "__main__":
    main()