-
Notifications
You must be signed in to change notification settings - Fork 13
/
gosax.go
219 lines (185 loc) · 7.23 KB
/
gosax.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
// gosax: Go wrapper for libxml SAX.
//
// This file contains all the exported functionality of the module.
//
// Eli Bendersky [https://eli.thegreenplace.net]
// This code is in the public domain.
package gosax
import (
"fmt"
"strings"
"sync"
"unsafe"
)
/*
#cgo pkg-config: libxml-2.0
#include <libxml/tree.h>
#include <libxml/parser.h>
#include <libxml/parserInternals.h>
extern void startDocumentCgo(void*);
extern void endDocumentCgo(void*);
extern void startElementCgo(void*, const xmlChar*, const xmlChar**);
extern void startElementNoAttrCgo(void*, const xmlChar*, const xmlChar**);
extern void endElementCgo(void*, const xmlChar*);
extern void charactersCgo(void*, const xmlChar*, int);
extern void charactersRawCgo(void*, const xmlChar*, int);
// Since this structure contains pointers, take extra care to zero it out
// before passing it to Go code.
static inline xmlSAXHandler newHandlerStruct() {
xmlSAXHandler h = {0};
return h;
}
// Wrap a C macro in a function callable from Go.
static inline xmlError* getLastError() {
return xmlGetLastError();
}
*/
import "C"
import "github.com/eliben/gosax/pointer"
// initOnce guards the call to xmlInitParser so that it runs a single time.
var initOnce sync.Once

// init initializes the libxml parser when the package is loaded.
func init() {
	initOnce.Do(func() { C.xmlInitParser() })
}
// SaxCallbacks collects the callback functions to invoke on SAX events. Only
// populate the callbacks you're interested in: ParseFile registers a callback
// with the C layer only when the corresponding field is non-nil, so leaving
// unused fields nil may save processing time.
//
// Some callbacks override others for optimization purposes
// (StartElementNoAttr over StartElement, CharactersRaw over Characters) -
// check the field comments for more information.
type SaxCallbacks struct {
// StartDocument is invoked on the "start document" event.
StartDocument StartDocumentFunc
// EndDocument is invoked on the "end document" event.
EndDocument EndDocumentFunc
// StartElement is invoked whenever the beginning of a new element is found.
// name will be the element name, and attrs a slice of attributes where
// attribute names alternate with values. For example, given the element
// <elem foo="bar" id="100"> the callback will get name="elem" and
// attrs=["foo", "bar", "id", "100"]. attrs is nil for an element that has
// no attributes.
StartElement StartElementFunc
// StartElementNoAttr will override StartElement, if set. When you don't
// care about the attributes of an element, use this one - it will be faster
// because it doesn't have to do attribute unpacking, which is expensive.
StartElementNoAttr StartElementNoAttrFunc
// EndElement is invoked at the end of parsing an element (after the closing
// tag has been processed), with name being the element name.
EndElement EndElementFunc
// Characters is invoked on character data inside elements. contents is the
// data, as a string. Note that this callback may be invoked multiple times
// within a single tag.
Characters CharactersFunc
// CharactersRaw will override Characters, if set. It doesn't translate XML
// data into a Go string, but leaves it as an opaque pair of (ch, chlen),
// which you can pass to UnpackString to convert to a string if needed. This
// could be a useful optimization if you're only occasionally interested in
// the contents of character data.
CharactersRaw CharactersRawFunc
}
// Callback signatures used by the fields of SaxCallbacks.
type (
	// StartDocumentFunc is the signature of the "start document" callback.
	StartDocumentFunc func()

	// EndDocumentFunc is the signature of the "end document" callback.
	EndDocumentFunc func()

	// StartElementFunc is the signature of the element-start callback that
	// receives the element name along with its attributes.
	StartElementFunc func(name string, attrs []string)

	// StartElementNoAttrFunc is the signature of the element-start callback
	// that receives only the element name.
	StartElementNoAttrFunc func(name string)

	// EndElementFunc is the signature of the element-end callback.
	EndElementFunc func(name string)

	// CharactersFunc is the signature of the character-data callback that
	// receives the data as a Go string.
	CharactersFunc func(contents string)

	// CharactersRawFunc is the signature of the character-data callback that
	// receives the data as an opaque (pointer, length) pair.
	CharactersRawFunc func(ch unsafe.Pointer, chlen int)
)
// UnpackString unpacks the opaque ch, chlen pair (that some callbacks in
// this package may create, e.g. CharactersRaw) into a Go string.
//
// A nil ch or a non-positive chlen yields the empty string instead of handing
// an invalid pointer or length to the C layer.
func UnpackString(ch unsafe.Pointer, chlen int) string {
	if ch == nil || chlen <= 0 {
		return ""
	}
	return C.GoStringN((*C.char)(ch), C.int(chlen))
}
// ParseFile parses an XML file with the given name using SAX, with cb as
// the callbacks. The file name is required, rather than a reader, because it
// gets passed directly to the C layer.
//
// Only the non-nil callbacks in cb are registered with the parser. When both
// are set, StartElementNoAttr takes precedence over StartElement, and
// CharactersRaw takes precedence over Characters.
//
// ParseFile returns nil when the parser reports success. Otherwise the
// returned error carries the line number and message of libxml's last
// recorded error, or only the parser's return code when libxml has no error
// details available.
func ParseFile(filename string, cb SaxCallbacks) error {
	cfilename := C.CString(filename)
	defer C.free(unsafe.Pointer(cfilename))

	// newHandlerStruct zeroes out all the pointers; we assign only those that
	// are passed as non-nil in SaxCallbacks.
	handler := C.newHandlerStruct()
	if cb.StartDocument != nil {
		handler.startDocument = C.startDocumentSAXFunc(C.startDocumentCgo)
	}
	if cb.EndDocument != nil {
		handler.endDocument = C.endDocumentSAXFunc(C.endDocumentCgo)
	}

	// StartElementNoAttr overrides StartElement.
	switch {
	case cb.StartElementNoAttr != nil:
		handler.startElement = C.startElementSAXFunc(C.startElementNoAttrCgo)
	case cb.StartElement != nil:
		handler.startElement = C.startElementSAXFunc(C.startElementCgo)
	}

	if cb.EndElement != nil {
		handler.endElement = C.endElementSAXFunc(C.endElementCgo)
	}

	// CharactersRaw overrides Characters.
	switch {
	case cb.CharactersRaw != nil:
		handler.characters = C.charactersSAXFunc(C.charactersRawCgo)
	case cb.Characters != nil:
		handler.characters = C.charactersSAXFunc(C.charactersCgo)
	}

	// Pack the callbacks structure into an opaque unsafe.Pointer which we'll
	// pass to C as user data, and C will pass it back to our Go callbacks.
	userData := pointer.Save(&cb)
	defer pointer.Unref(userData)

	rc := C.xmlSAXUserParseFile(&handler, userData, cfilename)
	if rc == 0 {
		return nil
	}

	xmlErr := C.getLastError()
	if xmlErr == nil {
		// xmlGetLastError is documented to return NULL when no error has been
		// recorded; reading its fields would crash, so report the return code
		// alone.
		return fmt.Errorf("parsing %q: libxml returned code %v without error details", filename, rc)
	}
	msg := strings.TrimSpace(C.GoString(xmlErr.message))
	return fmt.Errorf("line %v: error: %v", xmlErr.line, msg)
}
// goStartDocument is exported to C for the "start document" event. It
// recovers the *SaxCallbacks stored behind user_data and invokes its
// StartDocument callback.
//
//export goStartDocument
func goStartDocument(user_data unsafe.Pointer) {
	pointer.Restore(user_data).(*SaxCallbacks).StartDocument()
}
// goEndDocument is exported to C for the "end document" event. It recovers
// the *SaxCallbacks stored behind user_data and invokes its EndDocument
// callback.
//
//export goEndDocument
func goEndDocument(user_data unsafe.Pointer) {
	pointer.Restore(user_data).(*SaxCallbacks).EndDocument()
}
// goStartElement is exported to C for the element-start event. It converts
// the element name and its attributes to Go values and hands them to the
// StartElement callback of the *SaxCallbacks stored behind user_data.
//
// attrs is an array of C strings and attrlen is the number of entries in it,
// as supplied by the C caller. Turning it into a Go slice means viewing the
// underlying storage as a slice and copying each string individually, which
// is rather expensive; consider using the StartElementNoAttr callback
// instead, when applicable.
//
//export goStartElement
func goStartElement(user_data unsafe.Pointer, name *C.char, attrs **C.char, attrlen C.int) {
	callbacks := pointer.Restore(user_data).(*SaxCallbacks)

	// goattrs stays nil when the element carries no attributes.
	var goattrs []string
	if n := int(attrlen); n > 0 {
		// View the C array as a Go slice capped at n entries; no data is
		// copied until each entry is converted with C.GoString below.
		cattrs := (*[1 << 30]*C.char)(unsafe.Pointer(attrs))[:n:n]
		goattrs = make([]string, n)
		for i := range cattrs {
			goattrs[i] = C.GoString(cattrs[i])
		}
	}

	callbacks.StartElement(C.GoString(name), goattrs)
}
// goStartElementNoAttr is exported to C for the element-start event when
// attributes are not wanted. It passes the element name, converted to a Go
// string, to the StartElementNoAttr callback of the *SaxCallbacks stored
// behind user_data.
//
//export goStartElementNoAttr
func goStartElementNoAttr(user_data unsafe.Pointer, name *C.char) {
	callbacks := pointer.Restore(user_data).(*SaxCallbacks)
	elemName := C.GoString(name)
	callbacks.StartElementNoAttr(elemName)
}
// goEndElement is exported to C for the element-end event. It passes the
// element name, converted to a Go string, to the EndElement callback of the
// *SaxCallbacks stored behind user_data.
//
//export goEndElement
func goEndElement(user_data unsafe.Pointer, name *C.char) {
	callbacks := pointer.Restore(user_data).(*SaxCallbacks)
	elemName := C.GoString(name)
	callbacks.EndElement(elemName)
}
// goCharacters is exported to C for character data. It copies the chlen bytes
// starting at ch into a Go string and passes it to the Characters callback of
// the *SaxCallbacks stored behind user_data.
//
//export goCharacters
func goCharacters(user_data unsafe.Pointer, ch *C.char, chlen C.int) {
	callbacks := pointer.Restore(user_data).(*SaxCallbacks)
	contents := C.GoStringN(ch, chlen)
	callbacks.Characters(contents)
}
// goCharactersRaw is exported to C for character data when no string
// conversion is wanted. It forwards ch and chlen untouched, as an opaque
// (pointer, length) pair, to the CharactersRaw callback of the *SaxCallbacks
// stored behind user_data.
//
//export goCharactersRaw
func goCharactersRaw(user_data unsafe.Pointer, ch *C.char, chlen C.int) {
	callbacks := pointer.Restore(user_data).(*SaxCallbacks)
	rawPtr, rawLen := unsafe.Pointer(ch), int(chlen)
	callbacks.CharactersRaw(rawPtr, rawLen)
}