/
xml.clj
185 lines (159 loc) · 7.55 KB
/
xml.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
; adapted from: net.cgrand.xml
; Copyright (c) Christophe Grand, 2009-2013. All rights reserved.
;
; The use and distribution terms for this software are covered by the
; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php)
; which can be found in the file epl-v10.html at the root of this distribution.
; By using this software in any fashion, you are agreeing to be bound by
; the terms of this license.
; You must not remove this notice, or any other, from this software.
(ns tupelo.parse.xml
(:use tupelo.core)
(:require
[clojure.zip :as zip]
[schema.core :as s]
[tupelo.string :as ts]
[tupelo.schema :as tsk])
(:import
[java.io Reader InputStream]
[javax.xml.parsers SAXParserFactory]
[org.xml.sax Attributes]
[org.xml.sax.ext DefaultHandler2]
))
; Struct-map basis for element nodes: `(struct Element tag attrs content)` yields a map
; with the keys :tag, :attrs and :content (in that order).
(defstruct Element :tag :attrs :content)
; The keyword :tag used as a predicate: applied to a node it returns the node's :tag value,
; which is nil (falsey) for nodes that have no :tag entry.
(def ^:private tag? :tag)
(defn- document?
  "Returns true when `x` is a document node, i.e. its :type is :document.
  Document nodes are a parsing implementation detail and should never leak
  outside of the parser."
  [x]
  (-> x :type (= :document)))
(defn- comment?
  "Returns true when `x` is a comment node, i.e. its :type is :comment."
  [x]
  (-> x :type (= :comment)))
(defn- dtd?
  "Returns true when `x` is a DTD node, i.e. its :type is :dtd."
  [x]
  (-> x :type (= :dtd)))
(defn- xml-zip
  "Returns a zipper over xml elements (as from xml/parse), given a root element.
  A node is a branch when it has a :tag or is a document node; its children are
  the seq of its :content, and a rebuilt node gets the new children as :content."
  [root]
  (let [branch?   (fn [node] (or (tag? node) (document? node)))
        children  (fn [node] (seq (:content node)))
        make-node (fn [node kids] (assoc node :content kids))]
    (zip/zipper branch? children make-node root)))
(defn- insert-element
  "Appends `elem` as the last child of the node at `result-zipper` and returns
  the zipper positioned on that newly added child."
  [result-zipper elem]
  (let [with-child (zip/append-child result-zipper elem)]
    (zip/rightmost (zip/down with-child))))
(defn- merge-text-left
  "Adds the text `str-val` under the node at `result-zipper`. When the current
  last child is itself a string, `str-val` is concatenated onto it; otherwise
  `str-val` is appended as a new last child. The returned zipper stays on the
  same (parent) node."
  [result-zipper str-val]
  (let [last-child (some-> result-zipper zip/down zip/rightmost)]
    (if (and last-child (string? (zip/node last-child)))
      ; extend the existing trailing text node, then step back to the parent
      (zip/up (zip/edit last-child str str-val))
      (zip/append-child result-zipper str-val))))
; Builds the SAX event handler: a proxy of DefaultHandler2 whose callbacks update the
; zipper held in `result-atom` (via swap!) as parse events arrive.
(defn- handler [result-atom]
(proxy [DefaultHandler2] []
; Element start: build an Element struct whose tag is the keyword of the qualified name
; and whose attrs is a map of keyword(qName) -> value (nil when there are no attributes),
; then append it and move the zipper onto it.
(startElement [uri local-name q-name ^Attributes atts]
(let [elem (struct Element
(keyword q-name)
(when (pos? (. atts (getLength)))
(reduce #(assoc %1 (keyword (.getQName atts %2)) (.getValue atts (int %2)))
{} (range (.getLength atts)))))]
(swap! result-atom insert-element elem)))
; Element end: move the zipper back up to the parent node.
(endElement [uri local-name q-name]
(swap! result-atom zip/up))
; Text and ignorable whitespace are handled identically: the chars are turned into a
; String and merged into a trailing text child, or appended as a new one.
(characters [ch start length]
(swap! result-atom merge-text-left (String. ^chars ch (int start) (int length))))
(ignorableWhitespace [ch start length]
(swap! result-atom merge-text-left (String. ^chars ch (int start) (int length))))
; Comments are recorded as {:type :comment :data <text>} children of the current node.
(comment [ch start length]
(swap! result-atom zip/append-child {:type :comment :data (String. ^chars ch (int start) (int length))}))
; A DTD declaration is recorded as a {:type :dtd :data [name publicId systemId]} child.
(startDTD [name publicId systemId]
(swap! result-atom zip/append-child {:type :dtd :data [name publicId systemId]}))
(resolveEntity
; 4-arg form: answers with an InputSource that keeps the given ids but reads from an
; empty string (presumably so that external entities/DTDs are never fetched -- confirm).
([name publicId baseURI systemId]
(doto (org.xml.sax.InputSource.)
(.setSystemId systemId)
(.setPublicId publicId)
(.setCharacterStream (java.io.StringReader. ""))))
; 2-arg form: delegates to the superclass; `this` is re-bound with a type hint for proxy-super.
([publicId systemId]
(let [^DefaultHandler2 this this]
(proxy-super resolveEntity publicId systemId))))))
(defn enlive-normalize
  "Normalizes Enlive data recursively: a `nil` :attrs becomes an empty map, a `nil` or empty
  :content becomes an empty vector, and any other :content is coerced to a vector of
  normalized children. Items that are not maps carrying a :tag key are returned unchanged."
  [item]
  ; Enlive data parsed from XML may have raw strings (esp. whitespace) embedded in it, and
  ; when parsing html we may get non-enlive nodes like {:type :comment, :data "..."}.
  (if-not (and (map? item) (contains-key? item :tag))
    item
    (-> item
        (update :attrs #(into {} %)) ; works for nil
        ; mapv yields [] for nil or empty content, otherwise a vector of normalized children
        (update :content #(mapv enlive-normalize %)))))
(defn enlive-remove-whitespace
  "Removes whitespace-only strings from Enlive data :content vectors, recursively.
  Items that are not maps carrying a :tag key are returned unchanged."
  [item]
  ; Enlive data parsed from XML may have raw strings (esp. whitespace) embedded in it, and
  ; when parsing html we may get non-enlive nodes like {:type :comment, :data "..."}.
  (if-not (and (map? item) (contains-key? item :tag))
    item
    (let [blank-text? (fn [arg]
                        (and (string? arg)
                             (ts/whitespace? arg)))
          kids        (:content item)
          kids        (if (or (nil? kids) (empty? kids)) [] kids)
          kids        (drop-if blank-text? kids)
          kids        (mapv enlive-remove-whitespace kids)]
      (glue item {:content kids}))))
(defn ^:private sax-parse-fn
  "Runs a non-validating SAX parse of `xml-input`, sending events to `content-handler`
  (which is also registered as the parser's lexical handler). `xml-input` must be a
  java.io.InputStream, a java.io.Reader, or an org.xml.sax.InputSource; anything else
  throws an ex-info. The external general/parameter entity features are switched off."
  [xml-input content-handler]
  (let [input-source (condp instance? xml-input
                       InputStream (org.xml.sax.InputSource. ^InputStream xml-input)
                       Reader (org.xml.sax.InputSource. ^Reader xml-input)
                       org.xml.sax.InputSource xml-input
                       (throw (ex-info "sax-parse-fn: xml-input must be one of InputStream, Reader, or org.xml.sax.InputSource"
                                {:type  (type xml-input)
                                 :class (class xml-input)})))
        factory      (doto (SAXParserFactory/newInstance)
                       (.setValidating false)
                       (.setFeature "http://xml.org/sax/features/external-general-entities" false)
                       (.setFeature "http://xml.org/sax/features/external-parameter-entities" false))
        parser       (doto (.newSAXParser factory)
                       (.setProperty "http://xml.org/sax/properties/lexical-handler" content-handler))]
    (.parse parser
      ^org.xml.sax.InputSource input-source
      ^org.xml.sax.helpers.DefaultHandler content-handler)))
(s/defn parse-raw-streaming
  "Parses XML data from a java.io.InputStream, java.io.Reader, or org.xml.sax.InputSource,
  returning the root element as Enlive-format data (a map with keys :tag, :attrs, :content).
  Does not include whitespace removal or enlive normalization.

  The 2-arity form accepts `parse-fn`, a function of [xml-input content-handler] that drives
  the SAX parse; the 1-arity form uses `sax-parse-fn`.

  Document-level DTD nodes, comment nodes, and strings are discarded; `only` then requires
  that exactly one node (the root element) remains, and throws otherwise."
  ([xml-input] (parse-raw-streaming xml-input sax-parse-fn))
  ([xml-input parse-fn]
   (let [result-atom     (atom (xml-zip {:type :document :content nil}))
         content-handler (handler result-atom)
         ; Anything that may legally sit beside the root element at document level.
         ; Comments must be dropped too: a comment before or after the root element is
         ; valid XML, and leaving it in would make `only` throw.
         non-element?    (fn [node]
                           (or (string? node)
                               (dtd? node)
                               (comment? node)))]
     (parse-fn xml-input content-handler)
     ; After the parse the zipper is back at the synthetic document node; a zipper loc is
     ; [node path], so `first` yields that node and :content its top-level children.
     (->> @result-atom
          first
          :content
          (drop-if non-element?)
          only))))
(s/defn parse-streaming
  "Parses XML data from `xml-input` (for the default parser: an InputStream, Reader, or
  org.xml.sax.InputSource), returning Enlive-format data that has been normalized and
  stripped of whitespace-only strings. The 2-arity form takes a custom `parse-fn`,
  which is passed through to `parse-raw-streaming`."
  ([xml-input] (parse-streaming xml-input sax-parse-fn))
  ([xml-input parse-fn]
   (-> (parse-raw-streaming xml-input parse-fn)
       enlive-normalize
       enlive-remove-whitespace)))
;---------------------------------------------------------------------------------------------------
(s/defn parse-raw
  "Parses a string of XML data, returning Enlive-format data.
  The string is converted with `ts/string->stream` and handed to `parse-raw-streaming`,
  so no whitespace removal or enlive normalization is performed."
  [xml-str :- s/Str]
  (-> xml-str
      ts/string->stream
      parse-raw-streaming))
(s/defn parse
  "Parses a string of XML data, returning Enlive-format data.
  The string is converted with `ts/string->stream` and handed to `parse-streaming`,
  so the result is normalized and has whitespace-only strings removed."
  [xml-str :- s/Str]
  (-> xml-str
      ts/string->stream
      parse-streaming))
; "Parses and loads the source input-source, which can be a File, InputStream or String
; naming a URI. Returns a seq of trees of xml/element struct-maps, each of which has the keys
; :tag, :attrs, and :content, plus accessor fns tag, attrs, and content. Other parsers
; can be supplied by passing parse-fn, a fn taking a source and a
; ContentHandler and returning a parse-fn"