-
Notifications
You must be signed in to change notification settings - Fork 11
/
html.clj
87 lines (74 loc) · 3.15 KB
/
html.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
(ns stencil.postprocess.html
"Replaces results of html() calls with external part relationships."
(:require [clojure.zip :as zip]
[clojure.data.xml :as xml]
[stencil.functions :refer [call-fn]]
[stencil.postprocess.fragments :as fragments]
[stencil.types :refer [ControlMarker]]
[stencil.util :refer :all]
[stencil.ooxml :as ooxml]))
(set! *warn-on-reflection* true)
(defrecord HtmlChunk [content] ControlMarker)
(defmethod call-fn "html" [_ content] (->HtmlChunk content))
(def legal-tags
"Set of supported HTML tags"
#{:b :em :i :u :s :sup :sub :span :br :strong})
(defn- kw-lowercase [kw] (-> kw name .toLowerCase keyword))
(defn- validate-tags
"Throws ExceptionInfo on invalid HTML tag in tree"
[xml-tree]
(->>
(fn [node]
(if (legal-tags (-> node :tag kw-lowercase))
node
(throw (ex-info (str "Unexpected HTML tag: " (:tag node)) {:tag (:tag node)}))))
(dfs-walk-xml xml-tree map?)))
(defn- parse-html [xml]
(-> (str xml)
(.replaceAll "<br>" "<BR/>")
(.replaceAll "<BR>" "<BR/>")
(xml/parse-str)
(doto (validate-tags))
(try (catch javax.xml.stream.XMLStreamException e
(throw (ex-info "Invalid HTML content!" {:raw-xml xml} e))))))
(defn- walk-children [xml]
(if (map? xml)
(if (#{:br :BR} (:tag xml))
[{:text ::br}]
(for [c (:content xml)
x (walk-children c)]
(update x :path conj (kw-lowercase (:tag xml)))))
[{:text xml}]))
(defn- path->style [p]
(case p
(:b :em :strong) {:tag ooxml/b :attrs {ooxml/val "true"}}
(:i) {:tag ooxml/i :attrs {ooxml/val "true"}}
(:s) {:tag ooxml/strike :attrs {ooxml/val "true"}}
(:u) {:tag ooxml/u :attrs {ooxml/val "single"}}
(:sup) {:tag ooxml/vertAlign :attrs {ooxml/val "superscript"}}
(:sub) {:tag ooxml/vertAlign :attrs {ooxml/val "subscript"}}
nil))
(defn html->ooxml-runs
"Parses html string and returns a seq of ooxml run elements.
Parameter base-style is the default styling for each run."
[html base-style]
(when (seq html)
(let [ch (walk-children (parse-html (str "<span>" html "</span>")))]
(for [parts (partition-by :path ch)
:let [prs (into (set base-style) (keep path->style) (:path (first parts)))]]
{:tag ooxml/r
:content (cons {:tag ooxml/rPr :content (vec prs)}
(for [{:keys [text]} parts]
(if (= ::br text)
{:tag ooxml/br :content []}
{:tag ooxml/t :content [(str text)]})))}))))
(defn- current-run-style [chunk-loc]
(let [r (zip/node (zip/up (zip/up chunk-loc)))]
(find-first #(= ooxml/rPr (:tag %)) (:content r))))
(defn- fix-html-chunk [chunk-loc]
(assert (instance? HtmlChunk (zip/node chunk-loc)))
(let [style (current-run-style chunk-loc)
ooxml-runs (html->ooxml-runs (:content (zip/node chunk-loc)) (:content style))]
(apply fragments/unpack-items chunk-loc ooxml-runs)))
(defn fix-html-chunks [xml-tree]
(dfs-walk-xml-node xml-tree #(instance? HtmlChunk %) fix-html-chunk))