-
Notifications
You must be signed in to change notification settings - Fork 42
/
Select.hs
267 lines (241 loc) · 11.8 KB
/
Select.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TupleSections #-}
{-# LANGUAGE BangPatterns #-}
{-# OPTIONS_HADDOCK hide #-}
module Text.HTML.Scalpel.Internal.Select (
SelectContext (..)
, TagSpec
, TagInfo (..)
, select
, tagsToSpec
) where
import Text.HTML.Scalpel.Internal.Select.Types
import Control.Applicative ((<$>), (<|>))
import Data.Maybe (catMaybes, isJust, fromJust, fromMaybe)
import qualified Data.Map.Strict as Map
import qualified Data.Text as T
import qualified Data.Tree as Tree
import qualified Data.Vector as Vector
import qualified Text.HTML.TagSoup as TagSoup
import qualified Text.StringLike as TagSoup
type Index = Int
type Name = Maybe T.Text
type CloseOffset = Maybe Index
-- | The span of a tag in terms of the index of the opening tag and the index of
-- the closing tag. If there is no closing tag the closing tag is equal to the
-- opening tag.
data Span = Span !Int !Int
-- | A representation of the hierarchal structure of a document. Nodes of the
-- tree are spans which mark the start and end of a tag. The tree is organized
-- such that tags that appear earlier in the parsed string appear earlier in the
-- list of nodes, and that a given node is completely within the span of its
-- parent.
type TagForest = Tree.Forest Span
-- | A tag and associated precomputed meta data that is accessed in tight inner
-- loops during scraping.
data TagInfo str = TagInfo {
infoTag :: !(TagSoup.Tag str)
, infoName :: !Name
, infoIndex :: !Index
, infoOffset :: !CloseOffset
}
-- | A vector of tags and precomputed meta data. A vector is used because it
-- allows for constant time slicing and sharing memory between the slices.
type TagVector str = Vector.Vector (TagInfo str)
-- | Ephemeral meta-data that each TagSpec is tagged with. This type contains
-- information that is not intrinsic in the sub-tree that corresponds to a given
-- TagSpec.
data SelectContext = SelectContext {
-- | `select` generates a list of `TagSpec`s that match a
-- selector. This indicates the index in the result list
-- that this TagSpec corresponds to.
ctxPosition :: !Index
}
-- | A structured representation of the parsed tags that provides fast element
-- look up via a vector of tags, and fast traversal via a rose tree of tags.
type TagSpec str = (TagVector str, TagForest, SelectContext)
-- | The 'select' function takes a 'Selectable' value and a list of
-- 'TagSoup.Tag's and returns a list of every subsequence of the given list of
-- Tags that matches the given selector.
select :: (Ord str, TagSoup.StringLike str)
=> Selector -> TagSpec str -> [TagSpec str]
select s tagSpec = newSpecs
where
(MkSelector nodes) = s
newSpecs = zipWith applyPosition [0..] (selectNodes nodes tagSpec [])
applyPosition p (tags, f, ctx) = (tags, f, SelectContext p)
-- | Creates a TagSpec from a list of tags parsed by TagSoup.
tagsToSpec :: forall str. (Ord str, TagSoup.StringLike str)
=> [TagSoup.Tag str] -> TagSpec str
tagsToSpec tags = (vector, tree, ctx)
where
vector = tagsToVector tags
tree = vectorToTree vector
ctx = SelectContext 0
-- | Annotate each tag with the offset to the corresponding closing tag. This
-- annotating is done in O(n * log(n)).
--
-- The algorithm works on a list of tags annotated with their index. It
-- maintains a map of unclosed open tags keyed by tag name.
--
-- (1) When an open tag is encountered it is pushed onto the list keyed by
-- its name.
--
-- (2) When a closing tag is encountered the corresponding opening tag is
-- popped, the offset between the two are computed, the opening tag is
-- annotated with the offset between the two, and both are added to the
-- result set.
--
-- (3) When any other tag is encountered it is added to the result set
-- immediately.
--
-- (4) After all tags are either in the result set or the state, all
-- unclosed tags from the state are added to the result set without a
-- closing offset.
--
-- (5) The result set is then sorted by their indices.
tagsToVector :: forall str. (Ord str, TagSoup.StringLike str)
=> [TagSoup.Tag str] -> TagVector str
tagsToVector tags = let indexed = zip tags [0..]
total = length indexed
unsorted = go indexed Map.empty
emptyVec = Vector.replicate total undefined
in emptyVec Vector.// unsorted
where
go :: [(TagSoup.Tag str, Index)]
-> Map.Map T.Text [(TagSoup.Tag str, Index)]
-> [(Index, TagInfo str)]
go [] state = map (\(t, i) -> (i, TagInfo t (maybeName t) i Nothing))
$ concat
$ Map.elems state
where
maybeName t | TagSoup.isTagOpen t = Just $ getTagName t
| TagSoup.isTagClose t = Just $ getTagName t
| otherwise = Nothing
go (x@(tag, index) : xs) state
| TagSoup.isTagClose tag =
let maybeOpen = head <$> Map.lookup tagName state
state' = Map.alter popTag tagName state
info = TagInfo tag (Just tagName) index Nothing
res = catMaybes [
Just (index, info)
, calcOffset <$> maybeOpen
]
in res ++ go xs state'
| TagSoup.isTagOpen tag = go xs (Map.alter appendTag tagName state)
| otherwise =
let info = TagInfo tag Nothing index Nothing
in (index, info) : go xs state
where
tagName = getTagName tag
appendTag :: Maybe [(TagSoup.Tag str, Index)]
-> Maybe [(TagSoup.Tag str, Index)]
appendTag m = (x :) <$> (m <|> Just [])
calcOffset :: (TagSoup.Tag str, Int) -> (Index, TagInfo str)
calcOffset (t, i) =
let offset = index - i
info = TagInfo t (Just tagName) i (Just offset)
in offset `seq` (i, info)
popTag :: Maybe [a] -> Maybe [a]
popTag (Just (_ : y : xs)) = let s = y : xs in s `seq` Just s
popTag _ = Nothing
getTagName :: TagSoup.StringLike str => TagSoup.Tag str -> T.Text
getTagName (TagSoup.TagOpen name _) = TagSoup.castString name
getTagName (TagSoup.TagClose name) = TagSoup.castString name
getTagName _ = undefined
-- | Builds a forest describing the structure of the tags within a given vector.
-- The nodes of the forest are tag spans which mark the indices within the
-- vector of an open and close pair. The tree is organized such for any node n
-- the parent of node n is the smallest span that completely encapsulates the
-- span of node n.
vectorToTree :: TagSoup.StringLike str => TagVector str -> TagForest
vectorToTree tags = fixup $ forestWithin 0 (Vector.length tags)
where
forestWithin :: Int -> Int -> TagForest
forestWithin !lo !hi
| hi <= lo = []
| not isOpen = forestWithin (lo + 1) hi
| otherwise = Tree.Node (Span lo closeIndex) subForest
: forestWithin (closeIndex + 1) hi
where
info = tags Vector.! lo
isOpen = TagSoup.isTagOpen $ infoTag info
closeIndex = lo + fromMaybe 0 (infoOffset info)
subForest = forestWithin (lo + 1) closeIndex
-- Lifts nodes whose closing tags lay outside their parent tags up to
-- within a parent node that encompasses the node's entire span.
fixup :: TagForest -> TagForest
fixup [] = []
fixup (Tree.Node (Span lo hi) subForest : siblings)
= Tree.Node (Span lo hi) ok : bad
where
(ok, bad) = malformed (fixup siblings) $ fixup subForest
malformed :: TagForest -- Forest to prepend bad trees on.
-> TagForest -- Remaining trees to examine.
-> (TagForest, TagForest)
malformed preBad [] = ([], preBad)
malformed preBad (n@(Tree.Node (Span _ nHi) _) : ns)
| hi < nHi = (ok, n : bad)
| otherwise = (n : ok, bad)
where (ok, bad) = malformed preBad ns
-- | Generates a list of 'TagSpec's that match the given list of 'SelectNode's.
-- This is is done in linear time with respect to the number of tags.
--
-- The algorithm is a simple DFS traversal of the tag forest. While traversing
-- the forest if the current SelectNode is satisfied by the current node in the
-- tree the SelectNode is popped and the current node's sub-forest is traversed
-- with the remaining SelectNodes. If there is only a single SelectNode then any
-- node encountered that satisfies the SelectNode is returned as an answer.
selectNodes :: TagSoup.StringLike str
=> [SelectNode] -> TagSpec str -> [TagSpec str] -> [TagSpec str]
selectNodes [] _ acc = acc
selectNodes [_] (_, [], _) acc = acc
-- Now that there is only a single SelectNode to satisfy, search the remaining
-- forests and generates a TagSpec for each node that satisfies the condition.
selectNodes [n] (tags, f : fs, ctx) acc
| nodeMatches n info = (shrunkSpec :)
$ selectNodes [n] (tags, fs, ctx)
$ selectNodes [n] (tags, Tree.subForest f, ctx) acc
| otherwise = selectNodes [n] (tags, fs, ctx)
$ selectNodes [n] (tags, Tree.subForest f, ctx) acc
where
Span lo hi = Tree.rootLabel f
shrunkSpec = (
Vector.slice lo (hi - lo + 1) tags
, [fmap recenter f]
, ctx
)
recenter (Span nLo nHi) = Span (nLo - lo) (nHi - lo)
info = tags Vector.! lo
-- There are multiple SelectNodes that need to be satisfied. If the current node
-- satisfies the condition, then the current nodes sub-forest is searched for
-- matches of the remaining SelectNodes.
selectNodes (_ : _) (_, [], _) acc = acc
selectNodes (n : ns) (tags, f : fs, ctx) acc
| nodeMatches n info = selectNodes ns (tags, Tree.subForest f, ctx)
$ selectNodes (n : ns) (tags, fs, ctx) acc
| otherwise = selectNodes (n : ns) (tags, Tree.subForest f, ctx)
$ selectNodes (n : ns) (tags, fs, ctx) acc
where
Span lo _ = Tree.rootLabel f
info = tags Vector.! lo
-- | Returns True if a tag satisfies a given SelectNode's condition.
nodeMatches :: TagSoup.StringLike str => SelectNode -> TagInfo str -> Bool
nodeMatches (SelectNode node preds) info = checkTag node preds info
nodeMatches (SelectAny preds) info = checkPreds preds (infoTag info)
-- | Given a tag name and a list of attribute predicates return a function that
-- returns true if a given tag matches the supplied name and predicates.
checkTag :: TagSoup.StringLike str
=> T.Text -> [AttributePredicate] -> TagInfo str -> Bool
checkTag name preds (TagInfo tag tagName _ _)
= TagSoup.isTagOpen tag
&& isJust tagName
&& name == fromJust tagName
&& checkPreds preds tag
-- | Returns True if a tag satisfies a list of attribute predicates.
checkPreds :: TagSoup.StringLike str
=> [AttributePredicate] -> TagSoup.Tag str -> Bool
checkPreds preds tag
= TagSoup.isTagOpen tag
&& all (flip checkPred attrs) preds
where (TagSoup.TagOpen _ attrs) = tag