Permalink
Fetching contributors…
Cannot retrieve contributors at this time
268 lines (241 sloc) 11.8 KB
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TupleSections #-}
{-# LANGUAGE BangPatterns #-}
{-# OPTIONS_HADDOCK hide #-}
module Text.HTML.Scalpel.Internal.Select (
SelectContext (..)
, TagSpec
, TagInfo (..)
, select
, tagsToSpec
) where
import Text.HTML.Scalpel.Internal.Select.Types
import Control.Applicative ((<$>), (<|>))
import Data.Maybe (catMaybes, isJust, fromJust, fromMaybe)
import qualified Data.Map.Strict as Map
import qualified Data.Text as T
import qualified Data.Tree as Tree
import qualified Data.Vector as Vector
import qualified Text.HTML.TagSoup as TagSoup
import qualified Text.StringLike as TagSoup
type Index = Int
type Name = Maybe T.Text
type CloseOffset = Maybe Index
-- | The span of a tag in terms of the index of the opening tag and the index of
-- the closing tag. If there is no closing tag the closing tag is equal to the
-- opening tag.
data Span = Span !Int !Int
-- | A representation of the hierarchal structure of a document. Nodes of the
-- tree are spans which mark the start and end of a tag. The tree is organized
-- such that tags that appear earlier in the parsed string appear earlier in the
-- list of nodes, and that a given node is completely within the span of its
-- parent.
type TagForest = Tree.Forest Span
-- | A tag and associated precomputed meta data that is accessed in tight inner
-- loops during scraping.
data TagInfo str = TagInfo {
infoTag :: !(TagSoup.Tag str)
, infoName :: !Name
, infoIndex :: !Index
, infoOffset :: !CloseOffset
}
-- | A vector of tags and precomputed meta data. A vector is used because it
-- allows for constant time slicing and sharing memory between the slices.
type TagVector str = Vector.Vector (TagInfo str)
-- | Ephemeral meta-data that each TagSpec is tagged with. This type contains
-- information that is not intrinsic in the sub-tree that corresponds to a given
-- TagSpec.
data SelectContext = SelectContext {
-- | `select` generates a list of `TagSpec`s that match a
-- selector. This indicates the index in the result list
-- that this TagSpec corresponds to.
ctxPosition :: !Index
}
-- | A structured representation of the parsed tags that provides fast element
-- look up via a vector of tags, and fast traversal via a rose tree of tags.
type TagSpec str = (TagVector str, TagForest, SelectContext)
-- | The 'select' function takes a 'Selectable' value and a list of
-- 'TagSoup.Tag's and returns a list of every subsequence of the given list of
-- Tags that matches the given selector.
select :: (Ord str, TagSoup.StringLike str)
=> Selector -> TagSpec str -> [TagSpec str]
select s tagSpec = newSpecs
where
(MkSelector nodes) = s
newSpecs = zipWith applyPosition [0..] (selectNodes nodes tagSpec [])
applyPosition p (tags, f, ctx) = (tags, f, SelectContext p)
-- | Creates a TagSpec from a list of tags parsed by TagSoup.
tagsToSpec :: forall str. (Ord str, TagSoup.StringLike str)
=> [TagSoup.Tag str] -> TagSpec str
tagsToSpec tags = (vector, tree, ctx)
where
vector = tagsToVector tags
tree = vectorToTree vector
ctx = SelectContext 0
-- | Annotate each tag with the offset to the corresponding closing tag. This
-- annotating is done in O(n * log(n)).
--
-- The algorithm works on a list of tags annotated with their index. It
-- maintains a map of unclosed open tags keyed by tag name.
--
-- (1) When an open tag is encountered it is pushed onto the list keyed by
-- its name.
--
-- (2) When a closing tag is encountered the corresponding opening tag is
-- popped, the offset between the two are computed, the opening tag is
-- annotated with the offset between the two, and both are added to the
-- result set.
--
-- (3) When any other tag is encountered it is added to the result set
-- immediately.
--
-- (4) After all tags are either in the result set or the state, all
-- unclosed tags from the state are added to the result set without a
-- closing offset.
--
-- (5) The result set is then sorted by their indices.
tagsToVector :: forall str. (Ord str, TagSoup.StringLike str)
=> [TagSoup.Tag str] -> TagVector str
tagsToVector tags = let indexed = zip tags [0..]
total = length indexed
unsorted = go indexed Map.empty
emptyVec = Vector.replicate total undefined
in emptyVec Vector.// unsorted
where
go :: [(TagSoup.Tag str, Index)]
-> Map.Map T.Text [(TagSoup.Tag str, Index)]
-> [(Index, TagInfo str)]
go [] state = map (\(t, i) -> (i, TagInfo t (maybeName t) i Nothing))
$ concat
$ Map.elems state
where
maybeName t | TagSoup.isTagOpen t = Just $ getTagName t
| TagSoup.isTagClose t = Just $ getTagName t
| otherwise = Nothing
go (x@(tag, index) : xs) state
| TagSoup.isTagClose tag =
let maybeOpen = head <$> Map.lookup tagName state
state' = Map.alter popTag tagName state
info = TagInfo tag (Just tagName) index Nothing
res = catMaybes [
Just (index, info)
, calcOffset <$> maybeOpen
]
in res ++ go xs state'
| TagSoup.isTagOpen tag = go xs (Map.alter appendTag tagName state)
| otherwise =
let info = TagInfo tag Nothing index Nothing
in (index, info) : go xs state
where
tagName = getTagName tag
appendTag :: Maybe [(TagSoup.Tag str, Index)]
-> Maybe [(TagSoup.Tag str, Index)]
appendTag m = (x :) <$> (m <|> Just [])
calcOffset :: (TagSoup.Tag str, Int) -> (Index, TagInfo str)
calcOffset (t, i) =
let offset = index - i
info = TagInfo t (Just tagName) i (Just offset)
in offset `seq` (i, info)
popTag :: Maybe [a] -> Maybe [a]
popTag (Just (_ : y : xs)) = let s = y : xs in s `seq` Just s
popTag _ = Nothing
getTagName :: TagSoup.StringLike str => TagSoup.Tag str -> T.Text
getTagName (TagSoup.TagOpen name _) = TagSoup.castString name
getTagName (TagSoup.TagClose name) = TagSoup.castString name
getTagName _ = undefined
-- | Builds a forest describing the structure of the tags within a given vector.
-- The nodes of the forest are tag spans which mark the indices within the
-- vector of an open and close pair. The tree is organized such for any node n
-- the parent of node n is the smallest span that completely encapsulates the
-- span of node n.
vectorToTree :: TagSoup.StringLike str => TagVector str -> TagForest
vectorToTree tags = fixup $ forestWithin 0 (Vector.length tags)
where
forestWithin :: Int -> Int -> TagForest
forestWithin !lo !hi
| hi <= lo = []
| not isOpen = forestWithin (lo + 1) hi
| otherwise = Tree.Node (Span lo closeIndex) subForest
: forestWithin (closeIndex + 1) hi
where
info = tags Vector.! lo
isOpen = TagSoup.isTagOpen $ infoTag info
closeIndex = lo + fromMaybe 0 (infoOffset info)
subForest = forestWithin (lo + 1) closeIndex
-- Lifts nodes whose closing tags lay outside their parent tags up to
-- within a parent node that encompasses the node's entire span.
fixup :: TagForest -> TagForest
fixup [] = []
fixup (Tree.Node (Span lo hi) subForest : siblings)
= Tree.Node (Span lo hi) ok : bad
where
(ok, bad) = malformed (fixup siblings) $ fixup subForest
malformed :: TagForest -- Forest to prepend bad trees on.
-> TagForest -- Remaining trees to examine.
-> (TagForest, TagForest)
malformed preBad [] = ([], preBad)
malformed preBad (n@(Tree.Node (Span _ nHi) _) : ns)
| hi < nHi = (ok, n : bad)
| otherwise = (n : ok, bad)
where (ok, bad) = malformed preBad ns
-- | Generates a list of 'TagSpec's that match the given list of 'SelectNode's.
-- This is is done in linear time with respect to the number of tags.
--
-- The algorithm is a simple DFS traversal of the tag forest. While traversing
-- the forest if the current SelectNode is satisfied by the current node in the
-- tree the SelectNode is popped and the current node's sub-forest is traversed
-- with the remaining SelectNodes. If there is only a single SelectNode then any
-- node encountered that satisfies the SelectNode is returned as an answer.
selectNodes :: TagSoup.StringLike str
=> [SelectNode] -> TagSpec str -> [TagSpec str] -> [TagSpec str]
selectNodes [] _ acc = acc
selectNodes [_] (_, [], _) acc = acc
-- Now that there is only a single SelectNode to satisfy, search the remaining
-- forests and generates a TagSpec for each node that satisfies the condition.
selectNodes [n] (tags, f : fs, ctx) acc
| nodeMatches n info = (shrunkSpec :)
$ selectNodes [n] (tags, fs, ctx)
$ selectNodes [n] (tags, Tree.subForest f, ctx) acc
| otherwise = selectNodes [n] (tags, Tree.subForest f, ctx)
$ selectNodes [n] (tags, fs, ctx) acc
where
Span lo hi = Tree.rootLabel f
shrunkSpec = (
Vector.slice lo (hi - lo + 1) tags
, [fmap recenter f]
, ctx
)
recenter (Span nLo nHi) = Span (nLo - lo) (nHi - lo)
info = tags Vector.! lo
-- There are multiple SelectNodes that need to be satisfied. If the current node
-- satisfies the condition, then the current nodes sub-forest is searched for
-- matches of the remaining SelectNodes.
selectNodes (_ : _) (_, [], _) acc = acc
selectNodes (n : ns) (tags, f : fs, ctx) acc
| nodeMatches n info = selectNodes ns (tags, Tree.subForest f, ctx)
$ selectNodes (n : ns) (tags, fs, ctx) acc
| otherwise = selectNodes (n : ns) (tags, Tree.subForest f, ctx)
$ selectNodes (n : ns) (tags, fs, ctx) acc
where
Span lo _ = Tree.rootLabel f
info = tags Vector.! lo
-- | Returns True if a tag satisfies a given SelectNode's condition.
nodeMatches :: TagSoup.StringLike str => SelectNode -> TagInfo str -> Bool
nodeMatches (SelectNode node preds) info = checkTag node preds info
nodeMatches (SelectAny preds) info = checkPreds preds (infoTag info)
-- | Given a tag name and a list of attribute predicates return a function that
-- returns true if a given tag matches the supplied name and predicates.
checkTag :: TagSoup.StringLike str
=> T.Text -> [AttributePredicate] -> TagInfo str -> Bool
checkTag name preds (TagInfo tag tagName _ _)
= TagSoup.isTagOpen tag
&& isJust tagName
&& name == fromJust tagName
&& checkPreds preds tag
-- | Returns True if a tag satisfies a list of attribute predicates.
checkPreds :: TagSoup.StringLike str
=> [AttributePredicate] -> TagSoup.Tag str -> Bool
checkPreds preds tag
= TagSoup.isTagOpen tag
&& all (flip checkPred attrs) preds
where (TagSoup.TagOpen _ attrs) = tag