/
State.hs
82 lines (70 loc) · 2.17 KB
/
State.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
-- | This module describes the state of shpider computations, and provides the 'Shpider' monad: a state transformer over that state, layered on top of 'IO'.
module Network.Shpider.State
( module Control.Monad.State
, ShpiderState (..)
, Page (..)
, Shpider
, emptyPage
, runShpider
, runShpiderSt
)
where
import Control.Monad.State
import Network.Shpider.Curl.Curl
import Data.Maybe
import Text.HTML.TagSoup.Parsec
import Network.Shpider.Forms
import Network.Shpider.Links
-- | The state threaded through every shpider computation: the options for
-- shpider transactions, the current page, and the 'CurlOption's used when
-- calling curl.
data ShpiderState = SS
   { htmlOnlyDownloads :: Bool
     -- ^ Option flag.
     -- NOTE(review): presumably restricts downloads to HTML content — confirm
     -- against the code that reads this field.
   , startPage :: String
     -- ^ NOTE(review): presumably the URL the crawl started from — confirm.
   , dontLeaveDomain :: Bool
     -- ^ Option flag.
     -- NOTE(review): presumably keeps requests on the start page's domain —
     -- confirm against the code that reads this field.
   , curlOpts :: [ CurlOption ]
     -- ^ The 'CurlOption's used when calling curl.
   , currentPage :: Page
     -- ^ The current page.
   , visited :: Maybe [ String ]
     -- ^ NOTE(review): presumably the addresses visited so far, with 'Nothing'
     -- meaning that visits are not being recorded — confirm.
   }
   deriving Show
-- | The monad in which shpider computations run: 'StateT' carrying a
-- 'ShpiderState', layered over 'IO'.
type Shpider = StateT ShpiderState IO
-- | Execute a 'Shpider' computation starting from 'initialSt', with the whole
-- run wrapped in 'withCurlDo'. Yields the computation's result paired with the
-- final 'ShpiderState'.
runShpiderSt :: Shpider a -> IO ( a , ShpiderState )
runShpiderSt shpider =
   withCurlDo ( runStateT shpider initialSt )
-- | Execute a 'Shpider' computation via 'runShpiderSt' and keep only its
-- result; the final 'ShpiderState' is thrown away.
runShpider :: Shpider a -> IO a
runShpider shpider =
   -- The lambda pattern forces the result pair exactly as a @do@-bind would.
   runShpiderSt shpider >>= \ ( result , _finalState ) -> return result
-- | The state every shpider run starts from.
--
-- Both boolean options are off, the start page is the empty string, the
-- current page is 'emptyPage', and 'visited' is 'Nothing'. The only curl
-- options set are 'CurlCookieFile' and 'CurlCookieJar', both pointing at a
-- file called "cookies"; no timeout option is set here.
initialSt :: ShpiderState
initialSt =
   SS { htmlOnlyDownloads = False
      , startPage = ""
      , dontLeaveDomain = False
      , curlOpts = cookieOpts
      , currentPage = emptyPage
      , visited = Nothing
      }
   where
      -- The same file name is given to both cookie options.
      cookieOpts =
         [ CurlCookieFile "cookies"
         , CurlCookieJar "cookies"
         ]
-- | A single page as shpider sees it: its links and forms, its parsed tags,
-- its source text, and its absolute URL.
data Page = Page
   { links :: [ Link ]
     -- ^ The page's 'Link's.
   , forms :: [ Form ]
     -- ^ The page's 'Form's.
   , tags :: [ Tag String ]
     -- ^ The parsed tags of the page.
   , source :: String
     -- ^ The page source.
   , addr :: String
     -- ^ The page's absolute URL.
   }
   deriving Show
-- | A 'Page' carrying no information: no links, no forms, no tags, and empty
-- strings for both the source and the address.
emptyPage :: Page
emptyPage =
   Page { links = []
        , forms = []
        , tags = []
        , source = ""
        , addr = ""
        }