Permalink
Fetching contributors…
Cannot retrieve contributors at this time
123 lines (101 sloc) 4.54 KB
{-# OPTIONS_HADDOCK hide #-}
module Text.HTML.Scalpel.Internal.Scrape.URL (
URL
, Config (..)
, Decoder
, defaultDecoder
, utf8Decoder
, iso88591Decoder
, scrapeURL
, scrapeURLWithOpts
, scrapeURLWithConfig
) where
import Text.HTML.Scalpel.Core
import Control.Applicative ((<$>))
import Data.Char (toLower)
import Data.Default (def)
import Data.List (isInfixOf)
import Data.Maybe (listToMaybe)
import qualified Data.ByteString as BS
import qualified Data.Default as Default
import qualified Data.Text.Encoding as Text
import qualified Network.Curl as Curl
import qualified Text.HTML.TagSoup as TagSoup
import qualified Text.StringLike as TagSoup
-- | A URL is simply a 'String' (e.g. @"http://example.com"@).
type URL = String

-- | The concrete curl response type used by this module: headers as
-- name\/value 'String' pairs and the body as a strict 'BS.ByteString'.
type CurlResponse = Curl.CurlResponse_ [(String, String)] BS.ByteString

-- | A method that takes a HTTP response as raw bytes and returns the body as a
-- string type.
type Decoder str = CurlResponse -> str
-- | A record type that determines how 'scrapeUrlWithConfig' interacts with the
-- HTTP server and interprets the results.
data Config str = Config
    { curlOpts :: [Curl.CurlOption]
      -- ^ Options handed to curl when performing the HTTP request.
    , decoder  :: Decoder str
      -- ^ Turns the raw HTTP response into the scraper's string type.
    }
-- | By default redirects are followed and the body is decoded with
-- 'defaultDecoder'.
instance TagSoup.StringLike str => Default.Default (Config str) where
    def = Config
        { curlOpts = [Curl.CurlFollowLocation True]
        , decoder  = defaultDecoder
        }
-- | The 'scrapeURL' function downloads the contents of the given URL and
-- executes a 'Scraper' on it.
--
-- 'scrapeURL' makes use of curl to make HTTP requests. The dependency on curl
-- may be too heavyweight for some use cases. In which case users who do not
-- require inbuilt networking support can depend on
-- <https://hackage.haskell.org/package/scalpel-core scalpel-core> for a
-- lightweight subset of this library that does not depend on curl.
scrapeURL :: (Ord str, TagSoup.StringLike str)
          => URL -> Scraper str a -> IO (Maybe a)
scrapeURL url scraper =
    -- Follow redirects, matching the 'Default.def' configuration.
    scrapeURLWithOpts [Curl.CurlFollowLocation True] url scraper
-- | The 'scrapeURLWithOpts' function takes a list of curl options and
-- downloads the contents of the given URL and executes a 'Scraper' on it.
scrapeURLWithOpts :: (Ord str, TagSoup.StringLike str)
                  => [Curl.CurlOption] -> URL -> Scraper str a -> IO (Maybe a)
scrapeURLWithOpts options url scraper =
    -- Keep the default decoder; only the curl options are overridden.
    scrapeURLWithConfig (def { curlOpts = options }) url scraper
-- | The 'scrapeURLWithConfig' function takes a 'Config' record type and
-- downloads the contents of the given URL and executes a 'Scraper' on it.
scrapeURLWithConfig :: (Ord str, TagSoup.StringLike str)
                    => Config str -> URL -> Scraper str a -> IO (Maybe a)
scrapeURLWithConfig config url scraper = do
    -- Nothing on any non-OK curl status; Just the raw response otherwise.
    maybeResponse <- openURIWithOpts url (curlOpts config)
    let maybeTags = TagSoup.parseTags . decoder config <$> maybeResponse
    return (maybeTags >>= scrape scraper)
-- | Perform the HTTP request with the given curl options, yielding the
-- response only when curl reports success.
openURIWithOpts :: URL -> [Curl.CurlOption] -> IO (Maybe CurlResponse)
openURIWithOpts url opts = do
    response <- curlGetResponse_ url opts
    case Curl.respCurlCode response of
        Curl.CurlOK -> return (Just response)
        _           -> return Nothing
-- | Monomorphic wrapper around 'Curl.curlGetResponse_' that pins the header
-- and body types this module works with.
curlGetResponse_ :: URL -> [Curl.CurlOption] -> IO CurlResponse
curlGetResponse_ url opts = Curl.curlGetResponse_ url opts
-- | The default response decoder. This decoder attempts to infer the character
-- set of the HTTP response body from the `Content-Type` header. If this header
-- is not present, then the character set is assumed to be `ISO-8859-1`.
defaultDecoder :: TagSoup.StringLike str => Decoder str
defaultDecoder response = TagSoup.castString
                        $ chosenDecoder body
    where
        body    = Curl.respBody response
        headers = Curl.respHeaders response

        -- Lower-cased value of the first Content-Type header, matching the
        -- header name case-insensitively; Nothing when no such header exists.
        -- (listToMaybe . filter makes the redundant `take 1` unnecessary.)
        contentType = listToMaybe
                    $ map (map toLower . snd)
                    $ filter ((== "content-type") . map toLower . fst)
                      headers

        -- True iff the Content-Type header declares the given charset.
        isType t | Just ct <- contentType = ("charset=" ++ t) `isInfixOf` ct
                 | otherwise              = False

        -- UTF-8 when declared, Latin-1 otherwise (including when the header
        -- is absent). NOTE(review): 'Text.decodeUtf8' throws on malformed
        -- UTF-8 input — consider 'decodeUtf8With lenientDecode' if a server
        -- lying about its charset must not crash the scrape; left as-is to
        -- preserve the existing behavior.
        chosenDecoder | isType "utf-8" = Text.decodeUtf8
                      | otherwise      = Text.decodeLatin1
-- | A decoder that will always decode using `UTF-8`.
utf8Decoder :: TagSoup.StringLike str => Decoder str
utf8Decoder response =
    TagSoup.castString (Text.decodeUtf8 (Curl.respBody response))
-- | A decoder that will always decode using `ISO-8859-1`.
iso88591Decoder :: TagSoup.StringLike str => Decoder str
iso88591Decoder response =
    TagSoup.castString (Text.decodeLatin1 (Curl.respBody response))