diff --git a/docsrc/content/type-seqt.fsx b/docsrc/content/type-seqt.fsx
index f2fe8cbf0..be9934d43 100644
--- a/docsrc/content/type-seqt.fsx
+++ b/docsrc/content/type-seqt.fsx
@@ -83,6 +83,80 @@ let printPages =
 printPages |> Async.Start
+
 (**
-To make it work with tasks simply add `|> Async.StartAsTask` between `wc.AsyncDownloadString (Uri url)` and `|> SeqT.lift` then run eveything but the `printPages |> Async.Start`.
+These samples, above and below, come from the [original AsyncSeq post](http://tomasp.net/blog/async-sequences.aspx) and can easily be switched to task sequences (taskSeq): simply add `|> Async.StartAsTask` between `wc.AsyncDownloadString (Uri url)` and `|> SeqT.lift`, then run everything except the final `printPages |> Async.Start`.
 *)
+
+// A simple webcrawler
+
+#r "nuget: FSharpPlus"
+#r "nuget: HtmlAgilityPack"
+
+open System
+open System.Net
+open System.Text.RegularExpressions
+open HtmlAgilityPack
+open FSharp.Control
+
+open FSharpPlus
+open FSharpPlus.Data
+
+// ----------------------------------------------------------------------------
+// Helper functions for downloading documents, extracting links etc.
+
+/// Asynchronously download the document and parse the HTML
+let downloadDocument url = async {
+  try let wc = new WebClient ()
+      let! html = wc.AsyncDownloadString (Uri url)
+      let doc = new HtmlDocument ()
+      doc.LoadHtml html
+      return Some doc
+  with _ -> return None }
+
+/// Extract all links from the document that start with "https://"
+let extractLinks (doc: HtmlDocument) =
+  try
+    [ for a in doc.DocumentNode.SelectNodes ("//a") do
+        if a.Attributes.Contains "href" then
+          let href = a.Attributes.["href"].Value
+          if href.StartsWith "https://" then
+            let endl = href.IndexOf '?'
+            yield if endl > 0 then href.Substring(0, endl) else href ]
+  with _ -> []
+
+/// Extract the title of the web page
+let getTitle (doc: HtmlDocument) =
+  let title = doc.DocumentNode.SelectSingleNode "//title"
+  if title <> null then title.InnerText.Trim () else "Untitled"
+
+// ----------------------------------------------------------------------------
+// Basic crawling - crawl web pages and follow just one link from every page
+
+/// Crawl the internet starting from the specified page
+/// From each page follow the first not-yet-visited page
+let rec randomCrawl url =
+  let visited = new System.Collections.Generic.HashSet<_> ()
+
+  // Visits a page and then recursively visits all referenced pages
+  let rec loop url = monad.plus {
+    if visited.Add(url) then
+      let! doc = downloadDocument url |> SeqT.lift
+      match doc with
+      | Some doc ->
+          // Yield url and title as the next element
+          yield url, getTitle doc
+          // For every link, yield all referenced pages too
+          for link in extractLinks doc do
+            yield! loop link
+      | _ -> () }
+  loop url
+
+// Use SeqT combinators to print the titles of the first 10
+// web sites from domains other than en.wikipedia.org
+randomCrawl "https://en.wikipedia.org/wiki/Main_Page"
+|> SeqT.filter (fun (url, title) -> url.Contains "en.wikipedia.org" |> not)
+|> SeqT.map snd
+|> SeqT.take 10
+|> SeqT.iter (printfn "%s")
+|> Async.Start
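+
+(**
+As a rough sketch of that switch applied to the crawler above (the helpers are the ones defined in this file; `randomCrawlTask` and `crawlerTask` are just illustrative names), the only substantive changes are starting the download as a task before lifting it, and dropping the final `Async.Start`:
+*)
+
+/// Illustrative task-based variant of the crawler: identical to randomCrawl,
+/// except the async download is started as a Task<_> before being lifted
+let randomCrawlTask url =
+  let visited = new System.Collections.Generic.HashSet<_> ()
+
+  let rec loop url = monad.plus {
+    if visited.Add(url) then
+      // The only change: convert the Async<_> into a Task<_> before lifting it
+      let! doc = downloadDocument url |> Async.StartAsTask |> SeqT.lift
+      match doc with
+      | Some doc ->
+          yield url, getTitle doc
+          for link in extractLinks doc do
+            yield! loop link
+      | _ -> () }
+  loop url
+
+// The pipeline now produces a task instead of an async computation,
+// so there is no final Async.Start
+let crawlerTask =
+  randomCrawlTask "https://en.wikipedia.org/wiki/Main_Page"
+  |> SeqT.filter (fun (url, _) -> url.Contains "en.wikipedia.org" |> not)
+  |> SeqT.map snd
+  |> SeqT.take 10
+  |> SeqT.iter (printfn "%s")
\ No newline at end of file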