diff --git a/docsrc/content/type-seqt.fsx b/docsrc/content/type-seqt.fsx index f2fe8cbf0..be9934d43 100644 --- a/docsrc/content/type-seqt.fsx +++ b/docsrc/content/type-seqt.fsx @@ -83,6 +83,80 @@ let printPages = printPages |> Async.Start + (** -To make it work with tasks simply add `|> Async.StartAsTask` between `wc.AsyncDownloadString (Uri url)` and `|> SeqT.lift` then run eveything but the `printPages |> Async.Start`. +The samples above and below come from the [original AsyncSeq post](http://tomasp.net/blog/async-sequences.aspx) and can easily be switched to task sequences (taskSeq): simply add `|> Async.StartAsTask` between `wc.AsyncDownloadString (Uri url)` and `|> SeqT.lift`, then run everything but the `printPages |> Async.Start`. *) + +// A simple webcrawler + +#r "nuget: FSharpPlus" +#r "nuget: HtmlAgilityPack" + +open System +open System.Net +open System.Text.RegularExpressions +open HtmlAgilityPack +open FSharp.Control + +open FSharpPlus +open FSharpPlus.Data + +// ---------------------------------------------------------------------------- +// Helper functions for downloading documents, extracting links etc. + +/// Asynchronously download the document at `url` and parse the HTML; yields `Some doc` on success and `None` if any exception is raised +let downloadDocument url = async { + try let wc = new WebClient () + let! html = wc.AsyncDownloadString (Uri url) + let doc = new HtmlDocument () + doc.LoadHtml html + return Some doc + with _ -> return None } + +/// Extract all links from the document that start with "https://", dropping everything from the first '?' onwards; returns an empty list if any exception is raised +let extractLinks (doc:HtmlDocument) = + try + [ for a in doc.DocumentNode.SelectNodes ("//a") do + if a.Attributes.Contains "href" then + let href = a.Attributes.["href"].Value + if href.StartsWith "https://" then + let endl = href.IndexOf '?' + yield if endl > 0 then href.Substring(0, endl) else href ] + with _ -> [] + +/// Extract the