76 changes: 75 additions & 1 deletion docsrc/content/type-seqt.fsx
@@ -83,6 +83,80 @@ let printPages =

printPages |> Async.Start


(**
These samples, above and below, come from the [original AsyncSeq post](http://tomasp.net/blog/async-sequences.aspx) and can easily be switched to task sequences (taskSeq): simply add `|> Async.StartAsTask` between `wc.AsyncDownloadString (Uri url)` and `|> SeqT.lift`, then run everything except the final `printPages |> Async.Start` line.
*)
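(**
Here is a minimal sketch of that transformation on a single download. `printPagesTask` and its URL are hypothetical placeholders, not the exact `printPages` definition from above, and the sketch relies on the `open`s from the crawler sample below:

    let printPagesTask =
        monad.plus {
            let wc = new WebClient ()
            let! html =
                wc.AsyncDownloadString (Uri "https://www.google.com")
                |> Async.StartAsTask  // the only change versus the async version
                |> SeqT.lift
            yield html }
        |> SeqT.iter (printfn "%s")

    // Tasks are hot: the pipeline above is already running, so no `Async.Start` is needed.
*)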

// A simple web crawler

#r "nuget: FSharpPlus"
#r "nuget: HtmlAgilityPack"

open System
open System.Net
open System.Text.RegularExpressions
open HtmlAgilityPack
open FSharp.Control

open FSharpPlus
open FSharpPlus.Data

// ----------------------------------------------------------------------------
// Helper functions for downloading documents, extracting links etc.

/// Asynchronously download the document and parse the HTML
let downloadDocument url = async {
    try
        let wc = new WebClient ()
        let! html = wc.AsyncDownloadString (Uri url)
        let doc = new HtmlDocument ()
        doc.LoadHtml html
        return Some doc
    with _ -> return None }

/// Extract all links from the document that start with "https://"
let extractLinks (doc: HtmlDocument) =
    try
        [ for a in doc.DocumentNode.SelectNodes "//a" do
            if a.Attributes.Contains "href" then
                let href = a.Attributes.["href"].Value
                if href.StartsWith "https://" then
                    // drop the query string, if any
                    let endl = href.IndexOf '?'
                    yield if endl > 0 then href.Substring (0, endl) else href ]
    with _ -> []

/// Extract the <title> of the web page
let getTitle (doc: HtmlDocument) =
    let title = doc.DocumentNode.SelectSingleNode "//title"
    if title <> null then title.InnerText.Trim () else "Untitled"

// ----------------------------------------------------------------------------
// Basic crawling - crawl web pages and follow just one link from every page

/// Crawl the internet starting from the specified page.
/// Lazily follows not-yet-visited referenced pages, depth-first.
let rec randomCrawl url =
    let visited = new System.Collections.Generic.HashSet<_> ()

    // Visit a page, then recursively visit all pages it references
    let rec loop url = monad.plus {
        if visited.Add url then
            let! doc = downloadDocument url |> SeqT.lift
            match doc with
            | Some doc ->
                // Yield url and title as the next element
                yield url, getTitle doc
                // For every link, yield all referenced pages too
                for link in extractLinks doc do
                    yield! loop link
            | _ -> () }
    loop url

// Use SeqT combinators to print the titles of the first 10
// web sites from domains other than en.wikipedia.org
randomCrawl "https://en.wikipedia.org/wiki/Main_Page"
|> SeqT.filter (fun (url, _) -> url.Contains "en.wikipedia.org" |> not)
|> SeqT.map snd
|> SeqT.take 10
|> SeqT.iter (printfn "%s")
|> Async.Start
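
(**
As noted above, the crawler can run over tasks as well. A minimal sketch of the two spots that change, assuming everything else stays as defined above:

    // inside `loop`: lift through a Task instead of an Async
    let! doc =
        downloadDocument url
        |> Async.StartAsTask
        |> SeqT.lift

    // at the end of the pipeline: `SeqT.iter` now returns a hot Task<unit>,
    // so the final `|> Async.Start` is simply dropped
*)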