// ----------------------------------------------------------------------------
// F# async extensions (Crawler.fsx)
// (c) Tomas Petricek, 2011, Available under Apache 2.0 license.
// ----------------------------------------------------------------------------
// This example demonstrates how to use asynchronous sequences and
// blocking agents to implement a web crawler. The sample also uses
// various AsyncSeq combinators to process the resulting async sequence.
//
// The first version performs a single-threaded random walk (returned
// as an asynchronous sequence) and the second version is concurrent.
#r @"..\build\FSharpx.Core.dll"
#r @"..\packages\HtmlAgilityPack.1.4.2\lib\HtmlAgilityPack.dll"
open System
open System.Net
open System.Text.RegularExpressions
open HtmlAgilityPack
open FSharp.Control
// ----------------------------------------------------------------------------
// Helper functions for downloading documents, extracting links etc.
/// Asynchronously download the document and parse the HTML
let downloadDocument url = async {
  try let wc = new WebClient()
      let! html = wc.AsyncDownloadString(Uri(url))
      let doc = new HtmlDocument()
      doc.LoadHtml(html)
      return Some doc
  with _ -> return None }
/// Extract all links from the document that start with "http://"
let extractLinks (doc:HtmlDocument) =
  try
    [ for a in doc.DocumentNode.SelectNodes("//a") do
        if a.Attributes.Contains("href") then
          let href = a.Attributes.["href"].Value
          if href.StartsWith("http://") then
            let endl = href.IndexOf('?')
            yield if endl > 0 then href.Substring(0, endl) else href ]
  with _ -> []
/// Extract the <title> of the web page
let getTitle (doc:HtmlDocument) =
  let title = doc.DocumentNode.SelectSingleNode("//title")
  if title <> null then title.InnerText.Trim() else "Untitled"
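
// (Sketch) Optional sanity check for the helpers above - download a single
// page, print its title and the first few extracted links. This is only a
// usage example (evaluate it selectively in F# Interactive); the URL is the
// same starting page the crawlers below use.
async {
  let! doc = downloadDocument "http://news.bing.com"
  match doc with
  | Some doc ->
      printfn "Title: %s" (getTitle doc)
      for link in extractLinks doc |> Seq.truncate 5 do
        printfn "  %s" link
  | None -> printfn "Download failed" }
|> Async.RunSynchronously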
// ----------------------------------------------------------------------------
// Basic crawling - single-threaded crawler that recursively follows links

/// Crawl the internet starting from the specified page.
/// From each page, recursively follow every not-yet-visited link (depth-first).
let rec randomCrawl url =
  let visited = new System.Collections.Generic.HashSet<_>()

  // Visits a page and then recursively visits all referenced pages
  let rec loop url = asyncSeq {
    if visited.Add(url) then
      let! doc = downloadDocument url
      match doc with
      | Some doc ->
          // Yield the url and title as the next element
          yield url, getTitle doc
          // For every link, yield all referenced pages too
          for link in extractLinks doc do
            yield! loop link
      | _ -> () }
  loop url
// Use AsyncSeq combinators to print the titles of the first 10
// web sites that are from other domains than bing.com
randomCrawl "http://news.bing.com"
|> AsyncSeq.filter (fun (url, title) -> url.Contains("bing.com") |> not)
|> AsyncSeq.map snd
|> AsyncSeq.take 10
|> AsyncSeq.iter (printfn "%s")
|> Async.Start
// ----------------------------------------------------------------------------
// Better crawler - crawls the web concurrently using the specified number of
// workers, stores results and pending URLs in blocking buffers and returns
// all results as an asynchronous sequence. Once the caller stops taking
// elements from the asynchronous sequence, the blocking buffers eventually
// fill up and crawling stops.
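
// (Sketch) A tiny standalone illustration of the blocking buffer behaviour
// the crawler relies on, using only the operations that appear below: the
// BlockingQueueAgent constructor takes a capacity, AsyncAdd waits once the
// buffer is full and AsyncGet waits while it is empty. The capacity of 2 is
// an arbitrary choice for the demo.
let demoBuffer = BlockingQueueAgent<int>(2)
async {
  do! demoBuffer.AsyncAdd(1)
  do! demoBuffer.AsyncAdd(2)
  // A third AsyncAdd would not complete until somebody calls AsyncGet
  let! first = demoBuffer.AsyncGet()
  printfn "Took %d from the buffer" first }
|> Async.Start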
let concurrentWorkers = 20
let rec concurrentCrawl url = asyncSeq {
  // The number of pending requests is usually very high, so give the request
  // queue a large capacity (when it fills up, the workers stop adding links)
  let requests = BlockingQueueAgent<_>(10000)
  let results = BlockingQueueAgent<_>(40)
  let visited = ConcurrentSetAgent<_>()

  /// Worker repeatedly takes a URL from the queue and processes it
  let worker() = async {
    while true do
      let! url = requests.AsyncGet()
      let! doc = downloadDocument url
      match doc with
      | Some doc ->
          // Add the url and title to the results queue
          do! results.AsyncAdd( (url, getTitle doc) )
          // Enqueue every not-yet-visited link for further crawling
          for link in extractLinks doc do
            let! added = visited.AsyncAdd(link)
            if added then
              do! requests.AsyncAdd(link)
      | _ -> () }

  // Return an asynchronous sequence that sends the initial request to the
  // crawler, starts the workers and then repeatedly takes results from the
  // results queue.
  do! requests.AsyncAdd(url)
  for i in 1 .. concurrentWorkers do
    worker () |> Async.Start
  while true do
    let! res = results.AsyncGet()
    yield res }
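
// (Sketch) The concurrent crawler can be consumed with the same AsyncSeq
// combinators as randomCrawl above; the word-count visualization below is
// the main consumer, so this pipeline is just an alternative usage example
// that prints the first 10 titles found outside of bing.com.
concurrentCrawl "http://news.bing.com"
|> AsyncSeq.filter (fun (url, _) -> url.Contains("bing.com") |> not)
|> AsyncSeq.map snd
|> AsyncSeq.take 10
|> AsyncSeq.iter (printfn "%s")
|> Async.Start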
// ----------------------------------------------------------------------------
// Visualize the results of crawling - show the most common words in titles
// Create user interface with text box for displaying words
open System.Windows.Forms
let frm = new Form(TopMost=true, Visible=true, Width=400, Height=600)
let txt = new TextBox( Multiline = true, Dock = DockStyle.Fill,
                       Font = new System.Drawing.Font("Cambria", 12.0f),
                       ScrollBars = ScrollBars.Vertical )
frm.Controls.Add(txt)
// Creates an asynchronous sequence that produces values of type
// Map<string, int> representing words together with their count
// (new version returned after every processing step)
let tables =
  concurrentCrawl "http://news.bing.com"
  // Split each title into lowercase words
  |> AsyncSeq.map (fun (_, title) ->
      title.Split( [| ' '; '.'; '-'; '|'; ','; ';' |],
                   StringSplitOptions.RemoveEmptyEntries )
      |> Array.map (fun s -> s.ToLower()) )
  // Aggregate the words, returning the intermediate table after every step
  |> AsyncSeq.scan (fun table words ->
      words |> Seq.fold (fun table word ->
        match Map.tryFind word table with
        | Some v -> Map.add word (v + 1) table
        | _ -> Map.add word 1 table) table) Map.empty
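
// (Sketch) The folding step inside the scan above, shown on plain data: the
// word arrays are hypothetical title fragments and the result is the same
// kind of Map<string, int> that 'tables' produces at every step.
let sampleCounts =
  [ [| "news"; "world" |]; [| "news"; "sport" |] ]
  |> Seq.fold (fun table words ->
      words |> Seq.fold (fun table word ->
        match Map.tryFind word table with
        | Some v -> Map.add word (v + 1) table
        | _ -> Map.add word 1 table) table) Map.empty
// sampleCounts = map [("news", 2); ("sport", 1); ("world", 1)]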
// Asynchronous workflow that iterates over the sequence
// and displays the results in the textbox
async {
  let counter = ref 0
  for table in tables |> AsyncSeq.take 200 do
    frm.Text <- sprintf "Processed %d" (counter := !counter + 1; !counter)
    txt.Text <-
      table
      |> Seq.sortBy (fun (KeyValue(k, v)) -> -v)
      |> Seq.map (fun (KeyValue(k, v)) -> sprintf "%s (%d)" k v)
      |> String.concat "\r\n" }
|> Async.StartImmediate