Added url recursion, link resolution/filtering, and parser speed impr…

…ovements
grapevinebeta · Mar 10, 2011 · 869100c · 869100c
1 parent ce04444
commit 869100c
Show file tree

Hide file tree

Showing 4 changed files with 192 additions and 15 deletions.
diff --git a/HISTORY.md b/HISTORY.md
@@ -1,12 +1,15 @@
 ### v0.2.5
+    * Added the -m (--max) switch for overriding max concurrent requests
+    * Speed improvements when parsing HTML using getHtml and postHtml
     * Moved validator, jQuery and htmlparser to ./vendor as submodules
-    * Npm is no longer required to install node.io
+    * npm is no longer required to install node.io
     * Built-in modules are stored relative to the install dir
+    * Added url recursion and a helper for resolving and filtering links on a page
 
 ### v0.2.4
     * Moved to the new node v0.4 request API with full HTTPS support
     * Added the auto_retry option to improve code readability
-    * Callbacks are now called in the same scope as job methods
+    * Callbacks are now called in the same scope as job methods (no more self = this)
 
 ### v0.2.3
     * Removed daemon and expresso as required dependencies
@@ -35,12 +38,12 @@
     * Status messages are written to stderr
 
 ### v0.2.1-8
-    * Added [built-in modules](https://github.com/chriso/node.io/tree/master/builtin). 
+    * Added [built-in modules](https://github.com/chriso/node.io/tree/master/builtin).
 
 ### v0.2.1-5
     * Web interface now supports CoffeeScript jobs
     * Support for multiple jobs in the same file (see ./examples/resolve.coffee)
-    * Added the -u (--unpack) switch for decrypting jobs made with [packnode](https://github.com/chriso/packnode)	
+    * Added the -u (--unpack) switch for decrypting jobs made with [packnode](https://github.com/chriso/packnode)
 
 ### v0.2.1-3
     * Better support for multiple jobs running in the same process
@@ -67,12 +70,12 @@
 ### v0.1.1-17
     * Fixed incorrect handling of large streams
     * Better support for request timeouts
-    * Bug fixes    
+    * Bug fixes
 
 ### v0.1.1-6
     * Added a -g (--debug) switch
     * Minor bug fixes
-    * Added HTTP code handler - auto support for redirects, etc.    
+    * Added HTTP code handler - auto support for redirects, etc.
 
 ### v0.1.1-1
     * Fixed an inheritance bug when not exclusively using CoffeeScript

diff --git a/lib/node.io/dom.js b/lib/node.io/dom.js
@@ -41,7 +41,7 @@ Job.prototype.$ = function (selector, context) {
  * @api public
  */
 Job.prototype.parseHtml = function (data, callback, response) {
-    var self = this;
+    var self = this, recurse = this.options.recurse;
     headers = response && response.headers ? response.headers : {};
     if (this.options.jsdom) {
         var features = {
@@ -59,25 +59,189 @@ Job.prototype.parseHtml = function (data, callback, response) {
         $ = function (selector, context) {
             return context ? jquery.create(context) : default_$(selector);
         };
+        if (recurse === 1 || recurse === true || recurse instanceof Array) {
+            this.recurseUrls($);
+        }
         callback.apply(this, [null, $, data, headers, response]);
     } else {
-        var self = this, handler, parser, $, htmlparser = require('../../vendor/htmlparser/lib/htmlparser');
-        handler = new htmlparser.DefaultHandler(function (err, dom) {
+        var self = this;
+        this.postParse = function (err, dom) {
             if (err) {
                 callback.call(self, err);
             } else {
                 $ = function (selector, context) {
                     //Allow the user to specify a custom context (thanks to github.com/jimbishopp)
                     return self.$(selector, context || dom);
                 };
+                if (recurse === 1 || recurse === true || recurse instanceof Array) {
+                    self.recurseUrls($);
+                }
                 callback.apply(self, [null, $, data, headers, response]);
             }
-        }, {verbose: true, ignoreWhitespace: true});
-        parser = new htmlparser.Parser(handler);
-        parser.parseComplete(data);
+        };
+        //Check if the parser is already initialised
+        if (!this.htmlparser) {
+            this.prepareHtmlparser();
+            this.htmlparser.parseComplete(data);
+        } else {
+            this.htmlparser.done();
+        }
     }
 };
 
+/**
+ * Creates a fresh htmlparser instance on the job so that response data can be
+ * fed to it chunk by chunk as it arrives (for use with getHtml and postHtml).
+ *
+ * Side effects: replaces `this.htmlparser` with a new parser. If no completion
+ * callback is installed yet, `this.postParse` is set to a no-op. The handler
+ * looks up `postParse` when parsing completes, so a callback assigned after
+ * this method returns still receives the result.
+ *
+ * @api public
+ */
+Job.prototype.prepareHtmlparser = function () {
+    var self = this,
+        htmlparser = require('../../vendor/htmlparser/lib/htmlparser');
+
+    //Keep a completion callback that is already installed. parseHtml assigns
+    //postParse before calling this method; replacing it with the no-op here
+    //would mean the parse result is silently dropped.
+    if (typeof this.postParse !== 'function') {
+        this.postParse = function () {};
+    }
+
+    //Forward the handler's arguments to whatever postParse is at completion time
+    var handler = new htmlparser.DefaultHandler(function () {
+        self.postParse.apply(this, arguments);
+    }, {verbose: true, ignoreWhitespace: true});
+
+    this.htmlparser = new htmlparser.Parser(handler);
+};
+
+/**
+ * Collects the href of every element matched by the selector, filtered and
+ * normalised according to the options. Each link is returned at most once.
+ *
+ * Default options are:
+ *    resolve: true      - resolve relative links against the last requested url
+ *    external: true     - include links to different hosts
+ *    static: false      - include links to static resources (images, etc.)
+ *    strip_anchor: true - links have their anchors stripped
+ *    strip_query: false - strips query strings. Set this to 'smart' to strip
+ *                         all queries unless they contain a page variable
+ *                         ('page=', 'offset=' or 'start=')
+ *
+ * @param {Function} $
+ * @param {String} selector (optional - defaults to 'a')
+ * @param {Object} options (optional)
+ * @return {Array} the filtered links
+ * @api public
+ */
+Job.prototype.getLinks = function ($, selector, options) {
+    //`selector` is optional
+    if (typeof selector === 'object' || typeof selector === 'undefined') {
+        options = selector || {};
+        selector = 'a';
+    }
+    options = utils.put({
+        resolve: true,
+        external: true,
+        static: false,
+        strip_anchor: true,
+        strip_query: false
+    }, options);
+
+    var current_url = this.last.url,
+        //Only a leading 'www.' is ignored when comparing hosts
+        www_prefix = /^www\./,
+        current_host = this.last.host.replace(www_prefix, ''),
+        static_resource = /\.(jpg|jpeg|ico|css|gif|png|swf)$/i,
+        resolve = require('url').resolve,
+        urlparse = require('url').parse,
+        urls = [];
+
+    $(selector).each('href', function (href) {
+        if (!href || href === '#' || href.substr(0, 11) === 'javascript:') return;
+
+        //Ignore links to static resource if static=false
+        if (!options.static && href.match(static_resource)) {
+            return;
+        }
+
+        //Strip off the anchor if strip_anchor=true
+        var anchor;
+        if (options.strip_anchor && (anchor = href.indexOf('#')) !== -1) {
+            href = href.substr(0, anchor);
+        }
+
+        //Resolve relative links if resolve=true
+        if (options.resolve) {
+            href = resolve(current_url, href);
+        }
+
+        //Cleanup common entities
+        href = href.replace(/\s/g,'%20').replace(/&amp;/g,'&');
+
+        //Strip off query strings unless strip_query=false. If strip_query is 'smart' then
+        //let query strings through if they appear to link to separate pages of results
+        var query_str;
+        if (options.strip_query && (query_str = href.indexOf('?')) !== -1) {
+            if (options.strip_query !== 'smart' || (href.indexOf('page=') === -1
+                    && href.indexOf('offset=') === -1 && href.indexOf('start=') === -1)) {
+                href = href.substr(0, query_str);
+            }
+        }
+
+        //Prevent duplicates
+        if (urls.indexOf(href) !== -1) {
+            return;
+        }
+
+        //Ignore external resources if external=false. Compare hostnames (no port),
+        //since the current host is stored without its port; links that are still
+        //relative have no hostname and are let through.
+        if (!options.external) {
+            var host = urlparse(href).hostname;
+            if (host && current_host !== host.replace(www_prefix, '')) {
+                return;
+            }
+        }
+
+        urls.push(href);
+    });
+    return urls;
+};
+
+/**
+ * Recurses URLs based on a pattern. Links are collected with getLinks
+ * (external links excluded, query strings stripped in 'smart' mode) and each
+ * link that passes the filter is handed to this.add().
+ *
+ * If the `recurse` option is an array it holds up to two regex patterns:
+ *        recurse: [pattern1, pattern2]
+ * A link is recursed if it matches pattern1 (when given) and does not match
+ * pattern2 (when given).
+ *
+ * Otherwise, links that contain the current URL are recursed.
+ *
+ * @param {Function} $
+ * @api public
+ */
+Job.prototype.recurseUrls = function ($) {
+    var i, l, links = this.getLinks($, {
+        external: false,
+        strip_query: 'smart'
+    });
+
+    if ((l = links.length) === 0) {
+        return;
+    }
+
+    if (this.options.recurse instanceof Array) {
+        //A missing entry is undefined, which disables that filter below
+        var pattern = this.options.recurse[0],
+            n_pattern = this.options.recurse[1];
+
+        //Iterate over links on the page and recurse urls based on the patterns
+        for (i = 0; i < l; i++) {
+            if (pattern && !links[i].match(pattern)) {
+                continue;
+            }
+            if (n_pattern && links[i].match(n_pattern)) {
+                continue;
+            }
+            this.add(links[i]);
+        }
+    } else {
+        //Iterate over links on the page and recurse children of the current url
+        for (i = 0; i < l; i++) {
+            if (links[i].indexOf(this.last.url) === -1) {
+                continue;
+            }
+            this.add(links[i]);
+        }
+    }
+};
+
 /**
  * Augments a collection of DOM elements with some helpful methods.
  *

diff --git a/lib/node.io/request.js b/lib/node.io/request.js
@@ -91,6 +91,10 @@ Job.prototype.head = function (resource, headers, callback) {
 Job.prototype.getHtml = function (resource, headers, callback, parse) {
     var self = this;
 
+    if (!this.options.jsdom) {
+        this.prepareHtmlparser();
+    }
+
     //`headers` is optional
     if (typeof headers === 'function') {
         callback = headers;
@@ -120,6 +124,10 @@ Job.prototype.getHtml = function (resource, headers, callback, parse) {
 Job.prototype.postHtml = function (resource, body, headers, callback, parse) {
     var self = this;
 
+    if (!this.options.jsdom) {
+        this.prepareHtmlparser();
+    }
+
     //`body` and `headers` are optional
     if (typeof body === 'function') {
         callback = body;
@@ -319,6 +327,7 @@ Job.prototype.doRequest = function (method, resource, body, headers, callback, p
         //Save the response headers for the next request (if to the same host)
         var cookies = response.headers['set-cookie'];
         self.last = {
+            url: resource,
             host: url.hostname,
             headers: {
                 referer: resource,
@@ -349,11 +358,12 @@ Job.prototype.doRequest = function (method, resource, body, headers, callback, p
         var body = '';
         response.on('data', function (chunk) {
             self.bytes_received += chunk.length;
-
+            if (self.htmlparser) {
+                self.htmlparser.parseChunk(chunk);
+            }
             if (self.is_complete) {
                 return cleanup();
             }
-
             body = body + chunk;
         });
 

diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 { "name"          : "node.io",
   "description"   : "A distributed data scraping and processing framework",
-  "version"       : "0.2.5-1",
+  "version"       : "0.2.5-2",
   "homepage"      : "http://github.com/chriso/node.io",
   "keywords"      : ["data","mapreduce","map","reduce","scraping","html","parsing","parse","scrape","process","processing","data"],
   "author"        : "Chris O'Hara <cohara87@gmail.com>",