Added url recursion, link resolution/filtering, and parser speed improvements
commit 869100c33e89a64b1397941447a8cc64d3f615e8 1 parent ce04444
@chriso authored
15 HISTORY.md
@@ -1,12 +1,15 @@
### v0.2.5
+ * Added the -m (--max) switch for overriding max concurrent requests
+ * Speed improvements when parsing HTML using getHtml and postHtml
* Moved validator, jQuery and htmlparser to ./vendor as submodules
- * Npm is no longer required to install node.io
+ * npm is no longer required to install node.io
* Built-in modules are stored relative to the install dir
+ * Added url recursion and a helper for resolving and filtering links on a page
### v0.2.4
* Moved to the new node v0.4 request API with full HTTPS support
* Added the auto_retry option to improve code readability
- * Callbacks are now called in the same scope as job methods
+ * Callbacks are now called in the same scope as job methods (no more self = this)
### v0.2.3
* Removed daemon and expresso as required dependencies
@@ -35,12 +38,12 @@
* Status messages are written to stderr
### v0.2.1-8
- * Added [built-in modules](https://github.com/chriso/node.io/tree/master/builtin).
+ * Added [built-in modules](https://github.com/chriso/node.io/tree/master/builtin).
### v0.2.1-5
* Web interface now supports CoffeeScript jobs
* Support for multiple jobs in the same file (see ./examples/resolve.coffee)
- * Added the -u (--unpack) switch for decrypting jobs made with [packnode](https://github.com/chriso/packnode)
+ * Added the -u (--unpack) switch for decrypting jobs made with [packnode](https://github.com/chriso/packnode)
### v0.2.1-3
* Better support for multiple jobs running in the same process
@@ -67,12 +70,12 @@
### v0.1.1-17
* Fixed incorrect handling of large streams
* Better support for request timeouts
- * Bug fixes
+ * Bug fixes
### v0.1.1-6
* Added a -g (--debug) switch
* Minor bug fixes
- * Added HTTP code handler - auto support for redirects, etc.
+ * Added HTTP code handler - auto support for redirects, etc.
### v0.1.1-1
* Fixed an inheritance bug when not exclusively using CoffeeScript
176 lib/node.io/dom.js
@@ -41,7 +41,7 @@ Job.prototype.$ = function (selector, context) {
* @api public
*/
Job.prototype.parseHtml = function (data, callback, response) {
- var self = this;
+ var self = this, recurse = this.options.recurse;
headers = response && response.headers ? response.headers : {};
if (this.options.jsdom) {
var features = {
@@ -59,10 +59,13 @@ Job.prototype.parseHtml = function (data, callback, response) {
$ = function (selector, context) {
return context ? jquery.create(context) : default_$(selector);
};
+ if (recurse === 1 || recurse === true || recurse instanceof Array) {
+ this.recurseUrls($);
+ }
callback.apply(this, [null, $, data, headers, response]);
} else {
- var self = this, handler, parser, $, htmlparser = require('../../vendor/htmlparser/lib/htmlparser');
- handler = new htmlparser.DefaultHandler(function (err, dom) {
+ var self = this;
+ this.postParse = function (err, dom) {
if (err) {
callback.call(self, err);
} else {
@@ -70,15 +73,176 @@ Job.prototype.parseHtml = function (data, callback, response) {
//Allow the user to specify a custom context (thanks to github.com/jimbishopp)
return self.$(selector, context || dom);
};
+ if (recurse === 1 || recurse === true || recurse instanceof Array) {
+ self.recurseUrls($);
+ }
callback.apply(self, [null, $, data, headers, response]);
}
- }, {verbose: true, ignoreWhitespace: true});
- parser = new htmlparser.Parser(handler);
- parser.parseComplete(data);
+ };
+ //Check if the parser is already initialised
+ if (!this.htmlparser) {
+ this.prepareHtmlparser();
+ this.htmlparser.parseComplete(data);
+ } else {
+ this.htmlparser.done();
+ }
}
};
/**
+ * Prepare htmlparser so that data can be parsed as chunks are received
+ * (for use with getHtml and postHtml).
+ *
+ * @api public
+ */
+Job.prototype.prepareHtmlparser = function () {
+ var self = this, $, htmlparser = require('../../vendor/htmlparser/lib/htmlparser');
+ this.postParse = function () {};
+ this.htmlparser = new htmlparser.Parser(new htmlparser.DefaultHandler(function () {
+ self.postParse.apply(this, arguments);
+ }, {verbose: true, ignoreWhitespace: true}
+ ));
+}
+
+/**
+ * Gets all a~href links on the page based on the filter options.
+ *
+ * Default options are:
+ * resolve: true - resolve relative links
+ * external: false - include links to different hosts
+ * static: false - include links to static resources (images, etc.)
+ * strip_anchor: true - links have their anchors stripped
+ * strip_query: false - strips query strings. Set this to 'smart' to strip
+ * all queries unless they contain a page variable
+ * such as 'page', 'offset', etc.
+ *
+ * @param {Function} $
+ * @param {String} selector (optional - defaults to 'a')
+ * @param {Object} options
+ * @api public
+ */
+ Job.prototype.getLinks = function ($, selector, options) {
+ if (typeof selector === 'object' || typeof selector === 'undefined') {
+ options = selector || {};
+ selector = 'a';
+ }
+ options = utils.put({
+ resolve: true,
+ external: false,
+ static: false,
+ strip_anchor: true,
+ strip_query: false
+ }, options);
+
+ var current_url = this.last.url,
+ current_host = this.last.host.replace('www.',''),
+ resolve = require('url').resolve,
+ urlparse = require('url').parse,
+ urls = [];
+
+ $(selector).each('href', function (href) {
+ if (!href || href === '#' || href.substr(0, 11) === 'javascript:') return;
+
+ //Ignore links to static resource if static=false
+ if (!options.static && href.match(/\.(jpg|jpeg|ico|css|gif|png|swf)$/i)) {
+ return;
+ }
+
+ //Strip off the anchor if strip_anchor=true
+ var anchor;
+ if (options.strip_anchor && (anchor = href.indexOf('#')) !== -1) {
+ href = href.substr(0, anchor);
+ }
+
+ //Resolve relative links if resolve=true
+ if (options.resolve) {
+ href = resolve(current_url, href);
+ }
+
+ //Cleanup common entities
+ href = href.replace(/\s/g,'%20').replace(/&amp;/g,'&');
+
+ //Strip off query strings unless strip_query=false. If strip_query is 'smart' then
+ //let query strings through if they appear to link to separate pages of results
+ var query_str;
+ if (options.strip_query && (query_str = href.indexOf('?')) != -1) {
+ if (options.strip_query != 'smart' || (href.indexOf('page=') === -1
+ && href.indexOf('offset=') === -1 && href.indexOf('start=') === -1)) {
+ href = href.substr(0, query_str);
+ }
+ }
+
+ //Prevent duplicates
+ if (urls.indexOf(href) != -1) {
+ return;
+ }
+
+ //Ignore external resources if external=false
+ if (!options.external) {
+ var host = urlparse(href).host;
+ if (host && current_host != host.replace('www.','')) {
+ return;
+ }
+ }
+
+ urls.push(href);
+ });
+ return urls;
+ }
+
+/**
+ * Recurses URLs based on a pattern. If no pattern is specified, URLs
+ * that are children of the current URL are recursed.
+ *
+ * Specify two regex patterns for filtering links. Links will be recursed
+ * if they match pattern1 and do not match pattern2.
+ * recurse: [pattern1, pattern2]
+ *
+ * @param {Function} $
+ * @api public
+ */
+ Job.prototype.recurseUrls = function ($) {
+ var i, l, links = this.getLinks($, {
+ external: false,
+ strip_query: 'smart'
+ });
+
+ if ((l = links.length) === 0) {
+ return;
+ }
+
+ if (this.options.recurse instanceof Array) {
+ var pattern, n_pattern, p;
+ p = this.options.recurse.length;
+ if (p >= 1) {
+ pattern = this.options.recurse[0];
+ }
+ if (p >= 2) {
+ n_pattern = this.options.recurse[1];
+ }
+
+ //Iterate over links on the page and recurse urls based on the patterns
+ for (i = 0; i < l; i++) {
+ if (pattern && !links[i].match(pattern)) {
+ continue;
+ }
+ if (n_pattern && links[i].match(n_pattern)) {
+ continue;
+ }
+ this.add(links[i]);
+ }
+ } else {
+ //Iterate over links on the page and recurse children of the current url
+ for (i = 0; i < l; i++) {
+ if (links[i].indexOf(this.last.url) === -1) {
+ continue;
+ }
+ this.add(links[i]);
+ }
+ }
+ }
+
+/**
* Augments a collection of DOM elements with some helpful methods.
*
* Methods:
14 lib/node.io/request.js
@@ -91,6 +91,10 @@ Job.prototype.head = function (resource, headers, callback) {
Job.prototype.getHtml = function (resource, headers, callback, parse) {
var self = this;
+ if (!this.options.jsdom) {
+ this.prepareHtmlparser();
+ }
+
//`headers` is optional
if (typeof headers === 'function') {
callback = headers;
@@ -120,6 +124,10 @@ Job.prototype.getHtml = function (resource, headers, callback, parse) {
Job.prototype.postHtml = function (resource, body, headers, callback, parse) {
var self = this;
+ if (!this.options.jsdom) {
+ this.prepareHtmlparser();
+ }
+
//`body` and `headers` are optional
if (typeof body === 'function') {
callback = body;
@@ -319,6 +327,7 @@ Job.prototype.doRequest = function (method, resource, body, headers, callback, p
//Save the response headers for the next request (if to the same host)
var cookies = response.headers['set-cookie'];
self.last = {
+ url: resource,
host: url.hostname,
headers: {
referer: resource,
@@ -349,11 +358,12 @@ Job.prototype.doRequest = function (method, resource, body, headers, callback, p
var body = '';
response.on('data', function (chunk) {
self.bytes_received += chunk.length;
-
+ if (self.htmlparser) {
+ self.htmlparser.parseChunk(chunk);
+ }
if (self.is_complete) {
return cleanup();
}
-
body = body + chunk;
});
2  package.json
@@ -1,6 +1,6 @@
{ "name" : "node.io",
"description" : "A distributed data scraping and processing framework",
- "version" : "0.2.5-1",
+ "version" : "0.2.5-2",
"homepage" : "http://github.com/chriso/node.io",
"keywords" : ["data","mapreduce","map","reduce","scraping","html","parsing","parse","scrape","process","processing","data"],
"author" : "Chris O'Hara <cohara87@gmail.com>",