Skip to content

Commit

Permalink
Added url recursion, link resolution/filtering, and parser speed impr…
Browse files Browse the repository at this point in the history
…ovements
  • Loading branch information
chriso committed Mar 10, 2011
1 parent ce04444 commit 869100c
Show file tree
Hide file tree
Showing 4 changed files with 192 additions and 15 deletions.
15 changes: 9 additions & 6 deletions HISTORY.md
@@ -1,12 +1,15 @@
### v0.2.5
* Added the -m (--max) switch for overriding max concurrent requests
* Speed improvements when parsing HTML using getHtml and postHtml
* Moved validator, jQuery and htmlparser to ./vendor as submodules
* Npm is no longer required to install node.io
* npm is no longer required to install node.io
* Built-in modules are stored relative to the install dir
* Added url recursion and a helper for resolving and filtering links on a page

### v0.2.4
* Moved to the new node v0.4 request API with full HTTPS support
* Added the auto_retry option to improve code readability
* Callbacks are now called in the same scope as job methods
* Callbacks are now called in the same scope as job methods (no more self = this)

### v0.2.3
* Removed daemon and expresso as required dependencies
Expand Down Expand Up @@ -35,12 +38,12 @@
* Status messages are written to stderr

### v0.2.1-8
* Added [built-in modules](https://github.com/chriso/node.io/tree/master/builtin).
* Added [built-in modules](https://github.com/chriso/node.io/tree/master/builtin).

### v0.2.1-5
* Web interface now supports CoffeeScript jobs
* Support for multiple jobs in the same file (see ./examples/resolve.coffee)
* Added the -u (--unpack) switch for decrypting jobs made with [packnode](https://github.com/chriso/packnode)
* Added the -u (--unpack) switch for decrypting jobs made with [packnode](https://github.com/chriso/packnode)

### v0.2.1-3
* Better support for multiple jobs running in the same process
Expand All @@ -67,12 +70,12 @@
### v0.1.1-17
* Fixed incorrect handling of large streams
* Better support for request timeouts
* Bug fixes
* Bug fixes

### v0.1.1-6
* Added a -g (--debug) switch
* Minor bug fixes
* Added HTTP code handler - auto support for redirects, etc.
* Added HTTP code handler - auto support for redirects, etc.

### v0.1.1-1
* Fixed an inheritance bug when not exclusively using CoffeeScript
Expand Down
176 changes: 170 additions & 6 deletions lib/node.io/dom.js
Expand Up @@ -41,7 +41,7 @@ Job.prototype.$ = function (selector, context) {
* @api public
*/
Job.prototype.parseHtml = function (data, callback, response) {
var self = this;
var self = this, recurse = this.options.recurse;
headers = response && response.headers ? response.headers : {};
if (this.options.jsdom) {
var features = {
Expand All @@ -59,25 +59,189 @@ Job.prototype.parseHtml = function (data, callback, response) {
$ = function (selector, context) {
return context ? jquery.create(context) : default_$(selector);
};
if (recurse === 1 || recurse === true || recurse instanceof Array) {
this.recurseUrls($);
}
callback.apply(this, [null, $, data, headers, response]);
} else {
var self = this, handler, parser, $, htmlparser = require('../../vendor/htmlparser/lib/htmlparser');
handler = new htmlparser.DefaultHandler(function (err, dom) {
var self = this;
this.postParse = function (err, dom) {
if (err) {
callback.call(self, err);
} else {
$ = function (selector, context) {
//Allow the user to specify a custom context (thanks to github.com/jimbishopp)
return self.$(selector, context || dom);
};
if (recurse === 1 || recurse === true || recurse instanceof Array) {
self.recurseUrls($);
}
callback.apply(self, [null, $, data, headers, response]);
}
}, {verbose: true, ignoreWhitespace: true});
parser = new htmlparser.Parser(handler);
parser.parseComplete(data);
};
//Check if the parser is already initalised
if (!this.htmlparser) {
this.prepareHtmlparser();
this.htmlparser.parseComplete(data);
} else {
this.htmlparser.done();
}
}
};

/**
* Prepare htmlparser so that data can be parsed as chunks are received
* (for use with getHtml and postHtml).
*
* @api public
*/
Job.prototype.prepareHtmlparser = function () {
var self = this, $, htmlparser = require('../../vendor/htmlparser/lib/htmlparser');
this.postParse = function () {};
this.htmlparser = new htmlparser.Parser(new htmlparser.DefaultHandler(function () {
self.postParse.apply(this, arguments);
}, {verbose: true, ignoreWhitespace: true}
));
}

/**
* Gets all a~href links on the page based on the filter options.
*
* Default options are:
* resolve: true - resolve relative links
* external: false - include links to different hosts
* static: false - include links to static resources (images, etc.)
* strip_anchor: true - links have their anchors stripped
* strip_query: false - strips query strings. Set this to 'smart' to strip
* all queries unless they contain a page variable
* such as 'page', 'offset', etc.
*
* @param {Function} $
* @param {String} selector (optional - defaults to 'a')
* @param {Object} options
* @api public
*/
Job.prototype.getLinks = function ($, selector, options) {
if (typeof selector === 'object' || typeof selector === 'undefined') {
options = selector || {};
selector = 'a';
}
options = utils.put({
resolve: true,
external: true,
static: false,
strip_anchor: true,
strip_query: false
}, options);

var current_url = this.last.url,
current_host = this.last.host.replace('www.',''),
resolve = require('url').resolve,
urlparse = require('url').parse,
urls = [];

$(selector).each('href', function (href) {
if (!href || href === '#' || href.substr(0, 11) === 'javascript:') return;

//Ignore links to static resource if static=false
if (!options.static && href.match(/\.(jpg|jpeg|ico|css|gif|png|swf)$/i)) {
return;
}

//Strip off the anchor if strip_anchor=true
var anchor;
if (options.strip_anchor && (anchor = href.indexOf('#')) !== -1) {
href = href.substr(0, anchor);
}

//Resolve relative links if resolve=true
if (options.resolve) {
href = resolve(current_url, href);
}

//Cleanup common entities
href = href.replace(/\s/g,'%20').replace(/&/g,'&');

//Strip off query strings unless strip_query=false. If strip_query is 'smart' then
//let query strings through if they appear to link to separate pages of results
var query_str;
if (options.strip_query && (query_str = href.indexOf('?')) != -1) {
if (options.strip_query != 'smart' || (href.indexOf('page=') === -1
&& href.indexOf('offset=') === -1 && href.indexOf('start=') === -1)) {
href = href.substr(0, query_str);
}
}

//Prevent duplicates
if (urls.indexOf(href) != -1) {
return;
}

//Ignore external resources if external=false
if (!options.external) {
var host = urlparse(href).host;
if (host && current_host != host.replace('www.','')) {
return;
}
}

urls.push(href);
});
return urls;
}

/**
 * Queues links found on the current page by passing them to `this.add()`.
 *
 * Links are collected with getLinks() using `external: false` and
 * `strip_query: 'smart'`, then filtered according to `this.options.recurse`:
 *
 *   - an Array `[pattern1, pattern2]`: a link is recursed if it matches
 *     pattern1 and does not match pattern2. Either pattern may be omitted,
 *     in which case that test is skipped.
 *   - anything else: a link is recursed if it contains the URL of the last
 *     request (`this.last.url`), i.e. it is a child of the current URL.
 *
 * @param {Function} $
 * @api public
 */
Job.prototype.recurseUrls = function ($) {
    var i, l, links = this.getLinks($, {
        external: false,
        strip_query: 'smart'
    });

    if ((l = links.length) === 0) {
        return;
    }

    if (this.options.recurse instanceof Array) {
        //recurse[0] is the inclusion pattern, recurse[1] the exclusion pattern.
        //A missing entry is undefined, which disables that test below
        var pattern = this.options.recurse[0],
            n_pattern = this.options.recurse[1];

        //Iterate over links on the page and recurse urls based on the patterns
        for (i = 0; i < l; i++) {
            if (pattern && !links[i].match(pattern)) {
                continue;
            }
            if (n_pattern && links[i].match(n_pattern)) {
                continue;
            }
            this.add(links[i]);
        }
    } else {
        //Iterate over links on the page and recurse children of the current url
        for (i = 0; i < l; i++) {
            if (links[i].indexOf(this.last.url) === -1) {
                continue;
            }
            this.add(links[i]);
        }
    }
};

/**
* Augments a collection of DOM elements with some helpful methods.
*
Expand Down
14 changes: 12 additions & 2 deletions lib/node.io/request.js
Expand Up @@ -91,6 +91,10 @@ Job.prototype.head = function (resource, headers, callback) {
Job.prototype.getHtml = function (resource, headers, callback, parse) {
var self = this;

if (!this.options.jsdom) {
this.prepareHtmlparser();
}

//`headers` is optional
if (typeof headers === 'function') {
callback = headers;
Expand Down Expand Up @@ -120,6 +124,10 @@ Job.prototype.getHtml = function (resource, headers, callback, parse) {
Job.prototype.postHtml = function (resource, body, headers, callback, parse) {
var self = this;

if (!this.options.jsdom) {
this.prepareHtmlparser();
}

//`body` and `headers` are optional
if (typeof body === 'function') {
callback = body;
Expand Down Expand Up @@ -319,6 +327,7 @@ Job.prototype.doRequest = function (method, resource, body, headers, callback, p
//Save the response headers for the next request (if to the same host)
var cookies = response.headers['set-cookie'];
self.last = {
url: resource,
host: url.hostname,
headers: {
referer: resource,
Expand Down Expand Up @@ -349,11 +358,12 @@ Job.prototype.doRequest = function (method, resource, body, headers, callback, p
var body = '';
response.on('data', function (chunk) {
self.bytes_received += chunk.length;

if (self.htmlparser) {
self.htmlparser.parseChunk(chunk);
}
if (self.is_complete) {
return cleanup();
}

body = body + chunk;
});

Expand Down
2 changes: 1 addition & 1 deletion package.json
@@ -1,6 +1,6 @@
{ "name" : "node.io",
"description" : "A distributed data scraping and processing framework",
"version" : "0.2.5-1",
"version" : "0.2.5-2",
"homepage" : "http://github.com/chriso/node.io",
"keywords" : ["data","mapreduce","map","reduce","scraping","html","parsing","parse","scrape","process","processing","data"],
"author" : "Chris O'Hara <cohara87@gmail.com>",
Expand Down

0 comments on commit 869100c

Please sign in to comment.