Skip to content


Subversion checkout URL

You can clone with
Download ZIP


chriso edited this page · 21 revisions

node.io includes a robust framework for scraping data from the web. The primary methods for scraping data are get and getHtml, although there are also methods for making any type of request, modifying headers, etc. See the API for a full list of methods.

Note that var nodeio = require('node.io'); is omitted in each example.

Example 1: Save a web page to disk


exports.job = new nodeio.Job({
    input: false,
    run: function () {
        var self = this, url = this.options.args[0];
        this.get(url, function(err, data) {
            if (err) {
            } else {

# Job class that fetches the URL given as the first command-line
# argument and emits the response; exits with the error on failure.
class SavePage extends nodeio.JobClass
    input: false
    run: ->
        target = @options.args[0]
        @get target, (err, data) =>
            if err?
                @exit err
            else
                @emit data

To save a page to disk, run

$ node.io -s save "http://www.google.com/" > google.html

Which is equivalent to

$ curl "http://www.google.com/" > google.html

Example 2: Get the number of Google results for a list of keywords

To use node.io effectively, try to encapsulate common scraping code in run() so that the resulting job is as generic and versatile as possible. Each thread should contain only one request where possible.


var options = {timeout: 10};

exports.job = new nodeio.Job(options, {
    input: ['hello', 'foobar','weather'],
    run: function (keyword) {
        var self = this, results;
        this.getHtml('' + encodeURIComponent(keyword), function (err, $) {
            results = $('#resultStats').text.toLowerCase();
            self.emit(keyword + ' has ' + results);

Note: you could also comment out input: ['hello', 'foobar','weather'], and specify a list of keywords through the web interface or at the command line, e.g.

$ node.io keywords < list_of_words.js

Example 3: Scraping a page using CSS selector / traversal methods

When using getHtml(url, callback), the second argument of callback is $, an object similar to jQuery's. For advanced usage of $, see the API

reddit.js - scrape the front page stories from reddit.com

/**
 * Job methods for the reddit example.
 *
 * run() fetches the page, collects the text of every `a.title` element and
 * the raw text of every `div.score.unvoted` element, and emits an array of
 * "[score] title" strings. Entries whose score is '&bull;' are skipped.
 * The job exits on a request / parsing error or when the number of titles
 * and scores differ.
 */
var methods = {
    input: false,
    run: function() {
        var self = this;

        //NOTE(review): the URL was empty in this copy; restored as the reddit
        //front page implied by the surrounding text - confirm.
        this.getHtml('http://www.reddit.com/', function(err, $) {

            //Handle any request / parsing errors
            if (err) {
                self.exit(err);
                //Stop here so $ is never used after a failed request
                return;
            }

            var titles = [], scores = [], output = [];

            //Select all titles on the page
            $('a.title').each(function(a) {
                titles.push(a.text);
            });

            //Select all scores on the page
            $('div.score.unvoted').each(function(div) {
                scores.push(div.rawtext); //rawtext doesn't decode entities or trim the text
            });

            //Mismatch? page probably didn't load properly
            if (scores.length !== titles.length) {
                self.exit('Title / score mismatch');
                return;
            }

            for (var i = 0, len = scores.length; i < len; i++) {
                //Ignore upcoming stories
                if (scores[i] === '&bull;') continue;

                //Check the data is ok
                //NOTE(review): the validation statement that belonged here is
                //missing from this copy - restore it from the original page.

                //Output = [score] title
                output.push('['+scores[i]+'] '+titles[i]);
            }

            self.emit(output);
        });
    }
};

//Job is reached through the nodeio module object, as in the other examples
exports.job = new nodeio.Job({timeout:10}, methods);

# Job class for the reddit example: collects the text of every `a.title`
# element and the raw text of every `div.score.unvoted` element, then emits
# an array of "[score] title" strings. Entries whose score is '&bull;' are
# skipped.
class Reddit extends nodeio.JobClass
    input: false
    run: ->
        # NOTE(review): the URL was empty in this copy; restored as the reddit
        # front page implied by the surrounding text - confirm.
        @getHtml 'http://www.reddit.com/', (err, $) =>
            # Stop on request / parsing errors so $ is never used afterwards
            return @exit err if err?

            # Accumulators are local to each run so repeated runs do not
            # append to the results of earlier ones
            titles = []
            scores = []
            output = []

            $('a.title').each (a) -> titles.push a.text
            $('div.score.unvoted').each (div) -> scores.push div.rawtext

            # Mismatch? page probably didn't load properly
            if scores.length isnt titles.length
                return @exit 'Title / score mismatch'

            for score, i in scores
                # Ignore upcoming stories
                continue if score is '&bull;'
                output.push '[' + score + '] ' + titles[i]

            @emit output

@class = Reddit
@job = new Reddit({timeout:10})
Something went wrong with that request. Please try again.