Skip to content

Commit

Permalink
refactor crawler and use unique index to avoid duplicate rows
Browse files Browse the repository at this point in the history
  • Loading branch information
flyerhzm committed Feb 28, 2012
1 parent 6a086b4 commit 4b0b43f
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 52 deletions.
101 changes: 51 additions & 50 deletions lib/crawler.js
Expand Up @@ -3,75 +3,43 @@ var http = require('http'),
crypto = require('crypto'),
sys = require('util'),
mongoose = require('mongoose'),
underscore = require('underscore'),
htmlparser = require('htmlparser'),
DomUtils = htmlparser.DomUtils,
config = require('../config').config,
models = require('../models'),
Slideshow = models.Slideshow;

var options = {
host: "www.slideshare.net",
path: "/api/2/get_slideshows_by_tag?"
};
var slideshow_states = [];

var fetchSlides = function(tag_name) {
var now = (Date.now() / 1000).toFixed();
var sha1 = crypto.createHash('sha1');
sha1.update(config.slideshare_secret + now);
slideshare_hash = sha1.digest('hex');

var queries = querystring.stringify({
api_key: config.slideshare_key,
ts: now,
hash: slideshare_hash,
hash: generateSlideshareHash(now),
tag: tag_name,
limit: config.crawl_size,
detailed: 1
});
var slideshare_options = underscore.clone(options);
slideshare_options["path"] += queries;
var slideshare_options = {
host: "www.slideshare.net",
path: "/api/2/get_slideshows_by_tag?" + queries
}

var xml_handler = new htmlparser.DefaultHandler(function(err, dom) {
if (err) {
console.log(err.message);
} else {
var slideshow_doms = DomUtils.getElementsByTagName("Slideshow", dom);
var slideshow_states = new Array(slideshow_doms.length);
var index, slideshow_dom;
for (index = 0; index < slideshow_states.length; index++) {
slideshow_dom = slideshow_doms[index];
var url = fetchDomData(slideshow_dom, "URL");
Slideshow.findOne({url: url}, function(err, doc) {
slideshow_states = slideshow_states.concat(new Array(slideshow_doms.length));
var index;
for (index = 0; index < slideshow_doms.length; index++) {
var slideshow = initSlideshare(slideshow_doms[index]);
slideshow.save(function(err) {
if (err) { console.log(err.message); }
if (!doc) {
var slideshow = new Slideshow();
slideshow.title = fetchDomData(slideshow_dom, "Title");
slideshow.description = fetchDomData(slideshow_dom, "Description");
slideshow.username = fetchDomData(slideshow_dom, "Username");
slideshow.url = url;
slideshow.created = fetchDomData(slideshow_dom, "Created");
slideshow.tags = fetchDomArray(slideshow_dom, "Tag");
slideshow.slideshare_id = fetchDomData(slideshow_dom, "ID");

slideshow.save(function(err) {
if (err) { console.log(err.message); }
slideshow_states.pop();
});
} else {
slideshow_states.pop();
}
slideshow_states.pop();
});

};

// WTF, I hate disconnecting mongo by setTimeout
var intervalId = setInterval(function() {
if (slideshow_states.length === 0) {
mongoose.disconnect();
clearInterval(intervalId);
}
}, 5000);
}
}, {verbose: false});

Expand All @@ -91,24 +59,57 @@ var fetchSlides = function(tag_name) {
req.end();
};

var generateSlideshareHash = function(now) {
var sha1 = crypto.createHash('sha1');
sha1.update(config.slideshare_secret + now);
return sha1.digest('hex');
};

/**
 * Create an unsaved Slideshow document from one <Slideshow> DOM node.
 *
 * @param {Object} slideshow_dom - parsed DOM node for a single slideshow.
 * @returns {Slideshow} a new model instance with its fields populated;
 *   saving it is left to the caller.
 */
var initSlideshare = function(slideshow_dom) {
  // Each entry is [model field, reader, XML tag name]. Entries are applied
  // in order, so the fields are assigned in the sequence listed here.
  var field_sources = [
    ["title", fetchDomData, "Title"],
    ["description", fetchDomData, "Description"],
    ["username", fetchDomData, "Username"],
    ["url", fetchDomData, "URL"],
    ["created", fetchDomData, "Created"],
    ["tags", fetchDomArray, "Tag"],
    ["slideshare_id", fetchDomData, "ID"]
  ];
  var record = new Slideshow();
  var i, field, read, tag;

  for (i = 0; i < field_sources.length; i += 1) {
    field = field_sources[i][0];
    read = field_sources[i][1];
    tag = field_sources[i][2];
    record[field] = read(slideshow_dom, tag);
  }
  return record;
};

/**
 * Read the `data` of the first child of the first element named `name`.
 *
 * @param {Object|Array} dom - DOM node (or node list) to search in.
 * @param {string} name - tag name to look up.
 * @returns {string} the first child's `data`, or "" when no element
 *   matches or the matching element has no children.
 */
var fetchDomData = function(dom, name) {
  var data_dom = DomUtils.getElementsByTagName(name, dom)
    , first = data_dom[0]
    , data = "";

  // Guard against a missing element and an empty children list: either
  // one would otherwise throw a TypeError on the property lookup below.
  if (first && first["children"] && first["children"].length > 0) {
    data = first["children"][0]["data"];
  }
  return data;
};

/**
 * Collect the `data` of the first child of every element named `name`.
 *
 * @param {Object|Array} dom - DOM node (or node list) to search in.
 * @param {string} name - tag name to look up.
 * @returns {Array} one entry per matching element that has at least one
 *   child, in document order; elements without children are skipped.
 */
var fetchDomArray = function(dom, name) {
  var data_dom = DomUtils.getElementsByTagName(name, dom)
    , data = []
    , index
    , children;

  // Plain loop instead of underscore.each, so the function has no
  // dependency on the underscore package.
  for (index = 0; index < data_dom.length; index += 1) {
    children = data_dom[index]["children"];
    // An empty children list is skipped as well; indexing into it would
    // throw a TypeError.
    if (children && children.length > 0) {
      data.push(children[0]["data"]);
    }
  }
  return data;
};

// Start one crawl per tag, in this order.
var crawl_tags = ['nodejs', 'node.js', 'javascript', 'js', 'jquery'];
var tag_index;
for (tag_index = 0; tag_index < crawl_tags.length; tag_index += 1) {
  fetchSlides(crawl_tags[tag_index]);
}

// There is no completion callback to hook into, so poll every five seconds
// and close the mongo connection once slideshow_states is empty.
// NOTE(review): slideshow_states also starts out empty, so a poll that fires
// before any response has been parsed will disconnect as well — confirm this
// is acceptable.
var intervalId = setInterval(function() {
  if (slideshow_states.length !== 0) {
    return;
  }
  mongoose.disconnect();
  clearInterval(intervalId);
}, 5000);
2 changes: 1 addition & 1 deletion models/slideshow.js
Expand Up @@ -6,7 +6,7 @@ var Slideshow = new Schema({
title : {type: String}
, description : {type: String}
, username : {type: String}
, url : {type: String}
, url : {type: String, unique: true}
, created : {type: Date, default: Date.now}
, tags : {type: Array}
, slideshare_id : {type: String}
Expand Down
1 change: 0 additions & 1 deletion package.json
Expand Up @@ -5,7 +5,6 @@
"express": ""
, "jade": ""
, "nodemon": ""
, "underscore": ""
, "htmlparser": ""
, "mongoose": ""
, "dateformat": ""
Expand Down

0 comments on commit 4b0b43f

Please sign in to comment.