Skip to content
Browse files

Minor fixes

  • Loading branch information...
1 parent 8513e8d commit 84dafe54a7316ec57580cacd053451adb43f27cd @dhruvbird dhruvbird committed Feb 22, 2012
Showing with 37 additions and 18 deletions.
  1. +13 −6 quora-crawler/crawl.js
  2. +24 −12 quora-crawler/parse.js
View
19 quora-crawler/crawl.js
@@ -26,12 +26,18 @@ function runMain() {
console.log("crawl.js::runMain");
// Spawn a task that downloads links in the TODO state
- var download = spawn('./download.js');
-
- // console.log(download, download.pid);
-
- download.stdout.on('data', labelledLogger('download::stdout::', console.log.bind(console)));
- download.stderr.on('data', labelledLogger('download::stderr::', console.error.bind(console)));
+ function spawnDownloader() { // Keeps a ./download.js child running: start it, wire up its output, start it again when it exits.
+ var download = spawn('./download.js');
+ // `spawn` is defined outside this hunk -- presumably Node's child_process.spawn; confirm against the file's requires.
+ // console.log(download, download.pid);
+ // Attach 'data' listeners built by labelledLogger (defined outside this hunk; presumably prefixes the given label before calling the bound console method -- confirm).
+ download.stdout.on('data', labelledLogger('download::stdout::', console.log.bind(console)));
+ download.stderr.on('data', labelledLogger('download::stderr::', console.error.bind(console)));
+ // Respawn on every exit. NOTE(review): `code` is ignored and there is no delay, so a child that fails immediately would be restarted in a tight loop -- confirm this is intended.
+ download.on('exit', function(code) {
+ spawnDownloader();
+ });
+ }
//
// Spawn a task that parses downloaded links in the SAVED state,
@@ -62,6 +68,7 @@ function runMain() {
});
}
+ spawnDownloader();
spawnParser();
}
View
36 quora-crawler/parse.js
@@ -37,7 +37,7 @@ function runMain() {
// Parse the file and set state to 'PARSED'
var fPath = "./quora-data" + row.url;
// console.log("Parsing file:", fPath);
- var contents = '<body><script>';
+ var contents = '<body><p></p><script>';
if (path.existsSync(fPath)) {
contents = fs.readFileSync(fPath, 'utf-8');
@@ -75,9 +75,16 @@ function runMain() {
answers.pop();
answers = answers.map(function(answer) {
+ var a = $(answer);
+ var content = a.find('.answer_content')
+ var count = a.find('.voter_count');
+ if (content.length != 1) {
+ return { body: '', votes: '0' };
+ }
+
return {
- body: $(answer).find('.answer_content').text().trim(),
- votes: ($(answer).find('.voter_count').text() || '0').trim()
+ body: content.text().trim(),
+ votes: ((count.length == 1 ? count.text() : '0') || '0').trim()
};
});
@@ -92,26 +99,31 @@ function runMain() {
// Add question to DB.
db.run("UPDATE QUESTIONS SET status=?, title=?, body=? WHERE id=?",
PARSED, title, body, row.id);
+
+ // Add all answers to the DB.
+ answers.forEach(function(answer) {
+ if (answer.body) {
+ db.run('INSERT OR IGNORE INTO ANSWERS (questionID, body, votes) VALUES (?, ?, ?)',
+ row.id, answer.body, answer.votes);
+ }
+ });
+
+ } else {
+ // Mark it as an error.
+ db.run("UPDATE QUESTIONS SET status=? WHERE id=?", ERROR, row.id);
}
// Add (potentially) new links to the DB.
questionURLs.forEach(function(questionURL) {
db.run(INSERT_IGNORE_SQL, questionURL, TODO, null, null);
});
- // Add all answers to the DB.
- answers.forEach(function(answer) {
- if (answer.body) {
- db.run('INSERT OR IGNORE INTO ANSWERS (questionID, body, votes) VALUES (?, ?, ?)',
- row.id, answer.body, answer.votes);
- }
- });
} else {
console.error("Error parsing file:", fPath, errors);
// Set this row in the 'ERROR' state.
- db.run("UPDATE QUESTIONS SET status=? WHERE id=?",
- ERROR, row.id);
+ db.run("UPDATE QUESTIONS SET status=? WHERE id=?", ERROR, row.id);
+ console.error("Contents:", contents);
return;
}

0 comments on commit 84dafe5

Please sign in to comment.
Something went wrong with that request. Please try again.