-
Notifications
You must be signed in to change notification settings - Fork 3
/
scraper.js
executable file
·124 lines (104 loc) · 3.34 KB
/
scraper.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#! /usr/bin/env node
var cheerio = require('cheerio');
var request = require('request');
var queue = require('d3-queue').queue(10);
var sqlite = require('sqlite3');
var tomd = require('to-markdown');
// The parliamentary chambers to scrape. `name` is stored in the `chamber`
// column of every saved speech; `path` is the index URL whose dated links are
// followed (each link's href is appended to `path` to form the XML file URL).
const chambers = [
  {
    name: 'House',
    path: 'http://data.openaustralia.org/scrapedxml/representatives_debates/'
  },
  {
    name: 'Senate',
    path: 'http://data.openaustralia.org/scrapedxml/senate_debates/'
  }
];
// Open the sqlite database file, make sure the `data` table exists (one row
// per speech, keyed by speech_id), then hand the handle to start().
var db = new sqlite.Database("data.sqlite");
db.serialize(function ensureTableThenStart() {
  var createTableSql = "CREATE TABLE IF NOT EXISTS data (speech_id TEXT PRIMARY KEY, chamber TEXT, debate_type TEXT, debate_subject TEXT, speaker_id TEXT, speaker_name TEXT, speech TEXT)";
  db.run(createTableSql);
  start(db);
});
/**
 * Kick off the scrape: fetch the index page of every chamber, pick out the
 * links whose href contains a YYYY-MM-DD date, and queue one download task
 * per link.
 *
 * @param {Object} db - database handle, passed through unchanged to every
 *   queued task (it ends up in updateRow()).
 */
function start(db) {
  // Matches the dated file names, e.g. "2016-02-03.xml".
  var datedHref = /[0-9]{4}-[0-9]{2}-[0-9]{2}/;

  chambers.forEach(function (chamber) {
    console.log('Request: ' + chamber.path);
    request(chamber.path, function (error, response, body) {
      // A failed index request has no body to parse; report it instead of
      // silently skipping the chamber.
      if (error) {
        console.error('Failed to fetch index ' + chamber.path + ':', error);
        return;
      }
      if (response && response.statusCode !== 200) {
        console.error('Unexpected status ' + response.statusCode + ' for index ' + chamber.path);
        return;
      }

      var $ = cheerio.load(body);
      $('a').each(function (index, link) {
        // Anchors without an href yield undefined here; skip them rather
        // than calling a string method on undefined.
        var href = $(link).attr('href');
        if (!href || !datedHref.test(href)) {
          return;
        }
        queue.defer(requestXml({
          db: db,
          chamber: chamber,
          xmlPath: chamber.path + href
        }));
      });
    });
  });
}
/**
 * Build a queue task that downloads one XML file.
 *
 * The returned function takes the queue's completion callback, stores it on
 * `opts` as `opts.cb`, and issues the request with handleXml bound to `opts`
 * so the handler can reach db, chamber, xmlPath and cb through `this`.
 *
 * @param {Object} opts - { db, chamber, xmlPath }; gains a `cb` property when
 *   the task runs.
 * @returns {Function} task of the form function(callback)
 */
function requestXml(opts) {
  function task(done) {
    console.log('Request: ' + opts.xmlPath);
    opts.cb = done;
    var onResponse = handleXml.bind(opts);
    request(opts.xmlPath, onResponse);
  }
  return task;
}
/**
 * Response handler for one debates XML file.
 *
 * Walks the speech / major-heading / minor-heading children of <debates> in
 * document order. Headings only update the "current" heading text; every
 * speech is converted to Markdown and saved via updateRow() together with the
 * headings seen so far.
 *
 * Must be called with `this` set to the options object prepared by
 * requestXml(): { db, chamber, xmlPath, cb }. `cb` is always invoked exactly
 * once, on failure as well as on success, so the queue slot is released.
 *
 * @param {?Error} err - request error, if any
 * @param {Object} res - HTTP response (unused)
 * @param {string} body - response body, parsed with cheerio
 */
function handleXml(err, res, body) {
  var opts = this;
  var major;
  var minor;

  if (err) {
    console.error(err);
    // Signal completion without an error argument: the failure is logged and
    // the remaining downloads carry on. Not calling back at all would leave
    // this queue slot occupied forever.
    opts.cb();
    return;
  }

  // Strip leading whitespace and indent continuation lines of a list entry.
  function indentContinuationLines(content) {
    return content.replace(/^\s+/, '').replace(/\n/gm, '\n ');
  }

  // Extra to-markdown converters for definition lists (dl/dt/dd). Built once
  // per file rather than once per speech.
  var converters = [{
    filter: 'dd',
    replacement: function (content) {
      return ': ' + indentContinuationLines(content);
    }
  }, {
    filter: 'dt',
    replacement: function (content) {
      return indentContinuationLines(content);
    }
  }, {
    filter: 'dl',
    replacement: function (content, node) {
      // NOTE(review): `_replacement` is presumably set by to-markdown on each
      // already-converted child node; confirm against the library version in use.
      var strings = [];
      for (var i = 0; i < node.childNodes.length; i++) {
        strings.push(node.childNodes[i]._replacement);
      }
      return '\n\n' + strings.join('\n') + '\n\n';
    }
  }];

  var $ = cheerio.load(body);
  $('debates').children('speech,major-heading,minor-heading').each(function (index, node) {
    var $node = $(node);

    // Headings are only remembered for the speeches that follow them.
    // Note that a new major heading does not clear the current minor heading.
    if (node.name === 'major-heading') {
      major = $node.text().trim() || null;
      return;
    }
    if (node.name === 'minor-heading') {
      minor = $node.text().trim() || null;
      return;
    }

    // Anything else is a speech: collect the named parameters for the insert.
    var data = {
      $speech_id: $node.attr('id'),
      $chamber: opts.chamber.name,
      $debate_type: major,
      $debate_subject: minor,
      $speaker_id: $node.attr('speakerid'),
      $speaker_name: $node.attr('speakername'),
      $speech: tomd($node.html(), {
        gfm: true,
        converters: converters
      })
    };

    // Save to DB
    updateRow(opts.db, data);
  });

  // Collect garbage (this is for morph.io). global.gc only exists when node
  // is started with --expose-gc, so skip it instead of throwing otherwise.
  if (typeof global.gc === 'function') {
    global.gc();
    console.log('Manual garbabge collection: ', process.memoryUsage());
  }

  // Tell the queue this task is finished.
  opts.cb();
}
/**
 * Insert one speech into the `data` table, replacing any existing row with
 * the same primary key.
 *
 * @param {Object} db - database handle exposing prepare()
 * @param {Object} values - named parameters: $speech_id, $chamber,
 *   $debate_type, $debate_subject, $speaker_id, $speaker_name, $speech
 */
function updateRow(db, values) {
  var insertSql = "INSERT OR REPLACE INTO data VALUES ($speech_id, $chamber, $debate_type, $debate_subject, $speaker_id, $speaker_name, $speech)";
  var insert = db.prepare(insertSql);
  // Bind and execute, then release the prepared statement.
  insert.run(values);
  insert.finalize();
}