save data in temporary file instead of memory #106

Merged · 8 commits · Jun 21, 2016
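
The change replaces the in-memory accumulation of client messages with a per-client temp file: each incoming message is appended as one JSON object per line, and the file is only read back and processed once the client disconnects. A minimal standalone sketch of that pattern, using Node's fs API (the file name and sample data are illustrative, not the PR's code):

    var fs = require('fs');

    // append one JSON-serialized event per line ("ndjson") as data arrives.
    // assumes the temp/ directory already exists.
    var stream = fs.createWriteStream('temp/client-1234');
    stream.write(JSON.stringify(['getstats', 'pc_0', {bytesSent: '1234'}]) + '\n');

    // on disconnect, flush the stream, then parse the file line by line.
    stream.end(function() {
        fs.readFile('temp/client-1234', {encoding: 'utf-8'}, function(err, data) {
            if (err) return;
            data.split('\n').forEach(function(line) {
                if (line.length) {
                    console.log(JSON.parse(line));
                }
            });
        });
    });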
app.js: 79 changes (56 additions & 23 deletions)
@@ -76,8 +76,14 @@ function generateFeatures(url, client, clientid) {

 var db = {};
 var server;
+var tempPath = 'temp' + (cluster.isWorker ? '-' + cluster.worker.process.pid : '');
 
 function run(keys) {
+    try {
+        fs.mkdirSync(tempPath);
+    } catch(e) {
+        console.log('work dir already exists');
+    }
     app.use('/static', express.static(__dirname + '/static'));
 
     if (keys === undefined) {
@@ -109,6 +115,7 @@ function run(keys) {
             url: referer,
             userAgent: ua
         };
+        var tempStream = fs.createWriteStream(tempPath + '/' + clientid);
 
         var baseStats = {};
 
@@ -130,34 +137,52 @@ function run(keys) {
                 break;
             default:
                 obfuscate(data);
-                if (!db[referer][clientid].peerConnections[data[1]]) {
-                    db[referer][clientid].peerConnections[data[1]] = [];
-                    baseStats[data[1]] = {};
-                }
-                if (data[0] === 'getstats') { // delta-compressed
-                    data[2] = statsDecompressor(baseStats[data[1]], data[2]);
-                    baseStats[data[1]] = JSON.parse(JSON.stringify(data[2]));
-                }
-                if (data[0] === 'getStats' || data[0] === 'getstats') {
-                    data[2] = statsMangler(data[2]);
-                    data[0] = 'getStats';
-                }
-                db[referer][clientid].peerConnections[data[1]].push({
-                    time: new Date(),
-                    type: data[0],
-                    value: data[2]
-                });
+                data.time = new Date().getTime();
+                tempStream.write(JSON.stringify(data) + '\n');
                 break;
             }
         });
 
         client.on('close', function() {
             console.log('closed');
 
-            var client = db[referer][clientid];
-            dump(referer, client, clientid);
-            generateFeatures(referer, client, clientid);
-            delete db[referer][clientid];
+            tempStream.on('finish', function() {
+                fs.readFile(tempStream.path, {encoding: 'utf-8'}, function(err, data) {
Collaborator commented:

At some point I think we will have to move this reading/processing part into a different process that reads the collected files in the background and extracts the features.
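
A minimal sketch of that suggestion, using Node's child_process IPC; the extract.js worker script and the message shape are hypothetical, not part of this PR:

    // sketch: hand the temp file to a background process instead of
    // parsing it inside the websocket server.
    var childProcess = require('child_process');
    var extractor = childProcess.fork('extract.js');

    function scheduleExtraction(path, referer, clientid) {
        extractor.send({path: path, url: referer, clientid: clientid});
    }

    // extract.js would then pick tasks off the message queue:
    // process.on('message', function(task) {
    //     fs.readFile(task.path, {encoding: 'utf-8'}, function(err, data) {
    //         /* parse lines, generate features, unlink the file */
    //     });
    // });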

Owner (author) replied:

Yeah, I noticed that feature generation can take quite long, which skews the server-side timestamps. I'm considering doing the accept in the cluster master and the feature generation in a couple of workers, and also splitting up the big feature loop so that each feature runs in its own setTimeout.
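
A minimal sketch of that setTimeout idea, assuming a hypothetical `features` map of name → extractor function; illustrative only, not the PR's code:

    // sketch: schedule each feature extractor separately so one slow
    // feature cannot block the event loop for long.
    Object.keys(features).forEach(function(name) {
        setTimeout(function() {
            var value = features[name](client);
            console.log('feature', name, '=>', value);
        }, 0);
    });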

+                    if (!err) {
+                        data.split('\n').forEach(function(line) {
+                            if (line.length) {
+                                var data = JSON.parse(line);
+                                var time = new Date(data.time);
+                                delete data.time;
+                                if (!db[referer][clientid].peerConnections[data[1]]) {
+                                    db[referer][clientid].peerConnections[data[1]] = [];
+                                    baseStats[data[1]] = {};
+                                }
+                                if (data[0] === 'getstats') { // delta-compressed
+                                    data[2] = statsDecompressor(baseStats[data[1]], data[2]);
+                                    baseStats[data[1]] = JSON.parse(JSON.stringify(data[2]));
+                                }
+                                if (data[0] === 'getStats' || data[0] === 'getstats') {
+                                    data[2] = statsMangler(data[2]);
+                                    data[0] = 'getStats';
+                                }
+                                db[referer][clientid].peerConnections[data[1]].push({
+                                    time: time,
+                                    type: data[0],
+                                    value: data[2]
+                                });
+                            }
+                        });
+                    }
+                    // we proceed even if there was an error.
+                    var client = db[referer][clientid];
+                    delete db[referer][clientid];
+                    fs.unlink(tempStream.path, function(err, data) {
+                        // we're good...
+                    });
+                    dump(referer, client, clientid);
+                    generateFeatures(referer, client, clientid);
+                });
+            });
+            tempStream.end();
         });
     });
 }
@@ -175,6 +200,14 @@ if (require.main === module && cluster.isMaster) {
     cluster.on('exit', function(worker, code, signal) {
         console.log('worker', worker.process.pid, 'died, restarting');
         cluster.fork();
+
+        // clean up after worker.
+        // TODO: Possibly recover data. For now: throw it away.
+        var path = 'temp-' + worker.process.pid;
+        fs.readdirSync(path).forEach(function(fname) {
+            fs.unlinkSync(path + '/' + fname);
+        });
+        fs.rmdirSync(path);
     });
 } else {
     run();
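
For context on the `// delta-compressed` branch above: `getstats` messages carry only what changed since the previous report, and `statsDecompressor` merges each delta back into the last full report before it is pushed onto the peer connection's event list. The PR does not include that function's implementation; a hedged sketch of what such a merge might look like (the real statsDecompressor in this repo may differ):

    // hypothetical: merge a delta stats report into the previous full report.
    function statsDecompressor(baseStats, delta) {
        Object.keys(delta).forEach(function(reportId) {
            baseStats[reportId] = baseStats[reportId] || {};
            Object.keys(delta[reportId]).forEach(function(key) {
                baseStats[reportId][key] = delta[reportId][key];
            });
        });
        return baseStats;
    }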