# Analysing Piwik Logs

As we proceed, we will stream a JSON array of objects into an LZMA-compressed (xz) file.

In [1]:
var glob = require('glob');
var path = require('path');
var fs = require('fs');
var JSONStream = require('JSONStream');
var EventStream = require('event-stream');
var xz = require('xz');

# Data Selection

We select all timestamped files — we have one stat file per day.

In [2]:
// Directory the raw visit logs are read from (one stat file per day).
var source_dir = 'raw_visit_logs/';
// Directory the processed copies are written to.
var target_dir = 'deduplicated_logs/';
// Every file in the source directory whose name matches `*-*-*.json`.
var files = glob.sync(`${source_dir}*-*-*.json`);

Some utility functions.

In [3]:
/**
 * Convert a Unix timestamp expressed in seconds into its UTC calendar date.
 *
 * @param {number} ts - Seconds since the Unix epoch.
 * @returns {string} The part of the ISO-8601 string that precedes the `T`
 *   separator (e.g. `1970-01-01`).
 * @throws {RangeError} When `ts` does not yield a valid date.
 */
var dateFromTimestamp = (ts) => {
    var milliseconds = ts * 1000;
    var [datePart] = new Date(milliseconds).toISOString().split('T');

    return datePart;
};

In [4]:
/**
 * Reduce one parsed visit to the fields kept in the output and drop
 * superseded search actions.
 *
 * Each action is narrowed to `{ url, type, keyword }`, where `keyword` is the
 * action's `siteSearchKeyword`. A `search` action is then discarded when the
 * action immediately after it (in the unfiltered list) carries a keyword that
 * starts with this action's keyword, so in a run such as `a`, `ab`, `abc`
 * only `abc` survives.
 *
 * @param {Object} visit - A visit record as read from the source file.
 * @param {*} visit.idVisit - Copied unchanged to the result.
 * @param {Array<Object>} visit.actionDetails - Actions recorded for the visit.
 * @param {*} visit.serverDate - Copied to the result as `date`.
 * @returns {{ idVisit: *, actions: Array<{ url: *, type: *, keyword: * }>, date: * }}
 */
var cleanVisit = ({ idVisit, actionDetails, serverDate }) => {
    var actions = actionDetails
        .map(({ url, type, siteSearchKeyword: keyword }) => ({ url, type, keyword }))
        .filter(({ type, keyword }, i, allActions) => {
            // Look ahead in the full, not-yet-filtered list of actions.
            const next = allActions[i + 1];
            const nextKeyword = next ? next.keyword : null;

            return !(type === 'search' && nextKeyword && nextKeyword.startsWith(keyword));
        });

    return { idVisit, actions, date: serverDate };
};

/**
 * Start one streaming pipeline per source file: parse the elements of the
 * file's root JSON value, clean each visit, serialise the results again and
 * write them to a file of the same base name inside `target_dir`.
 *
 * Each entry of `all` is the write stream that ends the corresponding
 * pipeline; a message is logged when that stream finishes.
 */
var all = files.map((file) => {
    return fs.createReadStream(file)
        .pipe(JSONStream.parse('*'))
        .pipe(EventStream.map((visit, callback) => {
            // Block body on purpose: the mapper must keep returning undefined.
            callback(null, cleanVisit(visit));
        }))
        .pipe(JSONStream.stringify())
        .pipe(fs.createWriteStream(target_dir + path.basename(file)))
        .on('finish', () => console.log(`${file} done!`));
});

raw_visit_logs/2017-12-02.json done!
raw_visit_logs/2017-12-09.json done!
raw_visit_logs/2017-12-10.json done!
raw_visit_logs/2017-12-03.json done!
raw_visit_logs/2017-11-04.json done!
raw_visit_logs/2017-11-18.json done!
raw_visit_logs/2017-11-11.json done!
raw_visit_logs/2017-11-05.json done!
raw_visit_logs/2017-11-19.json done!
raw_visit_logs/2017-11-26.json done!
raw_visit_logs/2017-11-12.json done!
raw_visit_logs/2017-11-25.json done!
raw_visit_logs/2017-12-08.json done!
raw_visit_logs/2017-11-30.json done!
raw_visit_logs/2017-12-06.json done!
raw_visit_logs/2017-12-01.json done!
raw_visit_logs/2017-12-07.json done!
raw_visit_logs/2017-11-24.json done!
raw_visit_logs/2017-12-11.json done!
raw_visit_logs/2017-11-10.json done!
raw_visit_logs/2017-11-29.json done!
raw_visit_logs/2017-11-09.json done!
raw_visit_logs/2017-11-23.json done!
raw_visit_logs/2017-11-28.json done!
raw_visit_logs/2017-11-27.json done!
raw_visit_logs/2017-11-06.json done!
raw_visit_logs/2017-11-17.json done!
r