# Analysing Piwik Logs

We will stream a JSON array of objects as we proceed into a LZMH compressed file.

In [1]:
var glob = require('glob');
var path = require('path');
var fs = require('fs');
var JSONStream = require('JSONStream');
var EventStream = require('event-stream');
var xz = require('xz');

# Data Selection

We select all timestamped files — we have one stat file per day.

In [52]:
var source_dir = 'raw_visit_logs/';
var target_dir = 'deduplicated_logs/';
var files = glob.sync(source_dir + '*-*-*.json');
var clean_files = glob.sync(target_dir + '*-*-*.json'); // done in a previous run

In [56]:
files[1]

"raw_visit_logs/2017-08-06.json"

In [57]:
var files_json = files.map(i => i.split('/')[1]);
var clean_files_json = clean_files.map(i => i.split('/')[1]);
var new_files = files_json.filter(x => clean_files_json.indexOf(x) == -1).map(x => 'raw_visit_logs' + clean_files_json )

In [58]:
new_files

[Array] ["raw_visit_logs2017-08-05.json,2017-08-06.json,2017-08-07.json,2017-08-08.json,2017-08-09.json,2017-08-10.json,2017-08-11.json,2017-08-12.json,2017-08-13.json,2017-08-14.json,2017-08-15.json,2017-08-16.json,2017-08-17.json,2017-08-18.json,2017-08-19.json,2017-08-20.json,2017-08-21.json,2017-08-22.json,2017-08-23.json,2017-08-24.json,2017-08-25.json,2017-08-26.json,2017-08-27.json,2017-08-28.json,2017-08-29.json,2017-08-30.json,2017-08-31.json,2017-09-01.json,2017-09-02.json,2017-09-03.json,2017-09-04.json,2017-09-05.json,2017-09-06.json,2017-09-07.json,2017-09-08.json,2017-09-09.json,2017-09-10.json,2017-09-11.json,2017-09-12.json,2017-09-13.json,2017-09-14.json,2017-09-15.json,2017-09-16.json,2017-09-17.json,2017-09-18.json,2017-09-19.json,2017-09-20.json,2017-09-21.json,2017-09-22.json,2017-09-23.json,2017-09-24.json,2017-09-25.json,2017-09-26.json,2017-09-27.json,2017-09-28.json,2017-09-29.json,2017-09-30.json,2017-10-01.json,2017-10-02.json,2017-10-03.json,2017-10-04.json,

In [28]:
'ae/az'.split('/')[0]

"ae"

Some utility functions.

In [3]:
var dateFromTimestamp = (ts) => new Date(ts * 1000).toISOString().split('T')[0];

In [4]:
var all = files.map(file => {
    return fs.createReadStream(file)
        .pipe(JSONStream.parse('*'))
        .pipe(EventStream.map(({ idVisit, actionDetails, serverDate }, callback) => {
            var actions = actionDetails
                .map(({ url, type, siteSearchKeyword:keyword, timestamp }) => ({
                  url,
                  type,
                  keyword,
                }))
                .filter(({type, keyword}, i, allActions) => {
                    const nextKeyword = allActions[i+1] ? allActions[i+1].keyword : null;

                    return !(type === 'search' && nextKeyword && nextKeyword.indexOf(keyword) === 0);
                });

            var visitClean = {
                idVisit,
                actions,
                date: serverDate,
            };
        

            callback(null, visitClean);
        }))
        .pipe(JSONStream.stringify())
        .pipe(fs.createWriteStream(target_dir + path.basename(file)))
        .on('finish', () => console.log(`${file} done!`));
});

raw_visit_logs/2017-08-12.json done!
raw_visit_logs/2017-08-13.json done!
raw_visit_logs/2017-08-20.json done!
raw_visit_logs/2017-08-06.json done!
raw_visit_logs/2017-08-05.json done!
raw_visit_logs/2017-08-27.json done!
raw_visit_logs/2017-08-26.json done!
raw_visit_logs/2017-09-02.json done!
raw_visit_logs/2017-08-19.json done!
raw_visit_logs/2017-09-03.json done!
raw_visit_logs/2017-08-15.json done!
raw_visit_logs/2017-09-09.json done!
raw_visit_logs/2017-09-16.json done!
raw_visit_logs/2017-12-02.json done!
raw_visit_logs/2017-12-09.json done!
raw_visit_logs/2017-10-07.json done!
raw_visit_logs/2017-09-10.json done!
raw_visit_logs/2017-09-23.json done!
raw_visit_logs/2017-08-14.json done!
raw_visit_logs/2017-09-17.json done!
raw_visit_logs/2017-10-28.json done!
raw_visit_logs/2017-10-08.json done!
raw_visit_logs/2017-09-24.json done!
raw_visit_logs/2017-10-21.json done!
raw_visit_logs/2017-10-14.json done!
raw_visit_logs/2017-12-10.json done!
raw_visit_logs/2017-09-30.json done!
r