# Analysing Piwik Logs

As we proceed, we will stream a JSON array of objects into an LZMA-compressed (xz) file.

In [1]:
var glob = require('glob');
var path = require('path');
var fs = require('fs');
var JSONStream = require('JSONStream');
var EventStream = require('event-stream');
var xz = require('xz');

# Data Selection

We select all timestamped files — we have one stat file per day.

In [2]:
// Raw per-day exports are read from source_dir; cleaned copies are written to target_dir.
var source_dir = '../raw_visit_logs/';
var target_dir = '../deduplicated_logs/';
// List the date-named JSON files of both directories with the same pattern.
// clean_files holds what is already in target_dir (done in a previous run).
var [files, clean_files] = [source_dir, target_dir].map(dir => glob.sync(`${dir}*-*-*.json`));

In [3]:
// Display one matched path to check what the glob returned.
files[1]

"../raw_visit_logs/2017-08-06.json"

In [4]:
// Compare by bare file name (e.g. "2017-08-06.json"): raw and cleaned copies
// live in different directories, so their full paths never match.
// path.basename does not depend on how deep the directories are nested.
var files_json = files.map(file => path.basename(file));
var clean_files_json = clean_files.map(file => path.basename(file));
// A Set gives constant-time membership checks while filtering.
var already_cleaned = new Set(clean_files_json);
// Following array contains only new files (not the ones already cleaned),
// expressed again as paths inside source_dir.
var new_files = files_json
    .filter(name => !already_cleaned.has(name))
    .map(name => source_dir + name);

Some utility functions.

In [5]:
/**
 * Formats a timestamp as the UTC calendar day of its ISO-8601 representation.
 *
 * @param {number} ts - Timestamp; it is multiplied by 1000 before being handed to Date.
 * @returns {string} The part of the ISO string before the 'T' (e.g. "1970-01-01").
 * @throws {RangeError} When the resulting Date is invalid (raised by toISOString).
 */
var dateFromTimestamp = (ts) => {
    // ts is treated as seconds: scale it to the milliseconds Date expects.
    var isoString = new Date(ts * 1000).toISOString();
    // Keep only the date half of "<date>T<time>Z".
    var [day] = isoString.split('T');
    return day;
};

In [6]:
/**
 * Reduces a visit's action list to { url, type, keyword } records and drops
 * search actions that are superseded by the action that follows them.
 *
 * A 'search' action is removed when the next action has a keyword that starts
 * with this action's keyword (presumably intermediate search-as-you-type
 * queries — confirm against the raw logs). Every other action is kept.
 *
 * @param {Array<Object>} actionDetails - Raw actions of one visit; anything
 *     that is not an array is treated as "no actions".
 * @returns {Array<{url: *, type: *, keyword: *}>} The filtered, slimmed actions.
 */
var dedupeSearchActions = (actionDetails) => {
    // A visit without an actionDetails array would otherwise throw on .map().
    var rawActions = Array.isArray(actionDetails) ? actionDetails : [];
    var actions = rawActions.map(({ url, type, siteSearchKeyword: keyword }) => ({
        url,
        type,
        keyword,
    }));

    return actions.filter(({ type, keyword }, i) => {
        var next = actions[i + 1];
        var nextKeyword = next ? next.keyword : null;

        return !(type === 'search' && nextKeyword && nextKeyword.indexOf(keyword) === 0);
    });
};

// For every new raw file: parse the top-level JSON array element by element,
// reduce each visit to { idVisit, actions, date }, and write the result as a
// JSON array to a file of the same name in target_dir.
// `all` holds the write stream of each file.
var all = new_files.map(file => {
    // .pipe() does not forward errors, so every stage gets its own listener;
    // without one, a failure is an uncaught 'error' event.
    // NOTE(review): after a failure a partial output file may remain in
    // target_dir and be taken for an already-cleaned file on the next run.
    var reportError = (stage) => (err) => console.error(`${file} failed (${stage}): ${err.message}`);

    return fs.createReadStream(file)
        .on('error', reportError('read'))
        .pipe(JSONStream.parse('*'))
        .on('error', reportError('parse'))
        .pipe(EventStream.map(({ idVisit, actionDetails, serverDate }, callback) => {
            callback(null, {
                idVisit,
                actions: dedupeSearchActions(actionDetails),
                date: serverDate,
            });
        }))
        .on('error', reportError('transform'))
        .pipe(JSONStream.stringify())
        .on('error', reportError('stringify'))
        .pipe(fs.createWriteStream(target_dir + path.basename(file)))
        .on('error', reportError('write'))
        .on('finish', () => console.log(`${file} done!`));
});

../raw_visit_logs/2017-12-24.json done!
../raw_visit_logs/2017-12-25.json done!
../raw_visit_logs/2017-12-31.json done!
../raw_visit_logs/2018-01-01.json done!
../raw_visit_logs/2017-12-23.json done!
../raw_visit_logs/2017-12-30.json done!
../raw_visit_logs/2018-01-07.json done!
../raw_visit_logs/2017-12-26.json done!
../raw_visit_logs/2017-12-29.json done!
../raw_visit_logs/2017-12-28.json done!
../raw_visit_logs/2017-12-27.json done!
../raw_visit_logs/2018-01-06.json done!
../raw_visit_logs/2017-12-22.json done!
../raw_visit_logs/2018-01-02.json done!
../raw_visit_logs/2018-01-03.json done!
../raw_visit_logs/2018-01-04.json done!
../raw_visit_logs/2017-12-21.json done!
../raw_visit_logs/2018-01-09.json done!
../raw_visit_logs/2018-01-08.json done!
../raw_visit_logs/2018-01-05.json done!
../raw_visit_logs/2018-01-10.json done!
