In [12]:
from glob import glob
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import pickle
import datetime
import re
import os

In [14]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [15]:
import random

Data taken from the article http://ceur-ws.org/Vol-1703/paper12.pdf

The data itself: http://fc.isima.fr/~kahngi/cez13.zip

In [4]:
# Progress tracker from https://github.com/alexanderkuk/log-progress
def log_progress(sequence, every=None, size=None):
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                every = int(size / 200)     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{index} / ?'.format(index=index)
                else:
                    progress.value = index
                    label.value = u'{index} / {size}'.format(
                        index=index,
                        size=size
                    )
            yield record
    except:
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = str(index or '?')

In [5]:
# Auxillary function to assign site idexes
def sitefreq(sites, site_freq={}):
    if len(site_freq):
        site_id = max(site_freq.items(), key=lambda t: t[1][0])[1][0] + 1
    else:
        site_id = 1
        
    for site in sites:
        if site not in site_freq:
            site_freq[site] = [site_id, 1]
            site_id += 1
        else:
            site_freq[site][1] += 1
            
    return site_freq

In [6]:
# Assigns every website a unique index and counts the number of its appearance
def prepare_site_indexes(csv_files_mask, user_filter=[]):
    for userfile in log_progress(glob(csv_files_mask), every=1):
        user_id = int(re.search('cat(\d+)\.csv', userfile).group(1))
        if len(user_filter) and user_id not in user_filter: continue     
        
        data = pd.read_csv(userfile, sep=";", header=None, \
                           names=['userid', 'timestamp', 'site'])      
        site_freq = sitefreq(data.site)
    return site_freq

In [7]:
data_folder = "/home/ubuntu/identifyme/data/dispoSite/30users/"
out_data = "/home/ubuntu/identifyme/data/"

In [8]:
%%time
site_index = prepare_site_indexes(data_folder +'cat*.csv')
pd.DataFrame([[v[0], k]for k, v in sorted(site_index.items(), \
            key=lambda x: x[1][0])])\
            .to_csv(out_data + 'site_indexes.txt', header=False, index=False)

CPU times: user 552 ms, sys: 78.6 ms, total: 631 ms
Wall time: 596 ms


In [9]:
#Randomly increment a date (year, month, day) preserving the weekday
weekday = lambda x: x.isoweekday()

def increment_date(date, seed):
    one_day = datetime.timedelta(days=1)
    one_month = datetime.timedelta(days=30)
    one_year = datetime.timedelta(days=365)
    random.seed(seed)
    r = random.sample(range(1, 10), 3)
    
    newdate = date + r[0]*one_year + r[1]*one_month + r[2]*one_day
    while weekday(newdate) != weekday(date):
        newdate += one_day
    return newdate

In [16]:
# Split original dataset into train and test
def prepare_train_test_set(csv_files_mask, dest_folder, ratio=0.7, user_filter=[]):
    for userfile in log_progress(glob(csv_files_mask), every=1):
        # Reading every file and sorting by timestamp
        user_id = int(re.search('cat(\d+)\.csv', userfile).group(1))
        if len(user_filter) and user_id not in user_filter: continue
            
        data = pd.read_csv(userfile, sep=";", header=None, \
                           names=['userid', 'timestamp', 'site'], \
                           parse_dates=[1], infer_datetime_format=True)
        data.sort_values("timestamp", inplace=True)
        data.reset_index(drop=True, inplace=True)

        # Splitting into train and test data using ratio
        train_idx = int(len(data)*ratio)
        valid_idx = int(len(data) * (ratio + (1-ratio)/2))
        
        train_data = pd.DataFrame(data.iloc[:train_idx])[["timestamp", "site"]]        
        valid_data = pd.DataFrame(data.iloc[train_idx:valid_idx])[["timestamp", "site"]]
        test_data = pd.DataFrame(data.iloc[valid_idx:])[["timestamp", "site"]]
        valid_data.reset_index(drop=True, inplace=True)
        test_data.reset_index(drop=True, inplace=True)

        # Randomly incrementing dates within test
        seed = random.randint(1, 5000)
        valid_data["timestamp"] = valid_data.apply(lambda row: increment_date(row["timestamp"], seed), axis=1)
        test_data["timestamp"] = test_data.apply(lambda row: increment_date(row["timestamp"], seed), axis=1)
        
        try:
            os.makedirs(dest_folder + '/train')
            os.makedirs(dest_folder + '/valid')
            os.makedirs(dest_folder + '/test')
        except OSError:
            pass
        
        train_data.to_csv("{}/train/user{}.csv".format(dest_folder, user_id), index=False)
        valid_data.to_csv("{}/valid/user{}.csv".format(dest_folder, user_id), index=False)
        test_data.to_csv("{}/test/user{}.csv".format(dest_folder, user_id), index=False)

In [17]:
%%time
prepare_train_test_set(data_folder + 'cat*.csv', out_data)

CPU times: user 22.7 s, sys: 115 ms, total: 22.8 s
Wall time: 22.7 s


## Create train and test session files

In [18]:
def get_site_index(csv_file):
    site_index = pd.read_csv(csv_file, header=None, names=['siteid', 'site'], index_col=1)
    site_dic = site_index.to_dict()
    return site_dic["siteid"]

In [19]:
# Creates a file with sessions out of plain logs with websites
def prepare_sessions_set(csv_files_mask, out_folder, site_index_path="", set_type = "train",
                        session_length=10, window_size=10, session_time = 30,\
                        sort_in_session=False, remove_dups = False, shuffle=True):
    
    col_names = ["site"+str(c) for c in range(1,session_length+1)] + \
                ["time"+str(c) for c in range(1,session_length+1)] + ["user_id"]
    
    ses_data = np.array([np.zeros(len(col_names))])
    session_num = 0

    site_index = get_site_index(site_index_path)
    
    files = glob(csv_files_mask)
        
    for userfile in log_progress(files, every=1):
        user_id = re.search('user(\d+)\.csv', userfile).group(1)
        
        data = pd.read_csv(userfile, \
                           parse_dates=[0], infer_datetime_format=True)
        data.sort_values("timestamp", inplace=True)
        data.reset_index(drop=True, inplace=True)

        session_hash = {}    
        
        session = []
        timestamps = []
        next_session = []
        next_t_session = []
        next_s_session = []
        next_timestamps = []

        for i in range (0, len(data), window_size):  
            session += list(data[i:i+session_length].site.apply(lambda x: site_index[x]))
            timestamps += list(data[i:i+session_length].timestamp)

            if window_size < session_length and sort_in_session:
                ses_ts_zip = sorted(set(zip(session, timestamps)), key = lambda t: t[1])
                session, timestamps = zip(*ses_ts_zip)
                session = list(session)
                timestamps = list(timestamps)

            while ((len(session) >= session_length) or (not len(data[i+window_size:]) and len(session))):
                time_diff = [(timestamps[n+1] - timestamps[n]).total_seconds() for n in range(0, len(session)-1)]
                session_timespan = (max(timestamps) - min(timestamps)).total_seconds()
                next_session = []
                next_timestamps = []

                while session_timespan > session_time*60 or len(session) > session_length:
                    next_session.insert(0, session.pop())
                    next_timestamps.insert(0, timestamps.pop())
                    time_diff = [(timestamps[n+1] - timestamps[n]).total_seconds() for n in range(0, len(session)-1)]
                    session_timespan = (max(timestamps) - min(timestamps)).total_seconds()

                session = tuple(session)

                if session not in session_hash or not remove_dups:                      
                    session_hash[session] = 1
                    session = list(session)

                    session.extend([0] * (session_length - len(session)) + \
                                   timestamps + [0] * (session_length - len(timestamps)) + [user_id])

                    ses_data = np.concatenate((ses_data, np.array([session])))
                    session_num += 1

                session = next_session
                timestamps = next_timestamps

        if len(next_session):
            print("ERROR! next_session left!")
            print(session)
            return None

    
    ses_data = np.delete(ses_data, 0, 0)
    if shuffle: np.random.shuffle(ses_data)
    ses_data = pd.DataFrame(ses_data, columns=col_names)
    
    ses_data.to_csv(out_folder + set_type + '_sessions.csv', index=False)
    
    return ses_data

In [20]:
%%time
train_data = prepare_sessions_set(out_data + 'train/*', out_folder=out_data, \
                site_index_path=out_data + 'site_indexes.txt', remove_dups=True, shuffle=True, set_type="train")

CPU times: user 3min 33s, sys: 24.5 s, total: 3min 57s
Wall time: 3min 57s


In [21]:
%%time
valid_data = prepare_sessions_set(out_data + 'valid/*', out_folder=out_data, \
                site_index_path=out_data + 'site_indexes.txt', remove_dups=True, shuffle=True, set_type="valid")
test_data = prepare_sessions_set(out_data + 'test/*', out_folder=out_data, \
                site_index_path=out_data + 'site_indexes.txt', remove_dups=True, shuffle=True, set_type="test")

CPU times: user 26.5 s, sys: 72.2 ms, total: 26.5 s
Wall time: 26.5 s
