## Creation of datasets from original data

In [1]:
DATASET_FILE = '../datasets/30MDataset/relations/sessions.idomaar'
DATASET_PROCESSED = '../Datasets/30M/raw-interactions/sessions_raw.hdf'
DATASET_ROOT= '../Datasets/30M/'
from itertools import (takewhile,repeat)
from tqdm import tqdm
import pandas as pd
import json
def rawincount(filename):
    # Fastest lines counter ever!!!
    
    f = open(filename, 'rb')
    bufgen = takewhile(lambda x: x, (f.raw.read(1024*1024) for _ in repeat(None)))
    return sum( buf.count(b'\n') for buf in bufgen)


def preprocess_playlists():
    line_count = rawincount(DATASET_FILE)
    columns = ['user_id','session_id','created_at','item_id','item_playstart', 'item_playtime']
    with open(DATASET_FILE, 'rt', buffering=64*1024*1024, encoding='utf8') as dataset:
        fields = []
        for line in tqdm(dataset,total=line_count):

            line = line.split('\t')
            session_id = int(line[1])
            created_at = int(line[2])
            dict_data = json.loads(str(line[3]).split(' ')[1])
            # process dictionary
            user_id = dict_data['subjects'][0]['id']
            objects = [obj['id'] for obj in dict_data['objects']]
            objects_playtime = [obj['playtime'] for obj in dict_data['objects']]
            objects_playstart = [obj['playstart'] for obj in dict_data['objects']]
            for idx in range(len(objects)):
                fields.append([user_id, session_id, created_at, objects[idx], objects[idx]+objects_playstart[idx], objects_playtime[idx]])
            
        df = pd.DataFrame(fields, columns=columns)
        df.to_hdf(DATASET_PROCESSED, key='raw')
    del df
    del fields


In [2]:
DATASET_FILE_TIANCHI = '../datasets/Repeat Buyers Prediction-Challenge the Baseline/data_format1/user_log_format1.csv'
DATASET_FILE_TIANCHI_PRE = '../datasets/Repeat Buyers Prediction-Challenge the Baseline/data_format1/user_log_format1_pre.csv'
DATASET_PROCESSED_TIANCHI = '../Datasets/Tianchi/raw-interactions/sessions_raw.hdf'
DATASET_ROOT_TIANCHI= '../Datasets/Tianchi/'
from itertools import (takewhile,repeat)
from tqdm import tqdm
from datetime import datetime
import pandas as pd
def rawincount(filename):
    from itertools import takewhile
    # Fastest lines counter ever!!!
    f = open(filename, 'rb')
    bufgen = takewhile(lambda x: x, (f.raw.read(1024*1024) for _ in repeat(None)))
    return sum( buf.count(b'\n') for buf in bufgen)
def select_colums_from_source():
    source = pd.read_csv(DATASET_FILE_TIANCHI,usecols=['user_id','item_id','time_stamp','action_type'])
    source.sort_values(['user_id','time_stamp'],inplace=True)
    source.to_csv(DATASET_FILE_TIANCHI_PRE,index=False)
    del source
def preprocess_log():
    
    line_count = rawincount(DATASET_FILE_TIANCHI_PRE)
    columns = ['user_id','item_id','interaction_type','created_at']
    with open(DATASET_FILE_TIANCHI_PRE, 'rt', buffering=64*1024*1024*8, encoding='utf8') as dataset:
        fields = []
        dataset.__next__()
        line_number= 0
        date_ = '----'
        offset=0
        for line in tqdm(dataset,total=line_count):
            line_number +=1
            line = line.split(',')
            user_id = line[0]
            timestamp = line[2]
            item_id = line[1]
            action_type = line[3].strip()
        
            if date_ != timestamp:
                offset = 0
                date_ = timestamp
                ts = datetime.strptime(timestamp, '%m%d').timestamp()
            else:
                offset += 1
            
            new_line = [user_id, item_id, action_type,ts+offset]
            fields.append(new_line)
            if line_number % 1e7 == 0:
                df = pd.DataFrame(fields, columns=columns)
                df.to_hdf(DATASET_PROCESSED_TIANCHI, key='raw',mode='a')
                fields=[]
        #saving last slice
        df = pd.DataFrame(fields, columns=columns)
        df.to_hdf(DATASET_PROCESSED_TIANCHI, key='raw',mode='a')
    del df
    del fields
        


## Creating Data Logs from raw data

In [3]:
preprocess_playlists()
preprocess_log()


100%|█████████▉| 54925330/54925331 [03:27<00:00, 264787.87it/s]


## Filtering data according to baseline critheria

In [7]:
# Inspired on the original data filtering from HGRU4REC
import subprocess
import numpy as np
def make_sessions(data, session_th=30 * 60, is_ordered=False, user_key='user_id', item_key='item_id', time_key='ts'):
    """Assigns session ids to the events in data without grouping keys"""
    if not is_ordered:
        # sort data by user and time
        data.sort_values(by=[user_key, time_key], ascending=True, inplace=True)
    # compute the time difference between queries
    tdiff = np.diff(data[time_key].values)
    # check which of them are bigger then session_th
    split_session = tdiff > session_th
    split_session = np.r_[True, split_session]
    # check when the user chenges is data
    new_user = data['user_id'].values[1:] != data['user_id'].values[:-1]
    new_user = np.r_[True, new_user]
    # a new sessions stars when at least one of the two conditions is verified
    new_session = np.logical_or(new_user, split_session)
    # compute the session ids
    session_ids = np.cumsum(new_session)
    data['session_id'] = session_ids
    return data

    
def last_session_out_split(data,
                           user_key='user_id',
                           item_key='item_id',
                           session_key='session_id',
                           time_key='ts',
                           clean_test=True,
                           min_session_length=2):
    """
    last-session-out split
    assign the last session of every user to the test set and the remaining ones to the training set
    """
    sessions = data.sort_values(by=[user_key, time_key]).groupby(user_key)[session_key]
    last_session = sessions.last()
    train = data[~data.session_id.isin(last_session.values)].copy()
    test = data[data.session_id.isin(last_session.values)].copy()
    if clean_test:
        train_items = train[item_key].unique()
        test = test[test[item_key].isin(train_items)]
        #  remove sessions in test shorter than min_session_length
        slen = test[session_key].value_counts()
        good_sessions = slen[slen >= min_session_length].index
        test = test[test[session_key].isin(good_sessions)].copy()
    return train, test

def last_session_out_split_policy_train(data,
                           user_key='user_id',
                           item_key='item_id',
                           session_key='session_id',
                           time_key='ts',
                           clean_test=True,
                           min_session_length=2):
    """
    last-session-out split
    assign the last session of every user to the test set and the remaining ones to the training set
    """
    sessions = data.sort_values(by=[user_key, time_key]).groupby(user_key)[session_key]
    last_session = sessions.last()
    train = data[~data.session_id.isin(last_session.values)].copy()
    test = data[data.session_id.isin(last_session.values)].copy()
    if clean_test:
        train_items = train[item_key].unique()
        test = test[test[item_key].isin(train_items)]
        #  remove sessions in test shorter than min_session_length
        slen = test[session_key].value_counts()
        good_sessions = slen[slen >= min_session_length].index
        test = test[test[session_key].isin(good_sessions)].copy()
    return train_env, train_policy, test

def last_n_days_out_split(data, n=1,
                          user_key='user_id',
                          item_key='item_id',
                          session_key='session_id',
                          time_key='ts',
                          clean_test=True,
                          min_session_length=2):
    """
    last n-days out split
    assign the sessions in the last n days to the test set and remaining to the training one
    """
    DAY = 24 * 60 * 60
    data.sort_values(by=[user_key, time_key], inplace=True)
    sessions_start = data.groupby(session_key)[time_key].agg('min')
    end_time = data[time_key].max()
    test_start = end_time - n * DAY
    train = data[data.session_id.isin(sessions_start[sessions_start < test_start].index)].copy()
    test = data[data.session_id.isin(sessions_start[sessions_start >= test_start].index)].copy()
    if clean_test:
        train_items = train[item_key].unique()
        test = test[test[item_key].isin(train_items)]
        #  remove sessions in test shorter than min_session_length
        slen = test[session_key].value_counts()
        good_sessions = slen[slen >= min_session_length].index
        test = test[test[session_key].isin(good_sessions)].copy()
    return train, test

def ratio_session_split(data, 
                        keep_ratio=0.5,
                        user_key='user_id',
                        item_key='item_id',
                        session_key='session_id',
                        time_key='ts',
                        clean_test=True,
                        min_session_length=2):
    sessions = data.sort_values(by=[user_key, time_key]).groupby(user_key)[session_key]
    last_session = sessions.last()
    train = data[~data.session_id.isin(last_session.values)].copy()
    test = data[data.session_id.isin(last_session.values)].copy()
    if clean_test:
        train_items = train[item_key].unique()
        test = test[test[item_key].isin(train_items)]
        #  remove sessions in test shorter than min_session_length
        slen = test[session_key].value_counts()
        good_sessions = slen[slen >= min_session_length].index
        test = test[test[session_key].isin(good_sessions)].copy()
    
    return train_1 ,test_1, train_2, test_2



def create_dataset(interactions, mode=['raw','full'],dataset_dir=''):
    print('Mode: '+'-'.join(mode))
    print('Filtering data')
    
    # drop duplicate interactions within the same session
    

    if mode[0] == 'dense':
        interactions.drop_duplicates(subset=['item_id', 'session_id'], keep='first', inplace=True)
    # keep items with >=20 interactions
    item_pop = interactions.item_id.value_counts()
    good_items = item_pop[item_pop >= 20].index
    inter_dense = interactions[interactions.item_id.isin(good_items)]
    # remove sessions with length < 3
    session_length = inter_dense.session_id.value_counts()
    good_sessions = session_length[session_length >= 3].index
    inter_dense = inter_dense[inter_dense.session_id.isin(good_sessions)]
    # let's keep only returning users (with >= 5 sessions) and remove overly active ones (>=200 sessions)
    sess_per_user = inter_dense.groupby('user_id')['session_id'].nunique()
    good_users = sess_per_user[(sess_per_user >= 5) & (sess_per_user < 200)].index
    #selection 1000 users for small dataset
    if mode[1] == 'small':
        good_users = good_users[:1000]
    inter_dense = inter_dense[inter_dense.user_id.isin(good_users)]



    print('Filtered data:')
    print('Num items: {}'.format(inter_dense.item_id.nunique()))
    print('Num users: {}'.format(inter_dense.user_id.nunique()))
    print('Num sessions: {}'.format(inter_dense.session_id.nunique()))

    print('Partitioning data')
    # last-session-out partitioning
    train_full_sessions, test_sessions = last_session_out_split(inter_dense,
                                                                user_key='user_id',
                                                                item_key='item_id',
                                                                session_key='session_id',
                                                                time_key='created_at',
                                                                clean_test=True)
    train_valid_sessions, valid_sessions = last_session_out_split(train_full_sessions,
                                                                  user_key='user_id',
                                                                  item_key='item_id',
                                                                  session_key='session_id',
                                                                  time_key='created_at',
                                                                  clean_test=True)

    print('Write to disk')
    # write to disk
    mode_folder = dataset_dir+'-'.join(mode)
    subprocess.call(['mkdir', '-p', mode_folder+'/last-session-out'])
    train_full_sessions.to_hdf(mode_folder+'/last-session-out/sessions.hdf','train')
    test_sessions.to_hdf(mode_folder+'/last-session-out/sessions.hdf','test')
    train_valid_sessions.to_hdf(mode_folder+'/last-session-out/sessions.hdf', 'valid_train')
    valid_sessions.to_hdf(mode_folder+'/last-session-out/sessions.hdf','valid_test')

    

## Creating 30M Dataset

In [9]:
#reading data from origial source
#preprocess_playlists()
# Reading raw data

interactions = pd.read_hdf(DATASET_PROCESSED, key='raw')
create_dataset(interactions, mode=['raw','full'],dataset_dir=DATASET_ROOT)
create_dataset(interactions, mode=['raw','small'],dataset_dir=DATASET_ROOT)
create_dataset(interactions, mode=['dense','full'],dataset_dir=DATASET_ROOT)
create_dataset(interactions, mode=['dense','small'],dataset_dir=DATASET_ROOT)

Mode: raw-full
Filtering data
Filtered data:
Num items: 232399
Num users: 37667
Num sessions: 1213283
Partitioning data
Write to disk
Mode: raw-small
Filtering data
Filtered data:
Num items: 103327
Num users: 1000
Num sessions: 32135
Partitioning data
Write to disk
Mode: dense-full
Filtering data
Filtered data:
Num items: 193726
Num users: 36832
Num sessions: 1121096
Partitioning data
Write to disk
Mode: dense-small
Filtering data
Filtered data:
Num items: 97595
Num users: 1000
Num sessions: 30163
Partitioning data
Write to disk


## Creating Tianchi Dataset

In [8]:
interactions = pd.read_hdf(DATASET_PROCESSED_TIANCHI, key='raw')
print('Removing negative events')
negative_events = [4]
interactions = interactions[~interactions.interaction_type.isin(negative_events)].copy()
print('Building sessions')
#partition interactions into sessions with 30-minutes idle time
interactions = make_sessions(interactions, session_th=60 * 60, time_key='created_at', is_ordered=False)


create_dataset(interactions, mode=['raw','full'],dataset_dir=DATASET_ROOT_TIANCHI)
create_dataset(interactions, mode=['raw','small'],dataset_dir=DATASET_ROOT_TIANCHI)
create_dataset(interactions, mode=['dense','full'],dataset_dir=DATASET_ROOT_TIANCHI)
create_dataset(interactions, mode=['dense','small'],dataset_dir=DATASET_ROOT_TIANCHI)


Removing negative events
Building sessions
Mode: raw-full
Filtering data
Filtered data:
Num items: 49805
Num users: 19992
Num sessions: 238559
Partitioning data
Write to disk
Mode: raw-small
Filtering data
Filtered data:
Num items: 27670
Num users: 1000
Num sessions: 11995
Partitioning data
Write to disk
Mode: dense-full
Filtering data
Filtered data:
Num items: 31497
Num users: 12799
Num sessions: 137429
Partitioning data
Write to disk
Mode: dense-small
Filtering data
Filtered data:
Num items: 23367
Num users: 1000
Num sessions: 10928
Partitioning data
Write to disk
