In [1]:
import os
import numpy as np
import pandas as pd

Data source: the RecSys Challenge 2015 (YooChoose) click dataset — http://2015.recsyschallenge.com/challenge.html

In [2]:
# Load the click log. The file has no header row, so the first three
# columns are selected by position and named explicitly.
path = os.path.join('yoochoose-dataFull', 'yoochoose-clicks.dat')
data = pd.read_csv(
    path,
    header = None,
    usecols = [0, 1, 2],
    names = ['SessionId', 'TimeStamp', 'ItemId']
)

# Convert the timestamp strings into datetime values using an explicit format.
data['TimeStamp'] = pd.to_datetime(
    data['TimeStamp'],
    format = '%Y-%m-%dT%H:%M:%S.%fZ'
)
print(data.shape)
data.head()

(33003944, 3)


Unnamed: 0,SessionId,TimeStamp,ItemId
0,1,2014-04-07 10:51:09.277,214536502
1,1,2014-04-07 10:54:09.868,214536500
2,1,2014-04-07 10:54:46.998,214536506
3,1,2014-04-07 10:57:00.306,214577561
4,2,2014-04-07 13:56:37.614,214662742


In [3]:
# Filter out infrequent items (fewer than 5 events), then drop sessions
# left with a single event (presumably a misclick).
# Series.isin is used for the membership masks because np.in1d is
# deprecated as of NumPy 2.0.
item_supports = data.groupby('ItemId').size()
data = data[data['ItemId'].isin(item_supports[item_supports >= 5].index)]

# Session lengths are computed after the item filter, so sessions shortened
# to one event by that filter are removed as well.
session_lengths = data.groupby('SessionId').size()
data = data[data['SessionId'].isin(session_lengths[session_lengths > 1].index)]

In [36]:
# Shape of `data` at this point in the notebook; when the cells are run
# top to bottom this is the frame produced by the filtering cell above.
data.shape

(31710571, 3)

In [41]:
def train_test_session(data, train_path, test_path):
    """
    Split events into train/test sets by session end time and write both as CSV.

    A session goes to the test set when its latest event falls within one day
    of the latest timestamp in `data`; every other session goes to the train
    set. Test events whose item never appears in the train set are dropped,
    and test sessions left with a single event are then removed. A short
    summary (events, sessions, items) is printed for each split.

    Parameters
    ----------
    data : pd.DataFrame
        Events with 'SessionId', 'TimeStamp' and 'ItemId' columns.
        'TimeStamp' must support subtracting a pd.Timedelta.

    train_path : str
        Destination of the train split (comma separated, no index column).

    test_path : str
        Destination of the test split (comma separated, no index column).

    Returns
    -------
    None
    """
    # timestamp one day before the last event; sessions ending at or
    # after this cutoff are held out for testing
    cutoff = data['TimeStamp'].max() - pd.Timedelta(days = 1)

    session_end_times = data.groupby('SessionId')['TimeStamp'].max()
    train_sessions = session_end_times[session_end_times < cutoff].index
    test_sessions = session_end_times[session_end_times >= cutoff].index

    # Series.isin replaces np.in1d, which is deprecated as of NumPy 2.0
    train = data[data['SessionId'].isin(train_sessions)]
    test = data[data['SessionId'].isin(test_sessions)]

    # drop test clicks on items that were never seen in the train set,
    # then drop the test sessions that are left with a single event
    test = test[test['ItemId'].isin(train['ItemId'])]
    test_session_lengths = test.groupby('SessionId').size()
    kept_sessions = test_session_lengths[test_session_lengths > 1].index
    test = test[test['SessionId'].isin(kept_sessions)]

    def print_info(data, path):
        """Print the event, session and item counts of `data`, headed by `path`."""
        message = path + '\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'
        print( message.format(data.shape[0], data['SessionId'].nunique(), data['ItemId'].nunique()) )

    print_info(train, train_path)
    train.to_csv(train_path, index = False, sep = ',')
    print_info(test, test_path)
    test.to_csv(test_path, index = False, sep = ',')

In [42]:
# Hold out the sessions from the last day of the filtered data as the test
# set; everything earlier becomes the full training set.
train_path = 'rsc15_train_full.txt'
test_path = 'rsc15_test.txt'
train_test_session(data, train_path = train_path, test_path = test_path)

rsc15_train_full.txt
	Events: 31639296
	Sessions: 7966888
	Items: 37958
rsc15_test.txt
	Events: 71231
	Sessions: 15326
	Items: 6756


In [43]:
# Apply the same last-day split to the training sessions to obtain a
# partial training set and a validation set.
# `train` only exists as a local variable inside train_test_session, which
# returns nothing, so on a fresh kernel the name is undefined in this cell.
# Reload the training split that the previous cell wrote to `train_path`.
train_partial_path = 'rsc15_train_partial.txt'
validation_path = 'rsc15_validation.txt'
train = pd.read_csv(train_path, parse_dates = ['TimeStamp'])
train_test_session(train, train_partial_path, validation_path)

rsc15_train_partial.txt
	Events: 31581057
	Sessions: 7954513
	Items: 37957
rsc15_validation.txt
	Events: 58237
	Sessions: 12374
	Items: 6361
