### DEAL WITH THE DATA

In [1]:
import numpy as np
import pandas as pd
import random
from fastai.tabular.transform import add_cyclic_datepart
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [57]:
! ls

55175_105481_compressed_yoochoose-buys.dat.zip
55175_105481_compressed_yoochoose-clicks.dat.zip
55175_105481_compressed_yoochoose-test.dat.zip
preprocessing.ipynb
progress report.docx
[31myoochoose-buys.dat[m[m
[31myoochoose-clicks.dat[m[m
[31myoochoose-test.dat[m[m


In [26]:
path_buys = "yoochoose-buys.dat"
path_clicks = "yoochoose-clicks.dat"
path_test = "yoochoose-test.dat"

In [27]:
buys_head = ["session","time", "item", "price", "quantity"]
clicks_head = ["session","time", "item", "category"]
test_head = ["session","time", "item", "category"]

In [28]:
test = pd.read_csv(path_test, sep=',', names=test_head, parse_dates=['time'])
buys = pd.read_csv(path_buys, sep=',', names=buys_head, parse_dates=['time'])
buys = buys.sort_values(by=["time", "session"])
clicks = pd.read_csv(path_clicks, sep=',', names=clicks_head, parse_dates=[
                 'time'], converters={"category": lambda c: -1 if c == "S" else c})
clicks = clicks.sort_values(by=["time", "session"])

  interactivity=interactivity, compiler=compiler, result=result)


In [29]:
uniq_clicks = clicks.session.unique()
uniq_buys = buys.session.unique()

In [30]:
# Make sets to give a decent sample of data.
c_not_b = set(uniq_clicks) - set(uniq_buys)
c_and_b = set(uniq_clicks).intersection(set(uniq_buys))

In [31]:
print(len(c_not_b), len(c_and_b))
assert len(c_not_b)+len(c_and_b) == len(uniq_clicks)

8740033 509696


The original dataset is too big that we can only run it on the sampled data.

In [57]:
num = 30000
cnb = random.sample(c_not_b, num)
cab = random.sample(c_and_b, num)

In [58]:
cl = clicks.loc[clicks.session.isin(cnb+cab)]
bu = buys.loc[buys.session.isin(cab)]
cl.shape, bu.shape

((297594, 4), (67421, 5))

In [59]:
cl.head()

Unnamed: 0,session,time,item,category
63316,18834,2014-04-01 03:47:51.468000+00:00,214819719,0
63317,18834,2014-04-01 03:49:17.031000+00:00,214718169,0
63318,18834,2014-04-01 03:49:46.058000+00:00,214832559,0
1366111,443348,2014-04-01 04:49:17.355000+00:00,214832559,0
1366112,443348,2014-04-01 04:50:50.792000+00:00,214832559,0


In [60]:
bu.head()

Unnamed: 0,session,time,item,price,quantity
14504,265497,2014-04-01 05:23:23.944000+00:00,214840483,1674,1
49269,432562,2014-04-01 05:51:49.336000+00:00,214829810,13509,1
49270,432562,2014-04-01 05:51:49.337000+00:00,214663976,2931,1
29391,383381,2014-04-01 05:52:49.720000+00:00,214697456,1151,3
41185,299597,2014-04-01 06:07:09.461000+00:00,214839313,4188,1


In [62]:
# Add the feature denoting the event.
cl.loc[:, "event"] = 0
bu.loc[:, "event"] = 1

In [63]:
# Hstack the data. 
union=pd.concat([cl, bu], ignore_index=True, sort=False).sort_values(by=['session','time'])

In [64]:
union.head()

Unnamed: 0,session,time,item,category,event,price,quantity
13018,87,2014-04-07 06:19:08.914000+00:00,214652220,0,0,,
13020,87,2014-04-07 06:19:20.979000+00:00,214840483,0,0,,
13021,87,2014-04-07 06:19:28.762000+00:00,214840483,0,0,,
13045,87,2014-04-07 06:26:01.516000+00:00,214717286,0,0,,
13046,87,2014-04-07 06:26:15.176000+00:00,214558807,0,0,,


In [65]:
union.time.min()

Timestamp('2014-04-01 03:47:51.468000+0000', tz='UTC')

In [66]:
# Add Contextual Features.
union.loc[:, "hour"] = union.time.dt.hour
union.loc[:, "month"] = union.time.dt.month
union.loc[:, "dow"] = union.time.dt.dayofweek

In [67]:
# union.category = union.category.fillna(-1)

In [68]:
# Drop the extra columns.
union = union.drop(columns=["price","quantity","category"])

In [69]:
union

Unnamed: 0,session,time,item,event,hour,month,dow
13018,87,2014-04-07 06:19:08.914000+00:00,214652220,0,6,4,0
13020,87,2014-04-07 06:19:20.979000+00:00,214840483,0,6,4,0
13021,87,2014-04-07 06:19:28.762000+00:00,214840483,0,6,4,0
13045,87,2014-04-07 06:26:01.516000+00:00,214717286,0,6,4,0
13046,87,2014-04-07 06:26:15.176000+00:00,214558807,0,6,4,0
...,...,...,...,...,...,...,...
295448,11561884,2014-09-26 18:03:22.661000+00:00,214555104,0,18,9,4
295287,11562067,2014-09-26 16:11:22.515000+00:00,214850516,0,16,9,4
297345,11562118,2014-09-29 14:32:19.138000+00:00,214709702,0,14,9,0
297346,11562118,2014-09-29 14:32:37.129000+00:00,214709702,0,14,9,0


In [70]:
# Add the index within each session for further processing.
df = union.copy()
df.loc[:,'idx'] = df.groupby('session').cumcount()+1

In [71]:
df.head()

Unnamed: 0,session,time,item,event,hour,month,dow,idx
13018,87,2014-04-07 06:19:08.914000+00:00,214652220,0,6,4,0,1
13020,87,2014-04-07 06:19:20.979000+00:00,214840483,0,6,4,0,2
13021,87,2014-04-07 06:19:28.762000+00:00,214840483,0,6,4,0,3
13045,87,2014-04-07 06:26:01.516000+00:00,214717286,0,6,4,0,4
13046,87,2014-04-07 06:26:15.176000+00:00,214558807,0,6,4,0,5


In [72]:
# Know the number of items in this dataset.
df.item.nunique()

18465

In [73]:
sessions = df.session.unique()

In [74]:
len(sessions)

60000

In [75]:
# Add the Contextual Feature Time_Difference.
cool2 = df.groupby("session").shift(1)["time"].dt.tz_localize('UTC')
df.loc[:,"timediff"] = (df.time - cool2).dt.seconds
df.loc[:,"timediff"] = np.log2(df.timediff).replace([np.inf, -np.inf], np.nan).fillna(-1).astype(int)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [76]:
from tqdm import tqdm

In [78]:
def renew(df):
    """
    Helper function to make a unit dataframe long to wide.
    Input: a 20-row df
    Output: a 1-row df
    """
    df = df.pivot_table(index=['session'], columns=tmp.index, 
                        values=['timediff', 'item', "hour", "month", 'dow', 'event','time'], aggfunc='first')

    df = df.sort_index(axis=1, level=1)
    df.columns = [f'{x}_{y}' for x,y in df.columns]
    df = df.reset_index().fillna(0)
    return df.copy()

In [79]:
# Iterate through the dataset to generate time series data.

final = pd.DataFrame()
res = []
for sid in tqdm(sessions):
    dd = df.loc[df.session == sid]
    n = dd.idx.max()
    for c in range(2, n):
        mini_dd = dd.loc[(dd.idx <= c) & (dd.idx > c-20)]
        g = mini_dd.shape[0]
        lst = list(range(20-g+1, 21))
        mini_dd.index = lst
        tmp = mini_dd.reindex(mini_dd.index.tolist() + list(range(1, 21-g)))
        tmp.loc[:,"session"] = mini_dd.session.max()
        tmp = tmp.fillna(-1)
        row = renew(tmp)
        res.append(row.values[0])

100%|██████████| 60000/60000 [1:22:52<00:00, 12.07it/s]  


In [80]:
finals = pd.DataFrame(res, columns=row.columns)

In [81]:
# Save the whole processed DF.
finals.to_csv("tuesday.csv", index=False)

### Train Test Split and Write the data to csv.

In [156]:
print(gg.shape)

(249198, 141)

In [127]:
TEST = gg.iloc[242000:].copy()
VALID = gg.iloc[234000:242000].copy()
TRAIN = gg.iloc[:234000].copy()

In [128]:
TRAIN[["item_"+str(x) for x in range(1, 21)]]

Unnamed: 0,item_1,item_2,item_3,item_4,item_5,item_6,item_7,item_8,item_9,item_10,item_11,item_12,item_13,item_14,item_15,item_16,item_17,item_18,item_19,item_20
721,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,214819719.0,214718169.0
11239,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,214832559.0,214832559.0
6870,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,214840483.0,214840483.0
6871,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,214840483.0,214840483.0,214840483.0
6872,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,214840483.0,214840483.0,214840483.0,214840483.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
232940,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,214861455.0,214855153.0
237897,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,214840082.0,214677917.0
239826,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,214515842.0,214550368.0,214574150.0,214515842.0
239862,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,214846033.0,214853420.0,214853102.0


In [129]:
train_part = set(TRAIN.item_20.unique()).union(set(TRAIN.item_19.unique())).union(set(TRAIN.item_18.unique()))
valid_part = set(VALID.item_20.unique()).union(set(VALID.item_19.unique()))
test_part = set(TEST.item_20.unique()).union(set(TEST.item_19.unique()))

In [130]:
len(train_part), len(valid_part), len(test_part)

(16601, 1787, 2072)

In [131]:
len(test_part.intersection(train_part))/len(test_part)

0.8161196911196911

In [132]:
len(valid_part.intersection(train_part))/len(valid_part)

0.8824846110800224

In [133]:
TEST.to_csv("test_tu.csv", index=False)
VALID.to_csv("valid_tu.csv", index=False)
TRAIN.to_csv("train_tu.csv", index=False)

### Showcase

In [115]:
# Sort the df by the time of the last event.
gg = finals.sort_values(by=["time_20"], ascending=True).copy()

In [154]:
# This is what it looks like
gg.head(3)

Unnamed: 0,session,dow_1,event_1,hour_1,item_1,month_1,time_1,timediff_1,dow_2,event_2,...,month_19,time_19,timediff_19,dow_20,event_20,hour_20,item_20,month_20,time_20,timediff_20
721,18834,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,-1.0,...,4.0,2014-04-01 03:47:51.468000+00:00,-1.0,1.0,0.0,3.0,214718169.0,4.0,2014-04-01 03:49:17.031000+00:00,6.0
11239,443348,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,-1.0,...,4.0,2014-04-01 04:49:17.355000+00:00,-1.0,1.0,0.0,4.0,214832559.0,4.0,2014-04-01 04:50:50.792000+00:00,6.0
6870,265497,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,-1.0,...,4.0,2014-04-01 05:16:56.782000+00:00,-1.0,1.0,0.0,5.0,214840483.0,4.0,2014-04-01 05:17:08.857000+00:00,3.0


In [149]:
sample = gg.head(1).copy().drop(columns=["session"])
cols = [col for col in sample.columns if not col.startswith("time_")]
sample = sample[cols]
sample.values[0].reshape(6,20)

array([[-1.000000e+00, -1.000000e+00, -1.000000e+00, -1.000000e+00, ..., -1.000000e+00, -1.000000e+00, -1.000000e+00,
        -1.000000e+00],
       [-1.000000e+00, -1.000000e+00, -1.000000e+00, -1.000000e+00, ..., -1.000000e+00, -1.000000e+00, -1.000000e+00,
        -1.000000e+00],
       [-1.000000e+00, -1.000000e+00, -1.000000e+00, -1.000000e+00, ..., -1.000000e+00, -1.000000e+00, -1.000000e+00,
        -1.000000e+00],
       [-1.000000e+00, -1.000000e+00, -1.000000e+00, -1.000000e+00, ..., -1.000000e+00, -1.000000e+00, -1.000000e+00,
        -1.000000e+00],
       [-1.000000e+00, -1.000000e+00, -1.000000e+00, -1.000000e+00, ..., -1.000000e+00, -1.000000e+00, -1.000000e+00,
        -1.000000e+00],
       [-1.000000e+00, -1.000000e+00, -1.000000e+00, -1.000000e+00, ...,  3.000000e+00,  2.147182e+08,  4.000000e+00,
         6.000000e+00]])

In [155]:
sample.columns

Index(['dow_1', 'event_1', 'hour_1', 'item_1', 'month_1', 'timediff_1',
       'dow_2', 'event_2', 'hour_2', 'item_2',
       ...
       'hour_19', 'item_19', 'month_19', 'timediff_19', 'dow_20', 'event_20',
       'hour_20', 'item_20', 'month_20', 'timediff_20'],
      dtype='object', length=120)

In [148]:
gg.head(1).values[0][1:].reshape(7, 20)

array([[-1.0, -1.0, -1.0, -1.0, ..., -1.0, -1.0, -1.0, -1],
       [-1.0, -1.0, -1.0, -1.0, ..., -1.0, -1.0, -1.0, -1.0],
       [-1, -1.0, -1.0, -1.0, ..., -1.0, -1.0, -1.0, -1.0],
       [-1.0, -1, -1.0, -1.0, ..., -1.0, -1.0, -1.0, -1.0],
       [-1.0, -1.0, -1, -1.0, ..., -1, -1.0, -1.0, -1.0],
       [-1.0, -1.0, -1.0, -1, ..., -1.0, -1, -1.0, -1.0],
       [-1.0, -1.0, -1.0, -1.0, ..., 214718169.0, 4.0, Timestamp('2014-04-01 03:49:17.031000+0000', tz='UTC'), 6.0]],
      dtype=object)

In [139]:
g = mini_dd.shape[0]
lst = list(range(20-g+1, 21))
mini_dd.index = lst
tmp = mini_dd.reindex(mini_dd.index.tolist() + list(range(1, 21-g)))
tmp.loc[:, "session"] = mini_dd.session.max()
tmp = tmp.fillna(-1)

In [140]:
tmp

Unnamed: 0,session,time,item,event,hour,month,dow,idx,timediff
19,11562118,2014-09-29 14:32:19.138000+00:00,214709702.0,0.0,14.0,9.0,0.0,1.0,-1.0
20,11562118,2014-09-29 14:32:37.129000+00:00,214709702.0,0.0,14.0,9.0,0.0,2.0,4.0
1,11562118,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,11562118,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,11562118,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,11562118,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
5,11562118,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
6,11562118,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
7,11562118,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
8,11562118,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
