In [1]:
import os
# NOTE(review): a relative chdir is not idempotent; re-running this cell moves
# one more directory up each time. Presumably this makes the project-local
# modules imported in the next cell resolvable; confirm against the repo layout.
os.chdir('../')

In [2]:
import data
import preprocess_utils.session2vec as sess2vec
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
# Register tqdm's `progress_apply` on pandas objects (used by the padding step later on).
tqdm.pandas()

In [3]:
# Load the train and test frames through the project `data` module and report
# their shapes. 'small' presumably selects a reduced dataset; confirm in `data`.
train_df = data.train_df('small')
test_df = data.test_df('small')
print(train_df.shape)
print(test_df.shape)

(3677, 12)
(1334, 12)


In [4]:
# Preview the first 10 rows of the training frame as loaded.
train_df.head(10)

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
0,00RL8Z82B2Z1,aff3928535f48,1541037460,1,search for poi,Newtown,AU,"Sydney, Australia",mobile,,,
1,00RL8Z82B2Z1,aff3928535f48,1541037522,2,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
2,00RL8Z82B2Z1,aff3928535f48,1541037522,3,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
3,00RL8Z82B2Z1,aff3928535f48,1541037532,4,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
4,00RL8Z82B2Z1,aff3928535f48,1541037532,5,interaction item image,109038,AU,"Sydney, Australia",mobile,,,
5,00RL8Z82B2Z1,aff3928535f48,1541037532,6,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
6,00RL8Z82B2Z1,aff3928535f48,1541037532,7,interaction item image,109038,AU,"Sydney, Australia",mobile,,,
7,00RL8Z82B2Z1,aff3928535f48,1541037532,8,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
8,00RL8Z82B2Z1,aff3928535f48,1541037542,9,interaction item image,109038,AU,"Sydney, Australia",mobile,,,
9,00RL8Z82B2Z1,aff3928535f48,1541037542,10,interaction item image,109038,AU,"Sydney, Australia",mobile,,,


In [8]:
def count_sessions(df):
    """Return how many distinct ``session_id`` groups ``df`` contains."""
    sessions = df.groupby('session_id')
    return len(sessions)

In [16]:
# Number of sessions in the training frame (reference value for the checks further down).
count_sessions(train_df)

240

### Maximum session length (see the notebook 'sequence_length_analysis_for_PADDING' to learn more)

In [5]:
# Number of rows every session is padded or truncated to by the padding step below.
max_session_length = 75

In [6]:
# Expand the impressions into additional action rows. The second value returned
# by the train call (`custom_index`) is handed to the test call; presumably it
# keeps the generated row labels from clashing across the two frames; confirm in
# `session2vec`.
# NOTE(review): `train_df` / `test_df` are overwritten here, so re-running this
# cell feeds already-expanded frames back into the helper.
train_df, custom_index = sess2vec.add_impressions_columns_as_new_actions(train_df)
test_df, _ = sess2vec.add_impressions_columns_as_new_actions(test_df, custom_index)

Total clickout interactions found: 395


395it [00:01, 297.30it/s]


Total clickout interactions found: 119


119it [00:00, 252.00it/s]


In [7]:
# Shapes after the impression expansion (compare with the shapes printed at load time).
print(train_df.shape)
print(test_df.shape)

(12993, 11)
(4197, 11)


In [17]:
# Sanity check: the expansion adds rows but should leave the number of sessions unchanged.
count_sessions(train_df)

240

### pad!

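Every session is padded or truncated to `max_session_length` rows. This can be done in a better way by placing the last clickout interaction at the last position; the `pad` function below does so by dropping the rows after the last clickout and adding the padding at the front.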

In [50]:
def pad(g, max_length):
    """Cut a session after its last clickout and force it to ``max_length`` rows.

    Parameters
    ----------
    g : pandas.DataFrame
        Rows of a single session, in the order they should appear in the
        sequence. Must contain the ``action_type``, ``user_id`` and
        ``session_id`` columns. Padding rows are filled by position, so the
        expected layout is: column 0 = original row label, column 1 =
        ``user_id``, column 2 = ``session_id`` (the layout obtained by calling
        ``reset_index()`` on the frames used in this notebook).
    max_length : int
        Number of rows of the returned frame.

    Returns
    -------
    pandas.DataFrame
        A frame with the columns of ``g`` and a fresh default index. Sessions
        shorter than ``max_length`` are padded at the front; longer ones keep
        only their last ``max_length`` rows. In both cases the last row of the
        (cut) session is the last row of the result.
    """
    # Drop every interaction that follows the last clickout. The cut is made by
    # position rather than by index label, so it stays correct even when the
    # index labels of `g` are duplicated or not sorted.
    clickout_positions = np.flatnonzero((g.action_type == 'clickout item').values)
    if clickout_positions.size > 0:
        g = g.iloc[:clickout_positions[-1] + 1]

    grouplen = g.shape[0]
    if grouplen <= max_length:
        # Pad at the front: padding rows are 0 everywhere except column 0 (-1
        # marks "no original row") and the user/session identifiers.
        array = np.zeros((max_length, g.shape[1]), dtype=object)
        array[:, 0] = -1
        array[:, 1] = g.user_id.values[0]
        array[:, 2] = g.session_id.values[0]
        array[-grouplen:] = g.values[-grouplen:]
    else:
        # Too long: keep only the last `max_length` rows.
        array = g.values[-max_length:]
    return pd.DataFrame(array, columns=g.columns)

In [46]:
# Pad/truncate every session independently. `reset_index()` carries the original
# row labels along as an `index` column, which becomes the index again at the end.
train_padded_df = (
    train_df
    .reset_index()
    .groupby('session_id')
    .progress_apply(pad, max_length=max_session_length)
    .set_index('index')
)

HBox(children=(IntProgress(value=0, max=240), HTML(value='')))

In [47]:
# The row count is expected to be a multiple of max_session_length (asserted in the next cell).
train_padded_df.shape

(18000, 11)

In [48]:
# Every session must occupy exactly max_session_length rows after padding/truncation.
assert count_sessions(train_df) * max_session_length == train_padded_df.shape[0]

In [49]:
# Display the padded frame (pandas abbreviates long frames when rendering them).
train_padded_df

Unnamed: 0_level_0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impression_price
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
3736,7YY075MWXQYC,0266e4f67c9c9,1541050583,2,interaction item image,674721,NZ,"Akaroa, New Zealand",desktop,,
99006701,7YY075MWXQYC,0266e4f67c9c9,1541050596,2.04348,show_impression,995989,NZ,"Akaroa, New Zealand",desktop,,113
99006702,7YY075MWXQYC,0266e4f67c9c9,1541050596,2.08507,show_impression,674721,NZ,"Akaroa, New Zealand",desktop,,90
99006703,7YY075MWXQYC,0266e4f67c9c9,1541050596,2.12665,show_impression,995935,NZ,"Akaroa, New Zealand",desktop,,130
99006704,7YY075MWXQYC,0266e4f67c9c9,1541050596,2.16824,show_impression,2176108,NZ,"Akaroa, New Zealand",desktop,,130
99006705,7YY075MWXQYC,0266e4f67c9c9,1541050596,2.20983,show_impression,510141,NZ,"Akaroa, New Zealand",desktop,,103
99006706,7YY075MWXQYC,0266e4f67c9c9,1541050596,2.25142,show_impression,9608024,NZ,"Akaroa, New Zealand",desktop,,159
99006707,7YY075MWXQYC,0266e4f67c9c9,1541050596,2.29301,show_impression,4651640,NZ,"Akaroa, New Zealand",desktop,,46
99006708,7YY075MWXQYC,0266e4f67c9c9,1541050596,2.33459,show_impression,510146,NZ,"Akaroa, New Zealand",desktop,,143
99006709,7YY075MWXQYC,0266e4f67c9c9,1541050596,2.37618,show_impression,1068254,NZ,"Akaroa, New Zealand",desktop,,84


Example of truncated session:

In [34]:
# Rows of the example session as they are before padding/truncation.
train_df[train_df.session_id == '0266e4f67c9c9']

Unnamed: 0_level_0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impression_price
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
3735,7YY075MWXQYC,0266e4f67c9c9,1541050583,1.000000,interaction item image,674721,NZ,"Akaroa, New Zealand",desktop,,
3736,7YY075MWXQYC,0266e4f67c9c9,1541050583,2.000000,interaction item image,674721,NZ,"Akaroa, New Zealand",desktop,,
99006701,7YY075MWXQYC,0266e4f67c9c9,1541050596,2.043478,show_impression,995989,NZ,"Akaroa, New Zealand",desktop,,113.0
99006702,7YY075MWXQYC,0266e4f67c9c9,1541050596,2.085066,show_impression,674721,NZ,"Akaroa, New Zealand",desktop,,90.0
99006703,7YY075MWXQYC,0266e4f67c9c9,1541050596,2.126654,show_impression,995935,NZ,"Akaroa, New Zealand",desktop,,130.0
99006704,7YY075MWXQYC,0266e4f67c9c9,1541050596,2.168242,show_impression,2176108,NZ,"Akaroa, New Zealand",desktop,,130.0
99006705,7YY075MWXQYC,0266e4f67c9c9,1541050596,2.209830,show_impression,510141,NZ,"Akaroa, New Zealand",desktop,,103.0
99006706,7YY075MWXQYC,0266e4f67c9c9,1541050596,2.251418,show_impression,9608024,NZ,"Akaroa, New Zealand",desktop,,159.0
99006707,7YY075MWXQYC,0266e4f67c9c9,1541050596,2.293006,show_impression,4651640,NZ,"Akaroa, New Zealand",desktop,,46.0
99006708,7YY075MWXQYC,0266e4f67c9c9,1541050596,2.334594,show_impression,510146,NZ,"Akaroa, New Zealand",desktop,,143.0


In [35]:
# The same session in the padded frame, for comparison with the cell above.
train_padded_df[train_padded_df.session_id == '0266e4f67c9c9']

Unnamed: 0_level_0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impression_price
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
3736,7YY075MWXQYC,0266e4f67c9c9,1541050583,2,interaction item image,674721,NZ,"Akaroa, New Zealand",desktop,,
99006701,7YY075MWXQYC,0266e4f67c9c9,1541050596,2.04348,show_impression,995989,NZ,"Akaroa, New Zealand",desktop,,113
99006702,7YY075MWXQYC,0266e4f67c9c9,1541050596,2.08507,show_impression,674721,NZ,"Akaroa, New Zealand",desktop,,90
99006703,7YY075MWXQYC,0266e4f67c9c9,1541050596,2.12665,show_impression,995935,NZ,"Akaroa, New Zealand",desktop,,130
99006704,7YY075MWXQYC,0266e4f67c9c9,1541050596,2.16824,show_impression,2176108,NZ,"Akaroa, New Zealand",desktop,,130
99006705,7YY075MWXQYC,0266e4f67c9c9,1541050596,2.20983,show_impression,510141,NZ,"Akaroa, New Zealand",desktop,,103
99006706,7YY075MWXQYC,0266e4f67c9c9,1541050596,2.25142,show_impression,9608024,NZ,"Akaroa, New Zealand",desktop,,159
99006707,7YY075MWXQYC,0266e4f67c9c9,1541050596,2.29301,show_impression,4651640,NZ,"Akaroa, New Zealand",desktop,,46
99006708,7YY075MWXQYC,0266e4f67c9c9,1541050596,2.33459,show_impression,510146,NZ,"Akaroa, New Zealand",desktop,,143
99006709,7YY075MWXQYC,0266e4f67c9c9,1541050596,2.37618,show_impression,1068254,NZ,"Akaroa, New Zealand",desktop,,84
