# ANL488 Project - Session-based Recommendation Systems

### Feature Engineering

In [1]:
import time
import random

#analysis libraries 
import numpy as np 
import pandas as pd 
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
%matplotlib inline

### Load & explore data

In [2]:
browsing_train = pd.read_csv('New_browsing(product only).csv')

In [3]:
browsing_train

Unnamed: 0,SessionId,product_action,product_sku_hash,server_timestamp_epoch_ms,time
0,0,detail,cf2f88cb43c1713538f7dfd2aa498a2cb9ebc0c99feeac...,1552423391039,2019-03-12 20:43:11
1,0,add,cf2f88cb43c1713538f7dfd2aa498a2cb9ebc0c99feeac...,1552424389158,2019-03-12 20:59:49
2,0,detail,4945f2fa8e87cb7501702ed3dce26253296eae7a8f670f...,1552426684381,2019-03-12 21:38:04
3,0,detail,6ff8d0f30bbe66cfec7d87fc7e22bd8b1defd47ff4aaaa...,1552426755233,2019-03-12 21:39:15
4,0,detail,cf2f88cb43c1713538f7dfd2aa498a2cb9ebc0c99feeac...,1552426869735,2019-03-12 21:41:09
...,...,...,...,...,...
10431606,3280583,detail,a532a9b9d11dcb3358677f9ff2d41d95f015331acbd632...,1550718210565,2019-02-21 03:03:30
10431607,3280583,detail,3011fe75d8739f217851a68d4c7ab40cce7dd5031f9080...,1550718249795,2019-02-21 03:04:09
10431608,3280583,detail,ea950a72ea131ef7181c7dd03f1ed77396648060c1e9cd...,1550718557956,2019-02-21 03:09:17
10431609,3280583,detail,bed8a2b601108932cbd8b3b14cde1d4919262c60b35cad...,1550718613724,2019-02-21 03:10:13


In [4]:
# Per-column count of missing values, a separator line, then per-column count of distinct values.
null_counts = browsing_train.isnull().sum()
distinct_counts = browsing_train.nunique()
print(null_counts)
print('---------------------------')
print(distinct_counts)

SessionId                    0
product_action               0
product_sku_hash             0
server_timestamp_epoch_ms    0
time                         0
dtype: int64
---------------------------
SessionId                     3280584
product_action                      4
product_sku_hash                57483
server_timestamp_epoch_ms    10397354
time                          4725977
dtype: int64


In [10]:
#time features 
# get features related to time 

def time_features(df):
    """Add calendar features derived from ``server_timestamp_epoch_ms``.

    The frame is modified in place: the ``time`` column is (over)written and
    ``dayofweek`` / ``weekend`` are added. The same object is returned, so the
    return value is an alias of the argument, not a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``server_timestamp_epoch_ms`` column holding epoch
        timestamps in milliseconds.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with these columns set:

        * ``time``      - the timestamp truncated to whole seconds, as a datetime.
        * ``dayofweek`` - weekday of ``time`` (0 = Monday ... 6 = Sunday).
        * ``weekend``   - int8 flag, 1 when ``dayofweek`` is 5 or 6, else 0.

    Notes
    -----
    No timezone conversion is applied; the weekday is taken from the timestamp
    exactly as decoded from the epoch value.
    """
    # Integer floor-division drops the millisecond part without a float round-trip
    # and without `astype(int)`, whose width is platform dependent (32-bit on
    # Windows with older numpy) and would wrap for large second counts.
    epoch_seconds = df['server_timestamp_epoch_ms'] // 1000
    df['time'] = pd.to_datetime(epoch_seconds, unit='s')
    df['dayofweek'] = df['time'].dt.weekday
    df['weekend'] = df['dayofweek'].isin([5, 6]).astype('int8')
    return df

In [11]:
# time_features writes its columns onto the frame it is given and returns that same
# object, so browsing_train2 is an alias of browsing_train rather than a separate copy.
browsing_train2 = time_features(browsing_train)

In [12]:
browsing_train2

Unnamed: 0,SessionId,product_action,product_sku_hash,server_timestamp_epoch_ms,time,dayofweek,weekend
0,0,detail,cf2f88cb43c1713538f7dfd2aa498a2cb9ebc0c99feeac...,1552423391039,2019-03-12 20:43:11,1,0
1,0,add,cf2f88cb43c1713538f7dfd2aa498a2cb9ebc0c99feeac...,1552424389158,2019-03-12 20:59:49,1,0
2,0,detail,4945f2fa8e87cb7501702ed3dce26253296eae7a8f670f...,1552426684381,2019-03-12 21:38:04,1,0
3,0,detail,6ff8d0f30bbe66cfec7d87fc7e22bd8b1defd47ff4aaaa...,1552426755233,2019-03-12 21:39:15,1,0
4,0,detail,cf2f88cb43c1713538f7dfd2aa498a2cb9ebc0c99feeac...,1552426869735,2019-03-12 21:41:09,1,0
...,...,...,...,...,...,...,...
10431606,3280583,detail,a532a9b9d11dcb3358677f9ff2d41d95f015331acbd632...,1550718210565,2019-02-21 03:03:30,3,0
10431607,3280583,detail,3011fe75d8739f217851a68d4c7ab40cce7dd5031f9080...,1550718249795,2019-02-21 03:04:09,3,0
10431608,3280583,detail,ea950a72ea131ef7181c7dd03f1ed77396648060c1e9cd...,1550718557956,2019-02-21 03:09:17,3,0
10431609,3280583,detail,bed8a2b601108932cbd8b3b14cde1d4919262c60b35cad...,1550718613724,2019-02-21 03:10:13,3,0


In [13]:
# get lag (difference) features 

def lag_feature(df, group_col, target_col, offset, nan=-1):
    """Return ``target_col`` shifted by ``offset`` rows, blanked across group changes.

    The check is purely positional: a row keeps its shifted value only when the
    row ``offset`` positions away (in shift direction) has the same value in
    ``group_col``. Every other row, including rows with no shifted value at
    all, receives ``nan``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns.
    group_col : str
        Column whose value must match between a row and its lagged row.
    target_col : str
        Column to lag.
    offset : int
        Number of rows to shift by (passed to ``Series.shift``).
    nan : scalar, default -1
        Placeholder written where no valid lag exists.

    Returns
    -------
    pandas.Series
        The lagged values, aligned to ``df``'s index.
    """
    groups = df[group_col]
    # True wherever the lagged row belongs to a different group (or does not exist).
    crosses_boundary = groups.shift(offset) != groups

    lagged = df[target_col].shift(offset)
    lagged.loc[crosses_boundary] = nan
    return lagged.fillna(nan)

In [16]:
SessionId = 'SessionId'

In [18]:
# Timestamp of the preceding row when it belongs to the same session; missing (NaT) otherwise.
browsing_train2['last_event_time'] = lag_feature(
    browsing_train2, SessionId, 'time', offset=1, nan=np.nan
)

In [19]:
browsing_train2

Unnamed: 0,SessionId,product_action,product_sku_hash,server_timestamp_epoch_ms,time,dayofweek,weekend,last_event_time
0,0,detail,cf2f88cb43c1713538f7dfd2aa498a2cb9ebc0c99feeac...,1552423391039,2019-03-12 20:43:11,1,0,NaT
1,0,add,cf2f88cb43c1713538f7dfd2aa498a2cb9ebc0c99feeac...,1552424389158,2019-03-12 20:59:49,1,0,2019-03-12 20:43:11
2,0,detail,4945f2fa8e87cb7501702ed3dce26253296eae7a8f670f...,1552426684381,2019-03-12 21:38:04,1,0,2019-03-12 20:59:49
3,0,detail,6ff8d0f30bbe66cfec7d87fc7e22bd8b1defd47ff4aaaa...,1552426755233,2019-03-12 21:39:15,1,0,2019-03-12 21:38:04
4,0,detail,cf2f88cb43c1713538f7dfd2aa498a2cb9ebc0c99feeac...,1552426869735,2019-03-12 21:41:09,1,0,2019-03-12 21:39:15
...,...,...,...,...,...,...,...,...
10431606,3280583,detail,a532a9b9d11dcb3358677f9ff2d41d95f015331acbd632...,1550718210565,2019-02-21 03:03:30,3,0,2019-02-21 03:03:15
10431607,3280583,detail,3011fe75d8739f217851a68d4c7ab40cce7dd5031f9080...,1550718249795,2019-02-21 03:04:09,3,0,2019-02-21 03:03:30
10431608,3280583,detail,ea950a72ea131ef7181c7dd03f1ed77396648060c1e9cd...,1550718557956,2019-02-21 03:09:17,3,0,2019-02-21 03:04:09
10431609,3280583,detail,bed8a2b601108932cbd8b3b14cde1d4919262c60b35cad...,1550718613724,2019-02-21 03:10:13,3,0,2019-02-21 03:09:17


In [20]:
# Time elapsed between a row's event and the previous event of the same session
# (missing wherever last_event_time is missing).
browsing_train2['last_event_length'] = browsing_train2['time'].sub(
    browsing_train2['last_event_time']
)

In [21]:
# Running 1-based position of each event inside its session: a constant True flag
# is cumulatively summed per session, giving 1, 2, 3, ... in row order.
browsing_train2['product_true'] = True
browsing_train2['cum_product'] = (
    browsing_train2
    .groupby([SessionId])['product_true']
    .cumsum()
)
browsing_train2.head()

Unnamed: 0,SessionId,product_action,product_sku_hash,server_timestamp_epoch_ms,time,dayofweek,weekend,last_event_time,last_event_length,product_true,cum_product
0,0,detail,cf2f88cb43c1713538f7dfd2aa498a2cb9ebc0c99feeac...,1552423391039,2019-03-12 20:43:11,1,0,NaT,NaT,True,1
1,0,add,cf2f88cb43c1713538f7dfd2aa498a2cb9ebc0c99feeac...,1552424389158,2019-03-12 20:59:49,1,0,2019-03-12 20:43:11,0 days 00:16:38,True,2
2,0,detail,4945f2fa8e87cb7501702ed3dce26253296eae7a8f670f...,1552426684381,2019-03-12 21:38:04,1,0,2019-03-12 20:59:49,0 days 00:38:15,True,3
3,0,detail,6ff8d0f30bbe66cfec7d87fc7e22bd8b1defd47ff4aaaa...,1552426755233,2019-03-12 21:39:15,1,0,2019-03-12 21:38:04,0 days 00:01:11,True,4
4,0,detail,cf2f88cb43c1713538f7dfd2aa498a2cb9ebc0c99feeac...,1552426869735,2019-03-12 21:41:09,1,0,2019-03-12 21:39:15,0 days 00:01:54,True,5


In [22]:
del browsing_train2['product_true']

In [23]:
browsing_train2.head()

Unnamed: 0,SessionId,product_action,product_sku_hash,server_timestamp_epoch_ms,time,dayofweek,weekend,last_event_time,last_event_length,cum_product
0,0,detail,cf2f88cb43c1713538f7dfd2aa498a2cb9ebc0c99feeac...,1552423391039,2019-03-12 20:43:11,1,0,NaT,NaT,1
1,0,add,cf2f88cb43c1713538f7dfd2aa498a2cb9ebc0c99feeac...,1552424389158,2019-03-12 20:59:49,1,0,2019-03-12 20:43:11,0 days 00:16:38,2
2,0,detail,4945f2fa8e87cb7501702ed3dce26253296eae7a8f670f...,1552426684381,2019-03-12 21:38:04,1,0,2019-03-12 20:59:49,0 days 00:38:15,3
3,0,detail,6ff8d0f30bbe66cfec7d87fc7e22bd8b1defd47ff4aaaa...,1552426755233,2019-03-12 21:39:15,1,0,2019-03-12 21:38:04,0 days 00:01:11,4
4,0,detail,cf2f88cb43c1713538f7dfd2aa498a2cb9ebc0c99feeac...,1552426869735,2019-03-12 21:41:09,1,0,2019-03-12 21:39:15,0 days 00:01:54,5


In [24]:
# all_df is a second name for the same DataFrame object as browsing_train2; no copy is made here.
all_df = browsing_train2

In [25]:
all_df

Unnamed: 0,SessionId,product_action,product_sku_hash,server_timestamp_epoch_ms,time,dayofweek,weekend,last_event_time,last_event_length,cum_product
0,0,detail,cf2f88cb43c1713538f7dfd2aa498a2cb9ebc0c99feeac...,1552423391039,2019-03-12 20:43:11,1,0,NaT,NaT,1
1,0,add,cf2f88cb43c1713538f7dfd2aa498a2cb9ebc0c99feeac...,1552424389158,2019-03-12 20:59:49,1,0,2019-03-12 20:43:11,0 days 00:16:38,2
2,0,detail,4945f2fa8e87cb7501702ed3dce26253296eae7a8f670f...,1552426684381,2019-03-12 21:38:04,1,0,2019-03-12 20:59:49,0 days 00:38:15,3
3,0,detail,6ff8d0f30bbe66cfec7d87fc7e22bd8b1defd47ff4aaaa...,1552426755233,2019-03-12 21:39:15,1,0,2019-03-12 21:38:04,0 days 00:01:11,4
4,0,detail,cf2f88cb43c1713538f7dfd2aa498a2cb9ebc0c99feeac...,1552426869735,2019-03-12 21:41:09,1,0,2019-03-12 21:39:15,0 days 00:01:54,5
...,...,...,...,...,...,...,...,...,...,...
10431606,3280583,detail,a532a9b9d11dcb3358677f9ff2d41d95f015331acbd632...,1550718210565,2019-02-21 03:03:30,3,0,2019-02-21 03:03:15,0 days 00:00:15,8
10431607,3280583,detail,3011fe75d8739f217851a68d4c7ab40cce7dd5031f9080...,1550718249795,2019-02-21 03:04:09,3,0,2019-02-21 03:03:30,0 days 00:00:39,9
10431608,3280583,detail,ea950a72ea131ef7181c7dd03f1ed77396648060c1e9cd...,1550718557956,2019-02-21 03:09:17,3,0,2019-02-21 03:04:09,0 days 00:05:08,10
10431609,3280583,detail,bed8a2b601108932cbd8b3b14cde1d4919262c60b35cad...,1550718613724,2019-02-21 03:10:13,3,0,2019-02-21 03:09:17,0 days 00:00:56,11


In [26]:
# Total number of rows (events) in each session, repeated on every row of that session.
events_per_session = all_df.groupby(SessionId)[SessionId].transform('count')
all_df['session_len_count'] = events_per_session

In [27]:
all_df.head()

Unnamed: 0,SessionId,product_action,product_sku_hash,server_timestamp_epoch_ms,time,dayofweek,weekend,last_event_time,last_event_length,cum_product,session_len_count
0,0,detail,cf2f88cb43c1713538f7dfd2aa498a2cb9ebc0c99feeac...,1552423391039,2019-03-12 20:43:11,1,0,NaT,NaT,1,5
1,0,add,cf2f88cb43c1713538f7dfd2aa498a2cb9ebc0c99feeac...,1552424389158,2019-03-12 20:59:49,1,0,2019-03-12 20:43:11,0 days 00:16:38,2,5
2,0,detail,4945f2fa8e87cb7501702ed3dce26253296eae7a8f670f...,1552426684381,2019-03-12 21:38:04,1,0,2019-03-12 20:59:49,0 days 00:38:15,3,5
3,0,detail,6ff8d0f30bbe66cfec7d87fc7e22bd8b1defd47ff4aaaa...,1552426755233,2019-03-12 21:39:15,1,0,2019-03-12 21:38:04,0 days 00:01:11,4,5
4,0,detail,cf2f88cb43c1713538f7dfd2aa498a2cb9ebc0c99feeac...,1552426869735,2019-03-12 21:41:09,1,0,2019-03-12 21:39:15,0 days 00:01:54,5,5


In [28]:
# Events remaining in the session after the current one:
# session length minus the 1-based position of this event.
all_df['cum_product_r'] = all_df['session_len_count'].sub(all_df['cum_product'])
all_df

Unnamed: 0,SessionId,product_action,product_sku_hash,server_timestamp_epoch_ms,time,dayofweek,weekend,last_event_time,last_event_length,cum_product,session_len_count,cum_product_r
0,0,detail,cf2f88cb43c1713538f7dfd2aa498a2cb9ebc0c99feeac...,1552423391039,2019-03-12 20:43:11,1,0,NaT,NaT,1,5,4
1,0,add,cf2f88cb43c1713538f7dfd2aa498a2cb9ebc0c99feeac...,1552424389158,2019-03-12 20:59:49,1,0,2019-03-12 20:43:11,0 days 00:16:38,2,5,3
2,0,detail,4945f2fa8e87cb7501702ed3dce26253296eae7a8f670f...,1552426684381,2019-03-12 21:38:04,1,0,2019-03-12 20:59:49,0 days 00:38:15,3,5,2
3,0,detail,6ff8d0f30bbe66cfec7d87fc7e22bd8b1defd47ff4aaaa...,1552426755233,2019-03-12 21:39:15,1,0,2019-03-12 21:38:04,0 days 00:01:11,4,5,1
4,0,detail,cf2f88cb43c1713538f7dfd2aa498a2cb9ebc0c99feeac...,1552426869735,2019-03-12 21:41:09,1,0,2019-03-12 21:39:15,0 days 00:01:54,5,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...
10431606,3280583,detail,a532a9b9d11dcb3358677f9ff2d41d95f015331acbd632...,1550718210565,2019-02-21 03:03:30,3,0,2019-02-21 03:03:15,0 days 00:00:15,8,12,4
10431607,3280583,detail,3011fe75d8739f217851a68d4c7ab40cce7dd5031f9080...,1550718249795,2019-02-21 03:04:09,3,0,2019-02-21 03:03:30,0 days 00:00:39,9,12,3
10431608,3280583,detail,ea950a72ea131ef7181c7dd03f1ed77396648060c1e9cd...,1550718557956,2019-02-21 03:09:17,3,0,2019-02-21 03:04:09,0 days 00:05:08,10,12,2
10431609,3280583,detail,bed8a2b601108932cbd8b3b14cde1d4919262c60b35cad...,1550718613724,2019-02-21 03:10:13,3,0,2019-02-21 03:09:17,0 days 00:00:56,11,12,1


In [29]:
%%time
tmp_df = all_df[all_df['cum_product']==1][[SessionId, 'server_timestamp_epoch_ms']]
tmp_df.columns = [SessionId, 'first_time']
all_df = all_df.merge(tmp_df, on=SessionId, how='left')
del tmp_df

Wall time: 6.99 s


In [30]:
all_df

Unnamed: 0,SessionId,product_action,product_sku_hash,server_timestamp_epoch_ms,time,dayofweek,weekend,last_event_time,last_event_length,cum_product,session_len_count,cum_product_r,first_time
0,0,detail,cf2f88cb43c1713538f7dfd2aa498a2cb9ebc0c99feeac...,1552423391039,2019-03-12 20:43:11,1,0,NaT,NaT,1,5,4,1552423391039
1,0,add,cf2f88cb43c1713538f7dfd2aa498a2cb9ebc0c99feeac...,1552424389158,2019-03-12 20:59:49,1,0,2019-03-12 20:43:11,0 days 00:16:38,2,5,3,1552423391039
2,0,detail,4945f2fa8e87cb7501702ed3dce26253296eae7a8f670f...,1552426684381,2019-03-12 21:38:04,1,0,2019-03-12 20:59:49,0 days 00:38:15,3,5,2,1552423391039
3,0,detail,6ff8d0f30bbe66cfec7d87fc7e22bd8b1defd47ff4aaaa...,1552426755233,2019-03-12 21:39:15,1,0,2019-03-12 21:38:04,0 days 00:01:11,4,5,1,1552423391039
4,0,detail,cf2f88cb43c1713538f7dfd2aa498a2cb9ebc0c99feeac...,1552426869735,2019-03-12 21:41:09,1,0,2019-03-12 21:39:15,0 days 00:01:54,5,5,0,1552423391039
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10431606,3280583,detail,a532a9b9d11dcb3358677f9ff2d41d95f015331acbd632...,1550718210565,2019-02-21 03:03:30,3,0,2019-02-21 03:03:15,0 days 00:00:15,8,12,4,1550717915937
10431607,3280583,detail,3011fe75d8739f217851a68d4c7ab40cce7dd5031f9080...,1550718249795,2019-02-21 03:04:09,3,0,2019-02-21 03:03:30,0 days 00:00:39,9,12,3,1550717915937
10431608,3280583,detail,ea950a72ea131ef7181c7dd03f1ed77396648060c1e9cd...,1550718557956,2019-02-21 03:09:17,3,0,2019-02-21 03:04:09,0 days 00:05:08,10,12,2,1550717915937
10431609,3280583,detail,bed8a2b601108932cbd8b3b14cde1d4919262c60b35cad...,1550718613724,2019-02-21 03:10:13,3,0,2019-02-21 03:09:17,0 days 00:00:56,11,12,1,1550717915937


In [31]:
# Milliseconds elapsed between the session's first event and this event.
all_df['lapse'] = all_df['server_timestamp_epoch_ms'].sub(all_df['first_time'])

In [32]:
all_df.head()

Unnamed: 0,SessionId,product_action,product_sku_hash,server_timestamp_epoch_ms,time,dayofweek,weekend,last_event_time,last_event_length,cum_product,session_len_count,cum_product_r,first_time,lapse
0,0,detail,cf2f88cb43c1713538f7dfd2aa498a2cb9ebc0c99feeac...,1552423391039,2019-03-12 20:43:11,1,0,NaT,NaT,1,5,4,1552423391039,0
1,0,add,cf2f88cb43c1713538f7dfd2aa498a2cb9ebc0c99feeac...,1552424389158,2019-03-12 20:59:49,1,0,2019-03-12 20:43:11,0 days 00:16:38,2,5,3,1552423391039,998119
2,0,detail,4945f2fa8e87cb7501702ed3dce26253296eae7a8f670f...,1552426684381,2019-03-12 21:38:04,1,0,2019-03-12 20:59:49,0 days 00:38:15,3,5,2,1552423391039,3293342
3,0,detail,6ff8d0f30bbe66cfec7d87fc7e22bd8b1defd47ff4aaaa...,1552426755233,2019-03-12 21:39:15,1,0,2019-03-12 21:38:04,0 days 00:01:11,4,5,1,1552423391039,3364194
4,0,detail,cf2f88cb43c1713538f7dfd2aa498a2cb9ebc0c99feeac...,1552426869735,2019-03-12 21:41:09,1,0,2019-03-12 21:39:15,0 days 00:01:54,5,5,0,1552423391039,3478696


In [33]:
all_df.columns

Index(['SessionId', 'product_action', 'product_sku_hash',
       'server_timestamp_epoch_ms', 'time', 'dayofweek', 'weekend',
       'last_event_time', 'last_event_length', 'cum_product',
       'session_len_count', 'cum_product_r', 'first_time', 'lapse'],
      dtype='object')

In [34]:
# Write the engineered feature table to disk; index=False leaves the row index out of the file.
all_df.to_csv('new_features.csv', index=False)