In [96]:
# ! pip install fastparquet --user
# ! pip install ordered-set
# ! pip install swifter --user

In [97]:
import pandas as pd
import swifter
import numpy as np
import gc
from tqdm import tqdm
from collections import defaultdict
from ordered_set import OrderedSet
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MinMaxScaler, StandardScaler


In [98]:
# Load the training and validation splits; both paths are relative to the
# directory the notebook is run from.
train = pd.read_parquet(path="../data_phase1/train.parquet")
val = pd.read_parquet(path="../data_phase1/validation.parquet")


In [99]:
# Replace missing context_type values with the literal string "NA" in both splits.
train, val = (
    frame.fillna(value={"context_type": "NA"})
    for frame in (train, val)
)


In [100]:
train.describe(include='O')

Unnamed: 0,query_id,user_id,session_id,product_id,page_type,previous_page_type,device_category,device_platform,user_tier,user_country,context_type,context_value
count,3507990,3507990,3507990,3507990,3507990,3507990,3507990,3507990,3507990,3507990,3507990,3507990
unique,584665,208393,317426,408263,5,23,3,2,6,196,5,189571
top,48466297d4c0e7b1772dc3be2e1f97c931790cef83b2fa...,cc83479dd22e19ec45d08805a61b73d7f33a69feaf42be...,f3de26eced2c81d1b0d6da40c11c9f987fe066b5a4f4fd...,55e1495c40504b4b15a358f95e2cbede34d011b287c32b...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...
freq,6,4428,354,1728,3105120,2053848,1853280,1927596,2714292,520824,3105204,152514


In [101]:
val.describe(include='O')

Unnamed: 0,query_id,user_id,session_id,product_id,page_type,previous_page_type,device_category,device_platform,user_tier,user_country,context_type,context_value
count,687192,687192,687192,687192,687192,687192,687192,687192,687192,687192,687192,687192
unique,114532,114532,114532,218525,5,22,3,2,6,188,5,66955
top,24a6fdaf54ae5aafdd891c3d14596c15b47ccb329e9ccb...,b2604857bd347e0cbc0e32b8c7c0455f368ad2fa199b26...,d18c5903be4cd2f80a6a849787c9791078bc77f6dc4d65...,fe13af44356050cdc93ad3d5e458e24c5077e5bf7a4c12...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...
freq,6,6,6,398,666960,513864,411180,423654,605262,102696,666990,15342


In [102]:
def reduce_mem_usage(df, use_float16=True):
    """Shrink a DataFrame's memory footprint by downcasting its columns in place.

    * NumPy integer columns (signed or unsigned) are cast to the smallest
      signed integer type whose inclusive range holds the column's min and max.
    * NumPy float columns are cast to float16 or float32 when their value range
      fits. float16 keeps only about three significant decimal digits, so pass
      ``use_float16=False`` to stop at float32 when precision matters.
    * Object / string columns become ``category``.
    * Every other dtype (bool, datetime, already-categorical, nullable
      extension types, ...) is left untouched, which makes the function safe
      to call again on its own output.

    A column is only ever narrowed, never widened.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.
    use_float16 : bool, default True
        Whether float16 is an allowed target for float columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient re-assignment.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue
        # Only plain NumPy int/uint/float columns have a numeric range we can
        # reason about; bool, datetime, category etc. are skipped on purpose.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min, c_max = df[col].min(), df[col].max()
        if pd.isna(c_min) or pd.isna(c_max):
            # Empty or all-NaN column: there is no range to base a decision on.
            continue

        if col_type.kind == 'f':
            candidates, limits_of, as_scalar = float_candidates, np.finfo, float
        else:
            candidates, limits_of, as_scalar = int_candidates, np.iinfo, int
        # Compare as plain Python numbers so the checks are exact regardless of
        # NumPy's scalar promotion rules.
        c_min, c_max = as_scalar(c_min), as_scalar(c_max)

        for candidate in candidates:
            limits = limits_of(candidate)
            # Inclusive bounds: a value equal to the type's min/max still fits.
            if as_scalar(limits.min) <= c_min and c_max <= as_scalar(limits.max):
                if np.dtype(candidate).itemsize < col_type.itemsize:
                    df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df


In [103]:
# Downcast both splits (train first); each call prints its own
# before/after memory report.
train, val = reduce_mem_usage(train), reduce_mem_usage(val)


Memory usage of dataframe is 428.22 MB
Memory usage after optimization is: 175.85 MB
Decreased by 58.9%
Memory usage of dataframe is 78.64 MB
Memory usage after optimization is: 47.95 MB
Decreased by 39.0%


In [104]:
gc.collect()

0

In [105]:
# Order each split by query_id so rows of the same query sit next to each
# other, then rebuild a fresh 0..n-1 index.
train, val = (
    frame.sort_values('query_id').reset_index(drop=True)
    for frame in (train, val)
)


In [106]:
train.head(10)

Unnamed: 0,query_id,user_id,session_id,product_id,page_type,previous_page_type,device_category,device_platform,user_tier,user_country,context_type,context_value,product_price,week,week_day,is_click
0,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,ff2088c2fd3bf9cb8c2601252dd65a99201f59972820d3...,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,1efd18182268101b62a1ea12a9cafbe05487f3abb92924...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d05d943023ee7c2091506a72ce25841e84f88a4233a11d...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,a6e9c2a832972484be2bd4378135743d1d00d651502c38...,0.000466,6,6,0
1,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,ff2088c2fd3bf9cb8c2601252dd65a99201f59972820d3...,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,62349c6eda0dc9fe8bb023213f03ebe93aefa5cbcdfecf...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d05d943023ee7c2091506a72ce25841e84f88a4233a11d...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,a6e9c2a832972484be2bd4378135743d1d00d651502c38...,0.001068,6,6,0
2,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,ff2088c2fd3bf9cb8c2601252dd65a99201f59972820d3...,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,aa0bbdfa55326b5c08d3472b1ee1d56fe13a82f63f46c8...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d05d943023ee7c2091506a72ce25841e84f88a4233a11d...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,a6e9c2a832972484be2bd4378135743d1d00d651502c38...,0.001238,6,6,1
3,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,ff2088c2fd3bf9cb8c2601252dd65a99201f59972820d3...,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,793301d78ea43b599acf05d350c8f9e485f5deaa417284...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d05d943023ee7c2091506a72ce25841e84f88a4233a11d...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,a6e9c2a832972484be2bd4378135743d1d00d651502c38...,0.000465,6,6,0
4,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,ff2088c2fd3bf9cb8c2601252dd65a99201f59972820d3...,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,9e03ca4af958bef1ce5d54e684a40ee7e3a9aa7e5010b6...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d05d943023ee7c2091506a72ce25841e84f88a4233a11d...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,a6e9c2a832972484be2bd4378135743d1d00d651502c38...,0.000936,6,6,0
5,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,ff2088c2fd3bf9cb8c2601252dd65a99201f59972820d3...,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,9e7f6f10b18f35ce924575f599465265e22fa9d75b13c9...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d05d943023ee7c2091506a72ce25841e84f88a4233a11d...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,a6e9c2a832972484be2bd4378135743d1d00d651502c38...,0.001765,6,6,0
6,00008bcf52759e7e70d533026da637b1cfeb0dc04a67ed...,968f154cc29e3fb1ed5f361ab23d9d8495cf47c5baefef...,f391f67a9d273be5a95460ce4756cdbc22430336a0b28d...,b08c37eac9adf04388a290b13105167b4a50911ac14e5c...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,ca0bdabcd3a4eae25db6f95687cb307200e78892a6d774...,0.000804,7,2,0
7,00008bcf52759e7e70d533026da637b1cfeb0dc04a67ed...,968f154cc29e3fb1ed5f361ab23d9d8495cf47c5baefef...,f391f67a9d273be5a95460ce4756cdbc22430336a0b28d...,7b8c68e43264904ec9f74df2b9461fe7381aab12d0740e...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,ca0bdabcd3a4eae25db6f95687cb307200e78892a6d774...,0.000722,7,2,0
8,00008bcf52759e7e70d533026da637b1cfeb0dc04a67ed...,968f154cc29e3fb1ed5f361ab23d9d8495cf47c5baefef...,f391f67a9d273be5a95460ce4756cdbc22430336a0b28d...,616636f3c60f43d24741d9eca7fe1c3c567825acc64348...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,ca0bdabcd3a4eae25db6f95687cb307200e78892a6d774...,0.000916,7,2,1
9,00008bcf52759e7e70d533026da637b1cfeb0dc04a67ed...,968f154cc29e3fb1ed5f361ab23d9d8495cf47c5baefef...,f391f67a9d273be5a95460ce4756cdbc22430336a0b28d...,01f0a57f4fbc0e42e76960238fce01c28e29f23819ae04...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,ca0bdabcd3a4eae25db6f95687cb307200e78892a6d774...,0.000761,7,2,0


In [107]:
train.isnull().sum()

query_id              0
user_id               0
session_id            0
product_id            0
page_type             0
previous_page_type    0
device_category       0
device_platform       0
user_tier             0
user_country          0
context_type          0
context_value         0
product_price         0
week                  0
week_day              0
is_click              0
dtype: int64

In [108]:
# Stack the unlabeled training rows on top of the validation rows.
# drop(columns=...) preserves train's column order; building the column list
# through set() produced an arbitrary order that the concatenated frame then
# inherited. errors='ignore' keeps the old tolerance for a missing label column.
data = pd.concat(
    [train.drop(columns=['is_click'], errors='ignore'), val],
    axis=0,
    ignore_index=True,
)

In [109]:
# Every product id from the combined train + validation rows, as strings.
all_products = data["product_id"].map(str).tolist()

In [110]:
unique_products = OrderedSet(all_products)


In [111]:
# session_id -> list of the product ids on that session's rows, per split.
train_session_interactions = train.groupby('session_id')['product_id'].apply(list).to_dict()
val_session_interactions = val.groupby('session_id')['product_id'].apply(list).to_dict()

# session_id -> list of is_click labels, built from the same frame and
# grouping as train_session_interactions, so the two lists line up row for row.
train_session_actions = train.groupby('session_id')['is_click'].apply(list).to_dict()
# val_session_actions = dict(val.groupby('session_id')['is_click'].apply(list))


In [112]:
# Collapse (week, week_day) into one running day number:
# (week - 1) * 7 + week_day. Presumably weeks are numbered from 1 — TODO confirm.
for frame in (train, val):
    frame['days_elapsed'] = (frame['week'].astype(int) - 1) * 7 + frame['week_day'].astype(int)

In [113]:
# Shift both splits by the same offset so the earliest training day becomes 0.
# The offset is read before train is modified, so val and train share it.
first_train_day = train['days_elapsed'].min()
val['days_elapsed'] = val['days_elapsed'] - first_train_day
train['days_elapsed'] = train['days_elapsed'] - first_train_day


In [114]:
# val['days_elapsed'] was already shifted by the training minimum in the
# previous cell. Repeating the subtraction here was a no-op (train's minimum
# is 0 by this point) that read like an accidental double shift, so it is gone.
val.days_elapsed.unique()

array([43, 38, 41, 52, 53, 49, 11, 45, 46,  9, 48,  8, 20, 42, 34, 44, 40,
       21, 39, 19, 14, 30, 47,  6, 33, 23, 17, 28,  5,  7, 13, 25])

In [115]:
# rank(method='max') gives tied rows the highest rank of their tie group, so
# every row receives the number of rows in its session dated on or before its
# own day. Integer-dividing by 6 turns that row count into a coarse step index.
# The 6 presumably is the number of rows per query (describe() above reports
# freq 6 for query_id) — TODO confirm.
# astype('int64') replaces the former element-wise .apply(int): same values,
# without a Python-level call per row.
train['sess_step'] = train.groupby('session_id')['days_elapsed'].rank(method='max').astype('int64') // 6
val['sess_step'] = val.groupby('session_id')['days_elapsed'].rank(method='max').astype('int64') // 6


In [116]:
# Interaction feature: user_country and device_platform joined into one string.
# Built column-wise instead of through a row-by-row apply(axis=1). astype(str)
# is needed because reduce_mem_usage turned both columns into categoricals,
# which do not support `+` directly.
train['country_platform'] = train['user_country'].astype(str) + train['device_platform'].astype(str)
val['country_platform'] = val['user_country'].astype(str) + val['device_platform'].astype(str)


In [117]:
gc.collect()

132

In [118]:
# Columns to keep, in the order they should appear in the training frame.
# The label column 'is_click' is not listed here; the next cell appends it
# before selecting from train.
cols = [
 'query_id',
 'user_id',
 'session_id',
 'product_id',
 'user_tier',
 'product_price',
 'page_type',
 'previous_page_type',
 'device_category',
 'device_platform',
 'user_country',
 'country_platform',
 'context_type',
 'context_value',
 'week',
 'week_day',
 'sess_step',
 'days_elapsed'
]


In [119]:
# Add the label to the selection. Guarded so that re-running this cell cannot
# append 'is_click' a second time and produce a duplicated label column.
if 'is_click' not in cols:
    cols.append('is_click')
train = train[cols]

In [25]:
# Clicked rows only, in chronological order. The index is reset before the
# filter, so surviving rows keep their position in the full sorted frame as
# their index value.
sorted_train_clickout_data = (
    train.sort_values(["days_elapsed"])
    .reset_index(drop=True)
    .loc[lambda frame: frame["is_click"] == 1]
)


In [26]:
sorted_train_clickout_data.head()

Unnamed: 0,query_id,user_id,session_id,product_id,user_tier,product_price,page_type,previous_page_type,device_category,device_platform,user_country,country_platform,context_type,context_value,sess_step,days_elapsed,is_click
3,9446fb136df49975847f8c73e4f1b660a4103a9579582c...,55141265635f9f86832cfe6d30822e78cf1c619df8a5e3...,e3bfa8e1dc3068c434dd7c121229d394b74bc97509a865...,5266d3fcb681ff8d11fd8a42f5b06aa2729dfc4fd9ba1b...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,0.002205,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,95d5a1bd42a07143383299c081524278a16ef5ce55507c...,95d5a1bd42a07143383299c081524278a16ef5ce55507c...,product_id,8b43fe5a4ac6e21388b8a0a1000d30f565906ad239fe3d...,23,0,1
11,3ec6453701ac4285c916735f09d88f20720846b1ac0df9...,55141265635f9f86832cfe6d30822e78cf1c619df8a5e3...,e3bfa8e1dc3068c434dd7c121229d394b74bc97509a865...,b46915304330dc6eb5cc99a40cf712df67e16c3abba919...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,0.005379,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,95d5a1bd42a07143383299c081524278a16ef5ce55507c...,95d5a1bd42a07143383299c081524278a16ef5ce55507c...,product_id,68ffd8b6d0267886d1f4668099c269ab5d016774e3820e...,23,0,1
14,936c329320ffdac0aaba3874d2ad978b2d8e93e2f688d1...,9094f8baf87832740422bdadd76132089ffc637224b175...,451813b2c4dfcfc4787e9d99cf4ed8c3eab7704a024108...,5463fe247c3db709f9c38808a1279c5615a79847d8ebcc...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,0.001719,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,8008642632a9035202bbc1a28419d0bb5167c0488f3d0e...,2,0,1
24,7c08c7ae7fbe4b21a119b32071bc27b74ef3d5dfe7abfb...,c8443acb7ee67ea5e8f9726d93d6f92ea472bad8a5fd92...,df2432c9ee96db11af92bd2aae9521283c6a19fd4b3f92...,d407c33ed51cad6716b7052dc8d98e334f1f5d9a57bf83...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,0.000188,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,570d5cd5f5ec1be4a0a910cf18b51c228b5f63d09896a8...,2,0,1
30,394a4f6a4dbd161c5e7dfcf05e6129de0c5de62f630df7...,87b0c2f3de88284cf65531751e5a9546cdd29411ba65c6...,82d88fa219e8f5882380e38444c7b0018254be3d7a11af...,8424abbf2d1cbb1ec5e741ba8cc4302c891b31dfa2e02e...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,0.001109,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,95d5a1bd42a07143383299c081524278a16ef5ce55507c...,95d5a1bd42a07143383299c081524278a16ef5ce55507c...,product_id,769db6accd15e3cf489461cba947bff58bd14713586533...,1,0,1


In [27]:
# Spot check: all distinct clicked rows of one specific query.
temp1 = (
    sorted_train_clickout_data
    .loc[lambda frame: frame["query_id"] == "d857debeb1245b878ce153e2c22972977e4b71ae1ff86c179d5d0040e522ce46"]
    .drop_duplicates()
)
temp1

Unnamed: 0,query_id,user_id,session_id,product_id,user_tier,product_price,page_type,previous_page_type,device_category,device_platform,user_country,country_platform,context_type,context_value,sess_step,days_elapsed,is_click
2199981,d857debeb1245b878ce153e2c22972977e4b71ae1ff86c...,4be47c1a4e9b7e990027d19f44dfbb8f0dcc757f4ce8bf...,144d1806f7521a3f3e688f91d0c895cbcaae216a4b2738...,915fa29349b4f4b6881a194cc06c8fe029fd9a0a41548b...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,0.000555,596618814963e496d74434df8b8fe3306892f2e4ce6aaa...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,fec032cb05435471f2305006f4a1ba994c9d2f4bcad8ef...,fec032cb05435471f2305006f4a1ba994c9d2f4bcad8ef...,designer_id,b5bc347a651c4933c11e4fecb3fa0056d209ae8b09e65b...,1,36,1
2199982,d857debeb1245b878ce153e2c22972977e4b71ae1ff86c...,4be47c1a4e9b7e990027d19f44dfbb8f0dcc757f4ce8bf...,144d1806f7521a3f3e688f91d0c895cbcaae216a4b2738...,404de72ca2ccca7734d37d325060a853f72605d649c308...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,0.000553,596618814963e496d74434df8b8fe3306892f2e4ce6aaa...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,fec032cb05435471f2305006f4a1ba994c9d2f4bcad8ef...,fec032cb05435471f2305006f4a1ba994c9d2f4bcad8ef...,designer_id,b5bc347a651c4933c11e4fecb3fa0056d209ae8b09e65b...,1,36,1
2199983,d857debeb1245b878ce153e2c22972977e4b71ae1ff86c...,4be47c1a4e9b7e990027d19f44dfbb8f0dcc757f4ce8bf...,144d1806f7521a3f3e688f91d0c895cbcaae216a4b2738...,47d0fffa8f3c3e1ad82922f4a654cdfc216cc5f0b644ec...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,0.000682,596618814963e496d74434df8b8fe3306892f2e4ce6aaa...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,fec032cb05435471f2305006f4a1ba994c9d2f4bcad8ef...,fec032cb05435471f2305006f4a1ba994c9d2f4bcad8ef...,designer_id,b5bc347a651c4933c11e4fecb3fa0056d209ae8b09e65b...,1,36,1
2199984,d857debeb1245b878ce153e2c22972977e4b71ae1ff86c...,4be47c1a4e9b7e990027d19f44dfbb8f0dcc757f4ce8bf...,144d1806f7521a3f3e688f91d0c895cbcaae216a4b2738...,83cef55ea2a46f42371d7ebc00d0fdaf7280c4b4624e80...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,0.00051,596618814963e496d74434df8b8fe3306892f2e4ce6aaa...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,fec032cb05435471f2305006f4a1ba994c9d2f4bcad8ef...,fec032cb05435471f2305006f4a1ba994c9d2f4bcad8ef...,designer_id,b5bc347a651c4933c11e4fecb3fa0056d209ae8b09e65b...,1,36,1
2199995,d857debeb1245b878ce153e2c22972977e4b71ae1ff86c...,4be47c1a4e9b7e990027d19f44dfbb8f0dcc757f4ce8bf...,144d1806f7521a3f3e688f91d0c895cbcaae216a4b2738...,80acec678c1bdb38fa361391d5eb9dacbeadfef79711ce...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,0.000596,596618814963e496d74434df8b8fe3306892f2e4ce6aaa...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,fec032cb05435471f2305006f4a1ba994c9d2f4bcad8ef...,fec032cb05435471f2305006f4a1ba994c9d2f4bcad8ef...,designer_id,b5bc347a651c4933c11e4fecb3fa0056d209ae8b09e65b...,1,36,1


In [28]:
# Non-null cell counts per query over the clicked rows, ordered by the
# is_click count so the most-clicked queries come first.
temp2 = (
    sorted_train_clickout_data
    .groupby('query_id')
    .count()
    .reset_index()
    .sort_values('is_click', ascending=False)
    .reset_index(drop=True)
)
temp2.head()


Unnamed: 0,query_id,user_id,session_id,product_id,user_tier,product_price,page_type,previous_page_type,device_category,device_platform,user_country,country_platform,context_type,context_value,sess_step,days_elapsed,is_click
0,8c9df7b233ceb7f8bd0dbce4e8fd9d7a230e758a02fddf...,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
1,d857debeb1245b878ce153e2c22972977e4b71ae1ff86c...,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
2,2f62ea052b3c98a86cbd567652a45cdc77e7d559c6c552...,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
3,9d8c854a1298bb52369de76859b459513f2ea1e8c1aea7...,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
4,8d9a5e15e9a6a6b03fb59114561f29c5f69791d080180e...,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4


In [29]:
train[train["query_id"]=="d857debeb1245b878ce153e2c22972977e4b71ae1ff86c179d5d0040e522ce46"]


Unnamed: 0,query_id,user_id,session_id,product_id,user_tier,product_price,page_type,previous_page_type,device_category,device_platform,user_country,country_platform,context_type,context_value,sess_step,days_elapsed,is_click
2965968,d857debeb1245b878ce153e2c22972977e4b71ae1ff86c...,4be47c1a4e9b7e990027d19f44dfbb8f0dcc757f4ce8bf...,144d1806f7521a3f3e688f91d0c895cbcaae216a4b2738...,182d98b9080acd5fe7afc9c49b050c186c11b9441ce540...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,0.00051,596618814963e496d74434df8b8fe3306892f2e4ce6aaa...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,fec032cb05435471f2305006f4a1ba994c9d2f4bcad8ef...,fec032cb05435471f2305006f4a1ba994c9d2f4bcad8ef...,designer_id,b5bc347a651c4933c11e4fecb3fa0056d209ae8b09e65b...,1,36,0
2965969,d857debeb1245b878ce153e2c22972977e4b71ae1ff86c...,4be47c1a4e9b7e990027d19f44dfbb8f0dcc757f4ce8bf...,144d1806f7521a3f3e688f91d0c895cbcaae216a4b2738...,83cef55ea2a46f42371d7ebc00d0fdaf7280c4b4624e80...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,0.00051,596618814963e496d74434df8b8fe3306892f2e4ce6aaa...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,fec032cb05435471f2305006f4a1ba994c9d2f4bcad8ef...,fec032cb05435471f2305006f4a1ba994c9d2f4bcad8ef...,designer_id,b5bc347a651c4933c11e4fecb3fa0056d209ae8b09e65b...,1,36,1
2965970,d857debeb1245b878ce153e2c22972977e4b71ae1ff86c...,4be47c1a4e9b7e990027d19f44dfbb8f0dcc757f4ce8bf...,144d1806f7521a3f3e688f91d0c895cbcaae216a4b2738...,47d0fffa8f3c3e1ad82922f4a654cdfc216cc5f0b644ec...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,0.000682,596618814963e496d74434df8b8fe3306892f2e4ce6aaa...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,fec032cb05435471f2305006f4a1ba994c9d2f4bcad8ef...,fec032cb05435471f2305006f4a1ba994c9d2f4bcad8ef...,designer_id,b5bc347a651c4933c11e4fecb3fa0056d209ae8b09e65b...,1,36,1
2965971,d857debeb1245b878ce153e2c22972977e4b71ae1ff86c...,4be47c1a4e9b7e990027d19f44dfbb8f0dcc757f4ce8bf...,144d1806f7521a3f3e688f91d0c895cbcaae216a4b2738...,404de72ca2ccca7734d37d325060a853f72605d649c308...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,0.000553,596618814963e496d74434df8b8fe3306892f2e4ce6aaa...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,fec032cb05435471f2305006f4a1ba994c9d2f4bcad8ef...,fec032cb05435471f2305006f4a1ba994c9d2f4bcad8ef...,designer_id,b5bc347a651c4933c11e4fecb3fa0056d209ae8b09e65b...,1,36,1
2965972,d857debeb1245b878ce153e2c22972977e4b71ae1ff86c...,4be47c1a4e9b7e990027d19f44dfbb8f0dcc757f4ce8bf...,144d1806f7521a3f3e688f91d0c895cbcaae216a4b2738...,80acec678c1bdb38fa361391d5eb9dacbeadfef79711ce...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,0.000596,596618814963e496d74434df8b8fe3306892f2e4ce6aaa...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,fec032cb05435471f2305006f4a1ba994c9d2f4bcad8ef...,fec032cb05435471f2305006f4a1ba994c9d2f4bcad8ef...,designer_id,b5bc347a651c4933c11e4fecb3fa0056d209ae8b09e65b...,1,36,1
2965973,d857debeb1245b878ce153e2c22972977e4b71ae1ff86c...,4be47c1a4e9b7e990027d19f44dfbb8f0dcc757f4ce8bf...,144d1806f7521a3f3e688f91d0c895cbcaae216a4b2738...,915fa29349b4f4b6881a194cc06c8fe029fd9a0a41548b...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,0.000555,596618814963e496d74434df8b8fe3306892f2e4ce6aaa...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,fec032cb05435471f2305006f4a1ba994c9d2f4bcad8ef...,fec032cb05435471f2305006f4a1ba994c9d2f4bcad8ef...,designer_id,b5bc347a651c4933c11e4fecb3fa0056d209ae8b09e65b...,1,36,1


In [30]:
# Total clicks per query over the full training frame, highest first.
t = (
    train.groupby('query_id')['is_click']
    .sum()
    .reset_index()
    .sort_values('is_click', ascending=False)
)
t

Unnamed: 0,query_id,is_click
320754,8c9df7b233ceb7f8bd0dbce4e8fd9d7a230e758a02fddf...,5
494328,d857debeb1245b878ce153e2c22972977e4b71ae1ff86c...,5
107910,2f62ea052b3c98a86cbd567652a45cdc77e7d559c6c552...,5
359506,9d8c854a1298bb52369de76859b459513f2ea1e8c1aea7...,4
323005,8d9a5e15e9a6a6b03fb59114561f29c5f69791d080180e...,4
...,...,...
197593,569673b56eebe671b5b582b6a5023fc49533af5fc6a7b2...,1
197592,56966cc8f90fbe3fb74bde54722344db96d4f62422df81...,1
197591,569658b586eed8dbda1795e4feca09e25e2b001cccb6c7...,1
197590,569643a9d2a55996b43cadbb8a82ca0c21c55780f61c00...,1


In [31]:
t[t['is_click']>1].shape

(16110, 2)

In [32]:
# How many validation query ids never occur in the training split.
len(set(val["query_id"].unique()).difference(train["query_id"].unique()))


114532

In [40]:
def get_past_product_clickout_count(row, df):
    """Count the rows of ``df`` for ``row``'s product dated strictly before ``row``.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``'product_id'`` and ``'days_elapsed'``.
    df : pandas.DataFrame
        Frame to search; needs ``'product_id'`` and ``'days_elapsed'`` columns.

    Returns
    -------
    int
        Number of rows with the same product_id and a smaller days_elapsed.
        Rows on the same day as ``row`` are not counted.
    """
    same_product = df['product_id'] == row['product_id']
    strictly_earlier = df['days_elapsed'] < row['days_elapsed']
    return int((same_product & strictly_earlier).sum())
    

In [48]:
# Per clicked row: how often the same product was clicked on earlier days.
# NOTE: every row triggers a scan of the whole click frame, so the cost grows
# quadratically with the number of clicked rows.
sorted_train_clickout_data["frequency_of_past_clickout_of_product"] = (
    sorted_train_clickout_data.swifter.apply(
        lambda click: get_past_product_clickout_count(click, sorted_train_clickout_data),
        axis=1,
    )
)



HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=602158.0), HTML(value='')))




In [49]:
sorted_train_clickout_data.head()

Unnamed: 0,query_id,user_id,session_id,product_id,user_tier,product_price,page_type,previous_page_type,device_category,device_platform,user_country,country_platform,context_type,context_value,sess_step,days_elapsed,is_click,frequency_of_past_clickout_of_product
3,9446fb136df49975847f8c73e4f1b660a4103a9579582c...,55141265635f9f86832cfe6d30822e78cf1c619df8a5e3...,e3bfa8e1dc3068c434dd7c121229d394b74bc97509a865...,5266d3fcb681ff8d11fd8a42f5b06aa2729dfc4fd9ba1b...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,0.002205,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,95d5a1bd42a07143383299c081524278a16ef5ce55507c...,95d5a1bd42a07143383299c081524278a16ef5ce55507c...,product_id,8b43fe5a4ac6e21388b8a0a1000d30f565906ad239fe3d...,23,0,1,0
11,3ec6453701ac4285c916735f09d88f20720846b1ac0df9...,55141265635f9f86832cfe6d30822e78cf1c619df8a5e3...,e3bfa8e1dc3068c434dd7c121229d394b74bc97509a865...,b46915304330dc6eb5cc99a40cf712df67e16c3abba919...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,0.005379,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,95d5a1bd42a07143383299c081524278a16ef5ce55507c...,95d5a1bd42a07143383299c081524278a16ef5ce55507c...,product_id,68ffd8b6d0267886d1f4668099c269ab5d016774e3820e...,23,0,1,0
14,936c329320ffdac0aaba3874d2ad978b2d8e93e2f688d1...,9094f8baf87832740422bdadd76132089ffc637224b175...,451813b2c4dfcfc4787e9d99cf4ed8c3eab7704a024108...,5463fe247c3db709f9c38808a1279c5615a79847d8ebcc...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,0.001719,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,8008642632a9035202bbc1a28419d0bb5167c0488f3d0e...,2,0,1,0
24,7c08c7ae7fbe4b21a119b32071bc27b74ef3d5dfe7abfb...,c8443acb7ee67ea5e8f9726d93d6f92ea472bad8a5fd92...,df2432c9ee96db11af92bd2aae9521283c6a19fd4b3f92...,d407c33ed51cad6716b7052dc8d98e334f1f5d9a57bf83...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,0.000188,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,570d5cd5f5ec1be4a0a910cf18b51c228b5f63d09896a8...,2,0,1,0
30,394a4f6a4dbd161c5e7dfcf05e6129de0c5de62f630df7...,87b0c2f3de88284cf65531751e5a9546cdd29411ba65c6...,82d88fa219e8f5882380e38444c7b0018254be3d7a11af...,8424abbf2d1cbb1ec5e741ba8cc4302c891b31dfa2e02e...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,0.001109,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,95d5a1bd42a07143383299c081524278a16ef5ce55507c...,95d5a1bd42a07143383299c081524278a16ef5ce55507c...,product_id,769db6accd15e3cf489461cba947bff58bd14713586533...,1,0,1,0


In [56]:
# sorted_train_clickout_data[["query_id", "user_id", "session_id", "product_id", "days_elapsed", "frequency_of_past_clickout_of_product"]].to_csv("../preprocessed_data/clickout_features/frequency_of_past_clickout_of_product.csv", index=False)



In [57]:
def get_past_product_clickout_agg_price(row, df, aggfunc):
    """Aggregate ``product_price`` over earlier rows of ``row``'s product.

    "Earlier" means the same ``product_id`` and a strictly smaller
    ``days_elapsed`` than ``row``.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``'product_id'`` and ``'days_elapsed'``.
    df : pandas.DataFrame
        Frame to search; needs ``'product_id'``, ``'days_elapsed'`` and
        ``'product_price'`` columns.
    aggfunc : str or callable
        ``'mean'``, ``'max'`` or ``'min'`` as before, any other pandas
        aggregation name (``'median'``, ``'std'``, ...), or a callable that
        receives the Series of past prices.

    Returns
    -------
    scalar
        The aggregated price; NaN for the built-in reductions when the product
        has no earlier rows.

    Raises
    ------
    AttributeError
        If ``aggfunc`` is a string that is not a valid Series aggregation.
        Previously an unknown name silently produced ``None``.
    """
    is_past_same_product = (
        (df['product_id'] == row['product_id'])
        & (df['days_elapsed'] < row['days_elapsed'])
    )
    past_prices = df.loc[is_past_same_product, 'product_price']
    if callable(aggfunc):
        # Call directly: Series.agg would first try the callable element-wise.
        return aggfunc(past_prices)
    return past_prices.agg(aggfunc)
    

In [58]:
# Mean price over the product's clicks on strictly earlier days
# (NaN when the product has no earlier click).
sorted_train_clickout_data["mean_price_of_past_clickout_of_product"] = (
    sorted_train_clickout_data.swifter.apply(
        lambda click: get_past_product_clickout_agg_price(click, sorted_train_clickout_data, 'mean'),
        axis=1,
    )
)


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=602158.0), HTML(value='')))




In [62]:
# Highest price over the product's clicks on strictly earlier days
# (NaN when the product has no earlier click).
sorted_train_clickout_data["max_price_of_past_clickout_of_product"] = (
    sorted_train_clickout_data.swifter.apply(
        lambda click: get_past_product_clickout_agg_price(click, sorted_train_clickout_data, 'max'),
        axis=1,
    )
)


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=602158.0), HTML(value='')))




In [63]:
# Lowest price over the product's clicks on strictly earlier days
# (NaN when the product has no earlier click).
sorted_train_clickout_data["min_price_of_past_clickout_of_product"] = (
    sorted_train_clickout_data.swifter.apply(
        lambda click: get_past_product_clickout_agg_price(click, sorted_train_clickout_data, 'min'),
        axis=1,
    )
)


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=602158.0), HTML(value='')))




In [64]:
# sorted_train_clickout_data[["query_id", "user_id", "session_id", "product_id", "days_elapsed", "mean_price_of_past_clickout_of_product", "max_price_of_past_clickout_of_product", "min_price_of_past_clickout_of_product"]].to_csv("../preprocessed_data/clickout_features/mean_max_min_of_past_clickout_of_product.csv", index=False)


In [72]:
sorted_train_clickout_data.head()

Unnamed: 0,query_id,user_id,session_id,product_id,user_tier,product_price,page_type,previous_page_type,device_category,device_platform,...,country_platform,context_type,context_value,sess_step,days_elapsed,is_click,frequency_of_past_clickout_of_product,mean_price_of_past_clickout_of_product,max_price_of_past_clickout_of_product,min_price_of_past_clickout_of_product
3,9446fb136df49975847f8c73e4f1b660a4103a9579582c...,55141265635f9f86832cfe6d30822e78cf1c619df8a5e3...,e3bfa8e1dc3068c434dd7c121229d394b74bc97509a865...,5266d3fcb681ff8d11fd8a42f5b06aa2729dfc4fd9ba1b...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,0.002205,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,...,95d5a1bd42a07143383299c081524278a16ef5ce55507c...,product_id,8b43fe5a4ac6e21388b8a0a1000d30f565906ad239fe3d...,23,0,1,0,,,
11,3ec6453701ac4285c916735f09d88f20720846b1ac0df9...,55141265635f9f86832cfe6d30822e78cf1c619df8a5e3...,e3bfa8e1dc3068c434dd7c121229d394b74bc97509a865...,b46915304330dc6eb5cc99a40cf712df67e16c3abba919...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,0.005379,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,...,95d5a1bd42a07143383299c081524278a16ef5ce55507c...,product_id,68ffd8b6d0267886d1f4668099c269ab5d016774e3820e...,23,0,1,0,,,
14,936c329320ffdac0aaba3874d2ad978b2d8e93e2f688d1...,9094f8baf87832740422bdadd76132089ffc637224b175...,451813b2c4dfcfc4787e9d99cf4ed8c3eab7704a024108...,5463fe247c3db709f9c38808a1279c5615a79847d8ebcc...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,0.001719,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,8008642632a9035202bbc1a28419d0bb5167c0488f3d0e...,2,0,1,0,,,
24,7c08c7ae7fbe4b21a119b32071bc27b74ef3d5dfe7abfb...,c8443acb7ee67ea5e8f9726d93d6f92ea472bad8a5fd92...,df2432c9ee96db11af92bd2aae9521283c6a19fd4b3f92...,d407c33ed51cad6716b7052dc8d98e334f1f5d9a57bf83...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,0.000188,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,570d5cd5f5ec1be4a0a910cf18b51c228b5f63d09896a8...,2,0,1,0,,,
30,394a4f6a4dbd161c5e7dfcf05e6129de0c5de62f630df7...,87b0c2f3de88284cf65531751e5a9546cdd29411ba65c6...,82d88fa219e8f5882380e38444c7b0018254be3d7a11af...,8424abbf2d1cbb1ec5e741ba8cc4302c891b31dfa2e02e...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,0.001109,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,...,95d5a1bd42a07143383299c081524278a16ef5ce55507c...,product_id,769db6accd15e3cf489461cba947bff58bd14713586533...,1,0,1,0,,,


In [79]:
def get_prop_of_past_click_of_product_in_country(row, df):
    """Fraction of a product's earlier rows in ``df`` that share the row's country.

    Parameters
    ----------
    row : pandas.Series or mapping
        Must provide 'product_id', 'user_country', 'days_elapsed' and
        'frequency_of_past_clickout_of_product'.
    df : pandas.DataFrame
        Frame that is scanned; must have the columns 'product_id',
        'user_country' and 'days_elapsed'.
        NOTE(review): presumably one row per click-out -- confirm against the caller.

    Returns
    -------
    float
        Number of rows in ``df`` with the same product and country and a
        strictly smaller 'days_elapsed', divided by
        ``row['frequency_of_past_clickout_of_product']``. ``np.nan`` when that
        denominator is 0.
    """
    past_overall_clicks = row['frequency_of_past_clickout_of_product']
    # Check the denominator first so the full-frame scan is skipped when the
    # result is NaN anyway.
    if past_overall_clicks == 0:
        return np.nan

    is_earlier_same_product_country = (
        (df['product_id'] == row['product_id'])
        & (df['user_country'] == row['user_country'])
        & (df['days_elapsed'] < row['days_elapsed'])
    )
    # Summing the boolean mask counts the matches without building a filtered
    # copy of the frame on every call.
    past_country_clicks = int(is_earlier_same_product_country.sum())
    return past_country_clicks / past_overall_clicks
    
    

In [80]:
# Row-wise apply through the swifter accessor: every row is compared against
# the full click-out frame to obtain its country share.
sorted_train_clickout_data["prop_of_past_click_of_product_in_country"] = (
    sorted_train_clickout_data.swifter.apply(
        lambda clickout_row: get_prop_of_past_click_of_product_in_country(
            clickout_row, sorted_train_clickout_data
        ),
        axis=1,
    )
)


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=602158.0), HTML(value='')))




In [None]:
# Write the identifying columns plus the new feature to CSV (row index omitted).
sorted_train_clickout_data[
    [
        "query_id",
        "user_id",
        "session_id",
        "product_id",
        "user_country",
        "days_elapsed",
        "prop_of_past_click_of_product_in_country",
    ]
].to_csv(
    "../preprocessed_data/clickout_features/proportion_of_past_click_of_product_in_country.csv",
    index=False,
)


In [81]:
# Row count of the exported columns before and after de-duplication;
# equal numbers mean the selection contains no duplicate rows.
(
    sorted_train_clickout_data[
        ["query_id", "user_id", "session_id", "product_id", "user_country",
         "days_elapsed", "prop_of_past_click_of_product_in_country"]
    ].shape[0],
    sorted_train_clickout_data[
        ["query_id", "user_id", "session_id", "product_id", "user_country",
         "days_elapsed", "prop_of_past_click_of_product_in_country"]
    ].drop_duplicates().shape[0],
)




(602158, 602158)

In [53]:
# Spot check: mean price of one product over rows with days_elapsed below 38.
train.loc[
    (train['product_id'] == '1efd18182268101b62a1ea12a9cafbe05487f3abb9292418de3152f1ac87f8eb')
    & (train['days_elapsed'] < 38),
    'product_price',
].mean()



0.0005083

In [35]:
# Spot check: every train row for the same product.
train.loc[
    train['product_id'] == '1efd18182268101b62a1ea12a9cafbe05487f3abb9292418de3152f1ac87f8eb'
]

Unnamed: 0,query_id,user_id,session_id,product_id,user_tier,product_price,page_type,previous_page_type,device_category,device_platform,user_country,country_platform,context_type,context_value,sess_step,days_elapsed,is_click
0,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,ff2088c2fd3bf9cb8c2601252dd65a99201f59972820d3...,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,1efd18182268101b62a1ea12a9cafbe05487f3abb92924...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,0.000466,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d05d943023ee7c2091506a72ce25841e84f88a4233a11d...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,a6e9c2a832972484be2bd4378135743d1d00d651502c38...,1,38,0
1726713,7e193bbc57e45f512dc7b3d5fbba79261bc7eedfadc5fb...,b6660c7f762af3e7cc28ae9bd432ae10b3dccb01183656...,c2b3ac6baf7f5a7eb5bcf3df396b0ca6dd0a75973c625b...,1efd18182268101b62a1ea12a9cafbe05487f3abb92924...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,0.000508,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,14fc80e2d6821260d291ff47ce6d8d7534d4cb5aa4ab0d...,baf81edf7ae389bf0f52e3f6fff748ac04286ab68eca84...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,5fb64e8ae6e8f0b4217d71d30d5abc8f5f5bb3333c1b62...,1,28,1
1881241,896f2dea3117252f7487dbd175304eea7b14f1849b6b0e...,109dcceba2cd1a22ed925b3653596739ac66d6d46025fe...,d334bb89e442b79a7e8791f66fd31bdccd18cf9be8c3d5...,1efd18182268101b62a1ea12a9cafbe05487f3abb92924...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,0.000509,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,14fc80e2d6821260d291ff47ce6d8d7534d4cb5aa4ab0d...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,b3a0c5cfe399afbcf45c46ce87ff476dd48ab9283d1af6...,b3a0c5cfe399afbcf45c46ce87ff476dd48ab9283d1af6...,product_id,103ce936dc1fa9db91c92bb0013ecb8823ea244142b999...,2,14,0


In [89]:
# Spot check: every validation row for the same product.
val.loc[
    val['product_id'] == '1efd18182268101b62a1ea12a9cafbe05487f3abb9292418de3152f1ac87f8eb'
]

Unnamed: 0,query_id,user_id,session_id,product_id,page_type,previous_page_type,device_category,device_platform,user_tier,user_country,context_type,context_value,product_price,week,week_day,days_elapsed,sess_step,country_platform
639208,ee29ef679d5acd5e8b7ce7dcd3643084cae913327fbca1...,752af9c20ae2d1ece04e66285056207e136bc84bcaf0ba...,40949d5da2f356dc76393c965ce181e083e12850e8d7e8...,1efd18182268101b62a1ea12a9cafbe05487f3abb92924...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,5fb64e8ae6e8f0b4217d71d30d5abc8f5f5bb3333c1b62...,0.000291,7,1,40,1,ea2f413bd8fda0b91a814a68aa520044b204796991a343...


In [212]:
# Distinct product ids that have at least one row with is_click == 1 in train.
clicked_product_ids = train.loc[train["is_click"] == 1, "product_id"].unique().tolist()

In [213]:
# Order train by days_elapsed, renumber the index, then keep only rows whose
# product appears in clicked_product_ids (the filter runs after the renumbering,
# so the surviving index labels may have gaps).
new_train = (
    train.sort_values(["days_elapsed"])
    .reset_index(drop=True)
    .loc[lambda ordered: ordered["product_id"].isin(clicked_product_ids)]
)


In [214]:
# Preview the first five rows of the filtered frame.
new_train.head(n=5)

Unnamed: 0,query_id,user_id,session_id,product_id,user_tier,product_price,page_type,previous_page_type,device_category,device_platform,user_country,country_platform,context_type,context_value,week,week_day,sess_step,days_elapsed,is_click
0,4835c7d4d1d678f9acf993848d7c98c4b9b079d43483b7...,90faf5e4d75b3a5e221e8d1ee5be73b50b79769c20421e...,3862c8e286648dd60d8075aad76ef076413603faa7dd44...,14ce9a6c60e9e7b43a9f0c77ebe9ad5d9f59154d38d0fe...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,0.000391,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,f319f71fd9b14532cf715e601fe3b5c2473b58d4302e88...,f319f71fd9b14532cf715e601fe3b5c2473b58d4302e88...,product_id,24a49e04cdea49621059651a6348540fc69828aac66a5d...,1,3,9,0,0
2,9446fb136df49975847f8c73e4f1b660a4103a9579582c...,55141265635f9f86832cfe6d30822e78cf1c619df8a5e3...,e3bfa8e1dc3068c434dd7c121229d394b74bc97509a865...,e63066db6803db9747d8472d3c4d0e6cf85cf1ce972736...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,0.002131,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,95d5a1bd42a07143383299c081524278a16ef5ce55507c...,95d5a1bd42a07143383299c081524278a16ef5ce55507c...,product_id,8b43fe5a4ac6e21388b8a0a1000d30f565906ad239fe3d...,1,3,23,0,0
3,9446fb136df49975847f8c73e4f1b660a4103a9579582c...,55141265635f9f86832cfe6d30822e78cf1c619df8a5e3...,e3bfa8e1dc3068c434dd7c121229d394b74bc97509a865...,5266d3fcb681ff8d11fd8a42f5b06aa2729dfc4fd9ba1b...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,0.002205,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,95d5a1bd42a07143383299c081524278a16ef5ce55507c...,95d5a1bd42a07143383299c081524278a16ef5ce55507c...,product_id,8b43fe5a4ac6e21388b8a0a1000d30f565906ad239fe3d...,1,3,23,0,1
4,9446fb136df49975847f8c73e4f1b660a4103a9579582c...,55141265635f9f86832cfe6d30822e78cf1c619df8a5e3...,e3bfa8e1dc3068c434dd7c121229d394b74bc97509a865...,4fd8ad45465739666a567a92c5edd60145e07bfd275312...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,0.001776,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,95d5a1bd42a07143383299c081524278a16ef5ce55507c...,95d5a1bd42a07143383299c081524278a16ef5ce55507c...,product_id,8b43fe5a4ac6e21388b8a0a1000d30f565906ad239fe3d...,1,3,23,0,0
5,3ec6453701ac4285c916735f09d88f20720846b1ac0df9...,55141265635f9f86832cfe6d30822e78cf1c619df8a5e3...,e3bfa8e1dc3068c434dd7c121229d394b74bc97509a865...,e2954d99e7a718b423ebd3585c1775d1f84f5adbf86bf4...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,0.004704,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,95d5a1bd42a07143383299c081524278a16ef5ce55507c...,95d5a1bd42a07143383299c081524278a16ef5ce55507c...,product_id,68ffd8b6d0267886d1f4668099c269ab5d016774e3820e...,1,3,23,0,0


In [216]:
# Distinct week values present in train and in validation.
train["week"].unique(), val["week"].unique()

(array([6, 7, 3, 5, 4, 2, 8, 1], dtype=int8),
 array([7, 6, 8, 9, 3, 2, 4, 5], dtype=int8))

In [217]:
# Mean product_price per (product_id, week) over new_train, with the value
# column renamed to the feature name used downstream.
# NOTE(review): the displayed result has NaN rows for all weeks of a product,
# which suggests product_id is categorical and unobserved combinations are
# emitted -- confirm.
product_mean_price_by_week = (
    new_train.groupby(["product_id", "week"])
    .agg({"product_price": "mean"})
    .reset_index()
    .rename(columns={"product_price": "clickout_product_mean_price"})
)
product_mean_price_by_week.head(n=5)

Unnamed: 0,product_id,week,clickout_product_mean_price
0,00002185a176ed9deaabd2f8b5eceb6a3039f1c8596e45...,1,
1,00002185a176ed9deaabd2f8b5eceb6a3039f1c8596e45...,2,
2,00002185a176ed9deaabd2f8b5eceb6a3039f1c8596e45...,3,
3,00002185a176ed9deaabd2f8b5eceb6a3039f1c8596e45...,4,
4,00002185a176ed9deaabd2f8b5eceb6a3039f1c8596e45...,5,


In [218]:
# Build the full product x week grid: every product id is paired with
# weeks 1..8, in product order and then week order.
unique_prod_ids = product_mean_price_by_week["product_id"].unique().tolist()
ids = [prod_id for prod_id in unique_prod_ids for _ in range(8)]
weeks = [1, 2, 3, 4, 5, 6, 7, 8] * len(unique_prod_ids)

new_df = pd.DataFrame({'product_id': ids, 'week': weeks})


In [219]:
# Right-join onto the product x week grid so every grid row is present; pairs
# without a computed mean price get NaN in clickout_product_mean_price.
t = product_mean_price_by_week.merge(new_df, on=["product_id", "week"], how='right')

# Week number -> label lookup, built once instead of once per row.
WEEK_LABELS = {week_no: "week_" + str(week_no) for week_no in range(1, 9)}


def get_week(row):
    """Return the "week_<n>" label for ``row["week"]``.

    Raises ``KeyError`` when the week is not one of 1..8.
    """
    return WEEK_LABELS[row["week"]]


# Vectorised relabelling of the whole column (replaces the row-wise apply).
week_labels = t["week"].map(WEEK_LABELS)
if week_labels.isna().any():
    # Keep the fail-fast behaviour of a dict lookup for unexpected week values.
    raise KeyError("week values outside 1..8 found in 't'")
t["week"] = week_labels
t.head(14)

HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=3266104.0), HTML(value='')))




Unnamed: 0,product_id,week,clickout_product_mean_price
0,00002185a176ed9deaabd2f8b5eceb6a3039f1c8596e45...,week_1,
1,00002185a176ed9deaabd2f8b5eceb6a3039f1c8596e45...,week_2,
2,00002185a176ed9deaabd2f8b5eceb6a3039f1c8596e45...,week_3,
3,00002185a176ed9deaabd2f8b5eceb6a3039f1c8596e45...,week_4,
4,00002185a176ed9deaabd2f8b5eceb6a3039f1c8596e45...,week_5,
5,00002185a176ed9deaabd2f8b5eceb6a3039f1c8596e45...,week_6,
6,00002185a176ed9deaabd2f8b5eceb6a3039f1c8596e45...,week_7,
7,00002185a176ed9deaabd2f8b5eceb6a3039f1c8596e45...,week_8,
8,000029d3d499cbd642ac8d2b79028d4d1e048f915cdfcd...,week_1,
9,000029d3d499cbd642ac8d2b79028d4d1e048f915cdfcd...,week_2,


In [220]:
# Missing product/week prices are set to 0, then the week labels are spread
# into columns (one row per product_id).
t = t.fillna({"clickout_product_mean_price": 0}).pivot_table(
    values=['clickout_product_mean_price'],
    index=['product_id'],
    columns=['week'],
    aggfunc=np.mean,
)
# Collapse the two-level column labels into single "<value>_<week>" strings.
t.columns = ["_".join(level_pair) for level_pair in t.columns]
t = t.reset_index()
t.head(n=5)


Unnamed: 0,product_id,clickout_product_mean_price_week_1,clickout_product_mean_price_week_2,clickout_product_mean_price_week_3,clickout_product_mean_price_week_4,clickout_product_mean_price_week_5,clickout_product_mean_price_week_6,clickout_product_mean_price_week_7,clickout_product_mean_price_week_8
0,00002185a176ed9deaabd2f8b5eceb6a3039f1c8596e45...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,000029d3d499cbd642ac8d2b79028d4d1e048f915cdfcd...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,00007316b79817657a13be236d31f5ec3970fb0e7d66e0...,0.00134,0.001337,0.001337,0.0,0.0,0.001357,0.001357,0.0
3,00008a8e96b8602a19be7341577ae45e9bfee0716b6b12...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,00008f66579dee0738eb52386a02af5b5d9b16d6bf05f4...,0.0,0.000133,0.000126,0.00012,0.0,0.000109,0.000111,0.0


In [224]:
# t.to_csv("../preprocessed_data/clickout_features/clicked_product_prices_by_week.csv", index=False)

In [237]:
# Run a full garbage-collection pass; the cell displays the value returned by gc.collect().
gc.collect()

80

In [235]:
# Sum of is_click per (product_id, week) over the full train frame, with the
# week cast to str so it can be joined into column labels below.
weekly_product_clicks = (
    train.groupby(['product_id', 'week'])
    .agg({'is_click': 'sum'})
    .reset_index()
)
weekly_product_clicks["week"] = weekly_product_clicks["week"].astype(str)

# One row per product, one column per week.
weekly_product_clicks = weekly_product_clicks.pivot_table(
    values=['is_click'],
    index=['product_id'],
    columns=['week'],
    aggfunc=np.mean,
)
# Collapse the two-level column labels into "is_click_<week>" strings.
weekly_product_clicks.columns = [
    "_".join(level_pair) for level_pair in weekly_product_clicks.columns
]
weekly_product_clicks = weekly_product_clicks.reset_index()

# is_click_<n> -> click_frequency_week_<n> for weeks 1..8.
weekly_product_clicks = weekly_product_clicks.rename(
    columns={
        "is_click_" + str(week_no): "click_frequency_week_" + str(week_no)
        for week_no in range(1, 9)
    }
)
weekly_product_clicks.head(n=5)


Unnamed: 0,product_id,click_frequency_week_1,click_frequency_week_2,click_frequency_week_3,click_frequency_week_4,click_frequency_week_5,click_frequency_week_6,click_frequency_week_7,click_frequency_week_8
0,00002185a176ed9deaabd2f8b5eceb6a3039f1c8596e45...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,000029d3d499cbd642ac8d2b79028d4d1e048f915cdfcd...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,00007316b79817657a13be236d31f5ec3970fb0e7d66e0...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,00008a8e96b8602a19be7341577ae45e9bfee0716b6b12...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,00008f66579dee0738eb52386a02af5b5d9b16d6bf05f4...,0.0,0.0,3.0,0.0,0.0,0.0,3.0,0.0


In [238]:
# weekly_product_clicks.to_csv("../preprocessed_data/clickout_features/clicked_product_frequency_by_week.csv", index=False)
