In [1]:
# ! pip install fastparquet --user
# ! pip install ordered-set

In [2]:
import pandas as pd
import numpy as np
import gc
from tqdm import tqdm
from collections import defaultdict
from ordered_set import OrderedSet
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MinMaxScaler, StandardScaler


In [3]:
# Load the phase-1 train and validation splits from parquet files.
train = pd.read_parquet("../data_phase1/train.parquet")
val = pd.read_parquet("../data_phase1/validation.parquet")


In [4]:
# Replace missing context_type values with the literal string "NA";
# only that column is filled, other columns keep their missing values.
train = train.fillna(value={"context_type": "NA"})
val = val.fillna(value={"context_type": "NA"})


In [5]:
# Summary (count / unique / top / freq) of the object-dtype columns of train.
train.describe(include='O')

Unnamed: 0,query_id,user_id,session_id,product_id,page_type,previous_page_type,device_category,device_platform,user_tier,user_country,context_type,context_value
count,3507990,3507990,3507990,3507990,3507990,3507990,3507990,3507990,3507990,3507990,3507990,3507990
unique,584665,208393,317426,408263,5,23,3,2,6,196,5,189571
top,abc40264df8164d3b04b4cb65f5f79fb980eccb7c5b5d0...,cc83479dd22e19ec45d08805a61b73d7f33a69feaf42be...,f3de26eced2c81d1b0d6da40c11c9f987fe066b5a4f4fd...,55e1495c40504b4b15a358f95e2cbede34d011b287c32b...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...
freq,6,4428,354,1728,3105120,2053848,1853280,1927596,2714292,520824,3105204,152514


In [6]:
# Summary (count / unique / top / freq) of the object-dtype columns of val.
val.describe(include='O')

Unnamed: 0,query_id,user_id,session_id,product_id,page_type,previous_page_type,device_category,device_platform,user_tier,user_country,context_type,context_value
count,687192,687192,687192,687192,687192,687192,687192,687192,687192,687192,687192,687192
unique,114532,114532,114532,218525,5,22,3,2,6,188,5,66955
top,74da5fccd1c4557b0b115170462d3ad11c333cb0649b94...,0ee43e390fd64fb8f8325a7e11e8a4eea7f102a72e4b54...,7b59327d02ceb17d708469f17118bceae9f7a191ec481d...,fe13af44356050cdc93ad3d5e458e24c5077e5bf7a4c12...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...
freq,6,6,6,398,666960,513864,411180,423654,605262,102696,666990,15342


In [7]:
def reduce_mem_usage(df, allow_float16=True):
    """Downcast the columns of ``df`` to smaller dtypes to save memory.

    The frame is modified column by column and the same object is returned.

    * Integer columns (signed or unsigned) are cast to the smallest signed
      integer type whose range contains the column's min and max.
    * Float columns are cast to float16 / float32 when the value range fits,
      otherwise float64.
    * ``object`` columns are converted to ``category``.
    * Every other dtype (bool, datetime, category, extension dtypes, ...) is
      left untouched, so the function can safely be called again on its own
      output.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are reassigned in place.
    allow_float16 : bool, default True
        When False, float columns are never stored below float32. float16
        keeps only about three significant decimal digits, so pass False for
        columns whose precision matters.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy integer / float columns are range-checked; taking
        # min()/max() of e.g. an unordered categorical would raise, and a bool
        # column must not be turned into floats.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            # Inclusive bounds: a column spanning exactly -128..127 fits int8.
            # If nothing fits (e.g. huge uint64 values, or an empty column
            # whose min/max are NaN) the column is left as it is.
            for target in int_targets:
                bounds = np.iinfo(target)
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(target)
                    break
        else:
            for target in float_targets:
                bounds = np.finfo(target)
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(target)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN column.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero-sized starting frame.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df


In [8]:
# Shrink both frames. reduce_mem_usage reassigns columns on the frame it is
# given and returns that same frame, so the rebinding here is for clarity.
train = reduce_mem_usage(train)
val = reduce_mem_usage(val)


Memory usage of dataframe is 428.22 MB
Memory usage after optimization is: 175.85 MB
Decreased by 58.9%
Memory usage of dataframe is 78.64 MB
Memory usage after optimization is: 47.95 MB
Decreased by 39.0%


In [9]:
# Force a garbage-collection pass; the displayed number is gc.collect()'s return value.
gc.collect()

22

In [10]:
# Order each split by query_id and renumber the rows 0..n-1.
train = train.sort_values('query_id', ignore_index=True)
val = val.sort_values('query_id', ignore_index=True)


In [11]:
# Preview the first 10 rows of train.
train.head(10)

Unnamed: 0,query_id,user_id,session_id,product_id,page_type,previous_page_type,device_category,device_platform,user_tier,user_country,context_type,context_value,product_price,week,week_day,is_click
0,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,ff2088c2fd3bf9cb8c2601252dd65a99201f59972820d3...,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,1efd18182268101b62a1ea12a9cafbe05487f3abb92924...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d05d943023ee7c2091506a72ce25841e84f88a4233a11d...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,a6e9c2a832972484be2bd4378135743d1d00d651502c38...,0.000466,6,6,0
1,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,ff2088c2fd3bf9cb8c2601252dd65a99201f59972820d3...,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,62349c6eda0dc9fe8bb023213f03ebe93aefa5cbcdfecf...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d05d943023ee7c2091506a72ce25841e84f88a4233a11d...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,a6e9c2a832972484be2bd4378135743d1d00d651502c38...,0.001068,6,6,0
2,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,ff2088c2fd3bf9cb8c2601252dd65a99201f59972820d3...,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,aa0bbdfa55326b5c08d3472b1ee1d56fe13a82f63f46c8...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d05d943023ee7c2091506a72ce25841e84f88a4233a11d...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,a6e9c2a832972484be2bd4378135743d1d00d651502c38...,0.001238,6,6,1
3,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,ff2088c2fd3bf9cb8c2601252dd65a99201f59972820d3...,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,793301d78ea43b599acf05d350c8f9e485f5deaa417284...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d05d943023ee7c2091506a72ce25841e84f88a4233a11d...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,a6e9c2a832972484be2bd4378135743d1d00d651502c38...,0.000465,6,6,0
4,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,ff2088c2fd3bf9cb8c2601252dd65a99201f59972820d3...,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,9e03ca4af958bef1ce5d54e684a40ee7e3a9aa7e5010b6...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d05d943023ee7c2091506a72ce25841e84f88a4233a11d...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,a6e9c2a832972484be2bd4378135743d1d00d651502c38...,0.000936,6,6,0
5,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,ff2088c2fd3bf9cb8c2601252dd65a99201f59972820d3...,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,9e7f6f10b18f35ce924575f599465265e22fa9d75b13c9...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d05d943023ee7c2091506a72ce25841e84f88a4233a11d...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,a6e9c2a832972484be2bd4378135743d1d00d651502c38...,0.001765,6,6,0
6,00008bcf52759e7e70d533026da637b1cfeb0dc04a67ed...,968f154cc29e3fb1ed5f361ab23d9d8495cf47c5baefef...,f391f67a9d273be5a95460ce4756cdbc22430336a0b28d...,b08c37eac9adf04388a290b13105167b4a50911ac14e5c...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,ca0bdabcd3a4eae25db6f95687cb307200e78892a6d774...,0.000804,7,2,0
7,00008bcf52759e7e70d533026da637b1cfeb0dc04a67ed...,968f154cc29e3fb1ed5f361ab23d9d8495cf47c5baefef...,f391f67a9d273be5a95460ce4756cdbc22430336a0b28d...,7b8c68e43264904ec9f74df2b9461fe7381aab12d0740e...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,ca0bdabcd3a4eae25db6f95687cb307200e78892a6d774...,0.000722,7,2,0
8,00008bcf52759e7e70d533026da637b1cfeb0dc04a67ed...,968f154cc29e3fb1ed5f361ab23d9d8495cf47c5baefef...,f391f67a9d273be5a95460ce4756cdbc22430336a0b28d...,616636f3c60f43d24741d9eca7fe1c3c567825acc64348...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,ca0bdabcd3a4eae25db6f95687cb307200e78892a6d774...,0.000916,7,2,1
9,00008bcf52759e7e70d533026da637b1cfeb0dc04a67ed...,968f154cc29e3fb1ed5f361ab23d9d8495cf47c5baefef...,f391f67a9d273be5a95460ce4756cdbc22430336a0b28d...,01f0a57f4fbc0e42e76960238fce01c28e29f23819ae04...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,ca0bdabcd3a4eae25db6f95687cb307200e78892a6d774...,0.000761,7,2,0


In [12]:
# Number of missing values per column in train.
train.isnull().sum()

query_id              0
user_id               0
session_id            0
product_id            0
page_type             0
previous_page_type    0
device_category       0
device_platform       0
user_tier             0
user_country          0
context_type          0
context_value         0
product_price         0
week                  0
week_day              0
is_click              0
dtype: int64

In [13]:
# Stack train (without its label column) on top of val with a fresh 0..n-1 index.
# Dropping the column by name keeps train's column order; the previous
# set-difference selection produced an arbitrary column order from run to run.
# errors='ignore' keeps the old tolerance for a frame that has no 'is_click' column.
data = pd.concat(
    [train.drop(columns=['is_click'], errors='ignore'), val],
    axis=0,
    ignore_index=True,
)

In [14]:
# Every product_id in the stacked frame, as plain Python strings (duplicates kept).
all_products = [str(product) for product in data.product_id]

In [15]:
# De-duplicate the product ids with OrderedSet from the ordered-set package
# (presumably preserving first-seen order — confirm against that package's docs).
unique_products = OrderedSet(all_products)


In [16]:
# session_id -> list of product_ids for that session, one mapping per split.
train_session_interactions = train.groupby('session_id')['product_id'].apply(list).to_dict()
val_session_interactions = val.groupby('session_id')['product_id'].apply(list).to_dict()

# session_id -> list of is_click values, built with the same grouping of the same
# frame as train_session_interactions, so the two lists line up by position.
train_session_actions = train.groupby('session_id')['is_click'].apply(list).to_dict()
# No val_session_actions mapping is built (the original line for it is disabled):
# val_session_actions = val.groupby('session_id')['is_click'].apply(list).to_dict()


In [17]:
# Linear day counter: whole weeks before the current one times 7, plus week_day.
# Both inputs are widened with astype(int) before the arithmetic.
train['days_elapsed'] = train['week_day'].astype(int) + 7 * (train['week'].astype(int) - 1)
val['days_elapsed'] = val['week_day'].astype(int) + 7 * (val['week'].astype(int) - 1)


In [18]:
# Rank each row's days_elapsed inside its session (ties share the highest rank),
# then integer-divide by 6 to turn the row rank into a coarser step counter.
# NOTE(review): the divisor 6 looks like the number of rows per query — confirm.
train['sess_step'] = (
    train.groupby('session_id')['days_elapsed'].rank(method='max').astype('int64') // 6
)
val['sess_step'] = (
    val.groupby('session_id')['days_elapsed'].rank(method='max').astype('int64') // 6
)


In [19]:
# Force a garbage-collection pass; the displayed number is gc.collect()'s return value.
gc.collect()

22

In [20]:
# Distinct queries and products per session, busiest sessions first.
# Several columns must be selected with a list ([[...]]); the bare
# 'a', 'b' form is deprecated on a GroupBy and rejected by current pandas.
train.groupby('session_id')[['query_id', 'product_id']].nunique().sort_values('query_id', ascending=False).reset_index()

  """Entry point for launching an IPython kernel.


Unnamed: 0,session_id,query_id,product_id
0,f3de26eced2c81d1b0d6da40c11c9f987fe066b5a4f4fd...,59,171
1,d4063dbeb0907bdf8357cc3770ffe4e06610750799f52e...,49,144
2,752198f5ef8d176e6424fed1eb5081ee88d1e19363b3fb...,47,141
3,0554524c36c55f5b616353b445753d2380fc8db6b6abb8...,44,104
4,7f6b35cf7ea4ea3276545691bcf7742519f9653d5a5f93...,41,149
...,...,...,...
317421,691c97d24d512107a328eb65844a8cb95049083f806eca...,1,6
317422,691cb0d24567617d5311882c4f40b035693b83d7b4fec6...,1,6
317423,691d2384e9ee198688d6efbb5663642704f7523f5866e5...,1,6
317424,691d28775c47acf25c9e25d6b3ca029a83ade8773d9178...,1,6


In [21]:
def get_prod_freq(row):
    """Count how often the row's product appears in the row's session.

    Occurrences are summed over the module-level lookups
    ``train_session_interactions`` and ``val_session_interactions``
    (session_id -> list of product_ids). A session missing from one or both
    lookups contributes 0 for that lookup.

    Parameters
    ----------
    row : mapping-like
        Must provide ``row['session_id']`` and ``row['product_id']``.

    Returns
    -------
    int
        Total number of matching entries across both lookups.

    Raises
    ------
    KeyError
        If ``row`` lacks ``'session_id'`` or ``'product_id'``. These are read
        up front so that a malformed row is reported instead of being silently
        counted as 0.
    """
    session = row['session_id']
    product = row['product_id']

    total = 0
    for lookup in (train_session_interactions, val_session_interactions):
        # .get() tolerates sessions that exist in only one of the lookups.
        total += sum(1 for prod in lookup.get(session, ()) if prod == product)
    return total
        

In [22]:
# Row-wise feature: occurrences of the row's product within the row's session.
train['product_session_frequency'] = train.apply(get_prod_freq, axis=1)
val['product_session_frequency'] = val.apply(get_prod_freq, axis=1)


In [23]:
# Show the two train rows with the largest product_session_frequency.
train.sort_values('product_session_frequency', ascending=False).head(2)

Unnamed: 0,query_id,user_id,session_id,product_id,page_type,previous_page_type,device_category,device_platform,user_tier,user_country,context_type,context_value,product_price,week,week_day,is_click,days_elapsed,sess_step,product_session_frequency
3172579,e75f542ae040ad8ef25013c5c023c1762f26ba6d6308f2...,4277ace003ca24df59394fd61b467e96a06cda5d64a7e7...,389729c7f87b9cb1e7870f0914a125ffb66097807c6b03...,7a2b9b37a106299c035246edf069fa212bbef82df66df1...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,9cb5ac3125e399b247d031c37bec739e8b35d85c2d0a5e...,0.001139,7,5,0,47,23,18
2512989,b76769a047f7093be6cb3062f7454cc1343909a72866ab...,4277ace003ca24df59394fd61b467e96a06cda5d64a7e7...,389729c7f87b9cb1e7870f0914a125ffb66097807c6b03...,7a2b9b37a106299c035246edf069fa212bbef82df66df1...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,6972062bb374ad69d74ff8a25aafe92e3cbd041331aea8...,0.001139,7,5,0,47,23,18


In [42]:
# Force a garbage-collection pass; the displayed number is gc.collect()'s return value.
gc.collect()

88

In [34]:
def get_prod_clickout_proportion(row):
    """Share of a session's clicked entries that belong to the row's product.

    Uses the module-level lookups ``train_session_interactions``
    (session_id -> product_ids) and ``train_session_actions``
    (session_id -> is_click flags), pairing the two lists by position.
    Only the train lookups are consulted.

    NOTE(review): both lookups are built from train's own ``is_click`` labels,
    so when this is applied to train rows the feature includes each row's own
    target — check for label leakage before using it for training.

    Parameters
    ----------
    row : mapping-like
        Must provide ``row['session_id']`` and ``row['product_id']``.

    Returns
    -------
    float
        ``matching clicked entries / all clicked entries`` for the session, or
        ``np.nan`` when the session is unknown or has no clicked entry.

    Raises
    ------
    KeyError
        If ``row`` lacks ``'session_id'`` or ``'product_id'``. Previously such
        a row was swallowed mid-loop and could yield a misleading 0.0.
    """
    session = row['session_id']
    product = row['product_id']

    # Look each list up once instead of on every loop iteration; a session
    # missing from either lookup simply has no clicked entries.
    products = train_session_interactions.get(session, ())
    actions = train_session_actions.get(session, ())

    clicked_products = [prod for prod, action in zip(products, actions) if action == 1]
    if not clicked_products:
        return np.nan

    matches = sum(1 for prod in clicked_products if prod == product)
    return matches / len(clicked_products)


In [27]:
# Row-wise feature on train: share of the session's clicks that hit this product.
train['product_session_click_proportion'] = train.apply(get_prod_clickout_proportion, axis=1)

In [36]:
# Same feature on val; the function only consults train lookups, so a val
# session that never occurs in train yields NaN.
val['product_session_click_proportion'] = val.apply(get_prod_clickout_proportion, axis=1)

In [39]:
# Stack train and val row-wise (original row labels are kept).
# axis must be passed by keyword: pandas 2.x no longer accepts it positionally.
data = pd.concat([train, val], axis=0)

In [40]:
# One row per distinct (session, product, frequency, click-proportion) combination.
product_click_features_by_session = data.loc[
    :,
    ["session_id", "product_id", "product_session_frequency", "product_session_click_proportion"],
].drop_duplicates()



In [48]:
# Number of distinct query_ids seen in each session.
session_impression_count_df = (
    data.groupby('session_id')['query_id']
    .nunique()
    .rename('session_impression_count')
    .reset_index()
)
session_impression_count_df.head()
                                           

Unnamed: 0,session_id,session_impression_count
0,00000310a3874db670d94513217ac53d73be25ea0bc1c9...,2
1,00002bea5c0c174264e846ce5beb196fd5b36f1d64e4d8...,1
2,000055d87de3ef0e5c614ce4ec6c04a37405d767332477...,1
3,00005e9475a26ea9798de9f24b7c43273122c4fd4db841...,2
4,0000b16e532d23c2c8a904b438408bbed6728cdf18eed3...,2


In [49]:
# Attach the per-session impression count; left join keeps every feature row.
product_click_features_by_session = pd.merge(
    product_click_features_by_session,
    session_impression_count_df,
    how='left',
    on="session_id",
)


In [52]:
# Mean product_price over all rows of each session.
session_mean_product_price = (
    data.groupby('session_id')['product_price']
    .mean()
    .rename('mean_session_product_price')
    .reset_index()
)
session_mean_product_price.head()


Unnamed: 0,session_id,mean_session_product_price
0,00000310a3874db670d94513217ac53d73be25ea0bc1c9...,0.000731
1,00002bea5c0c174264e846ce5beb196fd5b36f1d64e4d8...,0.000184
2,000055d87de3ef0e5c614ce4ec6c04a37405d767332477...,0.00074
3,00005e9475a26ea9798de9f24b7c43273122c4fd4db841...,0.004536
4,0000b16e532d23c2c8a904b438408bbed6728cdf18eed3...,0.005707


In [53]:
# Attach the per-session mean price; left join keeps every feature row.
product_click_features_by_session = pd.merge(
    product_click_features_by_session,
    session_mean_product_price,
    how='left',
    on="session_id",
)


In [55]:
# product_click_features_by_session['price_deviation_from_mean_session_price'] = product_click_features_by_session['product_price'] - product_click_features_by_session['mean_session_product_price']


In [56]:
# Highest product_price among the rows of each session.
session_max_product_price = (
    data.groupby('session_id')['product_price']
    .max()
    .rename('max_session_product_price')
    .reset_index()
)
session_max_product_price.head()


Unnamed: 0,session_id,max_session_product_price
0,00000310a3874db670d94513217ac53d73be25ea0bc1c9...,0.000963
1,00002bea5c0c174264e846ce5beb196fd5b36f1d64e4d8...,0.000474
2,000055d87de3ef0e5c614ce4ec6c04a37405d767332477...,0.001316
3,00005e9475a26ea9798de9f24b7c43273122c4fd4db841...,0.038818
4,0000b16e532d23c2c8a904b438408bbed6728cdf18eed3...,0.018616


In [58]:
# Lowest product_price among the rows of each session.
session_min_product_price = (
    data.groupby('session_id')['product_price']
    .min()
    .rename('min_session_product_price')
    .reset_index()
)
session_min_product_price.head()


Unnamed: 0,session_id,min_session_product_price
0,00000310a3874db670d94513217ac53d73be25ea0bc1c9...,0.000659
1,00002bea5c0c174264e846ce5beb196fd5b36f1d64e4d8...,6.7e-05
2,000055d87de3ef0e5c614ce4ec6c04a37405d767332477...,0.000149
3,00005e9475a26ea9798de9f24b7c43273122c4fd4db841...,0.000416
4,0000b16e532d23c2c8a904b438408bbed6728cdf18eed3...,0.000822


In [59]:
# Attach the per-session max and then min price; both are left joins on session_id.
product_click_features_by_session = (
    product_click_features_by_session
    .merge(session_max_product_price, how='left', on="session_id")
    .merge(session_min_product_price, how='left', on="session_id")
)


In [60]:
# Preview the first rows of the assembled feature table.
product_click_features_by_session.head()

Unnamed: 0,session_id,product_id,product_session_frequency,product_session_click_proportion,session_impression_count,mean_session_product_price,max_session_product_price,min_session_product_price
0,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,1efd18182268101b62a1ea12a9cafbe05487f3abb92924...,1,0.0,2,0.001093,0.001816,0.000465
1,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,62349c6eda0dc9fe8bb023213f03ebe93aefa5cbcdfecf...,1,0.0,2,0.001093,0.001816,0.000465
2,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,aa0bbdfa55326b5c08d3472b1ee1d56fe13a82f63f46c8...,1,1.0,2,0.001093,0.001816,0.000465
3,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,793301d78ea43b599acf05d350c8f9e485f5deaa417284...,1,0.0,2,0.001093,0.001816,0.000465
4,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,9e03ca4af958bef1ce5d54e684a40ee7e3a9aa7e5010b6...,1,0.0,2,0.001093,0.001816,0.000465


In [61]:
# Write the feature table to CSV; index=False omits the row index column.
product_click_features_by_session.to_csv('../preprocessed_data/product_click_features_by_session.csv', index=False)



In [62]:
# Missing values per feature column. get_prod_clickout_proportion returns NaN
# when it counts no clicked entry for a session, which is where NaNs in
# product_session_click_proportion come from.
product_click_features_by_session.isnull().sum()

session_id                               0
product_id                               0
product_session_frequency                0
product_session_click_proportion    226776
session_impression_count                 0
mean_session_product_price               0
max_session_product_price                0
min_session_product_price                0
dtype: int64

In [None]:
# Feature rows with the largest product_session_frequency (default head of 5).
product_click_features_by_session.sort_values('product_session_frequency', ascending=False).head()

In [None]:
# Scratch lookup: session_id -> (product_id, is_click).
# NOTE(review): the key is session_id alone, so each later row of a session
# overwrites the earlier one and only the session's last pair survives —
# confirm that is intended rather than a list of pairs per session.
k = {
    session: (product, clicked)
    for session, product, clicked in zip(
        train.session_id.tolist(), train.product_id.tolist(), train.is_click.tolist()
    )
}


In [None]:
# Look up one hard-coded session id in the scratch mapping k.
k['37b65411191e2f12441edad785c6ae94741eceaffdcb80cdb66520df9352d763']

In [None]:
# (val product_ids that never occur in train, distinct product_ids in val)
len(set(val.product_id.unique()).difference(train.product_id.unique())), len(val.product_id.unique())

In [None]:
# (val user_ids that never occur in train, distinct user_ids in val)
len(set(val.user_id.unique()).difference(train.user_id.unique())), len(val.user_id.unique())

In [None]:
# (val session_ids that never occur in train, distinct session_ids in val)
len(set(val.session_id.unique()).difference(train.session_id.unique())), len(val.session_id.unique())