In [None]:
# ! pip install fastparquet --user
# ! pip install ordered-set
# ! pip install swifter --user

In [2]:
import pandas as pd
import swifter
import numpy as np
import gc
from tqdm import tqdm
from collections import defaultdict
from ordered_set import OrderedSet
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MinMaxScaler, StandardScaler


In [3]:
# Load the phase-1 train and validation tables from parquet (paths are relative to
# the notebook's working directory).
train = pd.read_parquet("../data_phase1/train.parquet")
val = pd.read_parquet("../data_phase1/validation.parquet")


In [4]:
# Replace missing `context_type` values with the literal "NA" in both frames;
# no other column is filled.
context_fill = {"context_type": "NA"}
train = train.fillna(value=context_fill)
val = val.fillna(value=context_fill)


In [5]:
# Summarise the object-dtype columns of train (count / unique / top / freq).
train.describe(include='O')

Unnamed: 0,query_id,user_id,session_id,product_id,page_type,previous_page_type,device_category,device_platform,user_tier,user_country,context_type,context_value
count,3507990,3507990,3507990,3507990,3507990,3507990,3507990,3507990,3507990,3507990,3507990,3507990
unique,584665,208393,317426,408263,5,23,3,2,6,196,5,189571
top,cb21670cb4af4b9a8f1623e43a1d29cb60f1ae96cbbdb4...,cc83479dd22e19ec45d08805a61b73d7f33a69feaf42be...,f3de26eced2c81d1b0d6da40c11c9f987fe066b5a4f4fd...,55e1495c40504b4b15a358f95e2cbede34d011b287c32b...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...
freq,6,4428,354,1728,3105120,2053848,1853280,1927596,2714292,520824,3105204,152514


In [6]:
# Same object-dtype summary for the validation frame.
val.describe(include='O')

Unnamed: 0,query_id,user_id,session_id,product_id,page_type,previous_page_type,device_category,device_platform,user_tier,user_country,context_type,context_value
count,687192,687192,687192,687192,687192,687192,687192,687192,687192,687192,687192,687192
unique,114532,114532,114532,218525,5,22,3,2,6,188,5,66955
top,748ea9fd8615989f0242efb43fe9383038db2816fa5b97...,9553c8a9c7b7d522f072a9414fd7722302649d87ba9b8e...,e63518443b20d5373f6d9b3d2126ea8ecaeb98483aa0ae...,fe13af44356050cdc93ad3d5e458e24c5077e5bf7a4c12...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...
freq,6,6,6,398,666960,513864,411180,423654,605262,102696,666990,15342


In [7]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` to smaller dtypes and print the memory saved.

    The frame is modified in place and also returned.

    * object columns are converted to ``category``;
    * signed / unsigned integer columns are cast to the narrowest integer type of
      the same signedness whose range contains the column's min and max;
    * float columns are cast to float16, else float32, else float64, based on the
      column's min and max;
    * every other dtype (bool, datetime, timedelta, complex, and pandas extension
      dtypes such as ``category`` or nullable integers) is left untouched, which
      also makes the function safe to call twice on the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Column labels are assumed to be unique.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.

    Notes
    -----
    The float16 downcast only checks the value *range*; float16 keeps roughly
    three significant decimal digits, so values are rounded.
    NOTE(review): confirm this precision loss is acceptable for downstream features.

    Reported sizes use ``memory_usage()`` without ``deep=True``, so the payload of
    object columns is not included in the "before" figure.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_candidates = (np.int8, np.int16, np.int32, np.int64)
    unsigned_candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (category, nullable ints, strings, ...) are skipped:
        # min()/max() on them can raise and there is nothing safe to downcast.
        if not isinstance(col_type, np.dtype):
            continue

        kind = col_type.kind
        if kind == 'O':
            df[col] = df[col].astype('category')
            continue
        if kind not in ('i', 'u', 'f'):
            # bool, datetime64, timedelta64, complex: leave as they are.
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if kind in ('i', 'u'):
            candidates = signed_candidates if kind == 'i' else unsigned_candidates
            for candidate in candidates:
                bounds = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's limit still fits.
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No match (e.g. empty column, min/max are NaN): keep the dtype.
        else:
            target = np.float64
            for candidate in float_candidates:
                bounds = np.finfo(candidate)
                if bounds.min <= c_min and c_max <= bounds.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero-byte frame so the report never divides by zero.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df


In [8]:
# Shrink both frames (train first, then val, so the printed reports keep that order).
train, val = (reduce_mem_usage(frame) for frame in (train, val))


Memory usage of dataframe is 428.22 MB
Memory usage after optimization is: 175.85 MB
Decreased by 58.9%
Memory usage of dataframe is 78.64 MB
Memory usage after optimization is: 47.95 MB
Decreased by 39.0%


In [9]:
# Force a garbage-collection pass; the displayed integer is the number of
# unreachable objects the collector found.
gc.collect()

22

In [10]:
# Order each frame by query_id and renumber the rows 0..n-1.
train, val = (
    frame.sort_values(by='query_id').reset_index(drop=True)
    for frame in (train, val)
)


In [11]:
# Preview the first ten rows of train after sorting by query_id.
train.head(10)

Unnamed: 0,query_id,user_id,session_id,product_id,page_type,previous_page_type,device_category,device_platform,user_tier,user_country,context_type,context_value,product_price,week,week_day,is_click
0,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,ff2088c2fd3bf9cb8c2601252dd65a99201f59972820d3...,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,1efd18182268101b62a1ea12a9cafbe05487f3abb92924...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d05d943023ee7c2091506a72ce25841e84f88a4233a11d...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,a6e9c2a832972484be2bd4378135743d1d00d651502c38...,0.000466,6,6,0
1,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,ff2088c2fd3bf9cb8c2601252dd65a99201f59972820d3...,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,62349c6eda0dc9fe8bb023213f03ebe93aefa5cbcdfecf...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d05d943023ee7c2091506a72ce25841e84f88a4233a11d...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,a6e9c2a832972484be2bd4378135743d1d00d651502c38...,0.001068,6,6,0
2,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,ff2088c2fd3bf9cb8c2601252dd65a99201f59972820d3...,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,aa0bbdfa55326b5c08d3472b1ee1d56fe13a82f63f46c8...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d05d943023ee7c2091506a72ce25841e84f88a4233a11d...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,a6e9c2a832972484be2bd4378135743d1d00d651502c38...,0.001238,6,6,1
3,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,ff2088c2fd3bf9cb8c2601252dd65a99201f59972820d3...,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,793301d78ea43b599acf05d350c8f9e485f5deaa417284...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d05d943023ee7c2091506a72ce25841e84f88a4233a11d...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,a6e9c2a832972484be2bd4378135743d1d00d651502c38...,0.000465,6,6,0
4,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,ff2088c2fd3bf9cb8c2601252dd65a99201f59972820d3...,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,9e03ca4af958bef1ce5d54e684a40ee7e3a9aa7e5010b6...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d05d943023ee7c2091506a72ce25841e84f88a4233a11d...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,a6e9c2a832972484be2bd4378135743d1d00d651502c38...,0.000936,6,6,0
5,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,ff2088c2fd3bf9cb8c2601252dd65a99201f59972820d3...,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,9e7f6f10b18f35ce924575f599465265e22fa9d75b13c9...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d05d943023ee7c2091506a72ce25841e84f88a4233a11d...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,a6e9c2a832972484be2bd4378135743d1d00d651502c38...,0.001765,6,6,0
6,00008bcf52759e7e70d533026da637b1cfeb0dc04a67ed...,968f154cc29e3fb1ed5f361ab23d9d8495cf47c5baefef...,f391f67a9d273be5a95460ce4756cdbc22430336a0b28d...,b08c37eac9adf04388a290b13105167b4a50911ac14e5c...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,ca0bdabcd3a4eae25db6f95687cb307200e78892a6d774...,0.000804,7,2,0
7,00008bcf52759e7e70d533026da637b1cfeb0dc04a67ed...,968f154cc29e3fb1ed5f361ab23d9d8495cf47c5baefef...,f391f67a9d273be5a95460ce4756cdbc22430336a0b28d...,7b8c68e43264904ec9f74df2b9461fe7381aab12d0740e...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,ca0bdabcd3a4eae25db6f95687cb307200e78892a6d774...,0.000722,7,2,0
8,00008bcf52759e7e70d533026da637b1cfeb0dc04a67ed...,968f154cc29e3fb1ed5f361ab23d9d8495cf47c5baefef...,f391f67a9d273be5a95460ce4756cdbc22430336a0b28d...,616636f3c60f43d24741d9eca7fe1c3c567825acc64348...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,ca0bdabcd3a4eae25db6f95687cb307200e78892a6d774...,0.000916,7,2,1
9,00008bcf52759e7e70d533026da637b1cfeb0dc04a67ed...,968f154cc29e3fb1ed5f361ab23d9d8495cf47c5baefef...,f391f67a9d273be5a95460ce4756cdbc22430336a0b28d...,01f0a57f4fbc0e42e76960238fce01c28e29f23819ae04...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,ca0bdabcd3a4eae25db6f95687cb307200e78892a6d774...,0.000761,7,2,0


In [12]:
# Number of missing values per column in train.
train.isnull().sum()

query_id              0
user_id               0
session_id            0
product_id            0
page_type             0
previous_page_type    0
device_category       0
device_platform       0
user_tier             0
user_country          0
context_type          0
context_value         0
product_price         0
week                  0
week_day              0
is_click              0
dtype: int64

In [13]:
# Stack train (without the label) on top of validation.
# `drop(columns=...)` keeps train's original column order; a set difference over
# the column names yields an arbitrary order that can change between runs.
# `errors='ignore'` keeps the cell working if `is_click` is already absent.
data = pd.concat([train.drop(columns=['is_click'], errors='ignore'), val], axis=0)
data = data.reset_index(drop=True)

In [14]:
# Every product_id in the combined frame, as plain Python strings (duplicates kept).
all_products = [str(product) for product in data["product_id"]]

In [15]:
# De-duplicate the product ids; OrderedSet keeps them in first-seen order.
unique_products = OrderedSet(all_products)


In [16]:
# session_id -> list of product_ids appearing in that session.
train_session_interactions = train.groupby('session_id')['product_id'].apply(list).to_dict()
val_session_interactions = val.groupby('session_id')['product_id'].apply(list).to_dict()

# session_id -> list of is_click labels for the same rows (train only).
train_session_actions = train.groupby('session_id')['is_click'].apply(list).to_dict()
# val_session_actions = dict(val.groupby('session_id')['is_click'].apply(list))


In [17]:
# Collapse (week, week_day) into a single day counter: (week - 1) * 7 + week_day.
# Both columns are cast to int first so the arithmetic is not done in the narrow
# integer dtype assigned by reduce_mem_usage.
for frame in (train, val):
    week_number = frame['week'].astype(int)
    day_in_week = frame['week_day'].astype(int)
    frame['days_elapsed'] = (week_number - 1) * 7 + day_in_week

In [18]:
# Re-base both frames so that day 0 is the earliest day seen in train.
# val must be shifted first: once train is shifted, its minimum becomes 0 and the
# original offset can no longer be read from it.
val['days_elapsed'] = val['days_elapsed'] - train['days_elapsed'].min()
train['days_elapsed'] = train['days_elapsed'] - train['days_elapsed'].min()


In [19]:
# Distinct day offsets present in validation. No further re-basing is done here:
# train's days_elapsed already starts at 0 after it was shifted by its own minimum,
# so subtracting train's minimum again would only subtract 0.
val.days_elapsed.unique()

array([43, 38, 41, 52, 53, 49, 11, 45, 46,  9, 48,  8, 20, 42, 34, 44, 40,
       21, 39, 19, 14, 30, 47,  6, 33, 23, 17, 28,  5,  7, 13, 25])

In [20]:
# Force another garbage-collection pass before loading the attribute table.
gc.collect()

88

In [21]:
# Load the product attribute table; it is joined to the interaction frames on product_id.
attribute_data = pd.read_parquet("../data_phase1/attributes.parquet")

In [22]:
# Attach product attributes to every train row; rows whose product_id has no
# attribute record are kept (left join) with NaN attributes.
train_attr_data = pd.merge(train, attribute_data, how='left', on='product_id')


In [23]:
# Move the label to the last column: pull it out, drop it, re-append it.
# `columns=` is used instead of a positional axis argument (`drop([...], 1)`),
# which pandas deprecated in 1.3 and no longer accepts from 2.0 on.
is_click = train_attr_data['is_click'].values
train_attr_data = train_attr_data.drop(columns=['is_click'])
train_attr_data['is_click'] = is_click


In [24]:
# Preview the joined frame; is_click should now be the last column.
train_attr_data.head(2)

Unnamed: 0,query_id,user_id,session_id,product_id,page_type,previous_page_type,device_category,device_platform,user_tier,user_country,...,collection,category_id_l1,category_id_l2,category_id_l3,brand_id,season_year,start_online_date,material_values,attribute_values,is_click
0,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,ff2088c2fd3bf9cb8c2601252dd65a99201f59972820d3...,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,1efd18182268101b62a1ea12a9cafbe05487f3abb92924...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d05d943023ee7c2091506a72ce25841e84f88a4233a11d...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,...,a3791e8d85c005b0d9d60d6d3b7e8edd2f256a5cc928d0...,bb660069187af9e9238d10a742def09bf2bed60435b088...,663ca810e6cd391f1a8c24b0e39c4ac59f7e04c4759356...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,09e1d96d9e05b0bcab8bcba3943b899adb5177a8a4fe81...,dc1194ba428d5cd4c49f8a769a6577ac1042162da38bc1...,573.041667,4632872aedf88d5e81361e0389833dd1be3bcafc7e6b58...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,0
1,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,ff2088c2fd3bf9cb8c2601252dd65a99201f59972820d3...,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,62349c6eda0dc9fe8bb023213f03ebe93aefa5cbcdfecf...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d05d943023ee7c2091506a72ce25841e84f88a4233a11d...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,...,4c8006c7e513057a9138641abc2a9e65f4b014a8045259...,abf367e49718254e068eee51a565fdcede2d741e4c7e33...,61fe255948ec07c4eb25c70f7144b54beddd00466ad866...,ed7da2b533e9ccdf3dcaede4d2b8d170ffc32e2b922ee8...,2b225d501fd0ecd875cc3412eb5a14f96e98abc7b2f6bc...,ef7d8d2e02aec8c328dafde95486f7181c37d07be3d167...,262.041667,f61ecea9b45f1590e57706b88207449bdd4cb703b917ad...,c22d2095ea26985f55d9ec999807dac2190b8be5a14397...,0


In [25]:
# Attribute view of the combined train + validation frame: keep only the id
# columns and left-join the product attributes on product_id.
id_columns = ["query_id", "product_id", "user_id", "session_id"]
full_attribute_data = pd.merge(data[id_columns], attribute_data, how='left', on='product_id')



In [61]:
# Preview the combined attribute frame.
# NOTE(review): this cell's execution count (61) is out of sequence with its
# neighbours — re-run the notebook top to bottom before relying on the shown output.
full_attribute_data.head(3)

Unnamed: 0,query_id,product_id,user_id,session_id,gender,main_colour,second_colour,season,collection,category_id_l1,category_id_l2,category_id_l3,brand_id,season_year,start_online_date,material_values,attribute_values
0,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,1efd18182268101b62a1ea12a9cafbe05487f3abb92924...,ff2088c2fd3bf9cb8c2601252dd65a99201f59972820d3...,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,a8c9cca4c116691f1e331a5058e84f05e31696bc4f611c...,8770603751ae3b965ab13bdd02c607985cd56be86d850b...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,9db35d237f5873f0410d3ca18c07430270086eb1e7838d...,a3791e8d85c005b0d9d60d6d3b7e8edd2f256a5cc928d0...,bb660069187af9e9238d10a742def09bf2bed60435b088...,663ca810e6cd391f1a8c24b0e39c4ac59f7e04c4759356...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,09e1d96d9e05b0bcab8bcba3943b899adb5177a8a4fe81...,dc1194ba428d5cd4c49f8a769a6577ac1042162da38bc1...,573.041667,4632872aedf88d5e81361e0389833dd1be3bcafc7e6b58...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...
1,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,62349c6eda0dc9fe8bb023213f03ebe93aefa5cbcdfecf...,ff2088c2fd3bf9cb8c2601252dd65a99201f59972820d3...,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,a8c9cca4c116691f1e331a5058e84f05e31696bc4f611c...,0f97dafafa5dc4bb18853ea00776dfcc52302f40411b50...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,9db35d237f5873f0410d3ca18c07430270086eb1e7838d...,4c8006c7e513057a9138641abc2a9e65f4b014a8045259...,abf367e49718254e068eee51a565fdcede2d741e4c7e33...,61fe255948ec07c4eb25c70f7144b54beddd00466ad866...,ed7da2b533e9ccdf3dcaede4d2b8d170ffc32e2b922ee8...,2b225d501fd0ecd875cc3412eb5a14f96e98abc7b2f6bc...,ef7d8d2e02aec8c328dafde95486f7181c37d07be3d167...,262.041667,f61ecea9b45f1590e57706b88207449bdd4cb703b917ad...,c22d2095ea26985f55d9ec999807dac2190b8be5a14397...
2,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,aa0bbdfa55326b5c08d3472b1ee1d56fe13a82f63f46c8...,ff2088c2fd3bf9cb8c2601252dd65a99201f59972820d3...,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,a8c9cca4c116691f1e331a5058e84f05e31696bc4f611c...,752e245892cce8aa87b8f41c583f95c7059cbaa4e9d61b...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,9db35d237f5873f0410d3ca18c07430270086eb1e7838d...,4c8006c7e513057a9138641abc2a9e65f4b014a8045259...,abf367e49718254e068eee51a565fdcede2d741e4c7e33...,61fe255948ec07c4eb25c70f7144b54beddd00466ad866...,ed7da2b533e9ccdf3dcaede4d2b8d170ffc32e2b922ee8...,2b225d501fd0ecd875cc3412eb5a14f96e98abc7b2f6bc...,ef7d8d2e02aec8c328dafde95486f7181c37d07be3d167...,226.041667,2a9ca66cf16c629b4d0fb6d78e988a199db1490ef21d78...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...


# Brand

In [26]:
# Impressions per brand: number of rows with a non-null query_id for each brand_id
# in the combined train + validation frame. Only `query_id` is counted, instead of
# counting every column and then discarding all but one.
brand_impression_data = (
    full_attribute_data.groupby('brand_id')['query_id']
    .count()
    .reset_index(name='brand_impression_count')
)
brand_impression_data.head()



Unnamed: 0,brand_id,brand_impression_count
0,001a11cdc499400927ae65d8fad803cff44c64be4035f5...,658
1,003df714e5a9897245f0dcb93f1c04bc025cfa2be37bc7...,118
2,0045d654875981a3692893e40fd72bba8bb054c8b63e84...,2
3,004861d660ace63dfd4b549590fcb1a9e3e316a1a45e62...,18
4,0049be50020812a0f5811102c38f97c1e24bee3622bd82...,94


In [27]:
# Training rows that were clicked (is_click == 1), with their product attributes.
clicked_mask = train_attr_data['is_click'].eq(1)
click_data = train_attr_data.loc[clicked_mask]

In [28]:
# Clicks per brand: number of clicked training rows for each brand_id.
# Only `query_id` is counted rather than every column of the frame.
brand_click_data = (
    click_data.groupby('brand_id')['query_id']
    .count()
    .reset_index(name='brand_click_count')
)
brand_click_data.head()


Unnamed: 0,brand_id,brand_click_count
0,001a11cdc499400927ae65d8fad803cff44c64be4035f5...,95
1,003df714e5a9897245f0dcb93f1c04bc025cfa2be37bc7...,14
2,004861d660ace63dfd4b549590fcb1a9e3e316a1a45e62...,1
3,0049be50020812a0f5811102c38f97c1e24bee3622bd82...,16
4,005c0db6f52c1f797f3e24459a1120d4815bd08c8aeddd...,2


In [29]:
# Left-join click counts onto impression counts; brands absent from the click
# table come through as NaN and are filled with 0.
brand_data = (
    brand_impression_data
    .merge(brand_click_data, how='left', on='brand_id')
    .fillna(0)
)
brand_data.head()

Unnamed: 0,brand_id,brand_impression_count,brand_click_count
0,001a11cdc499400927ae65d8fad803cff44c64be4035f5...,658,95.0
1,003df714e5a9897245f0dcb93f1c04bc025cfa2be37bc7...,118,14.0
2,0045d654875981a3692893e40fd72bba8bb054c8b63e84...,2,0.0
3,004861d660ace63dfd4b549590fcb1a9e3e316a1a45e62...,18,1.0
4,0049be50020812a0f5811102c38f97c1e24bee3622bd82...,94,16.0


In [30]:
# Popularity score = clicks / impressions per brand.
# NOTE(review): clicks are counted on train only, while impressions are counted on
# train + validation, so the ratio is lower than the train-only click rate —
# confirm this is intended.
brand_data["brand_popularity_score"] = brand_data["brand_click_count"].div(
    brand_data["brand_impression_count"]
)

In [31]:
# Preview the brand table including the new popularity score.
brand_data.head()

Unnamed: 0,brand_id,brand_impression_count,brand_click_count,brand_popularity_score
0,001a11cdc499400927ae65d8fad803cff44c64be4035f5...,658,95.0,0.144377
1,003df714e5a9897245f0dcb93f1c04bc025cfa2be37bc7...,118,14.0,0.118644
2,0045d654875981a3692893e40fd72bba8bb054c8b63e84...,2,0.0,0.0
3,004861d660ace63dfd4b549590fcb1a9e3e316a1a45e62...,18,1.0,0.055556
4,0049be50020812a0f5811102c38f97c1e24bee3622bd82...,94,16.0,0.170213


In [32]:
# brand_data.to_csv("../preprocessed_data/brand_popularity_df.csv", index=False)

# Category L1

In [35]:
# Impressions per level-1 category: rows with a non-null query_id for each
# category_id_l1 in the combined train + validation frame. Only `query_id` is
# counted, instead of counting every column and keeping one.
category_id_l1_impression_data = (
    full_attribute_data.groupby('category_id_l1')['query_id']
    .count()
    .reset_index(name='category_id_l1_impression_count')
)
category_id_l1_impression_data.head()



Unnamed: 0,category_id_l1,category_id_l1_impression_count
0,01bdbf7ae5ca741c0db30ebea8c606042cbabac5e24217...,16442
1,0248828a70652add64d469c493415bb53a03b4b103a59a...,1233
2,02cf092784c2b661328d08172a50ad60ee5bbccd4d8d10...,14997
3,09c394b9d5af703e13e57e80009cd0fb43b84416ba2e7b...,1164
4,174386292851a161e3dd1792776ff0f50123b7d07d93d1...,527


In [41]:
# Clicks per level-1 category: number of clicked training rows for each
# category_id_l1. Only `query_id` is counted rather than every column.
category_id_l1_click_data = (
    click_data.groupby('category_id_l1')['query_id']
    .count()
    .reset_index(name='category_id_l1_click_count')
)
category_id_l1_click_data.head()


Unnamed: 0,category_id_l1,category_id_l1_click_count
0,01bdbf7ae5ca741c0db30ebea8c606042cbabac5e24217...,2164
1,0248828a70652add64d469c493415bb53a03b4b103a59a...,165
2,02cf092784c2b661328d08172a50ad60ee5bbccd4d8d10...,2251
3,09c394b9d5af703e13e57e80009cd0fb43b84416ba2e7b...,186
4,174386292851a161e3dd1792776ff0f50123b7d07d93d1...,71


In [42]:
# Left-join click counts onto impression counts; categories absent from the click
# table come through as NaN and are filled with 0.
category_id_l1_data = (
    category_id_l1_impression_data
    .merge(category_id_l1_click_data, how='left', on='category_id_l1')
    .fillna(0)
)
category_id_l1_data.head()


Unnamed: 0,category_id_l1,category_id_l1_impression_count,category_id_l1_click_count
0,01bdbf7ae5ca741c0db30ebea8c606042cbabac5e24217...,16442,2164
1,0248828a70652add64d469c493415bb53a03b4b103a59a...,1233,165
2,02cf092784c2b661328d08172a50ad60ee5bbccd4d8d10...,14997,2251
3,09c394b9d5af703e13e57e80009cd0fb43b84416ba2e7b...,1164,186
4,174386292851a161e3dd1792776ff0f50123b7d07d93d1...,527,71


In [43]:
# Popularity score = clicks / impressions per level-1 category.
# NOTE(review): clicks come from train only while impressions span train +
# validation — confirm this is intended.
category_id_l1_data["category_id_l1_popularity_score"] = category_id_l1_data[
    "category_id_l1_click_count"
].div(category_id_l1_data["category_id_l1_impression_count"])
category_id_l1_data.head()


Unnamed: 0,category_id_l1,category_id_l1_impression_count,category_id_l1_click_count,category_id_l1_popularity_score
0,01bdbf7ae5ca741c0db30ebea8c606042cbabac5e24217...,16442,2164,0.131614
1,0248828a70652add64d469c493415bb53a03b4b103a59a...,1233,165,0.13382
2,02cf092784c2b661328d08172a50ad60ee5bbccd4d8d10...,14997,2251,0.150097
3,09c394b9d5af703e13e57e80009cd0fb43b84416ba2e7b...,1164,186,0.159794
4,174386292851a161e3dd1792776ff0f50123b7d07d93d1...,527,71,0.134725


In [44]:
# category_id_l1_data.to_csv("../preprocessed_data/category_id_l1_popularity_df.csv", index=False)


# Category L2

In [45]:
# Impressions per level-2 category: rows with a non-null query_id for each
# category_id_l2 in the combined train + validation frame. Only `query_id` is
# counted, instead of counting every column and keeping one.
category_id_l2_impression_data = (
    full_attribute_data.groupby('category_id_l2')['query_id']
    .count()
    .reset_index(name='category_id_l2_impression_count')
)
category_id_l2_impression_data.head()



Unnamed: 0,category_id_l2,category_id_l2_impression_count
0,000651f17486a862c075e5a2a0131a750abeea6db55ad3...,1695
1,000bdb044ab0dc94653d0b77046c8f6d4d22f2ac342773...,563
2,009a4ca993df6dd9828587d23ba38be14f1f6b7172516d...,3059
3,00ebf4a9f1cbba7fe21b3690246defdc438326eebb10d3...,2300
4,017f77b88550e40b287c7097544fc432ab8227595ed1ce...,12


In [46]:
# Clicks per level-2 category: number of clicked training rows for each
# category_id_l2. Only `query_id` is counted rather than every column.
category_id_l2_click_data = (
    click_data.groupby('category_id_l2')['query_id']
    .count()
    .reset_index(name='category_id_l2_click_count')
)
category_id_l2_click_data.head()


Unnamed: 0,category_id_l2,category_id_l2_click_count
0,000651f17486a862c075e5a2a0131a750abeea6db55ad3...,229
1,000bdb044ab0dc94653d0b77046c8f6d4d22f2ac342773...,81
2,009a4ca993df6dd9828587d23ba38be14f1f6b7172516d...,444
3,00ebf4a9f1cbba7fe21b3690246defdc438326eebb10d3...,309
4,017f77b88550e40b287c7097544fc432ab8227595ed1ce...,1


In [47]:
# Left-join click counts onto impression counts; categories absent from the click
# table come through as NaN and are filled with 0.
category_id_l2_data = (
    category_id_l2_impression_data
    .merge(category_id_l2_click_data, how='left', on='category_id_l2')
    .fillna(0)
)
category_id_l2_data.head()


Unnamed: 0,category_id_l2,category_id_l2_impression_count,category_id_l2_click_count
0,000651f17486a862c075e5a2a0131a750abeea6db55ad3...,1695,229.0
1,000bdb044ab0dc94653d0b77046c8f6d4d22f2ac342773...,563,81.0
2,009a4ca993df6dd9828587d23ba38be14f1f6b7172516d...,3059,444.0
3,00ebf4a9f1cbba7fe21b3690246defdc438326eebb10d3...,2300,309.0
4,017f77b88550e40b287c7097544fc432ab8227595ed1ce...,12,1.0


In [48]:
# Popularity score = clicks / impressions per level-2 category.
# NOTE(review): clicks come from train only while impressions span train +
# validation — confirm this is intended.
category_id_l2_data["category_id_l2_popularity_score"] = category_id_l2_data[
    "category_id_l2_click_count"
].div(category_id_l2_data["category_id_l2_impression_count"])
category_id_l2_data.head()


Unnamed: 0,category_id_l2,category_id_l2_impression_count,category_id_l2_click_count,category_id_l2_popularity_score
0,000651f17486a862c075e5a2a0131a750abeea6db55ad3...,1695,229.0,0.135103
1,000bdb044ab0dc94653d0b77046c8f6d4d22f2ac342773...,563,81.0,0.143872
2,009a4ca993df6dd9828587d23ba38be14f1f6b7172516d...,3059,444.0,0.145145
3,00ebf4a9f1cbba7fe21b3690246defdc438326eebb10d3...,2300,309.0,0.134348
4,017f77b88550e40b287c7097544fc432ab8227595ed1ce...,12,1.0,0.083333


In [50]:
# category_id_l2_data.to_csv("../preprocessed_data/category_id_l2_popularity_df.csv", index=False)


# Main Colour


In [51]:
# Impressions per main colour: rows with a non-null query_id for each main_colour
# in the combined train + validation frame. Only `query_id` is counted, instead of
# counting every column and keeping one.
main_colour_impression_data = (
    full_attribute_data.groupby('main_colour')['query_id']
    .count()
    .reset_index(name='main_colour_impression_count')
)
main_colour_impression_data.head()



Unnamed: 0,main_colour,main_colour_impression_count
0,0078d3ef1479189e5a922b27e2540a7bb1d10eb9ec0581...,64150
1,0a89539f0dde241e4171b2daa052a1ff94c13b85532e0e...,24456
2,0f97dafafa5dc4bb18853ea00776dfcc52302f40411b50...,429952
3,4737cd35940c2338e96c18a25aeb6848d46f0da795bce8...,17550
4,58d4862ee98f76d3e192f544dbe081472c716b66cf7928...,266552


In [52]:
# Clicks per main colour: number of clicked training rows for each main_colour.
# Only `query_id` is counted rather than every column.
main_colour_click_data = (
    click_data.groupby('main_colour')['query_id']
    .count()
    .reset_index(name='main_colour_click_count')
)
main_colour_click_data.head()


Unnamed: 0,main_colour,main_colour_click_count
0,0078d3ef1479189e5a922b27e2540a7bb1d10eb9ec0581...,9005
1,0a89539f0dde241e4171b2daa052a1ff94c13b85532e0e...,3617
2,0f97dafafa5dc4bb18853ea00776dfcc52302f40411b50...,62540
3,4737cd35940c2338e96c18a25aeb6848d46f0da795bce8...,2447
4,58d4862ee98f76d3e192f544dbe081472c716b66cf7928...,38752


In [53]:
# Left-join click counts onto impression counts; colours absent from the click
# table come through as NaN and are filled with 0.
main_colour_data = (
    main_colour_impression_data
    .merge(main_colour_click_data, how='left', on='main_colour')
    .fillna(0)
)
main_colour_data.head()


Unnamed: 0,main_colour,main_colour_impression_count,main_colour_click_count
0,0078d3ef1479189e5a922b27e2540a7bb1d10eb9ec0581...,64150,9005
1,0a89539f0dde241e4171b2daa052a1ff94c13b85532e0e...,24456,3617
2,0f97dafafa5dc4bb18853ea00776dfcc52302f40411b50...,429952,62540
3,4737cd35940c2338e96c18a25aeb6848d46f0da795bce8...,17550,2447
4,58d4862ee98f76d3e192f544dbe081472c716b66cf7928...,266552,38752


In [54]:
# Popularity score = clicks / impressions per main colour.
# NOTE(review): clicks come from train only while impressions span train +
# validation — confirm this is intended.
main_colour_data["main_colour_popularity_score"] = main_colour_data[
    "main_colour_click_count"
].div(main_colour_data["main_colour_impression_count"])
main_colour_data.head()


Unnamed: 0,main_colour,main_colour_impression_count,main_colour_click_count,main_colour_popularity_score
0,0078d3ef1479189e5a922b27e2540a7bb1d10eb9ec0581...,64150,9005,0.140374
1,0a89539f0dde241e4171b2daa052a1ff94c13b85532e0e...,24456,3617,0.147898
2,0f97dafafa5dc4bb18853ea00776dfcc52302f40411b50...,429952,62540,0.145458
3,4737cd35940c2338e96c18a25aeb6848d46f0da795bce8...,17550,2447,0.13943
4,58d4862ee98f76d3e192f544dbe081472c716b66cf7928...,266552,38752,0.145383


In [55]:
# main_colour_data.to_csv("../preprocessed_data/main_colour_popularity_df.csv", index=False)


# Second Colour

In [56]:
# Impressions per second colour: rows with a non-null query_id for each
# second_colour in the combined train + validation frame. Only `query_id` is
# counted, instead of counting every column and keeping one.
second_colour_impression_data = (
    full_attribute_data.groupby('second_colour')['query_id']
    .count()
    .reset_index(name='second_colour_impression_count')
)
second_colour_impression_data.head()



Unnamed: 0,second_colour,second_colour_impression_count
0,0078d3ef1479189e5a922b27e2540a7bb1d10eb9ec0581...,35145
1,0a89539f0dde241e4171b2daa052a1ff94c13b85532e0e...,12633
2,0f97dafafa5dc4bb18853ea00776dfcc52302f40411b50...,54373
3,3cf10b7e1460bf9131817644937b860b13ec960b82758b...,415
4,4737cd35940c2338e96c18a25aeb6848d46f0da795bce8...,25671


In [57]:
# Clicks per second colour: number of clicked training rows for each second_colour.
# Only `query_id` is counted rather than every column.
second_colour_click_data = (
    click_data.groupby('second_colour')['query_id']
    .count()
    .reset_index(name='second_colour_click_count')
)
second_colour_click_data.head()


Unnamed: 0,second_colour,second_colour_click_count
0,0078d3ef1479189e5a922b27e2540a7bb1d10eb9ec0581...,4744
1,0a89539f0dde241e4171b2daa052a1ff94c13b85532e0e...,1822
2,0f97dafafa5dc4bb18853ea00776dfcc52302f40411b50...,7775
3,3cf10b7e1460bf9131817644937b860b13ec960b82758b...,49
4,4737cd35940c2338e96c18a25aeb6848d46f0da795bce8...,3698


In [58]:
# Left-join click counts onto impression counts; colours absent from the click
# table come through as NaN and are filled with 0.
second_colour_data = (
    second_colour_impression_data
    .merge(second_colour_click_data, how='left', on='second_colour')
    .fillna(0)
)
second_colour_data.head()


Unnamed: 0,second_colour,second_colour_impression_count,second_colour_click_count
0,0078d3ef1479189e5a922b27e2540a7bb1d10eb9ec0581...,35145,4744.0
1,0a89539f0dde241e4171b2daa052a1ff94c13b85532e0e...,12633,1822.0
2,0f97dafafa5dc4bb18853ea00776dfcc52302f40411b50...,54373,7775.0
3,3cf10b7e1460bf9131817644937b860b13ec960b82758b...,415,49.0
4,4737cd35940c2338e96c18a25aeb6848d46f0da795bce8...,25671,3698.0


In [59]:
# Popularity score = clicks / impressions per second colour.
# NOTE(review): clicks come from train only while impressions span train +
# validation — confirm this is intended.
second_colour_data["second_colour_popularity_score"] = second_colour_data[
    "second_colour_click_count"
].div(second_colour_data["second_colour_impression_count"])
second_colour_data.head()


Unnamed: 0,second_colour,second_colour_impression_count,second_colour_click_count,second_colour_popularity_score
0,0078d3ef1479189e5a922b27e2540a7bb1d10eb9ec0581...,35145,4744.0,0.134984
1,0a89539f0dde241e4171b2daa052a1ff94c13b85532e0e...,12633,1822.0,0.144225
2,0f97dafafa5dc4bb18853ea00776dfcc52302f40411b50...,54373,7775.0,0.142994
3,3cf10b7e1460bf9131817644937b860b13ec960b82758b...,415,49.0,0.118072
4,4737cd35940c2338e96c18a25aeb6848d46f0da795bce8...,25671,3698.0,0.144054


In [60]:
# second_colour_data.to_csv("../preprocessed_data/second_colour_popularity_df.csv", index=False)


# Collection

In [62]:
# Impressions per collection: rows with a non-null query_id for each collection
# in the combined train + validation frame. Only `query_id` is counted, instead of
# counting every column and keeping one.
collection_impression_data = (
    full_attribute_data.groupby('collection')['query_id']
    .count()
    .reset_index(name='collection_impression_count')
)
collection_impression_data.head()



Unnamed: 0,collection,collection_impression_count
0,3e5d88c58066b4182e2eb9e0e2206aaba3a46b193e4788...,18102
1,4c8006c7e513057a9138641abc2a9e65f4b014a8045259...,2070924
2,6611f47aa127beb2b2fcb372eab3945274b5e742af07c2...,31066
3,a3791e8d85c005b0d9d60d6d3b7e8edd2f256a5cc928d0...,1574209
4,b5beb2ed60981746e0b908797b3d3abb8491de15ff5575...,425039


In [63]:
# Clicks per collection: number of clicked training rows for each collection.
# Only `query_id` is counted rather than every column.
collection_click_data = (
    click_data.groupby('collection')['query_id']
    .count()
    .reset_index(name='collection_click_count')
)
collection_click_data.head()


Unnamed: 0,collection,collection_click_count
0,3e5d88c58066b4182e2eb9e0e2206aaba3a46b193e4788...,2517
1,4c8006c7e513057a9138641abc2a9e65f4b014a8045259...,296997
2,6611f47aa127beb2b2fcb372eab3945274b5e742af07c2...,4599
3,a3791e8d85c005b0d9d60d6d3b7e8edd2f256a5cc928d0...,227540
4,b5beb2ed60981746e0b908797b3d3abb8491de15ff5575...,60073


In [64]:
# Left-join click counts onto impression counts; collections absent from the click
# table come through as NaN and are filled with 0.
collection_data = (
    collection_impression_data
    .merge(collection_click_data, how='left', on='collection')
    .fillna(0)
)
collection_data.head()


Unnamed: 0,collection,collection_impression_count,collection_click_count
0,3e5d88c58066b4182e2eb9e0e2206aaba3a46b193e4788...,18102,2517
1,4c8006c7e513057a9138641abc2a9e65f4b014a8045259...,2070924,296997
2,6611f47aa127beb2b2fcb372eab3945274b5e742af07c2...,31066,4599
3,a3791e8d85c005b0d9d60d6d3b7e8edd2f256a5cc928d0...,1574209,227540
4,b5beb2ed60981746e0b908797b3d3abb8491de15ff5575...,425039,60073


In [65]:
# Popularity score = clicks / impressions per collection.
# NOTE(review): clicks come from train only while impressions span train +
# validation — confirm this is intended.
collection_data["collection_popularity_score"] = collection_data[
    "collection_click_count"
].div(collection_data["collection_impression_count"])
collection_data.head()


Unnamed: 0,collection,collection_impression_count,collection_click_count,collection_popularity_score
0,3e5d88c58066b4182e2eb9e0e2206aaba3a46b193e4788...,18102,2517,0.139045
1,4c8006c7e513057a9138641abc2a9e65f4b014a8045259...,2070924,296997,0.143413
2,6611f47aa127beb2b2fcb372eab3945274b5e742af07c2...,31066,4599,0.14804
3,a3791e8d85c005b0d9d60d6d3b7e8edd2f256a5cc928d0...,1574209,227540,0.144542
4,b5beb2ed60981746e0b908797b3d3abb8491de15ff5575...,425039,60073,0.141335


In [66]:
# collection_data.to_csv("../preprocessed_data/collection_popularity_df.csv", index=False)
