In [1]:
# ! pip install fastparquet

In [2]:
import pandas as pd
from tqdm import tqdm
import swifter
import numpy as np
from glob import glob
import json
import gc


In [3]:
train = pd.read_parquet("../data_phase1/train.parquet")
val = pd.read_parquet("../data_phase1/validation.parquet")


In [4]:
train[["context_type"]] = train[["context_type"]].fillna(value="NA")
val[["context_type"]] = val[["context_type"]].fillna(value="NA")


In [5]:
train.head(2)

Unnamed: 0,query_id,user_id,session_id,product_id,page_type,previous_page_type,device_category,device_platform,user_tier,user_country,context_type,context_value,product_price,week,week_day,is_click
0,92d4dd491a874a2cf92c8d311a44a42b597c64a5ede23d...,e5e4c71b1b9456dafece1338762d4ee3db698cf32c384c...,2740b0d77b4e6fafd75321f7d0794210afa8bd650955e7...,bf056e3841dd3a358c6aacb1f9e74e4c7c4adc62e33b45...,596618814963e496d74434df8b8fe3306892f2e4ce6aaa...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,fec032cb05435471f2305006f4a1ba994c9d2f4bcad8ef...,designer_id,e5315dbea15a033bc6974a0bccf5fae4a017648bcd92ab...,0.000263,7,6,0
1,541a93bd95c3f4127a53e6b0d4b41db55ad9cb9e19d34a...,fca847f7eb5a5a21991421354b0f26afb4a517e540541f...,d5feab37634fd140e85b8f98dcb909a8779b4f0417c73a...,c6513ec49c8e04c265c907933799ff76f24c075c6308c4...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d179859aac8f7c1f88e1ee29b6655596873318c55127d3...,8c82855f15d05cd74fa59956434df17522fc68e4ce3900...,product_id,61205c20046f2688cb7ed03cad29d5a5dbdc360ff48290...,0.012966,6,5,0


In [6]:
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df


In [7]:
train = reduce_mem_usage(train)
val = reduce_mem_usage(val)


Memory usage of dataframe is 428.22 MB
Memory usage after optimization is: 175.85 MB
Decreased by 58.9%
Memory usage of dataframe is 78.64 MB
Memory usage after optimization is: 47.95 MB
Decreased by 39.0%


In [8]:
gc.collect()

521

In [9]:
train.shape, val.shape

((3507990, 16), (687192, 15))

In [10]:
train.memory_usage().sum()/(1024*1024*1024), val.memory_usage().sum()/(1024*1024*1024)

(0.1717264335602522, 0.04682622849941254)

In [11]:
full_df = pd.concat([train, val], 0)


In [12]:
full_df['days_elapsed'] = (full_df['week'].astype(int)-1)*7 + full_df['week_day'].astype(int)
full_df.head(2)

Unnamed: 0,query_id,user_id,session_id,product_id,page_type,previous_page_type,device_category,device_platform,user_tier,user_country,context_type,context_value,product_price,week,week_day,is_click,days_elapsed
0,92d4dd491a874a2cf92c8d311a44a42b597c64a5ede23d...,e5e4c71b1b9456dafece1338762d4ee3db698cf32c384c...,2740b0d77b4e6fafd75321f7d0794210afa8bd650955e7...,bf056e3841dd3a358c6aacb1f9e74e4c7c4adc62e33b45...,596618814963e496d74434df8b8fe3306892f2e4ce6aaa...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,fec032cb05435471f2305006f4a1ba994c9d2f4bcad8ef...,designer_id,e5315dbea15a033bc6974a0bccf5fae4a017648bcd92ab...,0.000263,7,6,0.0,48
1,541a93bd95c3f4127a53e6b0d4b41db55ad9cb9e19d34a...,fca847f7eb5a5a21991421354b0f26afb4a517e540541f...,d5feab37634fd140e85b8f98dcb909a8779b4f0417c73a...,c6513ec49c8e04c265c907933799ff76f24c075c6308c4...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d179859aac8f7c1f88e1ee29b6655596873318c55127d3...,8c82855f15d05cd74fa59956434df17522fc68e4ce3900...,product_id,61205c20046f2688cb7ed03cad29d5a5dbdc360ff48290...,0.012962,6,5,0.0,40


# Non-click

In [13]:
full_df = full_df[["query_id", "session_id", "user_id", "product_id", "user_country", "product_price"]]

In [14]:
attribute_df = pd.read_parquet("../data_phase1/attributes.parquet")


In [15]:
attribute_df = attribute_df[["product_id", "start_online_date"]]

In [16]:
train_df = train.merge(attribute_df, on=["product_id"], how="left")
train_df.head(2)


Unnamed: 0,query_id,user_id,session_id,product_id,page_type,previous_page_type,device_category,device_platform,user_tier,user_country,context_type,context_value,product_price,week,week_day,is_click,start_online_date
0,92d4dd491a874a2cf92c8d311a44a42b597c64a5ede23d...,e5e4c71b1b9456dafece1338762d4ee3db698cf32c384c...,2740b0d77b4e6fafd75321f7d0794210afa8bd650955e7...,bf056e3841dd3a358c6aacb1f9e74e4c7c4adc62e33b45...,596618814963e496d74434df8b8fe3306892f2e4ce6aaa...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,fec032cb05435471f2305006f4a1ba994c9d2f4bcad8ef...,designer_id,e5315dbea15a033bc6974a0bccf5fae4a017648bcd92ab...,0.000263,7,6,0,570.041667
1,541a93bd95c3f4127a53e6b0d4b41db55ad9cb9e19d34a...,fca847f7eb5a5a21991421354b0f26afb4a517e540541f...,d5feab37634fd140e85b8f98dcb909a8779b4f0417c73a...,c6513ec49c8e04c265c907933799ff76f24c075c6308c4...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d179859aac8f7c1f88e1ee29b6655596873318c55127d3...,8c82855f15d05cd74fa59956434df17522fc68e4ce3900...,product_id,61205c20046f2688cb7ed03cad29d5a5dbdc360ff48290...,0.012962,6,5,0,208.041667


In [17]:
full_df = full_df.merge(attribute_df, on=["product_id"], how="left")


In [18]:
full_df.head(2)

Unnamed: 0,query_id,session_id,user_id,product_id,user_country,product_price,start_online_date
0,92d4dd491a874a2cf92c8d311a44a42b597c64a5ede23d...,2740b0d77b4e6fafd75321f7d0794210afa8bd650955e7...,e5e4c71b1b9456dafece1338762d4ee3db698cf32c384c...,bf056e3841dd3a358c6aacb1f9e74e4c7c4adc62e33b45...,fec032cb05435471f2305006f4a1ba994c9d2f4bcad8ef...,0.000263,570.041667
1,541a93bd95c3f4127a53e6b0d4b41db55ad9cb9e19d34a...,d5feab37634fd140e85b8f98dcb909a8779b4f0417c73a...,fca847f7eb5a5a21991421354b0f26afb4a517e540541f...,c6513ec49c8e04c265c907933799ff76f24c075c6308c4...,8c82855f15d05cd74fa59956434df17522fc68e4ce3900...,0.012962,208.041667


In [19]:
user_mean_price = train_df.groupby(["user_id"]).agg({"product_price": "mean"}).reset_index()
user_mean_price = user_mean_price.rename(columns={"product_price": "user_mean_interacted_product_price"})

user_max_price = train_df.groupby(["user_id"]).agg({"product_price": "max"}).reset_index()
user_max_price = user_max_price.rename(columns={"product_price": "user_max_interacted_product_price"})
user_max_price = user_mean_price.merge(user_max_price, on=["user_id"], how="inner")

user_min_price = train_df.groupby(["user_id"]).agg({"product_price": "min"}).reset_index()
user_min_price = user_min_price.rename(columns={"product_price": "user_min_interacted_product_price"})
user_min_price = user_max_price.merge(user_min_price, on=["user_id"], how="inner")

user_min_price.head()


Unnamed: 0,user_id,user_mean_interacted_product_price,user_max_interacted_product_price,user_min_interacted_product_price
0,00001039003fd7fbc902f30fb6d78eaa5176f7291cba9a...,0.000251,0.000587,0.000118
1,000032873c3c3fbab4ebc172399290d56247cbc1ef692c...,0.000823,0.001089,0.000499
2,0000511b406188f830c5fbf3b3be20b6883ec872e418a1...,0.000453,0.000976,0.000257
3,00008e290e495a3980448ae65d3de9d8857ab945949cc0...,0.000482,0.001844,0.000133
4,0000b8fe24d60b23e0c70707e21eba6e3d28b45a85060b...,0.020767,0.033936,0.000704


In [20]:
user_mean_start_online_date = train_df.groupby(["user_id"]).agg({"start_online_date": "mean"}).reset_index()
user_mean_start_online_date = user_mean_start_online_date.rename(columns={"start_online_date": "user_country_mean_interacted_product_start_online_date"})

user_max_start_online_date = train_df.groupby(["user_id"]).agg({"start_online_date": "max"}).reset_index()
user_max_start_online_date = user_max_start_online_date.rename(columns={"start_online_date": "user_max_interacted_product_start_online_date"})
user_max_start_online_date = user_mean_start_online_date.merge(user_max_start_online_date, on=["user_id"], how="inner")

user_min_start_online_date = train_df.groupby(["user_id"]).agg({"start_online_date": "min"}).reset_index()
user_min_start_online_date = user_min_start_online_date.rename(columns={"start_online_date": "user_min_interacted_product_start_online_date"})
user_min_start_online_date = user_max_start_online_date.merge(user_min_start_online_date, on=["user_id"], how="inner")

user_min_start_online_date.head()


Unnamed: 0,user_id,user_country_mean_interacted_product_start_online_date,user_max_interacted_product_start_online_date,user_min_interacted_product_start_online_date
0,00001039003fd7fbc902f30fb6d78eaa5176f7291cba9a...,393.013889,521.0,286.041667
1,000032873c3c3fbab4ebc172399290d56247cbc1ef692c...,533.180556,1172.0,388.0
2,0000511b406188f830c5fbf3b3be20b6883ec872e418a1...,358.355903,617.041667,170.041667
3,00008e290e495a3980448ae65d3de9d8857ab945949cc0...,712.680556,1239.0,182.041667
4,0000b8fe24d60b23e0c70707e21eba6e3d28b45a85060b...,448.5,461.0,386.0


# Click

In [21]:
click_data = train[train["is_click"]==1]
click_data = click_data[["query_id", "session_id", "user_id", "product_id", "user_country", "product_price"]]
click_data = click_data.merge(attribute_df, on=["product_id"], how="left")
click_data.head(2)


Unnamed: 0,query_id,session_id,user_id,product_id,user_country,product_price,start_online_date
0,207454e43755700a650c30fa0b655226e4a6d4c3a6fde2...,ec2877a9ea2383289d46b7ad8a5208a59c8e67d3783edb...,c2a9051f311a1f5a116a5b990c01421db70e17ce7b1fbd...,49463bcf36dd614f33ddd3acd6312e79a538301d5fbbbc...,5d85c8b1e3dedfaa9db0995af1e2454251b27bcf0ac577...,0.007858,272.041667
1,aa277f6a6bb9138964f3b75b32cc9bc8a7695a2350a4ba...,e2431d5d6e15857a6d901be25fcd8ac733b3424609f1ee...,8c895f8be59c089f91fea0fce424e66189b973246fcb4a...,feddd77c4d8453645c8bb9bdc45d8e6f1abfda041db06d...,dda222570fc6cc7550be8e4b7e31202caaf2e8430831db...,0.000201,163.041667


In [22]:
user_mean_click_price = click_data.groupby(["user_id"]).agg({"product_price": "mean"}).reset_index()
user_mean_click_price = user_mean_click_price.rename(columns={"product_price": "user_mean_clicked_product_price"})

user_max_click_price = click_data.groupby(["user_id"]).agg({"product_price": "max"}).reset_index()
user_max_click_price = user_max_click_price.rename(columns={"product_price": "user_max_clicked_product_price"})
user_max_click_price = user_mean_click_price.merge(user_max_click_price, on=["user_id"], how="inner")

user_min_click_price = click_data.groupby(["user_id"]).agg({"product_price": "min"}).reset_index()
user_min_click_price = user_min_click_price.rename(columns={"product_price": "user_min_clicked_product_price"})
user_min_click_price = user_max_click_price.merge(user_min_click_price, on=["user_id"], how="inner")

user_min_click_price.head()


Unnamed: 0,user_id,user_mean_clicked_product_price,user_max_clicked_product_price,user_min_clicked_product_price
0,00001039003fd7fbc902f30fb6d78eaa5176f7291cba9a...,0.000186,0.000186,0.000186
1,000032873c3c3fbab4ebc172399290d56247cbc1ef692c...,0.000695,0.000822,0.000499
2,0000511b406188f830c5fbf3b3be20b6883ec872e418a1...,0.000414,0.000512,0.000262
3,00008e290e495a3980448ae65d3de9d8857ab945949cc0...,0.000133,0.000133,0.000133
4,0000b8fe24d60b23e0c70707e21eba6e3d28b45a85060b...,0.000704,0.000704,0.000704


In [23]:
user_mean_click_start_online_date = click_data.groupby(["user_id"]).agg({"start_online_date": "mean"}).reset_index()
user_mean_click_start_online_date = user_mean_click_start_online_date.rename(columns={"start_online_date": "user_mean_clicked_product_start_online_date"})

user_max_click_start_online_date = click_data.groupby(["user_id"]).agg({"start_online_date": "max"}).reset_index()
user_max_click_start_online_date = user_max_click_start_online_date.rename(columns={"start_online_date": "user_max_clicked_product_start_online_date"})
user_max_click_start_online_date = user_mean_click_start_online_date.merge(user_max_click_start_online_date, on=["user_id"], how="inner")

user_min_click_start_online_date = click_data.groupby(["user_id"]).agg({"start_online_date": "min"}).reset_index()
user_min_click_start_online_date = user_min_click_start_online_date.rename(columns={"start_online_date": "user_min_clicked_product_start_online_date"})
user_min_click_start_online_date = user_max_click_start_online_date.merge(user_min_click_start_online_date, on=["user_id"], how="inner")

user_min_click_start_online_date.head()


Unnamed: 0,user_id,user_mean_clicked_product_start_online_date,user_max_clicked_product_start_online_date,user_min_clicked_product_start_online_date
0,00001039003fd7fbc902f30fb6d78eaa5176f7291cba9a...,521.0,521.0,521.0
1,000032873c3c3fbab4ebc172399290d56247cbc1ef692c...,621.816667,1172.0,434.0
2,0000511b406188f830c5fbf3b3be20b6883ec872e418a1...,401.760417,436.0,302.041667
3,00008e290e495a3980448ae65d3de9d8857ab945949cc0...,182.041667,182.041667,182.041667
4,0000b8fe24d60b23e0c70707e21eba6e3d28b45a85060b...,386.0,386.0,386.0


In [24]:
merged_user_df = user_min_price.merge(user_min_start_online_date, on="user_id", how="left")
merged_user_df = merged_user_df.merge(user_min_click_price, on="user_id", how="left")
merged_user_df = merged_user_df.merge(user_min_click_start_online_date, on="user_id", how="left")
merged_user_df.head()


Unnamed: 0,user_id,user_mean_interacted_product_price,user_max_interacted_product_price,user_min_interacted_product_price,user_country_mean_interacted_product_start_online_date,user_max_interacted_product_start_online_date,user_min_interacted_product_start_online_date,user_mean_clicked_product_price,user_max_clicked_product_price,user_min_clicked_product_price,user_mean_clicked_product_start_online_date,user_max_clicked_product_start_online_date,user_min_clicked_product_start_online_date
0,00001039003fd7fbc902f30fb6d78eaa5176f7291cba9a...,0.000251,0.000587,0.000118,393.013889,521.0,286.041667,0.000186,0.000186,0.000186,521.0,521.0,521.0
1,000032873c3c3fbab4ebc172399290d56247cbc1ef692c...,0.000823,0.001089,0.000499,533.180556,1172.0,388.0,0.000695,0.000822,0.000499,621.816667,1172.0,434.0
2,0000511b406188f830c5fbf3b3be20b6883ec872e418a1...,0.000453,0.000976,0.000257,358.355903,617.041667,170.041667,0.000414,0.000512,0.000262,401.760417,436.0,302.041667
3,00008e290e495a3980448ae65d3de9d8857ab945949cc0...,0.000482,0.001844,0.000133,712.680556,1239.0,182.041667,0.000133,0.000133,0.000133,182.041667,182.041667,182.041667
4,0000b8fe24d60b23e0c70707e21eba6e3d28b45a85060b...,0.020767,0.033936,0.000704,448.5,461.0,386.0,0.000704,0.000704,0.000704,386.0,386.0,386.0


In [25]:
del attribute_df, click_data
gc.collect()

44

In [49]:
attribute_df = pd.read_parquet("../data_phase1/attributes.parquet")


In [27]:
click_data = train[train["is_click"]==1]
click_data = click_data.merge(attribute_df, on=["product_id"], how="left")
click_data.head(2)


Unnamed: 0,query_id,user_id,session_id,product_id,page_type,previous_page_type,device_category,device_platform,user_tier,user_country,...,season,collection,category_id_l1,category_id_l2,category_id_l3,brand_id,season_year,start_online_date,material_values,attribute_values
0,207454e43755700a650c30fa0b655226e4a6d4c3a6fde2...,c2a9051f311a1f5a116a5b990c01421db70e17ce7b1fbd...,ec2877a9ea2383289d46b7ad8a5208a59c8e67d3783edb...,49463bcf36dd614f33ddd3acd6312e79a538301d5fbbbc...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,5d85c8b1e3dedfaa9db0995af1e2454251b27bcf0ac577...,...,9db35d237f5873f0410d3ca18c07430270086eb1e7838d...,4c8006c7e513057a9138641abc2a9e65f4b014a8045259...,642878c18e40a1028b6610cd3396d4ee4fb6ad9623f5bb...,6812ef60b60c6444385c539f25f04d2fc2680c827a0686...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,1be792ca6909fa392bdaf78a115f657f9c4037ecd0f20f...,ef7d8d2e02aec8c328dafde95486f7181c37d07be3d167...,272.041667,f61ecea9b45f1590e57706b88207449bdd4cb703b917ad...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...
1,aa277f6a6bb9138964f3b75b32cc9bc8a7695a2350a4ba...,8c895f8be59c089f91fea0fce424e66189b973246fcb4a...,e2431d5d6e15857a6d901be25fcd8ac733b3424609f1ee...,feddd77c4d8453645c8bb9bdc45d8e6f1abfda041db06d...,596618814963e496d74434df8b8fe3306892f2e4ce6aaa...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,dda222570fc6cc7550be8e4b7e31202caaf2e8430831db...,...,9db35d237f5873f0410d3ca18c07430270086eb1e7838d...,b5beb2ed60981746e0b908797b3d3abb8491de15ff5575...,a691486f371de3a7ca37863dae5a8c46aee65f589518cf...,30dd625fb8429567fed311a055ae6c31e4f04d1375a843...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,de633bacd9aec5aeca3e21063e1557b0c3de6d7ea8921b...,ef7d8d2e02aec8c328dafde95486f7181c37d07be3d167...,163.041667,cf2fad948e2832e0f5de43bece1b5161b0a3f672968990...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...


In [28]:
clicked_users = click_data.user_id.unique().tolist()


In [48]:
del train_df
gc.collect()

168

In [33]:
dic = {}

In [35]:
from tqdm import tqdm

In [50]:
train_df = train.merge(attribute_df, on=["product_id"], how="left")
train_df.head(2)

Unnamed: 0,query_id,user_id,session_id,product_id,page_type,previous_page_type,device_category,device_platform,user_tier,user_country,...,season,collection,category_id_l1,category_id_l2,category_id_l3,brand_id,season_year,start_online_date,material_values,attribute_values
0,92d4dd491a874a2cf92c8d311a44a42b597c64a5ede23d...,e5e4c71b1b9456dafece1338762d4ee3db698cf32c384c...,2740b0d77b4e6fafd75321f7d0794210afa8bd650955e7...,bf056e3841dd3a358c6aacb1f9e74e4c7c4adc62e33b45...,596618814963e496d74434df8b8fe3306892f2e4ce6aaa...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,fec032cb05435471f2305006f4a1ba994c9d2f4bcad8ef...,...,9db35d237f5873f0410d3ca18c07430270086eb1e7838d...,4c8006c7e513057a9138641abc2a9e65f4b014a8045259...,642878c18e40a1028b6610cd3396d4ee4fb6ad9623f5bb...,a37fdccd28723fba01a2f0a68d3bb1a3b847e550a7b0c2...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,206b88b91a241cc6ed5af856ce3ddda4dccf91ae2ec798...,dc1194ba428d5cd4c49f8a769a6577ac1042162da38bc1...,570.041667,f61ecea9b45f1590e57706b88207449bdd4cb703b917ad...,28d2b1e7b0970e2c58966c47b4c7ab7426e43a8689007e...
1,541a93bd95c3f4127a53e6b0d4b41db55ad9cb9e19d34a...,fca847f7eb5a5a21991421354b0f26afb4a517e540541f...,d5feab37634fd140e85b8f98dcb909a8779b4f0417c73a...,c6513ec49c8e04c265c907933799ff76f24c075c6308c4...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d179859aac8f7c1f88e1ee29b6655596873318c55127d3...,8c82855f15d05cd74fa59956434df17522fc68e4ce3900...,...,847a067597e39838f1f85b0774f44e68b4d6e64d3ec4dd...,4c8006c7e513057a9138641abc2a9e65f4b014a8045259...,f6bafdefb3c04b392adc1d139ea55a565c8829627bb7d3...,884c59e2814e352d318a8447c2f32a0370955af64838e3...,393b930ad3c40c6af1c14ce8572f39a3a2027ef70dd21e...,de60eee3c057eab83a9987275b3ecfb080026c3297b036...,ef7d8d2e02aec8c328dafde95486f7181c37d07be3d167...,208.041667,c4aebf95aa0044c5230c104faa9ef2b5ccfab836b764d4...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...


In [51]:
val_df = val.merge(attribute_df, on=["product_id"], how="left")
val_df.head(2)

Unnamed: 0,query_id,user_id,session_id,product_id,page_type,previous_page_type,device_category,device_platform,user_tier,user_country,...,season,collection,category_id_l1,category_id_l2,category_id_l3,brand_id,season_year,start_online_date,material_values,attribute_values
0,45dbde6284f13d59b04c5d2a5ab2513c896ba8f7a7dde0...,4d66a7c430e1f1f7da454f4e8c4bf3e7cf2435741329c0...,b677570c68f211d9543e96fe46750b66ebdfaa0fb2df0e...,d52833e4925f40de987bb732847a8dbc07c2ba1e33711a...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,95d5a1bd42a07143383299c081524278a16ef5ce55507c...,...,9db35d237f5873f0410d3ca18c07430270086eb1e7838d...,a3791e8d85c005b0d9d60d6d3b7e8edd2f256a5cc928d0...,809e893b4176932824527f66afeaf4d4c8ce8973a0c372...,61c2b7dbe7b1763cba63d5d6c91b6abefbf210fc475d67...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,32d3c57850476db6e1183f358260cdf8b4c8ad06d1d09c...,ef7d8d2e02aec8c328dafde95486f7181c37d07be3d167...,276.041667,a0c383e8931a3908c348b1e7e3d7be3f85fd9d4f615eef...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...
1,6728f8b02604af603d65faa0a2a4e36307f2498c4bdd88...,9ff86157cc29dd17330d2e714318770884adf936794c66...,d1f1a8e13d81de690a968db95dd1960482143f59e6c69b...,ca58ed0a66cb8990221552d0d93de82713641b66465a86...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,c45f0825291fd1a087ef31abbcf9fd0ef10c915edf3041...,...,847a067597e39838f1f85b0774f44e68b4d6e64d3ec4dd...,4c8006c7e513057a9138641abc2a9e65f4b014a8045259...,f6bafdefb3c04b392adc1d139ea55a565c8829627bb7d3...,884c59e2814e352d318a8447c2f32a0370955af64838e3...,34a20ac15931fc3ddeda5472f41f0c2d46795ac35d8258...,899e117a3927d4276db3b5bc070a5ae5764b5aa7ab1730...,ef7d8d2e02aec8c328dafde95486f7181c37d07be3d167...,414.0,a18970e42a587074bf2c9fe47ad4a67859a54b58a59a34...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...


In [36]:
for user in tqdm(clicked_users):
    dd = click_data[click_data["user_id"]==user]
    temp_dic = dict(dd.groupby(["brand_id"])["query_id"].count())
    brand_keys = list(temp_dic.keys())
    brand_vals = list(temp_dic.values())
    sum_vals = sum(brand_vals)
    brand_vals = [(each/sum_vals)*100.0 for each in brand_vals]
    brand_counts = {brand_keys[i]: brand_vals[i] for i in range(len(brand_keys))}
    dic[user] = brand_counts
    

100%|██████████| 208393/208393 [05:29<00:00, 632.48it/s]


In [37]:
# json.dump(dic, open("../preprocessed_data/user_brand_popularity.json", "w"))

In [38]:
def get_feature_perc(feature):
    dic = {}
    
    for user in tqdm(clicked_users):
        dd = click_data[click_data["user_id"]==user]
        temp_dic = dict(dd.groupby([feature])["query_id"].count())
        keys = list(temp_dic.keys())
        vals = list(temp_dic.values())
        sum_vals = sum(vals)
        vals = [(each/sum_vals)*100.0 for each in vals]
        counts = {keys[i]: vals[i] for i in range(len(keys))}
        dic[user] = counts
        
    return dic
    

In [39]:
category_id_l1_dic = get_feature_perc("category_id_l1")


100%|██████████| 208393/208393 [05:27<00:00, 635.75it/s]


In [40]:
# json.dump(category_id_l1_dic, open("../preprocessed_data/user_category_id_l1_popularity.json", "w"))


In [41]:
category_id_l2_dic = get_feature_perc("category_id_l2")
json.dump(category_id_l2_dic, open("../preprocessed_data/user_category_id_l2_popularity.json", "w"))


100%|██████████| 208393/208393 [05:26<00:00, 637.87it/s]


In [42]:
category_id_l3_dic = get_feature_perc("category_id_l3")
json.dump(category_id_l3_dic, open("../preprocessed_data/user_category_id_l3_popularity.json", "w"))


100%|██████████| 208393/208393 [05:28<00:00, 634.44it/s]


In [43]:
season_dic = get_feature_perc("season")
json.dump(season_dic, open("../preprocessed_data/user_season_popularity.json", "w"))


100%|██████████| 208393/208393 [05:31<00:00, 628.33it/s]


In [56]:
def get_user_feature(row, col, dic):
    user_id = row["user_id"]
    field = row[col]
    try:
        return dic[user_id][field]
    except:
        return np.nan
    

In [57]:
user_brand_dic = json.load(open("../preprocessed_data/user_brand_popularity.json", "r"))
category_id_l1_dic = json.load(open("../preprocessed_data/user_category_id_l1_popularity.json", "r"))
category_id_l2_dic = json.load(open("../preprocessed_data/user_category_id_l2_popularity.json", "r"))
category_id_l3_dic = json.load(open("../preprocessed_data/user_category_id_l3_popularity.json", "r"))
season_dic = json.load(open("../preprocessed_data/user_season_popularity.json", "r"))


In [59]:
train_df["user_brand_click_percentage"] = train_df.swifter.apply(lambda x: get_user_feature(x, "brand_id", user_brand_dic), 1)
train_df["user_category_id_l1_click_percentage"] = train_df.swifter.apply(lambda x: get_user_feature(x, "category_id_l1", category_id_l1_dic), 1)
train_df["user_category_id_l2_click_percentage"] = train_df.swifter.apply(lambda x: get_user_feature(x, "category_id_l2", category_id_l2_dic), 1)
train_df["user_category_id_l3_click_percentage"] = train_df.swifter.apply(lambda x: get_user_feature(x, "category_id_l3", category_id_l3_dic), 1)
train_df["user_season_click_percentage"] = train_df.swifter.apply(lambda x: get_user_feature(x, "season", season_dic), 1)



HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=3507990.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=3507990.0), HTML(value='')))




In [65]:
train_df.head()

Unnamed: 0,query_id,user_id,session_id,product_id,page_type,previous_page_type,device_category,device_platform,user_tier,user_country,...,brand_id,season_year,start_online_date,material_values,attribute_values,user_brand_click_percentage,user_category_id_l1_click_percentage,user_category_id_l2_click_percentage,user_category_id_l3_click_percentage,user_season_click_percentage
0,92d4dd491a874a2cf92c8d311a44a42b597c64a5ede23d...,e5e4c71b1b9456dafece1338762d4ee3db698cf32c384c...,2740b0d77b4e6fafd75321f7d0794210afa8bd650955e7...,bf056e3841dd3a358c6aacb1f9e74e4c7c4adc62e33b45...,596618814963e496d74434df8b8fe3306892f2e4ce6aaa...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,fec032cb05435471f2305006f4a1ba994c9d2f4bcad8ef...,...,206b88b91a241cc6ed5af856ce3ddda4dccf91ae2ec798...,dc1194ba428d5cd4c49f8a769a6577ac1042162da38bc1...,570.041667,f61ecea9b45f1590e57706b88207449bdd4cb703b917ad...,28d2b1e7b0970e2c58966c47b4c7ab7426e43a8689007e...,,,,60.0,80.0
1,541a93bd95c3f4127a53e6b0d4b41db55ad9cb9e19d34a...,fca847f7eb5a5a21991421354b0f26afb4a517e540541f...,d5feab37634fd140e85b8f98dcb909a8779b4f0417c73a...,c6513ec49c8e04c265c907933799ff76f24c075c6308c4...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d179859aac8f7c1f88e1ee29b6655596873318c55127d3...,8c82855f15d05cd74fa59956434df17522fc68e4ce3900...,...,de60eee3c057eab83a9987275b3ecfb080026c3297b036...,ef7d8d2e02aec8c328dafde95486f7181c37d07be3d167...,208.041667,c4aebf95aa0044c5230c104faa9ef2b5ccfab836b764d4...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,6.153846,6.153846,6.153846,3.076923,26.153846
2,263ea1e38126fe0c7bfbff24a33b1a09f4dac4f8cd4bb4...,90ea15d8d96a9d3e7ab463d990e5f4565cac9477498d37...,18ae37a1a05faa7fd54818794a1a8e44073e00a56fa05f...,b4d5e28da10318aa7776b364528dc92f83ba45326018b5...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,c94ddcb9053eae77ab9abec807ef2b0bb4efac14404d17...,...,411722e5c0beee7f8ac585be38f638c122bdba8fa20dbd...,dc1194ba428d5cd4c49f8a769a6577ac1042162da38bc1...,549.041667,c81b4e145190b15103823171aa09cbd86066ee5d23e571...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,100.0,100.0,100.0,100.0,100.0
3,3727580d84ce2fbe42ff8bc6f732331f65ea659864a04c...,8f88d89f2a71e2adf42f885fa6adedd09bf039843b535c...,10667a5a6047aa173d13997cdcd996cbdaf9b0149f9655...,5a36f600d3c01763c28e2dafc53119fba7bcc6a867ab8f...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,8c82855f15d05cd74fa59956434df17522fc68e4ce3900...,...,a92725d8c84a2fcad12388752f2dae1ae68e349606a5ab...,dc1194ba428d5cd4c49f8a769a6577ac1042162da38bc1...,581.041667,f5e2532bc7601c84a18a89274a01ffd22abe5f48d3971d...,bbb5e6cc195bd2378e280404acfc40e93cca81b6eb21fb...,,100.0,33.333333,33.333333,83.333333
4,1fcf5d263785455311cecf2f864eaa2eeca4da488383d9...,6b21688d90dfd9677fa7979dfc6da5b963c0e1e3d68a38...,8b39f00bfc9d45ed6f64dd39a72548936a157f9f03e7f4...,314d20e9e9ec3e97d1867c8bf8c6feb0c23d918021e175...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,...,dff0b5cfe100e8e6ac094b91c852178fef9414cfec2169...,dc1194ba428d5cd4c49f8a769a6577ac1042162da38bc1...,624.041667,25ab556db3343d9dec7c2083eeb09cc3839bc7f67567f0...,b47aef240e92e5e272da1197f67988f84996ed71d88a64...,100.0,100.0,100.0,100.0,


In [62]:
val_df["user_brand_click_percentage"] = val_df.swifter.apply(lambda x: get_user_feature(x, "brand_id", user_brand_dic), 1)
val_df["user_category_id_l1_click_percentage"] = val_df.swifter.apply(lambda x: get_user_feature(x, "category_id_l1", category_id_l1_dic), 1)
val_df["user_category_id_l2_click_percentage"] = val_df.swifter.apply(lambda x: get_user_feature(x, "category_id_l2", category_id_l2_dic), 1)
val_df["user_category_id_l3_click_percentage"] = val_df.swifter.apply(lambda x: get_user_feature(x, "category_id_l3", category_id_l3_dic), 1)
val_df["user_season_click_percentage"] = val_df.swifter.apply(lambda x: get_user_feature(x, "season", season_dic), 1)



HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))




In [66]:
val_df.head()

Unnamed: 0,query_id,user_id,session_id,product_id,page_type,previous_page_type,device_category,device_platform,user_tier,user_country,...,brand_id,season_year,start_online_date,material_values,attribute_values,user_brand_click_percentage,user_category_id_l1_click_percentage,user_category_id_l2_click_percentage,user_category_id_l3_click_percentage,user_season_click_percentage
0,45dbde6284f13d59b04c5d2a5ab2513c896ba8f7a7dde0...,4d66a7c430e1f1f7da454f4e8c4bf3e7cf2435741329c0...,b677570c68f211d9543e96fe46750b66ebdfaa0fb2df0e...,d52833e4925f40de987bb732847a8dbc07c2ba1e33711a...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,95d5a1bd42a07143383299c081524278a16ef5ce55507c...,...,32d3c57850476db6e1183f358260cdf8b4c8ad06d1d09c...,ef7d8d2e02aec8c328dafde95486f7181c37d07be3d167...,276.041667,a0c383e8931a3908c348b1e7e3d7be3f85fd9d4f615eef...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,25.0,100.0,100.0,100.0,50.0
1,6728f8b02604af603d65faa0a2a4e36307f2498c4bdd88...,9ff86157cc29dd17330d2e714318770884adf936794c66...,d1f1a8e13d81de690a968db95dd1960482143f59e6c69b...,ca58ed0a66cb8990221552d0d93de82713641b66465a86...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,c45f0825291fd1a087ef31abbcf9fd0ef10c915edf3041...,...,899e117a3927d4276db3b5bc070a5ae5764b5aa7ab1730...,ef7d8d2e02aec8c328dafde95486f7181c37d07be3d167...,414.0,a18970e42a587074bf2c9fe47ad4a67859a54b58a59a34...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,,,,,
2,9d1d782fab18c98c8a8d4dab9cbc0cb3f786b5d5e5fd24...,b67c2b47e18777596747b94d0ae3e4a9f023a406fa5b5e...,d572b385d62d7eaaa4872e07ed2771b107d4db0b547212...,6374ef1b29e3046f9ae1607cd62cde1eb6305d4d9b1de5...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,14fc80e2d6821260d291ff47ce6d8d7534d4cb5aa4ab0d...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,58f6d97120664752ed0851301aa78457fe882f67453c58...,...,2f85e4e89f3d731a3fcc43c1ea068600dc082d9b2656de...,ef7d8d2e02aec8c328dafde95486f7181c37d07be3d167...,168.041667,4632872aedf88d5e81361e0389833dd1be3bcafc7e6b58...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,,100.0,100.0,100.0,
3,f23d6751b37c235047a64a20ffe732483f487743dc8812...,8e28240e0b0c5629959d76727906afa17ea5d89821cbac...,c5df0c490099d1e4a20173ecbb67880998bc4faf8210a9...,e74667d17676e39561716943bc7cf8ba8a94ce96a0dc41...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,f319f71fd9b14532cf715e601fe3b5c2473b58d4302e88...,...,7952ba3d185b86fddbfd84565c3da161037bab14f9f4ed...,ef7d8d2e02aec8c328dafde95486f7181c37d07be3d167...,167.041667,d04d6e357e7b5a68cad68b16bfb9bc2bb3948177d2b831...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,100.0,100.0,100.0,100.0,100.0
4,fe162740ab90f69597b14253e6f0fc6fb87ad8e2146cd8...,2173a7fa04d32abcb18cf8652e427f13a21156f7ff0bf5...,1e5f3aaf57b4863421d5019fe834f5e6979876f3d44e43...,a85298cf45012b8bfe8e5ec39108361e6083cad362dff3...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,...,2e7413adf3b546c7194ab0bd0daeee2bc9ce4f56a2b3d0...,ef7d8d2e02aec8c328dafde95486f7181c37d07be3d167...,205.041667,7dd5eeb18fabe79b8c9640f6648d1156337a690e48e0b5...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,,100.0,60.0,40.0,100.0


In [67]:
train_save = train_df[["query_id", "user_id", "session_id", "product_id", "user_brand_click_percentage", "user_category_id_l1_click_percentage", "user_category_id_l2_click_percentage", "user_category_id_l3_click_percentage", "user_season_click_percentage"]]
train_save.to_csv("../preprocessed_data/user_click_percentage_features.csv", index=False)



In [68]:
val_save = val_df[["query_id", "user_id", "session_id", "product_id", "user_brand_click_percentage", "user_category_id_l1_click_percentage", "user_category_id_l2_click_percentage", "user_category_id_l3_click_percentage", "user_season_click_percentage"]]
val_save.to_csv("../preprocessed_data/user_click_percentage_features.csv", index=False)



In [48]:
# merged_user_df.to_csv("../preprocessed_data/user_features.csv", index=False)
