In [1]:
# ! pip install fastparquet

In [2]:
import pandas as pd
from tqdm import tqdm
import swifter
import numpy as np
from glob import glob
import json
import gc


In [3]:
# Load the phase-1 training and validation splits from parquet.
# The paths are relative, so they resolve against the kernel's working directory.
train = pd.read_parquet("../data_phase1/train.parquet")
val = pd.read_parquet("../data_phase1/validation.parquet")


In [4]:
# Replace missing context_type values with the literal string "NA" in both splits.
train["context_type"] = train["context_type"].fillna("NA")
val["context_type"] = val["context_type"].fillna("NA")


In [5]:
# Preview the first two training rows to inspect the available columns.
train.head(2)

Unnamed: 0,query_id,user_id,session_id,product_id,page_type,previous_page_type,device_category,device_platform,user_tier,user_country,context_type,context_value,product_price,week,week_day,is_click
0,92d4dd491a874a2cf92c8d311a44a42b597c64a5ede23d...,e5e4c71b1b9456dafece1338762d4ee3db698cf32c384c...,2740b0d77b4e6fafd75321f7d0794210afa8bd650955e7...,bf056e3841dd3a358c6aacb1f9e74e4c7c4adc62e33b45...,596618814963e496d74434df8b8fe3306892f2e4ce6aaa...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,fec032cb05435471f2305006f4a1ba994c9d2f4bcad8ef...,designer_id,e5315dbea15a033bc6974a0bccf5fae4a017648bcd92ab...,0.000263,7,6,0
1,541a93bd95c3f4127a53e6b0d4b41db55ad9cb9e19d34a...,fca847f7eb5a5a21991421354b0f26afb4a517e540541f...,d5feab37634fd140e85b8f98dcb909a8779b4f0417c73a...,c6513ec49c8e04c265c907933799ff76f24c075c6308c4...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d179859aac8f7c1f88e1ee29b6655596873318c55127d3...,8c82855f15d05cd74fa59956434df17522fc68e4ce3900...,product_id,61205c20046f2688cb7ed03cad29d5a5dbdc360ff48290...,0.012966,6,5,0


In [6]:
def reduce_mem_usage(df, use_float16=True):
    """Shrink a DataFrame's memory footprint by downcasting its columns in place.

    Column handling:
      * object / pandas string columns are converted to ``category``;
      * signed and unsigned integer columns are narrowed to the smallest
        integer type of the same signedness whose range contains the
        column's min and max (bounds are inclusive);
      * float columns are narrowed to float16 or float32 when their range fits;
      * every other dtype (bool, datetime, category, nullable extension
        dtypes, ...) is left untouched, which also makes the function safe to
        call again on its own output.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.
    use_float16 : bool, default True
        Allow float columns to be stored as float16. float16 keeps only about
        three significant decimal digits, so pass ``False`` when the values
        feed precision-sensitive computations.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, after conversion.

    Notes
    -----
    Prints the memory usage before and after the conversion and the relative
    saving. Only the value range is checked for floats; precision loss is not.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate targets, ordered from narrowest to widest.
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = ((np.float16,) if use_float16 else ()) + (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        # Text columns become categoricals.
        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy integer / float columns are downcast. Anything else
        # (bool, datetime, category, nullable dtypes) has no meaningful
        # iinfo/finfo range check and is deliberately left as it is.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        if col_type.kind == 'f':
            candidates, limits_of = float_types, np.finfo
        elif col_type.kind == 'i':
            candidates, limits_of = signed_types, np.iinfo
        else:
            candidates, limits_of = unsigned_types, np.iinfo

        c_min = df[col].min()
        c_max = df[col].max()

        target = col_type
        for candidate in candidates:
            # Never widen: stop once the candidates are no smaller than the
            # column's current dtype.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            limits = limits_of(candidate)
            # NaN min/max (empty or all-NaN column) fails both comparisons, so
            # such a column keeps its dtype.
            if c_min >= limits.min and c_max <= limits.max:
                target = candidate
                break

        if target != col_type:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero-byte frame to avoid ZeroDivisionError.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df


In [7]:
# Downcast both splits. reduce_mem_usage converts the columns of the frame it
# receives and returns that same frame, so the re-assignment is a convenience.
train = reduce_mem_usage(train)
val = reduce_mem_usage(val)


Memory usage of dataframe is 428.22 MB
Memory usage after optimization is: 175.85 MB
Decreased by 58.9%
Memory usage of dataframe is 78.64 MB
Memory usage after optimization is: 47.95 MB
Decreased by 39.0%


In [8]:
# Force a garbage-collection pass after the dtype conversions; the displayed
# integer is gc.collect()'s return value.
gc.collect()

521

In [9]:
# (rows, columns) of each split.
# NOTE(review): the recorded output shows validation with one column fewer than
# train — presumably the is_click label is absent there; confirm.
train.shape, val.shape

((3507990, 16), (687192, 15))

In [10]:
# Memory footprint of each split in GiB (bytes / 1024**3).
# memory_usage() is called without deep=True here.
train.memory_usage().sum()/(1024*1024*1024), val.memory_usage().sum()/(1024*1024*1024)

(0.1717264335602522, 0.04682622849941254)

In [11]:
# Stack train on top of validation row-wise. `axis` is passed by keyword:
# pandas 2.0 made every concat argument other than `objs` keyword-only, so the
# positional form `pd.concat([...], 0)` raises TypeError there.
full_df = pd.concat([train, val], axis=0)


In [12]:
# Running day counter across weeks: (week - 1) * 7 + week_day
# (e.g. week 7, week_day 6 -> 48, as in the first displayed row).
# NOTE(review): the casts to int presumably guard against overflow in the
# narrow integer dtypes produced by reduce_mem_usage — confirm.
# NOTE(review): the column selection in the "Non-click" section does not keep
# days_elapsed, so this column is not used by the rest of the notebook.
full_df['days_elapsed'] = (full_df['week'].astype(int)-1)*7 + full_df['week_day'].astype(int)
full_df.head(2)

Unnamed: 0,query_id,user_id,session_id,product_id,page_type,previous_page_type,device_category,device_platform,user_tier,user_country,context_type,context_value,product_price,week,week_day,is_click,days_elapsed
0,92d4dd491a874a2cf92c8d311a44a42b597c64a5ede23d...,e5e4c71b1b9456dafece1338762d4ee3db698cf32c384c...,2740b0d77b4e6fafd75321f7d0794210afa8bd650955e7...,bf056e3841dd3a358c6aacb1f9e74e4c7c4adc62e33b45...,596618814963e496d74434df8b8fe3306892f2e4ce6aaa...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,fec032cb05435471f2305006f4a1ba994c9d2f4bcad8ef...,designer_id,e5315dbea15a033bc6974a0bccf5fae4a017648bcd92ab...,0.000263,7,6,0.0,48
1,541a93bd95c3f4127a53e6b0d4b41db55ad9cb9e19d34a...,fca847f7eb5a5a21991421354b0f26afb4a517e540541f...,d5feab37634fd140e85b8f98dcb909a8779b4f0417c73a...,c6513ec49c8e04c265c907933799ff76f24c075c6308c4...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d179859aac8f7c1f88e1ee29b6655596873318c55127d3...,8c82855f15d05cd74fa59956434df17522fc68e4ce3900...,product_id,61205c20046f2688cb7ed03cad29d5a5dbdc360ff48290...,0.012962,6,5,0.0,40


# Non-click: statistics over all impressions (train + validation, not filtered by click)

In [13]:
# Keep only the identifier, country and price columns used by the country-level
# aggregates below; every other column of full_df is dropped here.
full_df = full_df[["query_id", "session_id", "user_id", "product_id", "user_country", "product_price"]]

In [14]:
# Load the product attribute table (path relative to the working directory).
attribute_df = pd.read_parquet("../data_phase1/attributes.parquet")


In [15]:
# Only the join key and start_online_date are needed from the attribute table.
attribute_df = attribute_df[["product_id", "start_online_date"]]

In [16]:
# Attach start_online_date to every impression; the left join keeps rows whose
# product has no attribute record (start_online_date is NaN for those).
# NOTE(review): assumes product_id is unique in attribute_df — otherwise this
# merge duplicates impression rows; consider validate="many_to_one".
full_df = full_df.merge(attribute_df, on=["product_id"], how="left")


In [17]:
# Check that start_online_date was attached by the merge.
full_df.head(2)

Unnamed: 0,query_id,session_id,user_id,product_id,user_country,product_price,start_online_date
0,92d4dd491a874a2cf92c8d311a44a42b597c64a5ede23d...,2740b0d77b4e6fafd75321f7d0794210afa8bd650955e7...,e5e4c71b1b9456dafece1338762d4ee3db698cf32c384c...,bf056e3841dd3a358c6aacb1f9e74e4c7c4adc62e33b45...,fec032cb05435471f2305006f4a1ba994c9d2f4bcad8ef...,0.000263,570.041667
1,541a93bd95c3f4127a53e6b0d4b41db55ad9cb9e19d34a...,d5feab37634fd140e85b8f98dcb909a8779b4f0417c73a...,fca847f7eb5a5a21991421354b0f26afb4a517e540541f...,c6513ec49c8e04c265c907933799ff76f24c075c6308c4...,8c82855f15d05cd74fa59956434df17522fc68e4ce3900...,0.012962,208.041667


In [18]:
# Per-country mean / max / min of product_price over every row of full_df,
# computed in one groupby pass with named aggregation (rather than three
# separate groupbys joined back together on user_country).
user_country_price_stats = (
    full_df.groupby(["user_country"])
    .agg(
        user_country_mean_product_price=("product_price", "mean"),
        user_country_max_product_price=("product_price", "max"),
        user_country_min_product_price=("product_price", "min"),
    )
    .reset_index()
)

# These names are kept for compatibility with other cells. Note that they are
# cumulative: *_max_price holds mean + max, and *_min_price holds all three
# statistics (it is the table consumed by the final merge).
user_country_mean_price = user_country_price_stats[
    ["user_country", "user_country_mean_product_price"]
]
user_country_max_price = user_country_price_stats[
    ["user_country", "user_country_mean_product_price", "user_country_max_product_price"]
]
user_country_min_price = user_country_price_stats

user_country_min_price.head()


Unnamed: 0,user_country,user_country_mean_product_price,user_country_max_product_price,user_country_min_product_price
0,054a9a12cd3ad95f66a38ee890f283991e7ade2dbb82e5...,0.000864,0.071655,1e-06
1,062ecc316e190bbd486e5800e1b9a2b5ae6a2d392df5ec...,0.000948,0.006432,4.8e-05
2,0830c9cead2f7ace2b8d7e6bb6a24b5e892efe82c48cf1...,0.002462,0.87793,1.1e-05
3,088017151fe1ac5b2b556a69455437dcedbe8f2250119f...,0.000808,0.03598,1.9e-05
4,089593af68cf96a7b3c0dd4c98426e0fa5b379fee1345b...,0.000593,0.01593,2.4e-05


In [19]:
# Per-country mean / max / min of start_online_date over every row of full_df,
# computed in one groupby pass with named aggregation (rather than three
# separate groupbys joined back together on user_country).
user_country_start_online_date_stats = (
    full_df.groupby(["user_country"])
    .agg(
        user_country_mean_product_start_online_date=("start_online_date", "mean"),
        user_country_max_product_start_online_date=("start_online_date", "max"),
        user_country_min_product_start_online_date=("start_online_date", "min"),
    )
    .reset_index()
)

# These names are kept for compatibility with other cells. Note that they are
# cumulative: *_max_* holds mean + max, and *_min_* holds all three statistics
# (it is the table consumed by the final merge).
user_country_mean_start_online_date = user_country_start_online_date_stats[
    ["user_country", "user_country_mean_product_start_online_date"]
]
user_country_max_start_online_date = user_country_start_online_date_stats[
    [
        "user_country",
        "user_country_mean_product_start_online_date",
        "user_country_max_product_start_online_date",
    ]
]
user_country_min_start_online_date = user_country_start_online_date_stats

user_country_min_start_online_date.head()


Unnamed: 0,user_country,user_country_mean_product_start_online_date,user_country_max_product_start_online_date,user_country_min_product_start_online_date
0,054a9a12cd3ad95f66a38ee890f283991e7ade2dbb82e5...,421.0286,2144.041667,132.0
1,062ecc316e190bbd486e5800e1b9a2b5ae6a2d392df5ec...,402.009528,1672.041667,157.0
2,0830c9cead2f7ace2b8d7e6bb6a24b5e892efe82c48cf1...,397.212573,3214.041667,115.0
3,088017151fe1ac5b2b556a69455437dcedbe8f2250119f...,414.069756,1875.0,126.0
4,089593af68cf96a7b3c0dd4c98426e0fa5b379fee1345b...,528.336655,1410.041667,141.0


# Click: statistics over clicked training rows only (is_click == 1)

In [20]:
# Clicked impressions from the training split, restricted to the columns used
# for the country-level aggregates and enriched with start_online_date.
click_columns = ["query_id", "session_id", "user_id", "product_id", "user_country", "product_price"]
clicked_rows = train["is_click"] == 1
click_data = train.loc[clicked_rows, click_columns].merge(
    attribute_df, on=["product_id"], how="left"
)
click_data.head(2)


Unnamed: 0,query_id,session_id,user_id,product_id,user_country,product_price,start_online_date
0,207454e43755700a650c30fa0b655226e4a6d4c3a6fde2...,ec2877a9ea2383289d46b7ad8a5208a59c8e67d3783edb...,c2a9051f311a1f5a116a5b990c01421db70e17ce7b1fbd...,49463bcf36dd614f33ddd3acd6312e79a538301d5fbbbc...,5d85c8b1e3dedfaa9db0995af1e2454251b27bcf0ac577...,0.007858,272.041667
1,aa277f6a6bb9138964f3b75b32cc9bc8a7695a2350a4ba...,e2431d5d6e15857a6d901be25fcd8ac733b3424609f1ee...,8c895f8be59c089f91fea0fce424e66189b973246fcb4a...,feddd77c4d8453645c8bb9bdc45d8e6f1abfda041db06d...,dda222570fc6cc7550be8e4b7e31202caaf2e8430831db...,0.000201,163.041667


In [21]:
# Per-country mean / max / min of product_price over the clicked rows in
# click_data, computed in one groupby pass with named aggregation (rather than
# three separate groupbys joined back together on user_country).
user_country_click_price_stats = (
    click_data.groupby(["user_country"])
    .agg(
        user_country_mean_click_product_price=("product_price", "mean"),
        user_country_max_click_product_price=("product_price", "max"),
        user_country_min_click_product_price=("product_price", "min"),
    )
    .reset_index()
)

# These names are kept for compatibility with other cells. Note that they are
# cumulative: *_max_click_price holds mean + max, and *_min_click_price holds
# all three statistics (it is the table consumed by the final merge).
user_country_mean_click_price = user_country_click_price_stats[
    ["user_country", "user_country_mean_click_product_price"]
]
user_country_max_click_price = user_country_click_price_stats[
    [
        "user_country",
        "user_country_mean_click_product_price",
        "user_country_max_click_product_price",
    ]
]
user_country_min_click_price = user_country_click_price_stats

user_country_min_click_price.head()


Unnamed: 0,user_country,user_country_mean_click_product_price,user_country_max_click_product_price,user_country_min_click_product_price
0,054a9a12cd3ad95f66a38ee890f283991e7ade2dbb82e5...,0.000984,0.025894,6.6e-05
1,062ecc316e190bbd486e5800e1b9a2b5ae6a2d392df5ec...,0.000932,0.00359,9.9e-05
2,0830c9cead2f7ace2b8d7e6bb6a24b5e892efe82c48cf1...,0.002764,0.867188,1.6e-05
3,088017151fe1ac5b2b556a69455437dcedbe8f2250119f...,0.000893,0.03598,2.3e-05
4,089593af68cf96a7b3c0dd4c98426e0fa5b379fee1345b...,0.000434,0.001246,5.3e-05


In [22]:
# Per-country mean / max / min of start_online_date over the clicked rows in
# click_data, computed in one groupby pass with named aggregation (rather than
# three separate groupbys joined back together on user_country).
user_country_click_start_online_date_stats = (
    click_data.groupby(["user_country"])
    .agg(
        user_country_mean_click_product_start_online_date=("start_online_date", "mean"),
        user_country_max_click_product_start_online_date=("start_online_date", "max"),
        user_country_min_click_product_start_online_date=("start_online_date", "min"),
    )
    .reset_index()
)

# These names are kept for compatibility with other cells. Note that they are
# cumulative: *_max_* holds mean + max, and *_min_* holds all three statistics
# (it is the table consumed by the final merge).
user_country_mean_click_start_online_date = user_country_click_start_online_date_stats[
    ["user_country", "user_country_mean_click_product_start_online_date"]
]
user_country_max_click_start_online_date = user_country_click_start_online_date_stats[
    [
        "user_country",
        "user_country_mean_click_product_start_online_date",
        "user_country_max_click_product_start_online_date",
    ]
]
user_country_min_click_start_online_date = user_country_click_start_online_date_stats

user_country_min_click_start_online_date.head()


Unnamed: 0,user_country,user_country_mean_click_product_start_online_date,user_country_max_click_product_start_online_date,user_country_min_click_product_start_online_date
0,054a9a12cd3ad95f66a38ee890f283991e7ade2dbb82e5...,411.068783,1508.0,135.0
1,062ecc316e190bbd486e5800e1b9a2b5ae6a2d392df5ec...,440.179762,1310.041667,161.041667
2,0830c9cead2f7ace2b8d7e6bb6a24b5e892efe82c48cf1...,397.004433,2822.041667,124.0
3,088017151fe1ac5b2b556a69455437dcedbe8f2250119f...,409.816129,1680.041667,135.0
4,089593af68cf96a7b3c0dd4c98426e0fa5b379fee1345b...,539.275,1388.041667,149.0


In [23]:
# Assemble the final per-country feature table: start from the all-impression
# price statistics and left-join the three remaining statistic tables on
# user_country, so countries missing from a table keep NaN in its columns.
merged_user_country_df = (
    user_country_min_price
    .merge(user_country_min_start_online_date, on="user_country", how="left")
    .merge(user_country_min_click_price, on="user_country", how="left")
    .merge(user_country_min_click_start_online_date, on="user_country", how="left")
)
merged_user_country_df.head()


Unnamed: 0,user_country,user_country_mean_product_price,user_country_max_product_price,user_country_min_product_price,user_country_mean_product_start_online_date,user_country_max_product_start_online_date,user_country_min_product_start_online_date,user_country_mean_click_product_price,user_country_max_click_product_price,user_country_min_click_product_price,user_country_mean_click_product_start_online_date,user_country_max_click_product_start_online_date,user_country_min_click_product_start_online_date
0,054a9a12cd3ad95f66a38ee890f283991e7ade2dbb82e5...,0.000864,0.071655,1e-06,421.0286,2144.041667,132.0,0.000984,0.025894,6.6e-05,411.068783,1508.0,135.0
1,062ecc316e190bbd486e5800e1b9a2b5ae6a2d392df5ec...,0.000948,0.006432,4.8e-05,402.009528,1672.041667,157.0,0.000932,0.00359,9.9e-05,440.179762,1310.041667,161.041667
2,0830c9cead2f7ace2b8d7e6bb6a24b5e892efe82c48cf1...,0.002462,0.87793,1.1e-05,397.212573,3214.041667,115.0,0.002764,0.867188,1.6e-05,397.004433,2822.041667,124.0
3,088017151fe1ac5b2b556a69455437dcedbe8f2250119f...,0.000808,0.03598,1.9e-05,414.069756,1875.0,126.0,0.000893,0.03598,2.3e-05,409.816129,1680.041667,135.0
4,089593af68cf96a7b3c0dd4c98426e0fa5b379fee1345b...,0.000593,0.01593,2.4e-05,528.336655,1410.041667,141.0,0.000434,0.001246,5.3e-05,539.275,1388.041667,149.0


In [24]:
# merged_user_country_df.to_csv("../preprocessed_data/user_country_features.csv", index=False)
