In [1]:
# ! pip install fastparquet --user
# ! pip install ordered-set
# ! pip install swifter --user

In [2]:
import pandas as pd
import json
import swifter
import numpy as np
import gc



In [3]:
train = pd.read_parquet("../data_phase1/train.parquet")
val = pd.read_parquet("../data_phase1/validation.parquet")


In [4]:
train = train.fillna(value={"context_type": "NA"})
val = val.fillna(value={"context_type": "NA"})


In [5]:
train.describe(include='O')

Unnamed: 0,query_id,user_id,session_id,product_id,page_type,previous_page_type,device_category,device_platform,user_tier,user_country,context_type,context_value
count,3507990,3507990,3507990,3507990,3507990,3507990,3507990,3507990,3507990,3507990,3507990,3507990
unique,584665,208393,317426,408263,5,23,3,2,6,196,5,189571
top,d82a97bd154ecf2314b23398516b8cf92d1a511fbfa394...,cc83479dd22e19ec45d08805a61b73d7f33a69feaf42be...,f3de26eced2c81d1b0d6da40c11c9f987fe066b5a4f4fd...,55e1495c40504b4b15a358f95e2cbede34d011b287c32b...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...
freq,6,4428,354,1728,3105120,2053848,1853280,1927596,2714292,520824,3105204,152514


In [6]:
val.describe(include='O')

Unnamed: 0,query_id,user_id,session_id,product_id,page_type,previous_page_type,device_category,device_platform,user_tier,user_country,context_type,context_value
count,687192,687192,687192,687192,687192,687192,687192,687192,687192,687192,687192,687192
unique,114532,114532,114532,218525,5,22,3,2,6,188,5,66955
top,828a60abc3ef5fc6a845887b4f0f3dac34cc03f20b11bd...,37fa61f72b0b9c718809c513aade75ca4af96015dea1fd...,4abac0dd20bf1e0da03e35f403c5cf7611342714875527...,fe13af44356050cdc93ad3d5e458e24c5077e5bf7a4c12...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...
freq,6,6,6,398,666960,513864,411180,423654,605262,102696,666990,15342


In [7]:
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df


In [8]:
train = reduce_mem_usage(train)
val = reduce_mem_usage(val)


Memory usage of dataframe is 428.22 MB
Memory usage after optimization is: 175.85 MB
Decreased by 58.9%
Memory usage of dataframe is 78.64 MB
Memory usage after optimization is: 47.95 MB
Decreased by 39.0%


In [9]:
gc.collect()

22

In [10]:
click_data = train[train["is_click"]==1]

In [11]:
mean_click_price = click_data["product_price"].mean()
mean_click_price

0.001788

In [12]:
attribute_df = pd.read_parquet("../data_phase1/attributes.parquet")

In [13]:
attribute_df = attribute_df[["product_id", "start_online_date"]]
attribute_df.head()

Unnamed: 0,product_id,start_online_date
0,0013f07ccdf212210c110e63f0de46e37669c17a4d855a...,1067.041667
1,002239cd57f19f22e557030dff363dfbd1344d8f7ac829...,215.041667
2,0028022e1ecbbf92f03a1edb9accb58e7c682e7cd89897...,519.0
3,00433de93d9cb6b08584423a6b54306abacef89fbddffe...,202.041667
4,009623ea17e53324f8f5a3f45f5b21b9a885ea2765de82...,211.041667


In [14]:
attribute_df.product_id.nunique(), attribute_df.shape[0]

(443150, 443150)

In [15]:
click_data = click_data.merge(attribute_df, on=["product_id"], how="left")
click_data.shape

(602158, 17)

In [16]:
mean_click_start_online_date = click_data["start_online_date"].mean()
mean_click_start_online_date

414.747518851918

In [19]:
dic = {"mean_click_price": float(mean_click_price), "mean_click_start_online_date": float(mean_click_start_online_date)}


In [20]:
json.dump(dic, open("../preprocessed_data/average_click_values.json", "w"))