In [1]:
# One-time environment setup: uncomment and run if these packages are missing.
# ! pip install fastparquet --user
# ! pip install ordered-set

In [1]:
import pandas as pd
import numpy as np
import gc
import swifter
from tqdm import tqdm
from collections import defaultdict
from ordered_set import OrderedSet
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MinMaxScaler, StandardScaler


In [2]:
# Locations of the phase-1 training and validation splits.
train_path = "../data_phase1/train.parquet"
val_path = "../data_phase1/validation.parquet"

# Read both splits into memory as DataFrames.
train = pd.read_parquet(train_path)
val = pd.read_parquet(val_path)


In [3]:
# Give missing context types the explicit label "NA" in both splits so the
# column has no nulls afterwards.
context_type_fill = {"context_type": "NA"}
train = train.fillna(value=context_type_fill)
val = val.fillna(value=context_type_fill)


In [4]:
# Summarise the object-dtype columns of the training split
# (count / unique / top / freq per column).
train.describe(include='O')

Unnamed: 0,query_id,user_id,session_id,product_id,page_type,previous_page_type,device_category,device_platform,user_tier,user_country,context_type,context_value
count,3507990,3507990,3507990,3507990,3507990,3507990,3507990,3507990,3507990,3507990,3507990,3507990
unique,584665,208393,317426,408263,5,23,3,2,6,196,5,189571
top,0fdb2cc16bbc70b19a364c0862c67431c8ff04ec39b574...,cc83479dd22e19ec45d08805a61b73d7f33a69feaf42be...,f3de26eced2c81d1b0d6da40c11c9f987fe066b5a4f4fd...,55e1495c40504b4b15a358f95e2cbede34d011b287c32b...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...
freq,6,4428,354,1728,3105120,2053848,1853280,1927596,2714292,520824,3105204,152514


In [5]:
# Same object-column summary for the validation split, for comparison
# against the training split.
val.describe(include='O')

Unnamed: 0,query_id,user_id,session_id,product_id,page_type,previous_page_type,device_category,device_platform,user_tier,user_country,context_type,context_value
count,687192,687192,687192,687192,687192,687192,687192,687192,687192,687192,687192,687192
unique,114532,114532,114532,218525,5,22,3,2,6,188,5,66955
top,dfe61b5ed57e5f2e163dbe87af62bc93fb8c0c7dc28eed...,17e753a0b6e62f0c52b4dc1b65d4df96828c0ec4e9a651...,b67ba53e99cb968d15588a33750b1fffcc37023eafbeeb...,fe13af44356050cdc93ad3d5e458e24c5077e5bf7a4c12...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...
freq,6,6,6,398,666960,513864,411180,423654,605262,102696,666990,15342


In [6]:
def reduce_mem_usage(df, use_float16=True):
    """Shrink a DataFrame's memory footprint by narrowing column dtypes in place.

    * Signed and unsigned integer columns are cast to the narrowest integer
      type of the same signedness that can hold their observed min/max.
    * Float columns are cast to float16 / float32 when their observed range
      fits.
    * Object (and pandas string) columns are converted to ``category``.
    * Every other dtype (bool, datetime, timedelta, category, nullable
      extension types, ...) is left untouched.  This also makes the function
      safe to call again on a frame it has already processed.

    A column is never widened: candidates that are not strictly smaller than
    the current dtype are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise.  It is modified in place.
    use_float16 : bool, default True
        When True (the historical behaviour) floats whose range fits are
        stored as float16.  float16 carries only about three significant
        decimal digits, so pass False to stop at float32 when precision
        matters.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after conversion.

    Side effects
    ------------
    Prints the memory usage before and after, and the percentage saved.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate target types per NumPy dtype kind, narrowest first.
    narrowing = {
        'i': (np.int8, np.int16, np.int32),
        'u': (np.uint8, np.uint16, np.uint32),
        'f': (np.float16, np.float32) if use_float16 else (np.float32,),
    }

    for col in df.columns:
        col_type = df[col].dtype

        # Free-text columns become categoricals.
        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer / float columns are narrowed.  Dispatching
        # on dtype.kind (rather than on the dtype's name) keeps unsigned ints
        # and bools out of the float branch and skips dtypes such as category
        # or datetime, for which min()/max() or the range checks would fail.
        if not isinstance(col_type, np.dtype) or col_type.kind not in narrowing:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        # Empty or all-NaN column: there is no range to inspect, leave as is.
        if pd.isna(c_min) or pd.isna(c_max):
            continue

        is_float = col_type.kind == 'f'
        for candidate in narrowing[col_type.kind]:
            # Candidates are ordered by size; once they stop being smaller
            # than the current dtype there is nothing left to gain.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            limits = np.finfo(candidate) if is_float else np.iinfo(candidate)
            # Inclusive bounds: a value equal to the type's min/max still fits.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df


In [7]:
# Downcast both splits (train first, then validation); each call prints its
# own before/after memory report.
train, val = reduce_mem_usage(train), reduce_mem_usage(val)


Memory usage of dataframe is 428.22 MB
Memory usage after optimization is: 175.85 MB
Decreased by 58.9%
Memory usage of dataframe is 78.64 MB
Memory usage after optimization is: 47.95 MB
Decreased by 39.0%


In [8]:
# Force a garbage-collection pass (presumably to release the pre-downcast
# column data from the previous cell); the displayed value is the number of
# unreachable objects the collector found.
gc.collect()

22

In [9]:
# Stack the training rows (without the `is_click` label) on top of the
# validation rows and renumber the index 0..n-1.
# DataFrame.drop keeps the remaining columns in their original order; building
# the column list from a set made the column order depend on string hash
# randomisation, i.e. it could change between kernel restarts.
# errors='ignore' preserves the old tolerance for a frame without `is_click`.
data = pd.concat(
    [train.drop(columns=['is_click'], errors='ignore'), val],
    axis=0,
    ignore_index=True,
)

In [10]:
# Preview the first rows of the combined train + validation frame.
data.head()

Unnamed: 0,user_tier,device_category,context_type,week,session_id,product_price,user_id,query_id,previous_page_type,user_country,device_platform,page_type,product_id,context_value,week_day
0,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,designer_id,7,2740b0d77b4e6fafd75321f7d0794210afa8bd650955e7...,0.000263,e5e4c71b1b9456dafece1338762d4ee3db698cf32c384c...,92d4dd491a874a2cf92c8d311a44a42b597c64a5ede23d...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,fec032cb05435471f2305006f4a1ba994c9d2f4bcad8ef...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,596618814963e496d74434df8b8fe3306892f2e4ce6aaa...,bf056e3841dd3a358c6aacb1f9e74e4c7c4adc62e33b45...,e5315dbea15a033bc6974a0bccf5fae4a017648bcd92ab...,6
1,d179859aac8f7c1f88e1ee29b6655596873318c55127d3...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,product_id,6,d5feab37634fd140e85b8f98dcb909a8779b4f0417c73a...,0.012962,fca847f7eb5a5a21991421354b0f26afb4a517e540541f...,541a93bd95c3f4127a53e6b0d4b41db55ad9cb9e19d34a...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,8c82855f15d05cd74fa59956434df17522fc68e4ce3900...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c6513ec49c8e04c265c907933799ff76f24c075c6308c4...,61205c20046f2688cb7ed03cad29d5a5dbdc360ff48290...,5
2,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,product_id,6,18ae37a1a05faa7fd54818794a1a8e44073e00a56fa05f...,0.002733,90ea15d8d96a9d3e7ab463d990e5f4565cac9477498d37...,263ea1e38126fe0c7bfbff24a33b1a09f4dac4f8cd4bb4...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,c94ddcb9053eae77ab9abec807ef2b0bb4efac14404d17...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,b4d5e28da10318aa7776b364528dc92f83ba45326018b5...,3a519c0b692e93feff8810bbbd4654eb297379271a8a04...,5
3,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,product_id,5,10667a5a6047aa173d13997cdcd996cbdaf9b0149f9655...,7e-05,8f88d89f2a71e2adf42f885fa6adedd09bf039843b535c...,3727580d84ce2fbe42ff8bc6f732331f65ea659864a04c...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,8c82855f15d05cd74fa59956434df17522fc68e4ce3900...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,5a36f600d3c01763c28e2dafc53119fba7bcc6a867ab8f...,c739fd56b5999e40d7391008230454adb1e371d30c6973...,5
4,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,product_id,3,8b39f00bfc9d45ed6f64dd39a72548936a157f9f03e7f4...,9e-05,6b21688d90dfd9677fa7979dfc6da5b963c0e1e3d68a38...,1fcf5d263785455311cecf2f864eaa2eeca4da488383d9...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,314d20e9e9ec3e97d1867c8bf8c6feb0c23d918021e175...,ebe7ea6e59e7fbd292af5284048b53d356aac5b57d7557...,4


In [13]:
# Every product id in the combined frame, converted to a plain Python string,
# in row order (duplicates included).
all_products = data["product_id"].apply(str).tolist()

In [14]:
# Collapse the product ids to one entry each; OrderedSet (from ordered_set)
# keeps them in first-seen order, so each product has a stable position.
unique_products = OrderedSet(all_products)
