In [1]:
# ! pip install fastparquet --user
# ! pip install ordered-set

In [1]:
import pandas as pd
import numpy as np
import swifter
import gc
from tqdm import tqdm
from collections import defaultdict
from ordered_set import OrderedSet
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import MultiLabelBinarizer


In [2]:
# Load the phase-1 train and validation splits from parquet files addressed relative to the notebook.
train = pd.read_parquet("../data_phase1/train.parquet")
val = pd.read_parquet("../data_phase1/validation.parquet")


In [None]:
# Replace missing context_type entries with the literal string "NA" in both splits;
# other columns are not touched because the fill value is keyed by column name.
train, val = (
    split.fillna(value={"context_type": "NA"})
    for split in (train, val)
)


In [None]:
# Summarise the object-dtype columns of the training split.
train.describe(include='O')

In [None]:
# Same object-column summary for the validation split, for side-by-side comparison with train.
val.describe(include='O')

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce memory and return ``df``.

    Per column:

    * object / pandas string columns are converted to ``category``;
    * numpy integer columns (signed or unsigned) are cast to the smallest of
      int8 / int16 / int32 / int64 whose range contains the column's min and max
      (bounds are inclusive, so e.g. a max of exactly 127 still fits int8);
    * numpy float columns are cast to float16, float32 or float64 by value range;
    * every other dtype (bool, datetime, category, nullable extension types, ...)
      is left untouched, which also makes the function safe to call twice on
      the same frame.

    Memory before/after and the percentage saved are printed.

    Args:
        df: DataFrame to shrink. It is modified in place.

    Returns:
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        # Text columns become categoricals.
        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy integer/float columns are downcast. Anything else used to
        # fall into the float branch (bool/uint -> float16) or raise (category, datetime).
        kind = col_type.kind if isinstance(col_type, np.dtype) else None
        if kind not in ('i', 'u', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if pd.isna(c_min) or pd.isna(c_max):
            # Empty or all-NaN column: no value range to decide on, keep the dtype.
            continue

        if kind in ('i', 'u'):
            lowest, highest = int(c_min), int(c_max)
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                bounds = np.iinfo(candidate)
                if bounds.min <= lowest and highest <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No candidate fits only for uint64 values beyond int64; keep such columns as-is.
        elif c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
            # NOTE(review): the choice is made on range only; float16 keeps roughly three
            # significant decimal digits — confirm that is acceptable for these features.
            df[col] = df[col].astype(np.float16)
        elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
            df[col] = df[col].astype(np.float32)
        else:
            df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df


In [None]:
# Downcast both splits. The helper assigns the converted columns onto the frame it receives
# and returns that same frame, so the rebinding here is only for readability.
train = reduce_mem_usage(train)
val = reduce_mem_usage(val)


In [None]:
# Run a garbage-collection pass after the dtype conversions.
gc.collect()

In [9]:
# Order both splits by query_id so rows of the same query sit next to each other,
# and renumber the rows 0..n-1 in that new order.
train = train.sort_values('query_id', ignore_index=True)
val = val.sort_values('query_id', ignore_index=True)


In [10]:
# Preview the first rows of the sorted training split.
train.head()

Unnamed: 0,query_id,user_id,session_id,product_id,page_type,previous_page_type,device_category,device_platform,user_tier,user_country,context_type,context_value,product_price,week,week_day,is_click
0,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,ff2088c2fd3bf9cb8c2601252dd65a99201f59972820d3...,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,1efd18182268101b62a1ea12a9cafbe05487f3abb92924...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d05d943023ee7c2091506a72ce25841e84f88a4233a11d...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,a6e9c2a832972484be2bd4378135743d1d00d651502c38...,0.000466,6,6,0
1,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,ff2088c2fd3bf9cb8c2601252dd65a99201f59972820d3...,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,62349c6eda0dc9fe8bb023213f03ebe93aefa5cbcdfecf...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d05d943023ee7c2091506a72ce25841e84f88a4233a11d...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,a6e9c2a832972484be2bd4378135743d1d00d651502c38...,0.001068,6,6,0
2,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,ff2088c2fd3bf9cb8c2601252dd65a99201f59972820d3...,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,aa0bbdfa55326b5c08d3472b1ee1d56fe13a82f63f46c8...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d05d943023ee7c2091506a72ce25841e84f88a4233a11d...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,a6e9c2a832972484be2bd4378135743d1d00d651502c38...,0.001238,6,6,1
3,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,ff2088c2fd3bf9cb8c2601252dd65a99201f59972820d3...,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,793301d78ea43b599acf05d350c8f9e485f5deaa417284...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d05d943023ee7c2091506a72ce25841e84f88a4233a11d...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,a6e9c2a832972484be2bd4378135743d1d00d651502c38...,0.000465,6,6,0
4,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,ff2088c2fd3bf9cb8c2601252dd65a99201f59972820d3...,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,9e03ca4af958bef1ce5d54e684a40ee7e3a9aa7e5010b6...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d05d943023ee7c2091506a72ce25841e84f88a4233a11d...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,a6e9c2a832972484be2bd4378135743d1d00d651502c38...,0.000936,6,6,0


In [11]:
# Count missing values per column of the training split.
train.isnull().sum()

query_id              0
user_id               0
session_id            0
product_id            0
page_type             0
previous_page_type    0
device_category       0
device_platform       0
user_tier             0
user_country          0
context_type          0
context_value         0
product_price         0
week                  0
week_day              0
is_click              0
dtype: int64

In [13]:
# Stack the training rows (without the is_click label) on top of the validation rows.
# drop(columns=...) keeps the remaining columns in their original order; the previous
# set difference yielded an arbitrary column order that could differ from run to run.
data = pd.concat([train.drop(columns=['is_click']), val], axis=0)
data = data.reset_index(drop=True)

In [14]:
# Product id of every row in the combined frame, converted to str (row order and repeats kept).
all_products = list(data["product_id"].apply(str))

In [15]:
# De-duplicate the product ids with an OrderedSet (imported from ordered_set).
unique_products = OrderedSet(all_products)


In [16]:
# session_id -> list of product_ids appearing in that session, one dict per split.
train_session_interactions = train.groupby('session_id')['product_id'].apply(list).to_dict()
val_session_interactions = val.groupby('session_id')['product_id'].apply(list).to_dict()

# session_id -> list of is_click values, built by the same grouping as the product lists above.
train_session_actions = train.groupby('session_id')['is_click'].apply(list).to_dict()
# val_session_actions = dict(val.groupby('session_id')['is_click'].apply(list))


In [17]:
# Running day index: seven days for every week before the current one, plus week_day.
# Both inputs are cast to int before the arithmetic (presumably because the memory-reduction
# step may have narrowed them to small integer types — verify).
# NOTE(review): assumes `week` is 1-based; confirm the base of `week_day`.
train['days_elapsed'] = train['week_day'].astype(int) + 7 * (train['week'].astype(int) - 1)
val['days_elapsed'] = val['week_day'].astype(int) + 7 * (val['week'].astype(int) - 1)


In [12]:
# Load the product attribute table (one row per product_id in the preview below) and preview it.
attribute_df = pd.read_parquet("../data_phase1/attributes.parquet")
attribute_df.head()

Unnamed: 0,product_id,gender,main_colour,second_colour,season,collection,category_id_l1,category_id_l2,category_id_l3,brand_id,season_year,start_online_date,material_values,attribute_values
0,0013f07ccdf212210c110e63f0de46e37669c17a4d855a...,a8c9cca4c116691f1e331a5058e84f05e31696bc4f611c...,7673fc4fdc325f3785a223787d2b32e381e8b4c1c8a765...,4737cd35940c2338e96c18a25aeb6848d46f0da795bce8...,847a067597e39838f1f85b0774f44e68b4d6e64d3ec4dd...,a3791e8d85c005b0d9d60d6d3b7e8edd2f256a5cc928d0...,abf367e49718254e068eee51a565fdcede2d741e4c7e33...,61fe255948ec07c4eb25c70f7144b54beddd00466ad866...,05f872d8b8ec85642ad49786d6e443c0df6e7df4bdcba3...,dd6ea8954a945ef0889f30d57b7fdb8d6aaad397e6c6ff...,c7c4ac6af030e54d02b9e4545e4223e76515c3ce4e498e...,1067.041667,f61ecea9b45f1590e57706b88207449bdd4cb703b917ad...,8b45c5d5e010acf257787f2ce0c505857d94709c436991...
1,002239cd57f19f22e557030dff363dfbd1344d8f7ac829...,4a00d8b84bdb2ec2f219304d3883a46336f9fb38d2f1e6...,0f97dafafa5dc4bb18853ea00776dfcc52302f40411b50...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,9db35d237f5873f0410d3ca18c07430270086eb1e7838d...,a3791e8d85c005b0d9d60d6d3b7e8edd2f256a5cc928d0...,e54f8513b708db3afdbd4950bd3420579a8cddabf4c1b3...,3809cade495cd7dc289e6aee521d380549ebd3456f03bc...,fd021cd2dbaf0d7b6105a1b136cf5a094e025010a2096f...,a6536c6bc250d525ccd3b63a3ec483a33a2010422932a3...,ef7d8d2e02aec8c328dafde95486f7181c37d07be3d167...,215.041667,1675f293342bbb518ba3a5ad39399aa0a13580653d253e...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...
2,0028022e1ecbbf92f03a1edb9accb58e7c682e7cd89897...,a8c9cca4c116691f1e331a5058e84f05e31696bc4f611c...,0f97dafafa5dc4bb18853ea00776dfcc52302f40411b50...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,9db35d237f5873f0410d3ca18c07430270086eb1e7838d...,4c8006c7e513057a9138641abc2a9e65f4b014a8045259...,bb660069187af9e9238d10a742def09bf2bed60435b088...,8d4b33036479822fc696f32e1252b16e5105b91b82d564...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,c9c2f76b2ae7911c95e1b9568a614c14bd0eebc750cade...,dc1194ba428d5cd4c49f8a769a6577ac1042162da38bc1...,519.0,5254281b3c0b606d2c56ae1747cf0b0a868053cc3246d2...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...
3,00433de93d9cb6b08584423a6b54306abacef89fbddffe...,a8c9cca4c116691f1e331a5058e84f05e31696bc4f611c...,9a446cf5272a0694254db28c796c058405fc9caeb6352f...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,9db35d237f5873f0410d3ca18c07430270086eb1e7838d...,a3791e8d85c005b0d9d60d6d3b7e8edd2f256a5cc928d0...,abf367e49718254e068eee51a565fdcede2d741e4c7e33...,c71ada9c30cd0210a4bdd2d54172dc7be4f07c9ffdbaa3...,f9226009034d0eecc774be42fbe07e9cdcf9ca5dd24fb5...,2f85e4e89f3d731a3fcc43c1ea068600dc082d9b2656de...,ef7d8d2e02aec8c328dafde95486f7181c37d07be3d167...,202.041667,2a9ca66cf16c629b4d0fb6d78e988a199db1490ef21d78...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...
4,009623ea17e53324f8f5a3f45f5b21b9a885ea2765de82...,a8c9cca4c116691f1e331a5058e84f05e31696bc4f611c...,68b6499cff2b4a31b0927effd65c194c69c24954fcb80b...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,9db35d237f5873f0410d3ca18c07430270086eb1e7838d...,b5beb2ed60981746e0b908797b3d3abb8491de15ff5575...,abf367e49718254e068eee51a565fdcede2d741e4c7e33...,c71ada9c30cd0210a4bdd2d54172dc7be4f07c9ffdbaa3...,162f8b725de80863b3ced87304a2922fe3d1bd1f25562f...,668b3eeef29561fa9dd4da956ca3eb2787b2d8df515eed...,ef7d8d2e02aec8c328dafde95486f7181c37d07be3d167...,211.041667,d276da9d2047f312bb486b3b59a646f046d72bc3cf2e19...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...


In [13]:
# Run a garbage-collection pass; the displayed number is gc.collect()'s return value.
gc.collect()

129

In [20]:
def str_col_to_list(row, col):
    """Split the comma-separated string stored under ``col`` in ``row``.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) indexable by ``col``.
        col: Key whose value is a comma-separated string.

    Returns:
        List of the substrings between commas, in their original order.
    """
    raw_value = row[col]
    return raw_value.split(',')


In [21]:
# Turn the comma-separated material_values string of every product into a list of labels.
# Re-running this cell on the already-converted column fails, because lists have no .split().
attribute_df["material_values"] = attribute_df.apply(str_col_to_list, axis=1, col="material_values")


In [22]:
# Turn the comma-separated attribute_values string of every product into a list of labels.
# Re-running this cell on the already-converted column fails, because lists have no .split().
attribute_df["attribute_values"] = attribute_df.apply(str_col_to_list, axis=1, col="attribute_values")


In [23]:
# Multi-hot encode the per-product material label lists: one 0/1 column per distinct label,
# indexed like attribute_df so the result can be concatenated column-wise later.
materials = attribute_df["material_values"]
mlb_mat = MultiLabelBinarizer()
materials_one_hot = pd.DataFrame(
    data=mlb_mat.fit_transform(materials),
    index=materials.index,
    columns=mlb_mat.classes_,
)


In [24]:
# Multi-hot encode the per-product attribute label lists: one 0/1 column per distinct label,
# indexed like attribute_df so the result can be concatenated column-wise later.
attr = attribute_df["attribute_values"]
mlb_attr = MultiLabelBinarizer()
attr_one_hot = pd.DataFrame(
    data=mlb_attr.fit_transform(attr),
    index=attr.index,
    columns=mlb_attr.classes_,
)


In [25]:
# (number of products, number of distinct attribute labels) of the indicator matrix.
attr_one_hot.shape

(443150, 129)

In [26]:
# Append the material and attribute indicator columns to the product table.
# `axis` is passed by keyword: the positional form `pd.concat([...], 1)` was deprecated and
# every argument after `objs` is keyword-only from pandas 2.0, where it raises TypeError.
# NOTE(review): both indicator frames use raw label values as column names; if a label occurs
# in both material_values and attribute_values the result has duplicate column names — confirm.
attribute_df = pd.concat([attribute_df, materials_one_hot, attr_one_hot], axis=1)


In [27]:
# Preview two rows to confirm the indicator columns were appended.
attribute_df.head(2)

Unnamed: 0,product_id,gender,main_colour,second_colour,season,collection,category_id_l1,category_id_l2,category_id_l3,brand_id,...,f065a4c3d09389d54142658a19195a7cff537993d568464b777de1b558e85d42,f3b306fc59880e012ebe6fdea5232b919f509f3145175893c5bf85e757aaff9d,f4fed2b19d01634f9ee192f85ad78f031ec6e689e8b5273309a7a97752eeaca1,f7ce374e8f399da0b38bba96cb601a73acb742ad0a8a17e866ef0aaceed7da9d,f8713c99b122b53e26950834956ab0a523ea6b47ac6606444eeb3e05b82b741c,f9a55f03725a3a899b0f7bb4496b3958703815fcd4fe566e32e2186f6211ecbf,fb04cd95c93be959ee9000a1d6c9fec60178e98d78ba6c84985d8c8fdb8d8b67,fb3f233dce3b5c12ac0c6198b5e43a0ba5eac14340c8cdb6fea724b48cf7b0db,fd3cc98f1cfd8c2266e7abb23ef6e21de21c5b184f9192e21ba4ef760cc7aeef,ff6720923d4f8d3a730692db85c126af15eb6b21c7f68fc85d8e93ad4b279d51
0,0013f07ccdf212210c110e63f0de46e37669c17a4d855a...,a8c9cca4c116691f1e331a5058e84f05e31696bc4f611c...,7673fc4fdc325f3785a223787d2b32e381e8b4c1c8a765...,4737cd35940c2338e96c18a25aeb6848d46f0da795bce8...,847a067597e39838f1f85b0774f44e68b4d6e64d3ec4dd...,a3791e8d85c005b0d9d60d6d3b7e8edd2f256a5cc928d0...,abf367e49718254e068eee51a565fdcede2d741e4c7e33...,61fe255948ec07c4eb25c70f7144b54beddd00466ad866...,05f872d8b8ec85642ad49786d6e443c0df6e7df4bdcba3...,dd6ea8954a945ef0889f30d57b7fdb8d6aaad397e6c6ff...,...,0,0,0,0,0,0,0,0,0,0
1,002239cd57f19f22e557030dff363dfbd1344d8f7ac829...,4a00d8b84bdb2ec2f219304d3883a46336f9fb38d2f1e6...,0f97dafafa5dc4bb18853ea00776dfcc52302f40411b50...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,9db35d237f5873f0410d3ca18c07430270086eb1e7838d...,a3791e8d85c005b0d9d60d6d3b7e8edd2f256a5cc928d0...,e54f8513b708db3afdbd4950bd3420579a8cddabf4c1b3...,3809cade495cd7dc289e6aee521d380549ebd3456f03bc...,fd021cd2dbaf0d7b6105a1b136cf5a094e025010a2096f...,a6536c6bc250d525ccd3b63a3ec483a33a2010422932a3...,...,0,0,0,0,0,0,0,0,0,0


In [28]:
# attribute_df.to_csv('../preprocessed_data/product_material_attribute_one-hot.csv', index=False)

In [29]:
# Stack train and validation rows. The original index values of each split are kept.
# `axis` is passed by keyword because pandas 2.0 rejects the positional form `pd.concat([...], 0)`.
full_data = pd.concat([train, val], axis=0)

In [30]:
# Keep only the session/product keys and the click label for the attribute-based features.
# In the recorded preview is_click is shown as a float (0.0 / 1.0).
full_data = full_data[["session_id", "product_id", "is_click"]]
full_data.head()


Unnamed: 0,session_id,product_id,is_click
0,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,1efd18182268101b62a1ea12a9cafbe05487f3abb92924...,0.0
1,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,62349c6eda0dc9fe8bb023213f03ebe93aefa5cbcdfecf...,0.0
2,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,aa0bbdfa55326b5c08d3472b1ee1d56fe13a82f63f46c8...,1.0
3,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,793301d78ea43b599acf05d350c8f9e485f5deaa417284...,0.0
4,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,9e03ca4af958bef1ce5d54e684a40ee7e3a9aa7e5010b6...,0.0


In [31]:
# Compare the total row count with the number of distinct (session_id, product_id, is_click) rows
# to see how many exact repeats the combined data contains.
full_data.shape, full_data.drop_duplicates().shape

((4195182, 3), (3739897, 3))

In [32]:
# Attach colour, gender and start_online_date attributes to every impression row.
# A left join keeps all rows of full_data; products absent from attribute_df get NaN attributes.
temp_attr_df = full_data.merge(attribute_df[["product_id", "main_colour", "second_colour", "gender", "start_online_date"]], on="product_id", how="left")
temp_attr_df.head(2)

Unnamed: 0,session_id,product_id,is_click,main_colour,second_colour,gender,start_online_date
0,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,1efd18182268101b62a1ea12a9cafbe05487f3abb92924...,0.0,8770603751ae3b965ab13bdd02c607985cd56be86d850b...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,a8c9cca4c116691f1e331a5058e84f05e31696bc4f611c...,573.041667
1,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,62349c6eda0dc9fe8bb023213f03ebe93aefa5cbcdfecf...,0.0,0f97dafafa5dc4bb18853ea00776dfcc52302f40411b50...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,a8c9cca4c116691f1e331a5058e84f05e31696bc4f611c...,262.041667


In [33]:
# The row count should equal full_data's: the left join must not multiply rows.
temp_attr_df.shape

(4195182, 7)

In [34]:
# Keep one row per (session_id, product_id) pair, retaining the first occurrence.
# NOTE(review): the recorded outputs (3,739,897 distinct session/product/is_click rows vs
# 3,313,157 distinct session/product pairs) show that some pairs occur with more than one
# is_click value; only the first one survives here — confirm this is the intended label.
temp_attr_df_dedup = temp_attr_df.drop_duplicates(["session_id", "product_id"], keep="first")

In [35]:
# Number of distinct (session_id, product_id) pairs after de-duplication.
temp_attr_df_dedup.shape

(3313157, 7)

In [36]:
# Run a garbage-collection pass; the displayed number is gc.collect()'s return value.
gc.collect()

88

In [37]:
# Per-session mean of start_online_date, computed over every impression row
# (temp_attr_df, i.e. before de-duplication, so repeated products weigh more).
session_start_online_mean_df = (
    temp_attr_df
    .groupby("session_id")["start_online_date"]
    .mean()
    .rename("session_start_online_date_mean")
    .reset_index()
)
session_start_online_mean_df.head()


Unnamed: 0,session_id,session_start_online_date_mean
0,00000310a3874db670d94513217ac53d73be25ea0bc1c9...,375.934028
1,00002bea5c0c174264e846ce5beb196fd5b36f1d64e4d8...,315.354167
2,000055d87de3ef0e5c614ce4ec6c04a37405d767332477...,582.875
3,00005e9475a26ea9798de9f24b7c43273122c4fd4db841...,401.371528
4,0000b16e532d23c2c8a904b438408bbed6728cdf18eed3...,201.541667


In [38]:
# Per-session maximum of start_online_date across the session's impression rows.
session_start_online_max_df = (
    temp_attr_df
    .groupby("session_id")["start_online_date"]
    .max()
    .rename("session_start_online_date_max")
    .reset_index()
)
session_start_online_max_df.head()


Unnamed: 0,session_id,session_start_online_date_max
0,00000310a3874db670d94513217ac53d73be25ea0bc1c9...,772.0
1,00002bea5c0c174264e846ce5beb196fd5b36f1d64e4d8...,825.0
2,000055d87de3ef0e5c614ce4ec6c04a37405d767332477...,688.041667
3,00005e9475a26ea9798de9f24b7c43273122c4fd4db841...,903.041667
4,0000b16e532d23c2c8a904b438408bbed6728cdf18eed3...,246.041667


In [39]:
# Per-session minimum of start_online_date across the session's impression rows.
session_start_online_min_df = (
    temp_attr_df
    .groupby("session_id")["start_online_date"]
    .min()
    .rename("session_start_online_date_min")
    .reset_index()
)
session_start_online_min_df.head()


Unnamed: 0,session_id,session_start_online_date_min
0,00000310a3874db670d94513217ac53d73be25ea0bc1c9...,150.0
1,00002bea5c0c174264e846ce5beb196fd5b36f1d64e4d8...,118.0
2,000055d87de3ef0e5c614ce4ec6c04a37405d767332477...,279.041667
3,00005e9475a26ea9798de9f24b7c43273122c4fd4db841...,200.041667
4,0000b16e532d23c2c8a904b438408bbed6728cdf18eed3...,160.041667


In [40]:
# Combine the three per-session statistics (mean, max, min) into one frame keyed by session_id.
session_start_online_feature_df = (
    session_start_online_mean_df
    .merge(session_start_online_max_df, on="session_id", how="left")
    .merge(session_start_online_min_df, on="session_id", how="left")
)
session_start_online_feature_df.head(3)


Unnamed: 0,session_id,session_start_online_date_mean,session_start_online_date_max,session_start_online_date_min
0,00000310a3874db670d94513217ac53d73be25ea0bc1c9...,375.934028,772.0,150.0
1,00002bea5c0c174264e846ce5beb196fd5b36f1d64e4d8...,315.354167,825.0,118.0
2,000055d87de3ef0e5c614ce4ec6c04a37405d767332477...,582.875,688.041667,279.041667


In [41]:
# Row/column count before attaching the session-level statistics.
temp_attr_df_dedup.shape

(3313157, 7)

In [42]:
# Broadcast the per-session start_online_date statistics onto every (session, product) row.
# The right-hand frame has one row per session, so the left join should not change the row count.
temp_attr_df_dedup = temp_attr_df_dedup.merge(session_start_online_feature_df, on=["session_id"], how="left")


In [43]:
# Same number of rows as before the merge, three more columns (mean, max, min).
temp_attr_df_dedup.shape

(3313157, 10)

In [44]:
# session_id -> list of attribute values seen in the session, one entry per impression row
# of temp_attr_df (repeats kept). These are the lookups used for the in-session frequency counts.
session_main_colour_group = temp_attr_df.groupby('session_id')["main_colour"].apply(list).to_dict()
session_second_colour_group = temp_attr_df.groupby('session_id')["second_colour"].apply(list).to_dict()
session_gender_group = temp_attr_df.groupby('session_id')["gender"].apply(list).to_dict()


In [45]:
def count_feature_frequency_in_session(row, feature, dic):
    """Count the entries of the row's session list that equal the row's own value.

    Args:
        row: Mapping-like record providing ``"session_id"`` and ``feature``.
        feature: Key of the value in ``row`` to compare against.
        dic: Maps a session id to the list of values observed in that session.

    Returns:
        Number of list entries that compare equal (``==``) to ``row[feature]``.
        Values that never compare equal to themselves (e.g. NaN) therefore count 0.
    """
    return sum(1 for seen in dic[row["session_id"]] if seen == row[feature])


In [46]:
# temp_attr_df.shape, temp_attr_df.drop_duplicates().shape

In [47]:
# temp_attr_df = temp_attr_df.drop_duplicates()

In [48]:
# For every de-duplicated (session, product) row, count how many impression rows of the same
# session share this row's main_colour. The lookup lists come from temp_attr_df, so repeated
# impressions are counted.
# NOTE(review): this is a row-wise Python apply over every row; a groupby size on
# (session_id, main_colour) merged back may give the same counts — check NaN handling first.
temp_attr_df_dedup["main_colour_freq_in session"] = temp_attr_df_dedup.swifter.apply(lambda x: count_feature_frequency_in_session(x, "main_colour", session_main_colour_group), 1)



HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=3313157.0), HTML(value='')))




In [49]:
# Same in-session frequency count as for main_colour, applied to second_colour.
temp_attr_df_dedup["second_colour_freq_in session"] = temp_attr_df_dedup.swifter.apply(lambda x: count_feature_frequency_in_session(x, "second_colour", session_second_colour_group), 1)


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=3313157.0), HTML(value='')))




In [50]:
# Same in-session frequency count as for main_colour, applied to gender.
temp_attr_df_dedup["gender_freq_in session"] = temp_attr_df_dedup.swifter.apply(lambda x: count_feature_frequency_in_session(x, "gender", session_gender_group), 1)


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=3313157.0), HTML(value='')))




In [51]:
# Three frequency columns were added; the row count is unchanged.
temp_attr_df_dedup.shape

(3313157, 13)

In [52]:
# Preview the colour/gender frequency and start_online_date features.
temp_attr_df_dedup.head()

Unnamed: 0,session_id,product_id,is_click,main_colour,second_colour,gender,start_online_date,session_start_online_date_mean,session_start_online_date_max,session_start_online_date_min,main_colour_freq_in session,second_colour_freq_in session,gender_freq_in session
0,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,1efd18182268101b62a1ea12a9cafbe05487f3abb92924...,0.0,8770603751ae3b965ab13bdd02c607985cd56be86d850b...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,a8c9cca4c116691f1e331a5058e84f05e31696bc4f611c...,573.041667,314.947917,573.041667,133.0,1,12,12
1,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,62349c6eda0dc9fe8bb023213f03ebe93aefa5cbcdfecf...,0.0,0f97dafafa5dc4bb18853ea00776dfcc52302f40411b50...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,a8c9cca4c116691f1e331a5058e84f05e31696bc4f611c...,262.041667,314.947917,573.041667,133.0,1,12,12
2,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,aa0bbdfa55326b5c08d3472b1ee1d56fe13a82f63f46c8...,1.0,752e245892cce8aa87b8f41c583f95c7059cbaa4e9d61b...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,a8c9cca4c116691f1e331a5058e84f05e31696bc4f611c...,226.041667,314.947917,573.041667,133.0,4,12,12
3,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,793301d78ea43b599acf05d350c8f9e485f5deaa417284...,0.0,752e245892cce8aa87b8f41c583f95c7059cbaa4e9d61b...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,a8c9cca4c116691f1e331a5058e84f05e31696bc4f611c...,394.0,314.947917,573.041667,133.0,4,12,12
4,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,9e03ca4af958bef1ce5d54e684a40ee7e3a9aa7e5010b6...,0.0,752e245892cce8aa87b8f41c583f95c7059cbaa4e9d61b...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,a8c9cca4c116691f1e331a5058e84f05e31696bc4f611c...,273.041667,314.947917,573.041667,133.0,4,12,12


In [53]:
# Run a garbage-collection pass; the displayed number is gc.collect()'s return value.
gc.collect()

151

In [54]:
# Attach season, collection, category (l1-l3) and brand attributes to every impression row.
# A left join keeps all rows of full_data; products absent from attribute_df get NaN attributes.
temp_attr_df_v2 = full_data.merge(attribute_df[["product_id", "season", "collection", "category_id_l1", "category_id_l2", "category_id_l3", "brand_id"]], on="product_id", how="left")



In [55]:
# Number of impression rows per session that have a non-null `season`.
# NOTE(review): despite the "frequency" name this count does not depend on the season value;
# confirm whether a per-value count (as done for the colour features) was intended.
session_season_count = (
    temp_attr_df_v2
    .groupby('session_id')['season']
    .count()
    .rename("session_season_frequency")
    .reset_index()
)
session_season_count.head()


Unnamed: 0,session_id,session_season_frequency
0,00000310a3874db670d94513217ac53d73be25ea0bc1c9...,12
1,00002bea5c0c174264e846ce5beb196fd5b36f1d64e4d8...,6
2,000055d87de3ef0e5c614ce4ec6c04a37405d767332477...,6
3,00005e9475a26ea9798de9f24b7c43273122c4fd4db841...,12
4,0000b16e532d23c2c8a904b438408bbed6728cdf18eed3...,12


In [56]:
# Number of impression rows per session that have a non-null `collection`.
# NOTE(review): this count does not depend on the collection value itself — confirm the intent.
session_collection_count = (
    temp_attr_df_v2
    .groupby('session_id')['collection']
    .count()
    .rename("session_collection_frequency")
    .reset_index()
)
session_collection_count.head(2)


Unnamed: 0,session_id,session_collection_frequency
0,00000310a3874db670d94513217ac53d73be25ea0bc1c9...,12
1,00002bea5c0c174264e846ce5beb196fd5b36f1d64e4d8...,6


In [57]:
# Number of impression rows per session that have a non-null `category_id_l1`.
# NOTE(review): this count does not depend on the category value itself — confirm the intent.
session_category_id_l1_count = (
    temp_attr_df_v2
    .groupby('session_id')['category_id_l1']
    .count()
    .rename("session_category_id_l1_frequency")
    .reset_index()
)
session_category_id_l1_count.head(2)


Unnamed: 0,session_id,session_category_id_l1_frequency
0,00000310a3874db670d94513217ac53d73be25ea0bc1c9...,12
1,00002bea5c0c174264e846ce5beb196fd5b36f1d64e4d8...,6


In [58]:
# Number of impression rows per session that have a non-null `category_id_l2`.
# NOTE(review): this count does not depend on the category value itself — confirm the intent.
session_category_id_l2_count = (
    temp_attr_df_v2
    .groupby('session_id')['category_id_l2']
    .count()
    .rename("session_category_id_l2_frequency")
    .reset_index()
)
session_category_id_l2_count.head(2)


Unnamed: 0,session_id,session_category_id_l2_frequency
0,00000310a3874db670d94513217ac53d73be25ea0bc1c9...,12
1,00002bea5c0c174264e846ce5beb196fd5b36f1d64e4d8...,6


In [59]:
# Number of impression rows per session that have a non-null `brand_id`.
# NOTE(review): this count does not depend on the brand value itself — confirm the intent.
session_brand_id_count = (
    temp_attr_df_v2
    .groupby('session_id')['brand_id']
    .count()
    .rename("session_brand_id_frequency")
    .reset_index()
)
session_brand_id_count.head(2)


Unnamed: 0,session_id,session_brand_id_frequency
0,00000310a3874db670d94513217ac53d73be25ea0bc1c9...,12
1,00002bea5c0c174264e846ce5beb196fd5b36f1d64e4d8...,6


In [60]:
# Keep one row per (session_id, product_id) pair, retaining the first occurrence.
# The per-session counts in the preceding cells were computed before this step, so they include repeats.
temp_attr_df_v2 = temp_attr_df_v2.drop_duplicates(["session_id", "product_id"], keep="first")

In [61]:
# The row count should match the other de-duplicated frame (temp_attr_df_dedup).
temp_attr_df_v2.shape

(3313157, 9)

In [62]:
# Broadcast each per-session count onto the de-duplicated (session, product) rows.
temp_attr_df_v2 = (
    temp_attr_df_v2
    .merge(session_season_count, on="session_id", how="left")
    .merge(session_collection_count, on="session_id", how="left")
    .merge(session_category_id_l1_count, on="session_id", how="left")
    .merge(session_category_id_l2_count, on="session_id", how="left")
    .merge(session_brand_id_count, on="session_id", how="left")
)


In [63]:
# Preview the rows with the session-level count columns attached.
temp_attr_df_v2.head()

Unnamed: 0,session_id,product_id,is_click,season,collection,category_id_l1,category_id_l2,category_id_l3,brand_id,session_season_frequency,session_collection_frequency,session_category_id_l1_frequency,session_category_id_l2_frequency,session_brand_id_frequency
0,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,1efd18182268101b62a1ea12a9cafbe05487f3abb92924...,0.0,9db35d237f5873f0410d3ca18c07430270086eb1e7838d...,a3791e8d85c005b0d9d60d6d3b7e8edd2f256a5cc928d0...,bb660069187af9e9238d10a742def09bf2bed60435b088...,663ca810e6cd391f1a8c24b0e39c4ac59f7e04c4759356...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,09e1d96d9e05b0bcab8bcba3943b899adb5177a8a4fe81...,12,12,12,12,12
1,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,62349c6eda0dc9fe8bb023213f03ebe93aefa5cbcdfecf...,0.0,9db35d237f5873f0410d3ca18c07430270086eb1e7838d...,4c8006c7e513057a9138641abc2a9e65f4b014a8045259...,abf367e49718254e068eee51a565fdcede2d741e4c7e33...,61fe255948ec07c4eb25c70f7144b54beddd00466ad866...,ed7da2b533e9ccdf3dcaede4d2b8d170ffc32e2b922ee8...,2b225d501fd0ecd875cc3412eb5a14f96e98abc7b2f6bc...,12,12,12,12,12
2,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,aa0bbdfa55326b5c08d3472b1ee1d56fe13a82f63f46c8...,1.0,9db35d237f5873f0410d3ca18c07430270086eb1e7838d...,4c8006c7e513057a9138641abc2a9e65f4b014a8045259...,abf367e49718254e068eee51a565fdcede2d741e4c7e33...,61fe255948ec07c4eb25c70f7144b54beddd00466ad866...,ed7da2b533e9ccdf3dcaede4d2b8d170ffc32e2b922ee8...,2b225d501fd0ecd875cc3412eb5a14f96e98abc7b2f6bc...,12,12,12,12,12
3,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,793301d78ea43b599acf05d350c8f9e485f5deaa417284...,0.0,847a067597e39838f1f85b0774f44e68b4d6e64d3ec4dd...,a3791e8d85c005b0d9d60d6d3b7e8edd2f256a5cc928d0...,bb660069187af9e9238d10a742def09bf2bed60435b088...,663ca810e6cd391f1a8c24b0e39c4ac59f7e04c4759356...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,09e1d96d9e05b0bcab8bcba3943b899adb5177a8a4fe81...,12,12,12,12,12
4,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,9e03ca4af958bef1ce5d54e684a40ee7e3a9aa7e5010b6...,0.0,9db35d237f5873f0410d3ca18c07430270086eb1e7838d...,4c8006c7e513057a9138641abc2a9e65f4b014a8045259...,abf367e49718254e068eee51a565fdcede2d741e4c7e33...,61fe255948ec07c4eb25c70f7144b54beddd00466ad866...,ed7da2b533e9ccdf3dcaede4d2b8d170ffc32e2b922ee8...,2b225d501fd0ecd875cc3412eb5a14f96e98abc7b2f6bc...,12,12,12,12,12


In [64]:
# Join the two de-duplicated feature frames on their (session_id, product_id) key.
# Only the feature columns are taken from temp_attr_df_dedup; is_click comes from temp_attr_df_v2.
merged_df = temp_attr_df_v2.merge(
    temp_attr_df_dedup[[
        "session_id",
        "product_id",
        "main_colour",
        "second_colour",
        "gender",
        "main_colour_freq_in session",
        "second_colour_freq_in session",
        "gender_freq_in session",
        "session_start_online_date_mean",
        "session_start_online_date_max",
        "session_start_online_date_min",
    ]],
    on=["session_id", "product_id"],
    how="left",
)



In [65]:
# Preview the combined feature frame.
merged_df.head()

Unnamed: 0,session_id,product_id,is_click,season,collection,category_id_l1,category_id_l2,category_id_l3,brand_id,session_season_frequency,...,session_brand_id_frequency,main_colour,second_colour,gender,main_colour_freq_in session,second_colour_freq_in session,gender_freq_in session,session_start_online_date_mean,session_start_online_date_max,session_start_online_date_min
0,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,1efd18182268101b62a1ea12a9cafbe05487f3abb92924...,0.0,9db35d237f5873f0410d3ca18c07430270086eb1e7838d...,a3791e8d85c005b0d9d60d6d3b7e8edd2f256a5cc928d0...,bb660069187af9e9238d10a742def09bf2bed60435b088...,663ca810e6cd391f1a8c24b0e39c4ac59f7e04c4759356...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,09e1d96d9e05b0bcab8bcba3943b899adb5177a8a4fe81...,12,...,12,8770603751ae3b965ab13bdd02c607985cd56be86d850b...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,a8c9cca4c116691f1e331a5058e84f05e31696bc4f611c...,1,12,12,314.947917,573.041667,133.0
1,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,62349c6eda0dc9fe8bb023213f03ebe93aefa5cbcdfecf...,0.0,9db35d237f5873f0410d3ca18c07430270086eb1e7838d...,4c8006c7e513057a9138641abc2a9e65f4b014a8045259...,abf367e49718254e068eee51a565fdcede2d741e4c7e33...,61fe255948ec07c4eb25c70f7144b54beddd00466ad866...,ed7da2b533e9ccdf3dcaede4d2b8d170ffc32e2b922ee8...,2b225d501fd0ecd875cc3412eb5a14f96e98abc7b2f6bc...,12,...,12,0f97dafafa5dc4bb18853ea00776dfcc52302f40411b50...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,a8c9cca4c116691f1e331a5058e84f05e31696bc4f611c...,1,12,12,314.947917,573.041667,133.0
2,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,aa0bbdfa55326b5c08d3472b1ee1d56fe13a82f63f46c8...,1.0,9db35d237f5873f0410d3ca18c07430270086eb1e7838d...,4c8006c7e513057a9138641abc2a9e65f4b014a8045259...,abf367e49718254e068eee51a565fdcede2d741e4c7e33...,61fe255948ec07c4eb25c70f7144b54beddd00466ad866...,ed7da2b533e9ccdf3dcaede4d2b8d170ffc32e2b922ee8...,2b225d501fd0ecd875cc3412eb5a14f96e98abc7b2f6bc...,12,...,12,752e245892cce8aa87b8f41c583f95c7059cbaa4e9d61b...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,a8c9cca4c116691f1e331a5058e84f05e31696bc4f611c...,4,12,12,314.947917,573.041667,133.0
3,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,793301d78ea43b599acf05d350c8f9e485f5deaa417284...,0.0,847a067597e39838f1f85b0774f44e68b4d6e64d3ec4dd...,a3791e8d85c005b0d9d60d6d3b7e8edd2f256a5cc928d0...,bb660069187af9e9238d10a742def09bf2bed60435b088...,663ca810e6cd391f1a8c24b0e39c4ac59f7e04c4759356...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,09e1d96d9e05b0bcab8bcba3943b899adb5177a8a4fe81...,12,...,12,752e245892cce8aa87b8f41c583f95c7059cbaa4e9d61b...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,a8c9cca4c116691f1e331a5058e84f05e31696bc4f611c...,4,12,12,314.947917,573.041667,133.0
4,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,9e03ca4af958bef1ce5d54e684a40ee7e3a9aa7e5010b6...,0.0,9db35d237f5873f0410d3ca18c07430270086eb1e7838d...,4c8006c7e513057a9138641abc2a9e65f4b014a8045259...,abf367e49718254e068eee51a565fdcede2d741e4c7e33...,61fe255948ec07c4eb25c70f7144b54beddd00466ad866...,ed7da2b533e9ccdf3dcaede4d2b8d170ffc32e2b922ee8...,2b225d501fd0ecd875cc3412eb5a14f96e98abc7b2f6bc...,12,...,12,752e245892cce8aa87b8f41c583f95c7059cbaa4e9d61b...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,a8c9cca4c116691f1e331a5058e84f05e31696bc4f611c...,4,12,12,314.947917,573.041667,133.0


In [66]:
# Drop the name of the three-column working frame; it is rebuilt with fewer columns in a later cell.
del full_data

In [67]:
# Run a garbage-collection pass after deleting full_data.
gc.collect()

110

In [68]:
# Rebuild the combined train + validation frame, keeping only the session/product keys.
# `axis` is passed by keyword because pandas 2.0 rejects the positional form `pd.concat([...], 0)`.
full_data = pd.concat([train, val], axis=0)
full_data = full_data[["session_id", "product_id"]]


In [69]:
# Attach start_online_date to every impression row (left join: unmatched products get NaN).
temp_attr_df_v3 = full_data.merge(attribute_df[["product_id", "start_online_date"]], on="product_id", how="left")


In [70]:
# Per-session mean of start_online_date, plus the number of impression rows in the session
# that have a non-null start_online_date, computed in a single grouped aggregation.
# NOTE(review): the variable name says "price", but both statistics are computed on
# start_online_date; rows without a start_online_date are not counted in count_session_items.
session_wise_mean_price = (
    temp_attr_df_v3
    .groupby('session_id')['start_online_date']
    .agg(mean_session_start_online_date='mean', count_session_items='count')
    .reset_index()
)
# session_wise_mean_price["difference_start_online_date_from_mean_of_other_products_in_session"] = [np.nan]*len(session_wise_mean_price)
session_wise_mean_price
# TODO: Create price diff feature after joining



Unnamed: 0,session_id,mean_session_start_online_date,count_session_items
0,00000310a3874db670d94513217ac53d73be25ea0bc1c9...,375.934028,12
1,00002bea5c0c174264e846ce5beb196fd5b36f1d64e4d8...,315.354167,6
2,000055d87de3ef0e5c614ce4ec6c04a37405d767332477...,582.875000,6
3,00005e9475a26ea9798de9f24b7c43273122c4fd4db841...,401.371528,12
4,0000b16e532d23c2c8a904b438408bbed6728cdf18eed3...,201.541667,12
...,...,...,...
355217,fffe63b765f90ef81cd2362da7c3909cac7679deb8257e...,453.013889,6
355218,fffe8b59f260d5f89660841dc7e9636b9e0a16a02b1720...,209.627315,18
355219,fffeb5fe6042a656a40fad46d80cf4a1778b5598259eac...,681.704861,12
355220,fffec9863201ea4774c9f2e6cc7901bb033de7906e72b3...,216.375000,12


In [71]:
# Sanity check before joining: both frames should report the same number of
# distinct sessions; the current shape of `merged_df` is shown alongside.
(
    session_wise_mean_price["session_id"].nunique(),
    merged_df["session_id"].nunique(),
    merged_df.shape,
)

(355222, 355222, (3313157, 23))

In [72]:
# Attach the per-session `start_online_date` aggregates to every row of `merged_df`.
# This cell overwrites `merged_df` with a merge of itself, so a plain re-run would
# merge the aggregate columns a second time and produce `_x` / `_y` suffixed
# duplicates. Any aggregate columns left by a previous run are therefore dropped
# first (`errors="ignore"` makes this a no-op on the first run).
merged_df = (
    merged_df
    .drop(
        columns=session_wise_mean_price.columns.difference(["session_id"]),
        errors="ignore",
    )
    .merge(session_wise_mean_price, on="session_id", how="left")
)
merged_df.head(2)

Unnamed: 0,session_id,product_id,is_click,season,collection,category_id_l1,category_id_l2,category_id_l3,brand_id,session_season_frequency,...,second_colour,gender,main_colour_freq_in session,second_colour_freq_in session,gender_freq_in session,session_start_online_date_mean,session_start_online_date_max,session_start_online_date_min,mean_session_start_online_date,count_session_items
0,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,1efd18182268101b62a1ea12a9cafbe05487f3abb92924...,0.0,9db35d237f5873f0410d3ca18c07430270086eb1e7838d...,a3791e8d85c005b0d9d60d6d3b7e8edd2f256a5cc928d0...,bb660069187af9e9238d10a742def09bf2bed60435b088...,663ca810e6cd391f1a8c24b0e39c4ac59f7e04c4759356...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,09e1d96d9e05b0bcab8bcba3943b899adb5177a8a4fe81...,12,...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,a8c9cca4c116691f1e331a5058e84f05e31696bc4f611c...,1,12,12,314.947917,573.041667,133.0,314.947917,12
1,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,62349c6eda0dc9fe8bb023213f03ebe93aefa5cbcdfecf...,0.0,9db35d237f5873f0410d3ca18c07430270086eb1e7838d...,4c8006c7e513057a9138641abc2a9e65f4b014a8045259...,abf367e49718254e068eee51a565fdcede2d741e4c7e33...,61fe255948ec07c4eb25c70f7144b54beddd00466ad866...,ed7da2b533e9ccdf3dcaede4d2b8d170ffc32e2b922ee8...,2b225d501fd0ecd875cc3412eb5a14f96e98abc7b2f6bc...,12,...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,a8c9cca4c116691f1e331a5058e84f05e31696bc4f611c...,1,12,12,314.947917,573.041667,133.0,314.947917,12


In [73]:
# Remove the raw categorical attribute columns and the `is_click` label, keeping
# the ids, `category_id_l3`, and the per-session frequency / aggregate features.
# The columns are named through `columns=`: passing `axis` positionally
# (`drop(labels, 1)`) was deprecated in pandas 1.3 and is rejected by pandas 2.0.
merged_df = merged_df.drop(
    columns=[
        "season", "collection", "category_id_l1", "category_id_l2", "brand_id",
        "main_colour", "second_colour", "gender", "is_click",
    ]
)



In [74]:
# Preview the first two rows to confirm which columns remain after the drop.
merged_df.head(2)

Unnamed: 0,session_id,product_id,category_id_l3,session_season_frequency,session_collection_frequency,session_category_id_l1_frequency,session_category_id_l2_frequency,session_brand_id_frequency,main_colour_freq_in session,second_colour_freq_in session,gender_freq_in session,session_start_online_date_mean,session_start_online_date_max,session_start_online_date_min,mean_session_start_online_date,count_session_items
0,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,1efd18182268101b62a1ea12a9cafbe05487f3abb92924...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,12,12,12,12,12,1,12,12,314.947917,573.041667,133.0,314.947917,12
1,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,62349c6eda0dc9fe8bb023213f03ebe93aefa5cbcdfecf...,ed7da2b533e9ccdf3dcaede4d2b8d170ffc32e2b922ee8...,12,12,12,12,12,1,12,12,314.947917,573.041667,133.0,314.947917,12


In [75]:
# Final size of the feature frame plus the number of distinct sessions and products.
(
    merged_df.shape,
    merged_df.loc[:, ["session_id", "product_id"]].nunique(),
)

((3313157, 16),
 session_id    355222
 product_id    423813
 dtype: int64)

In [76]:
# merged_df.to_csv("../preprocessed_data/session_wise_product_attribute_frequency_agg_features.csv", index=False)


In [113]:
# Force a garbage-collection pass before building the frequency tables below.
# NOTE(review): the execution counter jumps from In [113] here to In [14] in the
# next cell, so the following cells appear to have been run in a different
# kernel session — confirm the notebook still works with Restart & Run All.
gc.collect()

22

In [14]:
# One row per train/validation record, reduced to `product_id`; this is the base
# frame for the attribute frequency tables built in the following cells.
# `axis` is passed by keyword: positional arguments to `pd.concat` other than
# `objs` were deprecated in pandas 1.3 and raise a TypeError from pandas 2.0.
full_product_data = pd.concat([train, val], axis=0)[["product_id"]]

In [15]:
# Look up both colour attributes for every train+val row (left join keeps all rows).
colour_data = pd.merge(
    full_product_data,
    attribute_df[["product_id", "main_colour", "second_colour"]],
    how="left",
    on="product_id",
)
colour_data.head(2)

Unnamed: 0,product_id,main_colour,second_colour
0,1efd18182268101b62a1ea12a9cafbe05487f3abb92924...,8770603751ae3b965ab13bdd02c607985cd56be86d850b...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...
1,62349c6eda0dc9fe8bb023213f03ebe93aefa5cbcdfecf...,0f97dafafa5dc4bb18853ea00776dfcc52302f40411b50...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...


In [16]:
# Percentage of train+val rows per `main_colour` value. The denominator is the
# total row count of `colour_data`, not just the rows that have a main colour.
main_colour_freq_data = (
    colour_data.groupby("main_colour")["product_id"]
    .count()
    .div(len(colour_data))
    .mul(100)
    .rename("main_colour_perc")
    .reset_index()
)
main_colour_freq_data.head(2)


Unnamed: 0,main_colour,main_colour_perc
0,0078d3ef1479189e5a922b27e2540a7bb1d10eb9ec0581...,1.529135
1,0a89539f0dde241e4171b2daa052a1ff94c13b85532e0e...,0.582954


In [17]:
# Percentage of train+val rows per `second_colour` value. The denominator is the
# total row count of `colour_data`, not just the rows that have a second colour.
second_colour_freq_data = (
    colour_data.groupby("second_colour")["product_id"]
    .count()
    .div(len(colour_data))
    .mul(100)
    .rename("second_colour_perc")
    .reset_index()
)
second_colour_freq_data.head(2)


Unnamed: 0,second_colour,second_colour_perc
0,0078d3ef1479189e5a922b27e2540a7bb1d10eb9ec0581...,0.837747
1,0a89539f0dde241e4171b2daa052a1ff94c13b85532e0e...,0.301131


In [18]:
# Look up `gender` for every train+val row (left join keeps all rows).
gender_data = pd.merge(
    full_product_data,
    attribute_df[["product_id", "gender"]],
    how="left",
    on="product_id",
)

# Percentage of rows per gender value, relative to the total row count of `gender_data`.
gender_freq_data = (
    gender_data.groupby("gender")["product_id"]
    .count()
    .div(len(gender_data))
    .mul(100)
    .rename("gender_perc")
    .reset_index()
)
gender_freq_data.head()


Unnamed: 0,gender,gender_perc
0,2e7778b3e29aa8df403a1b4fd4c26f848d7a739fd0cb03...,6.207287
1,4a00d8b84bdb2ec2f219304d3883a46336f9fb38d2f1e6...,4.962836
2,51783ef49eeaae84ab72114d1b79f671847df95b30c8e9...,33.387658
3,a8c9cca4c116691f1e331a5058e84f05e31696bc4f611c...,55.442219


In [19]:
# Look up `season` for every train+val row (left join keeps all rows).
season_data = pd.merge(
    full_product_data,
    attribute_df[["product_id", "season"]],
    how="left",
    on="product_id",
)

# Percentage of rows per season value, relative to the total row count of `season_data`.
season_freq_data = (
    season_data.groupby("season")["product_id"]
    .count()
    .div(len(season_data))
    .mul(100)
    .rename("season_perc")
    .reset_index()
)
season_freq_data.head()

Unnamed: 0,season,season_perc
0,4d26e1f9a28a916bccb9fb780cd32f78a8c77027323bef...,1.711296
1,7bf9f2fa2f47fd0bbf7ce2617c5aab212260b07486fa54...,0.011394
2,847a067597e39838f1f85b0774f44e68b4d6e64d3ec4dd...,34.184548
3,89d58bdc4ab58151ccdd5a1444aa86f2ef6dd25a0ebf22...,0.017091
4,9db35d237f5873f0410d3ca18c07430270086eb1e7838d...,63.919515


In [20]:
# Persist the four percentage lookup tables built above (one CSV per attribute).
# `index=False` leaves the positional RangeIndex out of the files, so each CSV
# holds exactly two columns: the attribute value and its `*_perc` share.
main_colour_freq_data.to_csv("../preprocessed_data/main_colour_frequency.csv", index=False)
second_colour_freq_data.to_csv("../preprocessed_data/second_colour_frequency.csv", index=False)
gender_freq_data.to_csv("../preprocessed_data/gender_frequency.csv", index=False)
season_freq_data.to_csv("../preprocessed_data/season_frequency.csv", index=False)


In [21]:
# Look up `collection` for every train+val row (left join keeps all rows).
collection_data = pd.merge(
    full_product_data,
    attribute_df[["product_id", "collection"]],
    how="left",
    on="product_id",
)

# Percentage of rows per collection value, relative to the total row count of `collection_data`.
collection_freq_data = (
    collection_data.groupby("collection")["product_id"]
    .count()
    .div(len(collection_data))
    .mul(100)
    .rename("collection_perc")
    .reset_index()
)
collection_freq_data.head()


Unnamed: 0,collection,collection_perc
0,3e5d88c58066b4182e2eb9e0e2206aaba3a46b193e4788...,0.431495
1,4c8006c7e513057a9138641abc2a9e65f4b014a8045259...,49.364342
2,6611f47aa127beb2b2fcb372eab3945274b5e742af07c2...,0.740516
3,a3791e8d85c005b0d9d60d6d3b7e8edd2f256a5cc928d0...,37.524212
4,b5beb2ed60981746e0b908797b3d3abb8491de15ff5575...,10.131599


In [22]:
# Look up `category_id_l1` for every train+val row (left join keeps all rows).
category_id_l1_data = pd.merge(
    full_product_data,
    attribute_df[["product_id", "category_id_l1"]],
    how="left",
    on="product_id",
)

# Percentage of rows per level-1 category, relative to the total row count of `category_id_l1_data`.
category_id_l1_freq_data = (
    category_id_l1_data.groupby("category_id_l1")["product_id"]
    .count()
    .div(len(category_id_l1_data))
    .mul(100)
    .rename("category_id_l1_perc")
    .reset_index()
)
category_id_l1_freq_data.head()


Unnamed: 0,category_id_l1,category_id_l1_perc
0,01bdbf7ae5ca741c0db30ebea8c606042cbabac5e24217...,0.391926
1,0248828a70652add64d469c493415bb53a03b4b103a59a...,0.029391
2,02cf092784c2b661328d08172a50ad60ee5bbccd4d8d10...,0.357482
3,09c394b9d5af703e13e57e80009cd0fb43b84416ba2e7b...,0.027746
4,174386292851a161e3dd1792776ff0f50123b7d07d93d1...,0.012562


In [23]:
# Look up `category_id_l2` for every train+val row (left join keeps all rows).
category_id_l2_data = pd.merge(
    full_product_data,
    attribute_df[["product_id", "category_id_l2"]],
    how="left",
    on="product_id",
)

# Percentage of rows per level-2 category, relative to the total row count of `category_id_l2_data`.
category_id_l2_freq_data = (
    category_id_l2_data.groupby("category_id_l2")["product_id"]
    .count()
    .div(len(category_id_l2_data))
    .mul(100)
    .rename("category_id_l2_perc")
    .reset_index()
)
category_id_l2_freq_data.head()


Unnamed: 0,category_id_l2,category_id_l2_perc
0,000651f17486a862c075e5a2a0131a750abeea6db55ad3...,0.040403
1,000bdb044ab0dc94653d0b77046c8f6d4d22f2ac342773...,0.01342
2,009a4ca993df6dd9828587d23ba38be14f1f6b7172516d...,0.072917
3,00ebf4a9f1cbba7fe21b3690246defdc438326eebb10d3...,0.054825
4,017f77b88550e40b287c7097544fc432ab8227595ed1ce...,0.000286


In [24]:
# Look up `brand_id` for every train+val row (left join keeps all rows).
brand_id_data = pd.merge(
    full_product_data,
    attribute_df[["product_id", "brand_id"]],
    how="left",
    on="product_id",
)

# Percentage of rows per brand, relative to the total row count of `brand_id_data`.
brand_id_freq_data = (
    brand_id_data.groupby("brand_id")["product_id"]
    .count()
    .div(len(brand_id_data))
    .mul(100)
    .rename("brand_id_perc")
    .reset_index()
)
brand_id_freq_data.head()


Unnamed: 0,brand_id,brand_id_perc
0,001a11cdc499400927ae65d8fad803cff44c64be4035f5...,0.015685
1,003df714e5a9897245f0dcb93f1c04bc025cfa2be37bc7...,0.002813
2,0045d654875981a3692893e40fd72bba8bb054c8b63e84...,4.8e-05
3,004861d660ace63dfd4b549590fcb1a9e3e316a1a45e62...,0.000429
4,0049be50020812a0f5811102c38f97c1e24bee3622bd82...,0.002241


In [25]:
# Look up `season_year` for every train+val row (left join keeps all rows).
season_year_data = pd.merge(
    full_product_data,
    attribute_df[["product_id", "season_year"]],
    how="left",
    on="product_id",
)

# Percentage of rows per season_year value, relative to the total row count of `season_year_data`.
season_year_freq_data = (
    season_year_data.groupby("season_year")["product_id"]
    .count()
    .div(len(season_year_data))
    .mul(100)
    .rename("season_year_perc")
    .reset_index()
)
season_year_freq_data.head()


Unnamed: 0,season_year,season_year_perc
0,02c7f3572648bef96a50c2412c2cb43487e7fca30fff13...,0.483006
1,0687dcdc2271d7bc265408527846d3366f2f3286d8ebba...,0.00081
2,107aa2043f4b5bf7963e36fc12fb0bd2fc43dcc5f835f5...,1.948783
3,1153b2c55ca31b774d98a02968d3d78d43486857de12c8...,0.008367
4,1a0b64aeca0200a1e80147282d2407d167f22e0f81fbc9...,1.849288


In [26]:
# Persist the remaining percentage lookup tables (one CSV per attribute).
# `index=False` leaves the positional RangeIndex out of the files.
# NOTE(review): these files use a `*_freq_data.csv` suffix while the earlier
# colour/gender/season tables were written as `*_frequency.csv` — confirm that
# downstream readers expect both naming patterns.
collection_freq_data.to_csv("../preprocessed_data/collection_freq_data.csv", index=False)
category_id_l1_freq_data.to_csv("../preprocessed_data/category_id_l1_freq_data.csv", index=False)
category_id_l2_freq_data.to_csv("../preprocessed_data/category_id_l2_freq_data.csv", index=False)
brand_id_freq_data.to_csv("../preprocessed_data/brand_id_freq_data.csv", index=False)
season_year_freq_data.to_csv("../preprocessed_data/season_year_freq_data.csv", index=False)
