In [53]:
# ! pip install fastparquet --user
# ! pip install ordered-set

In [54]:
import pandas as pd
import numpy as np
import gc
import swifter
from tqdm import tqdm
from collections import defaultdict
from ordered_set import OrderedSet
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MinMaxScaler, StandardScaler


In [55]:
train = pd.read_parquet("../data_phase1/train.parquet")
val = pd.read_parquet("../data_phase1/validation.parquet")


In [56]:
train = train.fillna(value={"context_type": "NA"})
val = val.fillna(value={"context_type": "NA"})


In [57]:
train.describe(include='O')

Unnamed: 0,query_id,user_id,session_id,product_id,page_type,previous_page_type,device_category,device_platform,user_tier,user_country,context_type,context_value
count,3507990,3507990,3507990,3507990,3507990,3507990,3507990,3507990,3507990,3507990,3507990,3507990
unique,584665,208393,317426,408263,5,23,3,2,6,196,5,189571
top,d0876a02f764a84375acfe8a81a0bb530c761e51baf7f6...,cc83479dd22e19ec45d08805a61b73d7f33a69feaf42be...,f3de26eced2c81d1b0d6da40c11c9f987fe066b5a4f4fd...,55e1495c40504b4b15a358f95e2cbede34d011b287c32b...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...
freq,6,4428,354,1728,3105120,2053848,1853280,1927596,2714292,520824,3105204,152514


In [58]:
val.describe(include='O')

Unnamed: 0,query_id,user_id,session_id,product_id,page_type,previous_page_type,device_category,device_platform,user_tier,user_country,context_type,context_value
count,687192,687192,687192,687192,687192,687192,687192,687192,687192,687192,687192,687192
unique,114532,114532,114532,218525,5,22,3,2,6,188,5,66955
top,d5bb623020ad1524663872fa9016cd4ed2a52368dce978...,bbc4c7157b8493167dd43ae0bc82e7feeb96055ac42c0a...,7e258ed67bc7a56f4cca0bec20433defe8f32c4b33b681...,fe13af44356050cdc93ad3d5e458e24c5077e5bf7a4c12...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...
freq,6,6,6,398,666960,513864,411180,423654,605262,102696,666990,15342


In [59]:
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df


In [60]:
train = reduce_mem_usage(train)
val = reduce_mem_usage(val)


Memory usage of dataframe is 428.22 MB
Memory usage after optimization is: 175.85 MB
Decreased by 58.9%
Memory usage of dataframe is 78.64 MB
Memory usage after optimization is: 47.95 MB
Decreased by 39.0%


In [61]:
gc.collect()

592

In [62]:
train = train.sort_values(by='query_id').reset_index(drop=True)
val = val.sort_values(by='query_id').reset_index(drop=True)


In [63]:
train.head(10)

Unnamed: 0,query_id,user_id,session_id,product_id,page_type,previous_page_type,device_category,device_platform,user_tier,user_country,context_type,context_value,product_price,week,week_day,is_click
0,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,ff2088c2fd3bf9cb8c2601252dd65a99201f59972820d3...,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,1efd18182268101b62a1ea12a9cafbe05487f3abb92924...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d05d943023ee7c2091506a72ce25841e84f88a4233a11d...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,a6e9c2a832972484be2bd4378135743d1d00d651502c38...,0.000466,6,6,0
1,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,ff2088c2fd3bf9cb8c2601252dd65a99201f59972820d3...,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,62349c6eda0dc9fe8bb023213f03ebe93aefa5cbcdfecf...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d05d943023ee7c2091506a72ce25841e84f88a4233a11d...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,a6e9c2a832972484be2bd4378135743d1d00d651502c38...,0.001068,6,6,0
2,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,ff2088c2fd3bf9cb8c2601252dd65a99201f59972820d3...,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,aa0bbdfa55326b5c08d3472b1ee1d56fe13a82f63f46c8...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d05d943023ee7c2091506a72ce25841e84f88a4233a11d...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,a6e9c2a832972484be2bd4378135743d1d00d651502c38...,0.001238,6,6,1
3,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,ff2088c2fd3bf9cb8c2601252dd65a99201f59972820d3...,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,793301d78ea43b599acf05d350c8f9e485f5deaa417284...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d05d943023ee7c2091506a72ce25841e84f88a4233a11d...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,a6e9c2a832972484be2bd4378135743d1d00d651502c38...,0.000465,6,6,0
4,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,ff2088c2fd3bf9cb8c2601252dd65a99201f59972820d3...,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,9e03ca4af958bef1ce5d54e684a40ee7e3a9aa7e5010b6...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d05d943023ee7c2091506a72ce25841e84f88a4233a11d...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,a6e9c2a832972484be2bd4378135743d1d00d651502c38...,0.000936,6,6,0
5,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,ff2088c2fd3bf9cb8c2601252dd65a99201f59972820d3...,37b65411191e2f12441edad785c6ae94741eceaffdcb80...,9e7f6f10b18f35ce924575f599465265e22fa9d75b13c9...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,d05d943023ee7c2091506a72ce25841e84f88a4233a11d...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,a6e9c2a832972484be2bd4378135743d1d00d651502c38...,0.001765,6,6,0
6,00008bcf52759e7e70d533026da637b1cfeb0dc04a67ed...,968f154cc29e3fb1ed5f361ab23d9d8495cf47c5baefef...,f391f67a9d273be5a95460ce4756cdbc22430336a0b28d...,b08c37eac9adf04388a290b13105167b4a50911ac14e5c...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,ca0bdabcd3a4eae25db6f95687cb307200e78892a6d774...,0.000804,7,2,0
7,00008bcf52759e7e70d533026da637b1cfeb0dc04a67ed...,968f154cc29e3fb1ed5f361ab23d9d8495cf47c5baefef...,f391f67a9d273be5a95460ce4756cdbc22430336a0b28d...,7b8c68e43264904ec9f74df2b9461fe7381aab12d0740e...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,ca0bdabcd3a4eae25db6f95687cb307200e78892a6d774...,0.000722,7,2,0
8,00008bcf52759e7e70d533026da637b1cfeb0dc04a67ed...,968f154cc29e3fb1ed5f361ab23d9d8495cf47c5baefef...,f391f67a9d273be5a95460ce4756cdbc22430336a0b28d...,616636f3c60f43d24741d9eca7fe1c3c567825acc64348...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,ca0bdabcd3a4eae25db6f95687cb307200e78892a6d774...,0.000916,7,2,1
9,00008bcf52759e7e70d533026da637b1cfeb0dc04a67ed...,968f154cc29e3fb1ed5f361ab23d9d8495cf47c5baefef...,f391f67a9d273be5a95460ce4756cdbc22430336a0b28d...,01f0a57f4fbc0e42e76960238fce01c28e29f23819ae04...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,ca0bdabcd3a4eae25db6f95687cb307200e78892a6d774...,0.000761,7,2,0


In [64]:
train.isnull().sum()

query_id              0
user_id               0
session_id            0
product_id            0
page_type             0
previous_page_type    0
device_category       0
device_platform       0
user_tier             0
user_country          0
context_type          0
context_value         0
product_price         0
week                  0
week_day              0
is_click              0
dtype: int64

In [65]:
data = pd.concat([train[list(set(train.columns)-set(['is_click']))], val], axis=0)
data = data.reset_index(drop=True)

In [66]:
all_products = data.product_id.apply(str).tolist()

In [67]:
unique_products = OrderedSet(all_products)


In [68]:
train_session_interactions = dict(train.groupby('session_id')['product_id'].apply(list))
val_session_interactions = dict(val.groupby('session_id')['product_id'].apply(list))

train_session_actions = dict(train.groupby('session_id')['is_click'].apply(list))
# val_session_actions = dict(val.groupby('session_id')['is_click'].apply(list))


In [69]:
train['days_elapsed'] = (train['week'].astype(int)-1)*7 + train['week_day'].astype(int)
val['days_elapsed'] = (val['week'].astype(int)-1)*7 + val['week_day'].astype(int)


In [70]:
train['sess_step'] = train.groupby('session_id')['days_elapsed'].rank(method='max').apply(int)
val['sess_step'] = val.groupby('session_id')['days_elapsed'].rank(method='max').apply(int)

train['sess_step'] = train['sess_step']//6
val['sess_step'] = val['sess_step']//6


In [71]:
gc.collect()

66

In [72]:
def get_prod_freq(row):
    cnt = 0
    try:
        for prod in train_session_interactions[row['session_id']]:
            if prod==row['product_id']:
                cnt+=1
    except KeyError:
        pass
    try:
        for prod in val_session_interactions[row['session_id']]:
            if prod==row['product_id']:
                cnt+=1
    except KeyError:
        pass
    return cnt
        

In [73]:
# train['product_session_frequency'] = train.apply(lambda x: get_prod_freq(x), 1)
# val['product_session_frequency'] = val.apply(lambda x: get_prod_freq(x), 1)


In [76]:
gc.collect()

292

In [77]:
data = pd.concat([train, val], 0)

In [24]:
unique_product_interacted_users = pd.DataFrame(data.groupby('product_id')['user_id'].nunique()).sort_values('user_id', ascending=False).reset_index()
unique_product_interacted_users = unique_product_interacted_users.rename(columns={"user_id": "#unique_users_interacted"})
unique_product_interacted_users.head(3)


Unnamed: 0,product_id,#unique_users_interacted
0,55e1495c40504b4b15a358f95e2cbede34d011b287c32b...,1494
1,0c166227b9e707541a81c20864bea7d85d300360347dac...,1338
2,b854610c86034fd48a5f9425daf4b31d0dddc7659e3498...,1296


In [32]:
# unique_product_interacted_users.to_csv("../preprocessed_data/unique_user_interactions_per_product.csv", index=False)


In [33]:
click_data = train[train["is_click"]==1]

In [28]:
unique_product_clicked_users = pd.DataFrame(click_data.groupby('product_id')['user_id'].nunique()).sort_values('user_id', ascending=False).reset_index()
unique_product_clicked_users = unique_product_clicked_users.rename(columns={"user_id": "#unique_users_clicked"})
unique_product_clicked_users.head(3)


Unnamed: 0,product_id,#unique_users_clicked
0,b854610c86034fd48a5f9425daf4b31d0dddc7659e3498...,489
1,f3bee158e5cccca0419223f80fda086032cbb4a77babc1...,370
2,55e1495c40504b4b15a358f95e2cbede34d011b287c32b...,350


In [30]:
# unique_product_clicked_users.to_csv("../preprocessed_data/clickout_features/unique_user_clicks_per_product.csv", index=False)


In [37]:
mean_query_price = pd.DataFrame(data.groupby(['query_id'])['product_price'].mean()).reset_index()
mean_query_price = mean_query_price.rename(columns={"product_price": "mean_query_price"})

max_query_price = pd.DataFrame(data.groupby(['query_id'])['product_price'].max()).reset_index()
max_query_price = max_query_price.rename(columns={"product_price": "max_query_price"})

min_query_price = pd.DataFrame(data.groupby(['query_id'])['product_price'].min()).reset_index()
min_query_price = min_query_price.rename(columns={"product_price": "min_query_price"})


In [40]:
min_query_price

Unnamed: 0,query_id,min_query_price
0,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,0.000465
1,00004e487e05ff29ac41a930f9ed972fe4cbf6a7a17794...,0.000110
2,00008bcf52759e7e70d533026da637b1cfeb0dc04a67ed...,0.000690
3,000090cdd1f86fbf8bcf0e4d1be90e651a9062082faf38...,0.001185
4,0000bd4862d5d5d470bebfe7a7b8049e25a5294d5325e7...,0.000331
...,...,...
699192,ffff796259982a7e6ede97e32748cda8449a238b2a49b8...,0.000562
699193,ffffc6d940f7a899f08a597f83ddb431e68145f6326fc6...,0.002060
699194,ffffd03fca468d248b8652c874482d3e1a2068fa207f30...,0.000574
699195,fffff0cd0ff76059499a937ed7bcc7206d4bb75dd7d909...,0.000656


In [44]:
query_price_df = mean_query_price.merge(max_query_price, on="query_id", how="left")
query_price_df = query_price_df.merge(min_query_price, on="query_id", how="left")


In [47]:
query_price_df.head(3)


Unnamed: 0,query_id,mean_query_price,max_query_price,min_query_price
0,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,0.00099,0.001765,0.000465
1,00004e487e05ff29ac41a930f9ed972fe4cbf6a7a17794...,0.000392,0.000941,0.00011
2,00008bcf52759e7e70d533026da637b1cfeb0dc04a67ed...,0.000787,0.000916,0.00069


In [48]:
# query_price_df.to_csv("../preprocessed_data/query_wise_price_features.csv", index=False)


In [78]:
attribute_df = pd.read_parquet("../data_phase1/attributes.parquet")

In [79]:
attribute_df.head(2)

Unnamed: 0,product_id,gender,main_colour,second_colour,season,collection,category_id_l1,category_id_l2,category_id_l3,brand_id,season_year,start_online_date,material_values,attribute_values
0,0013f07ccdf212210c110e63f0de46e37669c17a4d855a...,a8c9cca4c116691f1e331a5058e84f05e31696bc4f611c...,7673fc4fdc325f3785a223787d2b32e381e8b4c1c8a765...,4737cd35940c2338e96c18a25aeb6848d46f0da795bce8...,847a067597e39838f1f85b0774f44e68b4d6e64d3ec4dd...,a3791e8d85c005b0d9d60d6d3b7e8edd2f256a5cc928d0...,abf367e49718254e068eee51a565fdcede2d741e4c7e33...,61fe255948ec07c4eb25c70f7144b54beddd00466ad866...,05f872d8b8ec85642ad49786d6e443c0df6e7df4bdcba3...,dd6ea8954a945ef0889f30d57b7fdb8d6aaad397e6c6ff...,c7c4ac6af030e54d02b9e4545e4223e76515c3ce4e498e...,1067.041667,f61ecea9b45f1590e57706b88207449bdd4cb703b917ad...,8b45c5d5e010acf257787f2ce0c505857d94709c436991...
1,002239cd57f19f22e557030dff363dfbd1344d8f7ac829...,4a00d8b84bdb2ec2f219304d3883a46336f9fb38d2f1e6...,0f97dafafa5dc4bb18853ea00776dfcc52302f40411b50...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...,9db35d237f5873f0410d3ca18c07430270086eb1e7838d...,a3791e8d85c005b0d9d60d6d3b7e8edd2f256a5cc928d0...,e54f8513b708db3afdbd4950bd3420579a8cddabf4c1b3...,3809cade495cd7dc289e6aee521d380549ebd3456f03bc...,fd021cd2dbaf0d7b6105a1b136cf5a094e025010a2096f...,a6536c6bc250d525ccd3b63a3ec483a33a2010422932a3...,ef7d8d2e02aec8c328dafde95486f7181c37d07be3d167...,215.041667,1675f293342bbb518ba3a5ad39399aa0a13580653d253e...,ca8c396e7422e324d4454a911e0319d07b85a4fb89b006...


In [50]:
start_online_attribute_data = data[["query_id", "product_id"]].merge(attribute_df[["product_id", "start_online_date"]], on="product_id", how="left")



In [53]:
start_online_attribute_data.head(3)

Unnamed: 0,query_id,product_id,start_online_date
0,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,1efd18182268101b62a1ea12a9cafbe05487f3abb92924...,573.041667
1,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,62349c6eda0dc9fe8bb023213f03ebe93aefa5cbcdfecf...,262.041667
2,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,aa0bbdfa55326b5c08d3472b1ee1d56fe13a82f63f46c8...,226.041667


In [58]:
mean_query_date = pd.DataFrame(start_online_attribute_data.groupby(['query_id'])['start_online_date'].mean()).reset_index()
mean_query_date = mean_query_date.rename(columns={"start_online_date": "mean_query_start_online_date"})

max_query_date = pd.DataFrame(start_online_attribute_data.groupby(['query_id'])['start_online_date'].max()).reset_index()
max_query_date = max_query_date.rename(columns={"start_online_date": "max_query_start_online_date"})

min_query_date = pd.DataFrame(start_online_attribute_data.groupby(['query_id'])['start_online_date'].min()).reset_index()
min_query_date = min_query_date.rename(columns={"start_online_date": "min_query_start_online_date"})



In [59]:
query_date_df = mean_query_date.merge(max_query_date, on="query_id", how="left")
query_date_df = query_date_df.merge(min_query_date, on="query_id", how="left")


In [61]:
query_date_df.head(3)

Unnamed: 0,query_id,mean_query_start_online_date,max_query_start_online_date,min_query_start_online_date
0,0000437da0efd6e292f12a8a3bf7525de5bd05b5807a74...,310.194444,573.041667,133.0
1,00004e487e05ff29ac41a930f9ed972fe4cbf6a7a17794...,549.513889,1393.041667,217.041667
2,00008bcf52759e7e70d533026da637b1cfeb0dc04a67ed...,217.694444,444.0,141.0


In [62]:
# query_date_df.to_csv("../preprocessed_data/query_wise_start_online_date_features.csv", index=False)

In [34]:
temp = pd.DataFrame(click_data.groupby(["product_id"])["days_elapsed"].apply(list)).reset_index()


In [44]:
def _func(row):
    return True if len(row["days_elapsed"])>0 else False

temp["FLAG"] = temp.swifter.apply(lambda x: _func(x), 1)
temp[temp["FLAG"]==True]

HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=408263.0), HTML(value='')))




Unnamed: 0,product_id,days_elapsed,FLAG
2,00007316b79817657a13be236d31f5ec3970fb0e7d66e0...,"[4, 41, 48]",True
4,00008f66579dee0738eb52386a02af5b5d9b16d6bf05f4...,"[18, 19, 42, 46, 18, 45]",True
7,0000c500714eb26669951652d28109b1455ce8bba6b8b8...,[16],True
9,00011f122f6e19ba6b281c17fd2b1d4d39ba65cb044a5a...,[8],True
10,000143db2d98ad628acef989b959628fb036e25b51bde0...,"[40, 31, 34, 35]",True
...,...,...,...
408243,fffd3f70e72cf5995840cee856449f7586f66a1403242d...,"[27, 43]",True
408247,fffdc5189877268716c6e2ec9ebc82721660c2ef897d0b...,"[43, 27, 33, 37, 42, 4, 19, 20, 16, 30, 16, 49...",True
408248,fffdd43cf561e5d011f5046756ec5bbb390aa3f63a10f5...,[40],True
408251,fffe6dff9f30d3a38be0617cd553a34636ff5b7c3d107f...,"[43, 32]",True


In [47]:
temp = temp[temp["FLAG"]==True]

In [48]:
def get_last_click_days_elapsed(row):
    return max(row["days_elapsed"])

temp["last_clickout_days_elapsed"] = temp.swifter.apply(lambda x: get_last_click_days_elapsed(x), 1)


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=209120.0), HTML(value='')))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [51]:
temp.head()

Unnamed: 0,product_id,last_clickout_days_elapsed
2,00007316b79817657a13be236d31f5ec3970fb0e7d66e0...,48
4,00008f66579dee0738eb52386a02af5b5d9b16d6bf05f4...,46
7,0000c500714eb26669951652d28109b1455ce8bba6b8b8...,16
9,00011f122f6e19ba6b281c17fd2b1d4d39ba65cb044a5a...,8
10,000143db2d98ad628acef989b959628fb036e25b51bde0...,40


In [50]:
temp = temp.drop(["days_elapsed", "FLAG"], 1)

In [52]:
# temp.to_csv("../preprocessed_data/clickout_features/product_wise_last_clickout_days_elapsed.csv", index=False)

In [114]:
clickout_price_list = pd.DataFrame(click_data.sort_values(["product_id", "days_elapsed"]).groupby("product_id")["product_price"].apply(list)).reset_index()
clickout_price_list


Unnamed: 0,product_id,product_price
0,00002185a176ed9deaabd2f8b5eceb6a3039f1c8596e45...,[]
1,000029d3d499cbd642ac8d2b79028d4d1e048f915cdfcd...,[]
2,00007316b79817657a13be236d31f5ec3970fb0e7d66e0...,"[0.0013399124145507812, 0.0013570785522460938,..."
3,00008a8e96b8602a19be7341577ae45e9bfee0716b6b12...,[]
4,00008f66579dee0738eb52386a02af5b5d9b16d6bf05f4...,"[0.00012564659118652344, 0.0001256465911865234..."
...,...,...
408258,ffff8cf68cb0fc18441257c733708bee962813cf75b55c...,[]
408259,ffff97f6b6a67cb388f622ed66bc113d881e4782727cfd...,[]
408260,ffffae28c60c57762e0a79d5e3723ed4f2f7ff88cc1d9e...,[]
408261,ffffb9d4dba27082692d06dbef04556a09e988ea8a7a15...,[]


In [115]:
def _func(row):
    return True if len(row["product_price"])>0 else False

clickout_price_list["FLAG"] = clickout_price_list.swifter.apply(lambda x: _func(x), 1)
clickout_price_list[clickout_price_list["FLAG"]==True]


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=408263.0), HTML(value='')))




Unnamed: 0,product_id,product_price,FLAG
2,00007316b79817657a13be236d31f5ec3970fb0e7d66e0...,"[0.0013399124145507812, 0.0013570785522460938,...",True
4,00008f66579dee0738eb52386a02af5b5d9b16d6bf05f4...,"[0.00012564659118652344, 0.0001256465911865234...",True
7,0000c500714eb26669951652d28109b1455ce8bba6b8b8...,[0.00015747547149658203],True
9,00011f122f6e19ba6b281c17fd2b1d4d39ba65cb044a5a...,[8.83340835571289e-05],True
10,000143db2d98ad628acef989b959628fb036e25b51bde0...,"[0.0019702911376953125, 0.00197601318359375, 0...",True
...,...,...,...
408243,fffd3f70e72cf5995840cee856449f7586f66a1403242d...,"[0.000553131103515625, 0.0005626678466796875]",True
408247,fffdc5189877268716c6e2ec9ebc82721660c2ef897d0b...,"[0.00026488304138183594, 0.0002667903900146484...",True
408248,fffdd43cf561e5d011f5046756ec5bbb390aa3f63a10f5...,[0.0006232261657714844],True
408251,fffe6dff9f30d3a38be0617cd553a34636ff5b7c3d107f...,"[0.00022327899932861328, 0.00022780895233154297]",True


In [116]:
clickout_price_list = clickout_price_list[clickout_price_list["FLAG"]==True]


In [117]:
def get_last_click_product_price(row):
    return row["product_price"][-1]

clickout_price_list["last_clickout_product_price"] = clickout_price_list.swifter.apply(lambda x: get_last_click_product_price(x), 1)



HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=209120.0), HTML(value='')))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [118]:
clickout_price_list = clickout_price_list.drop(["product_price", "FLAG"], 1)

In [119]:
# clickout_price_list.to_csv("../preprocessed_data/clickout_features/product_wise_last_clickout_product_price.csv", index=False)


In [120]:
clickout_price_list.shape

(209120, 2)

In [103]:
vals1 = [1,2,3,4,4,5,5,5,6,7,1,8,2]
vals2 = [1,2,3,4,5,6,7,8,9,10,11,12,13,1,2,3,4,5,6,7,8,9,10,11,12,13]
vals3 = [1,1,1,2,2,3,4,5,3,4,5,6,8]
k = pd.DataFrame({'aa': ['a']*len(vals1)+['b']*len(vals3), 'bb': vals2, 'values': vals1+vals3})


In [104]:
from sklearn.utils import shuffle
k = shuffle(k)
k

Unnamed: 0,aa,bb,values
0,a,1,1
25,b,13,8
12,a,13,2
23,b,11,5
4,a,5,4
7,a,8,5
19,b,7,4
18,b,6,3
8,a,9,6
1,a,2,2


In [105]:
k.sort_values(['aa', 'bb']).groupby('aa')["values"].apply(list)

aa
a    [1, 2, 3, 4, 4, 5, 5, 5, 6, 7, 1, 8, 2]
b    [1, 1, 1, 2, 2, 3, 4, 5, 3, 4, 5, 6, 8]
Name: values, dtype: object