In [1]:
# Load Kedro's IPython extension into this kernel.
%load_ext kedro.extras.extensions.ipython

The kedro.extras.extensions.ipython extension is already loaded. To reload it, use:
  %reload_ext kedro.extras.extensions.ipython


In [2]:
# Reload the Kedro project; the log output of this cell reports that it defines
# the globals `context`, `session`, `catalog` and `pipelines`.
%reload_kedro

2022-08-11 13:09:39,612 - kedro.framework.hooks.manager - INFO - Registered hooks from 1 installed plugin(s): kedro-mlflow-0.8.1
2022-08-11 13:09:39,655 - kedro.framework.session.store - INFO - `read()` not implemented for `BaseSessionStore`. Assuming empty store.
2022-08-11 13:09:40,157 - root - INFO - ** Kedro project GetInData ML Framework
2022-08-11 13:09:40,158 - root - INFO - Defined global variable `context`, `session`, `catalog` and `pipelines`
2022-08-11 13:09:40,168 - root - INFO - Registered line magic `run_viz`
2022-08-11 13:09:40,169 - root - INFO - Registered line magic `reload_kedro_mlflow`


In [3]:
import pandas as pd
import numpy as np

In [4]:
def _format_four_decimals(value):
    """Render a number with exactly four digits after the decimal point."""
    return '%.4f' % value


# Show every float in pandas output with four decimals.
pd.set_option('display.float_format', _format_four_decimals)

In [5]:
# List the names of all entries registered in the catalog
# (datasets as well as `params:` entries, as the output shows).
catalog.list()

['articles',
 'customers',
 'transactions',
 'customers_sample',
 'transactions_sample',
 'articles_sample',
 'input_images',
 'resized_images',
 'image_embeddings',
 'text_embeddings',
 'parameters',
 'params:image_embeddings_inference',
 'params:image_embeddings_inference.run_id',
 'params:image_embeddings_inference.image_path',
 'params:image_embeddings_inference.batch_size',
 'params:customers',
 'params:transactions',
 'params:articles',
 'params:image_embeddings',
 'params:image_embeddings.image_path',
 'params:image_embeddings.encoder',
 'params:image_embeddings.decoder',
 'params:image_embeddings.batch_size',
 'params:image_embeddings.image_size',
 'params:image_embeddings.embedding_size',
 'params:image_embeddings.num_epochs',
 'params:image_embeddings.shuffle_reconstructions',
 'params:image_embeddings.save_model',
 'params:image_embeddings.model_name',
 'params:image_embeddings.seed',
 'params:image_resizer',
 'params:image_resizer.output_size',
 'params:image_resizer.method

In [8]:
# Load the `articles_sample` catalog entry (a CSVDataSet according to the load log).
articles = context.catalog.load('articles_sample')

2022-08-11 13:10:29,528 - kedro.io.data_catalog - INFO - Loading data from `articles_sample` (CSVDataSet)...


In [9]:
# Load the `customers_sample` catalog entry (a CSVDataSet according to the load log).
customers = context.catalog.load('customers_sample')

2022-08-11 13:10:29,773 - kedro.io.data_catalog - INFO - Loading data from `customers_sample` (CSVDataSet)...


In [10]:
# Load the `transactions_sample` catalog entry (a CSVDataSet according to the load log).
transactions = context.catalog.load('transactions_sample')

2022-08-11 13:10:29,975 - kedro.io.data_catalog - INFO - Loading data from `transactions_sample` (CSVDataSet)...


In [11]:
# (rows, columns) of the three sample frames, in the order articles, customers, transactions
tuple(frame.shape for frame in (articles, customers, transactions))

((21108, 25), (137198, 7), (157785, 5))

### articles, customer features (non-interaction)

##### count -> user-item, user-category of last week/month/season/same week of last year

In [18]:
# number of transaction rows with a non-null article_id, per customer_id
(
    transactions
    .groupby('customer_id')['article_id']
    .count()
    .rename('count_of_article_per_customer')
    .reset_index()
)

Unnamed: 0,customer_id,count_of_article_per_customer
0,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,1
1,0000b2f1829e23b24feec422ef13df3ccedaedc85368e6...,2
2,000346516dd355b40badca0c0f5f37a318ddae31f0e0f7...,3
3,000531ecd2754a0cf214373cc7b6dbcf7ce9a77c3284a2...,1
4,000989f72a2b8e5da2f4abafc86c2e213816fa2ff2a060...,4
...,...,...
49140,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,5
49141,fff69e3ab1bd701315881c7706c38d5cbcda1f4e0cf213...,2
49142,fff94b7e15cd61eaf699f0e7d29e4023d28733119f323d...,2
49143,fffdd9c81f64c0228a54e1c0240cf40877acec6ce985d4...,3


In [72]:
# number of distinct product_group_name values each customer has bought from
# (transactions whose article_id has no match in `articles` drop out of the inner merge)
(
    transactions
    .merge(articles.loc[:, ['article_id', 'product_group_name']], on='article_id')
    .groupby('customer_id')['product_group_name']
    .nunique()
    .reset_index(name='count_of_product_group_name_per_customer')
)

Unnamed: 0,customer_id,count_of_product_group_name_per_customer
0,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,1
1,0000b2f1829e23b24feec422ef13df3ccedaedc85368e6...,1
2,000346516dd355b40badca0c0f5f37a318ddae31f0e0f7...,3
3,000531ecd2754a0cf214373cc7b6dbcf7ce9a77c3284a2...,1
4,000989f72a2b8e5da2f4abafc86c2e213816fa2ff2a060...,3
...,...,...
49140,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,5
49141,fff69e3ab1bd701315881c7706c38d5cbcda1f4e0cf213...,1
49142,fff94b7e15cd61eaf699f0e7d29e4023d28733119f323d...,1
49143,fffdd9c81f64c0228a54e1c0240cf40877acec6ce985d4...,1


##### time -> first, last days of transactions

In [25]:
# Parse the transaction date column into pandas datetimes (safe to re-run).
transactions['t_dat'] = transactions['t_dat'].pipe(pd.to_datetime)

In [26]:
# Latest transaction date in the data; the "days since ..." cells subtract from it.
max_date = transactions['t_dat'].max()

In [27]:
# Display the reference date.
max_date

Timestamp('2020-09-22 00:00:00')

In [40]:
# whole days between each customer's earliest transaction and max_date
(
    transactions
    .groupby('customer_id')['t_dat']
    .min()
    .pipe(lambda first_purchase: max_date - first_purchase)
    .dt.days
    .reset_index(name='days_since_first_transaction')
)

Unnamed: 0,customer_id,days_since_first_transaction
0,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,153
1,0000b2f1829e23b24feec422ef13df3ccedaedc85368e6...,91
2,000346516dd355b40badca0c0f5f37a318ddae31f0e0f7...,92
3,000531ecd2754a0cf214373cc7b6dbcf7ce9a77c3284a2...,95
4,000989f72a2b8e5da2f4abafc86c2e213816fa2ff2a060...,123
...,...,...
49140,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,55
49141,fff69e3ab1bd701315881c7706c38d5cbcda1f4e0cf213...,60
49142,fff94b7e15cd61eaf699f0e7d29e4023d28733119f323d...,105
49143,fffdd9c81f64c0228a54e1c0240cf40877acec6ce985d4...,109


In [41]:
# whole days between each customer's most recent transaction and max_date
(
    transactions
    .groupby('customer_id')['t_dat']
    .max()
    .pipe(lambda last_purchase: max_date - last_purchase)
    .dt.days
    .reset_index(name='days_since_last_transaction')
)

Unnamed: 0,customer_id,days_since_last_transaction
0,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,153
1,0000b2f1829e23b24feec422ef13df3ccedaedc85368e6...,24
2,000346516dd355b40badca0c0f5f37a318ddae31f0e0f7...,53
3,000531ecd2754a0cf214373cc7b6dbcf7ce9a77c3284a2...,95
4,000989f72a2b8e5da2f4abafc86c2e213816fa2ff2a060...,102
...,...,...
49140,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,11
49141,fff69e3ab1bd701315881c7706c38d5cbcda1f4e0cf213...,60
49142,fff94b7e15cd61eaf699f0e7d29e4023d28733119f323d...,105
49143,fffdd9c81f64c0228a54e1c0240cf40877acec6ce985d4...,57


In [57]:
# Average gap between consecutive purchase days, per customer.
# One row per (customer_id, t_dat), newest day first within each customer.
sorted_transactions = (
    transactions
    .sort_values(by=['customer_id', 't_dat'], ascending=[True, False])
    .loc[:, ['customer_id', 't_dat']]
    .drop_duplicates()
)
# The row directly above holds the same customer's next (later) purchase day,
# or the last row of the previous customer, which the mask below discards.
row_above = sorted_transactions.shift(1)
sorted_transactions = sorted_transactions.assign(
    t_dat_next=row_above['t_dat'],
    customer_id_next=row_above['customer_id'],
    # per-row gap; it only becomes an average after the groupby below
    avg_purchase_span=row_above['t_dat'] - sorted_transactions['t_dat'],
)
same_customer = sorted_transactions['customer_id'].eq(sorted_transactions['customer_id_next'])
(
    sorted_transactions[same_customer]
    .groupby('customer_id')['avg_purchase_span']
    .mean()
    # .dt.days keeps only the whole-day part of the mean gap
    .dt.days
    .reset_index()
)

Unnamed: 0,customer_id,avg_purchase_span
0,0000b2f1829e23b24feec422ef13df3ccedaedc85368e6...,67
1,000346516dd355b40badca0c0f5f37a318ddae31f0e0f7...,39
2,000989f72a2b8e5da2f4abafc86c2e213816fa2ff2a060...,10
3,000fab9caf56e8060680e5fd82d709ade7496318832201...,72
4,00159bfc26c2cf788c09789f006fa698b24d0f1dbd8309...,32
...,...,...
22183,fff187d1386edced8ef49b1df0155241943c9c4cc7abbf...,41
22184,fff3573d9131d15da6a46c1ca8f03b5d37e4f6b804171e...,18
22185,fff4381593e170ca0aea188998487c830d9a4070c9ec4b...,49
22186,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,22


In [58]:
# TODO: average/median purchase span for each customer/article pair
# (not implemented yet; the print below is only a placeholder)
print('todo later')

todo later


##### sales_channel_id percentage

In [61]:
# number of transactions per sales channel
transactions['sales_channel_id'].value_counts()

2    113798
1     43987
Name: sales_channel_id, dtype: int64

In [69]:
# share of each customer's transactions flagged by `sales_channel_id - 1`
# NOTE(review): the flag is 0 for channel 1 and 1 for channel 2, so this is the
# share of channel-2 transactions -- confirm that channel 2 is the offline channel.
(
    transactions['sales_channel_id']
    .sub(1)
    .groupby(transactions['customer_id'])
    .mean()
    .reset_index(name='perc_customer_sales_offline')
)

Unnamed: 0,customer_id,perc_customer_sales_offline
0,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,1.0000
1,0000b2f1829e23b24feec422ef13df3ccedaedc85368e6...,0.0000
2,000346516dd355b40badca0c0f5f37a318ddae31f0e0f7...,0.3333
3,000531ecd2754a0cf214373cc7b6dbcf7ce9a77c3284a2...,1.0000
4,000989f72a2b8e5da2f4abafc86c2e213816fa2ff2a060...,1.0000
...,...,...
49140,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,1.0000
49141,fff69e3ab1bd701315881c7706c38d5cbcda1f4e0cf213...,1.0000
49142,fff94b7e15cd61eaf699f0e7d29e4023d28733119f323d...,1.0000
49143,fffdd9c81f64c0228a54e1c0240cf40877acec6ce985d4...,1.0000


In [70]:
# share of each article's transactions flagged by `sales_channel_id - 1`
# NOTE(review): the flag is 0 for channel 1 and 1 for channel 2, so this is the
# share of channel-2 transactions -- confirm that channel 2 is the offline channel.
(
    transactions['sales_channel_id']
    .sub(1)
    .groupby(transactions['article_id'])
    .mean()
    .reset_index(name='perc_article_sales_offline')
)

Unnamed: 0,article_id,perc_article_sales_offline
0,0108775015,0.0000
1,0110065002,0.0000
2,0123173001,0.5682
3,0129085001,0.0000
4,0129085026,0.0000
...,...,...
6775,0945600001,1.0000
6776,0945995002,0.4500
6777,0946748001,0.0000
6778,0946764002,1.0000


##### item attributes count

In [75]:
# TODO: item attribute counts over the last 7/30/90 days (placeholder cell)
print('todo - attribute last 7/30/90 days')

todo - attribute last 7/30/90 days


##### has bought from this category

In [74]:
# one row per (customer_id, product_group_name) pair seen in the transactions,
# marked with a constant 1 flag
(
    transactions
    .merge(articles.loc[:, ['article_id', 'product_group_name']], on='article_id')
    .loc[:, ['customer_id', 'product_group_name']]
    .drop_duplicates()
    .assign(has_bought_prod_group=1)
)

Unnamed: 0,customer_id,product_group_name,has_bought_prod_group
0,0090852ad6a446ccf02a7d57ec7385c360ab33a813b870...,Garment Upper body,1
1,02d85f158c762390dd85c03f661ba0da839dc77fdfcb74...,Garment Upper body,1
2,218f9e1dd7e16d5fb334f04e5ff0e00cfadf5a538a2073...,Garment Upper body,1
3,695cf42cd265a49672ed66412c7929a5770c1d3445efab...,Garment Upper body,1
4,6df54cad7fb153888242c3a4891b8adb0654c5696e9a95...,Garment Upper body,1
...,...,...,...
157779,a2d7c7d05b2db3ad9e5dc611746208272bb93b39f54434...,Underwear,1
157780,b02bd7f6859bde6124b6748638fc78789e0e092d282fe2...,Socks & Tights,1
157782,d9dc75b5ba9fa1a3947fd743dadb3b404b9d38ff790525...,Garment Upper body,1
157783,eebf05b7cc90e72b62da284ded1234db8b4e3a2a06978c...,Garment Lower body,1


- count of user-item and user-item attributes in the last 7/30/all days

- percentage of items with the same attributes as the candidate item among the customer’s past purchases

- age-product combination features: products popularity under each age group

- difference/ratio -> difference between the customer’s age and the mean age of the customers who purchased the item; ratio of one user’s purchased item count to the item’s count
- similarity -> CF score of item2item, cosine similarity of item2item (word2vec), cosine similarity of user2item (ProNE graph embedding)
- scores used in retrieval strategies used as FEATURES for ranking model
- USER basic features - num, price, sales_channel_id
- PRODUCT basic features - statistics based on EACH ATTRIBUTE of the product including times, price, age, sales_channel_id, FN, Active, club_member_status, last_purchase_time, average_purchase_interval
- USER-PRODUCT INTERACTION features - statistics based on each attributes
- user-product REPURCHASE features - whether user will repurchase product and whether product will be repurchased
- higher-order combinational features - for example, predict when the user will next purchase the product
- itemCF - calculate the similarity of each item through itemCF, and then calculate the score whether user will buy the product
- SIMILARITY BASED MEASURES - average similarity of items vs the items that customers bought
- streaks - number of times a customer bought a product/category/section IN A ROW
- DAY DIFF of user-item and user-item attributes last seen in the transactions
- SIMILARITIES between user and item:
  - BPR factorization user2item similarity
  - word2vec item2item similarity
  - Jaccard similarity between the attributes of items purchased and the attributes of the target item
  - text/image similarity by the embeddings
- TIME WEIGHTED POPULARITY - each purchase weighted by 1 / (days since the purchase date)
- features about RECALL STRATEGY:
  - whether this article is recalled by strategy_name
  - the rank of this article under the strategy_name
- cosine similarity between EACH PAIR of customer and article,
- aggregated features of the customer’s purchase history for this particular article
- MEAN EMBEDDING for each customer based on the CUSTOMER’s TRANSACTION HISTORY -> COSINE DISTANCE - how close the customer and the article are
- last purchase date with the same attributes as the candidate item
- purchase rate of the candidate item by customers of the same AGE
- collaborative filtering features