In [35]:
%load_ext kedro.extras.extensions.ipython

The kedro.extras.extensions.ipython extension is already loaded. To reload it, use:
  %reload_ext kedro.extras.extensions.ipython


In [2]:
%reload_kedro

2022-08-12 08:39:52,136 - kedro.framework.hooks.manager - INFO - Registered hooks from 1 installed plugin(s): kedro-mlflow-0.8.1
2022-08-12 08:39:52,178 - kedro.framework.session.store - INFO - `read()` not implemented for `BaseSessionStore`. Assuming empty store.
2022-08-12 08:39:52,674 - root - INFO - ** Kedro project GetInData ML Framework
2022-08-12 08:39:52,674 - root - INFO - Defined global variable `context`, `session`, `catalog` and `pipelines`
2022-08-12 08:39:52,684 - root - INFO - Registered line magic `run_viz`
2022-08-12 08:39:52,685 - root - INFO - Registered line magic `reload_kedro_mlflow`


In [3]:
import pandas as pd
import numpy as np

In [4]:
pd.set_option('display.float_format', lambda x: '%.4f' % x)

In [5]:
catalog.list()

['articles',
 'customers',
 'transactions',
 'customers_sample',
 'transactions_sample',
 'articles_sample',
 'input_images',
 'resized_images',
 'image_embeddings',
 'text_embeddings',
 'parameters',
 'params:image_embeddings_inference',
 'params:image_embeddings_inference.run_id',
 'params:image_embeddings_inference.image_path',
 'params:image_embeddings_inference.batch_size',
 'params:customers',
 'params:transactions',
 'params:articles',
 'params:image_embeddings',
 'params:image_embeddings.image_path',
 'params:image_embeddings.encoder',
 'params:image_embeddings.decoder',
 'params:image_embeddings.batch_size',
 'params:image_embeddings.image_size',
 'params:image_embeddings.embedding_size',
 'params:image_embeddings.num_epochs',
 'params:image_embeddings.shuffle_reconstructions',
 'params:image_embeddings.save_model',
 'params:image_embeddings.model_name',
 'params:image_embeddings.seed',
 'params:image_resizer',
 'params:image_resizer.output_size',
 'params:image_resizer.method

In [6]:
articles = context.catalog.load('articles_sample')

2022-08-12 08:39:54,376 - kedro.io.data_catalog - INFO - Loading data from `articles_sample` (CSVDataSet)...


In [7]:
customers = context.catalog.load('customers_sample')

2022-08-12 08:39:54,846 - kedro.io.data_catalog - INFO - Loading data from `customers_sample` (CSVDataSet)...


In [8]:
transactions = context.catalog.load('transactions_sample')

2022-08-12 08:39:55,561 - kedro.io.data_catalog - INFO - Loading data from `transactions_sample` (CSVDataSet)...


In [9]:
articles.shape, customers.shape, transactions.shape

((21108, 25), (137198, 7), (157785, 5))

### articles, customer features (non-interaction)

##### count -> user-item, user-category of last week/month/season/same week of last year

In [10]:
# count of article id per customer_id
transactions.groupby(['customer_id'])['article_id'].count().reset_index(name='count_of_article_per_customer')

Unnamed: 0,customer_id,count_of_article_per_customer
0,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,1
1,0000b2f1829e23b24feec422ef13df3ccedaedc85368e6...,2
2,000346516dd355b40badca0c0f5f37a318ddae31f0e0f7...,3
3,000531ecd2754a0cf214373cc7b6dbcf7ce9a77c3284a2...,1
4,000989f72a2b8e5da2f4abafc86c2e213816fa2ff2a060...,4
...,...,...
49140,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,5
49141,fff69e3ab1bd701315881c7706c38d5cbcda1f4e0cf213...,2
49142,fff94b7e15cd61eaf699f0e7d29e4023d28733119f323d...,2
49143,fffdd9c81f64c0228a54e1c0240cf40877acec6ce985d4...,3


In [11]:
# count of prod_group_name per customer_id
(transactions
    .merge(articles[['article_id', 'product_group_name']], on='article_id')[['customer_id', 'product_group_name']]
    .drop_duplicates()
    .groupby(['customer_id'])['product_group_name']
    .count()
    .reset_index(name='count_of_product_group_name_per_customer')
)

Unnamed: 0,customer_id,count_of_product_group_name_per_customer
0,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,1
1,0000b2f1829e23b24feec422ef13df3ccedaedc85368e6...,1
2,000346516dd355b40badca0c0f5f37a318ddae31f0e0f7...,3
3,000531ecd2754a0cf214373cc7b6dbcf7ce9a77c3284a2...,1
4,000989f72a2b8e5da2f4abafc86c2e213816fa2ff2a060...,3
...,...,...
49140,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,5
49141,fff69e3ab1bd701315881c7706c38d5cbcda1f4e0cf213...,1
49142,fff94b7e15cd61eaf699f0e7d29e4023d28733119f323d...,1
49143,fffdd9c81f64c0228a54e1c0240cf40877acec6ce985d4...,1


##### time -> first, last days of transactions

In [12]:
transactions['t_dat'] = pd.to_datetime(transactions['t_dat'])

In [13]:
max_date = transactions['t_dat'].max()

In [14]:
max_date

Timestamp('2020-09-22 00:00:00')

In [15]:
# days since first transaction for each customer
(max_date - 
    (transactions
        .groupby(['customer_id'])['t_dat']
        .min()
    )
).dt.days.reset_index(name='days_since_first_transaction')

Unnamed: 0,customer_id,days_since_first_transaction
0,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,153
1,0000b2f1829e23b24feec422ef13df3ccedaedc85368e6...,91
2,000346516dd355b40badca0c0f5f37a318ddae31f0e0f7...,92
3,000531ecd2754a0cf214373cc7b6dbcf7ce9a77c3284a2...,95
4,000989f72a2b8e5da2f4abafc86c2e213816fa2ff2a060...,123
...,...,...
49140,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,55
49141,fff69e3ab1bd701315881c7706c38d5cbcda1f4e0cf213...,60
49142,fff94b7e15cd61eaf699f0e7d29e4023d28733119f323d...,105
49143,fffdd9c81f64c0228a54e1c0240cf40877acec6ce985d4...,109


In [16]:
# days since last transaction for each customer
(max_date - 
    (transactions
        .groupby(['customer_id'])['t_dat']
        .max()
    )
).dt.days.reset_index(name='days_since_last_transaction')

Unnamed: 0,customer_id,days_since_last_transaction
0,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,153
1,0000b2f1829e23b24feec422ef13df3ccedaedc85368e6...,24
2,000346516dd355b40badca0c0f5f37a318ddae31f0e0f7...,53
3,000531ecd2754a0cf214373cc7b6dbcf7ce9a77c3284a2...,95
4,000989f72a2b8e5da2f4abafc86c2e213816fa2ff2a060...,102
...,...,...
49140,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,11
49141,fff69e3ab1bd701315881c7706c38d5cbcda1f4e0cf213...,60
49142,fff94b7e15cd61eaf699f0e7d29e4023d28733119f323d...,105
49143,fffdd9c81f64c0228a54e1c0240cf40877acec6ce985d4...,57


In [17]:
# average purchase span for each customer
sorted_transactions = transactions.sort_values(by=['customer_id', 't_dat'], ascending=[True, False])[['customer_id', 't_dat']].drop_duplicates()
sorted_transactions['t_dat_next'] = sorted_transactions['t_dat'].shift(1)
sorted_transactions['customer_id_next'] = sorted_transactions['customer_id'].shift(1)
sorted_transactions = sorted_transactions.assign(avg_purchase_span = lambda x: x.t_dat_next-x.t_dat)
(
    sorted_transactions[sorted_transactions['customer_id']==sorted_transactions['customer_id_next']]
    .groupby(['customer_id'])['avg_purchase_span']
    .mean().dt.days
    .reset_index()
)

Unnamed: 0,customer_id,avg_purchase_span
0,0000b2f1829e23b24feec422ef13df3ccedaedc85368e6...,67
1,000346516dd355b40badca0c0f5f37a318ddae31f0e0f7...,39
2,000989f72a2b8e5da2f4abafc86c2e213816fa2ff2a060...,10
3,000fab9caf56e8060680e5fd82d709ade7496318832201...,72
4,00159bfc26c2cf788c09789f006fa698b24d0f1dbd8309...,32
...,...,...
22183,fff187d1386edced8ef49b1df0155241943c9c4cc7abbf...,41
22184,fff3573d9131d15da6a46c1ca8f03b5d37e4f6b804171e...,18
22185,fff4381593e170ca0aea188998487c830d9a4070c9ec4b...,49
22186,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,22


In [18]:
# average/median purchase span for each customer/article pair
print('todo later')

todo later


##### percentage of rebuying items

In [26]:
(
    transactions
    .groupby(['customer_id', 'article_id'])['t_dat'].count()
    .reset_index()
    .assign(rebought=lambda x: (x.t_dat>1).astype(int))
    .groupby(['article_id'])['rebought'].mean()
    .reset_index(name='perc_rebought')
)

Unnamed: 0,article_id,perc_rebought
0,0108775015,0.0000
1,0110065002,0.0000
2,0123173001,0.3448
3,0129085001,0.2241
4,0129085026,0.0000
...,...,...
6775,0945600001,0.5000
6776,0945995002,0.0000
6777,0946748001,0.0000
6778,0946764002,0.0000


##### sales_channel_id percentage

In [61]:
transactions.sales_channel_id.value_counts()

2    113798
1     43987
Name: sales_channel_id, dtype: int64

In [69]:
# mean percentage sales offline per customer
(
    transactions.assign(if_offline=lambda x: x.sales_channel_id-1)
    .groupby(['customer_id'])['if_offline']
    .mean()
    .reset_index(name='perc_customer_sales_offline')
)

Unnamed: 0,customer_id,perc_customer_sales_offline
0,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,1.0000
1,0000b2f1829e23b24feec422ef13df3ccedaedc85368e6...,0.0000
2,000346516dd355b40badca0c0f5f37a318ddae31f0e0f7...,0.3333
3,000531ecd2754a0cf214373cc7b6dbcf7ce9a77c3284a2...,1.0000
4,000989f72a2b8e5da2f4abafc86c2e213816fa2ff2a060...,1.0000
...,...,...
49140,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,1.0000
49141,fff69e3ab1bd701315881c7706c38d5cbcda1f4e0cf213...,1.0000
49142,fff94b7e15cd61eaf699f0e7d29e4023d28733119f323d...,1.0000
49143,fffdd9c81f64c0228a54e1c0240cf40877acec6ce985d4...,1.0000


In [70]:
# mean percentage sales offline per article
(
    transactions.assign(if_offline=lambda x: x.sales_channel_id-1)
    .groupby(['article_id'])['if_offline']
    .mean()
    .reset_index(name='perc_article_sales_offline')
)

Unnamed: 0,article_id,perc_article_sales_offline
0,0108775015,0.0000
1,0110065002,0.0000
2,0123173001,0.5682
3,0129085001,0.0000
4,0129085026,0.0000
...,...,...
6775,0945600001,1.0000
6776,0945995002,0.4500
6777,0946748001,0.0000
6778,0946764002,1.0000


##### item attributes count

##### count prod_group_name

In [33]:
# count prod_group_name
(
    transactions
    .merge(articles[['article_id', 'product_group_name']], on='article_id')
    .groupby(['customer_id', 'product_group_name'])['article_id']
    .count()
    .reset_index(name='count_from_prod_group_name')
)

Unnamed: 0,customer_id,product_group_name,count_from_prod_group_name
0,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,Swimwear,1
1,0000b2f1829e23b24feec422ef13df3ccedaedc85368e6...,Garment Lower body,2
2,000346516dd355b40badca0c0f5f37a318ddae31f0e0f7...,Garment Lower body,1
3,000346516dd355b40badca0c0f5f37a318ddae31f0e0f7...,Garment Upper body,1
4,000346516dd355b40badca0c0f5f37a318ddae31f0e0f7...,Underwear,1
...,...,...,...
89892,fff69e3ab1bd701315881c7706c38d5cbcda1f4e0cf213...,Underwear,2
89893,fff94b7e15cd61eaf699f0e7d29e4023d28733119f323d...,Garment Lower body,2
89894,fffdd9c81f64c0228a54e1c0240cf40877acec6ce985d4...,Garment Upper body,3
89895,ffff61677073258d461e043cc9ed4ed97be5617a920640...,Garment Lower body,1


In [31]:
# has bought from this product_group_name
(
    transactions
    .merge(articles[['article_id', 'product_group_name']], on='article_id')[['customer_id', 'product_group_name']]
    .drop_duplicates()
    .assign(has_bought_prod_group=lambda x: 1)
)

Unnamed: 0,customer_id,product_group_name,has_bought_prod_group
0,0090852ad6a446ccf02a7d57ec7385c360ab33a813b870...,Garment Upper body,1
1,02d85f158c762390dd85c03f661ba0da839dc77fdfcb74...,Garment Upper body,1
2,218f9e1dd7e16d5fb334f04e5ff0e00cfadf5a538a2073...,Garment Upper body,1
3,695cf42cd265a49672ed66412c7929a5770c1d3445efab...,Garment Upper body,1
4,6df54cad7fb153888242c3a4891b8adb0654c5696e9a95...,Garment Upper body,1
...,...,...,...
157779,a2d7c7d05b2db3ad9e5dc611746208272bb93b39f54434...,Underwear,1
157780,b02bd7f6859bde6124b6748638fc78789e0e092d282fe2...,Socks & Tights,1
157782,d9dc75b5ba9fa1a3947fd743dadb3b404b9d38ff790525...,Garment Upper body,1
157783,eebf05b7cc90e72b62da284ded1234db8b4e3a2a06978c...,Garment Lower body,1


- count of user-item and user-item attributes in the last 7/30/all days

In [34]:
(
    transactions
    .groupby(['customer_id', 'article_id'])['t_dat']
    .count()
    .reset_index(name='count_customer_article_id')
)

Unnamed: 0,customer_id,article_id,count_customer_article_id
0,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0351484002,1
1,0000b2f1829e23b24feec422ef13df3ccedaedc85368e6...,0706016015,1
2,0000b2f1829e23b24feec422ef13df3ccedaedc85368e6...,0715411001,1
3,000346516dd355b40badca0c0f5f37a318ddae31f0e0f7...,0816423002,1
4,000346516dd355b40badca0c0f5f37a318ddae31f0e0f7...,0816588001,1
...,...,...,...
137411,fffdd9c81f64c0228a54e1c0240cf40877acec6ce985d4...,0832361001,1
137412,fffdd9c81f64c0228a54e1c0240cf40877acec6ce985d4...,0832361003,1
137413,fffdd9c81f64c0228a54e1c0240cf40877acec6ce985d4...,0915611003,1
137414,ffff61677073258d461e043cc9ed4ed97be5617a920640...,0817417002,1


### candidate features

##### jaccard similarity

The actual comparison was actually simpler. I calculated average jaccard index of attributes. So when a customer had 3 items in the history:

item_1 attributes A,B,C  
item_2 attributes B,C,D  
item_3 attributes A,B,D

Then you consider a new item_4 with attributes A,B so the average jaccard is calculated like this

item_1 attributes A,B,C = jaccard 0.66  
item_2 attributes B,C,D = jaccard 0.33  
item_3 attributes A,B,D = jaccard 0.66

so the average similarity is 0.55

I had a different version to calculate "fuzzy" attribute similarity based on their cooccurence in baskets so if the attribute=X cooccured with attribute=Y in 50% of the baskets its similarity is 50% not 0 or 1 like in the original.

In [55]:
articles.columns

Index(['article_id', 'product_code', 'prod_name', 'product_type_no',
       'product_type_name', 'product_group_name', 'graphical_appearance_no',
       'graphical_appearance_name', 'colour_group_code', 'colour_group_name',
       'perceived_colour_value_id', 'perceived_colour_value_name',
       'perceived_colour_master_id', 'perceived_colour_master_name',
       'department_no', 'department_name', 'index_code', 'index_name',
       'index_group_no', 'index_group_name', 'section_no', 'section_name',
       'garment_group_no', 'garment_group_name', 'detail_desc'],
      dtype='object')

In [62]:
cat_articles = articles[['product_type_name', 'product_group_name', 'graphical_appearance_name', 'colour_group_name', 'perceived_colour_value_name', 'perceived_colour_master_name', 'index_name', 'index_group_name', 'section_name', 'garment_group_name']]

In [81]:
jaccard_articles = articles[['article_id']]

In [82]:
jaccard_articles['set_of_attributes'] = cat_articles.apply(set, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  jaccard_articles['set_of_attributes'] = cat_articles.apply(set, axis=1)


In [90]:
jaccard_articles

Unnamed: 0,article_id,set_of_attributes
0,0927491001,"{Khaki green, Dress, Womens Tailoring, Dark Gr..."
1,0691839001,"{Trousers, Black, Garment Lower body, Ladieswe..."
2,0908924002,"{Blouses, Black, Shirt, Womens Tailoring, Ladi..."
3,0679630001,"{Outdoor, Melange, Jacket, Medium Dusty, Ladie..."
4,0698899003,"{Blouses, Black, Vest top, Jacquard, Ladieswea..."
...,...,...
21103,0475827011,"{Other shoe, Medium Dusty, Shoes, Menswear, Be..."
21104,0641715002,"{Blouses, White, Divided Projects, Divided, Bl..."
21105,0545612001,"{Blue, Trousers, Denim, Garment Lower body, La..."
21106,0663277003,"{Hat/beanie, Melange, Divided Accessories, Acc..."


### so inefficient

In [85]:
jaccard_articles.add_suffix('_first').merge(jaccard_articles.add_suffix('_second'), how='cross')

Unnamed: 0,article_id_first,set_of_attributes_first,article_id_second,set_of_attributes_second
0,0927491001,"{Khaki green, Dress, Womens Tailoring, Dark Gr...",0927491001,"{Khaki green, Dress, Womens Tailoring, Dark Gr..."
1,0927491001,"{Khaki green, Dress, Womens Tailoring, Dark Gr...",0691839001,"{Trousers, Black, Garment Lower body, Ladieswe..."
2,0927491001,"{Khaki green, Dress, Womens Tailoring, Dark Gr...",0908924002,"{Blouses, Black, Shirt, Womens Tailoring, Ladi..."
3,0927491001,"{Khaki green, Dress, Womens Tailoring, Dark Gr...",0679630001,"{Outdoor, Melange, Jacket, Medium Dusty, Ladie..."
4,0927491001,"{Khaki green, Dress, Womens Tailoring, Dark Gr...",0698899003,"{Blouses, Black, Vest top, Jacquard, Ladieswea..."
...,...,...,...,...
445547659,0520392001,"{Trousers, Shorts, Garment Lower body, Ladiesw...",0475827011,"{Other shoe, Medium Dusty, Shoes, Menswear, Be..."
445547660,0520392001,"{Trousers, Shorts, Garment Lower body, Ladiesw...",0641715002,"{Blouses, White, Divided Projects, Divided, Bl..."
445547661,0520392001,"{Trousers, Shorts, Garment Lower body, Ladiesw...",0545612001,"{Blue, Trousers, Denim, Garment Lower body, La..."
445547662,0520392001,"{Trousers, Shorts, Garment Lower body, Ladiesw...",0663277003,"{Hat/beanie, Melange, Divided Accessories, Acc..."


### so inefficient :DD

In [91]:
attributes_users = list(zip(jaccard_articles["set_of_attributes"], jaccard_articles["article_id"]))

In [95]:
result = []

In [96]:
# create list of all possible combinations between sets of (wording, user) and loop through them
for item in list(itertools.combinations(attributes_users, 2)):
    similarity = jaccard_similarity(item[0][0], item[1][0])
    data = {"user1": item[0][1], "user2": item[1][1], "similarity": similarity}
    result.append(data)

In [97]:
pd.DataFrame(result)

Unnamed: 0,user1,user2,similarity
0,0927491001,0691839001,0.2308
1,0927491001,0908924002,0.3077
2,0927491001,0679630001,0.0625
3,0927491001,0698899003,0.1333
4,0927491001,0601728029,0.1429
...,...,...,...
222763273,0641715002,0663277003,0.0714
222763274,0641715002,0520392001,0.0625
222763275,0545612001,0663277003,0.0714
222763276,0545612001,0520392001,0.2143


### works

In [114]:
def jaccard(x, y):
    """ returns the jaccard similarity between two lists """
    intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
    union_cardinality = len(set.union(*[set(x), set(y)]))
    return intersection_cardinality/float(union_cardinality)

In [101]:
jaccard_articles.set_index(['article_id'], inplace=True)

In [102]:
jaccard_articles

Unnamed: 0_level_0,set_of_attributes
article_id,Unnamed: 1_level_1
0927491001,"{Khaki green, Dress, Womens Tailoring, Dark Gr..."
0691839001,"{Trousers, Black, Garment Lower body, Ladieswe..."
0908924002,"{Blouses, Black, Shirt, Womens Tailoring, Ladi..."
0679630001,"{Outdoor, Melange, Jacket, Medium Dusty, Ladie..."
0698899003,"{Blouses, Black, Vest top, Jacquard, Ladieswea..."
...,...
0475827011,"{Other shoe, Medium Dusty, Shoes, Menswear, Be..."
0641715002,"{Blouses, White, Divided Projects, Divided, Bl..."
0545612001,"{Blue, Trousers, Denim, Garment Lower body, La..."
0663277003,"{Hat/beanie, Melange, Divided Accessories, Acc..."


In [104]:
jaccard_articles.loc['0679630001']['set_of_attributes']

{'Garment Upper body',
 'Grey',
 'H&M+',
 'Jacket',
 'Ladieswear',
 'Medium Dusty',
 'Melange',
 'Outdoor'}

In [115]:
def avg_jaccard_similarity(
    candidate_item: str,
    customer_id: str,
    articles_attributes: pd.DataFrame,
    transactions: pd.DataFrame):
    
    candidate_item_attributes = articles_attributes.loc[candidate_item]['set_of_attributes']
    bought_items = list(transactions[transactions['customer_id']==customer_id].article_id.unique())
    jaccard_similarity_list = []
    for item in bought_items:
        item_attributes = articles_attributes.loc[item]['set_of_attributes']
        jaccard_similarity = jaccard(candidate_item_attributes, item_attributes)
        jaccard_similarity_list.append(jaccard_similarity)
    print(jaccard_similarity_list)
    return sum(jaccard_similarity_list)/len(jaccard_similarity_list)    

In [116]:
avg_jaccard_similarity(candidate_item='0545612001',
                       articles_attributes=jaccard_articles,
                       customer_id='fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e593881ae6007d775f0f',
                       transactions=transactions
                      )

[0.0, 0.06666666666666667, 0.0625, 0.7777777777777778, 0.0625]


0.1938888888888889

make it into whole dataframe calculation

In [175]:
def calculate_avg_jaccard_similarity(
    candidates_df: pd.DataFrame,
    articles_attributes: pd.DataFrame,
    transactions: pd.DataFrame):
    candidate_item, candidate_user = candidates_df.article_id, candidates_df.customer_id
    candidate_item_attributes = articles_attributes.loc[candidate_item]['set_of_attributes']
    bought_items = list(transactions[transactions['customer_id']==candidate_user].article_id.unique())
    if not bought_items:
        return 0
    jaccard_similarity_list = []
    for item in bought_items:
        item_attributes = articles_attributes.loc[item]['set_of_attributes']
        jaccard_similarity = jaccard(candidate_item_attributes, item_attributes)
        jaccard_similarity_list.append(jaccard_similarity)
    return sum(jaccard_similarity_list)/len(jaccard_similarity_list)    

In [180]:
candidates_df = pd.DataFrame({'customer_id': sorted(customers['customer_id'].sample(5).to_list()*3),
                              'article_id': articles['article_id'].sample(15).to_list()
                             })

In [181]:
candidates_df.head()

Unnamed: 0,customer_id,article_id
0,20e52bf42639499af2b48a5c5736e7d29b3a985d93da4b...,548837002
1,20e52bf42639499af2b48a5c5736e7d29b3a985d93da4b...,901831004
2,20e52bf42639499af2b48a5c5736e7d29b3a985d93da4b...,691009003
3,a9e3b488bdec5f770e97ce001e5d50d5175e31595fdb32...,647756003
4,a9e3b488bdec5f770e97ce001e5d50d5175e31595fdb32...,710054003


In [182]:
candidates_df.apply(lambda x: calculate_avg_jaccard_similarity(x, jaccard_articles, transactions), axis=1)

0    0.0000
1    0.0000
2    0.0000
3    0.0000
4    0.0000
5    0.0000
6    0.1561
7    0.0925
8    0.1172
9    0.0000
10   0.0000
11   0.0000
12   0.0588
13   0.0588
14   0.2857
dtype: float64

### image/text embeddings

In [117]:
image_embeddings = context.catalog.load('image_embeddings')

2022-08-12 13:04:30,561 - kedro.io.data_catalog - INFO - Loading data from `image_embeddings` (ParquetDataSet)...


In [118]:
image_embeddings

Unnamed: 0,img_emb_1,img_emb_2,img_emb_3,img_emb_4,img_emb_5,img_emb_6,img_emb_7,img_emb_8,img_emb_9,img_emb_10,...,img_emb_23,img_emb_24,img_emb_25,img_emb_26,img_emb_27,img_emb_28,img_emb_29,img_emb_30,img_emb_31,img_emb_32
0520567001,1.1136,1.6530,-2.1507,2.0899,1.1859,2.2600,0.3472,2.0131,-4.7345,2.4018,...,2.0007,-0.2031,-2.8428,0.4911,-0.0419,2.5446,0.1087,1.5822,4.2004,-2.0204
0878510001,1.4409,1.3867,-2.2826,2.1407,0.8620,2.1601,0.0811,1.7678,-4.8009,1.9922,...,1.6910,-0.1554,-3.0912,0.5878,-0.4512,2.7649,0.4721,1.5510,4.1647,-2.0769
0573864007,-0.7568,1.5158,-0.5756,0.6109,1.7299,1.2196,1.0663,1.6654,-1.9784,2.5785,...,1.9107,-0.5298,-0.3629,-0.1672,1.4940,0.2434,-1.1655,1.0513,2.0177,-0.7936
0832893002,1.1667,1.3586,-2.2046,1.9648,1.0202,2.0245,0.1760,1.7535,-4.6813,2.1408,...,1.7510,-0.3855,-2.8435,0.4786,-0.1807,2.4874,0.3440,1.6696,4.0890,-2.0683
0654410024,0.9548,1.6560,-1.9917,1.9652,1.2261,2.1809,0.4167,1.9897,-4.4603,2.4084,...,1.9925,-0.1926,-2.6166,0.4386,0.0854,2.3450,-0.0179,1.5020,3.9832,-1.8879
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0633529002,1.3275,1.4195,-2.1956,2.0770,0.9209,2.1351,0.1450,1.7867,-4.6704,2.0465,...,1.7254,-0.1601,-2.9580,0.5514,-0.3419,2.6463,0.3709,1.5213,4.0722,-2.0117
0822895002,0.4658,1.8164,-1.5941,1.6946,1.4782,2.0865,0.7031,2.0820,-3.8644,2.6402,...,2.1455,-0.1754,-2.0297,0.2855,0.5559,1.8323,-0.4750,1.3414,3.5597,-1.5813
0759602003,0.2192,1.4499,-1.8595,1.4416,1.6240,1.7125,0.6022,1.8542,-4.2194,2.7570,...,2.0814,-0.9784,-1.9779,0.1264,0.7923,1.5771,-0.2489,1.9445,3.8230,-1.9520
0808866004,1.3212,1.3889,-2.2574,2.0748,0.9433,2.1152,0.1294,1.7773,-4.7725,2.0773,...,1.7342,-0.2571,-2.9936,0.5410,-0.3257,2.6536,0.4098,1.6110,4.1551,-2.0828


In [131]:
def calculate_customer_embedding(df: pd.DataFrame, embeddings: pd.DataFrame):
    list_of_articles = list(df.article_id.unique())
    mean_embeddings = list(embeddings[embeddings.index.isin(list_of_articles)].mean(axis=0))
    return mean_embeddings

In [183]:
%%time
cust_embeddings = transactions.groupby(['customer_id']).apply(lambda x: calculate_customer_embedding(x, image_embeddings)).reset_index(name='embeddings')

CPU times: user 1min 57s, sys: 671 ms, total: 1min 58s
Wall time: 1min 58s


In [184]:
cust_embeddings.shape

(49145, 2)

In [185]:
image_embeddings.shape

(105100, 32)

In [135]:
text_embeddings = context.catalog.load('text_embeddings')

2022-08-12 13:33:47,961 - kedro.io.data_catalog - INFO - Loading data from `text_embeddings` (ParquetDataSet)...


In [136]:
text_embeddings.shape

(105542, 384)

In [137]:
%%time
transactions.groupby(['customer_id']).apply(lambda x: calculate_customer_embedding(x, text_embeddings)).reset_index(name='embeddings')

CPU times: user 1min 56s, sys: 702 ms, total: 1min 56s
Wall time: 1min 56s


Unnamed: 0,customer_id,embeddings
0,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,"[0.060339782387018204, 0.017903417348861694, 0..."
1,0000b2f1829e23b24feec422ef13df3ccedaedc85368e6...,"[-0.03108256496489048, 0.07063155621290207, -0..."
2,000346516dd355b40badca0c0f5f37a318ddae31f0e0f7...,"[-0.03189806267619133, 0.08947073668241501, -0..."
3,000531ecd2754a0cf214373cc7b6dbcf7ce9a77c3284a2...,"[-0.007199143059551716, 0.10448223352432251, 0..."
4,000989f72a2b8e5da2f4abafc86c2e213816fa2ff2a060...,"[-0.02177959680557251, 0.09326094388961792, 0...."
...,...,...
49140,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,"[-0.04204563423991203, 0.05189591646194458, 0...."
49141,fff69e3ab1bd701315881c7706c38d5cbcda1f4e0cf213...,"[0.07588859647512436, 0.014239171519875526, -0..."
49142,fff94b7e15cd61eaf699f0e7d29e4023d28733119f323d...,"[0.04923105612397194, 0.0729655772447586, 0.03..."
49143,fffdd9c81f64c0228a54e1c0240cf40877acec6ce985d4...,"[-0.049292903393507004, 0.0693088248372078, 0...."


to split embeddings from list to single columns

df3 = df2.teams.apply(pd.Series) <br>
df3.columns = ['team1', 'team2']

In [198]:
np.array(cust_embeddings.set_index('customer_id').loc['000531ecd2754a0cf214373cc7b6dbcf7ce9a77c3284a2facbe072f763618bbc'].embeddings)

array([ 0.43539429,  1.43819571, -1.93127227,  1.56413281,  1.48725712,
        1.79077113,  0.50990307,  1.83755374, -4.31625366,  2.61858845,
        0.31816185,  1.70420766, -2.99186277,  1.39999628, -0.2818093 ,
       -0.97360808, -0.16821225, -0.3271288 , -2.23177528,  6.11882687,
        0.79731834,  2.77756238,  2.01025057, -0.82952678, -2.17244005,
        0.2082431 ,  0.57116681,  1.78582966, -0.12209548,  1.87108564,
        3.87817287, -1.97070253])

In [194]:
image_embeddings.loc['0878510001'].values

array([ 1.4408963e+00,  1.3866708e+00, -2.2826171e+00,  2.1407368e+00,
        8.6195004e-01,  2.1600864e+00,  8.1075065e-02,  1.7677919e+00,
       -4.8008981e+00,  1.9921565e+00,  1.2470684e+00,  1.4888606e+00,
       -2.9717569e+00,  5.6244093e-01, -1.0710030e+00, -1.5912099e+00,
        6.6585618e-01,  6.0829879e-03, -1.7968456e+00,  6.5001235e+00,
        9.7213534e-04,  2.9396386e+00,  1.6909531e+00, -1.5535519e-01,
       -3.0912085e+00,  5.8781457e-01, -4.5122060e-01,  2.7648869e+00,
        4.7212821e-01,  1.5510160e+00,  4.1646776e+00, -2.0769293e+00],
      dtype=float32)

In [230]:
def cosine_similarity(A, B):
    return np.dot(A,B)/(np.linalg.norm(A)*np.linalg.norm(B))

In [235]:
def cosine_embedding_similarity(candidates_df, customers_embeddings, items_embeddings):
    """
    """
    candidate_item, candidate_user = candidates_df.article_id, candidates_df.customer_id
    try:
        customer_embeddings = np.array(cust_embeddings.set_index('customer_id').loc[candidate_user].embeddings)
    except KeyError:
        return 0
    try:
        candidate_embeddings = items_embeddings.loc[candidate_item].values
    except KeyError:
        return 0
    return cosine_similarity(customer_embeddings, candidate_embeddings)

In [239]:
candidates_df = pd.DataFrame({'customer_id': sorted(customers['customer_id'].sample(5).to_list()*3),
                              'article_id': articles['article_id'].sample(15).to_list()
                             })

In [240]:
candidates_df.head()

Unnamed: 0,customer_id,article_id
0,43be7a90d1b3b4ddc4486a2b9ca4d8dc69d73791e99cb7...,584198001
1,43be7a90d1b3b4ddc4486a2b9ca4d8dc69d73791e99cb7...,863587003
2,43be7a90d1b3b4ddc4486a2b9ca4d8dc69d73791e99cb7...,695545008
3,a60d2b71f2728406f419fb65e761435c5338554170f3cb...,539816001
4,a60d2b71f2728406f419fb65e761435c5338554170f3cb...,841874001


In [241]:
candidates_df.apply(lambda x: cosine_embedding_similarity(x, cust_embeddings, image_embeddings), axis=1)

0    0.0000
1    0.0000
2    0.0000
3    0.9625
4    0.8308
5    0.9894
6    0.0000
7    0.0000
8    0.0000
9    0.0000
10   0.0000
11   0.0000
12   0.0000
13   0.0000
14   0.0000
dtype: float64

text similarities

In [242]:
%%time
cust_embeddings = transactions.groupby(['customer_id']).apply(lambda x: calculate_customer_embedding(x, text_embeddings)).reset_index(name='embeddings')

CPU times: user 1min 56s, sys: 707 ms, total: 1min 57s
Wall time: 1min 57s


In [243]:
candidates_df.apply(lambda x: cosine_embedding_similarity(x, cust_embeddings, text_embeddings), axis=1)

0    0.0000
1    0.0000
2    0.0000
3    0.4155
4    0.3849
5    0.4410
6    0.0000
7    0.0000
8    0.0000
9    0.0000
10   0.0000
11   0.0000
12   0.0000
13   0.0000
14   0.0000
dtype: float64

all similarities

collaborative filtering still AWAITS :<<<

candidates -> STRATEGY FEATURE, MAYBE SCORE?