In [1]:
import pandas as pd
from hnmchallenge.data_reader import DataReader
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np



from hnmchallenge.models.top_pop import TopPop
from hnmchallenge.evaluation.python_evaluation import map_at_k, recall_at_k
from hnmchallenge.constant import *
from hnmchallenge.models.sgmc.sgmc import SGMC
from hnmchallenge.models.ease.ease import EASE
from hnmchallenge.models.itemknn.itemknn import ItemKNN 

In [2]:
import logging
# Module-level logger; none of the later visible cells call it.
logger = logging.getLogger(__name__)
# DEBUG on the root logger also surfaces debug messages emitted by imported libraries.
logging.basicConfig(level=logging.DEBUG)

In [3]:
# NOTE(review): StratifiedDataset has no explicit import in the visible cells;
# presumably it comes in through `from hnmchallenge.constant import *` — confirm.
dataset = StratifiedDataset()
# DataReader provides get_preprocessed_data_path(), used below to locate the feather files.
dr = DataReader()

In [4]:
# Load the holdout interactions; they act as the ground truth for the
# per-user recall computed further down.
holdout = dataset.get_holdout()

In [5]:
# One list of holdout items per user; repeated items are kept as duplicates in the list.
item_per_user = holdout.groupby(DEFAULT_USER_COL)[DEFAULT_ITEM_COL].apply(list)

In [6]:
# Series -> one-column DataFrame so it can be merged with the recommendations.
item_per_user_df = item_per_user.to_frame()

In [7]:
item_per_user_df

Unnamed: 0_level_0,article_id
customer_id,Unnamed: 1_level_1
0,"[1652, 7053, 11572]"
1,"[3161, 8254, 16695, 13392, 2427]"
2,"[8443, 3023, 7068, 8089, 3215, 7193]"
3,"[632, 3]"
4,[4]
...,...
1136201,[17418]
1136202,[20154]
1136203,[4770]
1136204,[17478]


In [8]:
# Precomputed recommendations: one row per user, with a list of item ids in `recs`.
# The file name suggests cosine-similarity recommendations, 100 per user — TODO confirm.
recs = pd.read_feather(dr.get_preprocessed_data_path()/"cosine_recs_100.feather")

In [9]:
recs

Unnamed: 0,customer_id,recs
0,0,"[1482, 1638, 1797, 4861, 9027, 1952, 0, 1488, ..."
1,1,"[3161, 7534, 2997, 3503, 1482, 7628, 6992, 782..."
2,2,"[248, 7818, 8074, 196, 7135, 249, 4151, 9, 309..."
3,3,"[3, 2596, 3936, 3935, 2872, 621, 376, 16595, 6..."
4,5,"[5, 499, 69, 123, 16014, 2005, 3554, 2008, 725..."
...,...,...
962725,1136200,"[20583, 17333, 3328, 18900, 16949, 21549, 2049..."
962726,1136201,"[17418, 20780, 14892, 15765, 17869, 17715, 128..."
962727,1136203,"[19923, 20246, 20091, 19496, 16864, 21001, 199..."
962728,1136204,"[17478, 20806, 20946, 18373, 18249, 17488, 215..."


In [10]:
# Default inner join: only users present in both the holdout and the
# recommendation file are kept.
final_df = item_per_user_df.reset_index().merge(recs, on=DEFAULT_USER_COL)

In [11]:
final_df

Unnamed: 0,customer_id,article_id,recs
0,0,"[1652, 7053, 11572]","[1482, 1638, 1797, 4861, 9027, 1952, 0, 1488, ..."
1,1,"[3161, 8254, 16695, 13392, 2427]","[3161, 7534, 2997, 3503, 1482, 7628, 6992, 782..."
2,2,"[8443, 3023, 7068, 8089, 3215, 7193]","[248, 7818, 8074, 196, 7135, 249, 4151, 9, 309..."
3,3,"[632, 3]","[3, 2596, 3936, 3935, 2872, 621, 376, 16595, 6..."
4,5,[991],"[5, 499, 69, 123, 16014, 2005, 3554, 2008, 725..."
...,...,...,...
962725,1136200,[22036],"[20583, 17333, 3328, 18900, 16949, 21549, 2049..."
962726,1136201,[17418],"[17418, 20780, 14892, 15765, 17869, 17715, 128..."
962727,1136203,[4770],"[19923, 20246, 20091, 19496, 16864, 21001, 199..."
962728,1136204,[17478],"[17478, 20806, 20946, 18373, 18249, 17488, 215..."


In [12]:
# Per-user hit ratio: distinct holdout items found among the recommendations,
# divided by the length of the holdout list.
# NOTE(review): np.intersect1d deduplicates while len(row.article_id) counts
# repeated holdout items, so users with duplicates score lower than a purely
# set-based recall would give — confirm this is intended.
final_df["hit_all"] = final_df.apply(
    lambda row: np.intersect1d(row.article_id, row.recs).size / len(row.article_id),
    axis=1,
)

In [13]:
final_df

Unnamed: 0,customer_id,article_id,recs,hit_all
0,0,"[1652, 7053, 11572]","[1482, 1638, 1797, 4861, 9027, 1952, 0, 1488, ...",0.0
1,1,"[3161, 8254, 16695, 13392, 2427]","[3161, 7534, 2997, 3503, 1482, 7628, 6992, 782...",0.2
2,2,"[8443, 3023, 7068, 8089, 3215, 7193]","[248, 7818, 8074, 196, 7135, 249, 4151, 9, 309...",0.0
3,3,"[632, 3]","[3, 2596, 3936, 3935, 2872, 621, 376, 16595, 6...",0.5
4,5,[991],"[5, 499, 69, 123, 16014, 2005, 3554, 2008, 725...",0.0
...,...,...,...,...
962725,1136200,[22036],"[20583, 17333, 3328, 18900, 16949, 21549, 2049...",0.0
962726,1136201,[17418],"[17418, 20780, 14892, 15765, 17869, 17715, 128...",1.0
962727,1136203,[4770],"[19923, 20246, 20091, 19496, 16864, 21001, 199...",0.0
962728,1136204,[17478],"[17478, 20806, 20946, 18373, 18249, 17488, 215...",1.0


In [14]:
# Mean of the per-user hit ratio over all users. The label assumes every
# `recs` list holds 100 items — TODO confirm.
print("Recall@100")
final_df["hit_all"].mean()

Recall@100


0.30180611594459295

In [15]:
final_df[final_df["hit_all"]>0]

Unnamed: 0,customer_id,article_id,recs,hit_all
1,1,"[3161, 8254, 16695, 13392, 2427]","[3161, 7534, 2997, 3503, 1482, 7628, 6992, 782...",0.200000
3,3,"[632, 3]","[3, 2596, 3936, 3935, 2872, 621, 376, 16595, 6...",0.500000
5,6,"[1891, 12146, 8234, 4443, 1655, 12235]","[4133, 2204, 8796, 451, 2012, 3360, 7317, 7177...",0.166667
6,7,"[391, 391, 7312, 7312, 10256, 7995, 8823, 1013...","[7116, 7117, 73, 6761, 1482, 7597, 2352, 401, ...",0.086957
8,9,[79],"[11, 10, 19, 2041, 228, 261, 2055, 7270, 9702,...",1.000000
...,...,...,...,...
962717,1136190,"[19321, 9294]","[7954, 8835, 16601, 8285, 8923, 8532, 3468, 69...",0.500000
962719,1136192,[18777],"[6241, 6431, 6491, 6680, 7289, 3747, 4214, 743...",1.000000
962721,1136194,"[14732, 81]","[21495, 18303, 19410, 15523, 15679, 20865, 207...",0.500000
962726,1136201,[17418],"[17418, 20780, 14892, 15765, 17869, 17715, 128...",1.000000


In [16]:
# Sorted, de-duplicated array of the holdout items that were actually recommended.
final_df["recs_hit"] = final_df.apply(
    lambda row: np.intersect1d(row.article_id, row.recs),
    axis=1,
)

In [17]:
final_df

Unnamed: 0,customer_id,article_id,recs,hit_all,recs_hit
0,0,"[1652, 7053, 11572]","[1482, 1638, 1797, 4861, 9027, 1952, 0, 1488, ...",0.0,[]
1,1,"[3161, 8254, 16695, 13392, 2427]","[3161, 7534, 2997, 3503, 1482, 7628, 6992, 782...",0.2,[3161]
2,2,"[8443, 3023, 7068, 8089, 3215, 7193]","[248, 7818, 8074, 196, 7135, 249, 4151, 9, 309...",0.0,[]
3,3,"[632, 3]","[3, 2596, 3936, 3935, 2872, 621, 376, 16595, 6...",0.5,[3]
4,5,[991],"[5, 499, 69, 123, 16014, 2005, 3554, 2008, 725...",0.0,[]
...,...,...,...,...,...
962725,1136200,[22036],"[20583, 17333, 3328, 18900, 16949, 21549, 2049...",0.0,[]
962726,1136201,[17418],"[17418, 20780, 14892, 15765, 17869, 17715, 128...",1.0,[17418]
962727,1136203,[4770],"[19923, 20246, 20091, 19496, 16864, 21001, 199...",0.0,[]
962728,1136204,[17478],"[17478, 20806, 20946, 18373, 18249, 17488, 215...",1.0,[17478]


In [18]:
# Load the hold-in interactions; presumably the data the recommender was fitted on — confirm.
hold_in = dataset.get_holdin()

In [19]:
# One list of hold-in items per user (duplicates kept), named "all_items" so the
# later merge adds it to final_df as a column of that name.
all_article_id = hold_in.groupby(DEFAULT_USER_COL)[DEFAULT_ITEM_COL].apply(list).rename("all_items")

In [20]:
all_article_id

customer_id
0          [0, 0, 0, 644, 1952, 1952, 1482, 1638, 1797, 4...
1          [1, 482, 759, 2006, 339, 339, 6018, 7534, 7534...
2          [2, 326, 2055, 2238, 4151, 309, 5786, 5174, 54...
3                                      [3, 3, 3, 2596, 3936]
5                                                        [5]
                                 ...                        
1136200                                 [3328, 17333, 20583]
1136201                                              [17418]
1136203                                       [19923, 20246]
1136204                                              [17478]
1136205                             [10033, 7580, 1765, 838]
Name: all_items, Length: 962730, dtype: object

In [21]:
final_df

Unnamed: 0,customer_id,article_id,recs,hit_all,recs_hit
0,0,"[1652, 7053, 11572]","[1482, 1638, 1797, 4861, 9027, 1952, 0, 1488, ...",0.0,[]
1,1,"[3161, 8254, 16695, 13392, 2427]","[3161, 7534, 2997, 3503, 1482, 7628, 6992, 782...",0.2,[3161]
2,2,"[8443, 3023, 7068, 8089, 3215, 7193]","[248, 7818, 8074, 196, 7135, 249, 4151, 9, 309...",0.0,[]
3,3,"[632, 3]","[3, 2596, 3936, 3935, 2872, 621, 376, 16595, 6...",0.5,[3]
4,5,[991],"[5, 499, 69, 123, 16014, 2005, 3554, 2008, 725...",0.0,[]
...,...,...,...,...,...
962725,1136200,[22036],"[20583, 17333, 3328, 18900, 16949, 21549, 2049...",0.0,[]
962726,1136201,[17418],"[17418, 20780, 14892, 15765, 17869, 17715, 128...",1.0,[17418]
962727,1136203,[4770],"[19923, 20246, 20091, 19496, 16864, 21001, 199...",0.0,[]
962728,1136204,[17478],"[17478, 20806, 20946, 18373, 18249, 17488, 215...",1.0,[17478]


In [22]:
# Attach each user's hold-in item list; the default inner join drops users without one.
# Re-running this cell on its own merges again and yields suffixed duplicate columns.
final_df = final_df.merge(all_article_id, on=DEFAULT_USER_COL)

In [23]:
final_df

Unnamed: 0,customer_id,article_id,recs,hit_all,recs_hit,all_items
0,0,"[1652, 7053, 11572]","[1482, 1638, 1797, 4861, 9027, 1952, 0, 1488, ...",0.0,[],"[0, 0, 0, 644, 1952, 1952, 1482, 1638, 1797, 4..."
1,1,"[3161, 8254, 16695, 13392, 2427]","[3161, 7534, 2997, 3503, 1482, 7628, 6992, 782...",0.2,[3161],"[1, 482, 759, 2006, 339, 339, 6018, 7534, 7534..."
2,2,"[8443, 3023, 7068, 8089, 3215, 7193]","[248, 7818, 8074, 196, 7135, 249, 4151, 9, 309...",0.0,[],"[2, 326, 2055, 2238, 4151, 309, 5786, 5174, 54..."
3,3,"[632, 3]","[3, 2596, 3936, 3935, 2872, 621, 376, 16595, 6...",0.5,[3],"[3, 3, 3, 2596, 3936]"
4,5,[991],"[5, 499, 69, 123, 16014, 2005, 3554, 2008, 725...",0.0,[],[5]
...,...,...,...,...,...,...
962725,1136200,[22036],"[20583, 17333, 3328, 18900, 16949, 21549, 2049...",0.0,[],"[3328, 17333, 20583]"
962726,1136201,[17418],"[17418, 20780, 14892, 15765, 17869, 17715, 128...",1.0,[17418],[17418]
962727,1136203,[4770],"[19923, 20246, 20091, 19496, 16864, 21001, 199...",0.0,[],"[19923, 20246]"
962728,1136204,[17478],"[17478, 20806, 20946, 18373, 18249, 17488, 215...",1.0,[17478],[17478]


In [24]:
# collections.Counter is used by multiple_buy_items below.
import collections
# NOTE(review): tqdm_notebook is not used in any visible cell, and tqdm treats this
# name as a deprecated alias of tqdm.notebook.tqdm.
from tqdm import tqdm_notebook

In [25]:
def multiple_buy_items(x):
    """Return the items of ``x`` that occur more than once, with their repeats.

    Each repeated item appears in the result as many times as it occurs in
    ``x``. Groups are ordered by the item's first appearance in ``x``; items
    occurring exactly once are left out.

    Parameters
    ----------
    x : iterable of hashable
        Item ids, possibly containing duplicates.

    Returns
    -------
    numpy.ndarray
        1-D array of the repeated items. Always an ndarray: when nothing is
        repeated the result is an empty array (float64, numpy's default for
        empty input) rather than a Python list, so callers get a single
        return type.
    """
    counts = collections.Counter(x)
    # Counter keeps insertion order, so the groups follow first appearance in x.
    repeated = [item for item, n in counts.items() if n > 1]
    times = [counts[item] for item in repeated]
    # An explicit integer dtype lets np.repeat accept an empty `times` too.
    return np.repeat(np.asarray(repeated), np.asarray(times, dtype=np.intp))

In [26]:
final_df

Unnamed: 0,customer_id,article_id,recs,hit_all,recs_hit,all_items
0,0,"[1652, 7053, 11572]","[1482, 1638, 1797, 4861, 9027, 1952, 0, 1488, ...",0.0,[],"[0, 0, 0, 644, 1952, 1952, 1482, 1638, 1797, 4..."
1,1,"[3161, 8254, 16695, 13392, 2427]","[3161, 7534, 2997, 3503, 1482, 7628, 6992, 782...",0.2,[3161],"[1, 482, 759, 2006, 339, 339, 6018, 7534, 7534..."
2,2,"[8443, 3023, 7068, 8089, 3215, 7193]","[248, 7818, 8074, 196, 7135, 249, 4151, 9, 309...",0.0,[],"[2, 326, 2055, 2238, 4151, 309, 5786, 5174, 54..."
3,3,"[632, 3]","[3, 2596, 3936, 3935, 2872, 621, 376, 16595, 6...",0.5,[3],"[3, 3, 3, 2596, 3936]"
4,5,[991],"[5, 499, 69, 123, 16014, 2005, 3554, 2008, 725...",0.0,[],[5]
...,...,...,...,...,...,...
962725,1136200,[22036],"[20583, 17333, 3328, 18900, 16949, 21549, 2049...",0.0,[],"[3328, 17333, 20583]"
962726,1136201,[17418],"[17418, 20780, 14892, 15765, 17869, 17715, 128...",1.0,[17418],[17418]
962727,1136203,[4770],"[19923, 20246, 20091, 19496, 16864, 21001, 199...",0.0,[],"[19923, 20246]"
962728,1136204,[17478],"[17478, 20806, 20946, 18373, 18249, 17488, 215...",1.0,[17478],[17478]


In [27]:
# Per user: the hold-in items that occur more than once, repeats included.
final_df["mb_items"] = final_df.apply(lambda x: multiple_buy_items(x.all_items), axis=1)

In [28]:
final_df

Unnamed: 0,customer_id,article_id,recs,hit_all,recs_hit,all_items,mb_items
0,0,"[1652, 7053, 11572]","[1482, 1638, 1797, 4861, 9027, 1952, 0, 1488, ...",0.0,[],"[0, 0, 0, 644, 1952, 1952, 1482, 1638, 1797, 4...","[0, 0, 0, 1952, 1952]"
1,1,"[3161, 8254, 16695, 13392, 2427]","[3161, 7534, 2997, 3503, 1482, 7628, 6992, 782...",0.2,[3161],"[1, 482, 759, 2006, 339, 339, 6018, 7534, 7534...","[339, 339, 7534, 7534]"
2,2,"[8443, 3023, 7068, 8089, 3215, 7193]","[248, 7818, 8074, 196, 7135, 249, 4151, 9, 309...",0.0,[],"[2, 326, 2055, 2238, 4151, 309, 5786, 5174, 54...",[]
3,3,"[632, 3]","[3, 2596, 3936, 3935, 2872, 621, 376, 16595, 6...",0.5,[3],"[3, 3, 3, 2596, 3936]","[3, 3, 3]"
4,5,[991],"[5, 499, 69, 123, 16014, 2005, 3554, 2008, 725...",0.0,[],[5],[]
...,...,...,...,...,...,...,...
962725,1136200,[22036],"[20583, 17333, 3328, 18900, 16949, 21549, 2049...",0.0,[],"[3328, 17333, 20583]",[]
962726,1136201,[17418],"[17418, 20780, 14892, 15765, 17869, 17715, 128...",1.0,[17418],[17418],[]
962727,1136203,[4770],"[19923, 20246, 20091, 19496, 16864, 21001, 199...",0.0,[],"[19923, 20246]",[]
962728,1136204,[17478],"[17478, 20806, 20946, 18373, 18249, 17488, 215...",1.0,[17478],[17478],[]


In [29]:
# Hold-in items that occur exactly once: everything in all_items that is not
# among the repeated items. np.setdiff1d returns sorted unique values.
final_df["sb_items"] = final_df.apply(
    lambda row: np.setdiff1d(row.all_items, row.mb_items),
    axis=1,
)

In [30]:
final_df

Unnamed: 0,customer_id,article_id,recs,hit_all,recs_hit,all_items,mb_items,sb_items
0,0,"[1652, 7053, 11572]","[1482, 1638, 1797, 4861, 9027, 1952, 0, 1488, ...",0.0,[],"[0, 0, 0, 644, 1952, 1952, 1482, 1638, 1797, 4...","[0, 0, 0, 1952, 1952]","[644, 1482, 1638, 1797, 4861, 9027]"
1,1,"[3161, 8254, 16695, 13392, 2427]","[3161, 7534, 2997, 3503, 1482, 7628, 6992, 782...",0.2,[3161],"[1, 482, 759, 2006, 339, 339, 6018, 7534, 7534...","[339, 339, 7534, 7534]","[1, 482, 759, 981, 1482, 2006, 2997, 3161, 350..."
2,2,"[8443, 3023, 7068, 8089, 3215, 7193]","[248, 7818, 8074, 196, 7135, 249, 4151, 9, 309...",0.0,[],"[2, 326, 2055, 2238, 4151, 309, 5786, 5174, 54...",[],"[2, 9, 196, 248, 249, 309, 326, 2055, 2238, 38..."
3,3,"[632, 3]","[3, 2596, 3936, 3935, 2872, 621, 376, 16595, 6...",0.5,[3],"[3, 3, 3, 2596, 3936]","[3, 3, 3]","[2596, 3936]"
4,5,[991],"[5, 499, 69, 123, 16014, 2005, 3554, 2008, 725...",0.0,[],[5],[],[5]
...,...,...,...,...,...,...,...,...
962725,1136200,[22036],"[20583, 17333, 3328, 18900, 16949, 21549, 2049...",0.0,[],"[3328, 17333, 20583]",[],"[3328, 17333, 20583]"
962726,1136201,[17418],"[17418, 20780, 14892, 15765, 17869, 17715, 128...",1.0,[17418],[17418],[],[17418]
962727,1136203,[4770],"[19923, 20246, 20091, 19496, 16864, 21001, 199...",0.0,[],"[19923, 20246]",[],"[19923, 20246]"
962728,1136204,[17478],"[17478, 20806, 20946, 18373, 18249, 17488, 215...",1.0,[17478],[17478],[],[17478]


In [31]:
# Holdout items that are not among the user's repeated hold-in items
# (sorted and de-duplicated by np.setdiff1d).
final_df["holdout_sb_items"] = final_df.apply(
    lambda row: np.setdiff1d(row.article_id, row.mb_items),
    axis=1,
)

In [32]:
final_df

Unnamed: 0,customer_id,article_id,recs,hit_all,recs_hit,all_items,mb_items,sb_items,holdout_sb_items
0,0,"[1652, 7053, 11572]","[1482, 1638, 1797, 4861, 9027, 1952, 0, 1488, ...",0.0,[],"[0, 0, 0, 644, 1952, 1952, 1482, 1638, 1797, 4...","[0, 0, 0, 1952, 1952]","[644, 1482, 1638, 1797, 4861, 9027]","[1652, 7053, 11572]"
1,1,"[3161, 8254, 16695, 13392, 2427]","[3161, 7534, 2997, 3503, 1482, 7628, 6992, 782...",0.2,[3161],"[1, 482, 759, 2006, 339, 339, 6018, 7534, 7534...","[339, 339, 7534, 7534]","[1, 482, 759, 981, 1482, 2006, 2997, 3161, 350...","[2427, 3161, 8254, 13392, 16695]"
2,2,"[8443, 3023, 7068, 8089, 3215, 7193]","[248, 7818, 8074, 196, 7135, 249, 4151, 9, 309...",0.0,[],"[2, 326, 2055, 2238, 4151, 309, 5786, 5174, 54...",[],"[2, 9, 196, 248, 249, 309, 326, 2055, 2238, 38...","[3023, 3215, 7068, 7193, 8089, 8443]"
3,3,"[632, 3]","[3, 2596, 3936, 3935, 2872, 621, 376, 16595, 6...",0.5,[3],"[3, 3, 3, 2596, 3936]","[3, 3, 3]","[2596, 3936]",[632]
4,5,[991],"[5, 499, 69, 123, 16014, 2005, 3554, 2008, 725...",0.0,[],[5],[],[5],[991]
...,...,...,...,...,...,...,...,...,...
962725,1136200,[22036],"[20583, 17333, 3328, 18900, 16949, 21549, 2049...",0.0,[],"[3328, 17333, 20583]",[],"[3328, 17333, 20583]",[22036]
962726,1136201,[17418],"[17418, 20780, 14892, 15765, 17869, 17715, 128...",1.0,[17418],[17418],[],[17418],[17418]
962727,1136203,[4770],"[19923, 20246, 20091, 19496, 16864, 21001, 199...",0.0,[],"[19923, 20246]",[],"[19923, 20246]",[4770]
962728,1136204,[17478],"[17478, 20806, 20946, 18373, 18249, 17488, 215...",1.0,[17478],[17478],[],[17478],[17478]


In [33]:
# Same ratio as hit_all, but only counting hits on holdout items that were not
# repeated hold-in items. The denominator is still the full holdout list
# length, so hit_sb <= hit_all for every user.
final_df["hit_sb"] = final_df.apply(
    lambda row: np.intersect1d(row.recs_hit, row.holdout_sb_items).size / len(row.article_id),
    axis=1,
)

In [34]:
final_df[final_df["hit_sb"]>0]

Unnamed: 0,customer_id,article_id,recs,hit_all,recs_hit,all_items,mb_items,sb_items,holdout_sb_items,hit_sb
1,1,"[3161, 8254, 16695, 13392, 2427]","[3161, 7534, 2997, 3503, 1482, 7628, 6992, 782...",0.200000,[3161],"[1, 482, 759, 2006, 339, 339, 6018, 7534, 7534...","[339, 339, 7534, 7534]","[1, 482, 759, 981, 1482, 2006, 2997, 3161, 350...","[2427, 3161, 8254, 13392, 16695]",0.200000
5,6,"[1891, 12146, 8234, 4443, 1655, 12235]","[4133, 2204, 8796, 451, 2012, 3360, 7317, 7177...",0.166667,[1891],"[6, 1990, 2229, 2603, 1928, 1950, 3597, 1638, ...",[],"[6, 309, 451, 1496, 1638, 1928, 1950, 1990, 20...","[1655, 1891, 4443, 8234, 12146, 12235]",0.166667
6,7,"[391, 391, 7312, 7312, 10256, 7995, 8823, 1013...","[7116, 7117, 73, 6761, 1482, 7597, 2352, 401, ...",0.086957,"[3179, 7597]","[7, 8, 274, 128, 128, 275, 1651, 1651, 1649, 1...","[128, 128, 1651, 1651, 1649, 1649, 1482, 1482,...","[7, 8, 222, 274, 275, 293, 294, 317, 319, 351,...","[391, 789, 3164, 3179, 7312, 7995, 8602, 8823,...",0.043478
8,9,[79],"[11, 10, 19, 2041, 228, 261, 2055, 7270, 9702,...",1.000000,[79],"[10, 11]",[],"[10, 11]",[79],1.000000
9,10,"[11127, 7270, 19, 15804, 12893, 12893, 3629, 1...","[11, 9316, 11232, 19, 9702, 12042, 8850, 4612,...",0.625000,"[19, 7270, 11035, 11127, 12893]","[11, 2041, 2055, 308, 1711, 2347, 3133, 2216, ...","[11, 11, 19, 19, 11232, 11232, 9316, 9316]","[64, 308, 1523, 1711, 2041, 2055, 2216, 2347, ...","[3629, 7270, 11035, 11127, 12893, 15804]",0.500000
...,...,...,...,...,...,...,...,...,...,...
962717,1136190,"[19321, 9294]","[7954, 8835, 16601, 8285, 8923, 8532, 3468, 69...",0.500000,[9294],"[7954, 8285, 8835, 16601]",[],"[7954, 8285, 8835, 16601]","[9294, 19321]",0.500000
962719,1136192,[18777],"[6241, 6431, 6491, 6680, 7289, 3747, 4214, 743...",1.000000,[18777],[6241],[],[6241],[18777],1.000000
962721,1136194,"[14732, 81]","[21495, 18303, 19410, 15523, 15679, 20865, 207...",0.500000,[14732],"[15523, 19410, 21495, 15679, 18303]",[],"[15523, 15679, 18303, 19410, 21495]","[81, 14732]",0.500000
962726,1136201,[17418],"[17418, 20780, 14892, 15765, 17869, 17715, 128...",1.000000,[17418],[17418],[],[17418],[17418],1.000000


In [35]:
final_df["hit_sb"].mean()

0.28334409999919424

In [36]:
final_df["hit_all"].mean()

0.30180611594459295

In [37]:
# Share of a user's hold-in interactions that belong to items occurring more than once.
final_df["percentage_mb"] = final_df.apply(
    lambda row: len(row.mb_items) / len(row.all_items),
    axis=1,
)

In [38]:
final_df

Unnamed: 0,customer_id,article_id,recs,hit_all,recs_hit,all_items,mb_items,sb_items,holdout_sb_items,hit_sb,percentage_mb
0,0,"[1652, 7053, 11572]","[1482, 1638, 1797, 4861, 9027, 1952, 0, 1488, ...",0.0,[],"[0, 0, 0, 644, 1952, 1952, 1482, 1638, 1797, 4...","[0, 0, 0, 1952, 1952]","[644, 1482, 1638, 1797, 4861, 9027]","[1652, 7053, 11572]",0.0,0.454545
1,1,"[3161, 8254, 16695, 13392, 2427]","[3161, 7534, 2997, 3503, 1482, 7628, 6992, 782...",0.2,[3161],"[1, 482, 759, 2006, 339, 339, 6018, 7534, 7534...","[339, 339, 7534, 7534]","[1, 482, 759, 981, 1482, 2006, 2997, 3161, 350...","[2427, 3161, 8254, 13392, 16695]",0.2,0.210526
2,2,"[8443, 3023, 7068, 8089, 3215, 7193]","[248, 7818, 8074, 196, 7135, 249, 4151, 9, 309...",0.0,[],"[2, 326, 2055, 2238, 4151, 309, 5786, 5174, 54...",[],"[2, 9, 196, 248, 249, 309, 326, 2055, 2238, 38...","[3023, 3215, 7068, 7193, 8089, 8443]",0.0,0.000000
3,3,"[632, 3]","[3, 2596, 3936, 3935, 2872, 621, 376, 16595, 6...",0.5,[3],"[3, 3, 3, 2596, 3936]","[3, 3, 3]","[2596, 3936]",[632],0.0,0.600000
4,5,[991],"[5, 499, 69, 123, 16014, 2005, 3554, 2008, 725...",0.0,[],[5],[],[5],[991],0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
962725,1136200,[22036],"[20583, 17333, 3328, 18900, 16949, 21549, 2049...",0.0,[],"[3328, 17333, 20583]",[],"[3328, 17333, 20583]",[22036],0.0,0.000000
962726,1136201,[17418],"[17418, 20780, 14892, 15765, 17869, 17715, 128...",1.0,[17418],[17418],[],[17418],[17418],1.0,0.000000
962727,1136203,[4770],"[19923, 20246, 20091, 19496, 16864, 21001, 199...",0.0,[],"[19923, 20246]",[],"[19923, 20246]",[4770],0.0,0.000000
962728,1136204,[17478],"[17478, 20806, 20946, 18373, 18249, 17488, 215...",1.0,[17478],[17478],[],[17478],[17478],1.0,0.000000


In [39]:
# Independent copy of the per-user repeat share, merged into the relevance table later.
user_mb_tendency = final_df[[DEFAULT_USER_COL, "percentage_mb"]].copy()

In [40]:
user_mb_tendency

Unnamed: 0,customer_id,percentage_mb
0,0,0.454545
1,1,0.210526
2,2,0.000000
3,3,0.600000
4,5,0.000000
...,...,...
962725,1136200,0.000000
962726,1136201,0.000000
962727,1136203,0.000000
962728,1136204,0.000000


In [41]:
# Keep only users with at least one hit, and only the two columns needed to
# build the relevance labels.
recs_hit_df = final_df.loc[final_df["hit_all"] > 0, [DEFAULT_USER_COL, "recs_hit"]]

In [42]:
# Long format: one row per (user, hit item); the original row index is repeated.
recs_hit_df = recs_hit_df.explode("recs_hit")

In [43]:
# Long format: one row per (user, recommended item), in recommendation order.
# Reuse the `recs` frame already loaded from cosine_recs_100.feather rather than
# reading the same file from disk a second time; explode() returns a new frame,
# so `recs` itself is left untouched.
recs_df = recs.explode("recs")

In [44]:
recs_df

Unnamed: 0,customer_id,recs
0,0,1482
0,0,1638
0,0,1797
0,0,4861
0,0,9027
...,...,...
962729,1136205,4459
962729,1136205,5696
962729,1136205,146
962729,1136205,3115


In [45]:
recs_hit_df

Unnamed: 0,customer_id,recs_hit
1,1,3161
3,3,3
5,6,1891
6,7,3179
6,7,7597
...,...,...
962717,1136190,9294
962719,1136192,18777
962721,1136194,14732
962726,1136201,17418


In [46]:
# Label every recommended (user, item) pair: `recs_hit` holds the item id where
# the recommendation was a hit and NaN otherwise.
# A left join is used because every hit is, by construction, one of that user's
# recommendations, so no right-only rows can exist. Unlike how="outer", which
# pandas documents as sorting the join keys lexicographically, how="left"
# preserves the order of `recs_df`, i.e. the per-user recommendation order.
relevance_df = pd.merge(
    recs_df,
    recs_hit_df,
    left_on=[DEFAULT_USER_COL, "recs"],
    right_on=[DEFAULT_USER_COL, "recs_hit"],
    how="left",
)

In [47]:
# Matched rows carry the hit item id in `recs_hit`; overwrite it with the label 1.
relevance_df.loc[relevance_df["recs_hit"].notnull(), "recs_hit"] = 1 

In [48]:
# Non-hits (still NaN) become label 0. Note that this fills NaNs in every
# column of the frame, not only in `recs_hit`.
relevance_df = relevance_df.fillna(0)

In [50]:
# Number of hits per user, broadcast onto each of that user's rows.
relevance_df["hit_sum"] = relevance_df.groupby(DEFAULT_USER_COL)["recs_hit"].transform("sum")

In [53]:
# Drop users whose recommendation list contains no hit at all.
has_hit = relevance_df["hit_sum"] > 0
relevance_df = relevance_df.loc[has_hit]

In [54]:
# Add each user's percentage_mb to the relevance rows (default inner join on the user column).
relevance_df = user_mb_tendency.merge(relevance_df, on=DEFAULT_USER_COL)

In [56]:
# hit_sum was only needed for the user filter; remove the helper column.
relevance_df = relevance_df.drop(columns="hit_sum")

In [57]:
# The 0/1 hit flag is the relevance label.
relevance_df = relevance_df.rename(columns={"recs_hit": "relevance"})

In [58]:
# Name the recommended-item column with the project's standard item column name.
relevance_df = relevance_df.rename(columns={"recs": DEFAULT_ITEM_COL})

In [59]:
relevance_df

Unnamed: 0,customer_id,percentage_mb,article_id,relevance
0,1,0.210526,3161,1
1,1,0.210526,7534,0
2,1,0.210526,2997,0
3,1,0.210526,3503,0
4,1,0.210526,1482,0
...,...,...,...,...
48331695,1136204,0.000000,10716,0
48331696,1136204,0.000000,12879,0
48331697,1136204,0.000000,15711,0
48331698,1136204,0.000000,16688,0


In [60]:
# Persist the labelled (user, item, relevance) table next to the other preprocessed files.
relevance_df.to_feather(dr.get_preprocessed_data_path() / "relevance_df.feather")