In [217]:
!pip install implicit



In [218]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)


# Написанные нами функции
##from metrics import precision_at_k, recall_at_k

In [None]:
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Recomended_sys/retail_train.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [None]:
data.isna().sum()


user_id              0
basket_id            0
day                  0
item_id              0
quantity             0
sales_value          0
store_id             0
retail_disc          0
trans_time           0
week_no              0
coupon_disc          0
coupon_match_disc    0
dtype: int64

In [None]:
users, items, interactions, week_no = data.user_id.nunique(), data.item_id.nunique(), data.shape[0], data['week_no'].nunique()

print('# users: ', users)
print('# items: ', items)
print('# interactions: ', interactions)
print('# week: ', week_no)

# users:  2499
# items:  89051
# interactions:  2396804
# week:  95


### Train-test split

In [None]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

In [None]:
data_train.shape[0], data_test.shape[0]

(2278490, 118314)

# 1. Бейзлайны


Создадим датафрейм с покупками юзеров на тестовом датасете (последние 3 недели)


In [None]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [None]:
test_users = result.shape[0]
new_test_users = len(set(data_test['user_id']) - set(data_train['user_id']))

print('В тестовом дата сете {} юзеров'.format(test_users))
print('В тестовом дата сете {} новых юзеров'.format(new_test_users))

В тестовом дата сете 2042 юзеров
В тестовом дата сете 0 новых юзеров


1.1 Random recommendation


In [None]:
def random_recommendation(items, n=5):
    """Случайные рекоммендации"""
    
    items = np.array(items)
    recs = np.random.choice(items, size=n, replace=False)
    
    return recs.tolist()

In [None]:
%%time

items = data_train.item_id.unique()

result['random_recommendation'] = result['user_id'].apply(lambda x: random_recommendation(items, n=5))

CPU times: user 4 s, sys: 21.4 ms, total: 4.02 s
Wall time: 4.04 s


1.2 Popularity-based recommendation


In [None]:
def popularity_recommendation(data, n=5):
    """Топ-n популярных товаров"""
    
    popular = data.groupby('item_id')['sales_value'].sum().reset_index()
    popular.sort_values('sales_value', ascending=False, inplace=True)
    
    recs = popular.head(n).item_id
    
    return recs.tolist()

In [None]:
%%time
# Можно так делать, так как рекомендация не зависит от юзера
popular_recs = popularity_recommendation(data_train, n=5)

result['popular_recommendation'] = result['user_id'].apply(lambda x: popular_recs)

CPU times: user 104 ms, sys: 2.57 ms, total: 106 ms
Wall time: 113 ms


1.3 Weighted random recommender

прямопропорционально популярности. Вес = log(sales_sum товара)

In [None]:
def weighted_random_recommendation(data, n):

    popularity = data.groupby('item_id')['sales_value'].sum().reset_index()
    popularity['weight'] = popularity['sales_value']/popularity['sales_value'].sum()
    item_weights = np.array(popularity['weight'])
    items = np.array(popularity['item_id'])
    recs = np.random.choice(items, p = item_weights, size=n, replace=False)

    return recs.tolist()

In [None]:
result['weighted random recommender'] = result['user_id'].apply(lambda x: weighted_random_recommendation(data_train, n = 5))

In [238]:
def precision_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    bought_list = bought_list  # Тут нет [:k] !!
    recommended_list = recommended_list[:k]
    
    flags = np.isin(bought_list, recommended_list)
    
    precision = flags.sum() / len(recommended_list)
    
    
    return precision

Улучшим бейзлайн weighted random recommender, ограничив items топ-5000 по популярности

In [None]:
def random_recommendation_5000_weighted(data, n=5):
    """Случайные рекоммендации"""
    popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
    popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)
    top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000)
    top_5000['weighted'] = top_5000['n_sold']/top_5000['n_sold'].sum()
    items = np.array(top_5000['item_id'])
    recs = np.random.choice(items, p = np.array(top_5000['weighted']), size=n, replace=False)
    
    return recs.tolist()

In [None]:
result['random_recommendation_5000_weighted'] = result['user_id'].apply(lambda x: random_recommendation_5000_weighted(data, n=5))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,weighted random recommender,random_recommendation_5000_weighted
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[947292, 14077281, 13381566, 943225, 7441687]","[6534178, 6533889, 1029743, 6534166, 1082185]","[7441210, 828113, 936270, 989249, 7409971]","[6534178, 6533889, 6534166, 397896, 6544236]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[384031, 1247184, 834948, 208322, 13381915]","[6534178, 6533889, 1029743, 6534166, 1082185]","[7167945, 949142, 847374, 1040183, 874209]","[6534178, 6534166, 948622, 6544236, 397896]"


## Метрики бейзлайнов


In [None]:
for column_name in result.columns[1:]:
    print('%s: %f' % (column_name, result.apply(lambda row: precision_at_k(row[column_name], row['actual']), axis=1).mean()))

actual: 1.000000
random_recommendation: 0.000686
popular_recommendation: 0.155240
weighted random recommender: 0.025955
random_recommendation_5000_weighted: 0.047209


Лучшая метрика остается у бейзлайна popular_recommendation

# 2.1 Item-Item Recommender / ItemKNN


Prepare_matrix

In [None]:
## Ограничим рекомендуемые items топ-5000 для этой и следующих моделей
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)
top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()

In [None]:
# Заведем фиктивный item_id (если юзер покупал товары из топ-5000, то он "купил" такой товар)
data_train.loc[~data_train['item_id'].isin(top_5000), 'item_id'] = 999999

user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', columns='item_id', 
                                  values='quantity',
                                  aggfunc='count', 
                                  fill_value=0
                                 )

user_item_matrix[user_item_matrix > 0] = 1 # так как в итоге хотим предсказать 
user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

# переведем в формат saprse matrix
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

user_item_matrix.head(3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


item_id,202291,397896,420647,480014,545926,707683,731106,818980,819063,819227,819255,819304,819308,819330,819518,819594,819643,819765,819840,819845,819927,819978,820082,820122,820165,820291,820301,820321,820361,820486,820518,820560,820701,820895,821025,821083,821200,821209,821219,821344,...,13512965,13671759,13672065,13777104,13841744,13842088,13842090,13842214,13842224,13877192,13945141,13945244,13987135,14025185,14043817,14043823,14043825,14043826,14050436,14050460,14050461,14077333,14106445,15452677,15452812,15506577,15511891,15596279,15596488,15596515,15778533,15831255,15926712,15926775,15926844,15926886,15927403,15927661,15927850,16809471
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

In [None]:
user_item_matrix.head(2)

item_id,202291,397896,420647,480014,545926,707683,731106,818980,819063,819227,819255,819304,819308,819330,819518,819594,819643,819765,819840,819845,819927,819978,820082,820122,820165,820291,820301,820321,820361,820486,820518,820560,820701,820895,821025,821083,821200,821209,821219,821344,...,13512965,13671759,13672065,13777104,13841744,13842088,13842090,13842214,13842224,13877192,13945141,13945244,13987135,14025185,14043817,14043823,14043825,14043826,14050436,14050460,14050461,14077333,14106445,15452677,15452812,15506577,15511891,15596279,15596488,15596515,15778533,15831255,15926712,15926775,15926844,15926886,15927403,15927661,15927850,16809471
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
user_item_matrix.shape

(2499, 5001)

In [None]:
%%time

model = ItemItemRecommender(K=5, num_threads=4) # K - кол-во билжайших соседей

model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
          show_progress=True)

recs = model.recommend(userid=userid_to_id[2],  # userid - id от 0 до N-1
                        user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                        N=5, # кол-во рекомендаций 
                        filter_already_liked_items=False, 
                        filter_items=None, 
                        recalculate_user=True)

HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))


CPU times: user 2.33 s, sys: 30.7 ms, total: 2.36 s
Wall time: 1.75 s


In [None]:
%%time

result['itemitem'] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec[0]] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=None, 
                                    recalculate_user=True)])

CPU times: user 64.4 ms, sys: 0 ns, total: 64.4 ms
Wall time: 69.2 ms


## 2.2 Косинусное сходство и CosineRecommender


In [None]:
%%time

model = CosineRecommender(K=5, num_threads=4) # K - кол-во билжайших соседей

model.fit(csr_matrix(user_item_matrix).T.tocsr(), 
          show_progress=True)

recs = model.recommend(userid=userid_to_id[1], 
                        user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                        N=5, 
                        filter_already_liked_items=False, 
                        filter_items=None, 
                        recalculate_user=False)

HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))


CPU times: user 2.33 s, sys: 26 ms, total: 2.36 s
Wall time: 1.77 s


In [None]:
%%time

result['cosine'] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec[0]] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=None, 
                                    recalculate_user=True)])

CPU times: user 79.9 ms, sys: 0 ns, total: 79.9 ms
Wall time: 81 ms


## 2.3. TF-IDF взвешивание и TFIDFRecommender

In [None]:
%%time

model = TFIDFRecommender(K=5, num_threads=4) # K - кол-во билжайших соседей

model.fit(csr_matrix(user_item_matrix).T.tocsr(), 
          show_progress=True)

recs = model.recommend(userid=userid_to_id[1], 
                        user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                        N=5, 
                        filter_already_liked_items=False, 
                        filter_items=None, 
                        recalculate_user=False)

HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))


CPU times: user 2.34 s, sys: 26 ms, total: 2.36 s
Wall time: 1.78 s


In [None]:
%%time

result['tfidf'] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec[0]] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=None, 
                                    recalculate_user=False)])

CPU times: user 79.2 ms, sys: 997 µs, total: 80.2 ms
Wall time: 81.5 ms


##2.4. Own purchases

In [None]:
%%time

model = ItemItemRecommender(K=1, num_threads=4) # K - кол-во билжайших соседей

model.fit(csr_matrix(user_item_matrix).T.tocsr(), 
          show_progress=True)

recs = model.recommend(userid=userid_to_id[1], 
                        user_items=csr_matrix(user_item_matrix).tocsr(),  
                        N=5, 
                        filter_already_liked_items=False, 
                        filter_items=None, 
                        recalculate_user=False)

HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))


CPU times: user 2.26 s, sys: 23.8 ms, total: 2.28 s
Wall time: 1.7 s


In [None]:

result['own_purchases'] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec[0]] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=[itemid_to_id[999999]], 
                                    recalculate_user=False)])

In [None]:
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,weighted random recommender,random_recommendation_5000_weighted,itemitem,cosine,tfidf,own_purchases
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[947292, 14077281, 13381566, 943225, 7441687]","[6534178, 6533889, 1029743, 6534166, 1082185]","[7441210, 828113, 936270, 989249, 7409971]","[6534178, 6533889, 6534166, 397896, 6544236]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[1082185, 1029743, 995785, 1004906, 1081177]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[384031, 1247184, 834948, 208322, 13381915]","[6534178, 6533889, 1029743, 6534166, 1082185]","[7167945, 949142, 847374, 1040183, 874209]","[6534178, 6534166, 948622, 6544236, 397896]","[999999, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 999999, 826249]","[1082185, 981760, 1098066, 826249, 999999]","[1082185, 1098066, 6534178, 1127831, 1068719]"


In [None]:
for column_name in result.columns[1:]:
    print('%s: %f' % (column_name, result.apply(lambda row: precision_at_k(row[column_name], row['actual']), axis=1).mean()))

actual: 1.000000
random_recommendation: 0.000686
popular_recommendation: 0.155240
weighted random recommender: 0.025955
random_recommendation_5000_weighted: 0.047209
itemitem: 0.136925
cosine: 0.132909
tfidf: 0.138981
own_purchases: 0.201917


  # This is added back by InteractiveShellApp.init_path()


###3.Матричная факторизация. ALS

In [219]:
ITEM_COL = 'item_id'
USER_COL = 'user_id'
ACTUAL_COL = 'actual'

# N = Neighbors
N_PREDICT = 50 

In [220]:
# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

In [221]:
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Recomended_sys/transaction_data.csv')

data.columns = [col.lower() for col in data.columns]
data.rename(columns={'household_key': 'user_id',
                    'product_id': 'item_id'},
           inplace=True)

In [223]:
test_size_weeks = 3
valid_size_weeks = 6

data_train = data[data['week_no'] < data['week_no'].max() - valid_size_weeks]
data_test= data[data['week_no'] >= data['week_no'].max() - test_size_weeks]
data_valid = data[(data['week_no'] >= data['week_no'].max() - valid_size_weeks) &
                      (data['week_no'] < data['week_no'].max() - test_size_weeks)]

data_train.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631.0,1.0,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631.0,1.0,0.0,0.0


In [224]:
data_test.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
1070075,1098,31944941471,335,901062,1,1.35,343,-0.66,14.0,49.0,0.0,0.0
1070076,1098,31944941471,335,991205,1,11.99,343,0.0,14.0,49.0,0.0,0.0


In [225]:
data_valid.head(2), data_valid.tail(2)

(        user_id    basket_id  day  ...  week_no  coupon_disc  coupon_match_disc
 979980     2277  31672287700  314  ...     46.0          0.0                0.0
 979981     2277  31672287700  314  ...     46.0          0.0                0.0
 
 [2 rows x 12 columns],
          user_id    basket_id  day  ...  week_no  coupon_disc  coupon_match_disc
 1074023      227  31964082593  334  ...     48.0          0.0                0.0
 1074024      227  31964082593  334  ...     48.0          0.0                0.0
 
 [2 rows x 12 columns])

In [226]:
def print_stats_data(df_data, name_df):
    print(name_df)
    print(f"Shape: {df_data.shape} Users: {df_data[USER_COL].nunique()} Items: {df_data[ITEM_COL].nunique()}")

In [227]:
print_stats_data(data_train,'train')
print_stats_data(data_valid,'valid')
print_stats_data(data_test,'test')

train
Shape: (984283, 12) Users: 2497 Items: 59870
valid
Shape: (85912, 12) Users: 1852 Items: 20910
test
Shape: (86287, 12) Users: 1852 Items: 20431


In [228]:
item_features = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Recomended_sys/product.csv')
item_features.columns = [col.lower() for col in item_features.columns]
item_features.rename(columns={'product_id': 'item_id'}, inplace=True)

In [229]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [230]:
result1 = data_test.groupby('user_id')['item_id'].unique().reset_index()
result1.columns=['user_id', 'actual']
result1.head(2)

Unnamed: 0,user_id,actual
0,1,"[837208, 849264, 851231, 856942, 861272, 86474..."
1,2,"[868389, 868547, 883665, 911974, 925862, 93493..."


In [231]:
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)

top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()

In [232]:
# Заведем фиктивный item_id (если юзер покупал товары из топ-5000, то он "купил" такой товар)
data_train.loc[~data_train['item_id'].isin(top_5000), 'item_id'] = 999999

user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', columns='item_id', 
                                  values='quantity', # Можно пробоват ьдругие варианты
                                  aggfunc='count', 
                                  fill_value=0
                                 )

user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

# переведем в формат sparse matrix
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

user_item_matrix.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


item_id,51716,202291,397896,420647,480014,818980,819063,819255,819304,819308,819330,819518,819643,819765,819840,819845,819927,819978,820122,820165,820301,820321,820341,820347,820361,820486,820560,820895,821083,821200,821209,821219,821316,821344,821464,821556,821562,821565,821695,821730,...,12301839,12302069,12326050,12349795,12351966,12352248,12352249,12352293,12352330,12384365,12384657,12384775,12385050,12425418,12427353,12428017,12428436,12456256,12517450,12518330,12524245,12524510,12524690,12577242,12604644,12648296,12695224,12696183,12731432,12731436,12731544,12731685,12731714,12777316,12781986,12810391,12810393,12811532,12946027,12949590
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [233]:
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

In [234]:
%%time

model = AlternatingLeastSquares(factors=100, 
                                regularization=0.001,
                                iterations=15, 
                                calculate_training_loss=True, 
                                num_threads=4)

model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
          show_progress=True)

recs = model.recommend(userid=userid_to_id[2],  # userid - id от 0 до N
                        user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                        N=5, # кол-во рекомендаций 
                        filter_already_liked_items=False, 
                        filter_items=None, 
                        recalculate_user=True)



HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))


CPU times: user 993 ms, sys: 24.5 ms, total: 1.02 s
Wall time: 1.04 s


In [235]:
def get_recommendations(user, model, N=5):
    res = [id_to_itemid[rec[0]] for rec in 
                    model.recommend(userid=userid_to_id[user], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=N, 
                                    filter_already_liked_items=False, 
                                    filter_items=None, 
                                    recalculate_user=True)]
    return res

In [236]:
%%time
    
result1['als'] = result1['user_id'].apply(lambda x: get_recommendations(x, model=model, N=5))

CPU times: user 25.8 s, sys: 20.1 s, total: 45.9 s
Wall time: 23.4 s


In [239]:
for column_name in result1.columns[1:]:
    print('%s: %f' % (column_name, result1.apply(lambda row: precision_at_k(row[column_name], row['actual']), axis=1).mean()))

actual: 1.000000
als: 0.180130


# 3.1.Проведем подбор гиперпараметров для модели ALS Random search


In [None]:
import itertools

In [None]:
def sample_hyperparameters():
  while True:
    yield {
        "factors": np.random.randint(32, 160),
        "iterations": np.random.randint(16, 64),
        "regularization": np.random.exponential(0.05),
        "num_epochs": np.random.randint(5, 50)          
    }

In [None]:
user_item_matrix.shape


(2497, 5001)

In [None]:
## Для оценки гиперпараметров будем использовать валидационную выборку data_valid
def random_search(data_valid,
                  user_item_matrix,
                  num_samples = 10):
    """
    Sample random hyperparameters, fit a LightFM model, and evaluate it
    on the test set.

    Parameters
    ----------

    train: np.float32 coo_matrix of shape [n_users, n_items]
        Training data.
    test: np.float32 coo_matrix of shape [n_users, n_items]
        Test data.
    num_samples: int, optional
        Number of hyperparameter choices to evaluate.


    Returns
    -------

    generator of (auc_score, hyperparameter dict, fitted model)

    """

    for hyperparams in itertools.islice(sample_hyperparameters(), num_samples):
        num_epochs = hyperparams.pop("num_epochs")

        model = AlternatingLeastSquares(**hyperparams)
        model.fit(csr_matrix(user_item_matrix).T.tocsr(), show_progress=True)

        model.recommend(userid=userid_to_id[2],
                        user_items=csr_matrix(user_item_matrix).tocsr(),  
                        N=5,
                        filter_already_liked_items=False, 
                        filter_items=None, 
                        recalculate_user=True)
        result = result = data_valid.groupby('user_id')['item_id'].unique().reset_index()
        result.columns=['user_id', 'actual']
        result['als'] = result['user_id'].apply(lambda x: get_recommendations(x, model=model, N=5))
        score = result.apply(lambda row: precision_at_k(row['als'], row['actual']), axis=1).mean()
        hyperparams['num_epochs'] = num_epochs

        yield (score, hyperparams, model)

(score, hyperparams, model) = max(random_search(data_valid,
                                              user_item_matrix,
                                              ), key = lambda x: x[0])
                                  
print("Best params {} at {}".format(score, hyperparams))



HBox(children=(FloatProgress(value=0.0, max=52.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, max=56.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, max=55.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, max=34.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, max=36.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, max=38.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, max=46.0), HTML(value='')))


Best params 0.19136069114470608 at {'factors': 55, 'iterations': 34, 'regularization': 0.00533199511117114, 'num_epochs': 19}


In [None]:
user_item_matrix.shape

(2497, 5001)

In [None]:
## Оценим работу модели с подобранными гиперпараметрами
%%time
model = AlternatingLeastSquares(factors=55, 
                                regularization=0.00533199511117114,
                                iterations=34, 
                                calculate_training_loss=True, 
                                num_threads=4)
# На вход item-user matrix
model.fit(csr_matrix(user_item_matrix).T.tocsr(), show_progress=True)



HBox(children=(FloatProgress(value=0.0, max=34.0), HTML(value='')))


CPU times: user 947 ms, sys: 22.6 ms, total: 969 ms
Wall time: 998 ms


In [None]:
%%time
    
result1['als_random_search'] = result1['user_id'].apply(lambda x: get_recommendations(x, model=model, N=5))

CPU times: user 14.2 s, sys: 11.1 s, total: 25.2 s
Wall time: 12.9 s


3.2. TF-IDF взвешивание

In [None]:
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

In [None]:
user_item_matrix = tfidf_weight(user_item_matrix.T).T  # Применяется к item-user матрице ! 

In [None]:
%%time
model = AlternatingLeastSquares(factors=55, 
                                regularization=0.00533199511117114,
                                iterations=34, 
                                calculate_training_loss=True, 
                                num_threads=4)

model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
          show_progress=True)

result1['als_tfidf'] = result1['user_id'].apply(lambda x: get_recommendations(x, model=model, N=5))



HBox(children=(FloatProgress(value=0.0, max=34.0), HTML(value='')))


CPU times: user 13.5 s, sys: 10.1 s, total: 23.6 s
Wall time: 12.4 s


3.3 BM25 взвешивание

In [None]:
user_item_matrix = bm25_weight(user_item_matrix.T).T  # Применяется к item-user матрице ! 

In [None]:
model = AlternatingLeastSquares(factors=55, 
                                regularization=0.00533199511117114,
                                iterations=34, 
                                calculate_training_loss=True, 
                                num_threads=4)

model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
          show_progress=True)

result1['als_bm25'] = result1['user_id'].apply(lambda x: get_recommendations(x, model=model, N=5))



HBox(children=(FloatProgress(value=0.0, max=34.0), HTML(value='')))




In [None]:
for column_name in result1.columns[1:]:
    print('%s: %f' % (column_name, result1.apply(lambda row: precision_at_k(row[column_name], row['actual']), axis=1).mean()))

actual: 1.000000
als: 0.180346
als_random_search: 0.202484
als_tfidf: 0.202268
als_bm25: 0.189309


##4. Гибридные системы. LightFM


In [None]:
!pip3 install lightfm

Collecting lightfm
  Downloading lightfm-1.16.tar.gz (310 kB)
[K     |████████████████████████████████| 310 kB 3.1 MB/s 
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py) ... [?25l[?25hdone
  Created wheel for lightfm: filename=lightfm-1.16-cp37-cp37m-linux_x86_64.whl size=706137 sha256=fdddd6b083a36b5947f0fc6a297897fd95f065841ccf1a43e9211988b9559918
  Stored in directory: /root/.cache/pip/wheels/f8/56/28/5772a3bd3413d65f03aa452190b00898b680b10028a1021914
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.16


In [None]:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k, recall_at_k

In [None]:
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Recomended_sys/retail_train.csv')

In [None]:
test_size_weeks = 3
valid_size_weeks = 6

data_train = data[data['week_no'] < data['week_no'].max() - valid_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]
data_valid = data[(data['week_no'] >= data['week_no'].max() - valid_size_weeks) &
                      (data['week_no'] < data['week_no'].max() - test_size_weeks)]

In [None]:
data_test.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
2277416,338,41260573635,636,840173,1,1.99,369,0.0,112,92,0.0,0.0
2277417,338,41260573635,636,1037348,1,0.89,369,-0.3,112,92,0.0,0.0


In [None]:
data_valid.head(2), data_valid.tail(2)

(         user_id    basket_id  day  ...  week_no  coupon_disc  coupon_match_disc
 2191387       84  40877069294  615  ...       89          0.0                0.0
 2191388       84  40877069294  615  ...       89          0.0                0.0
 
 [2 rows x 12 columns],
          user_id    basket_id  day  ...  week_no  coupon_disc  coupon_match_disc
 2282323      462  41297773713  635  ...       91          0.0                0.0
 2282324      462  41297773713  635  ...       91          0.0                0.0
 
 [2 rows x 12 columns])

In [None]:
##Загрузим датасет с признаками user_id
user_features = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Recomended_sys/hh_demographic.csv')

# column processing
user_features.columns = [col.lower() for col in user_features.columns]
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

In [None]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [None]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [None]:
def _prepare_matrix(data):
    user_item_matrix = pd.pivot_table(data,
                                      index='user_id',
                                      columns='item_id',
                                      values='quantity',  # Можно пробовать другие варианты
                                      aggfunc='count',
                                      fill_value=0
                                      )

    user_item_matrix = user_item_matrix.astype(float)  # необходимый тип матрицы для implicit
    sparse_user_item = csr_matrix(user_item_matrix).tocsr() # переведем в формат sparse matrix

    return user_item_matrix, sparse_user_item

In [None]:
user_item_matrix, sparse_user_item = _prepare_matrix(data_train)

In [None]:
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

In [None]:
user_item_matrix.head(2)

item_id,25671,26081,26093,26190,26355,26426,26540,26601,26636,26691,26738,26889,26941,27021,27030,27152,27158,27159,27323,27334,27346,27404,27479,27491,27503,27509,27510,27522,27558,27633,27657,27658,27686,27695,27697,27732,27735,27745,27754,27760,...,17208674,17209402,17209483,17209541,17209599,17209602,17209604,17209679,17209779,17214322,17214352,17214934,17214939,17214944,17214969,17214981,17215077,17215112,17215156,17238070,17238168,17238204,17239218,17239287,17239926,17240083,17240213,17240256,17240369,17240710,17241860,17242362,17242460,17242672,17282612,17283027,17283671,17283763,17284296,17284297
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Prepare CSR test matrix


In [None]:
data_test = data_test[data_test['item_id'].isin(data_train['item_id'].unique())]

test_user_item_matrix, sparse_test_user_item  = _prepare_matrix(data_test)

In [None]:
test_user_item_matrix.shape, user_item_matrix.shape

((2041, 21806), (2499, 85334))

In [None]:
test_userids = test_user_item_matrix.index.values
test_itemids = test_user_item_matrix.columns.values

test_matrix_userids = np.arange(len(test_userids))
test_matrix_itemids = np.arange(len(test_itemids))

test_id_to_itemid = dict(zip(test_matrix_itemids, test_itemids))
test_id_to_userid = dict(zip(test_matrix_userids, test_userids))

test_itemid_to_id = dict(zip(test_itemids, test_matrix_itemids))
test_userid_to_id = dict(zip(test_userids, test_matrix_userids))

Prepare user and item features

In [None]:
user_feat = pd.DataFrame(user_item_matrix.index)
user_feat = user_feat.merge(user_features, on='user_id', how='left')
user_feat.set_index('user_id', inplace=True)
user_feat.head(2)

Unnamed: 0_level_0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,65+,A,35-49K,Homeowner,2 Adults No Kids,2.0,None/Unknown
2,,,,,,,


In [None]:
user_feat.shape

(2499, 7)

In [None]:
item_feat = pd.DataFrame(user_item_matrix.columns)
item_feat = item_feat.merge(item_features, on='item_id', how='left')
item_feat.set_index('item_id', inplace=True)

item_feat.head(2)

Unnamed: 0_level_0,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [None]:
item_feat.shape

(85334, 6)

Encoding features

In [None]:
user_feat_lightfm = pd.get_dummies(user_feat, columns=user_feat.columns.tolist())
item_feat_lightfm = pd.get_dummies(item_feat, columns=item_feat.columns.tolist())

In [None]:
user_feat_lightfm.head(2)

Unnamed: 0_level_0,age_desc_19-24,age_desc_25-34,age_desc_35-44,age_desc_45-54,age_desc_55-64,age_desc_65+,marital_status_code_A,marital_status_code_B,marital_status_code_U,income_desc_100-124K,income_desc_125-149K,income_desc_15-24K,income_desc_150-174K,income_desc_175-199K,income_desc_200-249K,income_desc_25-34K,income_desc_250K+,income_desc_35-49K,income_desc_50-74K,income_desc_75-99K,income_desc_Under 15K,homeowner_desc_Homeowner,homeowner_desc_Probable Owner,homeowner_desc_Probable Renter,homeowner_desc_Renter,homeowner_desc_Unknown,hh_comp_desc_1 Adult Kids,hh_comp_desc_2 Adults Kids,hh_comp_desc_2 Adults No Kids,hh_comp_desc_Single Female,hh_comp_desc_Single Male,hh_comp_desc_Unknown,household_size_desc_1,household_size_desc_2,household_size_desc_3,household_size_desc_4,household_size_desc_5+,kid_category_desc_1,kid_category_desc_2,kid_category_desc_3+,kid_category_desc_None/Unknown
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1
1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
user_feat_lightfm.shape

(2499, 41)

In [None]:
item_feat_lightfm.head(2)

Unnamed: 0_level_0,manufacturer_1,manufacturer_2,manufacturer_3,manufacturer_4,manufacturer_5,manufacturer_6,manufacturer_8,manufacturer_9,manufacturer_10,manufacturer_11,manufacturer_13,manufacturer_14,manufacturer_15,manufacturer_16,manufacturer_17,manufacturer_19,manufacturer_20,manufacturer_21,manufacturer_22,manufacturer_23,manufacturer_24,manufacturer_25,manufacturer_26,manufacturer_28,manufacturer_29,manufacturer_30,manufacturer_31,manufacturer_32,manufacturer_33,manufacturer_34,manufacturer_35,manufacturer_36,manufacturer_37,manufacturer_38,manufacturer_39,manufacturer_40,manufacturer_41,manufacturer_42,manufacturer_43,manufacturer_44,...,curr_size_of_product_R 20 CT,curr_size_of_product_R 20 OZ,curr_size_of_product_R 24 CT,curr_size_of_product_R 25 CT,curr_size_of_product_R 4/3.5 OZ,curr_size_of_product_R 50 CT,curr_size_of_product_R 51 CT,curr_size_of_product_R 6 OZ,curr_size_of_product_R 6.8 OZ,curr_size_of_product_R 75 CT,curr_size_of_product_REGULAR,curr_size_of_product_SCAN KPC,curr_size_of_product_SFT,curr_size_of_product_SINGLE,curr_size_of_product_SIZE 1,curr_size_of_product_SIZE 2,curr_size_of_product_SIZE 3,curr_size_of_product_SIZE 4,curr_size_of_product_SIZE 5,curr_size_of_product_SM,curr_size_of_product_SM/MED,curr_size_of_product_SMALL,curr_size_of_product_SML,curr_size_of_product_SNGL,curr_size_of_product_SNGL 20OZ,curr_size_of_product_SOFT,curr_size_of_product_SWAMP THIN,curr_size_of_product_SZ 1 2CT,curr_size_of_product_SZ1 2PK,curr_size_of_product_T 6 OZ,curr_size_of_product_TALL,curr_size_of_product_TWIN,curr_size_of_product_TWIN PACK,curr_size_of_product_UNDER 15LB,curr_size_of_product_X-LARGE,curr_size_of_product_X-LG,curr_size_of_product_XL,curr_size_of_product_XL/3PK,curr_size_of_product_XL/6PK,curr_size_of_product_XLG
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
25671,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
26081,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
item_feat_lightfm.shape

(85334, 12964)

4.3. Init model


In [None]:
model = LightFM(no_components=40,
 #               loss='bpr',
                loss='warp',
                learning_rate=0.05, 
                item_alpha=0.1,
                user_alpha=0.1, 
                random_state=42)

In [None]:
model.fit((sparse_user_item > 0) * 1,  # user-item matrix из 0 и 1
          sample_weight=coo_matrix(user_item_matrix),
          user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
          item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
          epochs=15, 
          num_threads=4,
          verbose=False) 

<lightfm.lightfm.LightFM at 0x7f6a4cf3fe50>

4.4.Train precision

In [240]:
def precision_at_k_feat(model, test_interactions, train_interactions=None,
                   k=10, user_features=None, item_features=None,
                   preserve_rows=False, num_threads=1, check_intersections=True):
    """
    Measure the precision at k metric for a model: the fraction of known
    positives in the first k positions of the ranked list of results.
    A perfect score is 1.0.

    Parameters
    ----------

    model: LightFM instance
         the model to be evaluated
    test_interactions: np.float32 csr_matrix of shape [n_users, n_items]
         Non-zero entries representing known positives in the evaluation set.
    train_interactions: np.float32 csr_matrix of shape [n_users, n_items], optional
         Non-zero entries representing known positives in the train set. These
         will be omitted from the score calculations to avoid re-recommending
         known positives.
    k: integer, optional
         The k parameter.
    user_features: np.float32 csr_matrix of shape [n_users, n_user_features], optional
         Each row contains that user's weights over features.
    item_features: np.float32 csr_matrix of shape [n_items, n_item_features], optional
         Each row contains that item's weights over features.
    preserve_rows: boolean, optional
         When False (default), the number of rows in the output will be equal
         to the number of users with interactions in the evaluation set.
         When True, the number of rows in the output will be equal to the
         number of users.
    num_threads: int, optional
         Number of parallel computation threads to use. Should
         not be higher than the number of physical cores.
    check_intersections: bool, optional, True by default,
        Only relevant when train_interactions are supplied.
        A flag that signals whether the test and train matrices should be checked
        for intersections to prevent optimistic ranks / wrong evaluation / bad data split.

    Returns
    -------

    np.array of shape [n_users with interactions or n_users,]
         Numpy array containing precision@k scores for each user. If there are
         no interactions for a given user the returned precision will be 0.
    """

    ranks = model.predict_rank(test_interactions,
                               train_interactions=train_interactions,
                               user_features=user_features,
                               item_features=item_features,
                               num_threads=num_threads,
                               check_intersections=check_intersections,
                               )

    ranks.data = np.less(ranks.data, k, ranks.data)

    precision = np.squeeze(np.array(ranks.sum(axis=1))) / k

    if not preserve_rows:
        precision = precision[test_interactions.getnnz(axis=1) > 0]

    return precision

In [None]:
train_precision = precision_at_k_feat(model, sparse_user_item, 
                                 user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                                 item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                                 k=5).mean()

train_precision

0.047939174

4.5.Test precision

In [None]:
test_user_item_matrix.shape

(2041, 21806)

In [None]:
test_precision = precision_at_k_feat(model, csr_matrix(test_user_item_matrix).tocsr(), 
                                 user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                                 item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                                 k=5).mean()

test_precision

0.015384616

## Двухуровневые модели рекомендаций

Split dataset for train, eval, test

In [514]:
VAL_MATCHER_WEEKS = 6
VAL_RANKER_WEEKS = 3

In [515]:
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Recomended_sys/retail_train.csv')

data.columns = [col.lower() for col in data.columns]
data.rename(columns={'household_key': 'user_id',
                    'product_id': 'item_id'},
           inplace=True)

In [516]:
# берем данные для тренировки matching модели
data_train_matcher = data[data['week_no'] < data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)]

# берем данные для валидации matching модели
data_val_matcher = data[(data['week_no'] >= data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)) &
                      (data['week_no'] < data['week_no'].max() - (VAL_RANKER_WEEKS))]


# берем данные для тренировки ranking модели
data_train_ranker = data_val_matcher.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться

# берем данные для теста ranking, matching модели
data_val_ranker = data[data['week_no'] >= data['week_no'].max() - VAL_RANKER_WEEKS]

# сделаем объединенный сет данных для первого уровня (матчинга)
df_join_train_matcher = pd.concat([data_train_matcher, data_val_matcher])

In [517]:
print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (2108779, 12) Users: 2498 Items: 83685
val_matcher
Shape: (169711, 12) Users: 2154 Items: 27649
train_ranker
Shape: (169711, 12) Users: 2154 Items: 27649
val_ranker
Shape: (118314, 12) Users: 2042 Items: 24329


In [518]:
data_train_matcher.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


Prefilter items

In [519]:
def prefilter_items(data, take_n_popular):
    popularity = data_train_matcher.groupby('item_id')['quantity'].sum().reset_index()
    popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)
    top_5000 = popularity.sort_values('n_sold', ascending=False).head(take_n_popular).item_id.tolist()
    other_items = data_train_matcher.loc[~data_train_matcher['item_id'].isin(top_5000)]
    b = other_items['item_id'].tolist()
    data_train_matcher_filtered = data_train_matcher.query('item_id not in @b')
    return data_train_matcher_filtered

In [520]:
n_items_before = data_train_matcher['item_id'].nunique()

data_train_matcher = prefilter_items(data_train_matcher, take_n_popular=5000)

n_items_after = data_train_matcher['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 5000


Make cold-start to warm-start

In [521]:
# ищем общих пользователей
common_users = data_train_matcher.user_id.values

data_val_matcher = data_val_matcher[data_val_matcher.user_id.isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker.user_id.isin(common_users)]
data_val_ranker = data_val_ranker[data_val_ranker.user_id.isin(common_users)]

print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (1345359, 12) Users: 2484 Items: 5000
val_matcher
Shape: (169206, 12) Users: 2145 Items: 27240
train_ranker
Shape: (169206, 12) Users: 2145 Items: 27240
val_ranker
Shape: (118012, 12) Users: 2036 Items: 24100


Init/train recommender

In [522]:
result2 = data_val_matcher.groupby('user_id')['item_id'].unique().reset_index()
result2.columns=['user_id', 'actual']
result2.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


In [523]:
def _prepare_matrix(data):
    user_item_matrix = pd.pivot_table(data,
                                      index='user_id',
                                      columns='item_id',
                                      values='quantity',  # Можно пробовать другие варианты
                                      aggfunc='count',
                                      fill_value=0
                                      )

    user_item_matrix = user_item_matrix.astype(float)  # необходимый тип матрицы для implicit
    sparse_user_item = csr_matrix(user_item_matrix).tocsr() # переведем в формат sparse matrix

    return user_item_matrix, sparse_user_item

In [524]:
user_item_matrix, sparse_user_item =_prepare_matrix(data_train_matcher)

In [525]:
def _prepare_dicts(user_item_matrix):
        """Подготавливает вспомогательные словари"""

        userids = user_item_matrix.index.values
        itemids = user_item_matrix.columns.values

        matrix_userids = np.arange(len(userids))
        matrix_itemids = np.arange(len(itemids))

        id_to_itemid = dict(zip(matrix_itemids, itemids))
        id_to_userid = dict(zip(matrix_userids, userids))

        itemid_to_id = dict(zip(itemids, matrix_itemids))
        userid_to_id = dict(zip(userids, matrix_userids))

        return id_to_itemid, id_to_userid, itemid_to_id, userid_to_id

In [526]:
id_to_itemid, id_to_userid, itemid_to_id, userid_to_id = _prepare_dicts(user_item_matrix)

In [527]:
model = AlternatingLeastSquares(factors=55, 
                                regularization=0.00533199511117114,
                                iterations=34, 
                                calculate_training_loss=True, 
                                num_threads=4)

model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
          show_progress=True)

recs = model.recommend(userid=userid_to_id[2],  # userid - id от 0 до N
                        user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                        N=5, # кол-во рекомендаций 
                        filter_already_liked_items=False, 
                        filter_items=None, 
                        recalculate_user=True)



HBox(children=(FloatProgress(value=0.0, max=34.0), HTML(value='')))




In [528]:
def precision_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    bought_list = bought_list  # Тут нет [:k] !!
    recommended_list = recommended_list[:k]
    
    flags = np.isin(bought_list, recommended_list)
    
    precision = flags.sum() / len(recommended_list)
    
    
    return precision

In [529]:
result2['als'] = result2['user_id'].apply(lambda x: get_recommendations(x, model=model, N=5))
result2.apply(lambda row: precision_at_k(row['als'], row['actual']), axis=1).mean()

0.23710955710955367

Own recommender

In [530]:
## """Обучает модель, которая рекомендует товары, среди товаров, купленных юзером"""
def fit_own_recommender(user_item_matrix):
    own_recommender = ItemItemRecommender(K=1, num_threads=4)
    own_recommender.fit(csr_matrix(user_item_matrix).T.tocsr())

    return own_recommender

In [531]:
own_recommender = ItemItemRecommender(K=1, num_threads=4)

In [532]:
own_recommender.fit(csr_matrix(user_item_matrix).T.tocsr())

HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))




In [533]:
result2['own'] = result2['user_id'].apply(lambda x: get_recommendations(x, model=own_recommender, N=5))
result2.apply(lambda row: precision_at_k(row['own'], row['actual']), axis=1).mean()

0.3396736596736598

In [534]:
result2.head(2)

Unnamed: 0,user_id,actual,als,own
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[1005186, 1029743, 1100972, 1033142, 832678]","[1082185, 995242, 1029743, 1005186, 6534178]"
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[1106523, 1133018, 5569230, 1082185, 8090521]","[1082185, 1106523, 1133018, 951590, 5569230]"


 получим 50 *кандидатов*

In [535]:
ACTUAL_COL = 'actual'

In [536]:
result_eval_matcher = data_val_matcher.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_matcher.columns=[USER_COL, ACTUAL_COL]
result_eval_matcher.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


In [537]:
# N = Neighbors
N_PREDICT = 50 

In [538]:
%%time

result_eval_matcher['own_rec'] = result_eval_matcher[USER_COL].apply(lambda x: get_recommendations(x, model=own_recommender, N=N_PREDICT))

result_eval_matcher['als_rec'] = result_eval_matcher[USER_COL].apply(lambda x: get_recommendations(x, model = model, N=N_PREDICT))

CPU times: user 22.2 s, sys: 17.6 s, total: 39.8 s
Wall time: 20.3 s


In [539]:
result_eval_matcher.head(2)

Unnamed: 0,user_id,actual,own_rec,als_rec
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[1082185, 995242, 1029743, 1005186, 6534178, 9...","[1005186, 1029743, 1100972, 1033142, 832678, 9..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[1082185, 1106523, 1133018, 951590, 5569230, 8...","[1106523, 1133018, 5569230, 1082185, 8090521, ..."


In [540]:
def calc_precision(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: precision_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

In [379]:
def calc_recall(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: recall_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

In [541]:
TOPK_PRECISION = 5

In [542]:
sorted(calc_precision(result_eval_matcher, TOPK_PRECISION), key=lambda x: x[1],reverse=True)

[('own_rec', 0.3396736596736598), ('als_rec', 0.23710955710955367)]

## Лучшая метрика precision@5 показывает модель ItemItemRecommender, которая рекомендует товары, купленные user в сочетании с фильтром топ-5000 самых популярных товаров. На тестовом датасете  метрика [('own_rec', 0.3396736596736598)]. 
Модель ранжирования при добавлении дополнительных фичей показывает худший результат и на трейне и на тесте:'reranked_own_rec', 0.2048603839441497)


## Ranking part

Обучаем модель 2-ого уровня на выбранных кандидатах

Подготовка данных для трейна

In [543]:
# взяли пользователей из трейна для ранжирования
df_match_candidates = pd.DataFrame(data_train_ranker[USER_COL].unique())
df_match_candidates.columns = [USER_COL]
df_match_candidates.head()

Unnamed: 0,user_id
0,2070
1,2021
2,1753
3,2120
4,1346


In [544]:
# собираем кандитатов с первого этапа (matcher)
df_match_candidates['candidates'] = df_match_candidates[USER_COL].apply(lambda x: get_own_recommendations(x, own_recommender, N=N_PREDICT))

In [545]:
df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items.name = 'item_id'

In [546]:
df_match_candidates = df_match_candidates.drop('candidates', axis=1).join(df_items)

In [547]:
df_match_candidates.head(4)

Unnamed: 0,user_id,item_id
0,2070,1082185.0
0,2070,1085604.0
0,2070,6534178.0
0,2070,1029743.0


Создаем трейн сет для ранжирования с учетом кандидатов с этапа 1

In [548]:
# dask dataframe

df_ranker_train = data_train_ranker[[USER_COL, ITEM_COL]].copy()
df_ranker_train['target'] = 1  # тут только покупки 

df_ranker_train = df_match_candidates.merge(df_ranker_train, on=[USER_COL, ITEM_COL], how='left')

df_ranker_train['target'].fillna(0, inplace= True)

In [549]:
df_ranker_train.target.value_counts()

0.0    25294
1.0    11050
Name: target, dtype: int64

In [550]:
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target
0,2070,1082185.0,1.0
1,2070,1085604.0,1.0


Подготавливаем фичи для обучения модели

In [551]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [552]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [553]:
df_ranker_train = df_ranker_train.merge(item_features, on='item_id', how='left')
df_ranker_train = df_ranker_train.merge(user_features, on='user_id', how='left')

df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,1082185.0,1.0,2,PRODUCE,National,TROPICAL FRUIT,BANANAS,40 LB,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,1085604.0,1.0,103,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


In [555]:
X_train = df_ranker_train.drop('target', axis=1)
y_train = df_ranker_train[['target']]

In [556]:
cat_feats = X_train.columns[2:].tolist()
X_train[cat_feats] = X_train[cat_feats].astype('category')

cat_feats

['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc']

#Обучение модели ранжирования

In [557]:
!pip install lightgbm



In [558]:
import lightgbm as lgb
from  lightgbm import LGBMClassifier

In [468]:
lgb = LGBMClassifier(objective='binary',
                     max_depth=8,
                     n_estimators=300,
                     learning_rate=0.05,
                     categorical_column=cat_feats)

lgb.fit(X_train, y_train)

train_preds = lgb.predict_proba(X_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [399]:
df_ranker_predict = df_ranker_train.copy()

In [400]:
df_ranker_predict['proba_item_purchase'] = train_preds[:,1]

In [401]:
df_ranker_predict.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,proba_item_purchase
0,2070,1082185.0,1.0,2,PRODUCE,National,TROPICAL FRUIT,BANANAS,40 LB,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0.647701
1,2070,1085604.0,1.0,103,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0.162541


Мы обучили модель ранжирования на покупках из сета data_train_ranker и на кандитатах от own_recommendations, что является тренировочным сетом, и теперь наша задача предсказать и оценить именно на тестовом сете

Evaluation on test dataset

In [402]:
result_eval_ranker = data_val_ranker.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_ranker.columns=[USER_COL, ACTUAL_COL]
result_eval_ranker.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


Eval matching on test dataset

In [403]:
%%time
result_eval_ranker['own_rec'] = result_eval_ranker[USER_COL].apply(lambda x: get_own_recommendations(x, own_recommender, N=N_PREDICT))

CPU times: user 55.3 ms, sys: 2.11 ms, total: 57.4 ms
Wall time: 63.1 ms


In [404]:
TOPK_PRECISION = 5

In [405]:
# померяем precision только модели матчинга, чтобы понимать влияение ранжирования на метрики

sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True)

[('own_rec', 0.2932056319580868)]

Eval re-ranked matched result on test dataset

In [406]:
def rerank(user_id):
    return df_ranker_predict[df_ranker_predict[USER_COL]==user_id].sort_values('proba_item_purchase', ascending=False).head(5).item_id.tolist()

In [407]:
result_eval_ranker['reranked_own_rec'] = result_eval_ranker[USER_COL].apply(lambda user_id: rerank(user_id))

In [408]:
print(*sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('own_rec', 0.2932056319580868)
('reranked_own_rec', 0.20056719022687258)


  # This is added back by InteractiveShellApp.init_path()


#добавим фичи

In [559]:
data_transaction = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Recomended_sys/transaction_data.csv')
# column processing
data_transaction.columns = [col.lower() for col in data_transaction.columns]

data_transaction.rename(columns={'household_key': 'user_id', 'product_id': 'item_id'}, inplace=True)

In [503]:
data_transaction.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631.0,1.0,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631.0,1.0,0.0,0.0


Чтобы считать поведенческие фичи, нужно учесть все данные что были до data_val_ranker

In [504]:
data_transaction = data_transaction[data_transaction['week_no'] < data_transaction['week_no'].max() - (VAL_RANKER_WEEKS)]

In [560]:
df_ranker_train = df_ranker_train.merge(data_transaction.groupby(by=ITEM_COL).agg('sales_value').sum().rename('total_item_sales_value'), how='left',on=ITEM_COL)


In [506]:
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,total_item_sales_value_x,item_freq_per_basket,item_freq,total_user_sales_value,item_quantity_per_week,total_quantity_value,user_freq,total_item_sales_value_y
0,2070,1082185.0,1.0,2,PRODUCE,National,TROPICAL FRUIT,BANANAS,40 LB,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,11570.65,0.10655,26127,5754.86,297.846154,27104,1996,11570.65
1,2070,1085604.0,1.0,103,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,1899.87,0.011741,2879,5754.86,35.78022,3256,1996,1899.87


In [507]:
X_train = df_ranker_train.drop('target', axis=1)
y_train = df_ranker_train[['target']]

In [508]:
cat_feats = X_train.columns[2:].tolist()
X_train[cat_feats] = X_train[cat_feats].astype('category')

cat_feats

['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc',
 'total_item_sales_value_x',
 'item_freq_per_basket',
 'item_freq',
 'total_user_sales_value',
 'item_quantity_per_week',
 'total_quantity_value',
 'user_freq',
 'total_item_sales_value_y']

In [509]:
lgb = LGBMClassifier(objective='binary',
                     max_depth=10,
                     n_estimators=500,
                     learning_rate=0.05,
                     categorical_column=cat_feats)

lgb.fit(X_train, y_train)

train_preds = lgb.predict_proba(X_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [417]:
df_ranker_predict = df_ranker_train.copy()
df_ranker_predict['proba_item_purchase'] = train_preds[:,1]

In [418]:
result_eval_ranker = data_val_ranker.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_ranker.columns=[USER_COL, ACTUAL_COL]
result_eval_ranker.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [419]:
%%time
result_eval_ranker['own_rec'] = result_eval_ranker[USER_COL].apply(lambda x: get_own_recommendations(x, own_recommender, N=N_PREDICT))

CPU times: user 52.5 ms, sys: 1.15 ms, total: 53.6 ms
Wall time: 53.9 ms


In [420]:
result_eval_ranker['reranked_own_rec'] = result_eval_ranker[USER_COL].apply(lambda user_id: rerank(user_id))

In [421]:
print(*sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('own_rec', 0.2932056319580868)
('reranked_own_rec', 0.20067190226875742)


  # This is added back by InteractiveShellApp.init_path()


Дoбавим еще фичи 

In [561]:
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=ITEM_COL)
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq'), how='left',on=ITEM_COL)
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg('sales_value').sum().rename('total_user_sales_value'), how='left',on=USER_COL)
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_week')/df_join_train_matcher.week_no.nunique(), how='left',on=ITEM_COL)
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('total_quantity_value'), how='left',on=ITEM_COL)
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq'), how='left',on=USER_COL)

In [562]:
X_train = df_ranker_train.drop('target', axis=1)
y_train = df_ranker_train[['target']]
cat_feats = X_train.columns[2:].tolist()


cat_feats

['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc',
 'total_item_sales_value',
 'item_freq_per_basket',
 'item_freq',
 'total_user_sales_value',
 'item_quantity_per_week',
 'total_quantity_value',
 'user_freq']

In [564]:
X_train[cat_feats] = X_train[cat_feats].astype('category')

In [513]:
X_train.shape

(36344, 29)

In [565]:
lgb = LGBMClassifier(objective='binary',
                     max_depth=10,
                     n_estimators=500,
                     learning_rate=0.05,
                     categorical_column=cat_feats)

lgb.fit(X_train, y_train)

train_preds = lgb.predict_proba(X_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [427]:
df_ranker_predict = df_ranker_train.copy()
df_ranker_predict['proba_item_purchase'] = train_preds[:,1]

In [428]:
result_eval_ranker = data_val_ranker.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_ranker.columns=[USER_COL, ACTUAL_COL]
result_eval_ranker.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [429]:
result_eval_ranker['reranked_own_rec'] = result_eval_ranker[USER_COL].apply(lambda user_id: rerank(user_id))

In [430]:
print(*sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec', 0.2048603839441497)


  # This is added back by InteractiveShellApp.init_path()


## Сделаем рекомендации для каждого user на тестовом датасете

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [482]:
df_test =pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Recomended_sys/retail_test1.csv')

In [483]:
df_test.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,0.0,0.0
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,0.0,0.0


In [433]:
result_test = df_test.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_test.columns=[USER_COL, ACTUAL_COL]
result_test.head(2)

Unnamed: 0,user_id,actual
0,1,"[880007, 883616, 931136, 938004, 940947, 94726..."
1,2,"[820165, 820291, 826784, 826835, 829009, 85784..."


In [434]:
test_user_item_matrix, test_sparse_user_item =_prepare_matrix(df_test)

In [435]:
id_to_itemid, id_to_userid, itemid_to_id, userid_to_id = _prepare_dicts(test_user_item_matrix)

Двухуровневая модель Itemitem + топ 5000 по популярности + модель ранжирования Lightgbm

In [442]:
result_test['own'] = result_test['user_id'].apply(lambda x: get_recommendations(x, model=own_recommender, N=N_PREDICT))

In [443]:
result_test.head(5)

Unnamed: 0,user_id,actual,own
0,1,"[880007, 883616, 931136, 938004, 940947, 94726...","[903239, 874149, 886536, 878285, 925862, 87150..."
1,2,"[820165, 820291, 826784, 826835, 829009, 85784...","[903239, 909672, 917660, 861706, 920978, 92992..."
2,3,"[827683, 908531, 989069, 1071377, 1080155, 109...","[903239, 909672, 861706, 894439, 925862, 90543..."
3,6,"[956902, 960791, 1037863, 1119051, 1137688, 84...","[903239, 886536, 838186, 864700, 900886, 92097..."
4,7,"[847270, 855557, 859987, 863407, 895454, 90663...","[903239, 886536, 915621, 925862, 919535, 89744..."


In [447]:
# померяем precision только модели матчинга, чтобы понимать влияение ранжирования на метрики

sorted(calc_precision(result_test, TOPK_PRECISION), key=lambda x: x[1],reverse=True)

[('own', 0.003819628647214856)]

In [484]:
# взяли пользователей из трейна для ранжирования
df_match_candidates = pd.DataFrame(df_test[USER_COL].unique())
df_match_candidates.columns = [USER_COL]
df_match_candidates.head()

Unnamed: 0,user_id
0,1340
1,588
2,2070
3,1602
4,447


In [485]:
# собираем кандитатов с первого этапа (matcher)
df_match_candidates['candidates'] = df_match_candidates[USER_COL].apply(lambda x: get_own_recommendations(x, own_recommender, N=N_PREDICT))

In [486]:
df_match_candidates = df_match_candidates.drop('candidates', axis=1).join(df_items)
df_match_candidates

Unnamed: 0,user_id,item_id
0,1340,1082185.0
0,1340,1085604.0
0,1340,6534178.0
0,1340,1029743.0
0,1340,995242.0
...,...,...
1884,247,1138292.0
1884,247,1030362.0
1884,247,1033615.0
1884,247,1132956.0


In [487]:
df_test = df_test[[USER_COL, ITEM_COL]].copy()
df_test = df_match_candidates.merge(df_test, on=[USER_COL, ITEM_COL], how='left')

In [489]:
df_test = df_test.merge(item_features, on='item_id', how='left')
df_test = df_test.merge(user_features, on='user_id', how='left')

In [490]:
df_test.shape

(29195, 15)

In [491]:
## Добавим фичи как трейне
df_test = df_test.merge(data_transaction.groupby(by=ITEM_COL).agg('sales_value').sum().rename('total_item_sales_value'), how='left',on=ITEM_COL)
df_test = df_test.merge(data_transaction.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=ITEM_COL)
df_test = df_test.merge(data_transaction.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq'), how='left',on=ITEM_COL)
df_test = df_test.merge(data_transaction.groupby(by=USER_COL).agg('sales_value').sum().rename('total_user_sales_value'), how='left',on=USER_COL)
df_test = df_test.merge(data_transaction.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_week')/df_join_train_matcher.week_no.nunique(), how='left',on=ITEM_COL)
df_test = df_test.merge(data_transaction.groupby(by=ITEM_COL).agg('quantity').sum().rename('total_quantity_value'), how='left',on=ITEM_COL)
df_test = df_test.merge(data_transaction.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq'), how='left',on=USER_COL)

In [492]:
df_test.shape

(29195, 22)

In [497]:
cat_feats = df_test.columns[2:].tolist()
cat_feats

['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc',
 'total_item_sales_value',
 'item_freq_per_basket',
 'item_freq',
 'total_user_sales_value',
 'item_quantity_per_week',
 'total_quantity_value',
 'user_freq']

In [566]:
df_test[cat_feats] = df_test[cat_feats].astype('category')

In [567]:
test_predict = lgb.predict_proba(df_test)

In [569]:
df_test_predict = df_test.copy()
df_test_predict['proba_item_purchase'] = test_predict[:,1]

In [571]:
df_test_predict.head(2)

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,total_item_sales_value,item_freq_per_basket,item_freq,total_user_sales_value,item_quantity_per_week,total_quantity_value,user_freq,proba_item_purchase
0,1340,1082185.0,2,PRODUCE,National,TROPICAL FRUIT,BANANAS,40 LB,,,,,,,,11570.65,0.048864,11982,135.32,136.56044,12427,65.0,0.199588
1,1340,1085604.0,103,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,,,,,,,,1899.87,0.00635,1557,135.32,19.384615,1764,65.0,0.196608


In [572]:
result_test['reranked_own_rec'] = result_test[USER_COL].apply(lambda user_id: rerank(user_id))

In [573]:
print(*sorted(calc_precision(result_test, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec', 0.18836158192090097)
('own', 0.003819628647214856)


  # This is added back by InteractiveShellApp.init_path()


In [575]:
result_test.head(2)

Unnamed: 0,user_id,actual,own,reranked_own_rec
0,1,"[880007, 883616, 931136, 938004, 940947, 94726...","[903239, 874149, 886536, 878285, 925862, 87150...","[1082185.0, 1082185.0, 1082185.0, 995242.0, 99..."
1,2,"[820165, 820291, 826784, 826835, 829009, 85784...","[903239, 909672, 917660, 861706, 920978, 92992...","[1106523.0, 1082185.0, 1133018.0, 1053690.0, 1..."


In [578]:
result_test.to_csv('reranked_own_rec.csv', index=False)