## Рекомендательные системы
### Курсовой проект

### Import libs

In [1]:
import pandas as pd
import numpy as np
import pickle

# from catboost import CatBoostClassifier, Pool, CatBoost

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys

from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

import warnings
warnings.filterwarnings('ignore')

from tqdm.notebook import tqdm

### Read data

In [2]:
PATH_DATA = "./data"

data = pd.read_csv(os.path.join(PATH_DATA,'retail_train.csv'))
item_features = pd.read_csv(os.path.join(PATH_DATA,'product.csv'))
user_features = pd.read_csv(os.path.join(PATH_DATA,'hh_demographic.csv'))

### Set global const

In [3]:
ITEM_COL = 'item_id'
USER_COL = 'user_id'
ACTUAL_COL = 'actual'

### Process features dataset

In [4]:
# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': ITEM_COL}, inplace=True)
user_features.rename(columns={'household_key': USER_COL }, inplace=True)

### Split dataset for train, eval, test

In [5]:
# Важна схема обучения и валидации!
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)

VAL_MATCHER_WEEKS = 6
VAL_RANKER_WEEKS = 3

# берем данные для тренировки matching модели
data_train_matcher = data[data['week_no'] < data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)]

# берем данные для валидации matching модели
data_val_matcher = data[(data['week_no'] >= data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)) &
                      (data['week_no'] < data['week_no'].max() - (VAL_RANKER_WEEKS))]


# берем данные для тренировки ranking модели
data_train_ranker = data_val_matcher.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться

# берем данные для теста ranking, matching модели
data_val_ranker = data[data['week_no'] >= data['week_no'].max() - VAL_RANKER_WEEKS]

# сделаем объединенный сет данных для первого уровня (матчинга)
df_join_train_matcher = pd.concat([data_train_matcher, data_val_matcher])

In [6]:
def print_stats_data(df_data, name_df):
    print(name_df)
    print(f"Shape: {df_data.shape} Users: {df_data[USER_COL].nunique()} Items: {df_data[ITEM_COL].nunique()}")

In [7]:
print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')
# выведем разброс по пользователям и товарам

train_matcher
Shape: (2108779, 12) Users: 2498 Items: 83685
val_matcher
Shape: (169711, 12) Users: 2154 Items: 27649
train_ranker
Shape: (169711, 12) Users: 2154 Items: 27649
val_ranker
Shape: (118314, 12) Users: 2042 Items: 24329


### Prefilter items

In [76]:
# goods_popularity = data_train_matcher.groupby('item_id')['quantity'].sum().reset_index()
# goods_popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)

# goods_popularity.sort_values('n_sold', ascending=False)

In [77]:
# top_sold = goods_popularity.loc[goods_popularity["n_sold"] >= 10000]
# top_sold = pd.merge(top_sold, item_features, left_on=ITEM_COL, right_on=ITEM_COL)
# top_sold.sort_values('n_sold', ascending=False).head()

In [78]:
# top_sold = goods_popularity.loc[(goods_popularity["n_sold"] >= 5000) & (goods_popularity["n_sold"] < 10000)]
# top_sold = pd.merge(top_sold, item_features, left_on=ITEM_COL, right_on=ITEM_COL)
# top_sold.sort_values('n_sold', ascending=False)

In [11]:
# Больше всего в колличественном выражении продется бензин, и популярны товары первой необходимости - молочка, хлеб.
# По метрике на тестовом датасете выявил что оптимально поставить take_n_popular=2500

n_items_before = data_train_matcher['item_id'].nunique()

data_train_matcher = prefilter_items(data_train_matcher, item_features=item_features, take_n_popular=2500)

n_items_after = data_train_matcher['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 2501


### Make cold-start to warm-start

In [12]:
# ищем общих пользователей
common_users = data_train_matcher.user_id.values

data_val_matcher = data_val_matcher[data_val_matcher.user_id.isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker.user_id.isin(common_users)]
data_val_ranker = data_val_ranker[data_val_ranker.user_id.isin(common_users)]

print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (861404, 13) Users: 2495 Items: 2501
val_matcher
Shape: (169615, 12) Users: 2151 Items: 27644
train_ranker
Shape: (169615, 12) Users: 2151 Items: 27644
val_ranker
Shape: (118282, 12) Users: 2040 Items: 24325


### Init/train recommender

In [13]:
recommender = MainRecommender(data_train_matcher)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/2501 [00:00<?, ?it/s]

### Eval recall of matching

In [14]:
result_eval_matcher = data_val_matcher.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_matcher.columns=[USER_COL, ACTUAL_COL]
result_eval_matcher.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


In [15]:
# N = Neighbors
N_PREDICT = 500

methods = [
    recommender.get_similar_users_recommendation,
    recommender.get_similar_items_recommendation,
    recommender.get_own_recommendations,
    recommender.get_als_recommendations,
    ]

In [16]:
# %%time

# for method in tqdm(methods):
#     print(method.__name__)
#     result_eval_matcher[method.__name__] = result_eval_matcher[USER_COL].apply(lambda x: method(x, N=N_PREDICT))

In [17]:
precision_5, recall_50  = list(), list()

for col in result_eval_matcher.iloc[:,2:]:
    precision_5.append(result_eval_matcher.apply(lambda row: precision_at_k(row[col], row['actual'], k=5), axis=1).mean())
    recall_50.append(result_eval_matcher.apply(lambda row: recall_at_k(row[col], row['actual'], k=50), axis=1).mean())

In [18]:
pd.DataFrame({'Algoritm': result_eval_matcher.columns[2:],
              'Precision@5': precision_5,
              'Recall@50': recall_50}
            )

Unnamed: 0,Algoritm,Precision@5,Recall@50



* N Algoritm	Precision@5	Recall@50
* 0	get_similar_users_recommendation	0.020177	0.011280
* 1	get_similar_items_recommendation	0.058670	0.037023
* 2	get_own_recommendations	0.194049	0.072068
* 3	get_als_recommendations	0.099024	0.047857


Наилучший результат показывает способ get_own_recommendations

## Ranking part

### Обучаем модель 2-ого уровня на выбранных кандидатах

- Обучаем на data_train_ranking
- Обучаем *только* на выбранных кандидатах
- Я *для примера* сгенерирую топ-50 кадидиатов через get_own_recommendations
- (!) Если юзер купил < 50 товаров, то get_own_recommendations дополнит рекоммендации топ-популярными

-- давние покупки -- | -- 6 недель -- | -- 3 недель -- 

### Train data preparing

In [19]:
# взяли пользователей из трейна для ранжирования
df_match_candidates = pd.DataFrame(data_train_ranker[USER_COL].unique())
df_match_candidates.columns = [USER_COL]

# собираем кандитатов с первого этапа (matcher), N_PREDICT = 200
df_match_candidates['candidates'] = df_match_candidates[USER_COL].apply(
                                                                lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))
df_match_candidates.head(2)

Unnamed: 0,user_id,candidates
0,2070,"[1105426, 1092937, 917033, 10198378, 1008814, ..."
1,2021,"[950935, 1119454, 835578, 863762, 1013928, 653..."


In [20]:
# разворачиваем товары
df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items.name = ITEM_COL

df_match_candidates = df_match_candidates.drop('candidates', axis=1).join(df_items)
df_match_candidates.head(4)

Unnamed: 0,user_id,item_id
0,2070,1105426
0,2070,1092937
0,2070,917033
0,2070,10198378


### Check warm start

In [21]:
print_stats_data(df_match_candidates, 'match_candidates')

match_candidates
Shape: (1075500, 2) Users: 2151 Items: 2478


### Создаем трейн сет для ранжирования с учетом кандидатов с этапа 1 

In [22]:
df_ranker_train = data_train_ranker[[USER_COL, ITEM_COL]].copy()
df_ranker_train['target'] = 1  # тут только покупки 

# Не хватает нулей в датасете, поэтому добавляем наших кандитатов в качество нулей
df_ranker_train = df_match_candidates.merge(df_ranker_train, on=[USER_COL, ITEM_COL], how='left')

# чистим дубликаты
df_ranker_train = df_ranker_train.drop_duplicates(subset=[USER_COL, ITEM_COL])
df_ranker_train['target'].fillna(0, inplace= True)

In [23]:
df_ranker_train.target.value_counts()

0.0    971335
1.0     23801
Name: target, dtype: int64

In [24]:
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target
0,2070,1105426,0.0
1,2070,1092937,1.0


In [25]:
# (!) На каждого юзера 200 item_id-кандидатов

df_ranker_train['target'].mean()

0.023917333912148692

### Подготавливаем фичи для обучения модели

In [26]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [27]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


### user_id feats

In [28]:
# Средний чек
# +

user_average_bill = df_join_train_matcher.groupby([USER_COL, 'basket_id'])[['sales_value']].mean().reset_index().groupby(USER_COL)['sales_value'].mean()
user_average_bill.name = 'user_average_bill'

user_features = user_features.merge(user_average_bill, how='left', on=USER_COL, suffixes=(False, False))

In [29]:
# Количество покупок в месяц
# +

df_join_train_matcher['month'] = df_join_train_matcher['day'].apply(lambda x: (x+30)//30)

user_avg_month_purchase = df_join_train_matcher.groupby([USER_COL]).agg({ 'month': ['min', 'max'] }).reset_index()
user_avg_month_purchase['month_purchase'] = user_avg_month_purchase['month','max'] - user_avg_month_purchase['month','min']
user_avg_month_purchase.drop([('month','min'), ('month','max')], axis=1, inplace=True)
user_avg_month_purchase.set_index(USER_COL, inplace = True)

# Количество покупок / среднее количество покупок в месяц
user_frequency_purchase = df_join_train_matcher.groupby([USER_COL])['basket_id'].count() / user_avg_month_purchase['month_purchase']
user_frequency_purchase.name = 'user_frequency_purchase'

user_features = user_features.merge(user_frequency_purchase, how='left', on=USER_COL, suffixes=(False, False))

In [30]:
# # Cредняя стоимость покупок в месяц
# -

# user_avg_value_month = df_join_train_matcher.groupby([USER_COL])['sales_value'].sum().rename('user_avg_value_month') / user_avg_month_purchase['month_purchase']
# user_avg_value_month.name = 'user_avg_value_month'

# user_features = user_features.merge(user_avg_value_month, how='left', on=USER_COL, suffixes=(False, False))

### item_id feats

In [31]:
# Среднее кол-во покупок item_id в неделю
# +

item_per_week = df_join_train_matcher.groupby([ITEM_COL])['quantity'].sum().rename('item_per_week') / df_join_train_matcher['week_no'].max() 

item_features = item_features.merge(item_per_week, how='left', on=ITEM_COL, suffixes=(False, False))

In [32]:
# Средняя частота item в корзине
# +

item_freq_per_basket = df_join_train_matcher.groupby([ITEM_COL]).agg(USER_COL).count().rename('item_freq_per_basket')/df_join_train_matcher.basket_id.nunique()

item_features = item_features.merge(item_freq_per_basket, how='left', on=ITEM_COL, suffixes=(False, False))

In [33]:
# # Среднее кол-во item в корзине
# item_quantity_per_basket = df_join_train_matcher.groupby([ITEM_COL]).agg('quantity').sum().rename('item_quantity_per_basket')/df_join_train_matcher.basket_id.nunique()

# item_features = item_features.merge(item_quantity_per_basket, how='left', on=ITEM_COL, suffixes=(False, False))

In [34]:
# Цена (Можно посчитать из retil_train.csv) - цена есть в таблице взаимодействий
# +

item_price = df_join_train_matcher.groupby([ITEM_COL]).agg({'quantity': 'sum', 'sales_value': 'sum'})
item_price = item_price['sales_value'] / item_price['quantity']
item_price.name = 'item_price'

item_price.head()

item_features = item_features.merge(item_price, how='left', on=ITEM_COL, suffixes=(False, False))

In [35]:
# Среднее кол-во покупок 1 товара в категории в неделю
# +

data_categ = pd.merge(df_join_train_matcher, item_features, how='inner', on=ITEM_COL)

categ_freq_ = data_categ.groupby(['department'])['quantity'].sum()
categ_item_freq_ = data_categ.groupby(['department', ITEM_COL])['quantity'].sum().reset_index()

categ_freq = pd.merge(categ_item_freq_, categ_freq_, on='department')
categ_freq['category_freq'] = categ_freq['quantity_y'] / df_join_train_matcher['week_no'].max() / categ_freq['quantity_x']
categ_freq.set_index(ITEM_COL, inplace=True)
category_freq = categ_freq['category_freq']

item_features = item_features.merge(category_freq, how='left', on=ITEM_COL, suffixes=(False, False))

In [36]:
# (Кол-во покупок в неделю) / (Среднее кол-во покупок 1 товара в категории в неделю)
# +

freq_ratio_1 = item_per_week / category_freq
freq_ratio_1.name = 'freq_ratio_1'

item_features = item_features.merge(freq_ratio_1, how='left', on=ITEM_COL, suffixes=(False, False))

In [37]:
# Цена / Средняя цена товара в категории
# +

data_categ['price'] = data_categ['sales_value']/data_categ['quantity']
avg_categ_price = data_categ.groupby(['commodity_desc'])['price'].mean()
data_categ_avg = pd.merge(data_categ, avg_categ_price, on='commodity_desc', suffixes=('_x', '_y'))
data_categ_avg = data_categ_avg.groupby(['item_id']).agg({'price_x': 'mean', 'price_y': 'mean'})

price_diff = data_categ_avg['price_x'] / data_categ_avg['price_y']
price_diff.name = 'price_diff'

item_features = item_features.merge(price_diff, how='left', on=ITEM_COL, suffixes=(False, False))

### user_id - item_id feats

In [38]:
# (Кол-во покупок юзером конкретной категории в неделю) / (Среднее кол-во покупок всеми юзерами конкретной категории в неделю)
# +

categ_user_freq_ = data_categ.groupby(['user_id', 'department'])['quantity'].sum().reset_index()
categ_user_freq_['category_freq'] = categ_user_freq_['quantity'] / data_train_matcher['week_no'].max()

categ_user_freq = pd.merge(categ_user_freq_, categ_freq_, on='department')
categ_user_freq['avg_weekly_freq'] = categ_user_freq['quantity_y'] / data_train_matcher['week_no'].max()
categ_user_freq['freq_ratio_2'] = categ_user_freq['category_freq'] / categ_user_freq['avg_weekly_freq']

user_dep_ratio_2 = categ_user_freq.filter(['user_id', 'department', 'freq_ratio_2'], axis=1)

In [39]:
# (Кол-во покупок юзером конкретной категории в неделю) - (Среднее кол-во покупок всеми юзерами конкретной категории в неделю)
# +

categ_user_freq['freq_ratio_3'] = categ_user_freq['category_freq'] - categ_user_freq['avg_weekly_freq']
user_dep_ratio_3 = categ_user_freq.filter(['user_id', 'department', 'freq_ratio_3'], axis=1)

In [40]:
# час совершения транзакции
# ++

temp_data = data_train_matcher.copy()
temp_data['hour'] = temp_data['trans_time'] // 100
median_sales_hour = temp_data.groupby([USER_COL, ITEM_COL])['hour'].median().reset_index()
median_sales_hour.columns = [USER_COL, ITEM_COL, 'median_sales_hour']

df_ranker_train = df_ranker_train.merge(median_sales_hour, on=[USER_COL, ITEM_COL], how='left', suffixes=(False, False))

In [41]:
# # день недели совершения транзакции
# # -

# temp_data['weekday'] = temp_data['day'] % 7
# median_weekday = temp_data.groupby([USER_COL, ITEM_COL])['weekday'].median().reset_index()
# median_weekday.columns = [USER_COL, ITEM_COL, 'median_weekday']
# df_ranker_train = df_ranker_train.merge(median_weekday, on=[USER_COL, ITEM_COL])

In [42]:
# # cреднее кол-во дней между покупками
# # -

# mean_visits_interval = temp_data.groupby(USER_COL)['day'].nunique().reset_index()
# mean_visits_interval['mean_visits_interval'] = (temp_data.groupby(USER_COL)['day'].max() - temp_data.groupby(USER_COL)['day'].min()) / mean_visits_interval['day']
# df_ranker_train = df_ranker_train.merge(mean_visits_interval[[USER_COL, 'mean_visits_interval']], on=[USER_COL])

In [43]:
# # средний чек корзины клиента
# # -

# mean_check = temp_data.groupby(['user_id', 'basket_id'])['sales_value'].sum().reset_index()
# mean_check = mean_check.groupby('user_id')['sales_value'].mean().reset_index()
# mean_check.columns = ['user_id', 'mean_check']
# df_ranker_train = df_ranker_train.merge(mean_check, on=['user_id'])

In [44]:
# # кол-во магазинов, в которых продавался товар
# # -

# num_stores = temp_data.groupby(['item_id'])['store_id'].nunique().reset_index()
# num_stores.columns = ['item_id', 'num_stores']
# df_ranker_train = df_ranker_train.merge(num_stores, on=['item_id'])

In [45]:
# # кол-во уникальных товаров, купленных клиентом
# # -

# num_items = temp_data.groupby([USER_COL])[ITEM_COL].nunique().reset_index()
# num_items.columns = [USER_COL, 'num_items']
# df_ranker_train = df_ranker_train.merge(num_items, on=[USER_COL])

In [46]:
# # кол-во транзакций клиента
# # -

# n_transactions = temp_data.groupby([USER_COL])[ITEM_COL].count().reset_index()
# n_transactions.columns = [USER_COL, 'n_transactions']
# df_ranker_train = df_ranker_train.merge(n_transactions, on=[USER_COL])

In [47]:
# # mean / max / std кол-ва уникальных товаров в корзине клиента
# # -

# df = temp_data.groupby([USER_COL, 'basket_id'])[ITEM_COL].nunique().reset_index()
# mean_n_items_basket = df.groupby(USER_COL)[ITEM_COL].mean().reset_index()
# mean_n_items_basket.columns = [USER_COL, 'mean_n_items_basket']
# df_ranker_train = df_ranker_train.merge(mean_n_items_basket, on=[USER_COL])

# max_n_items_basket = df.groupby(USER_COL)[ITEM_COL].max().reset_index()
# max_n_items_basket.columns = [USER_COL, 'max_n_items_basket']
# df_ranker_train = df_ranker_train.merge(max_n_items_basket, on=[USER_COL])

# std_n_items_basket = df.groupby(USER_COL)[ITEM_COL].std().reset_index()
# std_n_items_basket.columns = [USER_COL, 'std_n_items_basket']
# df_ranker_train = df_ranker_train.merge(std_n_items_basket, on=[USER_COL])

In [48]:
# # mean / max / std кол-ва уникальных категорий в корзине клиента
# # -

# temp_data = temp_data.merge(item_features[[ITEM_COL, 'commodity_desc']], on=[ITEM_COL])
# df = temp_data.groupby([USER_COL, 'basket_id'])['commodity_desc'].nunique().reset_index()

# mean_n_item_categories_basket = df.groupby(USER_COL)['commodity_desc'].mean().reset_index()
# mean_n_item_categories_basket.columns = [USER_COL, 'mean_n_item_categories_basket']
# df_ranker_train = df_ranker_train.merge(mean_n_item_categories_basket, on=[USER_COL])

# max_n_item_categories_basket = df.groupby(USER_COL)['commodity_desc'].max().reset_index()
# max_n_item_categories_basket.columns = [USER_COL, 'max_n_item_categories_basket']
# df_ranker_train = df_ranker_train.merge(max_n_item_categories_basket, on=[USER_COL])

# std_n_item_categories_basket = df.groupby(USER_COL)['commodity_desc'].std().reset_index()
# std_n_item_categories_basket.columns = [USER_COL, 'std_n_item_categories_basket']
# df_ranker_train = df_ranker_train.merge(std_n_item_categories_basket, on=[USER_COL])

In [49]:
df_ranker_train = df_ranker_train.merge(user_features, on=USER_COL, how='left', suffixes=(False, False))
df_ranker_train = df_ranker_train.merge(item_features, on=ITEM_COL, how='left', suffixes=(False, False))

In [50]:
df_ranker_train = pd.merge(df_ranker_train, user_dep_ratio_2, on=[USER_COL, 'department'], how='left')
df_ranker_train = pd.merge(df_ranker_train, user_dep_ratio_3, on=[USER_COL, 'department'], how='left')

In [51]:
# Также добавляем блок сгенерированных быстрых фичей
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('sales_value').median().rename('avg_item_sales_value'), how='left',on=ITEM_COL)
# +

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg('sales_value').median().rename('avg_user_sales_value'), how='left',on=USER_COL)
# +

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg('sales_value').sum().rename('total_user_sales_value'), how='left',on=USER_COL)
# +

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq'), how='left',on=USER_COL)
# +

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg('quantity').sum().rename('user_quantity_per_week')/df_join_train_matcher.week_no.nunique(), how='left',on=USER_COL)

In [52]:
X_train = df_ranker_train.drop('target', axis=1)
y_train = df_ranker_train[['target']]

In [53]:
cat_feats = X_train.columns[2:].tolist()
X_train[cat_feats] = X_train[cat_feats].astype('category')
cat_feats

['median_sales_hour',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc',
 'user_average_bill',
 'user_frequency_purchase',
 'manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'item_per_week',
 'item_freq_per_basket',
 'item_price',
 'category_freq',
 'freq_ratio_1',
 'price_diff',
 'freq_ratio_2',
 'freq_ratio_3',
 'avg_item_sales_value',
 'avg_user_sales_value',
 'total_user_sales_value',
 'user_freq',
 'user_quantity_per_week']

### Обучение модели ранжирования

In [54]:
# train_pool = Pool(data=X_train, label=y_train, group_id=X_train[USER_COL], cat_features=cat_feats)

# params_cb = {"n_estimators":500,
#              "objective": "YetiRankPairwise",
#              "max_depth": 6,
#              "task_type": "CPU",
#              "eta": 0.1,
#              "verbose": 50,
#              "random_seed": 21,
#              'cat_features': cat_feats,
#             } 

# model_cb = CatBoost(params=params_cb)

In [55]:
# model_cb.fit(train_pool)

In [56]:
# df_ranker_predict = df_ranker_train.copy()

In [57]:
# df_ranker_predict['proba_item_purchase'] = model_cb.predict(X_train, prediction_type='Probability')[:, 1]

In [58]:
lgb = LGBMClassifier(objective='binary',
                     max_depth=100,
                     n_estimators=300,
                     learning_rate=0.1,
                     categorical_column=cat_feats)

lgb.fit(X_train, y_train)

train_preds = lgb.predict_proba(X_train)

In [59]:
df_ranker_predict = df_ranker_train.copy()

In [60]:
df_ranker_predict['proba_item_purchase'] = train_preds[:,1]

### Evaluation on test dataset

In [61]:
result_eval_ranker = data_val_ranker.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_ranker.columns=[USER_COL, ACTUAL_COL]
result_eval_ranker.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


### Eval matching on test dataset

In [62]:
%%time
result_eval_ranker['own_rec'] = result_eval_ranker[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))

CPU times: total: 3.98 s
Wall time: 3.99 s


In [63]:
def calc_precision(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: precision_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

In [64]:
TOPK_PRECISION = 5

In [65]:
# померяем precision только модели матчинга, чтобы понимать влияение ранжирования на метрики

sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True)

[('own_rec', 0.16009803921568627)]

## Eval re-ranked matched result on test dataset
    Вспомним df_match_candidates сет, который был получен own_recommendations на юзерах, набор пользователей мы фиксировали и он одинаков, значи и прогноз одинаков, поэтому мы можем использовать этот датафрейм для переранжирования.

In [66]:
def rerank(user_id):
    return df_ranker_predict[df_ranker_predict[USER_COL]==user_id].sort_values('proba_item_purchase', ascending=False).head(5).item_id.tolist()

In [67]:
result_eval_ranker['reranked_own_rec'] = result_eval_ranker[USER_COL].apply(lambda user_id: rerank(user_id))

In [68]:
# смотрим на метрики выше и сравниваем что с ранжированием и без, добавляем фичи и то же смотрим
# в первом приближении метрики должны расти с использованием второго этапа

print(*sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec', 0.26527415143603134)
('own_rec', 0.16009803921568627)


### Оценка на тесте для выполнения курсового проекта

In [69]:
df_test = pd.read_csv(os.path.join(PATH_DATA, 'retail_test1.csv'))

In [70]:
df_test.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,0.0,0.0
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,0.0,0.0
2,2070,41652857291,664,995242,5,9.1,311,-0.6,46,96,0.0,0.0
3,1602,41665647035,664,827939,1,7.99,334,0.0,1741,96,0.0,0.0
4,1602,41665647035,664,927712,1,0.59,334,-0.4,1741,96,0.0,0.0


In [71]:
result_test = df_test.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_test.columns = [USER_COL, ACTUAL_COL]
result_test.head(2)

Unnamed: 0,user_id,actual
0,1,"[880007, 883616, 931136, 938004, 940947, 94726..."
1,2,"[820165, 820291, 826784, 826835, 829009, 85784..."


In [72]:
result_test = result_test[result_test.user_id.isin(common_users)]

In [73]:
%%time
result_test['own_rec'] = result_test[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))

CPU times: total: 3.64 s
Wall time: 3.64 s


In [74]:
result_test['reranker_own_rec'] = result_test[USER_COL].apply(lambda user_id: rerank(user_id))

In [75]:
print(*sorted(calc_precision(result_test, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('reranker_own_rec', 0.21824324324324326)
('own_rec', 0.1377588953797132)


In [81]:
((0.21824324324324326 / 0.1377588953797132) - 1) * 100

58.42406593177607

## На тестовом датасете получил метрику 0.218, что на 58% выше бейзлайна (0,137)