# Course project


**Основное**
- Дедлайн - 30 сентября 23:59
- Целевая метрика precision@5
- Бейзлайн решения - [MainRecommender](https://github.com/geangohn/recsys-tutorial/blob/master/src/recommenders.py)
- Сдаем ссылку на github с решением. В решении должны быть отчетливо видна метрика на новом тестовом сете из файла retail_test1.csv, то есть вам нужно для всех юзеров из этого файла выдать выши рекомендации, и посчитать на actual покупках precision@5. 

**!! Мы не рассматриваем холодный старт для пользователя, все наши пользователя одинаковы во всех сетах, поэтому нужно позаботиться об их исключении из теста.**


**Hints:** 

Сначала просто попробуйте разные параметры MainRecommender:  
- N в топ-N товарах при формировании user-item матирцы (сейчас топ-5000)  
- Различные веса в user-item матрице (0/1, кол-во покупок, log(кол-во покупок + 1), сумма покупки, ...)  
- Разные взвешивания матрицы (TF-IDF, BM25 - у него есть параметры)  
- Разные смешивания рекомендаций (обратите внимание на бейзлайн - прошлые покупки юзера)  

Сделайте MVP - минимально рабочий продукт - (пусть даже top-popular), а потом его улучшайте

Если вы делаете двухуровневую модель - следите за валидацией 

# Import libs

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from metrics import precision_at_k, recall_at_k
from utils import prefilter_items
from recommenders import MainRecommender

pd.pandas.set_option('display.max_columns', None)
import warnings
warnings.simplefilter('ignore')

## Read data

In [2]:
data = pd.read_csv('retail_train.csv')

In [195]:
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')

In [3]:
data.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


# Set global const

In [4]:
ITEM_COL = 'item_id'
USER_COL = 'user_id'
ACTUAL_COL = 'actual'

# N = Neighbors
N_PREDICT = 50 

# Process features dataset

In [196]:
# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': ITEM_COL}, inplace=True)
user_features.rename(columns={'household_key': USER_COL }, inplace=True)

# Split dataset for train, eval, test

In [6]:
VAL_MATCHER_WEEKS = 4
VAL_RANKER_WEEKS = 0

In [7]:
# берем данные для тренировки matching модели
data_train_matcher = data[data['week_no'] < data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)]

# берем данные для валидации matching модели
data_val_matcher = data[(data['week_no'] >= data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)) &
                      (data['week_no'] < data['week_no'].max() - (VAL_RANKER_WEEKS))]

# берем данные для тренировки ranking модели
data_train_ranker = data_val_matcher.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться

# берем данные для теста ranking, matching модели
# data_val_ranker = data[data['week_no'] >= data['week_no'].max() - VAL_RANKER_WEEKS]

In [8]:
# сделаем объединенный сет данных для первого уровня (матчинга)
df_join_train_matcher = pd.concat([data_train_matcher, data_val_matcher])

In [9]:
def print_stats_data(df_data, name_df):
    print(name_df)
    print(f"Shape: {df_data.shape} Users: {df_data[USER_COL].nunique()} Items: {df_data[ITEM_COL].nunique()}")

In [10]:
print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
# print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (2251517, 12) Users: 2499 Items: 86343
val_matcher
Shape: (116220, 12) Users: 2014 Items: 24258
train_ranker
Shape: (116220, 12) Users: 2014 Items: 24258


In [11]:
# выше видим разброс по пользователям и товарам и дальше мы перейдем к warm-start (только известные пользователи)

In [12]:
data_val_matcher.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
2248412,2445,41125177682,629,847774,1,9.99,315,-2.0,0,91,-1.0,0.0
2248413,2445,41125177682,629,863337,2,0.5,315,0.0,0,91,0.0,0.0


# Prefilter items

In [13]:
n_items_before = data_train_matcher['item_id'].nunique()

data_train_matcher = prefilter_items(data_train_matcher, item_features=item_features, take_n_popular=10000)
n_items_after = data_train_matcher['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 86343 to 10000


# Make cold-start to warm-start

In [14]:
# ищем общих пользователей
# common_users = list(set(data_train_matcher.user_id.values)&(set(data_val_matcher.user_id.values))&set(data_val_ranker.user_id.values))
common_users = list(set(data_train_matcher.user_id.values)&(set(data_val_matcher.user_id.values)))

# оставляем общих пользователей
data_train_matcher = data_train_matcher[data_train_matcher.user_id.isin(common_users)]
data_val_matcher = data_val_matcher[data_val_matcher.user_id.isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker.user_id.isin(common_users)]
# data_val_ranker = data_val_ranker[data_val_ranker.user_id.isin(common_users)]

print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
# print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (852043, 13) Users: 2013 Items: 9999
val_matcher
Shape: (116209, 12) Users: 2013 Items: 24255
train_ranker
Shape: (116209, 12) Users: 2013 Items: 24255


# Init/train recommender

In [15]:
recommender = MainRecommender(data_train_matcher)



  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

# Eval recall of matching

In [16]:
result_eval_matcher = data_val_matcher.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_matcher.columns=[USER_COL, ACTUAL_COL]
result_eval_matcher.head(2)

Unnamed: 0,user_id,actual
0,1,"[829563, 830156, 832990, 840361, 856942, 87157..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [17]:
%%time

result_eval_matcher['own_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))
result_eval_matcher['sim_item_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_similar_items_recommendation(x, N=N_PREDICT))
result_eval_matcher['als_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_als_recommendations(x, N=N_PREDICT))

result_eval_matcher['cosine_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_cosine_recommendations(x, N=N_PREDICT))
result_eval_matcher['tfidf_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_tfidf_recommendations(x, N=N_PREDICT))
result_eval_matcher['bm25_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_bm25_recommendations(x, N=N_PREDICT))

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

Wall time: 1h 39min 33s


### Пример оборачивания

In [18]:
def calc_recall(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: recall_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

In [19]:
def calc_precision(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: precision_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

### Recall@50 of matching

In [20]:
TOPK_RECALL = N_PREDICT

In [21]:
sorted(calc_recall(result_eval_matcher, TOPK_RECALL), key=lambda x: x[1],reverse=True)

[('tfidf_rec', 0.10642135665622501),
 ('cosine_rec', 0.1050937414981526),
 ('bm25_rec', 0.10318371137457337),
 ('own_rec', 0.06172439961747682),
 ('als_rec', 0.05794761611080355),
 ('sim_item_rec', 0.030393259918637224)]

### Precision@5 of matching

In [22]:
TOPK_PRECISION = 5

In [23]:
sorted(calc_precision(result_eval_matcher, TOPK_PRECISION), key=lambda x: x[1],reverse=True)

[('cosine_rec', 0.28574267262791747),
 ('tfidf_rec', 0.2844510680576243),
 ('bm25_rec', 0.2724292101341266),
 ('als_rec', 0.13800298062593008),
 ('own_rec', 0.12409339294585116),
 ('sim_item_rec', 0.04898161947342311)]

In [24]:
result_eval_matcher['candidates'] = result_eval_matcher['cosine_rec']

# Ranking part

## Подготовка данных для трейна

In [98]:
# взяли пользователей из трейна для ранжирования
df_match_candidates = pd.DataFrame(data_train_ranker[USER_COL].unique())
df_match_candidates.columns = [USER_COL]

In [99]:
# собираем кандитатов с первого этапа (matcher)
df_match_candidates = df_match_candidates.merge(result_eval_matcher[['user_id','candidates']], on = 'user_id', how = 'left')

In [100]:
df_match_candidates.head(2)

Unnamed: 0,user_id,candidates
0,2445,"[899229, 844179, 1029743, 999613, 5569230, 993..."
1,148,"[5568489, 1003487, 921952, 883003, 5569230, 80..."


In [101]:
# разворачиваем товары
df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items.name = 'item_id'

In [102]:
df_match_candidates = df_match_candidates.drop('candidates', axis=1).join(df_items)

In [103]:
df_match_candidates.head(4)

Unnamed: 0,user_id,item_id
0,2445,899229
0,2445,844179
0,2445,1029743
0,2445,999613


### Check warm start

In [104]:
print_stats_data(df_match_candidates, 'match_candidates')

match_candidates
Shape: (100650, 2) Users: 2013 Items: 9510


In [105]:
df_match_candidates.to_csv(os.path.join('df_match_candidates.csv'), index=False, encoding='utf-8', sep=',')

### Создаем трейн сет для ранжирования с учетом кандидатов с этапа 1 

In [197]:
df_ranker_train = data_train_ranker[[USER_COL, ITEM_COL]].copy()
df_ranker_train['target'] = 1  # тут только покупки 

df_ranker_train = df_match_candidates.merge(df_ranker_train, on=[USER_COL, ITEM_COL], how='left')

# чистим дубликаты
df_ranker_train = df_ranker_train.drop_duplicates(subset=[USER_COL, ITEM_COL])

df_ranker_train['target'].fillna(0, inplace= True)

In [198]:
df_ranker_train.target.value_counts()

0.0    89793
1.0    10428
Name: target, dtype: int64

In [199]:
df_ranker_train.head(9)

Unnamed: 0,user_id,item_id,target
0,2445,899229,1.0
1,2445,844179,1.0
2,2445,1029743,0.0
3,2445,999613,0.0
4,2445,5569230,0.0
5,2445,993826,0.0
6,2445,893018,0.0
7,2445,1003158,1.0
8,2445,950935,0.0


## Подготавливаем фичи для обучения модели

### Описательные фичи

In [200]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [178]:
item_features['curr_size_of_product'].value_counts()

              30607
16 OZ          3924
12 OZ          3473
8 OZ           2363
6 OZ           1697
              ...  
255000 8IN        1
2.5 LTR           1
2/.7 OZ           1
0.87 OZ           1
522888 CTN        1
Name: curr_size_of_product, Length: 4345, dtype: int64

In [201]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [164]:
user_features['age_desc'].value_counts()

45-54    288
35-44    194
25-34    142
65+       72
55-64     59
19-24     46
Name: age_desc, dtype: int64

In [168]:
user_features['household_size_desc'].value_counts()

2     318
1     255
3     109
5+     66
4      53
Name: household_size_desc, dtype: int64

In [163]:
user_features['kid_category_desc'].value_counts()

None/Unknown    558
1               114
3+               69
2                60
Name: kid_category_desc, dtype: int64

In [202]:
def get_item_user_features(item_features, user_features, df_ranker_train):
    
    item_features =item_features.drop('curr_size_of_product', axis=1)
    
    user_features['age_desc'] = user_features['age_desc'].replace(
        {'65+': 5, '45-54': 3, '25-34': 1, '35-44': 2, '19-24': 0, '55-64': 4})
    user_features['age_desc'] = user_features['age_desc'].astype('int')
    
    user_features['income_desc'] = user_features['income_desc'].replace(
        {'Under 15K': 1,
         '15-24K': 2,
         '25-34K': 3,
         '35-49K': 4, 
         '50-74K': 5,       
         '75-99K': 6,         
         '100-124K': 7,         
         '125-149K': 8,
         '150-174K': 9,
         '175-199K': 10,
         '200-249K': 11,
         '250K+': 12})
    user_features['income_desc'] = user_features['income_desc'].astype('int')
    
    user_features.loc[user_features['kid_category_desc']=='3+', 'kid_category_desc'] = 3
    user_features.loc[user_features['kid_category_desc']=='None/Unknown', 'kid_category_desc'] = 0
    user_features['kid_category_desc'] = user_features['kid_category_desc'].astype('int')
    
    user_features.loc[user_features['household_size_desc']=='5+', 'household_size_desc'] = 5
    user_features['household_size_desc'] = user_features['household_size_desc'].astype('int')
    
    df_ranker_train = df_ranker_train.merge(item_features, on='item_id', how='left')
    df_ranker_train = df_ranker_train.merge(user_features, on='user_id', how='left')
            
    return df_ranker_train

In [203]:
df_ranker_train = get_item_user_features(item_features, user_features, df_ranker_train)
df_ranker_train.head()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2445,899229,1.0,2193,GROCERY,National,ICE CREAM/MILK/SHERBTS,PREMIUM,3.0,U,4.0,Unknown,Unknown,1.0,0.0
1,2445,844179,1.0,2852,MEAT,National,BEEF,PRIMAL,3.0,U,4.0,Unknown,Unknown,1.0,0.0
2,2445,1029743,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,3.0,U,4.0,Unknown,Unknown,1.0,0.0
3,2445,999613,0.0,418,GROCERY,National,BAKED SWEET GOODS,SNACK CAKE - MULTI PACK,3.0,U,4.0,Unknown,Unknown,1.0,0.0
4,2445,5569230,0.0,1208,GROCERY,National,SOFT DRINKS,SOFT DRINKS 12/18&15PK CAN CAR,3.0,U,4.0,Unknown,Unknown,1.0,0.0


**Фичи user_id:**
    - Средний чек
    - Средняя сумма покупки 1 товара в каждой категории
    - Кол-во покупок в каждой категории
    - Частотность покупок раз/месяц
    - Долю покупок в выходные
    - Долю покупок утром/днем/вечером

**Фичи item_id**:
    - Кол-во покупок в неделю
    - Среднее ол-во покупок 1 товара в категории в неделю
    - (Кол-во покупок в неделю) / (Среднее ол-во покупок 1 товара в категории в неделю)
    - Цена (Можно посчитать из retil_train.csv)
    - Цена / Средняя цена товара в категории
    
**Фичи пары user_id - item_id**
    - (Средняя сумма покупки 1 товара в каждой категории (берем категорию item_id)) - (Цена item_id)
    - (Кол-во покупок юзером конкретной категории в неделю) - (Среднее кол-во покупок всеми юзерами конкретной категории в неделю)
    - (Кол-во покупок юзером конкретной категории в неделю) / (Среднее кол-во покупок всеми юзерами конкретной категории в неделю)

In [204]:
# 1. 'mean_cheque' = Средний чек

users_sales = data.groupby(USER_COL)['sales_value'].sum().reset_index()
num_baskets = data.groupby(USER_COL)['basket_id'].nunique().reset_index()
users_sales = users_sales.merge(num_baskets, on=USER_COL, how='left')
users_sales['mean_cheque'] = users_sales['sales_value'] / users_sales['basket_id']
users_sales.drop(['sales_value', 'basket_id'], axis=1, inplace=True)

df_ranker_train = df_ranker_train.merge(users_sales, on=USER_COL, how='left')

In [205]:
# 2. 'mean_department_price' = Средний чек по категории

departments = list(set(df_ranker_train['department'].tolist()))
df_ranker_train['mean_department_price'] = 0

for n in departments:
    dep_df_ranker_train = df_ranker_train[df_ranker_train['department'] == n]
    ids = dep_df_ranker_train[ITEM_COL].tolist()
    dep_data = data[data[ITEM_COL].isin(ids)]
    
    dep_sales = dep_data.groupby(USER_COL).agg({
    'sales_value' : 'sum', 
    'quantity': 'sum'}).reset_index()
    
    dep_sales['dep_mean_price'] = dep_sales['sales_value'] / dep_sales['quantity']
    dep_sales.drop(['sales_value', 'quantity'], axis=1, inplace=True)
    
    for i in range(dep_sales.shape[0]):
        df_ranker_train.loc[(((df_ranker_train[USER_COL] == dep_sales[USER_COL][i]) & (df_ranker_train['department'] == n)) == True), 'mean_department_price'] = dep_sales['dep_mean_price'][i]

In [206]:
# 3. 'item_id_week_sales' = Кол-во покупок в неделю

week_sales = data.groupby(ITEM_COL).agg({ 
    'quantity': 'sum',
    'week_no' : 'nunique'
}).reset_index()
week_sales['item_id_week_sales'] = week_sales['quantity'] / week_sales['week_no']
week_sales.drop(['quantity', 'week_no'], axis=1, inplace=True)

df_ranker_train = df_ranker_train.merge(week_sales, on=ITEM_COL, how='left')

In [207]:
# 4. 'mean_price' = Цена

mean_price = data.groupby(ITEM_COL).agg({
    'sales_value' : 'sum', 
    'quantity': 'sum'
}).reset_index()

mean_price['mean_price'] = mean_price['sales_value'] / mean_price['quantity']

mean_price.drop(['sales_value', 'quantity'], axis=1, inplace=True)

df_ranker_train = df_ranker_train.merge(mean_price, on=ITEM_COL, how='left')

In [208]:
# 5. 'delta_dep_user_price' = Средняя сумма покупки юзером 1 товара в каждой категории - Средняя цена в этой категории

df_ranker_train['delta_dep_user_price'] = 0

for n in departments:
    dep_df_ranker_train = df_ranker_train[df_ranker_train['department'] == n]
    ids = dep_df_ranker_train[ITEM_COL].tolist()
    dep_data = data[data[ITEM_COL].isin(ids)]
    
    dep_mean_price = dep_data['sales_value'].sum() / dep_data['quantity'].sum()
    
    dep_user_sales = data.groupby(USER_COL).agg({
    'sales_value' : 'sum', 
    'quantity': 'sum'}).reset_index()
    
    dep_user_sales['mean_dep_user_price'] = dep_user_sales['sales_value'] / dep_user_sales['quantity']
    dep_user_sales.drop(['sales_value', 'quantity'], axis=1, inplace=True)
    
    for i in range(dep_user_sales.shape[0]):
        df_ranker_train.loc[(((df_ranker_train[USER_COL] == dep_user_sales[USER_COL][i]) &
                            (df_ranker_train['department'] == n)) == True), 'delta_dep_user_price'] = dep_user_sales['mean_dep_user_price'][i] - dep_mean_price

In [209]:
# 6. 'rel_week_sales' = (Кол-во покупок юзером конкретной категории в неделю) / (Среднее кол-во покупок всеми юзерами конкретной категории в неделю)

df_ranker_train['rel_week_sales'] = 0

for n in departments:
    dep_df_ranker_train = df_ranker_train[df_ranker_train['department'] == n]
    ids = dep_df_ranker_train[ITEM_COL].tolist()
    dep_data = data[data[ITEM_COL].isin(ids)]
    
    dep_mean_week_sales = dep_data['quantity'].sum() / dep_data['week_no'].nunique()
    
    dep_user_week_sales = data.groupby(USER_COL).agg({ 
    'quantity': 'sum',
    'week_no' : 'nunique'
    }).reset_index()
    
    dep_user_week_sales['mean_user_week_sales'] = dep_user_week_sales['quantity'] / dep_user_week_sales['week_no']
    dep_user_week_sales.drop(['quantity', 'week_no'], axis=1, inplace=True)
    
    for i in range(dep_user_week_sales.shape[0]):
        df_ranker_train.loc[(((df_ranker_train[USER_COL] == dep_user_week_sales[USER_COL][i]) &
                            (df_ranker_train['department'] == n)) == True), 'rel_week_sales'] = dep_user_week_sales['mean_user_week_sales'][i] / dep_mean_week_sales

### Поведенческие фичи

##### Чтобы считать поведенческие фичи, нужно учесть все данные что были до data_val_ranker

In [211]:
df_join_train_matcher.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


In [221]:
def get_behevior_features(df_ranker_train, df_join_train_matcher):

    df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('sales_value').sum().rename('total_item_sales_value'), how='left',on=ITEM_COL)

    df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('total_quantity_value'), how='left',on=ITEM_COL)

    df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq'), how='left',on=ITEM_COL)

    df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq'), how='left',on=USER_COL)

    df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg('sales_value').sum().rename('total_user_sales_value'), how='left',on=USER_COL)

    df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_week')/df_join_train_matcher.week_no.nunique(), how='left',on=ITEM_COL)

    df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg('quantity').sum().rename('user_quantity_per_week')/df_join_train_matcher.week_no.nunique(), how='left',on=USER_COL)

    df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=ITEM_COL)

    df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg('quantity').sum().rename('user_quantity_per_baskter')/df_join_train_matcher.basket_id.nunique(), how='left',on=USER_COL)

    df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=ITEM_COL)

    df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=USER_COL)
    
    return df_ranker_train

In [222]:
df_ranker_train = get_behevior_features(df_ranker_train, df_join_train_matcher)
df_ranker_train.head()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,mean_cheque,mean_department_price,item_id_week_sales,mean_price,delta_dep_user_price,rel_week_sales,total_item_sales_value,total_quantity_value,item_freq,user_freq,total_user_sales_value,item_quantity_per_week,user_quantity_per_week,item_quantity_per_basket,user_quantity_per_baskter,item_freq_per_basket,user_freq_per_basket
0,2445,899229,1.0,2193,GROCERY,National,ICE CREAM/MILK/SHERBTS,PREMIUM,3.0,U,4.0,Unknown,Unknown,1.0,0.0,29.611627,2.453235,1.62963,3.0075,-2.963255,0.349204,132.33,44,39,2382,6136.13,0.468085,2135.382979,0.000173,0.789877,0.000153,0.009373
1,2445,844179,1.0,2852,MEAT,National,BEEF,PRIMAL,3.0,U,4.0,Unknown,Unknown,1.0,0.0,29.611627,4.389487,51.336842,3.710115,-4.51712,2.758488,17945.48,4838,3511,2382,6136.13,51.468085,2135.382979,0.019038,0.789877,0.013816,0.009373
2,2445,1029743,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,3.0,U,4.0,Unknown,Unknown,1.0,0.0,29.611627,2.453235,166.736842,2.397848,-2.963255,0.349204,37415.98,15635,13277,2382,6136.13,166.329787,2135.382979,0.061525,0.789877,0.052246,0.009373
3,2445,999613,0.0,418,GROCERY,National,BAKED SWEET GOODS,SNACK CAKE - MULTI PACK,3.0,U,4.0,Unknown,Unknown,1.0,0.0,29.611627,2.453235,1.38,2.632174,-2.963255,0.349204,181.62,69,69,2382,6136.13,0.734043,2135.382979,0.000272,0.789877,0.000272,0.009373
4,2445,5569230,0.0,1208,GROCERY,National,SOFT DRINKS,SOFT DRINKS 12/18&15PK CAN CAR,3.0,U,4.0,Unknown,Unknown,1.0,0.0,29.611627,2.453235,74.778947,3.079282,-2.963255,0.349204,21327.72,6885,4337,2382,6136.13,73.244681,2135.382979,0.027093,0.789877,0.017067,0.009373


In [223]:
df_ranker_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100221 entries, 0 to 100220
Data columns (total 32 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   user_id                    100221 non-null  int64  
 1   item_id                    100221 non-null  int64  
 2   target                     100221 non-null  float64
 3   manufacturer               100221 non-null  int64  
 4   department                 100221 non-null  object 
 5   brand                      100221 non-null  object 
 6   commodity_desc             100221 non-null  object 
 7   sub_commodity_desc         100221 non-null  object 
 8   age_desc                   38285 non-null   float64
 9   marital_status_code        38285 non-null   object 
 10  income_desc                38285 non-null   float64
 11  homeowner_desc             38285 non-null   object 
 12  hh_comp_desc               38285 non-null   object 
 13  household_size_desc        38

## Обучение модели ранжирования

In [224]:
X_train = df_ranker_train.drop('target', axis=1)
y_train = df_ranker_train[['target']]

In [225]:
# cat_feats = X_train.columns[2:].tolist()
cat_feats = ['manufacturer',
             'department', 
             'brand',
             'commodity_desc',
             'sub_commodity_desc',
             'age_desc',
             'marital_status_code',
             'income_desc',
             'homeowner_desc',
             'hh_comp_desc',
             'household_size_desc',
            'kid_category_desc']

X_train[cat_feats] = X_train[cat_feats].astype('category')

cat_feats

['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc']

In [226]:
lgb = LGBMClassifier(objective='binary',
                     max_depth=10,
                     n_estimators=89,
                     num_leaves = 100,
                     learning_rate=0.1,
                     scale_pos_weight = 2.98,
                     reg_lambda = 0.1,
                     categorical_column=cat_feats)

lgb.fit(X_train, y_train)

train_preds = lgb.predict_proba(X_train)

In [227]:
df_ranker_predict = df_ranker_train.copy()

In [228]:
df_ranker_predict['proba_item_purchase'] = train_preds[:,1]

In [229]:
df_ranker_predict.head(5)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,mean_cheque,mean_department_price,item_id_week_sales,mean_price,delta_dep_user_price,rel_week_sales,total_item_sales_value,total_quantity_value,item_freq,user_freq,total_user_sales_value,item_quantity_per_week,user_quantity_per_week,item_quantity_per_basket,user_quantity_per_baskter,item_freq_per_basket,user_freq_per_basket,proba_item_purchase
0,2445,899229,1.0,2193,GROCERY,National,ICE CREAM/MILK/SHERBTS,PREMIUM,3.0,U,4.0,Unknown,Unknown,1.0,0.0,29.611627,2.453235,1.62963,3.0075,-2.963255,0.349204,132.33,44,39,2382,6136.13,0.468085,2135.382979,0.000173,0.789877,0.000153,0.009373,0.428927
1,2445,844179,1.0,2852,MEAT,National,BEEF,PRIMAL,3.0,U,4.0,Unknown,Unknown,1.0,0.0,29.611627,4.389487,51.336842,3.710115,-4.51712,2.758488,17945.48,4838,3511,2382,6136.13,51.468085,2135.382979,0.019038,0.789877,0.013816,0.009373,0.397091
2,2445,1029743,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,3.0,U,4.0,Unknown,Unknown,1.0,0.0,29.611627,2.453235,166.736842,2.397848,-2.963255,0.349204,37415.98,15635,13277,2382,6136.13,166.329787,2135.382979,0.061525,0.789877,0.052246,0.009373,0.664235
3,2445,999613,0.0,418,GROCERY,National,BAKED SWEET GOODS,SNACK CAKE - MULTI PACK,3.0,U,4.0,Unknown,Unknown,1.0,0.0,29.611627,2.453235,1.38,2.632174,-2.963255,0.349204,181.62,69,69,2382,6136.13,0.734043,2135.382979,0.000272,0.789877,0.000272,0.009373,0.183929
4,2445,5569230,0.0,1208,GROCERY,National,SOFT DRINKS,SOFT DRINKS 12/18&15PK CAN CAR,3.0,U,4.0,Unknown,Unknown,1.0,0.0,29.611627,2.453235,74.778947,3.079282,-2.963255,0.349204,21327.72,6885,4337,2382,6136.13,73.244681,2135.382979,0.027093,0.789877,0.017067,0.009373,0.419451


# Оценка на тесте

In [230]:
df_test = pd.read_csv('retail_test1.csv')
df_transactions = pd.read_csv('retail_train.csv')

In [231]:
df_test.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,0.0,0.0
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,0.0,0.0
2,2070,41652857291,664,995242,5,9.1,311,-0.6,46,96,0.0,0.0
3,1602,41665647035,664,827939,1,7.99,334,0.0,1741,96,0.0,0.0
4,1602,41665647035,664,927712,1,0.59,334,-0.4,1741,96,0.0,0.0


In [232]:
#warm_start
common_users = df_match_candidates.user_id.values
df_test = df_test[df_test.user_id.isin(common_users)]

print_stats_data(df_match_candidates, 'match_candidates')
print_stats_data(df_test, 'test')

match_candidates
Shape: (100650, 2) Users: 2013 Items: 9510
test
Shape: (84761, 12) Users: 1704 Items: 20081


In [233]:
result_test = df_test.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_test.columns=[USER_COL, ACTUAL_COL]
result_test.head(2)

Unnamed: 0,user_id,actual
0,1,"[880007, 883616, 931136, 938004, 940947, 94726..."
1,3,"[827683, 908531, 989069, 1071377, 1080155, 109..."


## Eval matching on test dataset

In [234]:
%%time
result_test['cosine_rec'] = result_test[USER_COL].apply(lambda x: recommender.get_cosine_recommendations(x, N=N_PREDICT))

Wall time: 16min 38s


In [235]:
# померяем precision только модели матчинга, чтобы понимать влияение ранжирования на метрики

sorted(calc_precision(result_test, TOPK_PRECISION), key=lambda x: x[1], reverse=True)

[('cosine_rec', 0.24999999999999797)]

## Eval re-ranked matched result on test dataset    

In [236]:
def rerank(user_id):
    return df_ranker_predict[df_ranker_predict[USER_COL]==user_id].sort_values('proba_item_purchase', ascending=False).head(5).item_id.tolist()

In [237]:
result_test['reranked_cosine_rec'] = result_test[USER_COL].apply(lambda user_id: rerank(user_id))

In [238]:
print('Precision@5 on test dataset:')
print(*sorted(calc_precision(result_test, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

Precision@5 on test dataset:
('cosine_rec', 0.24999999999999797)
('reranked_cosine_rec', 0.2172535211267582)


In [239]:
# precision@5 >= 0.25

Сохраняем результат в файл recommendations.csv

In [241]:
result_test[['user_id', 'reranked_cosine_rec']].to_csv(os.path.join('recommendations.csv'), index=False, encoding='utf-8', sep=',')