In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from metrics import precision_at_k, recall_at_k
from utils import prefilter_items
from recommenders import MainRecommender

In [2]:
data = pd.read_csv('./data_init/retail_train.csv')
item_features = pd.read_csv('./data_init/product.csv')
user_features = pd.read_csv('./data_init/hh_demographic.csv')

In [3]:
ITEM_COL = 'item_id'
USER_COL = 'user_id'

In [4]:
# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': ITEM_COL}, inplace=True)
user_features.rename(columns={'household_key': USER_COL }, inplace=True)

In [5]:
VAL_MATCHER_WEEKS = 6
VAL_RANKER_WEEKS = 3

In [6]:
# берем данные для тренировки matching модели
data_train_matcher = data[data['week_no'] < data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)]

# берем данные для валидации matching модели
data_val_matcher = data[(data['week_no'] >= data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)) &
                      (data['week_no'] < data['week_no'].max() - (VAL_RANKER_WEEKS))]


# берем данные для тренировки ranking модели
data_train_ranker = data_val_matcher.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться

# берем данные для теста ranking, matching модели
data_val_ranker = data[data['week_no'] >= data['week_no'].max() - VAL_RANKER_WEEKS]

In [7]:
def print_stats_data(df_data, name_df):
    print(name_df)
    print(f"Shape: {df_data.shape} Users: {df_data[USER_COL].nunique()} Items: {df_data[ITEM_COL].nunique()}")

In [8]:
print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (2108779, 12) Users: 2498 Items: 83685
val_matcher
Shape: (169711, 12) Users: 2154 Items: 27649
train_ranker
Shape: (169711, 12) Users: 2154 Items: 27649
val_ranker
Shape: (118314, 12) Users: 2042 Items: 24329


In [9]:
n_items_before = data_train_matcher['item_id'].nunique()

data_train_matcher = prefilter_items(data_train_matcher, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_matcher['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['price'] = data['sales_value'] / (np.maximum(data['quantity'], 1))


Decreased # items from 83685 to 5001


In [10]:
# ищем общих пользователей
common_users = data_train_matcher.user_id.values

data_val_matcher = data_val_matcher[data_val_matcher.user_id.isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker.user_id.isin(common_users)]
data_val_ranker = data_val_ranker[data_val_ranker.user_id.isin(common_users)]

print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (861404, 13) Users: 2495 Items: 5001
val_matcher
Shape: (169615, 12) Users: 2151 Items: 27644
train_ranker
Shape: (169615, 12) Users: 2151 Items: 27644
val_ranker
Shape: (118282, 12) Users: 2040 Items: 24325


In [11]:
recommender = MainRecommender(data_train_matcher)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))




In [12]:
ACTUAL_COL = 'actual'

In [13]:
result_eval_matcher = data_val_matcher.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_matcher.columns=[USER_COL, ACTUAL_COL]
result_eval_matcher.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


In [14]:
# N = Neighbors
# N_PREDICT = 500 
N_PREDICT = 50


In [15]:
%%time
# для понятности расписано все в строчку, без функций, ваша задача уметь оборачивать все это в функции
result_eval_matcher['own_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))
result_eval_matcher['sim_item_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_similar_items_recommendation(x, N=N_PREDICT))
result_eval_matcher['als_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_als_recommendations(x, N=N_PREDICT))

Wall time: 44.8 s


In [16]:
# result_eval_matcher['sim_user_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_similar_users_recommendation(x, N=50))

In [17]:
def calc_recall(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: recall_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

In [18]:
TOPK_RECALL = 50

In [19]:
sorted(calc_recall(result_eval_matcher, TOPK_RECALL), key=lambda x: x[1],reverse=True)

[('own_rec', 0.06525657038145175),
 ('als_rec', 0.04674148438621005),
 ('sim_item_rec', 0.033177242329740085)]

### k = 5

('own_rec', 0.018201887674891032),

 ('als_rec', 0.012482741845430493),
 
 ('sim_item_rec', 0.005468078972246774)

### k = 20

('own_rec', 0.03928427679372909),

 ('als_rec', 0.03025469094032399),
 
 ('sim_item_rec', 0.017403457973386664)

### k = 50

('own_rec', 0.06525657038145175),

 ('als_rec', 0.04924763015953112),
 
 ('sim_item_rec', 0.03357448231179952)

### k = 100

('own_rec', 0.09604492955885034),

 ('als_rec', 0.06865378617460409),
 
 ('sim_item_rec', 0.05349611082233438)

### k = 200

('own_rec', 0.13537278412833242),

 ('als_rec', 0.09757952773257741),
 
 ('sim_item_rec', 0.08557721974610613)

### k = 500

('own_rec', 0.18205324555508678),

 ('als_rec', 0.14630521177961273),
 
 ('sim_item_rec', 0.13569470651199136)

Логично, что если мы предсказали 500 товаров для пользователя, а потом будем смотреть recall, то в список рекомендованных попадёт больше товаров из списка actual. Наверно имеет смысл брать k для recall исходя из того количества, которое планируется предлагать пользователю, то есть исходя из задач бизнеса.

In [20]:
def calc_precision(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: precision_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

In [21]:
TOPK_PRECISION = 5

In [22]:
sorted(calc_precision(result_eval_matcher, TOPK_PRECISION), key=lambda x: x[1],reverse=True)

[('own_rec', 0.17712691771268974),
 ('als_rec', 0.11585309158530813),
 ('sim_item_rec', 0.05337052533705295)]

In [23]:
# взяли пользователей из трейна для ранжирования
df_match_candidates = pd.DataFrame(data_train_ranker[USER_COL].unique())
df_match_candidates.columns = [USER_COL]

In [24]:
df_match_candidates['candidates'] = df_match_candidates[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))

In [25]:
df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items.name = 'item_id'

In [26]:
df_match_candidates = df_match_candidates.drop('candidates', axis=1).join(df_items)

In [27]:
print_stats_data(df_match_candidates, 'match_candidates')

match_candidates
Shape: (107550, 2) Users: 2151 Items: 4574


In [28]:
df_ranker_train = data_train_ranker[[USER_COL, ITEM_COL]].copy()
df_ranker_train['target'] = 1  # тут только покупки 

In [29]:
df_ranker_train = df_match_candidates.merge(df_ranker_train, on=[USER_COL, ITEM_COL], how='left')

# чистим дубликаты
df_ranker_train = df_ranker_train.drop_duplicates(subset=[USER_COL, ITEM_COL])

df_ranker_train['target'].fillna(0, inplace= True)

## Фичи для обучения модели

In [30]:
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [31]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [32]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [33]:
df_ranker_train = df_ranker_train.merge(item_features, on='item_id', how='left')
df_ranker_train = df_ranker_train.merge(user_features, on='user_id', how='left')

In [34]:
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


### Мои фичи

**user_mean_value (средний чек)**

In [35]:
user_mean_value = data.groupby('user_id')['sales_value'].mean().\
                                        reset_index().rename(columns={'sales_value':'user_mean_value'})
user_mean_value.set_index('user_id', inplace=True)
user_mean_value.head(2)

Unnamed: 0_level_0,user_mean_value
user_id,Unnamed: 1_level_1
1,2.492077
2,2.783893


In [36]:
df_ranker_train = df_ranker_train.merge(user_mean_value, on='user_id', how='left')

**cat_value (Кол-во покупок в каждой категории)**

In [37]:
data_copy = data.copy()
item_departments = item_features[['item_id', 'department']]
item_departments.set_index('item_id', inplace=True)
item_departments
data_copy = data_copy.merge(item_features, how='inner', on='item_id')

In [38]:
data_copy.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0,69,PRODUCE,Private,POTATOES,POTATOES RUSSET (BULK&BAG),5 LB
1,1929,27021022215,4,1004906,1,1.39,441,-0.6,1755,1,0.0,0.0,69,PRODUCE,Private,POTATOES,POTATOES RUSSET (BULK&BAG),5 LB


In [39]:
cat_value = data_copy[['user_id', 'quantity', 'department']]
cat_value.head(2)

Unnamed: 0,user_id,quantity,department
0,2375,1,PRODUCE
1,1929,1,PRODUCE


In [40]:
cat_value = (cat_value.assign(idx=cat_value.groupby('user_id').cumcount())
                 .pivot_table(index='user_id', columns='department', 
                              values='quantity', aggfunc='sum'))

In [41]:
cat_value = cat_value.fillna(0)
cat_value.drop(columns = ' ', inplace=True)
cat_value.head(3)

department,AUTOMOTIVE,CHARITABLE CONT,CHEF SHOPPE,CNTRL/STORE SUP,COSMETICS,COUP/STR & MFG,DAIRY DELI,DELI,DELI/SNACK BAR,DRUG GM,...,RESTAURANT,RX,SALAD BAR,SEAFOOD,SEAFOOD-PCKGD,SPIRITS,TOYS,TRAVEL & LEISUR,VIDEO,VIDEO RENTAL
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,58.0,0.0,176.0,...,3.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,5.0,0.0,0.0,12.0,0.0,106.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,6.0,0.0,96.0,...,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0


In [42]:
df_ranker_train = df_ranker_train.merge(cat_value, on='user_id', how='left')

**cat_popularity (популярность категории)**

In [43]:
cat_popularity = data_copy.groupby('department')['quantity'].sum().to_frame()
cat_popularity['cat_popularity'] = cat_popularity['quantity'] / cat_popularity['quantity'].sum()
cat_popularity.drop(columns = 'quantity', inplace=True)
cat_popularity.head(5)

Unnamed: 0_level_0,cat_popularity
department,Unnamed: 1_level_1
,0.0
AUTOMOTIVE,2.535514e-07
CHARITABLE CONT,1.246974e-08
CHEF SHOPPE,3.134061e-06
CNTRL/STORE SUP,9.144476e-08


In [44]:
df_ranker_train.merge(cat_popularity, on=["department"])

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,RX,SALAD BAR,SEAFOOD,SEAFOOD-PCKGD,SPIRITS,TOYS,TRAVEL & LEISUR,VIDEO,VIDEO RENTAL,cat_popularity
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,...,0.0,1.0,0.0,3.0,0.0,0.0,1.0,0.0,0.0,0.000258
1,2070,1016298,0.0,4074,DELI,National,CHICKEN/POULTRY,CHIX:VALUE ADDED (COLD),,45-54,...,0.0,1.0,0.0,3.0,0.0,0.0,1.0,0.0,0.0,0.000258
2,2070,8015407,0.0,1216,DELI,National,SANDWICHES,SANDWICHES - (COLD),,45-54,...,0.0,1.0,0.0,3.0,0.0,0.0,1.0,0.0,0.0,0.000258
3,2070,1054185,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,...,0.0,1.0,0.0,3.0,0.0,0.0,1.0,0.0,0.0,0.000258
4,2021,1072494,0.0,69,DELI,Private,CHEESES,CHEESE:CHEESEBALLS/SPREADS,14 OZ,,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000258
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106967,1161,6602327,0.0,673,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,3 LB,,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.000014
106968,1189,6602327,0.0,673,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,3 LB,,...,0.0,0.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0,0.000014
106969,1340,6602327,0.0,673,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,3 LB,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000014
106970,43,6602327,0.0,673,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,3 LB,35-44,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000014


**item_per_week (количество покупок в неделю)**

In [45]:
item_per_week = data[['item_id', 'quantity', 'week_no']]
item_per_week.head()

Unnamed: 0,item_id,quantity,week_no
0,1004906,1,1
1,1033142,1,1
2,1036325,1,1
3,1082185,1,1
4,8160430,1,1


In [46]:
item_per_week = (item_per_week.assign(idx=item_per_week.groupby('item_id').cumcount())
                 .pivot_table(index='item_id', columns='week_no', 
                              values='quantity', aggfunc='sum'))
item_per_week = item_per_week.fillna(0)

In [47]:
df_ranker_train = df_ranker_train.merge(item_per_week, on='item_id', how='left')

**value (цена)**

In [48]:
#value = data[['item_id', 'sales_value']]
#value = value.groupby('item_id')['sales_value'].mean().to_frame().rename(columns={'sales_value':'value'})
#value.head(3)

In [49]:
#df_ranker_train = df_ranker_train.merge(value, on='item_id', how='left')

In [50]:
df_ranker_train.head(3)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,86,87,88,89,90,91,92,93,94,95
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,...,2.0,0.0,2.0,0.0,1.0,0.0,2.0,0.0,0.0,3.0
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,2070,879194,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,14 CT,45-54,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [51]:
X_train = df_ranker_train.drop('target', axis=1)
y_train = df_ranker_train[['target']]

In [52]:
cat_feats = X_train.columns[2:].tolist()
X_train[cat_feats] = X_train[cat_feats].astype('category')

# cat_feats

## Обучение модели

In [53]:
lgb = LGBMClassifier(objective='binary',
                     max_depth=10,
                     n_estimators=400,
                     learning_rate=0.07,
                     categorical_column=cat_feats)

lgb.fit(X_train, y_train)

train_preds = lgb.predict_proba(X_train)

  return f(**kwargs)


In [54]:
df_ranker_predict = df_ranker_train.copy()

In [55]:
df_ranker_predict['proba_item_purchase'] = train_preds[:,1]

In [56]:
result_eval_ranker = data_val_ranker.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_ranker.columns=[USER_COL, ACTUAL_COL]
result_eval_ranker.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [57]:
%%time
result_eval_ranker['own_rec'] = result_eval_ranker[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))

Wall time: 13 s


In [58]:
# померяем precision только модели матчинга, чтобы понимать влияение ранжирования на метрики

sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True)

[('own_rec', 0.1444117647058813)]

In [59]:
def rerank(user_id):
    return df_ranker_predict[df_ranker_predict[USER_COL]==user_id].sort_values('proba_item_purchase', ascending=False).head(5).item_id.tolist()

In [60]:
result_eval_ranker['reranked_own_rec'] = result_eval_ranker[USER_COL].apply(lambda user_id: rerank(user_id))

In [61]:
print(*sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec', 0.20511749347258304)
('own_rec', 0.1444117647058813)


  return flags.sum() / len(recommended_list)


Я добавила несколько своих фичей и настроила параметры модели. Мне удалось улучшить результат на 5%. Мой результат: 'reranked_own_rec', 0.20511749347258304