### Загрузка библиотек

In [81]:
from catboost import CatBoostRanker
import numpy as np
import pandas as pd
import sys
from src.metrics import precision_at_k, recall_at_k
from src.recommenders import MainRecommender
from src.utils import prefilter_items

### Загрузка данных

In [82]:
# Read the raw CSV inputs; the paths are relative to the notebook's working directory.
data_train = pd.read_csv('data/retail_train.csv')
data_test = pd.read_csv('data/retail_test.csv')
# Item and household attribute tables; their column names are normalised further below.
item_features = pd.read_csv('data/product.csv')
user_features = pd.read_csv('data/hh_demographic.csv')

In [83]:
# Column names shared by every frame in the notebook.
ITEM_COL = 'item_id'
USER_COL = 'user_id'
ACTUAL_COL = 'actual'  # column holding the items each user actually bought
N_CANDIDATES = 50  # number of candidates requested from each first-stage recommender
N_RANGED = 5  # cut-off k for precision@k and length of the final re-ranked list

In [84]:
# Lower-case every column name, then align the key columns with ITEM_COL / USER_COL.
item_features.columns = item_features.columns.str.lower()
user_features.columns = user_features.columns.str.lower()
item_features = item_features.rename(columns={'product_id': ITEM_COL})
user_features = user_features.rename(columns={'household_key': USER_COL})

### Разделение данных

In [85]:
# Number of trailing weeks of data_train held out from the first-stage (matcher) training set.
VAL_MATCHER_WEEKS = 6

In [86]:
# Time-based split: everything up to the cut-off week trains the matcher, the last
# VAL_MATCHER_WEEKS weeks validate it and double as the ranker's training data.
matcher_cutoff_week = data_train['week_no'].max() - VAL_MATCHER_WEEKS
data_train_matcher = data_train[data_train['week_no'] <= matcher_cutoff_week]
data_val_matcher = data_train[data_train['week_no'] > matcher_cutoff_week]
data_train_ranker = data_val_matcher.copy()

### Префильтрация

- Убираем товары, которые не продавались за последние 12 месяцев;
- Убираем некоторые категории товаров;
- Убираем слишком дешевые и слишком дорогие товары;
- Оставляем только топ-5000 популярных товаров.

In [87]:
# Report how many distinct items survive prefiltering. The item column is addressed
# through ITEM_COL so this cell follows the rest of the notebook.
# NOTE: data_train_matcher is overwritten, so re-running this cell filters the
# already filtered frame.
n_items_before = data_train_matcher[ITEM_COL].nunique()
data_train_matcher = prefilter_items(data_train_matcher, item_features=item_features, n_popular=5000)
n_items_after = data_train_matcher[ITEM_COL].nunique()
print(f'Decreased # items from {n_items_before} to {n_items_after}')

Decreased # items from 85828 to 5001


In [88]:
# Restrict the validation / ranker frames to users present in the matcher training data.
common_users = data_train_matcher.user_id.unique()
seen_in_val = data_val_matcher.user_id.isin(common_users)
seen_in_ranker = data_train_ranker.user_id.isin(common_users)
data_val_matcher = data_val_matcher.loc[seen_in_val]
data_train_ranker = data_train_ranker.loc[seen_in_ranker]

In [89]:
def calc_metric(df_result, metric_fn, top_k, actual_col=None):
    """Yield ``(column name, mean metric)`` for every recommendation column.

    The first two columns of ``df_result`` are skipped positionally (in this
    notebook they are the user id and the actual purchases); every remaining
    column is treated as a column of recommendation lists.

    Parameters
    ----------
    df_result : pandas.DataFrame
        One row per user.
    metric_fn : callable
        Called as ``metric_fn(recommended, actual, k=top_k)`` for each row.
    top_k : int
        Cut-off forwarded to ``metric_fn``.
    actual_col : str, optional
        Column with the ground truth; defaults to the notebook's ``ACTUAL_COL``.
    """
    if actual_col is None:
        actual_col = ACTUAL_COL
    for col_name in df_result.columns[2:]:
        per_user = df_result.apply(
            lambda row: metric_fn(row[col_name], row[actual_col], k=top_k), axis=1
        )
        yield col_name, per_user.mean()


def calc_recall(df_result, top_k):
    """Yield ``(column name, mean recall@top_k)`` per recommendation column."""
    yield from calc_metric(df_result, recall_at_k, top_k)


def calc_precision(df_result, top_k):
    """Yield ``(column name, mean precision@top_k)`` per recommendation column."""
    yield from calc_metric(df_result, precision_at_k, top_k)

In [90]:
# Build the recommender on the prefiltered matcher data. Until it is re-created below,
# this instance is only used for its overall_top_purchases (popularity baseline).
recommender = MainRecommender(data_train_matcher)

In [91]:
# One row per test user with the array of distinct items that user bought.
result_test = (
    data_test.groupby(USER_COL, sort=False)[ITEM_COL]
    .unique()
    .reset_index(name=ACTUAL_COL)
)
result_test.head(2)

Unnamed: 0,user_id,actual
0,1340,"[912987, 819255, 834117, 866227, 889362, 89608..."
1,588,"[1024426, 6534178, 9673270, 826842, 833025, 85..."


In [92]:
# Show the test users that do not occur in the matcher training data.
result_test.loc[~result_test[USER_COL].isin(data_train_matcher[USER_COL].unique())]

Unnamed: 0,user_id,actual
1220,2325,"[849274, 863885, 872137, 877913, 883932, 96520..."


In [93]:
# Keep only test users known to the matcher. The explicit copy turns the filtered
# slice into an independent frame, so the columns added in later cells are written
# to it directly and do not trigger SettingWithCopyWarning.
known_users = data_train_matcher[USER_COL].unique()
result_test = result_test.loc[result_test[USER_COL].isin(known_users)].copy()

In [94]:
# Popularity baseline: every user gets the same first N_CANDIDATES entries of
# recommender.overall_top_purchases (the user id passed to the lambda is ignored).
result_test['top_popular'] = result_test[USER_COL].apply(lambda x: recommender.overall_top_purchases[:N_CANDIDATES])

In [95]:
# Precision@N_RANGED of the popularity baseline. The score is looked up by column
# name, so the cell keeps working (and keeps meaning the same thing) even when
# result_test carries more than one recommendation column.
baseline_metric = dict(calc_precision(result_test, N_RANGED))['top_popular']
baseline_metric

0.12611464968152866

### Подбор кандидатов и выбор модели подбора

In [96]:
# Re-create the recommender with explicit K1 / B arguments; all candidate generation
# below uses this instance.
# NOTE(review): K1 and B look like BM25 weighting parameters — confirm in src.recommenders.
recommender = MainRecommender(data_train_matcher, K1=1, B=0.3)

In [97]:
# One row per validation user with the array of distinct items bought in the hold-out weeks.
result_eval_matcher = (
    data_val_matcher.groupby(USER_COL, sort=False)[ITEM_COL]
    .unique()
    .reset_index(name=ACTUAL_COL)
)
result_eval_matcher.head(2)

Unnamed: 0,user_id,actual
0,843,"[845193, 865891, 883404, 904375, 923746, 93663..."
1,2223,"[7155012, 14077656, 936753, 941856, 1103105, 1..."


In [98]:
def make_recommendations(df_result, rec_name_model, N=50):
    """Add one column of recommendations to ``df_result`` in place.

    ``rec_name_model`` is a ``(column name, recommend function)`` pair; the
    function is called as ``recommend(user_id, N=N)`` for every value of the
    user column and its result is stored under the given column name.
    Nothing is returned.
    """
    column_name, recommend = rec_name_model[0], rec_name_model[1]

    def recommend_for(user):
        return recommend(user, N=N)

    df_result[column_name] = df_result[USER_COL].apply(recommend_for)

In [99]:
# (result column name, recommender method) pairs in the shape make_recommendations expects.
own_rec = ('own_recs', recommender.get_own_recommendations)
als_rec = ('als_recs', recommender.get_als_recommendations)
sim_user_rec = ('similar_user_recs', recommender.get_similar_users_recommendation)
sim_item_rec = ('similar_item_recs', recommender.get_similar_items_recommendation)

In [100]:
%%time

# Add one candidate column per first-stage model to result_eval_matcher.
for rec in (own_rec, als_rec, sim_user_rec, sim_item_rec):
    make_recommendations(result_eval_matcher, rec, N=N_CANDIDATES)

CPU times: user 1min 37s, sys: 0 ns, total: 1min 37s
Wall time: 1min 37s


In [101]:
# Seed the 'own+top_pop' column with half of the candidate budget taken from the
# user's own recommendations; the other half is filled with popular items below.
make_recommendations(result_eval_matcher, ('own+top_pop', recommender.get_own_recommendations), N=N_CANDIDATES//2)

In [102]:
def fill_with_tops(column, N=5, tops=None):
    """Append up to ``N`` popular items that are not already recommended.

    Parameters
    ----------
    column : array-like
        Existing recommendations for one user.
    N : int
        Maximum number of popular items to append.
    tops : array-like, optional
        Popular items in priority order. Defaults to
        ``recommender.overall_top_purchases`` from the notebook namespace, which
        is what the function always used before this argument existed.

    Returns
    -------
    numpy.ndarray
        ``column`` followed by the first ``N`` entries of ``tops`` that do not
        occur in ``column``.
    """
    if tops is None:
        tops = recommender.overall_top_purchases
    tops = np.asarray(tops)
    recs = np.asarray(column)
    # Drop popular items the user already has, keeping the popularity order.
    unseen_tops = tops[~np.isin(tops, recs)]
    return np.append(recs, unseen_tops[:N])

In [103]:
# Top up every user's own recommendations with popular items they do not already contain.
result_eval_matcher['own+top_pop'] = result_eval_matcher['own+top_pop'].apply(lambda row: fill_with_tops(row, N=N_CANDIDATES//2))

In [104]:
# Mean recall@N_CANDIDATES per candidate column, best first.
sorted(calc_recall(result_eval_matcher, N_CANDIDATES), key=lambda x: x[1], reverse=True)

[('own_recs', 0.11465071065557855),
 ('own+top_pop', 0.10104838811379524),
 ('als_recs', 0.08289372244749829),
 ('similar_user_recs', 0.05441639151085825),
 ('similar_item_recs', 0.048598920310415517)]

In [105]:
# Mean precision@N_RANGED per candidate column, best first.
sorted(calc_precision(result_eval_matcher, N_RANGED), key=lambda x: x[1], reverse=True)

[('own_recs', 0.3377612633534603),
 ('own+top_pop', 0.3377612633534603),
 ('als_recs', 0.18522991175104506),
 ('similar_item_recs', 0.10385508592661402),
 ('similar_user_recs', 0.09725963771481655)]

## Обучение модели ранжирования

In [106]:
# One row per distinct user of the ranker training period.
df_match_candidates = pd.DataFrame({USER_COL: data_train_ranker[USER_COL].unique()})

In [107]:
# Generate N_CANDIDATES ALS candidates per user; stored in the 'candidates' column.
make_recommendations(df_match_candidates, ('candidates', recommender.get_als_recommendations), N=N_CANDIDATES)

In [108]:
# Unroll each user's candidate list into one entry per candidate. Dropping the inner
# stack level leaves the original row index, which is what the join below relies on.
# NOTE(review): Series.explode() would likely be much faster — confirm it yields the same dtype.
df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items.name = ITEM_COL

In [109]:
# Replace the list column with one (user, item) row per candidate via an index join.
df_match_candidates = df_match_candidates.drop('candidates', axis=1).join(df_items)

In [110]:
# Every (user, item) pair bought in the ranker period is a positive example.
df_ranker_train = data_train_ranker[[USER_COL, ITEM_COL]].assign(target=1)

In [111]:
# Label the candidates: pairs that were actually bought keep target == 1, all other
# candidates become negatives. The fill is written back by assignment instead of a
# chained ``df['target'].fillna(..., inplace=True)``, which operates on a temporary
# object (and therefore does nothing) when pandas Copy-on-Write is active.
df_ranker_train = (
    df_match_candidates
    .merge(df_ranker_train, on=[USER_COL, ITEM_COL], how='left')
    .drop_duplicates(subset=[USER_COL, ITEM_COL])
    .assign(target=lambda frame: frame['target'].fillna(0))
)

In [112]:
# Class balance of the ranker training pairs (0 = candidate not bought, 1 = bought).
df_ranker_train.target.value_counts()

target
0.0    97093
1.0    10557
Name: count, dtype: int64

In [113]:
class DataProcessor:
    """Build the ranker's feature matrix for (user, item) candidate pairs.

    ``fit`` stores the feature tables and the interaction history, builds the
    features of the training pairs and remembers which columns contained
    missing values. ``transform`` builds the same columns for new pairs and
    fills gaps with the most frequent values of the processed training frame.

    The returned frames do not contain the user / item id columns. Every
    column that comes from the item / user feature tables is converted to
    ``category`` dtype and listed in ``cat_feats``.
    """

    def __init__(self):
        self.cat_feats = None  # names of the categorical feature columns
        self.data_train_ranker = None  # processed training features (source of fill values)
        self.item_features = None
        self.nan_cols = None  # columns that had missing values when fit() ran
        self.train_data = None  # interaction history with a 'department' column merged in
        self.user_features = None

    def _aggregate_features(self):
        """Return ``(merge keys, named Series)`` pairs in output column order."""
        interactions = self.train_data
        n_weeks = interactions['week_no'].nunique()
        user_department = [USER_COL, 'department']
        user_item = [USER_COL, ITEM_COL]

        def grouped(keys, column):
            return interactions.groupby(keys, sort=False)[column]

        return [
            # Mean sales_value over all of the user's rows.
            (USER_COL, grouped(USER_COL, 'sales_value').mean().rename('avg_bill')),
            # Mean sales_value of the user inside the item's department.
            (user_department,
             grouped(user_department, 'sales_value').mean().rename('avg_cat_spendings')),
            # Purchase rows per week for the item / for its department.
            (ITEM_COL,
             (grouped(ITEM_COL, 'quantity').count() / n_weeks).rename('avg_week_purchases')),
            ('department',
             (grouped('department', 'quantity').count() / n_weeks).rename('avg_week_purchases_cat')),
            # Mean sales_value over all rows of the department.
            ('department',
             grouped('department', 'sales_value').mean().rename('avg_cat_spendings_items')),
            # Purchase rows per week of the user inside the department.
            (user_department,
             (grouped(user_department, 'quantity').count() / n_weeks).rename('user_week_cat_purchase')),
            # Number of interaction rows of the item.
            (ITEM_COL, grouped(ITEM_COL, USER_COL).count().rename('item_popularity')),
            # Total quantity of the item bought by the user.
            (user_item, grouped(user_item, 'quantity').sum().rename('total_buys')),
        ]

    @staticmethod
    def _columns_with_nan(df):
        """Return the names of the columns of ``df`` that contain missing values."""
        n_missing = df.isna().sum()
        return n_missing[n_missing > 0].index.tolist()

    @staticmethod
    def _fill_with_mode(df, col, source):
        """Fill missing values of ``df[col]`` with the most frequent value of ``source[col]``."""
        fill_value = source[col].value_counts().index[0]
        column = df[col]
        # A categorical column can only be filled with one of its categories, so
        # register the fill value first when this frame has never seen it.
        if isinstance(column.dtype, pd.CategoricalDtype) and fill_value not in column.cat.categories:
            column = column.cat.add_categories([fill_value])
        # Assign the result back: a chained ``df[col].fillna(..., inplace=True)``
        # does nothing under pandas Copy-on-Write.
        df[col] = column.fillna(fill_value)

    def _add_features(self, df, is_fit=True):
        """Return the feature frame for the (user, item) pairs in ``df``.

        Parameters
        ----------
        df : pandas.DataFrame
            Must contain the ``USER_COL`` and ``ITEM_COL`` columns; other
            columns are ignored.
        is_fit : bool
            ``True`` when the processor is already fitted (the ``transform``
            path): the stored ``nan_cols`` and the processed training frame
            are used. ``False`` on the pass made by ``fit`` itself: ``nan_cols``
            is computed from this frame, which is also the fill source.

        Raises
        ------
        RuntimeError
            If called with ``is_fit=True`` before ``fit``.
        """
        df = df[[USER_COL, ITEM_COL]].copy()
        df = df.merge(self.item_features, on=ITEM_COL, how='left')
        df = df.merge(self.user_features, on=USER_COL, how='left')
        n_raw_columns = len(df.columns)
        for keys, aggregate in self._aggregate_features():
            df = df.merge(aggregate.reset_index(), how='left', on=keys)
        n_aggregates = len(df.columns) - n_raw_columns

        # Drop the two id columns; everything before the aggregates is categorical.
        df = df.iloc[:, 2:].copy()
        self.cat_feats = df.columns.tolist()[:-n_aggregates]
        df[self.cat_feats] = df[self.cat_feats].astype('category')

        if not is_fit:
            self.nan_cols = self._columns_with_nan(df)
        elif self.nan_cols is None:
            raise RuntimeError('DataProcessor.transform() was called before fit()')
        # Chosen once, before either loop, so it is always defined, including
        # the case where nan_cols is empty but this frame still has gaps.
        fill_source = self.data_train_ranker if is_fit else df

        for col in self.nan_cols:
            # Indicator of the original gaps, then impute the column itself.
            df[f'{col}_nan'] = df[col].isna().astype('int64')
            self._fill_with_mode(df, col, fill_source)
        # Columns that were complete at fit time but have gaps in this frame.
        for col in self._columns_with_nan(df):
            self._fill_with_mode(df, col, fill_source)
        return df

    def fit(self, data_train_ranker, train_data, user_features, item_features):
        """Store the feature tables and build the processed training frame.

        ``train_data`` is the interaction history the aggregates are computed
        from; each row is given the ``department`` of its item. Returns ``None``.
        """
        self.user_features = user_features
        self.item_features = item_features
        self.train_data = train_data.merge(
            item_features[[ITEM_COL, 'department']], how='left', on=ITEM_COL
        )
        self.data_train_ranker = self._add_features(data_train_ranker, is_fit=False)

    def transform(self, X):
        """Return the feature frame for the (user, item) pairs in ``X``."""
        return self._add_features(X)

In [114]:
# Fit the feature processor: the candidate pairs come from df_ranker_train, the
# aggregate statistics are computed on the matcher training interactions.
processor = DataProcessor()
processor.fit(df_ranker_train, data_train_matcher, user_features, item_features)

In [115]:
# Feature matrix for the ranker's training pairs (the id columns are dropped by the processor).
X_train = processor.transform(df_ranker_train)

In [116]:
# Binary relevance labels, in the same row order as df_ranker_train.
y_train = df_ranker_train['target']

In [117]:
# Pairwise ranking model; the categorical columns are the ones collected by the processor.
# NOTE(review): task_type='GPU' presumably requires a CUDA-capable device — confirm, or
# switch to 'CPU' when running elsewhere.
model = CatBoostRanker(
    iterations=200,
    silent=True,
    eta=0.2,
    task_type='GPU',
    max_depth=7,
    loss_function='PairLogitPairwise',
    random_state=42,
    cat_features=processor.cat_feats
    )

In [118]:
%%time

# Each user forms one ranking group; items are passed as subgroup ids.
# NOTE(review): CatBoost presumably expects the rows of a group to be contiguous —
# confirm that df_ranker_train keeps each user's rows together.
model.fit(X_train, y_train, group_id=df_ranker_train[USER_COL], subgroup_id=df_ranker_train[ITEM_COL])

CPU times: user 39.7 s, sys: 507 ms, total: 40.3 s
Wall time: 22.2 s


<catboost.core.CatBoostRanker at 0x7f0cfd9191b0>

In [119]:
# Rebuild the per-user ground truth for the final evaluation on the test set.
result_test = (
    data_test.groupby(USER_COL, sort=False)[ITEM_COL]
    .unique()
    .reset_index(name=ACTUAL_COL)
)

In [120]:
# Keep only test users known to the matcher. The explicit copy makes the filtered
# slice an independent frame: make_recommendations and the re-ranking step add
# columns to it in place, which would otherwise raise SettingWithCopyWarning.
known_users = data_train_matcher[USER_COL].unique()
result_test = result_test.loc[result_test[USER_COL].isin(known_users)].copy()

In [121]:
# First-stage ALS candidates for every test user; adds the 'als_recs' column in place.
make_recommendations(result_test, ('als_recs', recommender.get_als_recommendations), N=N_CANDIDATES)

In [122]:
# Precision@N_RANGED of the raw ALS candidates, i.e. before re-ranking.
sorted(calc_precision(result_test, N_RANGED), key=lambda x: x[1], reverse=True)

[('als_recs', 0.13036093418259023)]

In [123]:
# Use the ALS recommendations as candidate lists and drop the ground-truth column.
df_test_candidates = result_test.rename(columns={'als_recs': 'candidates'}).drop(ACTUAL_COL, axis=1)

In [124]:
# Unroll each user's candidate list into one entry per candidate. Dropping the inner
# stack level leaves the original row index, which is what the join below relies on.
df_items_test = df_test_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items_test.name = ITEM_COL

In [125]:
# Replace the list column with one (user, item) row per candidate via an index join.
df_test_candidates = df_test_candidates.drop('candidates', axis=1).join(df_items_test)

In [126]:
# Build the test feature matrix with the processor that was fitted on the training pairs.
X_test = processor.transform(df_test_candidates)

In [127]:
# Ranking score for every candidate row of X_test.
test_preds = model.predict(X_test)

In [128]:
# Attach the scores to the candidate pairs.
# NOTE(review): this relies on X_test having the same number and order of rows as
# df_test_candidates — confirm the feature merges never duplicate or reorder rows.
df_test_candidates['score_item_purchase'] = test_preds

In [129]:
def rerank(user_id):
    """Return the ids of the ``N_RANGED`` highest-scored candidates of ``user_id``.

    Reads the notebook-level ``df_test_candidates`` frame and orders the user's
    rows by ``score_item_purchase``, best first.
    """
    user_rows = df_test_candidates.loc[df_test_candidates[USER_COL] == user_id]
    best_first = user_rows.sort_values('score_item_purchase', ascending=False)
    return best_first.head(N_RANGED).item_id.tolist()

In [130]:
# Final recommendations: the ranker's top N_RANGED candidates for every test user.
result_test['reranked_als_recs'] = result_test[USER_COL].apply(rerank)

In [131]:
# Precision@N_RANGED of the re-ranked recommendations. The score is selected by
# column name: taking the best of all columns would silently report the raw ALS
# score whenever re-ranking made things worse.
final_metric = dict(calc_precision(result_test, N_RANGED))['reranked_als_recs']
final_metric

0.2627388535031847