<a href="https://colab.research.google.com/github/dmitryrubtsov/Recommender-systems/blob/master/recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Рекомендательные системы в бизнесе

## Info

**Целевая метрика** - money precision@5. Порог для уcпешной сдачи проекта money precision@5 > 20%

**Бизнес ограничения в топ-5 товарах:**
- Для каждого юзера 5 рекомендаций
- 2 новых товара (юзер никогда не покупал)
- 1 дорогой товар, > 7 долларов
- Все товары из разных категорий (категория - sub_commodity_desc)  
- Стоимость каждого рекомендованного товара > 1 доллара  

**Baseline**: [github](https://github.com/geangohn/recsys-tutorial)

$$\mathtt{Money \quad Precision@k} = \frac{\mathtt{revenue \quad of \quad recommended \quad items @K \quad that \quad are  \quad relevant}}{\mathtt{revenue \quad of \quad recommended \quad items @k}}$$

<a id=load></a>
## Load Data and Modules

### Load modules

In [1]:
!pip install implicit swifter



In [2]:
import pandas as pd
import swifter

In [3]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

In [4]:
# from src.utils import optimizing_df, postfilter_items
# from src.metrics import money_precision_at_k
# from src.pipeline import AlsEstimator, ColumnSelector, PrefilterItems, RandomEstimator

### src

#### metrics.py


In [5]:
def money_precision_at_k(y_pred: pd.Series, y_true: pd.Series, item_price, k=5):
    y_pred = y_pred.swifter.progress_bar(False).apply(pd.Series)
    user_filter = ~(y_true.swifter.progress_bar(False).apply(len) < k)

    y_pred = y_pred.loc[user_filter]
    y_true = y_true.loc[user_filter]

    prices_recommended = y_pred.swifter.progress_bar(False).applymap(lambda item: item_price.price.get(item))
    flags = y_pred.loc[:, :k - 1].swifter.progress_bar(False) \
        .apply(lambda row: np.isin(np.array(row), y_true.get(row.name)), axis=1) \
        .swifter.progress_bar(False).apply(pd.Series)

    metric = (
        (flags * prices_recommended.loc[:, :k - 1]).sum(axis=1) / prices_recommended.loc[:, :k - 1].sum(axis=1)
    ).mean()
    return metric

#### utils.py

In [6]:
import pandas as pd
import swifter
import numpy as np


def optimizing_df(df, silent=False, width_line=100):
    assert isinstance(df, pd.DataFrame), 'This is not a dataframe'

    if not silent:
        start_memory_usage = df.memory_usage(deep=True).sum() / 1024**2
        print('Start of dataframe memory optimization'.center(width_line, '*'))
        print(f'Memory usage by dataframe: {start_memory_usage:.02f} MB')

    df_dtype = pd.DataFrame(df.dtypes, columns=['dtype'], index=df.columns)

    df_dtype['min'] = df.select_dtypes(['int', 'float']).min()
    df_dtype['max'] = df.select_dtypes(['int', 'float']).max()
    df_dtype['is_int'] = ~(df.select_dtypes(['int', 'float']).fillna(0).astype(int) - df.select_dtypes(['int', 'float']).fillna(0)).sum().astype('bool_')

    df_dtype.loc[(df_dtype['is_int'] == True), 'dtype'] = 'int64'
    df_dtype.loc[(df_dtype['is_int'] == True) & (df_dtype['min'] >= np.iinfo('int32').min) & (df_dtype['max'] <= np.iinfo('int32').max), 'dtype'] = 'int32'
    df_dtype.loc[(df_dtype['is_int'] == True) & (df_dtype['min'] >= np.iinfo('int16').min) & (df_dtype['max'] <= np.iinfo('int16').max), 'dtype'] = 'int16'
    df_dtype.loc[(df_dtype['is_int'] == True) & (df_dtype['min'] >= np.iinfo('int8').min) & (df_dtype['max'] <= np.iinfo('int8').max), 'dtype'] = 'int8'

    df_dtype.loc[(df_dtype['is_int'] == True) & (df_dtype['min'] >= np.iinfo('uint64').min) ,'dtype'] = 'uint64'
    df_dtype.loc[(df_dtype['is_int'] == True) & (df_dtype['min'] >= np.iinfo('uint32').min) & (df_dtype['max'] <= np.iinfo('uint32').max), 'dtype'] = 'uint32'
    df_dtype.loc[(df_dtype['is_int'] == True) & (df_dtype['min'] >= np.iinfo('uint16').min) & (df_dtype['max'] <= np.iinfo('uint16').max), 'dtype'] = 'uint16'
    df_dtype.loc[(df_dtype['is_int'] == True) & (df_dtype['min'] >= np.iinfo('uint8').min) & (df_dtype['max'] <= np.iinfo('uint8').max), 'dtype'] = 'uint8'

    df_dtype.loc[(df_dtype['is_int'] == True) & (df_dtype['min'] == 0) & (df_dtype['max'] == 1),'dtype'] = 'bool_'

    df_dtype.loc[(df_dtype['is_int'] == False), 'dtype'] = 'float64'
    df_dtype.loc[(df_dtype['is_int'] == False) & (df_dtype['min'] >= np.finfo('float32').min) & (df_dtype['max'] <= np.finfo('float32').max), 'dtype'] = 'float32'

    for col in df.select_dtypes('object').columns:
        num_unique_values = df[col].nunique()
        num_total_values = df[col].count()
        if num_unique_values / num_total_values < 0.5:
            df_dtype.loc[col, 'dtype'] = 'category'

    dtypes = df_dtype['dtype'].to_dict()

    df = df.astype(dtypes)

    if not silent:
        memory_usage = df.memory_usage(deep=True).sum() / 1024**2
        print('MEMORY USAGE AFTER COMPLETION:'.center(width_line, '_'))
        print(f'Memory usage of properties dataframe is : {memory_usage:.02f} MB')
        print(f'This is {100 * memory_usage / start_memory_usage:.02f} % of the initial size')
    return df


def postfilter_items(recommendations, item_info=None, user_history=None, n_rec=5, n_new=2, n_exp=1, price_lte=7):
    recommendations = recommendations.swifter.progress_bar(False).apply(pd.Series)

    mask_unique = recommendations.swifter.progress_bar(False) \
        .apply(lambda row: ~pd.Series(row).duplicated(), axis=1)

    mask_sub_commodity = recommendations.fillna(item_info.index.max() + 1) \
        .swifter.progress_bar(False) \
        .applymap(lambda item: item_info.SUB_COMMODITY_DESC.get(item)) \
        .swifter.progress_bar(False) \
        .apply(lambda row: ~pd.Series(row).duplicated(), axis=1)

    mask = mask_unique & mask_sub_commodity

    recommendations = recommendations.where(mask) \
        .swifter.progress_bar(False) \
        .apply(lambda row: np.array(row), axis=1) \
        .swifter.progress_bar(False) \
        .apply(lambda item: item[~np.isnan(item)]) \
        .swifter.progress_bar(False) \
        .apply(pd.Series)
    if user_history is not None:
        rec_new = recommendations.fillna(user_history.index.max() + 1) \
            .swifter.progress_bar(False) \
            .apply(lambda row: ~np.isin(np.array(row), user_history.get(row.name)), axis=1) \
            .swifter.progress_bar(False).apply(pd.Series)
        rec_new_filter = rec_new.loc[rec_new.loc[:, :n_rec - 1].sum(axis=1) < n_new]
        mask_new = rec_new_filter.swifter.progress_bar(False) \
            .apply(lambda row: postfilter_for_item(row, n=n_new, n_rec=n_rec), axis=1).swifter \
            .progress_bar(False).apply(pd.Series)
        recommendations.loc[mask_new.index] = recommendations.loc[mask_new.index].where(mask_new.apply(pd.Series))
        recommendations = recommendations.swifter.progress_bar(False) \
            .apply(lambda row: np.array(row), axis=1) \
            .swifter.progress_bar(False).apply(lambda item: item[~np.isnan(item)]) \
            .swifter.progress_bar(False).apply(pd.Series)
    if item_info is not None:
        rec_exp = recommendations.fillna(item_info.index.max() + 1) \
            .swifter.progress_bar(False).applymap(lambda item: item_info.price.get(item))
        rec_exp_filter = rec_exp.loc[(rec_exp.loc[:, :n_rec - 1] >= price_lte).sum(axis=1) < n_exp]
        mask_exp = rec_exp_filter.swifter.progress_bar(False) \
            .apply(lambda row: postfilter_for_item(row, n=n_exp, n_rec=n_rec), axis=1) \
            .swifter.progress_bar(False).apply(pd.Series)
        recommendations.loc[mask_exp.index] = recommendations.loc[mask_exp.index] \
            .where(mask_exp.apply(pd.Series))
        recommendations = recommendations.swifter.progress_bar(False) \
            .apply(lambda row: np.array(row, dtype='uint'), axis=1) \
            .swifter.progress_bar(False).apply(lambda item: item[~np.isnan(item)]) \
            .swifter.progress_bar(False).apply(pd.Series)

    recommendations = recommendations.loc[:, :n_rec - 1].swifter.progress_bar(False) \
        .apply(lambda row: np.array(row), axis=1)
    return recommendations


def postfilter_for_item(items_mask, n=2, n_rec=5):
    mask = np.ones(len(items_mask), dtype=bool)
    n = n - items_mask[:n_rec].sum()
    n_low = n
    for index, item in enumerate(np.flip(items_mask[:n_rec])):
        if not item:
            mask[n_rec - index - 1] = False
            n_low -= 1
        if not n_low:
            break
    n_high = n
    for index, item in enumerate(items_mask[n_rec:]):
        if not item:
            mask[n_rec + index] = False
        else:
            n_high -= 1
        if not n_high:
            break
    return mask

#### pipeline.py

In [63]:
import pandas as pd
import numpy as np
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import ItemItemRecommender
from implicit.nearest_neighbours import bm25_weight
from scipy.sparse import csr_matrix
from sklearn.base import BaseEstimator, TransformerMixin


class ColumnSelector(TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        assert isinstance(X, pd.DataFrame), 'This is not a dataframe'
        try:
            return X[self.columns]
        except KeyError:
            cols_error = list(set(self.columns) - set(X.columns))
            raise KeyError(f'DataFrame does not contain the following columns: {cols_error}')


class PrefilterItems(TransformerMixin, BaseEstimator):
    def __init__(
            self, take_n_popular=5000, item_features=None,
            filter_item_id=-99, n_last_week=52, price_low=2, price_high=50
    ):

        self.take_n_popular = take_n_popular
        self.item_features = item_features
        self.filter_item_id = filter_item_id
        self.n_last_week = n_last_week
        self.price_low = price_low
        self.price_high = price_high

    def _reset(self):
        if hasattr(self, 'is_fit_'):
            del self.is_fit_

    def fit(self, X, items=None):
        self._reset
        return self

    def transform(self, X, items=None):
        if not hasattr(self, 'is_fit_'):
            assert isinstance(X, pd.DataFrame), 'This is not a dataframe'
            # Уберем самые популярные товары (их и так купят)
            popularity = X.groupby('item_id')['user_id'].nunique().reset_index() / X['user_id'].nunique()
            popularity.rename(columns={'user_id': 'share_unique_users'}, inplace=True)

            top_popular = popularity[popularity['share_unique_users'] > 0.2].item_id.tolist()
            X = X[~X['item_id'].isin(top_popular)]
            # Уберем самые НЕ популярные товары (их и так НЕ купят)
            top_notpopular = popularity[popularity['share_unique_users'] < 0.02].item_id.tolist()
            X = X[~X['item_id'].isin(top_notpopular)]
            # Уберем товары, которые не продавались за последние n недель
            last_time = X.week_no.max() - self.n_last_week
            X = X.loc[X.item_id.isin(X.loc[X.week_no > last_time, 'item_id'])]
            # Уберем не интересные для рекоммендаций категории (department)
            if self.item_features is not None:
                department_size = self.item_features.groupby('DEPARTMENT')['PRODUCT_ID'] \
                    .nunique().sort_values(ascending=False).rename('n_items')
                rare_departments = department_size[department_size > 150].index.tolist()
                items_in_rare_departments = self.item_features.loc[self.item_features['DEPARTMENT']
                                                                   .isin(rare_departments)]['PRODUCT_ID'].unique().tolist()
                X = X.loc[X.item_id.isin(items_in_rare_departments)]
            # Уберем слишком дешевые товары (на них не заработаем). 1 покупка из рассылок стоит 60 руб.
            X = X[X['price'] > self.price_low]
            # Уберем слишком дорогие товары
            X = X[X['price'] < self.price_high]
            # Возьмем топ по популярности
            popularity = X.groupby('item_id')['quantity'].sum().reset_index()
            popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)

            top = popularity.sort_values('n_sold', ascending=False)[: self.take_n_popular].item_id.tolist()
            # Заведем фиктивный item_id (если юзер покупал товары из топ-n, то он "купил" такой товар)
            X.loc[~X['item_id'].isin(top), 'item_id'] = self.filter_item_id
            self.is_fit_ = True

        return X


class RandomEstimator(TransformerMixin, BaseEstimator):
    def __init__(
        self, n_rec=5, n_rec_pre=100, n_new=2, n_exp=1, price_lte=7,
        filter_item_id=-99, filter=True, filter_post=True,
        postfilter_func=None, random_state=42
    ):

        self.n_rec = n_rec
        self.n_rec_pre = n_rec_pre
        self.n_new = n_new
        self.n_exp = n_exp
        self.price_lte = price_lte
        self.filter_item_id = filter_item_id
        self.filter = filter
        self.filter_post = filter_post
        self.postfilter_func = postfilter_func
        self.random_state = random_state

    def _reset(self):
        if hasattr(self, 'items'):
            del self.items
        if hasattr(self, 'item_info'):
            del self.item_info
        if hasattr(self, 'user_history'):
            del self.user_history

    def fit(self, X, items=None):
        self._reset()
        self.items = X.item_id.unique()
        self.item_info = X.groupby('item_id').agg({'price': 'max', 'SUB_COMMODITY_DESC': 'first'})
        self.user_history = pd.DataFrame(X.groupby('user_id').item_id.unique().rename('history'))

        if items is not None:
            self.items = items
        else:
            self.items = X.item_id.unique()
        if self.filter:
            self.items = self.items[np.where(self.items != self.filter_item_id)]
        return self

    def transform(self, X):
        X = X['user_id'].drop_duplicates()
        return X

    def predict(self, X):
        X = self.transform(X)

        if self.filter_post:
            n_rec = self.n_rec_pre
        else:
            n_rec = self.n_rec

        rec = X.swifter.progress_bar(False).apply(lambda x: self._random_recommendation(n_rec))
        rec.index = X.values

        if self.postfilter_func is not None and self.filter_post:
            rec = self.postfilter_func(
                rec,
                item_info=self.item_info,
                user_history=self.user_history,
                n_rec=self.n_rec,
                n_new=self.n_new,
                n_exp=self.n_exp,
                price_lte=self.price_lte,
            )

        assert (rec.swifter.progress_bar(False).apply(len) == self.n_rec).all(), f'The number of recommendations is not equal {self.n_rec}.'

        return rec

    def _random_recommendation(self, n_rec):
        np.random.seed(self.random_state)
        recs = np.random.choice(self.items, size=n_rec, replace=False, )
        return recs


class AlsEstimator(TransformerMixin, BaseEstimator):
    def __init__(
            self, recommendations='als', n_rec=5, n_rec_pre=100, n_new=2,
            n_exp=1, price_lte=7, filter_item_id=-99, filter=True, filter_post=True,
            postfilter_func=None, factors=50, regularization=0.01,
            iterations=10, matrix_values='quantity', matrix_aggfunc='count',
            weighting=True, use_native=True, use_gpu=False
    ):

        self.n_rec = n_rec
        self.n_rec_pre = n_rec_pre
        self.n_new = n_new
        self.n_exp = n_exp
        self.price_lte = price_lte
        self.filter_item_id = filter_item_id
        self.filter = filter
        self.filter_post = filter_post
        self.postfilter_func = postfilter_func

        self.factors = factors
        self.regularization = regularization
        self.iterations = iterations
        self.matrix_values = matrix_values
        self.matrix_aggfunc = matrix_aggfunc
        self.recommendations = recommendations
        self.weighting = True

        self.use_native = use_native
        self.use_gpu = use_gpu

    def _reset(self):
        if hasattr(self, 'item_info'):
            del self.item_info
        if hasattr(self, 'user_history'):
            del self.user_history
        if hasattr(self, 'top_purchases'):
            del self.top_purchases
        if hasattr(self, 'overall_top_purchases'):
            del self.overall_top_purchases
        if hasattr(self, 'user_item_matrix'):
            del self.user_item_matrix
        if hasattr(self, 'id_to_itemid'):
            del self.id_to_itemid
        if hasattr(self, 'id_to_userid'):
            del self.id_to_userid
        if hasattr(self, 'itemid_to_id'):
            del self.itemid_to_id
        if hasattr(self, 'userid_to_id'):
            del self.userid_to_id
        if hasattr(self, '_fit'):
            del self._fit

    @staticmethod
    def _prepare_matrix(data: pd.DataFrame, values: str, aggfunc: str):
        """Готовит user-item матрицу"""
        user_item_matrix = pd.pivot_table(data,
                                          index='user_id', columns='item_id',
                                          values=values,
                                          aggfunc=aggfunc,
                                          fill_value=0
                                          )

        user_item_matrix = user_item_matrix.astype(float)

        return user_item_matrix

    @staticmethod
    def _prepare_dicts(user_item_matrix):
        """Подготавливает вспомогательные словари"""

        userids = user_item_matrix.index.values
        itemids = user_item_matrix.columns.values

        matrix_userids = np.arange(len(userids))
        matrix_itemids = np.arange(len(itemids))

        id_to_itemid = dict(zip(matrix_itemids, itemids))
        id_to_userid = dict(zip(matrix_userids, userids))

        itemid_to_id = dict(zip(itemids, matrix_itemids))
        userid_to_id = dict(zip(userids, matrix_userids))

        return id_to_itemid, id_to_userid, itemid_to_id, userid_to_id

    def fit(self, X, y=None):
        self._reset()
        self.item_info = X.groupby('item_id').agg({'price': 'max', 'SUB_COMMODITY_DESC': 'first'})
        self.user_history = pd.DataFrame(X.groupby('user_id').item_id.unique().rename('history'))

        self.top_purchases = X.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
        self.top_purchases.sort_values('quantity', ascending=False, inplace=True)
        self.top_purchases = self.top_purchases[self.top_purchases['item_id'] != self.filter_item_id]

        # Топ покупок по всему датасету
        self.overall_top_purchases = X.groupby('item_id')['quantity'].count().reset_index()
        self.overall_top_purchases.sort_values('quantity', ascending=False, inplace=True)
        self.overall_top_purchases = self.overall_top_purchases[self.overall_top_purchases['item_id'] != self.filter_item_id]
        self.overall_top_purchases = self.overall_top_purchases.item_id.tolist()

        self.user_item_matrix = self._prepare_matrix(X, self.matrix_values, self.matrix_aggfunc)

        self.id_to_itemid, self.id_to_userid, \
            self.itemid_to_id, self.userid_to_id = self._prepare_dicts(self.user_item_matrix)

        if self.weighting:
            self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T

        self.model = AlternatingLeastSquares(
            factors=self.factors,
            regularization=self.regularization,
            iterations=self.iterations,
            dtype=np.float32,
            use_native=self.use_native,
            use_gpu=self.use_gpu,
        )

        self.model.fit(csr_matrix(self.user_item_matrix).T.tocsr())

        self.model_own_recommender = ItemItemRecommender(K=1)
        self.model_own_recommender.fit(csr_matrix(self.user_item_matrix).T.tocsr())

        self._fit = True

    def transform(self, X):
        if self._fit:
            X = X['user_id'].drop_duplicates()
            X.index = X.values
        return X

    def _update_dict(self, user_id):
        """Если появился новыю user / item, то нужно обновить словари"""

        if user_id not in self.userid_to_id.keys():

            max_id = max(list(self.userid_to_id.values()))
            max_id += 1

            self.userid_to_id.update({user_id: max_id})
            self.id_to_userid.update({max_id: user_id})

    def _get_similar_item(self, item_id):
        """Находит товар, похожий на item_id"""
        recs = self.model.similar_items(self.itemid_to_id[item_id], N=2)  # Товар похож на себя -> рекомендуем 2 товара
        top_rec = recs[1][0]  # И берем второй (не товар из аргумента метода)
        return self.id_to_itemid[top_rec]

    def _extend_with_top_popular(self, recommendations):
        """Если кол-во рекоммендаций < N, то дополняем их топ-популярными"""

        if self.filter_post:
            n_rec = self.n_rec_pre
        else:
            n_rec = self.n_rec

        if len(recommendations) < n_rec:
            recommendations.extend(self.overall_top_purchases[:n_rec])
            recommendations = recommendations[:n_rec]

        return recommendations

    def _get_recommendations(self, user, model):
        """Рекомендации через стардартные библиотеки implicit"""
        if self.filter_post:
            n_rec = self.n_rec_pre
        else:
            n_rec = self.n_rec

        self._update_dict(user_id=user)
        try:
            res = [self.id_to_itemid[rec[0]] for rec in model.recommend(
                userid=self.userid_to_id[user],
                user_items=csr_matrix(self.user_item_matrix).tocsr(),
                N=n_rec,
                filter_already_liked_items=False,
                filter_items=[self.itemid_to_id[self.filter_item_id]],
                recalculate_user=True
            )]
        except:
            res = list()
        finally:
            res = self._extend_with_top_popular(res)

            assert len(res) == n_rec, 'Количество рекомендаций != {}'.format(n_rec)
            return res

    def get_als_recommendations(self, user):
        """Рекомендации через стардартные библиотеки implicit"""
        if self.filter_post:
            n_rec = self.n_rec_pre
        else:
            n_rec = self.n_rec

        self._update_dict(user_id=user)
        return self._get_recommendations(user, model=self.model)

    def get_own_recommendations(self, user):
        """Рекомендуем товары среди тех, которые юзер уже купил"""

        self._update_dict(user_id=user)
        return self._get_recommendations(user, model=self.model_own_recommender)

    def get_similar_items_recommendations(self, user):
        """Рекомендуем товары, похожие на топ-N купленных юзером товаров"""
        if self.filter_post:
            n_rec = self.n_rec_pre
        else:
            n_rec = self.n_rec

        top_users_purchases = self.top_purchases[self.top_purchases['user_id'] == user].head(n_rec)

        res = top_users_purchases['item_id'].apply(lambda x: self._get_similar_item(x)).tolist()
        res = self._extend_with_top_popular(res)

        assert len(res) == n_rec, 'Количество рекомендаций != {}'.format(n_rec)
        return res

    def predict(self, X):
        X = self.transform(X)
        recommender = getattr(self, f'get_{self.recommendations}_recommendations')

        rec = X.swifter.progress_bar(False).apply(lambda item: recommender(user=item))
        if self.postfilter_func is not None and self.filter_post:
            rec = self.postfilter_func(
                rec,
                item_info=self.item_info,
                user_history=self.user_history,
                n_rec=self.n_rec,
                n_new=self.n_new,
                n_exp=self.n_exp,
                price_lte=self.price_lte,
            )

        assert (rec.swifter.progress_bar(False).apply(len) == self.n_rec).all(), f'The number of recommendations is not equal {self.n_rec}.'

        return rec

### Settings

In [8]:
RANDOM_STATE=42
pd.options.display.float_format = '{:.3f}'.format

In [9]:
RETAIL_TRAIN = 'https://d2xzmw6cctk25h.cloudfront.net/asset/2105403/attachment/b70bcdded071f30660fc25f875863ac5.zip'
RETAIL_TEST = 'https://d2xzmw6cctk25h.cloudfront.net/asset/2105404/attachment/0303cbb4c877096c5b306a4e5c35f96b.csv'
PRODUCT = 'https://d2xzmw6cctk25h.cloudfront.net/asset/2105401/attachment/26981bfce1b672ad9a0684f7ff39bae5.csv'
HH_DEMOGRAPHIC = 'https://d2xzmw6cctk25h.cloudfront.net/asset/2105400/attachment/bd91ad23de8c24c14f3ff3c6061edb0c.csv'

### Load data

In [10]:
data = pd.read_csv(RETAIL_TRAIN)
data_valid = pd.read_csv(RETAIL_TEST)
item_features = pd.read_csv(PRODUCT)
user_features = pd.read_csv(HH_DEMOGRAPHIC)

<a id=explore></a>
## Exploratory Data Analysis

In [11]:
data.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2396804 entries, 0 to 2396803
Data columns (total 12 columns):
 #   Column             Dtype  
---  ------             -----  
 0   user_id            int64  
 1   basket_id          int64  
 2   day                int64  
 3   item_id            int64  
 4   quantity           int64  
 5   sales_value        float64
 6   store_id           int64  
 7   retail_disc        float64
 8   trans_time         int64  
 9   week_no            int64  
 10  coupon_disc        float64
 11  coupon_match_disc  float64
dtypes: float64(4), int64(8)
memory usage: 219.4 MB


In [13]:
item_features.head()

Unnamed: 0,PRODUCT_ID,MANUFACTURER,DEPARTMENT,BRAND,COMMODITY_DESC,SUB_COMMODITY_DESC,CURR_SIZE_OF_PRODUCT
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,
3,26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ
4,26355,69,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES,14 OZ


In [14]:
item_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92353 entries, 0 to 92352
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   PRODUCT_ID            92353 non-null  int64 
 1   MANUFACTURER          92353 non-null  int64 
 2   DEPARTMENT            92353 non-null  object
 3   BRAND                 92353 non-null  object
 4   COMMODITY_DESC        92353 non-null  object
 5   SUB_COMMODITY_DESC    92353 non-null  object
 6   CURR_SIZE_OF_PRODUCT  92353 non-null  object
dtypes: int64(2), object(5)
memory usage: 4.9+ MB


In [15]:
user_features.head()

Unnamed: 0,AGE_DESC,MARITAL_STATUS_CODE,INCOME_DESC,HOMEOWNER_DESC,HH_COMP_DESC,HOUSEHOLD_SIZE_DESC,KID_CATEGORY_DESC,household_key
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7
2,25-34,U,25-34K,Unknown,2 Adults Kids,3,1,8
3,25-34,U,75-99K,Homeowner,2 Adults Kids,4,2,13
4,45-54,B,50-74K,Homeowner,Single Female,1,None/Unknown,16


In [16]:
user_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 801 entries, 0 to 800
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   AGE_DESC             801 non-null    object
 1   MARITAL_STATUS_CODE  801 non-null    object
 2   INCOME_DESC          801 non-null    object
 3   HOMEOWNER_DESC       801 non-null    object
 4   HH_COMP_DESC         801 non-null    object
 5   HOUSEHOLD_SIZE_DESC  801 non-null    object
 6   KID_CATEGORY_DESC    801 non-null    object
 7   household_key        801 non-null    int64 
dtypes: int64(1), object(7)
memory usage: 50.2+ KB


In [17]:
data['week_no'].nunique()

95

In [18]:
users, items, interactions = data.user_id.nunique(), data.item_id.nunique(), data.shape[0]

print(f'# users: {users}')
print(f'# items: {items}')
print(f'# interactions: {interactions}')

# users: 2499
# items: 89051
# interactions: 2396804


In [19]:
popularity = data.groupby('item_id')['sales_value'].sum().reset_index()
popularity.describe()

Unnamed: 0,item_id,sales_value
count,89051.0,89051.0
mean,5115771.858,83.458
std,5178973.33,1628.715
min,25671.0,0.0
25%,966583.0,3.5
50%,1448516.0,10.78
75%,9553041.5,46.105
max,18024556.0,467993.62


In [20]:
popularity = data.groupby('item_id')['user_id'].nunique().reset_index()
popularity.describe()

Unnamed: 0,item_id,user_id
count,89051.0,89051.0
mean,5115771.858,14.76
std,5178973.33,45.904
min,25671.0,1.0
25%,966583.0,1.0
50%,1448516.0,2.0
75%,9553041.5,10.0
max,18024556.0,2039.0


## Data processing

### Price items

In [21]:
item_price = data.loc[data.quantity != 0, ['item_id', 'sales_value', 'quantity']] 

In [22]:
item_price['price'] = item_price.sales_value / item_price.quantity

In [23]:
item_price = item_price.groupby('item_id').agg({'price': 'max'})

In [24]:
item_features = item_features.merge(item_price, how='left', left_on='PRODUCT_ID', right_on='item_id')

### Adding features

In [25]:
data = data.merge(item_features, how='left', left_on='item_id', right_on='PRODUCT_ID')
data_valid = data_valid.merge(item_features, how='left', left_on='item_id', right_on='PRODUCT_ID')

In [26]:
data = optimizing_df(data)
data_valid = optimizing_df(data_valid)

*******************************Start of dataframe memory optimization*******************************
Memory usage by dataframe: 1063.56 MB
___________________________________MEMORY USAGE AFTER COMPLETION:___________________________________
Memory usage of properties dataframe is : 153.85 MB
This is 14.47 % of the initial size
*******************************Start of dataframe memory optimization*******************************
Memory usage by dataframe: 39.38 MB
___________________________________MEMORY USAGE AFTER COMPLETION:___________________________________
Memory usage of properties dataframe is : 5.91 MB
This is 15.02 % of the initial size


### Train-test split

In [27]:
test_size_weeks = 3

X_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
X_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

In [28]:
X_train.shape[0], X_test.shape[0]

(2278490, 118314)

# Modelling

## Random recommendation

In [29]:
baseline_columns = ['user_id', 'item_id', 'quantity','week_no', 'price', 'SUB_COMMODITY_DESC']

In [30]:
pipeline_random = make_pipeline(
    ColumnSelector(baseline_columns),
    PrefilterItems(item_features=item_features),
    RandomEstimator(n_rec=5, postfilter_func=postfilter_items),
)

## Alternating Least Squares

In [31]:
als_columns = ['user_id', 'item_id', 'quantity','week_no', 'price', 'sales_value', 'SUB_COMMODITY_DESC']

In [32]:
ilter_item_id=999999

pipeline_als = make_pipeline(
    ColumnSelector(als_columns),
    PrefilterItems(item_features=item_features, filter_item_id=ilter_item_id, n_last_week=26, take_n_popular=5000),
    AlsEstimator(
        matrix_values='quantity', matrix_aggfunc='count', recommendations='own',
        n_rec=5, postfilter_func=postfilter_items, filter_item_id=ilter_item_id, use_gpu=True
    )
)

## Grid search

In [53]:
def money_precision_scoring(estimator, X_test, item_price=item_price, k=5):
    y_true = X_test.groupby('user_id')['item_id'].unique() \
        .reset_index() \
        .rename(columns={'item_id': 'actual'}) \
        .set_index('user_id').actual
    y_pred = estimator.predict(X_test)
    return money_precision_at_k(y_pred=y_pred, y_true=y_true, item_price=item_price, k=k)

### Random recommendation 

In [123]:
param_grid_random= {
    'prefilteritems__take_n_popular': [4000, 5000, 6000],
    'prefilteritems__n_last_week': [26, 52, 78]
}

In [125]:
grid_search_random = GridSearchCV(pipeline_random, param_grid_random, scoring=money_precision_scoring)
grid_search_random = grid_search_random.fit(X_train)

In [127]:
grid_search_random.best_params_

{'prefilteritems__n_last_week': 26, 'prefilteritems__take_n_popular': 5000}

In [126]:
grid_search_random.best_score_

0.031124192643470522

In [33]:
pipeline_random = make_pipeline(
    ColumnSelector(baseline_columns),
    PrefilterItems(item_features=item_features, n_last_week=26, take_n_popular=5000),
    RandomEstimator(n_rec=5, postfilter_func=postfilter_items),
)

### Alternating Least Squares

In [119]:
param_grid_als={
    'prefilteritems__take_n_popular': [4000, 5000, 6000],
    'prefilteritems__n_last_week': [26, 52, 78]
    'prefilteritems__price_low': [1, 2, 3],
    'prefilteritems__price_high': [25, 35, 50]
    'alsestimator__factors': [20, 50, 100],
    'alsestimator__regularization': [1e-2, 1e-3, 1e-4],
    'alsestimator__iterations': [10, 20], 
}

In [None]:
grid_search_als = GridSearchCV(
    pipeline_als, param_grid_als, 
    scoring=money_precision_scoring,
)

grid_search_als = grid_search_als.fit(X_train)

In [69]:
grid_search_als.best_score_

0.3499918526585718

In [67]:
pipeline_als = make_pipeline(
    ColumnSelector(als_columns),
    PrefilterItems(
        item_features=item_features, filter_item_id=ilter_item_id, 
        price_high=35, price_low=1, n_last_week=78, take_n_popular=4000
        ),
    AlsEstimator(
        matrix_values='quantity', matrix_aggfunc='count', recommendations='own',
        factors=20, iterations=10, regularization=0.01,
        n_rec=5, postfilter_func=postfilter_items, filter_item_id=ilter_item_id, use_gpu=True
    )
)

## Validate

In [35]:
y_valid = data_valid.groupby('user_id')['item_id'].unique() \
                    .reset_index() \
                    .rename(columns={'item_id': 'actual'}) \
                    .set_index('user_id').actual

In [36]:
test_users = y_valid.shape[0]
new_test_users = len(set(y_valid.index) - set(X_train.user_id))
user_test_less = (y_valid.swifter.apply(len) < 5).sum()

print(f'В тестовом дата сете {test_users} юзеров')
print(f'В тестовом дата сете {new_test_users} новых юзеров')
print(f'В тестовом дата сете {user_test_less} юзеров, которые купили меньше 5 товаров')

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=1885.0, style=ProgressStyle(descriptio…


В тестовом дата сете 1885 юзеров
В тестовом дата сете 1 новых юзеров
В тестовом дата сете 177 юзеров, которые купили меньше 5 товаров


### Random recommendation 

In [44]:
pipeline_random = pipeline_random.fit(data)
y_pred_random = pipeline_random.predict(data_valid)
money_precision_at_k(y_pred_random, y_valid, item_price=item_price, k=5)

7.993281203077634e-05

### Alternating Least Squares

In [68]:
pipeline_als = pipeline_als.fit(data)
y_pred_als = pipeline_als.predict(data_valid)
money_precision_at_k(y_pred_als, y_valid, item_price=item_price, k=5)



HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4001.0), HTML(value='')))




0.2054075060754634

## Submit

In [70]:
y_pred_als.head()

1      [856942, 9297615, 1074612, 8293439, 9655212]
2        [1076580, 838136, 911974, 1007414, 826784]
3       [998206, 921345, 1092937, 964594, 13842214]
6    [1119051, 13003092, 9911484, 5580166, 8203834]
7       [840386, 949836, 9338009, 6602729, 1122358]
dtype: object

In [72]:
y_pred_als.to_csv('recommendations.csv', header=None)