## **Основное**
- На retail_test1.csv целевая метрика precision@5 > 0.235
- Бейзлайн решения - [MainRecommender](https://github.com/geangohn/recsys-tutorial/blob/master/src/recommenders.py)
- Сдаем ссылку на github с решением. В решении должна быть отчетливо видна метрика на новом тестовом сете из файла retail_test1.csv, то есть вам нужно для всех юзеров из этого файла выдать ваши рекомендации и посчитать на actual покупках precision@5.

# Импорт

In [1]:
#!pip install implicit==0.4.4
#!pip install catboost

In [2]:
# Forwarded as the `gpu=` argument wherever MainRecommender is constructed in this notebook.
_GPU_ACTION=True

In [3]:
from src.utils import *
from src.metrics import precision_at_k
from src.recommenders import MainRecommender
import pandas as pd
import numpy as np
#import yaml

import sys
import inspect
from pprint import pprint
import warnings


In [4]:
!pip install -U -q PyDrive
 
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
 
 
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

link1 = 'https://drive.google.com/file/d/11I5eGSyDkFvSbU5J-5HTkunRoVjUPvZ4/view'
id1 = link1.split("/")[-2]
downloaded1 = drive.CreateFile({'id':id1})
downloaded1.GetContentFile('retail_train.csv')

link2 = 'https://drive.google.com/file/d/15r1tmfW5NbS-wE0EkZ_KWBEWRb29PPFF/view'
id2 = link2.split("/")[-2]
downloaded2 = drive.CreateFile({'id':id2})
downloaded2.GetContentFile('product.csv')

link3 = 'https://drive.google.com/file/d/1x8MG76QgnJQZ5nsv8J35sGTm0kMc5vaZ/view'
id3 = link3.split("/")[-2]
downloaded3 = drive.CreateFile({'id':id3})
downloaded3.GetContentFile('hh_demographic.csv')

link4 = 'https://drive.google.com/file/d/16ZdmG_RJX6kUSOrAwhQ1rAOmPXTbhbaw/view'
id4 = link4.split("/")[-2]
downloaded4 = drive.CreateFile({'id':id4})
downloaded4.GetContentFile('retail_test1.csv')


# Предобработка данных


In [5]:
_CONSTANTS = {
    'ITEM_COL': 'item_id',
    'USER_COL': 'user_id',
    'ACTUAL_COL': 'actual',
    'TOPK_PRECISION': 5,
    'VAL_MATCHER_WEEKS': 6,
    'VAL_RANKER_WEEKS': 3,
}

In [6]:
class Dataset:
    """Holds the retail transaction/feature tables and prepares them for modelling.

    Typical use: ``data = Dataset(); data.data_prefilter(); data.data_split()``
    and, for the final hold-out run, ``data.data_test_split()``.

    Attributes created by the methods:
        data_train, data_test: prefiltered transactions (after data_prefilter).
        item_features, user_features: numeric feature tables (after data_prefilter).
        data_train_lvl_1, data_val_lvl_1, data_train_lvl_2, data_val_lvl_2,
        result_lvl_1: time-based split of the train file (after data_split).
        *_real counterparts: the same five frames for the test file
        (after data_split) or for train-vs-test (after data_test_split).
    """

    # NOTE: the four frames below are read once, when the class body executes,
    # and are shared by every instance. The methods never modify them in place;
    # they rebind processed copies on the instance instead.
    data_train = pd.read_csv('retail_train.csv')
    data_test = pd.read_csv(r'retail_test1.csv')
    item_features = pd.read_csv('product.csv')
    user_features = pd.read_csv('hh_demographic.csv')
    _CONSTANTS = _CONSTANTS

    def data_prefilter(self, make_worse=False, take_n_popular=None):
        """Clean the feature tables and filter/enrich both transaction frames.

        Rebinds ``item_features``, ``user_features``, ``data_train`` and
        ``data_test`` on the instance. Intended to be called once per instance:
        a second call fails because the raw text columns are already dropped.

        Args:
            make_worse: if True, additionally drop transactions whose unit
                price is outside (2, 50) and replace the ids of very popular
                items with the fake id 999999 (experimental hypothesis).
            take_n_popular: total-quantity threshold at or above which an item
                is replaced by the fake id. Only used when ``make_worse`` is
                True; defaults to ``_CONSTANTS['TAKE_N_POPULAR']``.

        Raises:
            KeyError: if ``make_worse`` is True and no threshold is available
                (neither the argument nor the constant is set).
        """
        if make_worse:
            if take_n_popular is None:
                take_n_popular = self._CONSTANTS.get('TAKE_N_POPULAR')
            if take_n_popular is None:
                # Fail before any expensive work instead of deep inside the loop.
                raise KeyError("TAKE_N_POPULAR is not defined in _CONSTANTS; "
                               "pass take_n_popular explicitly when make_worse=True")

        items_in_rare_departments = self._prepare_item_features()
        self._prepare_user_features()

        # The processed user features must be in place before this point:
        # _prefilter_transactions joins the transactions against them.
        self.data_train = self._prefilter_transactions(
            self.data_train, items_in_rare_departments, make_worse, take_n_popular)
        self.data_test = self._prefilter_transactions(
            self.data_test, items_in_rare_departments, make_worse, take_n_popular)

    def _prepare_item_features(self):
        """Normalise the item table and return the ids of items in rare departments."""
        item_col = self._CONSTANTS['ITEM_COL']

        # Work on a copy so the class-level frame is never mutated.
        items = self.item_features.copy()
        items.columns = [col.lower() for col in items.columns]
        items = items.rename(columns={'product_id': item_col})

        # A department is "rare" when it holds fewer than 10 distinct items.
        items_per_department = items.groupby('department')[item_col].nunique()
        rare_departments = items_per_department[items_per_department < 10].index
        items_in_rare_departments = items.loc[
            items['department'].isin(rare_departments), item_col].unique().tolist()

        # transform categorical data to numeric
        items['brand'] = pd.factorize(items['brand'])[0]
        items['commodity_type'] = pd.factorize(items['commodity_desc'])[0]

        # remove text data
        self.item_features = items[[item_col, 'manufacturer', 'brand', 'commodity_type']]
        return items_in_rare_departments

    def _prepare_user_features(self):
        """Turn the demographic text columns into numeric user features."""
        user_col = self._CONSTANTS['USER_COL']

        def _age_midpoint(label):
            # 'lo-hi' -> integer midpoint of the bracket
            bounds = label.split('-')
            return int((int(bounds[0]) + int(bounds[1])) / 2)

        def _income_midpoint(label):
            # 'lo-hiK' -> integer midpoint; the trailing unit character is dropped
            bounds = label.split('-')
            return int((int(bounds[0]) + int(bounds[1][:-1])) / 2)

        # Work on a copy so the class-level frame is never mutated.
        users = self.user_features.copy()
        users.columns = [col.lower() for col in users.columns]
        users = users.rename(columns={'household_key': user_col})

        # age: open-ended '65+' bracket becomes 75, closed brackets their midpoint
        is_senior = users['age_desc'] == '65+'
        users.loc[is_senior, 'age'] = 75
        users.loc[~is_senior, 'age'] = users.loc[~is_senior, 'age_desc'].apply(_age_midpoint)

        # income: '...+' -> 300, 'Under ...' -> 10, 'lo-hiK' -> midpoint
        income_desc = users['income_desc']
        users.loc[income_desc.str.contains(r'\+'), 'income'] = 300
        users.loc[income_desc.str.contains('Under'), 'income'] = 10
        is_income_range = income_desc.str.contains('-')
        users.loc[is_income_range, 'income'] = \
            users.loc[is_income_range, 'income_desc'].apply(_income_midpoint)

        # calculating adults_num and has_kids
        hh_comp = users['hh_comp_desc']
        users['has_kids'] = (hh_comp.str.contains('Kids') & ~hh_comp.str.contains('No')).astype(float)
        users.loc[hh_comp.str.contains('Adults'), 'adults_num'] = 2
        users.loc[hh_comp.str.contains('Single'), 'adults_num'] = 1
        users.loc[hh_comp.str.contains('1 Adult'), 'adults_num'] = 1
        users.loc[hh_comp == 'Unknown', 'adults_num'] = 1

        # remove '+' from category and make col type numeric
        users.loc[users['household_size_desc'] == '5+', 'household_size_desc'] = 5
        users.loc[users['kid_category_desc'] == '3+', 'kid_category_desc'] = 3
        users.loc[users['kid_category_desc'] == 'None/Unknown', 'kid_category_desc'] = 0
        users['household_size_desc'] = users['household_size_desc'].astype(int)
        users['kid_category_desc'] = users['kid_category_desc'].astype(int)

        # One-hot encode the remaining categoricals. dtype is pinned so the
        # columns are 0/1 integers regardless of the installed pandas version.
        users = pd.concat([users,
                           pd.get_dummies(users['homeowner_desc'], dtype='uint8'),
                           pd.get_dummies(users['marital_status_code'], dtype='uint8')],
                          axis=1)

        # remove text data
        # Positional: assumes the first five columns are the raw text
        # descriptors encoded above -- TODO confirm against hh_demographic.csv.
        self.user_features = users.iloc[:, 5:]

    def _prefilter_transactions(self, df, items_in_rare_departments, make_worse, take_n_popular):
        """Return a filtered copy of ``df`` with ``avg_check`` and ``quantity_total`` added."""
        item_col = self._CONSTANTS['ITEM_COL']
        user_col = self._CONSTANTS['USER_COL']

        # remove unpopular departments (boolean indexing returns a new frame,
        # so the caller's frame is left untouched)
        df = df[~df[item_col].isin(items_in_rare_departments)]

        if make_worse:
            # making worst max precision score (hypothesis):
            # drop both the cheap and the expensive purchases
            df = df.assign(price=df['sales_value'] / np.maximum(df['quantity'], 1))
            df = df[(df['price'] > 2) & (df['price'] < 50)]

        # user's mean check
        # NOTE(review): 'count' counts transaction rows, not distinct baskets,
        # so this is the mean value per row -- confirm that is intended.
        per_user = self.user_features.merge(df, on=user_col, how='left').pivot_table(
            index=user_col, values=['basket_id', 'sales_value'],
            aggfunc={'basket_id': 'count', 'sales_value': 'sum'})
        avg_check = (per_user['sales_value'] / per_user['basket_id']).rename('avg_check').reset_index()
        # NOTE(review): the default inner join keeps only users present in
        # user_features, i.e. users without demographics are dropped here.
        # Left unchanged because the reported metrics were produced this way.
        df = df.merge(avg_check, on=user_col)

        # total quantity sold per item, attached as 'quantity_total'
        item_totals = df.groupby(item_col)['quantity'].sum().reset_index()
        df = df.merge(item_totals, on=item_col, how='left', suffixes=('', '_total'))

        # remove super unpopular items: keep items bought at least once
        # during the last 365 days covered by this frame
        recent_items = df.loc[df['day'] >= df['day'].max() - 365, item_col].unique()
        df = df.loc[df[item_col].isin(recent_items)].copy()

        if make_worse:
            # change item_id to fakes where we think user "already" served his needs
            df.loc[df['quantity_total'] >= take_n_popular, item_col] = 999999

        return df

    def _actuals_for_known_users(self, train_df, val_df):
        """List the items each user bought in ``val_df``, for users also seen in ``train_df``."""
        user_col = self._CONSTANTS['USER_COL']

        actuals = val_df.groupby(user_col)[self._CONSTANTS['ITEM_COL']].unique().reset_index()
        actuals.columns = [user_col, self._CONSTANTS['ACTUAL_COL']]
        # Users absent from the training frame are excluded.
        return actuals[actuals[user_col].isin(train_df[user_col].unique())]

    def _store_split(self, suffix, train_lvl_1, val_lvl_1, val_lvl_2):
        """Store independent copies of the split frames under ``<name><suffix>`` attributes."""
        setattr(self, 'data_train_lvl_1' + suffix, train_lvl_1.copy())
        setattr(self, 'data_val_lvl_1' + suffix, val_lvl_1.copy())
        # The second-level model trains on the first-level validation window.
        setattr(self, 'data_train_lvl_2' + suffix, val_lvl_1.copy())
        setattr(self, 'data_val_lvl_2' + suffix, val_lvl_2.copy())
        setattr(self, 'result_lvl_1' + suffix,
                self._actuals_for_known_users(train_lvl_1, val_lvl_1).copy())

    def data_split(self,
                   val_lvl_1_size_weeks=_CONSTANTS['VAL_MATCHER_WEEKS'],
                   val_lvl_2_size_weeks=_CONSTANTS['VAL_RANKER_WEEKS']):
        """Split both transaction frames into time windows by ``week_no``.

        Counting back from the last week of each frame: the final
        ``val_lvl_2_size_weeks`` weeks form the level-2 validation window, the
        ``val_lvl_1_size_weeks`` weeks before that form the level-1 validation
        window (also used as level-2 train), and everything earlier is
        level-1 train. Results for ``data_train`` are stored without suffix,
        results for ``data_test`` with the ``_real`` suffix.

        Args:
            val_lvl_1_size_weeks: length of the level-1 validation window.
            val_lvl_2_size_weeks: length of the level-2 validation window.
        """
        for suffix, df in (('', self.data_train), ('_real', self.data_test)):
            weeks = df['week_no']
            last_week = weeks.max()
            lvl_2_start = last_week - val_lvl_2_size_weeks
            lvl_1_start = last_week - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)

            self._store_split(
                suffix,
                train_lvl_1=df[weeks < lvl_1_start],
                val_lvl_1=df[(weeks >= lvl_1_start) & (weeks < lvl_2_start)],
                val_lvl_2=df[weeks >= lvl_2_start])

    def data_test_split(self):
        """Use the whole train file as level-1 train and the whole test file as validation.

        Overwrites the ``*_real`` attributes: ``data_train_lvl_1_real`` is a
        copy of ``data_train``; the level-1 validation, level-2 train and
        level-2 validation frames are all copies of ``data_test``.
        """
        self._store_split('_real',
                          train_lvl_1=self.data_train,
                          val_lvl_1=self.data_test,
                          val_lvl_2=self.data_test)

    @staticmethod
    def postfilter_items(user_id, recommendations):
        """Placeholder for post-filtering recommendations; not implemented, returns None."""
        pass

In [7]:
# Instantiate the dataset holder, clean/filter the tables, then cut the
# time-based train/validation windows (default window sizes from _CONSTANTS).
data = Dataset()
data.data_prefilter()
data.data_split()

# Формирование рекомендаций

In [8]:
# Build the recommender from the level-1 training window and let it prepare
# its inputs from `data` for both the 'train' and the 'test' variants.
# K1 and B are forwarded unchanged; presumably BM25 weighting parameters --
# confirm in src.recommenders.
recommender = MainRecommender(data.data_train_lvl_1, gpu=_GPU_ACTION, K1=1, B=0.3)
recommender.preprocessing(data, t='train')
recommender.preprocessing(data, t='test')

GPU training requires factor size to be a multiple of 32. Increasing factors from 20 to 32.


  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/48263 [00:00<?, ?it/s]

In [9]:
# First-level recommendations for the level-1 validation users; show one row as a sanity check.
recommender.recommend_1lvl(data.result_lvl_1).head(1)

Unnamed: 0,user_id,actual,own_recs,own_recs_score
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[940947, 1075074, 9527290, 9655212, 934369, 85...",0.420986


# Ранжирование полученных рекомендаций


In [10]:
# Fit the second-level ranking model (implementation lives in src.recommenders).
recommender.ranker_fit()

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [11]:
# Score the ranker's own training matrix and preview the first rows with the `predict` column.
recommender.ranker_predict(recommender.X_train).head()

Unnamed: 0,user_id,item_id,manufacturer,brand,commodity_type,household_size_desc,kid_category_desc,age,income,has_kids,adults_num,Homeowner,Probable Owner,Probable Renter,Renter,Unknown,A,B,U,predict
0,1364,1007136,2,0,215,1,0,75.0,112.0,0.0,1.0,1,0,0,0,0,0,1,0,0.872931
1,1364,857055,560,0,124,1,0,75.0,112.0,0.0,1.0,1,0,0,0,0,0,1,0,0.670283
2,1364,1135834,910,0,17,1,0,75.0,112.0,0.0,1.0,1,0,0,0,0,0,1,0,0.767965
3,1364,856252,988,0,40,1,0,75.0,112.0,0.0,1.0,1,0,0,0,0,0,1,0,0.628438
4,1364,1122115,194,0,38,1,0,75.0,112.0,0.0,1.0,1,0,0,0,0,0,1,0,0.642874


# Оценка результатов

In [12]:
# Report precision@5 of the two-level model on the validation split.
recommender.evaluate_2models()

  mask |= (ar1 == a)


precision@5 of 2lvl-model is 0.27015706806282724


# Предсказываем по тестовым данным

In [13]:
# Final run: treat the whole train file as level-1 train and retail_test1.csv
# as the validation data (the *_real frames), rebuild the recommender on it,
# then refit the ranker and evaluate with training=False.
# The meaning of training=False is defined in src.recommenders (not shown here).
data.data_test_split()
recommender = MainRecommender(data.data_train_lvl_1_real, gpu=_GPU_ACTION, K1=1, B=0.3)
recommender.preprocessing(data, t='train', training=False)
recommender.preprocessing(data, t='test', training=False)
recommender.ranker_fit(training=False)
recommender.evaluate_2models(training=False)



  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/52244 [00:00<?, ?it/s]

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

  mask |= (ar1 == a)


precision@5 of 2lvl-model is 0.3165775401069519
Файл с результатами сохранен.
