## Курсовой проект "Рекомендательные системы"

**Основное**
- Целевая метрика precision@5
- Бейзлайн решения - [MainRecommender](https://github.com/geangohn/recsys-tutorial/blob/master/src/recommenders.py)
- Сдаем ссылку на github с решением. На github должен быть файл recommendations.csv (user_id | [rec_1, rec_2, ...]) с рекомендациями, где rec_i - реальные id item-ов (из retail_train.csv)

**Hints:** 

Сначала просто попробуйте разные параметры MainRecommender:  
- N в топ-N товарах при формировании user-item матрицы (сейчас топ-5000)  
- Различные веса в user-item матрице (0/1, кол-во покупок, log(кол-во покупок + 1), сумма покупки, ...)  
- Разные взвешивания матрицы (TF-IDF, BM25 - у него есть параметры)  
- Разные смешивания рекомендаций (обратите внимание на бейзлайн - прошлые покупки юзера)  

Сделайте MVP - минимально рабочий продукт - (пусть даже top-popular), а потом его улучшайте

Если вы делаете двухуровневую модель - следите за валидацией 

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from scipy.sparse import csr_matrix
import re

from implicit.als import AlternatingLeastSquares

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, precision_recall_curve, ConfusionMatrixDisplay

from catboost import CatBoostClassifier

import warnings
warnings.simplefilter('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Locations of the input CSV files (relative paths); each one is passed to
# pd.read_csv further down in the notebook.
DATASET_PATH_RETAIL = 'data/retail_train.csv'  # loaded into `data`
DATASET_PATH_PRODUCT = 'data/product.csv'  # loaded into `item_features`
DATASET_PATH_DEMOGRAPHIC = 'data/hh_demographic.csv'  # loaded into `user_features`

In [3]:
def recall_at_k(recommended_list, bought_list, k):
  """Return recall@k for a single user.

  The value is the number of the first ``k`` recommended items that occur in
  ``bought_list`` divided by ``len(bought_list)``.

  Args:
    recommended_list: ranked sequence of recommended item ids.
    bought_list: sequence of item ids the user actually bought.
    k: how many leading recommendations to take into account.

  Returns:
    The recall as a float; ``0.0`` when ``bought_list`` is empty (the ratio is
    undefined there and would otherwise come out as ``nan``).
  """
  bought_list = np.array(bought_list)
  if bought_list.size == 0:
    return 0.0

  recommended_list = np.array(recommended_list)[:k]
  hits = np.isin(recommended_list, bought_list).sum()

  return hits / len(bought_list)

In [4]:
def precision_at_k(recommended_list, bought_list, k):
  """Return precision@k for a single user.

  The value is the share of the first ``k`` recommended items that occur in
  ``bought_list``. The denominator is the number of recommendations actually
  available after truncation, so a list shorter than ``k`` is not penalised.

  Args:
    recommended_list: ranked sequence of recommended item ids.
    bought_list: sequence of item ids the user actually bought.
    k: how many leading recommendations to take into account.

  Returns:
    The precision as a float; ``0.0`` when there are no recommendations (the
    ratio is undefined there and would otherwise come out as ``nan``).
  """
  recommended_list = np.array(recommended_list)[:k]
  if recommended_list.size == 0:
    return 0.0

  bought_list = np.array(bought_list)
  hits = np.isin(recommended_list, bought_list).sum()

  return hits / len(recommended_list)

In [5]:
def ndcg_at_k(recommended_list, bought_list, k):
  """Return NDCG@k for a single user.

  Each of the first ``k`` recommendations gets a relevance flag (1 if the item
  is in ``bought_list``, else 0) and a positional discount. The discount is
  ``1 / n`` for positions 1 and 2 and ``1 / ln(n)`` for positions 3 and up;
  this is the convention this notebook has always used, not the textbook
  ``1 / log2(n + 1)``. The ideal DCG assumes every one of the considered
  positions is relevant.

  Args:
    recommended_list: ranked sequence of recommended item ids.
    bought_list: sequence of item ids the user actually bought.
    k: how many leading recommendations to take into account.

  Returns:
    DCG divided by ideal DCG; ``0.0`` when there are no recommendations
    (previously this raised ``ZeroDivisionError``).
  """
  recommended_list = np.array(recommended_list)[:k]
  if recommended_list.size == 0:
    return 0.0

  bought_list = np.array(bought_list)
  flags = np.isin(recommended_list, bought_list)

  positions = np.arange(1, len(flags) + 1)
  # np.where evaluates both branches, so clamp the log argument to 3 to avoid
  # computing 1 / ln(1) for the first position (that branch is unused there).
  discounts = np.where(positions <= 2,
                       1.0 / positions,
                       1.0 / np.log(np.maximum(positions, 3)))

  # The original 1/len(flags) factor is applied to both numerator and
  # denominator and cancels, so it is omitted.
  dcg = discounts[flags].sum()
  ideal_dcg = discounts.sum()

  return dcg / ideal_dcg

In [6]:
# Transaction log; .iloc[int(1e6):] discards the first 1,000,000 rows and keeps the rest.
# NOTE(review): presumably done to reduce memory / training time — confirm.
data = pd.read_csv(DATASET_PATH_RETAIL, sep=',').iloc[int(1e6):]
# Lookup tables that are later merged onto (user, item) pairs by the second stage.
item_features = pd.read_csv(DATASET_PATH_PRODUCT, sep=',')
user_features = pd.read_csv(DATASET_PATH_DEMOGRAPHIC, sep=',')

In [7]:
# Lower-case every column name of both lookup tables.
item_features.columns = list(map(str.lower, item_features.columns))
user_features.columns = list(map(str.lower, user_features.columns))

# Give the key columns the names used elsewhere in the notebook
# ('item_id' / 'user_id') so the tables can be merged on them.
item_features.rename(inplace=True, columns={'product_id': 'item_id'})
user_features.rename(inplace=True, columns={'household_key': 'user_id'})

In [8]:
class FirstStage():
  """First-level model: ALS candidate generation over a binary user-item matrix.

  Constructing an instance splits the transactions by week, builds the
  user-item matrix from the training part, prepares id mappings between the
  original ids and matrix positions, and fits the ALS model.
  """

  # Synthetic id that every item outside the most popular ones is collapsed into.
  FAKE_ITEM_ID = 999999

  def __init__(self, data):
    """Run the whole first-stage pipeline on the transactions frame ``data``."""
    self.train_lvl_1, self.valid_lvl_1, \
      self.valid_lvl_1_u_i, self.valid_lvl_2 = self.split_data(data)

    # NOTE: prepare_matrix rewrites 'item_id' inside self.train_lvl_1 in place.
    self.user_item_matrix = self.prepare_matrix(self.train_lvl_1)
    self.sparse_user_item = self.prepare_csr_matrix(self.user_item_matrix)

    self.new_to_old_user_ids, self.new_to_old_item_ids, \
      self.old_to_new_user_ids, self.old_to_new_item_ids = self.prepare_dicts(self.user_item_matrix)

    self.als = self.initialize_als(self.sparse_user_item)

  @staticmethod
  def split_data(data, val_lvl_1_split_week=6, val_lvl_2_split_week=3):
    """Split transactions into train / level-1 validation / level-2 validation by week.

    With ``max_week = data['week_no'].max()``:
      * train:       ``week_no <  max_week - (lvl_1 + lvl_2)``
      * valid_lvl_1: ``max_week - (lvl_1 + lvl_2) <= week_no < max_week - lvl_2``
      * valid_lvl_2: ``week_no >= max_week - lvl_2``

    Both validation parts keep only users that occur in the training part.

    Args:
      data: transactions with at least 'user_id', 'item_id' and 'week_no'.
      val_lvl_1_split_week: length (in weeks) of the level-1 validation window.
      val_lvl_2_split_week: length (in weeks) of the level-2 validation window.

    Returns:
      ``(train_lvl_1, valid_lvl_1, valid_lvl_1_u_i, valid_lvl_2)`` where
      ``valid_lvl_1_u_i`` has one row per user with the unique items bought
      in the level-1 validation window.
    """
    # Computed once instead of on every comparison.
    max_week = data['week_no'].max()
    lvl_1_start = max_week - (val_lvl_1_split_week + val_lvl_2_split_week)
    lvl_2_start = max_week - val_lvl_2_split_week

    train_lvl_1 = data[data['week_no'] < lvl_1_start]
    train_users = train_lvl_1['user_id']

    valid_lvl_1 = data[(data['week_no'] >= lvl_1_start) & (data['week_no'] < lvl_2_start)]
    valid_lvl_1 = valid_lvl_1[valid_lvl_1['user_id'].isin(train_users).values]
    valid_lvl_1_u_i = valid_lvl_1.groupby('user_id')['item_id'].unique().reset_index()

    valid_lvl_2 = data[data['week_no'] >= lvl_2_start]
    valid_lvl_2 = valid_lvl_2[valid_lvl_2['user_id'].isin(train_users).values]

    return train_lvl_1, valid_lvl_1, valid_lvl_1_u_i, valid_lvl_2

  @staticmethod
  def prepare_dicts(user_item_matrix):
    """Build mappings between original ids and row/column positions of the matrix.

    Returns:
      ``(new_to_old_user_ids, new_to_old_item_ids, old_to_new_user_ids,
      old_to_new_item_ids)`` where "new" ids are positions ``0..n-1`` in the
      matrix index (users) and columns (items).
    """
    old_user_ids = user_item_matrix.index.tolist()
    old_item_ids = user_item_matrix.columns.tolist()

    new_user_ids = np.arange(len(old_user_ids))
    new_item_ids = np.arange(len(old_item_ids))

    new_to_old_user_ids = dict(zip(new_user_ids, old_user_ids))
    new_to_old_item_ids = dict(zip(new_item_ids, old_item_ids))

    old_to_new_user_ids = dict(zip(old_user_ids, new_user_ids))
    old_to_new_item_ids = dict(zip(old_item_ids, new_item_ids))

    return new_to_old_user_ids, new_to_old_item_ids, old_to_new_user_ids, \
      old_to_new_item_ids

  @staticmethod
  def prepare_matrix(train_data_lvl_1, take_n_popular=5000):
    """Build a binary user-item matrix restricted to the most popular items.

    Items are ranked by total 'quantity'; everything outside the top
    ``take_n_popular`` is re-labelled to ``FAKE_ITEM_ID``.

    NOTE: the re-labelling is done in place on ``train_data_lvl_1``. This is
    kept on purpose: SecondStage copies the same frame and filters on the
    fake id.

    Args:
      train_data_lvl_1: transactions with 'user_id', 'item_id', 'quantity'.
      take_n_popular: number of top items to keep as separate columns.

    Returns:
      A float DataFrame indexed by user_id with one column per item_id, where
      every positive summed quantity is replaced by 1.
    """
    popularity = train_data_lvl_1.groupby('item_id')['quantity'].sum().reset_index()
    popularity.sort_values(by='quantity', ascending=False, inplace=True)
    top_items = popularity['item_id'].values.tolist()[:take_n_popular]

    not_popular = ~train_data_lvl_1['item_id'].isin(top_items)
    train_data_lvl_1.loc[not_popular, 'item_id'] = FirstStage.FAKE_ITEM_ID

    user_item_matrix = pd.pivot_table(train_data_lvl_1,
                                      index='user_id',
                                      columns='item_id',
                                      values='quantity',
                                      aggfunc='sum',
                                      fill_value=0)

    user_item_matrix[user_item_matrix > 0] = 1
    user_item_matrix = user_item_matrix.astype(float)

    return user_item_matrix

  @staticmethod
  def prepare_csr_matrix(user_item_matrix):
    """Convert the dense user-item DataFrame into a scipy CSR matrix."""
    # csr_matrix(...) is already CSR, so no further conversion is needed.
    return csr_matrix(user_item_matrix)

  @staticmethod
  def initialize_als(sparse_user_item):
    """Create an AlternatingLeastSquares model and fit it on ``sparse_user_item``."""
    model = AlternatingLeastSquares(factors=500,
                                    regularization=1e-3,
                                    iterations=100,
                                    num_threads=6,
                                    calculate_training_loss=True,
                                    random_state=42)

    model.fit(sparse_user_item, show_progress=True)

    return model

  def _als_candidates(self, user_id, n):
    """Return the original item ids of the top-``n`` ALS recommendations for ``user_id``."""
    user_idx = self.old_to_new_user_ids[user_id]

    # The fake item only exists when something was actually collapsed into it;
    # looking it up unconditionally raised KeyError on small catalogues.
    fake_idx = self.old_to_new_item_ids.get(self.FAKE_ITEM_ID)
    filter_items = None if fake_idx is None else [fake_idx]

    # recommend() returns a pair; the first element holds the item positions.
    rec_idx = self.als.recommend(userid=user_idx,
                                 user_items=self.sparse_user_item[user_idx],
                                 N=n,
                                 filter_already_liked_items=False,
                                 filter_items=filter_items,
                                 recalculate_user=True)[0]

    return [self.new_to_old_item_ids[rec] for rec in rec_idx]

  def recommend(self, valid_lvl_1_u_i, N=300):
    """Add an 'als' column with ALS candidates for every user of ``valid_lvl_1_u_i``.

    Args:
      valid_lvl_1_u_i: frame with a 'user_id' column; every user must be known
        to the fitted model. The frame is modified in place.
      N: number of candidates to request per user.

    Returns:
      The same frame object, now carrying the 'als' column (lists of original
      item ids).
    """
    valid_lvl_1_u_i['als'] = valid_lvl_1_u_i['user_id'].apply(
      lambda user_id: self._als_candidates(user_id, N))

    return valid_lvl_1_u_i


# SecondStage never calls FirstStage.__init__: it receives an already built
# FirstStage instance and copies the attributes it needs from that object.
class SecondStage(FirstStage):
  """Second-level model: a CatBoost classifier over (user, item) pairs with features.

  Construction reuses the fitted ALS model and id mappings of ``first_stage``,
  builds a labelled (user, item) table, engineers and scales features, splits
  them and fits the classifier.

  NOTE: ``split_data`` here overrides ``FirstStage.split_data`` with a
  different meaning (a random train/validation split of the feature table).
  """

  def __init__(self, first_stage):
    """Build the whole second-stage pipeline from a fitted ``first_stage``."""
    self.scaler = StandardScaler()

    self.valid_lvl_2 = first_stage.valid_lvl_2
    self.train_lvl_2 = first_stage.train_lvl_1.copy()

    self.users_lvl_2 = pd.DataFrame(self.train_lvl_2['user_id'].unique(), columns=['user_id'])

    self.sparse_user_item = first_stage.sparse_user_item

    self.new_to_old_user_ids = first_stage.new_to_old_user_ids
    self.new_to_old_item_ids = first_stage.new_to_old_item_ids
    self.old_to_new_user_ids = first_stage.old_to_new_user_ids
    self.old_to_new_item_ids = first_stage.old_to_new_item_ids

    self.als = first_stage.als

    self.raw_data_for_cls = self.prepare_raw_data_for_cls(self.users_lvl_2)

    self.data_to_classify = self.data_processing(self.raw_data_for_cls)

    self.scaled_data = self.scale_data(self.data_to_classify)

    self.X_train, self.X_valid, self.y_train, self.y_valid = self.split_data(self.scaled_data)

    self.catb = self.initialize_catboost(self.X_train, self.y_train)

  def _candidate_items(self, user_id, n=300):
    """Return the original item ids of the top-``n`` ALS recommendations for ``user_id``."""
    user_idx = self.old_to_new_user_ids[user_id]

    # The fake item 999999 is absent from the mapping when nothing was
    # collapsed into it; an unconditional lookup raised KeyError in that case.
    fake_idx = self.old_to_new_item_ids.get(999999)
    filter_items = None if fake_idx is None else [fake_idx]

    rec_idx = self.als.recommend(userid=user_idx,
                                 user_items=self.sparse_user_item[user_idx],
                                 N=n,
                                 filter_already_liked_items=False,
                                 filter_items=filter_items,
                                 recalculate_user=True)[0]

    return [self.new_to_old_item_ids[rec] for rec in rec_idx]

  def prepare_raw_data_for_cls(self, users_lvl_2, return_candidates=False):
    """Build the labelled (user, item) table with user and item features attached.

    Rows are the (user_id, item_id) pairs of ``self.train_lvl_2``; 'target' is
    1 when the pair is also among the user's 300 ALS candidates and 0
    otherwise. Rows with the fake item 999999 are removed. The module-level
    ``user_features`` and ``item_features`` frames are left-merged on.

    NOTE(review): labels therefore mark "purchased pair was proposed by ALS",
    and the pairs come from the level-1 training window — confirm this is the
    intended target for the ranker.

    Args:
      users_lvl_2: frame with a 'user_id' column. A temporary 'candidates'
        column is added to it and dropped again in place.
      return_candidates: when truthy, also return the exploded candidates.

    Returns:
      The labelled feature table, or ``(table, candidates)`` when
      ``return_candidates`` is set.
    """
    users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(self._candidate_items)

    # One row per (user, candidate): expand the lists and re-attach by index.
    s = users_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
    s.name = 'item_id'

    users_lvl_2.drop(columns=['candidates'], inplace=True)
    users_lvl_2 = users_lvl_2.join(s)
    users_lvl_2['flag'] = 1

    targets_lvl_2 = self.train_lvl_2[['user_id', 'item_id']]

    targets_lvl_2 = targets_lvl_2.merge(users_lvl_2, on=['user_id', 'item_id'], how='left')
    targets_lvl_2 = targets_lvl_2.rename(columns={'flag': 'target'})

    # Pairs without a matching candidate have no flag -> target 0.
    targets_lvl_2 = targets_lvl_2.fillna(0)

    targets_lvl_2 = targets_lvl_2[targets_lvl_2['item_id'] != 999999]

    targets_lvl_2 = targets_lvl_2.merge(user_features, on='user_id', how='left')
    targets_lvl_2 = targets_lvl_2.merge(item_features, on='item_id', how='left')

    if return_candidates:
      return targets_lvl_2, users_lvl_2

    return targets_lvl_2

  @staticmethod
  def data_processing(raw_data_for_cls):
    """Turn the raw merged table into a numeric feature table.

    Steps: split 'age_desc' ranges into min/max columns, fill gaps with column
    modes, map the "N+" buckets of household size / kid count to numbers, and
    one-hot encode every remaining object column. The input is not modified.
    """
    processing_data = raw_data_for_cls.copy()

    # 1. Split the age range string into two numeric columns.
    processing_data['age_desc'] = processing_data['age_desc'].apply(lambda x: str(x).split('-'))

    processing_data[['age_desc_min', 'age_desc_max']] = pd.DataFrame(processing_data['age_desc'].tolist(),
                                                                     index=processing_data.index)
    processing_data.drop(columns='age_desc', inplace=True)

    # age_desc_min: strip the "+" of open-ended ranges, treat 'nan' as missing.
    processing_data['age_desc_min'] = processing_data['age_desc_min'].apply(lambda x: x.split('+')[0] if '+' in x else x)
    processing_data['age_desc_min'] = processing_data['age_desc_min'].apply(lambda x: None if x == 'nan' else x)

    # Assign the result back: fillna(..., inplace=True) on a selected column is
    # a chained in-place update and does not reach the frame under pandas
    # Copy-on-Write.
    age_desc_min_mode = processing_data['age_desc_min'].mode()[0]
    processing_data['age_desc_min'] = processing_data['age_desc_min'].fillna(age_desc_min_mode)

    # age_desc_max: missing for unknown ages and for open-ended ranges.
    processing_data['age_desc_max'] = processing_data['age_desc_max'].apply(lambda x: None if x == 'nan' else x)
    age_desc_max_mode = processing_data['age_desc_max'].mode()[0]
    processing_data['age_desc_max'] = processing_data['age_desc_max'].fillna(age_desc_max_mode)

    # 2. Fill gaps in the categorical columns with their modes, then tidy up.
    obj_cols = ['marital_status_code', 'income_desc', 'homeowner_desc',
                'hh_comp_desc', 'household_size_desc', 'kid_category_desc']
    # Only these columns are filled, so only their modes are computed.
    processing_data[obj_cols] = processing_data[obj_cols].fillna(processing_data[obj_cols].mode().iloc[0])

    # 'None/Unknown' is replaced by the second most frequent value.
    # NOTE(review): assumes 'None/Unknown' itself is the most frequent — confirm.
    second_most_common_kids = processing_data['kid_category_desc'].value_counts().index[1]
    processing_data.loc[processing_data['kid_category_desc'] == 'None/Unknown', 'kid_category_desc'] = second_most_common_kids

    processing_data['household_size_desc'] = processing_data['household_size_desc'].apply(lambda x: '5' if '+' in x else x)
    processing_data['kid_category_desc'] = processing_data['kid_category_desc'].apply(lambda x: '3' if '+' in x else x)

    # 3. Convert the cleaned string columns to integers.
    processing_data['age_desc_min'] = processing_data['age_desc_min'].astype(int)
    processing_data['age_desc_max'] = processing_data['age_desc_max'].astype(int)

    size_cols = ['household_size_desc', 'kid_category_desc']
    processing_data[size_cols] = processing_data[size_cols].astype(int)

    # One-hot encode whatever is still of object dtype.
    cols_to_ohe = processing_data.select_dtypes('object').columns.tolist()
    for col in cols_to_ohe:
      processing_data = pd.concat([processing_data, pd.get_dummies(processing_data[col], prefix=col)], axis=1)

    processing_data.drop(columns=cols_to_ohe, inplace=True)

    return processing_data

  def scale_data(self, data_to_classify):
    """Standardise the int64 feature columns (except 'user_id' / 'item_id').

    Returns a scaled copy; the input is not modified.

    NOTE(review): the scaler is re-fitted on every call. GetRecommendations
    reuses this bound method on prediction data, so that data is scaled with
    its own statistics rather than the training ones — confirm whether a
    transform-only path is wanted.
    """
    scaling_data = data_to_classify.copy()

    scaler_cols = scaling_data.select_dtypes(include=['int64']).drop(columns=['user_id', 'item_id']).columns.tolist()
    scaling_data[scaler_cols] = self.scaler.fit_transform(scaling_data[scaler_cols])

    return scaling_data

  @staticmethod
  def split_data(scaled_data):
    """Randomly split the feature table into train and validation parts.

    Column names are stripped of every character outside ``[A-Za-z0-9_]`` and
    columns whose cleaned names collide are dropped (the first one is kept).

    Returns:
      ``(X_train, X_valid, y_train, y_valid)`` with a 67/33 shuffled split.
    """
    X = scaled_data.drop(columns='target')
    X = X.rename(columns=lambda x: re.sub('[^A-Za-z0-9_]+', '', x))
    X = X.iloc[:, ~X.columns.duplicated()]

    y = scaled_data['target'].astype(int)

    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33,
                                                          shuffle=True, random_state=42)

    return X_train, X_valid, y_train, y_valid

  @staticmethod
  def initialize_catboost(X_train, y_train):
    """Fit a CatBoostClassifier on ``X_train`` / ``y_train`` and return it.

    The model is trained on a copy whose columns are replaced by the numbers
    ``1..n_features``, so callers must relabel columns the same way before
    predicting.
    """
    class_weights = {0: 10, 1: 1}

    # Copy used only to give the columns artificial numeric labels.
    X_train_ = X_train.copy()
    X_train_.columns = [np.arange(1, X_train_.shape[1]+1)]

    catb = CatBoostClassifier(class_weights=class_weights,
                              n_estimators=1000, max_depth=15, l2_leaf_reg=9, silent=True)
    catb.fit(X_train_, y_train)

    return catb

# Base-class order (MRO): SecondStage first, then FirstStage.
class GetRecommendations(SecondStage, FirstStage):
  """Produce final top-5 recommendations for the level-2 validation users.

  Construction takes ALS candidates for every user of ``valid_lvl_2``, attaches
  features, processes and scales them with the second stage's methods, scores
  every (user, item) pair with the fitted classifier and keeps the
  highest-scored items per user in ``pred_recommendations``.
  """

  def __init__(self, first_stage, second_stage, data_pred):
    """Run the prediction pipeline.

    Args:
      first_stage: fitted FirstStage (ALS model, matrix, id mappings).
      second_stage: fitted SecondStage (classifier, processing and scaling).
      data_pred: not used; kept so existing three-argument calls keep working.
    """
    self.valid_lvl_2 = first_stage.valid_lvl_2

    self.sparse_user_item = first_stage.sparse_user_item

    self.new_to_old_user_ids = first_stage.new_to_old_user_ids
    self.new_to_old_item_ids = first_stage.new_to_old_item_ids
    self.old_to_new_user_ids = first_stage.old_to_new_user_ids
    self.old_to_new_item_ids = first_stage.old_to_new_item_ids

    self.als = first_stage.als

    self.catb = second_stage.catb

    # Cleaned feature names, in the order the classifier was trained on.
    self.feature_names = list(second_stage.X_train.columns)

    # NOTE(review): scale_data re-fits second_stage.scaler on whatever it is
    # given, so prediction data is scaled with its own statistics — confirm.
    self.scale_data = second_stage.scale_data

    self.data_processing = second_stage.data_processing

    # Final-prediction attributes.
    self.pred_candidates = self.get_pred_candidates(self.valid_lvl_2)

    self.pred_user_item = self.get_user_item(self.pred_candidates)

    self.pred_data_to_classify = self.data_processing(self.pred_user_item)

    self.pred_scaled_data = self.scale_data(self.pred_data_to_classify)

    self.pred_recommendations = self.get_recommendations(self.pred_scaled_data)

  def post_processing(self, user_id, data, n=5):
    """Return the ``n`` item ids with the highest 'proba' among ``user_id``'s rows of ``data``."""
    user_rows = data[data['user_id'] == user_id].drop_duplicates().reset_index()
    best_rows = user_rows.sort_values(by='proba', ascending=False)[:n]

    return best_rows['item_id'].values.tolist()

  def get_pred_candidates(self, valid_lvl_2):
    """Return one row per unique user of ``valid_lvl_2`` with a 'candidates' list of 300 ALS items."""
    users_pred = pd.DataFrame(valid_lvl_2['user_id'].unique())
    users_pred.columns = ['user_id']

    # The fake item 999999 may be absent from the mapping.
    fake_idx = self.old_to_new_item_ids.get(999999)
    filter_items = None if fake_idx is None else [fake_idx]

    def candidates_for(user_id):
      user_idx = self.old_to_new_user_ids[user_id]
      # Use the model held by this instance, not the notebook global `fs`.
      rec_idx = self.als.recommend(userid=user_idx,
                                   user_items=self.sparse_user_item[user_idx],
                                   N=300,
                                   filter_already_liked_items=False,
                                   filter_items=filter_items,
                                   recalculate_user=True)[0]
      return [self.new_to_old_item_ids[rec] for rec in rec_idx]

    users_pred['candidates'] = users_pred['user_id'].apply(candidates_for)

    return users_pred

  @staticmethod
  def get_user_item(pred_candidates):
    """Explode candidates into (user_id, item_id) rows and merge item and user features on."""
    s = pred_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
    s.name = 'item_id'

    users_pred = pred_candidates.drop(columns=['candidates'])
    users_pred = users_pred.join(s)

    users_pred = users_pred.merge(item_features, on='item_id', how='left')
    users_pred = users_pred.merge(user_features, on='user_id', how='left')

    return users_pred

  def get_recommendations(self, scaled_data):
    """Score every (user, item) row and return the top items per validation user.

    The prediction frame is aligned to the training features by name: its
    columns are cleaned and de-duplicated the same way as at training time,
    reordered to the training order, and one-hot columns that did not occur
    in the prediction data are added as zeros. Taking the first N columns by
    position is not enough, because this frame merges item features before
    user features, i.e. in a different column order than the training table.

    Returns:
      One row per user of ``self.valid_lvl_2`` with the bought items
      ('item_id') and the recommended items ('recs').
    """
    features = scaled_data.rename(columns=lambda name: re.sub('[^A-Za-z0-9_]+', '', name))
    features = features.loc[:, ~features.columns.duplicated()]
    users_pred = features.reindex(columns=self.feature_names, fill_value=0)

    # Copy used only to give the columns the artificial numeric labels the
    # classifier was fitted with.
    users_pred_ = users_pred.copy()
    users_pred_.columns = [np.arange(1, users_pred.shape[1]+1)]

    users_pred['proba'] = self.catb.predict_proba(users_pred_)[:, 1]

    valid_lvl_2_u_i = self.valid_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
    valid_lvl_2_u_i['recs'] = valid_lvl_2_u_i['user_id'].apply(lambda x: self.post_processing(x, users_pred))

    return valid_lvl_2_u_i

In [9]:
# Build the first stage: FirstStage.__init__ splits `data`, builds the user-item matrix and fits ALS.
fs = FirstStage(data)

100%|██████████| 100/100 [02:52<00:00,  1.72s/it, loss=0.0101]


In [10]:
# recommend() adds an 'als' column to fs.valid_lvl_1_u_i in place and returns that same frame.
valid_lvl_1_u_i = fs.recommend(fs.valid_lvl_1_u_i)

In [11]:
# Mean recall@300 of the ALS candidates ('als') against the items bought in the level-1 validation window.
fs.valid_lvl_1_u_i.apply(lambda x: recall_at_k(x['als'], x['item_id'], k=300), axis=1).mean()

0.2786351909537758

In [12]:
# Build the second stage on top of the fitted first stage (features, scaling, CatBoost fit).
ss = SecondStage(fs)

In [13]:
# CatBoost was fitted on columns relabelled 1..n_features, so the validation
# frame gets the same artificial labels before scoring (modifies ss.X_valid).
ss.X_valid.columns = [np.arange(1, ss.X_valid.shape[1]+1)]

y_valid_pred_proba = ss.catb.predict_proba(ss.X_valid)[:, 1]

precision, recall, threshold = precision_recall_curve(ss.y_valid, y_valid_pred_proba)

# F1 is undefined (0/0 -> nan) wherever precision + recall == 0; silence the
# warning and let nanargmax skip those points instead of argmax picking a nan.
with np.errstate(divide='ignore', invalid='ignore'):
  f1_score = (2 * precision * recall) / (precision + recall)

# precision/recall have one trailing point without a matching threshold, so it
# is excluded to keep `ix` a valid index into `threshold`.
ix = np.nanargmax(f1_score[:-1])

# precision[i]/recall[i] describe predictions with score >= threshold[i], so
# the comparison has to be inclusive to reproduce the selected point.
y_valid_pred = [1 if el >= threshold[ix] else 0 for el in y_valid_pred_proba]


print(classification_report(ss.y_valid, y_valid_pred))

              precision    recall  f1-score   support

           0       0.71      0.38      0.50     24878
           1       0.93      0.98      0.96    213731

    accuracy                           0.92    238609
   macro avg       0.82      0.68      0.73    238609
weighted avg       0.91      0.92      0.91    238609


In [None]:
# Run the final prediction pipeline. The third argument is accepted but never read by
# GetRecommendations.__init__, which takes valid_lvl_2 from the first stage instead.
get_recs = GetRecommendations(fs, ss, ss.valid_lvl_2)

In [None]:
# Per-user frame: 'item_id' holds the items bought in the level-2 window, 'recs' the recommended items.
get_recs.pred_recommendations

In [None]:
# Mean precision@5 of the final recommendations over the level-2 validation users.
get_recs.pred_recommendations.apply(lambda x: precision_at_k(x['recs'], x['item_id'], k=5), axis=1).mean()

In [None]:
# Mean NDCG@5 of the final recommendations over the level-2 validation users.
get_recs.pred_recommendations.apply(lambda x: ndcg_at_k(x['recs'], x['item_id'], k=5), axis=1).mean()