In [4]:
# Install the `implicit` package that the import cell below depends on.
pip install implicit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
import implicit
from scipy import sparse
from scipy.sparse import csr_matrix
from scipy.sparse import coo_matrix

from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import BM25Recommender
from implicit.lmf import LogisticMatrixFactorization

from implicit.evaluation import mean_average_precision_at_k, ndcg_at_k

In [6]:
# Location of the gzip-compressed CSV with the order log.
path = "/content/drive/MyDrive/WB School/data.csv.gzip"

# Read the file and convert the "order_ts" column to datetimes in one chain.
df = (
  pd.read_csv(path, compression='gzip')
    .assign(order_ts=lambda frame: pd.to_datetime(frame["order_ts"]))
)

# Функции

In [7]:
def drop_rare_items(df):
  """Drop all rows of items that occur in exactly one distinct order row.

  Exact duplicate rows are ignored while counting, but the returned frame is a
  filtered view of the *original* ``df`` (duplicates of kept items remain).

  Parameters:
    df: DataFrame with at least ``user_id``, ``item_id`` and ``order_ts`` columns.

  Returns:
    ``df`` without the rows whose ``item_id`` has a total count of 1.
  """
  distinct_rows = df.drop_duplicates()

  # Non-null "order_ts" values per (user, item) pair, stored as "counter".
  pair_counts = distinct_rows.groupby(["user_id", "item_id"], as_index=False).count()
  pair_counts = pair_counts.rename(columns={"order_ts": "counter"})

  # Sum the pair counters per item over all users.
  per_item = pair_counts.groupby("item_id", as_index=False)["counter"].sum()

  is_rare = per_item["counter"] == 1
  rare_ids = per_item.loc[is_rare, "item_id"].values

  keep_mask = ~df["item_id"].isin(rare_ids)
  return df[keep_mask]

In [13]:
def extract_ones(df):
  """Separate users that have exactly one distinct order row from the rest.

  Exact duplicate rows are ignored when counting a user's orders.

  Parameters:
    df: DataFrame with at least ``user_id``, ``item_id`` and ``order_ts`` columns.

  Returns:
    A tuple ``(df_ones, df_rest)``:
      * ``df_ones`` - rows of ``df`` belonging to single-order users, taken
        as-is (exact duplicates are NOT removed from this part);
      * ``df_rest`` - rows of all other users with exact duplicates removed.
  """
  df_temp = df.drop_duplicates()

  # Non-null "order_ts" values per (user, item) pair, stored as "counter".
  df_count = df_temp.groupby(["user_id", "item_id"], as_index=False).count().rename(columns={"order_ts": "counter"})

  # Summing the pair counters per user gives the user's number of distinct rows.
  df_count_users = df_count.groupby("user_id", as_index=False)["counter"].sum()
  users = df_count_users.loc[df_count_users["counter"] == 1, "user_id"].values

  is_single = df["user_id"].isin(users)
  df_ones = df[is_single]
  df_rest = df[~is_single].drop_duplicates()

  return df_ones, df_rest

In [9]:
def train_test(df, test_weeks=2, total_weeks=13):
  """Split an order log chronologically into a train and a test part.

  The test part is the most recent ``test_weeks / total_weeks`` share of the
  time span covered by ``df["order_ts"]``; everything up to and including the
  split point goes to train.

  Parameters:
    df: DataFrame with a datetime ``order_ts`` column.
    test_weeks: length of the test window, in the same units as ``total_weeks``.
    total_weeks: length the whole time span is assumed to have. The default 13
      is the value that used to be hard-coded here - presumably the length of
      the dataset in weeks; confirm against the data.

  Returns:
    A tuple ``(train, test)`` of row subsets of ``df``.

  Raises:
    ValueError: if ``test_weeks`` or ``total_weeks`` is not positive.
  """
  if test_weeks <= 0 or total_weeks <= 0:
    raise ValueError("test_weeks and total_weeks must be positive")

  # How many windows of `test_weeks` fit into the full span (may be fractional).
  n_folds = total_weeks / test_weeks

  newest = df["order_ts"].max()
  delta = (newest - df["order_ts"].min()) / n_folds
  edge = newest - delta

  train = df.loc[df["order_ts"] <= edge]
  test = df.loc[df["order_ts"] > edge]

  return train, test

In [10]:
def csr_matrix_constructor(df):
  """Turn a (user_id, item_id, counter) table into a user-by-item CSR matrix.

  Row ``r`` corresponds to the ``r``-th smallest ``user_id`` present in ``df``
  and column ``c`` to the ``c``-th smallest ``item_id``; cell values are taken
  from the ``counter`` column. The index mapping is derived from ``df`` alone,
  so two matrices built from different frames do not share row/column meaning.

  Parameters:
    df: DataFrame with ``user_id``, ``item_id`` and ``counter`` columns.

  Returns:
    A ``scipy.sparse`` CSR matrix of shape (n_unique_users, n_unique_items).
  """
  user_ids = df["user_id"]
  item_ids = df["item_id"]

  user_levels = sorted(user_ids.unique())
  item_levels = sorted(item_ids.unique())

  # Position of each id inside its sorted list of unique values.
  row_codes = user_ids.astype(CategoricalDtype(categories=user_levels, ordered=True)).cat.codes
  col_codes = item_ids.astype(CategoricalDtype(categories=item_levels, ordered=True)).cat.codes

  matrix = sparse.coo_matrix(
    (df["counter"], (row_codes, col_codes)),
    shape=(len(user_levels), len(item_levels)),
  )
  return matrix.tocsr()

In [29]:
def common_users_only(df1, df2):
  """Restrict two frames to the users that appear in both of them.

  Parameters:
    df1, df2: DataFrames with a ``user_id`` column.

  Returns:
    A tuple of independent copies ``(df1_new, df2_new)`` that contain only the
    rows whose ``user_id`` is present in both inputs.
  """
  shared = set(df1["user_id"]) & set(df2["user_id"])
  users = list(shared)

  in_first = df1["user_id"].isin(users)
  in_second = df2["user_id"].isin(users)

  return df1[in_first].copy(), df2[in_second].copy()

Функция для кросс-валидации на вход принимает следующие параметры:

*   df (pd.DataFrame) - датафрейм;
*   n_folds (int) - количество фолдов;
*   model_name (str) - название алгоритма построения рекомендаций. Возможные алгоритмы: "AlternatingLeastSquares", "BM25", "LogisticMatrixFactorization".
*   parameters (dict) - сетка параметров модели;
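*   metric (str) - критерий сравнения моделей, "nDCG" или "MAP";
*   K_items (int) - длина списка рекомендаций $K$, по которой считается метрика;
*   show_params (bool) - печатать ли параметры и значение метрики для каждой комбинации;
*   show_plot (bool) - в текущей реализации не используется.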

Для оценивания моделей используется [nDCG@K](https://www.researchgate.net/publication/330653326_Selecting_Appropriate_Metrics_for_Evaluation_of_Recommender_Systems). [Код](https://github.com/benfred/implicit/commit/861713e6cb4e65d7485abfab5e843d4872bf4bd1).

Для MF модели необходимо подобрать:
*   factors - число скрытых свойств (факторов) каждого юзера и айтема. При малом значении модель может не выявить взаимосвязи, при большом - переобучиться;
*   iterations - число итераций (эпох) для приближения исходной матрицы $R$ произведением матрицы юзер-векторов $X$ и матрицы айтем-векторов $Y$: $R \approx XY^T$;
*   regularization - значения коэффициента $L_2$-регуляризации: $\lambda(\sum_{u=1}^{|U|}||x_u||^2+\sum_{i=1}^{|I|}||y_i||^2)$;
*   alpha - скорость роста уверенности с ростом числа заказов юзером $u$ товара $i$: $c_{ui}=1+\alpha r_{ui}$.

Для BM-25 модели подбирается 3 гиперпараметра:
*   $K$ - число соседей;
*   $K_1$ - вес, с которым взаимодействие юзера с айтемом входит в скор для айтема;
*   $B$ - коэффициент, отвечающий за то, насколько важна нормализация числа заказов средним числом (длины документа). Чем меньше $B$, тем сильнее смещение к популярным айтемам.

Для [Logistic MF](https://github.com/benfred/implicit/blob/main/implicit/lmf.py) модели подбирается 4 параметра:
*   factors - число скрытых свойств (факторов) каждого юзера и айтема. При малом значении модель может не выявить взаимосвязи, при большом - переобучиться;
*   learning_rate - скорость обновления юзер-вектора и айтем-вектора. Например, зафиксируем $y_{i}$ и обозначим learning_rate за $\gamma$, а градиент логарифма правдоподобия на шаге $t$ - за $g_{u}^{t}=\nabla_{x_{u}}\ln L$. Тогда на $t$-ой итерации вектор $x_{u}$ обновится по формуле (AdaGrad, покомпонентно): $x_{u}^{t}=x_{u}^{t-1}+\frac{\gamma}{\sqrt{\sum_{\tau=1}^{t-1}(g_{u}^{\tau})^{2}}}\,g_{u}^{t-1}$. [Почему](https://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) в знаменателе корень. Правдоподобие максимизируется, поэтому градиент прибавляется;
*   iterations - число итераций;
*   regularization - значения коэффициента $L_2$-регуляризации.

Для [BPR](https://arxiv.org/ftp/arxiv/papers/1205/1205.2618.pdf) гиперпараметры будут следующими:
*   factors - скрытые факторы в модели MF;
*   learning_rate - скорость обновления вероятностей при градиентном спуске;
*   regularization - коэффициент регуляризации, т.к. в основе BPR лежит MF, которая склонна к переобучению;
*   iterations - число шагов градиентного спуска.

In [33]:
def _count_interactions(df):
  """Collapse an order log into one row per (user_id, item_id) with a ``counter`` column."""
  return (df.groupby(["user_id", "item_id"], as_index=False)["order_ts"]
            .count()
            .rename(columns={"order_ts": "counter"}))


def _aligned_csr_pair(train_counts, val_counts):
  """Build train/validation CSR matrices that share one user index and one item index."""
  users = sorted(set(train_counts["user_id"]) | set(val_counts["user_id"]))
  items = sorted(set(train_counts["item_id"]) | set(val_counts["item_id"]))
  user_cat = CategoricalDtype(categories=users, ordered=True)
  item_cat = CategoricalDtype(categories=items, ordered=True)
  shape = (len(users), len(items))

  def to_csr(counts):
    rows = counts["user_id"].astype(user_cat).cat.codes
    cols = counts["item_id"].astype(item_cat).cat.codes
    # NOTE(review): float32 is the default dtype of implicit's models - confirm for the installed version.
    data = counts["counter"].to_numpy(dtype=np.float32)
    return sparse.coo_matrix((data, (rows, cols)), shape=shape).tocsr()

  return to_csr(train_counts), to_csr(val_counts)


def _make_folds(df, n_folds):
  """Return the (train_csr, val_csr) pairs of an expanding-window time split."""
  first_timestamp = df["order_ts"].min()
  newest_timestamp = df["order_ts"].max()
  delta = (newest_timestamp - first_timestamp) / n_folds

  folds = []
  for i in range(1, n_folds):
    last_timestamp = first_timestamp + (i * delta)
    # The final window is closed at the newest timestamp so that rounding of
    # `delta` cannot drop the most recent orders.
    window_end = newest_timestamp if i == n_folds - 1 else last_timestamp + delta

    train_df = df.loc[df["order_ts"] <= last_timestamp]
    val_df = df.loc[(df["order_ts"] > last_timestamp) & (df["order_ts"] <= window_end)]

    # Only users with history on both sides of the split can be evaluated.
    train_df, val_df = common_users_only(train_df, val_df)
    if train_df.empty or val_df.empty:
      continue

    folds.append(_aligned_csr_pair(_count_interactions(train_df),
                                   _count_interactions(val_df)))
  return folds


def _build_model(model_name, params):
  """Instantiate the implicit model named ``model_name`` with one grid point."""
  if model_name == "AlternatingLeastSquares":
    return AlternatingLeastSquares(random_state=42, **params)
  if model_name == "BM25":
    return BM25Recommender(**params)
  # Seeded like ALS so that repeated runs give the same scores.
  return LogisticMatrixFactorization(random_state=42, **params)


def cross_val(df, model_name, n_folds, parameters, metric="nDCG", K_items=10, show_params=True, show_plot=True):
  """Grid-search the hyper-parameters of an implicit model with a time-based split.

  The time span of ``df`` is cut into ``n_folds`` equal windows. For every
  ``i`` in ``1 .. n_folds - 1`` the model is fitted on all orders up to the end
  of window ``i`` and scored on window ``i + 1``, restricted to users present
  on both sides. Train and validation matrices of a fold share the same
  user/item indexing. The score of a parameter combination is the mean of its
  fold scores; the first combination with the highest mean wins.

  Parameters:
    df: order log with ``user_id``, ``item_id`` and datetime ``order_ts`` columns.
    model_name: "AlternatingLeastSquares", "BM25" or "LogisticMatrixFactorization".
    n_folds: number of time windows (at least 2).
    parameters: dict mapping each hyper-parameter name to a list of candidates.
      Required keys - ALS: factors, iterations, regularization, alpha;
      BM25: K, K1, B; LMF: factors, learning_rate, regularization, iterations.
      Other keys are ignored.
    metric: "nDCG" or "MAP".
    K_items: cut-off K of the ranking metric.
    show_params: print every combination together with its score.
    show_plot: accepted for backward compatibility; currently unused.

  Returns:
    dict with the best value found for every hyper-parameter of the model.

  Raises:
    NotImplementedError: for "Bayesian Personalized Ranking".
    ValueError: unknown ``model_name`` or ``metric``, ``n_folds < 2``, an empty
      candidate list, or no fold with users on both sides of the split.
    KeyError: a required hyper-parameter is missing from ``parameters``.
  """
  from itertools import product

  grid_keys = {
    "AlternatingLeastSquares": ("factors", "iterations", "regularization", "alpha"),
    "BM25": ("K", "K1", "B"),
    "LogisticMatrixFactorization": ("factors", "learning_rate", "regularization", "iterations"),
  }

  # All validation happens before any data is touched.
  if model_name == "Bayesian Personalized Ranking":
    raise NotImplementedError("Bayesian Personalized Ranking is not supported by cross_val yet")
  if model_name not in grid_keys:
    raise ValueError("Unknown model_name: " + repr(model_name))
  if metric not in ("MAP", "nDCG"):
    raise ValueError("metric must be 'nDCG' or 'MAP', got " + repr(metric))
  if n_folds < 2:
    raise ValueError("n_folds must be at least 2")

  names = grid_keys[model_name]
  missing = [name for name in names if name not in parameters]
  if missing:
    raise KeyError("parameters is missing: " + ", ".join(missing))

  grid = [list(parameters[name]) for name in names]
  combinations = 1
  for values in grid:
    combinations *= len(values)
  if combinations == 0:
    raise ValueError("every hyper-parameter needs at least one candidate value")

  # The folds do not depend on the hyper-parameters, so they are built once.
  folds = _make_folds(df, n_folds)
  if not folds:
    raise ValueError("no fold contains users on both sides of the split")

  score_fn = mean_average_precision_at_k if metric == "MAP" else ndcg_at_k

  best_params = None
  best_metric_score = float("-inf")

  for number, values in enumerate(product(*grid), start=1):
    params = dict(zip(names, values))

    if show_params:
      print("№ " + str(number) + "/" + str(combinations))

    metric_at_k_list = []
    for train_csr, val_csr in folds:
      model = _build_model(model_name, params)
      model.fit(train_csr)
      metric_at_k_list.append(score_fn(model=model,
                                       train_user_items=train_csr,
                                       test_user_items=val_csr,
                                       K=K_items,
                                       show_progress=False))  # otherwise the output is flooded

    metric_score = sum(metric_at_k_list) / len(metric_at_k_list)

    # Strict ">" keeps the earliest combination on ties; the None check makes
    # sure a result exists even when every score is 0.
    if best_params is None or metric_score > best_metric_score:
      best_params = params
      best_metric_score = metric_score

    if show_params:
      print(" | ".join(name + ": " + str(value) for name, value in params.items()))
      print(metric + "@" + str(K_items) + " = " + str(metric_score))
      print()
      print("------------------")
      print()

  print("Model", model_name)
  print("Best_score", best_metric_score)
  print(" | ".join(name + " = " + str(value) for name, value in best_params.items()))

  return best_params

# Пример использования функций:

In [17]:
# Split off the users returned by extract_ones, then filter items with drop_rare_items.
df_ones_new, df_new = extract_ones(df)
df_new = drop_rare_items(df_new)

# Chronological hold-out; test_new is not used by the search below.
train_new, test_new = train_test(df_new, test_weeks=2)

# A single-point grid: one candidate value per ALS hyper-parameter.
parameters = dict(
  factors=[20],
  iterations=[4],
  regularization=[0.03],
  alpha=[6],
)

als_best_parameters = cross_val(
  train_new,
  "AlternatingLeastSquares",
  6,
  parameters,
  metric="nDCG",
  K_items=20,
  show_params=True,
  show_plot=True,
)