In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

from metrics import precision_at_k
import warnings
warnings.simplefilter('ignore')

In [2]:
def get_recommendations(user, model, N=5):
    """Return the top-N recommended raw item ids for a single user.

    Relies on notebook-level globals defined in other cells:
    ``userid_to_id`` (raw user id -> matrix id), ``id_to_itemid``
    (matrix id -> raw item id) and ``sparse_user_item``.

    Args:
        user: raw user id; it must be a key of ``userid_to_id``.
        model: fitted model exposing a ``recommend`` method.
        N: number of items to request.

    Returns:
        list of raw item ids, in the order produced by ``model.recommend``.
    """
    recommendations = model.recommend(
        userid=userid_to_id[user],
        user_items=sparse_user_item,  # the user-item matrix is passed as input
        N=N,
        filter_already_liked_items=False,
        filter_items=None,
        recalculate_user=True,
    )
    # The first element of every recommendation is a matrix item id; map it back to the raw id.
    return [id_to_itemid[rec[0]] for rec in recommendations]

In [3]:
# Load the raw transactions and normalise the column names.
data = pd.read_csv('./data/transaction_data.csv')

data.columns = [col.lower() for col in data.columns]
data = data.rename(columns={'household_key': 'user_id',
                            'product_id': 'item_id'})

# Time-based hold-out: the most recent weeks are kept for evaluation.
test_size_weeks = 3

# Boundary week, computed once instead of once per filter.
test_start_week = data['week_no'].max() - test_size_weeks
# NOTE(review): with the `>=` boundary the test set spans the weeks
# max - test_size_weeks .. max, i.e. test_size_weeks + 1 distinct week numbers if they
# are consecutive integers. Kept unchanged so the split matches the recorded results.

# Explicit copies: data_train is assigned into by a later cell, and writing into a
# filtered slice of `data` triggers pandas' SettingWithCopyWarning.
data_train = data[data['week_no'] < test_start_week].copy()
data_test = data[data['week_no'] >= test_start_week].copy()

data_train.head(10)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0
5,2375,26984851516,1,826249,2,1.98,364,-0.6,1642,1,0.0,0.0
6,2375,26984851516,1,1043142,1,1.57,364,-0.68,1642,1,0.0,0.0
7,2375,26984851516,1,1085983,1,2.99,364,-0.4,1642,1,0.0,0.0
8,2375,26984851516,1,1102651,1,1.89,364,0.0,1642,1,0.0,0.0
9,2375,26984851516,1,6423775,1,2.0,364,-0.79,1642,1,0.0,0.0


In [4]:
# Ground truth for evaluation: the distinct items each user bought in the test period.
result = (
    data_test
    .groupby('user_id')['item_id']
    .unique()
    .rename('actual')
    .reset_index()
)

In [5]:
# Total quantity sold per item over the training period.
popularity = (
    data_train
    .groupby('item_id')['quantity']
    .sum()
    .rename('n_sold')
    .reset_index()
)

# Ids of the 5000 best-selling items, most sold first.
top_5000 = (
    popularity
    .sort_values('n_sold', ascending=False)
    .head(5000)['item_id']
    .tolist()
)

In [6]:
# Collapse every item outside the top-5000 into one placeholder id (999999).
# Work on an explicit copy first: data_train was produced by filtering `data`, and
# assigning into such a frame directly triggers pandas' SettingWithCopyWarning.
data_train = data_train.copy()
data_train.loc[~data_train['item_id'].isin(top_5000), 'item_id'] = 999999

# Rows are users, columns are items. aggfunc='count' counts purchase records per
# (user, item) pair; it does not sum the `quantity` values.
user_item_matrix = pd.pivot_table(data_train,
                                  index='user_id', columns='item_id',
                                  values='quantity',
                                  aggfunc='count',
                                  fill_value=0)

user_item_matrix = user_item_matrix.astype(float)  # matrix dtype required by implicit
# csr_matrix(...) already yields CSR format, so the extra .tocsr() call was redundant.
sparse_user_item = csr_matrix(user_item_matrix)

In [7]:
# Raw ids, in the order in which they label the matrix rows (users) and columns (items).
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

# Positional ids 0..n-1 used to address the matrix.
matrix_userids = np.arange(userids.shape[0])
matrix_itemids = np.arange(itemids.shape[0])

# Raw id -> matrix id.
userid_to_id = {raw: pos for raw, pos in zip(userids, matrix_userids)}
itemid_to_id = {raw: pos for raw, pos in zip(itemids, matrix_itemids)}

# Matrix id -> raw id.
id_to_userid = {pos: raw for pos, raw in zip(matrix_userids, userids)}
id_to_itemid = {pos: raw for pos, raw in zip(matrix_itemids, itemids)}

### BM25

In [8]:
# Apply BM25 weighting to the interaction counts.
# The weights are computed from the untouched count matrix (sparse_user_item, which holds
# the same values as the pivot table), so re-running this cell no longer applies BM25 to
# an already weighted matrix.
# NOTE: from here on user_item_matrix is the weighted matrix produced by bm25_weight,
# not the pivot-table DataFrame any more.
user_item_matrix = bm25_weight(sparse_user_item.T).T

In [26]:
# The weighted matrix is transposed before being handed to fit().
item_user_matrix = csr_matrix(user_item_matrix).T.tocsr()

model = AlternatingLeastSquares(
    factors=64,
    regularization=0.05,
    iterations=15,
    calculate_training_loss=True,
    num_threads=0,
)
model.fit(item_user_matrix, show_progress=True)

# Store the top-5 recommendations for every user listed in `result`.
result['als_bm25'] = result['user_id'].apply(
    lambda user: get_recommendations(user, model=model, N=5)
)

  0%|          | 0/15 [00:00<?, ?it/s]

In [27]:
# Mean of the per-user precision (precision_at_k is called with its default arguments).
result.apply(
    lambda row: precision_at_k(row['als_bm25'], row['actual']),
    axis=1,
).mean()

0.17790055248618786

### Попробуем оптимизировать параметры модели Grid search'ем

In [11]:
# Candidate values for every hyper-parameter explored by the grid search.
param = dict(
    factors=[32, 64, 128, 256],
    regularization=np.arange(0.01, 0.06, 0.01),
    iterations=[5, 10, 15, 20, 25],
)

In [45]:
# Build the matrix of all parameter combinations: one row per
# (factors, regularization, iterations) triple.
param_grids = np.meshgrid(param['factors'],
                          param['regularization'],
                          param['iterations'])
combined_params = np.array(param_grids).T.reshape(-1, 3)

In [30]:
# Fit one ALS model per parameter combination and record its mean precision.
# NOTE(review): no random seed is passed to the model, so scores can differ between runs.
# NOTE(review): the score is computed against result['actual'], so the winning
# configuration is selected on the same data it is later reported on.

# The training matrix does not depend on the hyper-parameters: convert and transpose it
# once instead of rebuilding it on every iteration.
item_user_matrix = csr_matrix(user_item_matrix).T.tocsr()

result_list = []
for par in combined_params:
    # factors and iterations are cast to int explicitly; regularization is passed as is.
    model = AlternatingLeastSquares(factors=int(par[0]),
                                    regularization=par[1],
                                    iterations=int(par[2]),
                                    calculate_training_loss=True,
                                    num_threads=0)
    model.fit(item_user_matrix, show_progress=False)
    # The 'als_bm25' column is overwritten on every iteration.
    result['als_bm25'] = result['user_id'].apply(lambda x: get_recommendations(x, model=model, N=5))
    current_score = result.apply(lambda row: precision_at_k(row['als_bm25'], row['actual']), axis=1).mean()
    result_list.append((par, current_score))

In [40]:
# Order all runs by score, best first, and keep the leading (params, score) pair.
ranked_results = sorted(result_list, key=lambda entry: entry[1], reverse=True)
best_result = ranked_results[0]

In [44]:
# best_result is a (params, score) pair; params is ordered as factors, regularization, iterations.
best_params, best_score = best_result
print(f'factors: {int(best_params[0])}\n'
      f'regularization: {best_params[1]}\n'
      f'iterations: {int(best_params[2])}\n'
      f'result: {best_score}')

factors: 256
regularization: 0.04
iterations: 5
result: 0.19919638372677048


In [47]:
# Fit and evaluate the model once more with the best parameters found by the grid search.
# The values are read from best_result instead of being copied by hand from the printed
# output, so this cell cannot go stale when the search is re-run.
best_params = best_result[0]
model = AlternatingLeastSquares(factors=int(best_params[0]),
                                regularization=best_params[1],
                                iterations=int(best_params[2]),
                                calculate_training_loss=True,
                                num_threads=0)
model.fit(csr_matrix(user_item_matrix).T.tocsr(), show_progress=False)
result['als_bm25'] = result['user_id'].apply(lambda x: get_recommendations(x, model=model, N=5))
result.apply(lambda row: precision_at_k(row['als_bm25'], row['actual']), axis=1).mean()

0.2004018081366148

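Получили даже чуть больше — полагаю, из-за другого random_state: начальная инициализация ALS случайна, а seed в ноутбуке не зафиксирован, поэтому результат немного меняется от запуска к запуску.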