In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_csv('./retail_train.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

In [4]:
items = data_test.groupby('item_id')['sales_value'].sum().reset_index()
items.columns=['item_id', 'weight']


items.drop(items[items['weight'] < 1.0].index, inplace=True)
items['weight'] = np.log(items['weight'])

items['weight'] = items['weight'] / sum(items['weight'])

items

Unnamed: 0,item_id,weight
1,30356,0.000018
2,32392,0.000007
3,32439,0.000013
4,34873,0.000015
5,40405,0.000017
...,...,...
24324,17991689,0.000019
24325,17991691,0.000019
24326,18000012,0.000063
24327,18024155,0.000029


### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекоммендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямопропорционально популярности. Вес = log(sales_sum товара)

In [5]:
def weighted_random_recommendation(items_weights, n=5):
    """Случайные рекоммендации
    
    Input
    -----
    items_weights: pd.DataFrame
        Датафрейм со столбцами item_id, weight. Сумма weight по всем товарам = 1
    """
    
    recs = np.random.choice(items_weights['item_id'], size=n, replace=False, p=items_weights['weight'])
    
    
    return recs.tolist()

In [6]:
%%time

result = weighted_random_recommendation(items)

result

CPU times: total: 0 ns
Wall time: 3.99 ms


[13513120, 13039321, 9419268, 13416167, 933835]

## Задание 2. Улучшение бейзлайнов и ItemItem

- Попробуйте улучшить бейзлайны, считая случаный на топ-5000 товаров
- Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей $K$.

In [7]:
items = data_test.groupby('item_id')['sales_value'].sum().reset_index()
items.columns=['item_id', 'weight']

items = items.sort_values('weight', ascending=False)[:5000]

items.drop(items[items['weight'] < 1.0].index, inplace=True)
items['weight'] = np.log(items['weight'])

items['weight'] = items['weight'] / sum(items['weight'])

items

items

Unnamed: 0,item_id,weight
16073,6534178,0.000561
9544,1029743,0.000436
16058,6533889,0.000434
12946,1106523,0.000412
16052,6533765,0.000412
...,...,...
5035,926318,0.000158
17408,8293426,0.000158
12124,1087955,0.000158
19190,10356542,0.000158


In [8]:
%%time

result = weighted_random_recommendation(items)

result

CPU times: total: 0 ns
Wall time: 998 µs


[1027372, 12811993, 17208470, 1123022, 12731436]

#### Item-Item Recommender / ItemKNN

In [9]:
%%time

user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', columns='item_id', 
                                  values='quantity',
                                  aggfunc='count', 
                                  fill_value=0)

user_item_matrix[user_item_matrix > 0] = 1 # так как в итоге хотим предсказать 

user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

# переведем в формат разряженной матрицы
sparse_user_item = csr_matrix(user_item_matrix)

CPU times: total: 3min 42s
Wall time: 3min 50s


In [10]:
# создаем словари мапинга между id бизнеса к строчному id матрицы

userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

In [11]:
%%time

model = ItemItemRecommender(K=5, num_threads=4) # K - кол-во билжайших соседей


model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
          show_progress=False)

CPU times: total: 9.2 s
Wall time: 8.44 s


In [None]:
%%time

recs = model.recommend(userid=userid_to_id[2],  # userid - id от 0 до N
                        user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                        N=5, # кол-во рекомендаций 
                        filter_already_liked_items=False, 
                        filter_items=None, 
                        recalculate_user=True)

Не могу выполнить код выше, постоянно падает ядро с ошибкой "The kernel appears to have died. It will restart automatically."
Так что сдаю без этой части