# Baseline решение хакатона от Ленты

## I. Подготовка данных

In [3]:
import gc
import pandas as pd
import numpy as np

from scipy import sparse as sps
from sparsesvd import sparsesvd
from collections import Counter 
from matplotlib import pyplot as plt


def price_encoder(x):
    """Bucket a price into 50-unit bins, capping the bucket index at 10000 // 50.

    Args:
        x: numeric price.

    Returns:
        ``x // 50`` when that does not exceed the cap (200), otherwise the cap.
    """
    bucket_width = 50
    top_bucket = 10000 // bucket_width
    bucket = x // bucket_width
    # Same selection rule as min(bucket, top_bucket): the cap wins only when it is strictly smaller.
    return top_bucket if top_bucket < bucket else bucket


def append_hours(df):
    """Append order-level context (hour, weekday, price bucket) as pseudo-item rows.

    The input frame is not modified; all work happens on a copy.

    Args:
        df: interactions frame with at least the columns ``buyer_id``,
            ``pav_order_id``, ``created`` (anything ``pd.to_datetime`` accepts),
            ``item_id`` and ``price_sold``.

    Returns:
        A new DataFrame holding the original rows (with ``created`` parsed to
        datetime, ``price_sold`` replaced by its bucket from ``price_encoder``
        and new ``hour`` / ``dow`` columns) followed by three extra rows per
        order whose ``item_id`` is ``'hour_<h>'``, ``'dow_<d>'`` and
        ``'price_sold_<bucket>'``.
    """
    # Copy first so the caller's frame keeps its raw `created` / `price_sold` values.
    joined = df.copy()

    # Parse the timestamp once and derive hour / weekday from it.
    created = pd.to_datetime(joined['created'])
    joined['created'] = created
    joined['hour'] = created.dt.hour
    joined['dow'] = created.dt.weekday

    # Replace the raw price with its bucket index.
    joined['price_sold'] = joined['price_sold'].apply(price_encoder)

    # One row per order with the mean hour, weekday and price bucket of its lines.
    grouped = joined.groupby(['buyer_id', 'pav_order_id', 'created']).agg(
        {'hour': 'mean', 'dow': 'mean', 'price_sold': 'mean'}
    ).reset_index()

    # Turn every aggregate into a pseudo-item row and collect the pieces.
    pieces = [joined]
    for col in ['hour', 'dow', 'price_sold']:
        temp = grouped.copy()
        # Build the token before `price_sold` is overwritten with the placeholder below.
        temp['item_id'] = temp[col].apply(lambda x: f'{col}_{int(x)}')
        temp['count'] = 1
        temp['price_sold'] = 1
        temp['flag_weight_goods'] = False
        temp['weight'] = 1

        # Keep only the columns the interaction rows have, in the same order.
        pieces.append(temp[joined.columns])

    # Single concatenation instead of growing the frame inside the loop.
    return pd.concat(pieces, sort=True)
    
    

# Both CSV files are read from a data directory two levels above the notebook.
path = '../../data/'
test_data = pd.read_csv(f'{path}test.csv')
hist_data = pd.read_csv(f'{path}hist_data.csv')
hist_data.head()

Unnamed: 0,buyer_id,pav_order_id,created,item_id,count,price_sold,flag_weight_goods,weight
0,95203091,98506637863,2021-07-01 00:03:44,202808329,1.0,79.99,False,11.14
1,95203091,98506637863,2021-07-01 00:03:44,202953905,1.072,44.945,True,11.14
2,95203091,98506637863,2021-07-01 00:03:44,203566452,1.0,69.99,False,11.14
3,95203091,98506637863,2021-07-01 00:03:44,202820143,1.972,41.295,True,11.14
4,95203091,98506637863,2021-07-01 00:03:44,204400422,1.0,269.99,False,11.14


In [4]:
# Enrich both frames with the order-context pseudo-items produced by append_hours.
# NOTE: both names are rebound to the transformed frames, so this cell is meant
# to be executed once per kernel session.
hist_data = append_hours(hist_data)
# The two printed shapes show how many rows and columns the enrichment adds to test_data.
print(test_data.shape)
test_data = append_hours(test_data)
print(test_data.shape)

(1081420, 7)
(1322152, 9)


In [5]:
# Filter out low-activity users and unpopular products.
def filter_column(ds, col, min_freq, free_col='rating'):
    """Keep only the rows whose ``col`` value is frequent enough.

    Args:
        ds: DataFrame to filter.
        col: column whose values are counted.
        min_freq: minimum number of non-null ``free_col`` entries a value of
            ``col`` needs in order to be kept.
        free_col: column used for counting.

    Returns:
        A copy of the rows of ``ds`` whose ``col`` value reaches ``min_freq``.
    """
    occurrences = ds.groupby(col)[free_col].count()
    frequent_values = occurrences[occurrences >= min_freq].index
    keep_mask = ds[col].isin(frequent_values)
    return ds[keep_mask].copy()


# First drop orders with fewer than 10 priced rows, then items with fewer than 20.
for column, threshold in (('pav_order_id', 10), ('item_id', 20)):
    hist_data = filter_column(hist_data, col=column, min_freq=threshold, free_col='price_sold')

Дальше кодируем идентификаторы заказов и товаров, чтобы составить матрицу интеракций.

In [6]:
# Dense 0-based indices in order of first appearance: orders become matrix rows, items columns.
order_ids = hist_data['pav_order_id'].unique()
item_ids = hist_data['item_id'].unique()

user2idx = {raw_id: pos for pos, raw_id in enumerate(order_ids)}
item2idx = {raw_id: pos for pos, raw_id in enumerate(item_ids)}
print(len(item2idx))

# Reverse lookups (index -> raw id).
idx2user = dict(enumerate(order_ids))
idx2item = dict(enumerate(item_ids))

# Every value is a key of its own mapping, so the dictionary lookup is total.
hist_data['pav_order_id'] = hist_data['pav_order_id'].map(user2idx)
hist_data['item_id'] = hist_data['item_id'].map(item2idx)

hist_data.head()

19159


Unnamed: 0,buyer_id,count,created,dow,flag_weight_goods,hour,item_id,pav_order_id,price_sold,weight
0,95203091,1.0,2021-07-01 00:03:44,3.0,False,0.0,0,0,1.0,11.14
1,95203091,1.072,2021-07-01 00:03:44,3.0,True,0.0,1,0,0.0,11.14
2,95203091,1.0,2021-07-01 00:03:44,3.0,False,0.0,2,0,1.0,11.14
3,95203091,1.972,2021-07-01 00:03:44,3.0,True,0.0,3,0,0.0,11.14
4,95203091,1.0,2021-07-01 00:03:44,3.0,False,0.0,4,0,5.0,11.14


In [7]:

# One stored entry of weight 1 per interaction row; rows are encoded orders, columns encoded items.
order_rows = hist_data['pav_order_id']
item_cols = hist_data['item_id']
interaction_weights = np.ones(len(hist_data))

matrix = sps.coo_matrix(
    (interaction_weights, (order_rows, item_cols)),
    shape=(len(user2idx), len(item2idx)),
)
matrix

<231339x19159 sparse matrix of type '<class 'numpy.float64'>'
	with 5044335 stored elements in COOrdinate format>

## II. Обучение модели

В качестве модели используем EASE.

<a href="https://arxiv.org/abs/1905.03375">Статья на arxiv</a>

<a href='https://github.com/RUCAIBox/RecBole/blob/master/recbole/model/general_recommender/ease.py'>Имплементация в RecBole</a>

In [8]:
%%time

# Train the final model.
# The implementation follows the one in RecBole.
def fit_ease(X, reg_weight=100):
    """Fit EASE item-to-item weights from an interaction matrix.

    Args:
        X: sparse interaction matrix with one column per item.
        reg_weight: value added to the diagonal of the Gram matrix.

    Returns:
        Dense square item-by-item weight matrix with a zero diagonal. It is
        produced from ``.todense()``, so it is the matrix type that call returns.
    """
    # Item-item Gram matrix with the regulariser on its diagonal.
    gram = X.T @ X
    gram = gram + reg_weight * sps.identity(gram.shape[0])

    # The inverse is dense anyway, so densify before inverting.
    dense_gram = gram.todense()

    # The inversion dominates the running time.
    P = np.linalg.inv(dense_gram)

    # Divide every column j by -P[j, j], then remove the self-similarities.
    negated_diagonal = -np.diag(P)
    B = P / negated_diagonal
    np.fill_diagonal(B, 0.)

    return B

# Fit on the full interaction matrix with the default reg_weight.
w = fit_ease(matrix)

CPU times: user 9min 33s, sys: 20.8 s, total: 9min 54s
Wall time: 2min 9s


In [9]:
# Sanity check: the weight matrix should be square, one row and one column per encoded item.
w.shape

(19159, 19159)

## III. Делаем предсказания

In [10]:
def get_preds(x):
    """Encode a basket of raw item ids as a 0/1 vector over the known items.

    Args:
        x: iterable of raw item ids; ids missing from ``item2idx`` are ignored.

    Returns:
        1-D float array of length ``len(item2idx)`` with 1 at the position of
        every known basket item.
    """
    # Interaction vector of the order, all zeros to start with.
    vector = np.zeros(len(item2idx))

    for raw_item in x:
        position = item2idx.get(raw_item)
        if position is not None:
            vector[position] = 1

    return vector

# Collect each test order's items into a list, then encode that list as an interaction vector.
items_by_order = test_data.groupby(['pav_order_id'])['item_id']
pred = items_by_order.agg([('basket', list)]).reset_index()
pred['vector'] = pred['basket'].apply(get_preds)
pred.head()

Unnamed: 0,pav_order_id,basket,vector
0,4620121489,"[203164283, 204043498, 204146308, 204119602, 2...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,4620121505,"[202819114, 204074914, 202822471, 202880254, 2...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
2,4620121594,"[202818687, 203430473, 204016498, 203017711, 2...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
3,4620121684,"[203338264, 203436378, 203433668, 202812161, 2...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,4620121902,"[205768202, 202811971, 203429467, 204393593, 2...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


Дальше мы должны перемножить векторы всех заказов на матрицу весов EASE. Сделать это за один раз — долго и непрактично. Поэтому мы будем делать это по батчам, перемножая примерно по 10к векторов за один раз.

In [11]:
from tqdm import tqdm

BATCH_SIZE = 10000  # vectors multiplied by the weight matrix in one go
TOP_K = 200         # ranked candidates kept per order before filtering


def _rank_batch(vectors, weights, top_k=TOP_K):
    """Score one batch of interaction vectors and return the top_k item indices per row.

    Args:
        vectors: list of equally sized 1-D interaction vectors.
        weights: item-by-item weight matrix.
        top_k: number of best-scoring item indices to keep per row.

    Returns:
        One entry per input vector, each the ``.tolist()`` of that row of the
        ranking; an empty list when ``vectors`` is empty.
    """
    # An empty batch has no second dimension, so multiplying it would raise.
    if len(vectors) == 0:
        return []

    # These expressions are deliberately the same as before, so the container type
    # of the product (and therefore the shape of each `.tolist()` result that the
    # decoding step consumes) does not change.
    user_scores = np.array(vectors).dot(weights)
    user_scores = np.argsort(-user_scores)[:, :top_k]
    return [user_scores[i].tolist() for i in range(len(user_scores))]


scores = []  # not used in this cell; kept in case other cells reference it
recs_for_user = []
print(pred.shape)

batch = []
for vector in tqdm(pred['vector'], total=pred.shape[0]):
    batch.append(vector)
    # Flush as soon as the batch holds exactly BATCH_SIZE vectors.
    if len(batch) >= BATCH_SIZE:
        recs_for_user.extend(_rank_batch(batch, w))
        batch = []

# Score whatever is left over; a no-op when the last batch was flushed in the loop.
recs_for_user.extend(_rank_batch(batch, w))
            

(80244, 3)


100%|██████████| 80244/80244 [07:20<00:00, 182.16it/s] 


In [12]:
def get_decoded_recommendations(x, top_n=20):
    """Decode one order's ranked item indices into at most ``top_n`` raw item ids.

    Args:
        x: row with ``basket`` (raw item ids already in the order) and ``recs``
            (ranked encoded item indices, either as a flat list or as a single
            ranking wrapped in a one-element outer list).
        top_n: maximum number of recommendations to return.

    Returns:
        List of raw item ids, best first. It can be shorter than ``top_n`` when
        too few candidates survive the filtering.
    """
    ranked = x.recs
    # Unwrap the [[...]] form; a flat ranking is used as is.
    if len(ranked) > 0 and isinstance(ranked[0], (list, tuple)):
        ranked = ranked[0]

    # Encoded ids of what is already in the basket; a set gives O(1) membership tests.
    consumed = {item2idx[t] for t in x.basket if t in item2idx}

    # Substrings that identify the special tokens carrying aggregated order statistics.
    service_markers = ('hour', 'price', 'dow')

    recs = []
    for el in ranked:
        if len(recs) >= top_n:
            break

        item = idx2item[el]
        label = str(item)
        # Skip special tokens and anything the customer has already picked.
        if el in consumed or any(marker in label for marker in service_markers):
            continue

        recs.append(item)

    return recs


# Attach the raw rankings to their orders, then decode them row by row.
pred['recs'] = recs_for_user
pred['preds'] = pred.apply(get_decoded_recommendations, axis=1)
pred.head()

Unnamed: 0,pav_order_id,basket,vector,recs,preds
0,4620121489,"[203164283, 204043498, 204146308, 204119602, 2...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[19144, 19152, 19141, 5392, 115, 19130, 19123...","[203566491, 202820148, 202791620, 202820143, 2..."
1,4620121505,"[202819114, 204074914, 202822471, 202880254, 2...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","[[19150, 19144, 19140, 7, 115, 564, 291, 293, ...","[203068900, 202820148, 203059303, 202880262, 2..."
2,4620121594,"[202818687, 203430473, 204016498, 203017711, 2...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[[19150, 19144, 19134, 115, 564, 7, 7050, 783,...","[202820148, 203059303, 203068900, 202809628, 2..."
3,4620121684,"[203338264, 203436378, 203433668, 202812161, 2...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[19150, 19144, 19132, 2023, 9420, 10471, 336,...","[202812162, 203041368, 203405624, 203090014, 2..."
4,4620121902,"[205768202, 202811971, 203429467, 204393593, 2...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[19150, 19144, 19130, 115, 858, 4282, 33, 783...","[202820148, 203422957, 203431923, 202872237, 2..."


In [14]:
# Write the decoded recommendations, keyed by order id, to the submission file.
to_save = pred.copy().set_index('pav_order_id')
to_save['preds'].to_csv(f'{path}baseline5.csv')