In [None]:
import pandas as pd
import scipy.sparse as sparse

from code.preprocessing import Dataset
from core.database.db import DB
from code.metrics import fuzzy, precision
from implicit.als import AlternatingLeastSquares

db = DB(db='recsys')
from code.preprocessing import filter_old_cards, filter_rare_cards, filter_rare_goods, filter_old_goods, filter_by_quantile
%load_ext autoreload
%autoreload 2

### Препроцессинг трейна

In [None]:
train = pd.read_sql('select * from db.train', con = db.engine)
print('Shape: %s' % train.shape[0])

train = filter_rare_goods(train, rarity_num=5)
print('Shape without rare goods: %s' % train.shape[0])

train = filter_rare_cards(train, rarity_num=5)
print('Shape without rare cards: %s' % train.shape[0])

train = filter_old_cards(train, month_threshold=1)
print('Shape without old cards: %s' % train.shape[0])

train = filter_old_goods(train, month_threshold=1)
print('Shape without old goods: %s' % train.shape[0])

train = filter_by_quantile(train, plu_count_quantiles=(0.5, 0.99), cards_count_quantiles=(0.4, 0.99))
print('Shape without low and high quantiles: %s' % train.shape[0])

Shape: 17985532
Shape without rare goods: 17591183
Shape without rare cards: 17231447
Shape without old cards: 13718026
Shape without old goods: 12065529


In [3]:
ds = Dataset(train)
matrix = ds.make_matrix()
matrix = ds.transform(matrix, method='clip', clip_upper_value=1000)
matrix = ds.transform(matrix, method='log')
matrix = ds.apply_weights(matrix, weight='bm25')

## Подготовка и очистка тестового сета

In [4]:
products = pd.read_sql('select * from db.products', con = db.engine)
test = pd.read_sql('select * from db.test', con = db.engine)
val = pd.read_sql('select * from db.val', con = db.engine)

test.columns = [x.lower() for x in test.columns]
products.columns = [x.lower() for x in products.columns]
val.columns = [x.lower() for x in val.columns]

crd_no_unique_train = matrix.index.unique()
plu_id_unique_train = matrix.columns.unique()
test = test[test['crd_no'].isin(crd_no_unique_train)]
test = test[test['plu_id'].isin(plu_id_unique_train)]
val = val[val['crd_no'].isin(crd_no_unique_train)]
val = val[val['plu_id'].isin(plu_id_unique_train)]

plu_category_dict = products.set_index('plu_id').to_dict()['level_2_name']
val_facts_dict = dict(val[['crd_no', 'plu_id']].groupby('crd_no').apply(lambda x: x['plu_id'].unique().tolist()))
test_facts_dict = dict(test[['crd_no', 'plu_id']].groupby('crd_no').apply(lambda x: x['plu_id'].unique().tolist()))

plu_price = pd.read_sql('select * from db.plu_price', con=db.engine)
plu_price['mean_price'] = plu_price['mean_price'].astype('float16')
plu_price = dict(plu_price[['plu_id', 'mean_price']].values.tolist())

### Строим модель

In [6]:
model = AlternatingLeastSquares(factors=50, regularization=0.0001, 
                                iterations=20, num_threads=16,
                                calculate_training_loss=True)
model.fit(sparse.csr_matrix(matrix).T.tocsr(), show_progress=True)

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))




### Проверяем метрики

In [73]:
%%time
fz = fuzzy(matrix, model, val_facts_dict, plu_category_dict, weight_by_price=False)
prc = precision(matrix, model, val_facts_dict, weight_by_price=False)
fz_w = fuzzy(matrix, model, val_facts_dict, plu_category_dict, plu_price=plu_price)
prc_w = precision(matrix, model, val_facts_dict, plu_price=plu_price)

CPU times: user 31.7 s, sys: 35.9 s, total: 1min 7s
Wall time: 54.4 s


In [74]:
print('Fuzzy: %s' % fz)
print('Fuzzy Weighted: %s' % fz_w)
print('Precision: %s' % prc)
print('Precision Weighted: %s' % prc_w)

Fuzzy: 0.571240600323802
Fuzzy Weighted: 0.5640391714805725
Precision: 0.025803834737907417
Precision Weighted: 0.12764180505283507
