# Двухуровневая модель рекомендаций товаров для пользователя


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm
tqdm.pandas()
from sklearn.model_selection import train_test_split

from scipy.sparse import csr_matrix


from implicit import als

from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV


import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

    
from src.metrics import precision_at_k, money_precision_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender
from src.preprocessing import new_features, train_test_preprocessing, get_important_features, get_final_recomendation, new_user_features, new_item_features

  from pandas import Panel


In [2]:
pd.set_option('display.max_columns', None)

### Загрузка данных и разделение на тренировочный и валидационных датасет

In [3]:
data = pd.read_csv('data/retail_train.csv')
item_features = pd.read_csv('data/product.csv')
user_features = pd.read_csv('data/hh_demographic.csv')
TEST_1_PATH = 'data/retail_test1.csv'

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)
test_1 = pd.read_csv(TEST_1_PATH)

N=150 # Количество рекомендаций

VAL_SIZE = 5

train_1 = data[data['week_no'] < data['week_no'].max() - (VAL_SIZE)]
val = data[data['week_no'] >= data['week_no'].max() - (VAL_SIZE)]

train_2 = val.copy()

In [4]:
item_features

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,
3,26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ
4,26355,69,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES,14 OZ
...,...,...,...,...,...,...,...
92348,18293142,6384,DRUG GM,National,BOOKSTORE,PAPERBACK BOOKS,
92349,18293439,6393,DRUG GM,National,BOOKSTORE,CHILDRENS LOW END,
92350,18293696,6406,DRUG GM,National,BOOKSTORE,PAPERBACK BEST SELLER,
92351,18294080,6442,DRUG GM,National,BOOKSTORE,PAPERBACK BOOKS,


### Предобработка тренировочного датасета

In [5]:
n_items_before = train_1['item_id'].nunique()
train_1 = prefilter_items(train_1, item_features=item_features, take_n_popular= 3000)
n_items_after = train_1['item_id'].nunique()

print(f'Decreased # items from {n_items_before} to {n_items_after}')

Decreased # items from 85828 to 3001


In [6]:
recommender = MainRecommender(train_1)



HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3001.0), HTML(value='')))




### Получение эмбеддингов

In [7]:
items_emb_df = recommender.items_emb_df
users_emb_df = recommender.users_emb_df

### Создание item-user матрицы и получение новых фич (свойств)

In [8]:
%%time
train = new_features(train_2, train_1, recommender, item_features, user_features, items_emb_df, users_emb_df, N)
train.head(2)

Wall time: 16.2 s


Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc_x,coupon_match_disc,price,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,coupon_disc_y,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,8_x,9_x,10_x,11_x,12_x,13_x,14_x,15_x,16_x,17_x,18_x,19_x,marital_status_code,homeowner_desc,hh_comp_desc,household_size_desc,0_y,1_y,2_y,3_y,4_y,5_y,6_y,7_y,8_y,9_y,10_y,11_y,12_y,13_y,14_y,15_y,16_y,17_y,18_y,19_y,mean_time,age,income,kids,average_basket,sum_per_week,count_purchases_week_mean,sum_purchases_week_mean,target
0,843,40955282722,622,845193,3,5.37,364,-1.5,19,90,0.0,0.0,1.79,5628,GROCERY,National,FROZEN PIZZA,SNACKS/APPETIZERS,10.5 OZ,0.0,0.003492,-0.005722,0.011992,-0.005268,-0.007853,0.005875,-0.004604,0.007576,0.016448,0.00604,0.012484,0.007506,0.003992,0.016161,0.004383,0.003873,0.000216,-0.003894,0.001294,0.005995,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002468,0.013529,1.0
1,843,40955282722,622,845193,3,5.37,364,-1.5,19,90,0.0,0.0,1.79,5628,GROCERY,National,FROZEN PIZZA,SNACKS/APPETIZERS,10.5 OZ,0.0,0.003492,-0.005722,0.011992,-0.005268,-0.007853,0.005875,-0.004604,0.007576,0.016448,0.00604,0.012484,0.007506,0.003992,0.016161,0.004383,0.003873,0.000216,-0.003894,0.001294,0.005995,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002468,0.013529,1.0


In [9]:
X_train = train.drop(['target'], axis=1)
y_train = train[['target']]

In [10]:
cat_feats=[]
for y in X_train.columns:
    if(X_train[y].dtype == np.object):
          cat_feats.append(y)
            
X_train[cat_feats + ['user_id', 'item_id']] = X_train[cat_feats + ['user_id', 'item_id']].astype('category')

In [11]:
test = new_features(data, train_1, recommender, item_features, user_features, items_emb_df, users_emb_df, N)


In [12]:
X_test = test.iloc[:1000].drop(['target'], axis=1)
y_test = test.iloc[:1000][['target']]
X_test[cat_feats + ['user_id', 'item_id']] = X_test[cat_feats + ['user_id', 'item_id']].astype('category')

### Выделение признаков влияющих на результат предсказания модели

In [13]:
%%time

lgb = LGBMClassifier(objective='binary', max_depth=7, categorical_column=cat_feats)

basic_feats = get_important_features(lgb, X_train, y_train)

  return f(**kwargs)


Wall time: 11.6 s


In [14]:
%%time

lgb = LGBMClassifier(objective='binary', max_depth=7, categorical_feature=cat_feats)

lgb.fit(X_train[basic_feats], y_train)

  return f(**kwargs)
Please use categorical_feature argument of the Dataset constructor to pass this parameter.


Wall time: 2.89 s


LGBMClassifier(categorical_feature=['department', 'brand', 'commodity_desc',
                                    'sub_commodity_desc',
                                    'curr_size_of_product',
                                    'marital_status_code', 'homeowner_desc',
                                    'hh_comp_desc', 'household_size_desc'],
               max_depth=7, objective='binary')

In [15]:
X_test[basic_feats]

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,price,manufacturer,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,coupon_disc_y,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,8_x,9_x,10_x,11_x,12_x,13_x,14_x,15_x,16_x,17_x,18_x,19_x,0_y,1_y,2_y,3_y,4_y,5_y,6_y,7_y,8_y,9_y,10_y,11_y,12_y,13_y,14_y,15_y,16_y,17_y,18_y,19_y,mean_time,age,income,average_basket,sum_per_week,count_purchases_week_mean,sum_purchases_week_mean
0,2375,26984851472,1,1004906,1,1.39,364,-0.60,1631,1.39,69,Private,POTATOES,POTATOES RUSSET (BULK&BAG),5 LB,0.000000,0.009265,0.010417,0.013228,0.015150,0.007285,0.006418,0.009918,0.000883,0.007517,0.007781,0.010191,0.012781,0.011288,0.006468,0.004878,0.014056,0.007239,0.016336,0.006110,0.012972,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016228,0.015390
1,2375,26984851472,1,1033142,1,0.82,364,0.00,1631,0.82,2,National,ONIONS,ONIONS SWEET (BULK&BAG),40 LB,0.000000,0.008063,0.004465,0.006509,0.012637,0.007741,0.004115,0.015778,0.004775,0.003408,0.010447,0.014623,0.013791,0.009258,0.000101,0.005548,0.011236,0.001477,0.005004,0.005000,0.005529,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012520,0.013361
2,2375,26984851472,1,1036325,1,0.99,364,-0.30,1631,0.99,69,Private,VEGETABLES - ALL OTHERS,CELERY,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012078,0.008512
3,2375,26984851472,1,1082185,1,1.21,364,0.00,1631,1.21,2,National,TROPICAL FRUIT,BANANAS,40 LB,0.000000,0.010890,0.013841,0.006750,0.015490,0.016713,0.009931,0.016291,0.011073,0.014816,0.011689,0.002030,0.013923,0.008569,0.011732,0.018879,0.012911,0.009024,0.010233,0.006327,0.019095,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010855,0.018306
4,2375,26984851472,1,8160430,1,1.50,364,-0.39,1631,1.50,69,Private,ORGANICS FRUIT & VEGETABLES,ORGANIC CARROTS,1 LB,0.000000,0.001706,-0.000239,-0.007310,0.011478,0.011928,-0.000497,0.010115,0.002725,-0.002502,-0.003023,0.000072,0.001693,0.004604,0.001232,0.018710,0.007616,-0.000658,0.005872,0.008172,0.005110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.053761,0.051871
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,293,27008831408,3,840361,1,0.60,296,-0.79,1501,0.60,69,Private,EGGS,EGGS - LARGE,1 DZ,-0.000486,0.009257,0.009978,0.007040,0.011856,0.011973,0.008465,0.012047,0.005903,0.009627,0.011271,0.009511,0.014436,0.009916,0.008828,0.006419,0.016274,0.008208,0.014606,0.009898,0.013203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004266,0.005601
996,293,27008831408,3,859662,1,2.79,296,0.00,1501,2.79,194,National,CONVENIENT BRKFST/WHLSM SNACKS,FRUIT SNACKS,5.4 OZ,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.162016
997,293,27008831408,3,862728,1,3.49,296,0.00,1501,3.49,781,National,LUNCHMEAT,PEPPERONI/SALAMI,6 OZ,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018761,0.026458
998,293,27008831408,3,866227,1,0.50,296,0.00,1501,0.50,2,National,BREAKFAST SWEETS,SW GDS:DONUTS,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.062500,0.043573


In [16]:
%%time
preds = lgb.predict(X_test[basic_feats])
test_preds_proba = lgb.predict_proba(X_test[basic_feats])[:, 1]

Wall time: 61.8 ms


In [17]:
result_train = get_final_recomendation(X_test, test_preds_proba, data, train_1, item_features)

100%|██████████████████████████████████████████████████████████████████████████████| 2499/2499 [34:23<00:00,  1.21it/s]


In [18]:
df_price = train_1.groupby('item_id')['price'].mean().reset_index()

## money precision @k для train

In [19]:
result_train.apply(lambda row: money_precision_at_k(row['recomendations'], row['actual'], df_price), axis=1).mean()

0.2565460984393755

In [20]:
test_2 = new_features(test_1, train_1, recommender, item_features, user_features, items_emb_df, users_emb_df, N)
X_test_2 = test_2.drop(['target'], axis=1)
y_test_2 = test_2[['target']]
X_test_2[cat_feats + ['user_id', 'item_id']] = X_test_2[cat_feats + ['user_id', 'item_id']].astype('category')

In [21]:
test_preds_proba = lgb.predict_proba(X_test_2[basic_feats])[:, 1]
result = get_final_recomendation(X_test_2, test_preds_proba, test_1, train_1, item_features)

100%|██████████████████████████████████████████████████████████████████████████████| 1885/1885 [06:24<00:00,  4.90it/s]


###  Получение money precision @k для тестового датасета

In [22]:
result.apply(lambda row: money_precision_at_k(row['recomendations'], row['actual'], df_price), axis=1).mean()

0.5583716180371352

In [23]:
result.drop('actual', axis=1, inplace=True)

In [24]:
result.to_csv('recommendatinons.csv', index=False)