### Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

import warnings
warnings.simplefilter("ignore")

pd.set_option("display.max_columns", 999)
pd.options.display.float_format = '{:,.2f}'.format

___

### Constants

In [2]:
TRAIN = './data/retail_train.csv'
TEST = './data/retail_test1.csv'
TRANSACTION = './data/transaction_data.csv'
PRODUCT = './data/product.csv'
USER = './data/hh_demographic.csv'

ITEM_COL = 'item_id'
USER_COL = 'user_id'
ACTUAL_COL = 'actual'

N_PREDICT = 500 
TOP_K_RECALL = 50
TOP_K_PRECISION = 5

___

### Functions

In [3]:
def print_stats_data(df_data, name_df):
    print(name_df)
    print(60 * '=')
    print(f"Shape: {df_data.shape}\tUsers: {df_data[USER_COL].nunique()}\t"
          f"Items: {df_data[ITEM_COL].nunique()}\n")

In [4]:
def calc_precision(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, \
        df_data.apply(lambda row: precision_at_k(row[col_name], 
                                                 row[ACTUAL_COL], 
                                                 k=top_k), axis=1).mean()

In [5]:
def calc_recall(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, \
        df_data.apply(lambda row: recall_at_k(row[col_name], 
                                              row[ACTUAL_COL], 
                                              k=top_k), axis=1).mean()

In [6]:
# функция переранжирования в зависимости от предсказанного скора
def rerank(user_id):
    return df_ranker_predict[df_ranker_predict[USER_COL]==user_id].sort_values('proba_item_purchase', 
                                                                               ascending=False).head(5)['item_id'].tolist()

In [7]:
def day_time(trans_time):
    """
     с 0 до 6 часов — ночь 
     с 6 до 12 часов — утро 
     с 12 до 18 часов — день 
     с 18 до 24 часов — вечер
    """
    if 0 <= trans_time < 600:
        return 'night'
    elif 600 <= trans_time < 1200:
        return 'morning'
    elif 1200 <= trans_time < 1800:
        return 'day'
    elif 1800 <= trans_time <= 2359:
        return 'evening'
    else:
        return None

___

### Preprocessing

In [8]:
df_train = pd.read_csv(TRAIN)
df_test = pd.read_csv(TEST)
df_trans = pd.read_csv(TRANSACTION)
df_prod = pd.read_csv(PRODUCT)
df_user = pd.read_csv(USER)

In [9]:
df_train.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [10]:
df_test.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,0.0,0.0
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,0.0,0.0


In [11]:
df_user.head(2)

Unnamed: 0,AGE_DESC,MARITAL_STATUS_CODE,INCOME_DESC,HOMEOWNER_DESC,HH_COMP_DESC,HOUSEHOLD_SIZE_DESC,KID_CATEGORY_DESC,household_key
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [12]:
df_prod.head(2)

Unnamed: 0,PRODUCT_ID,MANUFACTURER,DEPARTMENT,BRAND,COMMODITY_DESC,SUB_COMMODITY_DESC,CURR_SIZE_OF_PRODUCT
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [13]:
# Переведем названия столбцов в малые буквы
df_trans.columns = list(map(str.lower, df_test.columns))
df_prod.columns = list(map(str.lower, df_prod.columns))
df_user.columns = list(map(str.lower, df_user.columns))

# переименуем столбцы
df_prod.rename(columns={'product_id': ITEM_COL}, inplace=True)
df_trans.rename(columns={'household_key': USER_COL }, inplace=True)
df_user.rename(columns={'household_key': USER_COL }, inplace=True)

**Найдем "холодных" юзеров в тесте**

In [14]:
df_test[~df_test[USER_COL].isin(df_train[USER_COL])][USER_COL].nunique()

1

In [15]:
# один юзер с 10ю покупками
df_test[~df_test[USER_COL].isin(df_train[USER_COL])]

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
24436,2325,41756111316,671,849274,2,8.0,427,-2.38,1530,97,0.0,0.0
24437,2325,41756111316,671,863885,1,3.69,427,0.0,1530,97,0.0,0.0
24438,2325,41756111316,671,872137,1,3.65,427,0.0,1530,97,0.0,0.0
24439,2325,41756111316,671,877913,1,2.79,427,0.0,1530,97,0.0,0.0
24440,2325,41756111316,671,883932,2,4.38,427,0.0,1530,97,0.0,0.0
24441,2325,41756111316,671,965208,1,3.49,427,0.0,1530,97,0.0,0.0
24442,2325,41756111316,671,1106116,1,3.69,427,0.0,1530,97,0.0,0.0
24443,2325,41756111316,671,5979276,1,1.5,427,-0.49,1530,97,0.0,0.0
24444,2325,41756111316,671,16729299,2,6.38,427,-2.2,1530,97,0.0,0.0
52379,2325,41834501725,677,18055532,1,7.99,367,0.0,1145,97,0.0,0.0


In [16]:
# дропнем этого пользователя из теста
df_test.drop(df_test[~df_test[USER_COL].isin(df_train[USER_COL])].index, inplace=True)

In [17]:
# берем данные для тренировки matching модели (модель, выдающая список рекомендаций)
data_train_matcher = df_train[df_train['week_no'] < df_train['week_no'].max() - 6]

# берем данные для валидации matching модели
data_val_matcher = df_train[df_train['week_no'] >= df_train['week_no'].max() - 6]


# берем данные для тренировки ranking модели (модель, ранжирующая выдачу предыдущей модели)
data_train_ranker = data_val_matcher.copy()
# берем данные для теста ranking, matching модели
data_val_ranker = df_test

In [18]:
# prefilter
n_items_before = data_train_matcher['item_id'].nunique()

data_train_matcher = prefilter_items(data_train_matcher, 
                                     item_features=df_prod, 
                                     take_n_popular=3550)

n_items_after = data_train_matcher['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 85334 to 3551


In [19]:
common_users = data_train_matcher['user_id'].values

data_val_matcher = data_val_matcher[data_val_matcher['user_id'].isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker['user_id'].isin(common_users)]
data_val_ranker = data_val_ranker[data_val_ranker['user_id'].isin(common_users)]

print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (896610, 13)	Users: 2496	Items: 3551

val_matcher
Shape: (203235, 12)	Users: 2194	Items: 30037

train_ranker
Shape: (203235, 12)	Users: 2194	Items: 30037

val_ranker
Shape: (88665, 12)	Users: 1883	Items: 20492



### Recommender part

In [20]:
%%time
recommender = MainRecommender(data_train_matcher, weighting='bm25')

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/3551 [00:00<?, ?it/s]

  0%|          | 0/3551 [00:00<?, ?it/s]

CPU times: user 5.15 s, sys: 187 ms, total: 5.34 s
Wall time: 2.99 s


In [21]:
%%time
# создаем датафрейм со всеми уникальными юзерами из тренировочного
df_match_candidates = pd.DataFrame(data_train_ranker[USER_COL].unique(), columns=[USER_COL])
# для каждого делаем предсказания get_own_recommendation (ItemItem)
df_match_candidates['candidates'] = df_match_candidates[USER_COL].apply(lambda x: 
                                                                        recommender.get_own_recommendations(x, 
                                                                                                            N=N_PREDICT))

CPU times: user 9.33 s, sys: 105 ms, total: 9.43 s
Wall time: 8.4 s


In [22]:
# разворачиваем все это в датафрейм, где в одной строке один юзер и один итем
df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), 
                                     axis=1).stack().reset_index(level=1, drop=True)
df_items.name = 'item_id'
df_match_candidates = df_match_candidates.drop('candidates', axis=1).join(df_items)

In [23]:
# создадим датафрейм из тренировочного, и так как товары здесь были реально куплены проставим
# в таргете 1
df_ranker_train = data_train_ranker[[USER_COL, ITEM_COL]].copy()
df_ranker_train['target'] = 1

In [24]:
# для нулей мерджим с нашим подготовленным датасетом предсказанных кандидатов
df_ranker_train = df_match_candidates.merge(df_ranker_train, on=[USER_COL, ITEM_COL], how='left')

# чистим дубликаты
df_ranker_train = df_ranker_train.drop_duplicates(subset=[USER_COL, ITEM_COL])

# заполняем nan нулями
df_ranker_train['target'].fillna(0, inplace= True)

In [25]:
df_ranker_train['target'].value_counts(normalize=True)

0.00   0.97
1.00   0.03
Name: target, dtype: float64

### Feature engineering

In [26]:
# намерджим к нашему датасету фичей из item features датасета
df_ranker_train = df_ranker_train.merge(df_prod, on='item_id', how='left')

In [27]:
df_ranker_train.head()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,84,903529,1.0,539,DRUG GM,National,CIGARETTES,CIGARETTES,972976 PK
1,84,920025,1.0,764,GROCERY,National,LAUNDRY ADDITIVES,FABRIC SOFTENER LIQUID,60 LOAD
2,84,829722,0.0,70,GROCERY,National,LAUNDRY DETERGENTS,LIQUID LAUNDRY DETERGENTS,32 LOAD
3,84,987518,0.0,415,GROCERY,National,LAUNDRY DETERGENTS,LIQUID LAUNDRY DETERGENTS,100 OZ
4,84,901061,0.0,608,GROCERY,National,FRZN MEAT/MEAT DINNERS,FRZN BREADED PREPARED CHICK,28.8 OZ


___

In [28]:
%%time
# Для создания фичей смерджим датафрейм с покупками и с данными по товарам
data_item_df = pd.merge(df_train, df_prod, on='item_id', how='left')

# data_item_df = data_item_df[data_item_df['user_id'].isin(common_users)]

# средний чек каждого юзера по корзинам
data_item_df = data_item_df.merge(data_item_df.groupby(by=['user_id', 'basket_id']).agg('sales_value').mean().reset_index().groupby(['user_id'])['sales_value'].mean().rename('mean_basket'), how='left',on=USER_COL)

# средний чек юзера по каждой из категорий
data_item_df = data_item_df.merge(data_item_df.groupby(by=['user_id', 'commodity_desc']).agg('sales_value').mean().rename('mean_commodity'), how='left',on=['user_id', 'commodity_desc'])

# количество покупок на каждую категорию
data_item_df = data_item_df.merge(data_item_df.groupby(by=['user_id', 'commodity_desc']).agg('quantity').count().rename('quantity_commodity'), how='left',on=['user_id', 'commodity_desc'])

# Частотность покупок раз/месяц
# для начала разобьем наши дни на месяцы (несколько приблизительно, по 30 дней)
data_item_df['month'] = data_item_df['day'] // 30

data_item_df = data_item_df.merge(data_item_df.groupby(by=['user_id', 'month']).agg('quantity').count().rename('month_quantity'), how='left',on=['user_id', 'month'])

# Доля покупок в выходные
# создадим функцию определения субботы/воскресенья в зависимости от дня/недели
weekends = lambda day, week: (day == week * 7 - 2) or (day == week * 7 - 3)

data_item_df['weekend_buy'] = data_item_df.apply(lambda x: weekends(x['day'], x['week_no']) * 1, 
                                                 axis=1)

# Долю покупок утром/днем/вечером
data_item_df['day_time'] = data_item_df.apply(lambda x: day_time(x['trans_time']), axis=1)

# среднее количество покупок товара в неделю
data_item_df = data_item_df.merge(data_item_df.groupby(by=['item_id', 'week_no']).agg('quantity').count().reset_index().groupby('item_id')['quantity'].mean().rename('item_week_quantity'), how='left',on=ITEM_COL)

# Среднее кол-во покупок 1 товара в категории в неделю
data_item_df = data_item_df.merge(data_item_df.groupby(by=['item_id', 'commodity_desc', 'week_no']).agg('quantity').mean().reset_index().groupby(['item_id', 'commodity_desc'])['quantity'].mean().rename('item_commodity_quantity'), how='left',on=['item_id', 'commodity_desc'])


# (Кол-во покупок в неделю) / (Среднее кол-во покупок 1 товара в категории в неделю)
data_item_df['week-commodity'] = data_item_df['item_week_quantity'] / data_item_df['item_commodity_quantity']
# цена
data_item_df['price'] = data_item_df.apply(lambda x: x['sales_value'] / x['quantity'] if x['quantity'] != 0 else 0, axis=1)
data_item_df = data_item_df.merge(data_item_df.groupby(by=['user_id', 'commodity_desc', 'week_no']).agg('quantity').count().rename('user_com_week'), how='left',on=['user_id', 'commodity_desc', 'week_no'])
data_item_df = data_item_df.merge(data_item_df.groupby(by=['commodity_desc', 'week_no']).agg('quantity').count().rename('item_com_week'), how='left',on=['commodity_desc', 'week_no'])
# (Кол-во покупок юзером конкретной категории в неделю) - 
# (Среднее кол-во покупок всеми юзерами конкретной категории в неделю)
data_item_df['user_item_diff'] = data_item_df['user_com_week'] - data_item_df['item_com_week']
# (Кол-во покупок юзером конкретной категории в неделю) / 
# (Среднее кол-во покупок всеми юзерами конкретной категории в неделю)
data_item_df['user_item_diff'] = data_item_df['user_com_week'] / data_item_df['item_com_week']

item_freq = data_item_df['item_id'].value_counts()

data_item_df['item_fq_log'] = data_item_df.apply(lambda x: np.log(item_freq[x['item_id']] / data_item_df.shape[0] + 1e-2), axis=1)
data_item_df['item_fq_log_denom'] = data_item_df.apply(lambda x: item_freq[x['item_id']] / (1 + np.log(data_item_df.shape[0])), axis=1)

CPU times: user 2min 56s, sys: 5.22 s, total: 3min 2s
Wall time: 3min 2s


In [29]:
%%time
# своровано с урока. Много буста метрики не дает, но чем богаты
data_item_df = data_item_df.merge(data_item_df.groupby(by=ITEM_COL).agg('sales_value').sum().rename('total_item_sales_value'), how='left',on=ITEM_COL)
data_item_df = data_item_df.merge(data_item_df.groupby(by=ITEM_COL).agg('quantity').sum().rename('total_quantity_value'), how='left',on=ITEM_COL)
data_item_df = data_item_df.merge(data_item_df.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq'), how='left',on=ITEM_COL)
data_item_df = data_item_df.merge(data_item_df.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq'), how='left',on=USER_COL)
data_item_df = data_item_df.merge(data_item_df.groupby(by=USER_COL).agg('sales_value').sum().rename('total_user_sales_value'), how='left',on=USER_COL)
data_item_df = data_item_df.merge(data_item_df.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_week')/data_item_df.week_no.nunique(), how='left',on=ITEM_COL)
data_item_df = data_item_df.merge(data_item_df.groupby(by=USER_COL).agg('quantity').sum().rename('user_quantity_per_week')/data_item_df.week_no.nunique(), how='left',on=USER_COL)
data_item_df = data_item_df.merge(data_item_df.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_basket')/data_item_df.basket_id.nunique(), how='left',on=ITEM_COL)
data_item_df = data_item_df.merge(data_item_df.groupby(by=USER_COL).agg('quantity').sum().rename('user_quantity_per_baskter')/data_item_df.basket_id.nunique(), how='left',on=USER_COL)
data_item_df = data_item_df.merge(data_item_df.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq_per_basket')/data_item_df.basket_id.nunique(), how='left',on=ITEM_COL)
data_item_df = data_item_df.merge(data_item_df.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq_per_basket')/data_item_df.basket_id.nunique(), how='left',on=USER_COL)


CPU times: user 8.03 s, sys: 944 ms, total: 8.97 s
Wall time: 9 s


In [30]:
data_item_df.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,mean_basket,mean_commodity,quantity_commodity,month,month_quantity,weekend_buy,day_time,item_week_quantity,item_commodity_quantity,week-commodity,price,user_com_week,item_com_week,user_item_diff,item_fq_log,item_fq_log_denom,total_item_sales_value,total_quantity_value,item_freq,user_freq,total_user_sales_value,item_quantity_per_week,user_quantity_per_week,item_quantity_per_basket,user_quantity_per_baskter,item_freq_per_basket,user_freq_per_basket
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0,69,PRODUCE,Private,POTATOES,POTATOES RUSSET (BULK&BAG),5 LB,3.87,7.58,14,0,42,0,day,39.75,1.06,37.61,1.39,1,9,0.11,-4.47,212.82,8765.53,3675,3339,657,2486.42,38.68,10.55,0.01,0.0,0.01,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0,2,PRODUCE,National,ONIONS,ONIONS SWEET (BULK&BAG),40 LB,3.87,2.08,13,0,42,0,day,25.63,1.0,25.56,0.82,1,14,0.07,-4.51,150.29,2242.71,2371,2358,657,2486.42,24.96,10.55,0.01,0.0,0.01,0.0


___

In [31]:
feat_cols = [
    'user_id',
    'item_id',
    'mean_basket',
    'mean_commodity',
    'quantity_commodity',
    'month',
    'month_quantity',
    'weekend_buy',
    'day_time',
    'item_week_quantity',
    'item_commodity_quantity',
    'week-commodity',
    'price',
    'user_com_week',
    'item_com_week',
    'user_item_diff',
    'item_fq_log',
    'item_fq_log_denom',
    'total_item_sales_value',
    'total_quantity_value',
    'item_freq',
    'user_freq',
    'total_user_sales_value',
    'item_quantity_per_week',
    'user_quantity_per_week',
    'item_quantity_per_basket',
    'user_quantity_per_baskter',
    'item_freq_per_basket',
    'user_freq_per_basket'
]

In [32]:
cat_feats = [
    'manufacturer',
    'department',
    'brand',
    'commodity_desc',
    'sub_commodity_desc',
    'curr_size_of_product',
    'weekend_buy',
    'day_time',
]

### Ранжирующая модель

In [33]:
df_ranker_train = df_ranker_train.merge(data_item_df[feat_cols], on=['user_id', 'item_id'], how='left')

In [34]:
X_train = df_ranker_train.drop('target', axis=1)
y_train = df_ranker_train[['target']]

In [35]:
# Переводим в категориальный формат
X_train[cat_feats] = X_train[cat_feats].astype('category')

In [36]:
# нормируем вещественные столбцы
scaler = StandardScaler()
scale_feats = X_train.select_dtypes(include=['float64']).columns

X_train[scale_feats] = scaler.fit_transform(X_train[scale_feats])

In [37]:
# grid search буста не дал, поэтому используем вебинарный вариант
lgb = LGBMClassifier(objective='binary',
                     max_depth=8,
                     n_estimators=300,
                     learning_rate=0.05,
                     categorical_column=cat_feats)

lgb.fit(X_train, y_train)
train_preds = lgb.predict_proba(X_train)

In [38]:
df_ranker_predict = df_ranker_train.copy()
df_ranker_predict['proba_item_purchase'] = train_preds[:,1]

In [39]:
result_eval_ranker = data_val_ranker.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_ranker.columns=[USER_COL, ACTUAL_COL]

In [40]:
result_eval_ranker['feat_reranked_own_rec'] = result_eval_ranker[USER_COL].apply(lambda user_id: rerank(user_id))

In [41]:
print(*sorted(calc_precision(result_eval_ranker, TOP_K_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('feat_reranked_own_rec', 0.2086235489220534)


### Итоговая метрика: 0.2086