<a href="https://colab.research.google.com/github/ddekun/Recommendation_systems/blob/lesson6/lesson6/hw_webinar_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Вебинар 6. Двухуровневые модели рекомендаций


Код для src, utils, metrics вы можете скачать из [этого](https://github.com/geangohn/recsys-tutorial) GitHub-репозитория.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

In [None]:
# Load the raw transactions plus the item and user metadata tables
data = pd.read_csv('retail_train.csv')
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')

# Column processing: lower-case the metadata headers, then unify the id column names
item_features.columns = item_features.columns.str.lower()
user_features.columns = user_features.columns.str.lower()

item_features = item_features.rename(columns={'product_id': 'item_id'})
user_features = user_features.rename(columns={'household_key': 'user_id'})


# The train / validation scheme matters!
# -- older purchases -- | -- 6 weeks -- | -- 3 weeks --
# Tune the size of the 2nd dataset (6 weeks) with a learning curve (recall@k as a function of dataset size)
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

last_week = data['week_no'].max()
lvl_1_val_start = last_week - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)
lvl_2_val_start = last_week - val_lvl_2_size_weeks

data_train_lvl_1 = data[data['week_no'] < lvl_1_val_start]
data_val_lvl_1 = data[(data['week_no'] >= lvl_1_val_start) & (data['week_no'] < lvl_2_val_start)]

# Level-2 training uses the level-1 validation window; copied so later changes do not touch data_val_lvl_1
data_train_lvl_2 = data_val_lvl_1.copy()
# NOTE(review): `>=` keeps week numbers last_week-3 .. last_week, i.e. up to 4 distinct values
# rather than val_lvl_2_size_weeks = 3 -- confirm this is intended.
data_val_lvl_2 = data[data['week_no'] >= lvl_2_val_start]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [None]:
# Compare the number of distinct items before and after pre-filtering the level-1 training data
n_items_before = data_train_lvl_1['item_id'].nunique()

data_train_lvl_1 = prefilter_items(
    data_train_lvl_1,
    item_features=item_features,
    take_n_popular=5000,
)

n_items_after = data_train_lvl_1['item_id'].nunique()
print(f'Decreased # items from {n_items_before} to {n_items_after}')

Decreased # items from 83685 to 5001


In [None]:
# Build the level-1 recommender from the pre-filtered level-1 training transactions
recommender = MainRecommender(data_train_lvl_1)



HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))




In [None]:
# Smoke check: ALS recommendations for user 2375
recommender.get_als_recommendations(2375, N=5)

[899624, 1106523, 1044078, 871756, 844179]

In [None]:
# Smoke check: "own" recommendations for user 2375
recommender.get_own_recommendations(2375, N=5)

[948640, 918046, 847962, 907099, 873980]

In [None]:
# Smoke check: similar-items recommendations for user 2375
# NOTE(review): the stored output below contains item 1044078 twice -- confirm whether duplicates are expected.
recommender.get_similar_items_recommendation(2375, N=5)

[1046545, 1044078, 1044078, 1078652, 1018809]

In [None]:
# Smoke check: similar-users recommendations for user 2375
recommender.get_similar_users_recommendation(2375, N=5)

[1101502, 979674, 10457044, 974265, 959455]

### Задание 1

A) Попробуйте различные варианты генерации кандидатов. Какие из них дают наибольший recall@k ?
- Пока пробуем отобрать 50 кандидатов (k=50)
- Качество измеряем на data_val_lvl_1: следующие 6 недель после трейна

Дают ли own recommendations + top-popular лучший recall?  

B)* Как зависит recall@k от k? Постройте для одной схемы генерации кандидатов эту зависимость для k = {20, 50, 100, 200, 500}  
C)* Исходя из прошлого вопроса, как вы думаете, какое значение k является наиболее разумным?


In [None]:
# One row per user of the level-1 validation window with the distinct items they bought there
result_lvl_1 = (
    data_val_lvl_1
    .groupby('user_id')['item_id']
    .unique()
    .reset_index()
    .rename(columns={'item_id': 'actual'})
)
result_lvl_1.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


In [None]:
# Users present in the level-1 validation window but absent from the level-1 training data
cold_users = np.setdiff1d(result_lvl_1['user_id'], data_train_lvl_1['user_id']).tolist()
# .copy() detaches the filtered frame from the original, so the columns added to it
# in the following cells do not raise SettingWithCopyWarning.
result_lvl_1 = result_lvl_1[~result_lvl_1['user_id'].isin(cold_users)].copy()

In [None]:
# 50 candidates per user from every level-1 generator
result_lvl_1['als_rec'] = result_lvl_1['user_id'].map(
    lambda user: recommender.get_als_recommendations(user, N=50))
result_lvl_1['sim_items_rec'] = result_lvl_1['user_id'].map(
    lambda user: recommender.get_similar_items_recommendation(user, N=50))
result_lvl_1['own_rec'] = result_lvl_1['user_id'].map(
    lambda user: recommender.get_own_recommendations(user, N=50))
# The overall top purchases are the same for every user, so the slice is simply repeated per row
result_lvl_1['top_rec'] = [recommender.overall_top_purchases[:50] for _ in range(len(result_lvl_1))]
result_lvl_1.head(5)

In [None]:
def _mean_recall_at_50(column):
    """Average recall@50 over the rows of result_lvl_1 for the candidate lists stored in `column`."""
    return result_lvl_1.apply(lambda row: recall_at_k(row[column], row['actual'], k=50), axis=1).mean()


recall_als = _mean_recall_at_50('als_rec')
recall_sim_items = _mean_recall_at_50('sim_items_rec')
recall_own = _mean_recall_at_50('own_rec')
recall_top = _mean_recall_at_50('top_rec')

print('Recall@50:')
for label, value in (
    ('als_rec', recall_als),
    ('sim_items_rec', recall_sim_items),
    ('own_rec', recall_own),
    ('top_rec', recall_top),
):
    # Labels are left-aligned to the width of the longest one ('sim_items_rec')
    print(f'{label:<13}: {value:.6f}')

In [None]:
# "Own" candidate lists of several sizes, used for the recall@k-versus-k comparison
# (the N=50 list already exists as the 'own_rec' column)
for n_candidates in (20, 100, 200, 500):
    result_lvl_1[f'own_rec{n_candidates}'] = result_lvl_1['user_id'].apply(
        lambda x: recommender.get_own_recommendations(x, N=n_candidates))
result_lvl_1.head(5)

In [None]:
# Mean recall@k of the "own" candidates for each k, keyed by k (insertion order = ascending k)
own_rec_columns = {20: 'own_rec20', 50: 'own_rec', 100: 'own_rec100', 200: 'own_rec200', 500: 'own_rec500'}
own_recall_by_k = {
    k: result_lvl_1.apply(lambda row: recall_at_k(row[column], row['actual'], k=k), axis=1).mean()
    for k, column in own_rec_columns.items()
}
rec_own20, rec_own50, rec_own100, rec_own200, rec_own500 = own_recall_by_k.values()

for k, value in own_recall_by_k.items():
    # k is padded to three characters so the colons line up
    print(f'Own Recall@{k:<3}: {value:.6f}')

In [None]:
# Recall@k of the "own" candidate generator as a function of the number of candidates k
k_values = [20, 50, 100, 200, 500]
recall = [rec_own20, rec_own50, rec_own100, rec_own200, rec_own500]

fig, ax = plt.subplots()
ax.plot(k_values, recall, marker='o')
# Label the figure so it can be read on its own when the notebook is skimmed
ax.set(xlabel='k (number of candidates)', ylabel='recall@k', title='Own recommendations: recall@k vs k')
plt.show()

### Задание 2.

Обучите модель 2-ого уровня, при этом:
    - Добавьте минимум по 2 фичи для юзера, товара и пары юзер-товар
    - Измерьте отдельно precision@5 модели 1-ого уровня и двухуровневой модели на data_val_lvl_2
    - Вырос ли precision@5 при использовании двухуровневой модели?

In [None]:
# Peek at the transactions of the level-2 validation window
data_val_lvl_2.head()

In [None]:
# Per-user "own" candidate lists of size 500, later used as the level-1 baseline
own_candidates = result_lvl_1.loc[:, ['user_id', 'own_rec500']]
own_candidates.head()

In [None]:
# Distinct items each user bought in the level-2 validation window, joined with their level-1 candidates.
# The left join keeps users that have no candidates (their own_rec500 is NaN).
valid_lvl_2 = (
    data_val_lvl_2
    .groupby('user_id')['item_id']
    .unique()
    .reset_index()
    .rename(columns={'item_id': 'actual'})
    .merge(own_candidates, on='user_id', how='left')
)

In [None]:
# Peek at the actual purchases joined with the level-1 candidates
valid_lvl_2.head()

In [None]:
# Precision@5 of the level-1 candidates on the level-2 validation window
# (users without a candidate list are excluded)
(
    valid_lvl_2
    .loc[lambda frame: frame['own_rec500'].notna()]
    .apply(lambda row: precision_at_k(row['own_rec500'], row['actual'], k=5), axis=1)
    .mean()
)

In [None]:
# New item features
def new_item_features(data, item_features):
    """Add price and sales-volume features to the item metadata.

    Parameters
    ----------
    data : pd.DataFrame
        Transactions with the columns 'item_id', 'sales_value', 'quantity' and 'week_no'.
        The frame is not modified.
    item_features : pd.DataFrame
        Item metadata with the columns 'item_id' and 'commodity_desc'.

    Returns
    -------
    pd.DataFrame
        `item_features` with four extra columns:
        'price' (mean unit price of the item), 'avg_price' (mean unit price over all
        transactions of the item's 'commodity_desc'), 'quantity_of_sales' (number of
        transaction rows of the item) and 'quantity_of_sales_per_week'.
        Items without transactions get NaN prices and zero sales.
    """
    derived_columns = ['price', 'avg_price', 'quantity_of_sales', 'quantity_of_sales_per_week']
    # Drop derived columns left over from a previous call so that re-running does not
    # produce suffixed duplicates ('price_x', 'price_y', ...).
    base = item_features.drop(columns=derived_columns, errors='ignore')

    # Unit price per transaction row; quantity is floored at 1 to avoid division by zero.
    # assign() returns a new frame, so the caller's transactions do not gain a 'price' column.
    transactions = data.assign(price=data['sales_value'] / np.maximum(data['quantity'], 1))
    sales = base.merge(transactions, on='item_id', how='left')

    # Mean price of each item and of each category, under explicit, distinct names
    item_price = sales.groupby('item_id')['price'].mean().reset_index()
    avg_price_by_cat = (
        sales.groupby('commodity_desc')['price'].mean().rename('avg_price').reset_index()
    )

    # Number of transaction rows per item, in total and per distinct week in the data
    item_qnt = sales.groupby('item_id')['quantity'].count().rename('quantity_of_sales').reset_index()
    item_qnt['quantity_of_sales_per_week'] = item_qnt['quantity_of_sales'] / sales['week_no'].nunique()

    return (
        base
        .merge(item_price, on='item_id', how='left')
        .merge(avg_price_by_cat, on='commodity_desc', how='left')
        .merge(item_qnt, on='item_id', how='left')
    )

In [None]:
# New user features
def new_user_features(data, user_features):
    """Add spending features to the user metadata.

    Parameters
    ----------
    data : pd.DataFrame
        Transactions with the columns 'user_id', 'basket_id', 'sales_value' and 'week_no'.
    user_features : pd.DataFrame
        User metadata with a 'user_id' column.

    Returns
    -------
    pd.DataFrame
        `user_features` with two extra columns:
        'average_basket' (total spend divided by the number of distinct baskets) and
        'sum_per_week' (total spend divided by the number of distinct weeks in the data).
        Users without transactions get NaN for 'average_basket'.
    """
    derived_columns = ['average_basket', 'sum_per_week']
    # Drop derived columns left over from a previous call so that re-running is safe
    base = user_features.drop(columns=derived_columns, errors='ignore')

    purchases = base.merge(data, on='user_id', how='left')
    per_user = purchases.groupby('user_id')

    total_spend = per_user['sales_value'].sum()
    # A basket spans several transaction rows, so count distinct basket ids rather than rows
    baskets_qnt = per_user['basket_id'].nunique()

    spending = pd.DataFrame({
        'average_basket': total_spend / baskets_qnt,
        'sum_per_week': total_spend / purchases['week_no'].nunique(),
    }).reset_index()

    return base.merge(spending, on='user_id')

In [None]:
# Derive the extra item and user features from the level-2 training transactions.
# NOTE(review): both names are rebound to the augmented frames, so re-running this cell
# passes already-augmented frames back into the functions.
item_features = new_item_features(data_train_lvl_2, item_features)
user_features = new_user_features(data_train_lvl_2, user_features)

In [None]:
# Item metadata with the derived features added
item_features.head()

In [None]:
# User metadata with the derived features added
user_features.head()

In [None]:
def train_test_preprocessing(data):
    """Build the level-2 design matrix and target for the users found in `data`.

    For every user of `data` that is also present in the level-1 training data, 500
    candidates are taken from `recommender.get_own_recommendations`. Each
    (user, candidate) pair becomes one row; the target is 1 when the user bought that
    item in `data` and 0 otherwise. Item and user features are then joined on.

    Relies on the notebook-level names `data_train_lvl_1`, `recommender`,
    `item_features` and `user_features`.

    Parameters
    ----------
    data : pd.DataFrame
        Transactions with at least the columns 'user_id' and 'item_id'.

    Returns
    -------
    X : pd.DataFrame
        One row per unique (user_id, item_id) candidate pair with the joined features.
    y : np.ndarray
        Target aligned with the rows of X.
    """
    train_users = data_train_lvl_1['user_id'].unique()

    users_lvl_2 = pd.DataFrame({'user_id': data['user_id'].unique()})
    # .copy() so that adding a column to the filtered frame does not warn
    users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)].copy()
    users_lvl_2['item_id'] = users_lvl_2['user_id'].apply(
        lambda x: recommender.get_own_recommendations(x, N=500))

    # One row per (user, candidate item). explode() leaves an object column, so cast the
    # ids back to the dtype used in the transactions for the merges below.
    candidates = (
        users_lvl_2
        .explode('item_id')
        .dropna(subset=['item_id'])
        .astype({'item_id': data['item_id'].dtype})
        .drop_duplicates(['user_id', 'item_id'])
        .reset_index(drop=True)
    )

    # De-duplicate purchases: an item bought several times must not multiply its candidate row
    purchases = data[['user_id', 'item_id']].drop_duplicates().assign(target=1)

    targets_lvl_2 = candidates.merge(purchases, on=['user_id', 'item_id'], how='left')
    # Candidates that were not bought have no match in the merge, hence target 0
    targets_lvl_2['target'] = targets_lvl_2['target'].fillna(0)

    targets_lvl_2 = targets_lvl_2.merge(item_features, on='item_id', how='left')
    targets_lvl_2 = targets_lvl_2.merge(user_features, on='user_id', how='left')

    X = targets_lvl_2.drop('target', axis=1)
    y = targets_lvl_2['target'].to_numpy()

    return X, y

In [None]:
# Level-2 training set built from the level-2 training transactions
X_train, y_train = train_test_preprocessing(data_train_lvl_2)

In [None]:
# First rows of the design matrix, transposed so that all columns are visible
X_train.head().T

In [None]:
# Columns handed to the model as pandas 'category' dtype
cat_feats = [
    'manufacturer',
    'department',
    'brand',
    'commodity_desc',
    'sub_commodity_desc',
    'curr_size_of_product',
    'age_desc',
    'marital_status_code',
    'income_desc',
    'homeowner_desc',
    'hh_comp_desc',
    'household_size_desc',
    'kid_category_desc',
]

# Level-2 test set built from the level-2 validation transactions
X_test, y_test = train_test_preprocessing(data_val_lvl_2)

X_train = X_train.astype({column: 'category' for column in cat_feats})
X_test = X_test.astype({column: 'category' for column in cat_feats})

In [None]:
%%time
# Level-2 model: binary LightGBM classifier fitted on all columns of X_train
lgb = LGBMClassifier(objective='binary', max_depth = 7)
lgb.fit(X_train, y_train)

In [None]:
# Importance of every training column as reported by the fitted classifier
feature = pd.DataFrame(
    list(zip(X_train.columns.tolist(), lgb.feature_importances_)),
    columns=['feature', 'value'],
)
feature

In [None]:
def get_important_features(model, X_train, y_train):
    """Fit `model` on the given data and return the names of the informative columns.

    The model is (re)fitted in place; a column is kept when its entry in
    `model.feature_importances_` is strictly positive.

    Returns
    -------
    list
        Column names of X_train with positive importance, in column order.
    """
    model.fit(X_train, y_train)
    column_importances = zip(X_train.columns.tolist(), model.feature_importances_)
    return [column for column, importance in column_importances if importance > 0]

In [None]:
# Columns with positive importance (note: this call refits `lgb` on all columns)
important_features = get_important_features(lgb, X_train, y_train)
important_features

In [None]:
%%time
# Refit the same classifier on the columns with positive importance only
lgb.fit(X_train[important_features], y_train)

In [None]:
# Second column of predict_proba, taken as the score of the positive class (target == 1)
test_preds_proba = lgb.predict_proba(X_test[important_features])[:, 1]
test_preds_proba[:10]

In [None]:
def get_final_recomendation(X_test, test_preds_proba, data_val_lvl_2, k=5):
    """Pick the top-k scored candidates per user and pair them with the actual purchases.

    Parameters
    ----------
    X_test : pd.DataFrame
        Candidate rows with at least the columns 'user_id' and 'item_id'. Not modified,
        so it stays aligned with any target array built alongside it.
    test_preds_proba : array-like
        One score per row of X_test, in the same order.
    data_val_lvl_2 : pd.DataFrame
        Validation transactions with the columns 'user_id' and 'item_id'.
    k : int, default 5
        Number of items to recommend per user.

    Returns
    -------
    pd.DataFrame
        Columns 'user_id', 'actual' (distinct items bought in data_val_lvl_2) and
        'lgb_candidates' (up to k distinct items, best score first; NaN for users
        without candidates).
    """
    # assign() and sort_values() return new frames, leaving the caller's X_test untouched
    ranked = (
        X_test
        .assign(predict_proba=test_preds_proba)
        .sort_values(['user_id', 'predict_proba'], ascending=[True, False])
        # Keep only the best-scored row of each (user, item) pair so that the top-k
        # cannot be filled with repeats of the same item
        .drop_duplicates(['user_id', 'item_id'])
    )
    lgb_candidates = (
        ranked
        .groupby('user_id')
        .head(k)
        .groupby('user_id')['item_id']
        .unique()
        .reset_index()
        .rename(columns={'item_id': 'lgb_candidates'})
    )

    result_lvl_2 = (
        data_val_lvl_2
        .groupby('user_id')['item_id']
        .unique()
        .reset_index()
        .rename(columns={'item_id': 'actual'})
    )
    # Left join keeps users that have purchases but no candidates
    return result_lvl_2.merge(lgb_candidates, on='user_id', how='left')

In [None]:
# Top-scored candidates per user next to their actual purchases in the level-2 validation window
result_lvl_2 = get_final_recomendation(X_test, test_preds_proba, data_val_lvl_2)

In [None]:
# Peek at the final two-level recommendations
result_lvl_2.head()

In [None]:
# Precision@5 of the two-level model (users without level-2 candidates are excluded)
(
    result_lvl_2
    .loc[lambda frame: frame['lgb_candidates'].notna()]
    .apply(lambda row: precision_at_k(row['lgb_candidates'], row['actual'], k=5), axis=1)
    .mean()
)