<a href="https://colab.research.google.com/github/ddekun/Recommendation_systems/blob/course_project/Course_project/Course_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Рекомендательные системы. Курсовой проект.

---

**Целевая метрика precision@5**

In [None]:
import pandas as pd
import numpy as np

import os
# Restrict MKL to a single thread — presumably so BLAS threading does not compete
# with the ALS library's own parallelism; TODO confirm.
# NOTE(review): this runs after `import numpy` above. Thread-count environment
# variables are normally expected to be set before the numerical library is
# loaded — confirm the setting actually takes effect here.
os.environ['MKL_NUM_THREADS'] = '1'

# Для работы с матрицами
from scipy.sparse import csr_matrix
# Матричная факторизация
from implicit import als
# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
# Put the parent directory on the import path (only once) — presumably that is
# where the project's `src` package imported below lives.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [None]:
import matplotlib
import matplotlib.image as img
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
%matplotlib inline
# Global plot defaults: font size 12, 12x6 figures, grid enabled on axes
matplotlib.rcParams.update({
    'font.size': 12,
    'figure.figsize': (12, 6),
    'axes.grid': True,
})

In [None]:
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items, get_targets_sec_level, extend_new_user_features, extend_new_item_features, \
extend_user_item_new_features, get_popularity_recommendations, postfilter_items, get_final_recomendations
from src.recommenders import MainRecommender

In [None]:
# Raw transactions
data = pd.read_csv('data/retail_train.csv')

# Feature tables: lower-case every header, then rename the key column so it
# matches the `item_id` / `user_id` names used in the transactions
item_features = (
    pd.read_csv('data/product.csv')
    .rename(columns=str.lower)
    .rename(columns={'product_id': 'item_id'})
)
user_features = (
    pd.read_csv('data/hh_demographic.csv')
    .rename(columns=str.lower)
    .rename(columns={'household_key': 'user_id'})
)

# Time-based split:
# -- older purchases -- | -- 6 weeks -- | -- 3 weeks --
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

# Week boundaries, counted back from the last week present in the data
last_week = data['week_no'].max()
val_lvl_1_start = last_week - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)
val_lvl_2_start = last_week - val_lvl_2_size_weeks

weeks = data['week_no']
data_train_lvl_1 = data[weeks < val_lvl_1_start]
data_val_lvl_1 = data[(weeks >= val_lvl_1_start) & (weeks < val_lvl_2_start)]

# The first-level validation window doubles as the second-level training set
data_train_lvl_2 = data_val_lvl_1.copy()
data_val_lvl_2 = data[weeks >= val_lvl_2_start]

data_train_lvl_1.head()

Unnamed: 0.1,Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1832874,1078,35573861879,524,1082185,1,0.56,375,0.0,1440,76,0.0,0.0
1,402281,324,29170411703,165,7168774,2,6.98,367,0.0,1115,24,0.0,0.0
2,1348564,1982,32957769022,404,12811490,1,3.99,319,0.0,2101,58,0.0,0.0
3,1714815,1023,34573871336,495,920025,1,5.99,299,0.0,1643,71,0.0,0.0
4,1266182,695,32672141822,383,941357,1,3.19,396,0.0,1743,55,0.0,0.0


**Предварительная фильтрация**

In [None]:
# Distinct item count before filtering, kept only for the report below
n_items_before = data_train_lvl_1['item_id'].nunique()

# Reduce the item catalogue; `take_n_popular` presumably keeps the most popular
# items — see src.utils.prefilter_items for the exact rules
data_train_lvl_1 = prefilter_items(
    data_train_lvl_1,
    item_features=item_features,
    take_n_popular=3000,
)

n_items_after = data_train_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 33411 to 3001


**Обучаем модель первого уровня**

In [None]:
# Build the first-level recommender from the prefiltered level-1 training
# transactions. The progress bars in this cell's output are produced during
# construction — presumably model fitting happens inside the constructor; verify.
recommender = MainRecommender(data_train_lvl_1)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/2346 [00:00<?, ?it/s]

**Эмбеддинги**

In [None]:
# Item and user embedding tables exposed by the recommender; they are passed to
# extend_user_item_new_features when the second-level feature frames are built.
items_emb_df = recommender.items_emb_df
users_emb_df = recommender.users_emb_df

**Добавляем новые признаки**

In [None]:
# Number of recommendations
N = 100

# Second-level training frame: candidates enriched with user/item features
# and a `target` column
train = extend_user_item_new_features(
    data_train_lvl_2,
    data_train_lvl_1,
    recommender,
    item_features,
    user_features,
    items_emb_df,
    users_emb_df,
    N,
)
train.head()

Unnamed: 0.1,Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,...,19_y,mean_time,age,income,children,avr_bask,sum_per_week,count_purchases_week_mean,sum_purchases_week_mean,target
0,2032473,1501,40399811362,575,8090657,1,2.99,432,-0.3,1447,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,2103714,1633,40618357715,593,953476,1,0.5,32004,-0.07,1813,...,4.078001,1887.900024,50.0,70.0,0.0,3.147473,15.737366,0.041667,0.036364,0.0
2,2031107,336,40387622167,574,824663,1,5.79,343,0.0,2057,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.215642,0.0
3,2019642,2195,40341743149,572,916260,1,1.69,427,0.0,2024,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.166012,0.0
4,2162153,2107,40788501083,607,916122,2,14.68,450,-2.94,1251,...,-7.727289,1609.1875,40.0,95.0,0.0,2.628125,7.008333,0.08,0.10866,1.0


In [None]:
# Split the second-level training frame into the feature matrix and the target.
# `Series.ravel()` is deprecated in recent pandas releases and emits a warning;
# `to_numpy()` returns the same 1-D array of target values.
X_train = train.drop(columns=['target'])
y_train = train['target'].to_numpy()

In [None]:
# Every object-dtype column is treated as a categorical feature
cat_features = [col for col in X_train.columns if X_train[col].dtype == object]

# Cast those columns, together with both id columns, to pandas `category` dtype
categorical_cols = cat_features + ['user_id', 'item_id']
X_train[categorical_cols] = X_train[categorical_cols].astype('category')

In [None]:
# Build the hold-out feature frame with the same helper and arguments layout
# as the training frame, using the level-2 validation window as the target period.
test = extend_user_item_new_features(
    data_val_lvl_2,
    data_val_lvl_1,
    recommender,
    item_features,
    user_features,
    items_emb_df,
    users_emb_df,
    N,
)
X_test = test.drop(columns=['target'])
# `Series.ravel()` is deprecated in recent pandas releases; `to_numpy()` gives
# the same 1-D array.
y_test = test['target'].to_numpy()

# Apply the same categorical casting that was applied to X_train
categorical_cols = cat_features + ['user_id', 'item_id']
X_test[categorical_cols] = X_test[categorical_cols].astype('category')

In [None]:
%%time
# Second-level model: binary LightGBM classifier with max_depth=7.
# Only the object-dtype columns collected in `cat_features` are declared
# categorical for this fit (an earlier variant passed them to the constructor
# as `categorical_column` instead).
# NOTE(review): `get_important_features` calls `fit` again on this same model
# without `categorical_feature` — confirm whether this first fit is still needed.
lgb = LGBMClassifier(objective='binary', max_depth=7)
lgb.fit(X_train, y_train, categorical_feature=cat_features)


In [None]:
def get_important_features(model, X_train, y_train):
    """Fit ``model`` and return the names of features with positive importance.

    Args:
        model: Estimator exposing ``fit(X, y)`` and, once fitted, a
            ``feature_importances_`` sequence aligned with the columns of
            ``X_train``.
        X_train: Training feature frame; its column names are the feature names.
        y_train: Training target passed straight to ``model.fit``.

    Returns:
        list: Column names of ``X_train`` whose importance is strictly greater
        than zero, in column order.

    Note:
        ``model`` is fitted in place as a side effect of this call.
    """
    model.fit(X_train, y_train)
    column_names = X_train.columns.tolist()
    return [
        name
        for name, importance in zip(column_names, model.feature_importances_)
        if importance > 0
    ]

In [None]:
# Names of the features with non-zero importance. Note that this call fits
# `lgb` again on the full feature set as part of computing the importances.
important_features = get_important_features(lgb, X_train, y_train)

**Обучаем модель второго уровня**

In [None]:
%%time
# Final second-level fit, restricted to the features selected above
lgb.fit(X_train[important_features], y_train)

In [None]:
# Score the hold-out candidates using only the selected features
X_test_selected = X_test[important_features]

# Predicted class labels
preds = lgb.predict(X_test_selected)
# Second column of predict_proba, i.e. the score of the positive class
test_preds_proba = lgb.predict_proba(X_test_selected)[:, 1]

**Финальная фильтрация данных**

In [None]:
# Register tqdm's pandas integration so pandas operations can show notebook
# progress bars — presumably relied on inside get_final_recomendations; verify.
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()

In [None]:
# Turn the second-level scores for the X_test candidates into final per-user
# recommendations.
# NOTE(review): the returned frame is expected to contain 'recomendations' and
# 'actual' columns, which the metric and export cells read — confirm in src.utils.
result = get_final_recomendations(X_test, test_preds_proba, data, data_train_lvl_1, item_features)

In [None]:
# Preview the first rows of the final recommendations
result.head()

**Метрика precision@5**

In [None]:
# Precision: precision@5 per row of `result`, averaged over all rows
precision_per_row = result.apply(
    lambda row: precision_at_k(row['recomendations'], row['actual'], k=5),
    axis=1,
)
precision_per_row.mean()

**Сохранение предсказаний**

In [None]:
# Export the recommendations without the ground-truth column.
# The drop is done on a copy rather than in place: `result` keeps its 'actual'
# column, so this cell can be re-run without a KeyError and the metric cell
# still works afterwards.
result.drop(columns=['actual']).to_csv('recommendations.csv', index=False)