# Вебинар 6. Двухуровневые модели рекомендаций


Код для src, utils, metrics вы можете скачать из [этого](https://github.com/geangohn/recsys-tutorial) github репозитория

In [193]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

In [194]:
data = pd.read_csv('data/retail_train.csv')
item_features = pd.read_csv('data/product.csv')
user_features = pd.read_csv('data/hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)


# Важна схема обучения и валидации!
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [195]:
n_items_before = data_train_lvl_1['item_id'].nunique()

data_train_lvl_1 = prefilter_items(data_train_lvl_1, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 5001


In [196]:
recommender = MainRecommender(data_train_lvl_1)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))




In [197]:
recommender.get_als_recommendations(2375, N=5)

[899624, 1044078, 871756, 1106523, 844179]

In [198]:
recommender.get_own_recommendations(2375, N=5)

[948640, 918046, 847962, 907099, 873980]

In [199]:
recommender.get_similar_items_recommendation(2375, N=5)

[1046545, 1044078, 1042907, 1078652, 1133312]

In [200]:
recommender.get_similar_users_recommendation(2375, N=5)

[820612, 974265, 10457044, 12523928, 1124971]

### Задание 1

A) Попробуйте различные варианты генерации кандидатов. Какие из них дают наибольший recall@k ?
- Пока пробуем отобрать 50 кандидатов (k=50)
- Качество измеряем на data_val_lvl_1: следующие 6 недель после трейна

Дают ли own recommendtions + top-popular лучший recall?  

B)* Как зависит recall@k от k? Постройте для одной схемы генерации кандидатов эту зависимость для k = {20, 50, 100, 200, 500}  
C)* Исходя из прошлого вопроса, как вы думаете, какое значение k является наиболее разумным?


In [9]:
result_lvl_1 = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_1.columns=['user_id', 'actual']
result_lvl_1.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


In [None]:
lget_als_recommendations = []
lget_own_recommendations = []
lget_similar_items_recommendation =[]
lget_similar_users_recommendation = []

for user, actual in zip(result_lvl_1['user_id'],result_lvl_1['actual']):

    try:
        get_als_recommendations = recommender.get_als_recommendations(user, N=50)
        lget_als_recommendations.append(recall_at_k(get_als_recommendations, actual, k = 50))

        get_own_recommendations = recommender.get_own_recommendations(user, N=50)
        lget_own_recommendations.append(recall_at_k(get_own_recommendations, actual, k = 50))

        get_similar_items_recommendation = recommender.get_similar_items_recommendation(user, N=50)
        lget_similar_items_recommendation.append(recall_at_k(get_similar_items_recommendation, actual, k = 50))
     
        get_similar_users_recommendation = recommender.get_similar_users_recommendation(user, N=50)  
        lget_similar_users_recommendation.append(recall_at_k(get_similar_users_recommendation, actual, k = 50))        
        
        
    except (IndexError, KeyboardInterrupt, ValueError):
        continue


In [11]:
print(f'recall@50: \n get_als_recommendations = {np.round(np.mean(lget_als_recommendations)*100,2)}%,\n get_own_recommendations = {np.round(np.mean(lget_own_recommendations)*100,2)}%, \n get_similar_items_recommendation = {np.round(np.mean(lget_similar_items_recommendation)*100,2)}%, \n get_similar_users_recommendation = {np.round(np.mean(lget_similar_users_recommendation)*100, 2)}%')

recall@50: 
 get_als_recommendations = 4.7%,
 get_own_recommendations = 6.53%, 
 get_similar_items_recommendation = 3.36%, 
 get_similar_users_recommendation = 0.2%


Сделаем все в двойном цикле, чтобы узнать при каком К результат лучше. Процесс достаточно долгий.

In [10]:
k = [20, 50, 100, 200, 500]

for i in k:
    lget_als_recommendations = []
    lget_own_recommendations = []
    lget_similar_items_recommendation =[]
    lget_similar_users_recommendation = []
    for user, actual in zip(result_lvl_1['user_id'],result_lvl_1['actual']):

        try:
            get_als_recommendations = recommender.get_als_recommendations(user, N=500)
            lget_als_recommendations.append(recall_at_k(get_als_recommendations, actual, k = i))

            get_own_recommendations = recommender.get_own_recommendations(user, N=500)
            lget_own_recommendations.append(recall_at_k(get_own_recommendations, actual, k = i))

            get_similar_items_recommendation = recommender.get_similar_items_recommendation(user, N=500)
            lget_similar_items_recommendation.append(recall_at_k(get_similar_items_recommendation, actual, k = i))

            get_similar_users_recommendation = recommender.get_similar_users_recommendation(user, N=500)  
            lget_similar_users_recommendation.append(recall_at_k(get_similar_users_recommendation, actual, k = i))        


        except (IndexError, KeyboardInterrupt, ValueError):
            continue
    print(f'recall@{i}: \n get_als_recommendations = {np.round(np.mean(lget_als_recommendations)*100,2)}%,\n get_own_recommendations = {np.round(np.mean(lget_own_recommendations)*100,2)}%, \n get_similar_items_recommendation = {np.round(np.mean(lget_similar_items_recommendation)*100,2)}%, \n get_similar_users_recommendation = {np.round(np.mean(lget_similar_users_recommendation)*100, 2)}%')

recall@20: 
 get_als_recommendations = 2.95%,
 get_own_recommendations = 3.93%, 
 get_similar_items_recommendation = 1.78%, 
 get_similar_users_recommendation = 0.18%
recall@50: 
 get_als_recommendations = 4.78%,
 get_own_recommendations = 6.53%, 
 get_similar_items_recommendation = 3.32%, 
 get_similar_users_recommendation = 0.26%
recall@100: 
 get_als_recommendations = 6.95%,
 get_own_recommendations = 9.6%, 
 get_similar_items_recommendation = 5.3%, 
 get_similar_users_recommendation = 0.41%
recall@200: 
 get_als_recommendations = 9.82%,
 get_own_recommendations = 13.54%, 
 get_similar_items_recommendation = 8.53%, 
 get_similar_users_recommendation = 0.66%
recall@500: 
 get_als_recommendations = 14.76%,
 get_own_recommendations = 18.21%, 
 get_similar_items_recommendation = 13.51%, 
 get_similar_users_recommendation = 1.37%


Процесс очень долгий. Вероятнее всего разумнее брать К = 20 и меньше рекомендаций.

### Задание 2.

Обучите модель 2-ого уровня, при этом:
    - Добавьте минимум по 2 фичи для юзера, товара и пары юзер-товар
    - Измерьте отдельно precision@5 модели 1-ого уровня и двухуровневой модели на data_val_lvl_2
    - Вырос ли precision@5 при использовании двухуровневой модели?

In [263]:
def get_als_recommendations_mapping(data, name, count_of_recomendation): # Function of adding col with result at DataFrame
    mapping = {}
    try:
        for i in result_lvl_2.user_id:

            mapping.update({i:recommender.get_als_recommendations(i, count_of_recomendation)})

    except(IndexError):
        None
    mapping_res = pd.DataFrame(pd.Series(mapping).reset_index()).set_axis(['user_id', name],1,inplace=False)
    res = data.merge(mapping_res, on = 'user_id')
    return res
    

In [249]:
result_lvl_2 = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_2.columns=['user_id', 'actual']
result_lvl_2.shape
result_lvl_2

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107..."
3,7,"[840386, 889774, 898068, 909714, 929067, 95347..."
4,8,"[835098, 872137, 910439, 924610, 992977, 10412..."
...,...,...
2037,2496,[6534178]
2038,2497,"[1016709, 9835695, 1132298, 16809501, 845294, ..."
2039,2498,"[15716530, 834484, 901776, 914190, 958382, 972..."
2040,2499,"[867188, 877580, 902396, 914190, 951590, 95813..."


### Model of first level

In [267]:
name = 'als_first'
result_lvl_2_als = get_als_recommendations_mapping(result_lvl_2, name, 50)

In [266]:
result_lvl_2_als

Unnamed: 0,user_id,actual,als_first
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1037332, 1094924, 883616, 957938, 856942, 107..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1106523, 1044078, 1029743, 844179, 12810393, ..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[1026118, 854852, 878996, 965267, 951834, 1042..."
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[1041688, 915715, 839419, 916122, 1100140, 945..."
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[916122, 919535, 9676938, 981660, 6391134, 839..."
...,...,...,...
1617,1979,"[845208, 908846, 1029743, 6534077, 8090532, 80...","[888586, 962266, 903800, 878996, 5578547, 1127..."
1618,1980,"[820291, 822133, 824593, 826385, 827749, 83439...","[1004436, 1132771, 1034176, 5569471, 948670, 8..."
1619,1981,"[1082212, 15511233, 963517, 968936, 972312, 97...","[878996, 965267, 866871, 996070, 1137284, 8966..."
1620,1982,"[6533889, 1071759, 1118963, 10142276, 856335, ...","[1080472, 6602697, 10284929, 6039759, 857503, ..."


In [253]:
result_lvl_2_als.apply(lambda row: precision_at_k(row.als_first, row.actual, 5), axis = 1).mean()*100

9.876695437731211

## precision at 5 on 50 recommendation is 9.87%

In [304]:
data_val_lvl_2['quantity'].value_counts()

1        93212
2        17352
3         3036
4         1897
0          668
         ...  
14382        1
3753         1
13742        1
19809        1
12505        1
Name: quantity, Length: 917, dtype: int64

In [321]:
result_lvl_2_quantity_more_1 = data_val_lvl_2[(data_val_lvl_2['quantity']==1) | (data_val_lvl_2['quantity']==2)].groupby('user_id')['item_id'].unique().reset_index()
result_lvl_2_quantity_more_1.columns=['user_id', 'actual']
result_lvl_2_quantity_more_1

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."
2,6,"[920308, 926804, 946489, 1017061, 1078346, 110..."
3,7,"[840386, 898068, 909714, 929067, 953476, 95454..."
4,8,"[835098, 872137, 910439, 924610, 992977, 10412..."
...,...,...
2022,2495,[1096603]
2023,2497,"[1016709, 9835695, 1132298, 16809501, 845294, ..."
2024,2498,"[15716530, 834484, 901776, 914190, 958382, 972..."
2025,2499,"[867188, 877580, 902396, 914190, 951590, 95813..."


In [322]:
name = 'result_lvl_2_quantity_more_1'
result_lvl_2_quantity_more_1 = get_als_recommendations_mapping(result_lvl_2_quantity_more_1, name, 50)

In [323]:
result_lvl_2_quantity_more_1.apply(lambda col:  precision_at_k(col['result_lvl_2_quantity_more_1'], col['actual'], 5), axis=1).mean()*100

9.441687344913172

# Добавьте минимум по 2 фичи для юзера
### Как я понимаю это различные данные (одни с hh другие с магазина), но мы можем их объединить, что "поиграть" и "набить" руку

In [326]:
data_val_lvl_2

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
2277416,338,41260573635,636,840173,1,1.99,369,0.00,112,92,0.0,0.0
2277417,338,41260573635,636,1037348,1,0.89,369,-0.30,112,92,0.0,0.0
2277418,338,41260573635,636,5592737,2,1.58,369,-0.20,112,92,0.0,0.0
2277419,338,41260573635,636,7441679,1,3.69,369,0.00,112,92,0.0,0.0
2277420,338,41260573635,636,7442317,1,2.69,369,0.00,112,92,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2396799,1613,41655820646,663,16102849,1,2.00,3262,-1.15,1231,95,0.0,0.0
2396800,1001,41655829421,663,13217063,1,1.69,3131,0.00,2231,95,0.0,0.0
2396801,1001,41655829421,663,13217800,1,1.69,3131,0.00,2231,95,0.0,0.0
2396802,1167,41656790510,663,6410462,22451,43.98,3385,-0.65,1059,95,0.0,0.0


In [409]:
user_features['income_desc'].replace({'35-49K':35, 
                       '50-74K':50,
                       '25-34K':25,
                       '75-99K':75,
                       'Under 15K':15,
                       '100-124K':100,
                       '15-24K':15,
                       '125-149K':125,
                       '150-174K':150,
                       '250K+':250,
                       '175-199K':175,
                       '200-249K':200}, inplace=True)

In [410]:
user_features.income_desc.value_counts()

50     192
35     172
15     135
75      96
25      77
125     38
100     34
150     30
250     11
175     11
200      5
Name: income_desc, dtype: int64

In [69]:
income_desc_less_50 = user_features.income_desc[user_features.income_desc<=50].reset_index()
income_desc_less_50.columns = ['user_id', 'income_desc']

In [70]:
name = 'less_50'
less_50 = get_als_recommendations_mapping(income_desc_less_100, name, 50)
less_50

Unnamed: 0,user_id,income_desc,less_50
0,1,50,"[1082212, 883616, 955587, 996259, 884686, 1064..."
1,15,50,"[1082185, 933637, 9553193, 1042616, 12263857, ..."
2,20,50,"[8090560, 1085664, 1135983, 1123106, 6534078, ..."
3,30,50,"[1044078, 8090537, 844179, 861345, 12301100, 1..."
4,48,50,"[7168759, 942278, 846760, 1117556, 1085846, 92..."
...,...,...,...
149,785,50,"[1134633, 916993, 1116050, 8177622, 825703, 55..."
150,787,50,"[1079528, 1104349, 847207, 857538, 5566717, 10..."
151,795,50,"[5568378, 5569327, 1029743, 907631, 1044078, 9..."
152,796,50,"[1127758, 1038745, 972143, 890269, 956486, 944..."


In [71]:
less_50 = less_50.merge(result_lvl_2, on = 'user_id')

In [72]:
less_50

Unnamed: 0,user_id,income_desc,less_50,actual
0,1,50,"[1082212, 883616, 955587, 996259, 884686, 1064...","[821867, 834484, 856942, 865456, 889248, 90795..."
1,15,50,"[1082185, 933637, 9553193, 1042616, 12263857, ...","[910439, 1082185, 959076, 1023958, 1082310, 13..."
2,20,50,"[8090560, 1085664, 1135983, 1123106, 6534078, ...","[819112, 944419, 945611, 971684, 1025522, 1058..."
3,30,50,"[1044078, 8090537, 844179, 861345, 12301100, 1...","[823758, 833025, 840361, 852793, 873203, 88180..."
4,48,50,"[7168759, 942278, 846760, 1117556, 1085846, 92...","[857000, 890533, 960718, 971585, 979178, 99606..."
...,...,...,...,...
149,785,50,"[1134633, 916993, 1116050, 8177622, 825703, 55...","[830965, 916050, 825703, 878996, 911351, 91162..."
150,787,50,"[1079528, 1104349, 847207, 857538, 5566717, 10...","[957736, 1075214, 5567702, 5569230, 7025106, 7..."
151,795,50,"[5568378, 5569327, 1029743, 907631, 1044078, 9...","[910032, 916122, 956125, 961747, 1043751, 1065..."
152,796,50,"[1127758, 1038745, 972143, 890269, 956486, 944...","[840361, 970119, 12731425, 907014, 995242, 100..."


In [78]:
less_50.apply(lambda x: precision_at_k(x.less_50, x.actual, 5), axis = 1).mean()*100

11.298701298701292

In [325]:
## Precision became higher. I think we can use different group  (precission немного увеличился. Думаю, что можно еще поиграть зарплатами)

# We can group according age

In [327]:
user_features.age_desc

0        65+
1      45-54
2      25-34
3      25-34
4      45-54
       ...  
796    35-44
797    45-54
798    45-54
799    25-34
800    25-34
Name: age_desc, Length: 801, dtype: object

In [328]:
voc = ({
    '45-54':44,
    '35-44':35,
    '25-34':25,
    '65+':65,
    '55-64':55,
    '19-24':19
        
})

In [329]:
user_features['age_desc'].replace(voc, inplace = True)

In [330]:
user_features_more_35 = user_features.age_desc[user_features.age_desc<35].reset_index()
user_features_more_35.columns = ['user_id', 'age']

In [331]:
user_features_more_35

Unnamed: 0,user_id,age
0,2,25
1,3,25
2,8,25
3,18,19
4,24,25
...,...,...
183,777,19
184,779,19
185,791,25
186,799,25


In [332]:
user_features_more_35=user_features_more_35.merge(result_lvl_2, on = 'user_id')

In [333]:
user_features_more_35 = user_features_more_35[['user_id', 'actual']]

In [334]:
user_features_more_35

Unnamed: 0,user_id,actual
0,3,"[835476, 851057, 872021, 878302, 879948, 90963..."
1,8,"[835098, 872137, 910439, 924610, 992977, 10412..."
2,18,"[831628, 907877, 914697, 995242, 1118878, 1128..."
3,24,"[831815, 844165, 849843, 853904, 903230, 10118..."
4,31,"[855916, 948239, 1054424, 1126899, 5570383, 98..."
...,...,...
140,772,"[1733943, 1735401, 1745709, 1816914, 1827411, ..."
141,777,"[824005, 832513, 849843, 860174, 863447, 86403..."
142,779,"[835499, 838539, 857538, 891616, 934645, 10085..."
143,799,"[6534178, 842140, 843756, 852864, 873654, 9046..."


In [335]:
name = 'older_35'
age = get_als_recommendations_mapping(user_features_more_35, name,50)

In [336]:
age

Unnamed: 0,user_id,actual,older_35
0,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1106523, 1044078, 1029743, 844179, 12810393, ..."
1,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[916122, 919535, 9676938, 981660, 6391134, 839..."
2,18,"[831628, 907877, 914697, 995242, 1118878, 1128...","[1104414, 976199, 972437, 925766, 1022066, 951..."
3,24,"[831815, 844165, 849843, 853904, 903230, 10118...","[1016800, 917033, 1008814, 1044078, 1099164, 9..."
4,31,"[855916, 948239, 1054424, 1126899, 5570383, 98...","[898847, 13007435, 1094924, 824305, 7409997, 6..."
...,...,...,...
140,772,"[1733943, 1735401, 1745709, 1816914, 1827411, ...","[5585510, 1106523, 986912, 9836526, 1044078, 6..."
141,777,"[824005, 832513, 849843, 860174, 863447, 86403...","[12781564, 1059823, 1094223, 916487, 12782182,..."
142,779,"[835499, 838539, 857538, 891616, 934645, 10085...","[951412, 946995, 976199, 1082990, 968072, 9161..."
143,799,"[6534178, 842140, 843756, 852864, 873654, 9046...","[7442180, 7442056, 913210, 6424472, 819978, 64..."


In [337]:
age.apply(lambda x: precision_at_k(x.older_35, x.actual, 5), axis=1).mean()*100

10.482758620689651

## Precision is increased

# Добавьте минимум по 2 фичи для товара

In [201]:
item_features.department.value_counts().head(5) # We can see the most popular spheres and merge users on it

GROCERY      39021
DRUG GM      31529
PRODUCE       3118
COSMETICS     3011
NUTRITION     2914
Name: department, dtype: int64

## Lets check if we will be able to increase precision at users in DRUG GM

In [202]:
item_features

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,
3,26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ
4,26355,69,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES,14 OZ
...,...,...,...,...,...,...,...
92348,18293142,6384,DRUG GM,National,BOOKSTORE,PAPERBACK BOOKS,
92349,18293439,6393,DRUG GM,National,BOOKSTORE,CHILDRENS LOW END,
92350,18293696,6406,DRUG GM,National,BOOKSTORE,PAPERBACK BEST SELLER,
92351,18294080,6442,DRUG GM,National,BOOKSTORE,PAPERBACK BOOKS,


In [339]:
gr = pd.DataFrame(item_features.department[item_features.department == 'DRUG GM'].reset_index()['index'])


In [340]:
gr.columns = ['user_id']

In [354]:
department = gr.merge(result_lvl_2, on = 'user_id')

In [355]:
department

Unnamed: 0,user_id,actual
0,7,"[840386, 889774, 898068, 909714, 929067, 95347..."
1,20,"[819112, 944419, 945611, 971684, 1025522, 1058..."
2,108,"[836762, 941734, 1009046, 1011457, 1022254, 10..."
3,110,"[977654, 1044655, 1075707, 9832160, 900697, 92..."
4,113,"[827261, 833458, 850533, 851676, 863780, 87156..."
...,...,...
260,2471,"[851819, 895680, 969568, 979975, 995628]"
261,2479,"[834484, 839094, 844165, 852864, 868764, 89712..."
262,2497,"[1016709, 9835695, 1132298, 16809501, 845294, ..."
263,2498,"[15716530, 834484, 901776, 914190, 958382, 972..."


In [356]:
name = 'DRUG_GM'
department = get_als_recommendations_mapping(department, name, 50)

In [357]:
department

Unnamed: 0,user_id,actual,DRUG_GM
0,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[1041688, 915715, 839419, 916122, 1100140, 945..."
1,20,"[819112, 944419, 945611, 971684, 1025522, 1058...","[12352248, 6633273, 5577022, 894360, 1123106, ..."
2,108,"[836762, 941734, 1009046, 1011457, 1022254, 10...","[888586, 830526, 925854, 7410336, 1053988, 559..."
3,110,"[977654, 1044655, 1075707, 9832160, 900697, 92...","[1106523, 1044078, 836445, 910673, 1025611, 96..."
4,113,"[827261, 833458, 850533, 851676, 863780, 87156...","[916122, 1127831, 1062128, 985999, 1116741, 87..."
...,...,...,...
187,1945,"[822965, 828516, 829852, 833715, 835576, 83941...","[898867, 990797, 1115098, 1044153, 12988031, 9..."
188,1947,"[888350, 1105917, 10456126, 12132353, 845307, ...","[933835, 1040807, 1006483, 1128016, 921504, 97..."
189,1959,"[2770330, 2805130, 2838504, 3018382, 2698572, ...","[5569374, 866548, 866878, 1003524, 12810391, 1..."
190,1964,"[870428, 873564, 889774, 896672, 920455, 95701...","[870428, 944534, 8019968, 854373, 1125530, 128..."


In [358]:
department.apply(lambda row: precision_at_k(row.DRUG_GM, row.actual, 5), axis = 1).mean()*100

10.416666666666659

## If we use model of 2 levels (the first - is group by DRUG_GM and second - als) the precision is improved also but not dramatically

In [201]:
item_features.department.value_counts().head(5) # We can see the most popular spheres and merge users on it

GROCERY      39021
DRUG GM      31529
PRODUCE       3118
COSMETICS     3011
NUTRITION     2914
Name: department, dtype: int64

In [374]:
for i in item_features.columns:
    print(item_features[i].value_counts(),'\n----------------\n')
    

15599615    1
1131069     1
1092160     1
1282212     1
10259012    1
           ..
943398      1
13095254    1
9704744     1
1025147     1
1050625     1
Name: item_id, Length: 92353, dtype: int64 
----------------

69      12676
2        1411
5423     1376
764      1332
1407     1210
        ...  
659         1
5864        1
1770        1
2642        1
1887        1
Name: manufacturer, Length: 6476, dtype: int64 
----------------

GROCERY            39021
DRUG GM            31529
PRODUCE             3118
COSMETICS           3011
NUTRITION           2914
MEAT                2544
MEAT-PCKGD          2427
DELI                2354
PASTRY              2149
FLORAL               938
SEAFOOD-PCKGD        563
MISC. TRANS.         490
SPIRITS              377
SEAFOOD              369
GARDEN CENTER        128
RESTAURANT           102
MISC SALES TRAN       88
SALAD BAR             48
COUP/STR & MFG        39
TRAVEL & LEISUR       28
FROZEN GROCERY        23
KIOSK-GAS             16
              

## lets try to group by brand

In [397]:
gr = pd.DataFrame(item_features.department[item_features.brand == 'National'].reset_index()['index'])

In [398]:
gr.columns = ['user_id']

In [399]:
department = gr.merge(result_lvl_2, on = 'user_id')

In [400]:
department

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,13,"[6534178, 1104146, 829197, 840361, 862070, 884..."
2,83,"[900802, 960278, 962568, 994799, 1065045, 5568..."
3,86,"[863505, 865026, 866488, 894236, 951590, 96155..."
4,91,"[6534178, 7433029, 859075, 1082185, 14106445, ..."
...,...,...
1404,2496,[6534178]
1405,2497,"[1016709, 9835695, 1132298, 16809501, 845294, ..."
1406,2498,"[15716530, 834484, 901776, 914190, 958382, 972..."
1407,2499,"[867188, 877580, 902396, 914190, 951590, 95813..."


In [402]:
name = 'National'
department = get_als_recommendations_mapping(department, name, 50)

In [403]:
department.apply(lambda col: precision_at_k(col.National, col.actual, 5), axis = 1).mean()*100

10.029325513196529

### We can use two groups. For instance DRUG GM department and salary less than 50.
### При использовании ALS pricision не сильно, но увеличиватеся. Попробуем совместить, например, отдел лекарства и зарплату менее 50тр.

In [488]:
item_features

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,
3,26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ
4,26355,69,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES,14 OZ
...,...,...,...,...,...,...,...
92348,18293142,6384,DRUG GM,National,BOOKSTORE,PAPERBACK BOOKS,
92349,18293439,6393,DRUG GM,National,BOOKSTORE,CHILDRENS LOW END,
92350,18293696,6406,DRUG GM,National,BOOKSTORE,PAPERBACK BEST SELLER,
92351,18294080,6442,DRUG GM,National,BOOKSTORE,PAPERBACK BOOKS,


In [489]:
group = pd.DataFrame(item_features[item_features.department == 'DRUG GM'].reset_index()['index'])
group.columns = ['user_id']

In [490]:
income_desc_less_50_1 = user_features.income_desc[user_features.income_desc<=50].reset_index()
income_desc_less_50_1.columns = ['user_id', 'DRUG GM']

In [491]:
income_desc_less_50_1.merge(group, on = 'user_id')

Unnamed: 0,user_id,DRUG GM
0,7,15
1,11,25
2,20,50
3,124,50
4,131,50
...,...,...
60,647,35
61,691,50
62,706,50
63,721,50


In [492]:
group1 = group.merge(result_lvl_2, on = 'user_id')

In [493]:
group1

Unnamed: 0,user_id,actual
0,7,"[840386, 889774, 898068, 909714, 929067, 95347..."
1,20,"[819112, 944419, 945611, 971684, 1025522, 1058..."
2,108,"[836762, 941734, 1009046, 1011457, 1022254, 10..."
3,110,"[977654, 1044655, 1075707, 9832160, 900697, 92..."
4,113,"[827261, 833458, 850533, 851676, 863780, 87156..."
...,...,...
260,2471,"[851819, 895680, 969568, 979975, 995628]"
261,2479,"[834484, 839094, 844165, 852864, 868764, 89712..."
262,2497,"[1016709, 9835695, 1132298, 16809501, 845294, ..."
263,2498,"[15716530, 834484, 901776, 914190, 958382, 972..."


In [494]:
name = 'misce'
group1 = get_als_recommendations_mapping(group1, name, 50)

In [495]:
group1.apply(lambda col: precision_at_k(col.misce, col.actual, 5), axis = 1).mean()*100

10.416666666666659

### We can try to use goods from DRUG GM and GROCERY
### Не очень сильно улушилось. Можно попробовать посмотреть рекомендации - заказы товаров из двух групп лекарства и бакалея.

In [478]:
group1 = pd.DataFrame(item_features[(item_features.department == 'DRUG GM') | (item_features.department == 'GROCERY')].reset_index()['index'])
group1.columns = ['user_id']

In [479]:
group1 = group1.merge(result_lvl_2, on = 'user_id')

In [481]:
group1

Unnamed: 0,user_id,actual
0,3,"[835476, 851057, 872021, 878302, 879948, 90963..."
1,6,"[920308, 926804, 946489, 1006718, 1017061, 107..."
2,7,"[840386, 889774, 898068, 909714, 929067, 95347..."
3,9,"[864335, 990865, 1029743, 9297474, 10457112, 8..."
4,13,"[6534178, 1104146, 829197, 840361, 862070, 884..."
...,...,...
1758,2496,[6534178]
1759,2497,"[1016709, 9835695, 1132298, 16809501, 845294, ..."
1760,2498,"[15716530, 834484, 901776, 914190, 958382, 972..."
1761,2499,"[867188, 877580, 902396, 914190, 951590, 95813..."


In [485]:
name = 'National'
group1 = get_als_recommendations_mapping(group1, name, 50)

In [486]:
group1.apply(lambda col: precision_at_k(col.National, col.actual, 5), axis = 1).mean()*100

10.035435861091452

## The result did not increase. I guess to improve result we can use LGBMClassifier.
## То же не лучше результат, чем если бы использовались отделы по отдельности
