# Hometask 2

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

In [2]:
# Load retail_train.csv into a DataFrame and preview its first two rows.
data = pd.read_csv('retail_train.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
count,2396804.0,2396804.0,2396804.0,2396804.0,2396804.0,2396804.0,2396804.0,2396804.0,2396804.0,2396804.0,2396804.0,2396804.0
mean,1271.904,33366430000.0,363.977,2827247.0,100.3763,3.100821,3048.227,-0.5400708,1561.714,52.68156,-0.01638696,-0.002897905
std,726.5644,4284798000.0,175.9385,3732798.0,1152.379,4.210229,8785.542,1.245824,401.5691,25.1331,0.2168615,0.03974618
min,1.0,26984850000.0,1.0,25671.0,0.0,0.0,1.0,-130.02,0.0,1.0,-55.93,-7.7
25%,655.0,30087140000.0,216.0,916993.0,1.0,1.29,330.0,-0.69,1307.0,32.0,0.0,0.0
50%,1271.0,32419980000.0,366.0,1027569.0,1.0,2.0,370.0,-0.02,1614.0,53.0,0.0,0.0
75%,1914.0,35145800000.0,515.0,1132178.0,1.0,3.49,422.0,0.0,1844.0,74.0,0.0,0.0
max,2500.0,41656790000.0,663.0,18024560.0,89638.0,840.0,34280.0,3.99,2359.0,95.0,0.0,0.0


In [4]:
test_size_weeks = 3

# Time-based split: rows whose week number is at or after the threshold form the
# hold-out set, everything earlier is used for training.
# NOTE(review): the inclusive `>=` bound means the hold-out covers week numbers
# max - test_size_weeks ... max, i.e. test_size_weeks + 1 values — confirm this is intended.
test_start_week = data['week_no'].max() - test_size_weeks
week_no = data['week_no']

data_train = data[week_no < test_start_week]
data_test = data[week_no >= test_start_week]

### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекомендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямо пропорционально популярности. Вес = log(sales_sum товара)

In [5]:
def weighted_random_recommendation(items_weights, n=5, random_state=None):
    """Draw ``n`` distinct items at random, with probability proportional to their weight.

    Parameters
    ----------
    items_weights : pd.DataFrame
        Frame with the columns ``item_id`` and ``weight``. Weights must be
        non-negative; they are expected to sum to 1, but ``DataFrame.sample``
        normalizes them if they do not.
    n : int, default 5
        Number of items to recommend. Sampling is without replacement, so pandas
        raises ``ValueError`` when fewer than ``n`` items have a non-zero weight.
    random_state : int, numpy Generator/RandomState or None, default None
        Forwarded to ``DataFrame.sample``. Pass a seed (or a shared generator)
        to make the recommendations reproducible; ``None`` keeps the previous
        non-deterministic behaviour.

    Returns
    -------
    list
        The sampled ``item_id`` values.
    """
    sampled = items_weights.sample(n=n, weights='weight', random_state=random_state)

    return sampled['item_id'].tolist()

In [6]:
# One row per user of the hold-out period, with the array of distinct items
# that user bought stored in the `actual` column.
result = (
    data_test
    .groupby('user_id')['item_id']
    .unique()
    .reset_index()
    .rename(columns={'item_id': 'actual'})
)
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [7]:
%%time

# Fit the popularity weights on the training period only: using the full `data`
# would leak hold-out sales into recommendations that are evaluated on `data_test`.
items_weights = data_train.groupby('item_id')['sales_value'].sum().reset_index()

# weight = log(total sales). The small epsilon avoids log(0); a negative log
# (total sales below 1) is clipped to 0, so such items are never sampled.
items_weights['weight'] = np.log(items_weights['sales_value'] + 0.0000001).clip(lower=0.0)
items_weights['weight'] /= items_weights['weight'].sum()
print(items_weights['weight'].sum())

# The recommendation ignores the user id: every user gets an independent weighted draw.
result['weighted_random_recommendation'] = result['user_id'].apply(lambda _: weighted_random_recommendation(items_weights))

1.0
CPU times: user 2.89 s, sys: 24.8 ms, total: 2.91 s
Wall time: 2.91 s


In [8]:
# Preview the first rows of `result`.
result.head()

Unnamed: 0,user_id,actual,weighted_random_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[17284297, 8116837, 1924981, 848492, 13842217]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[970611, 505630, 1142118, 1018222, 13416421]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[9654918, 868741, 12728994, 960919, 6463401]"
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[1103902, 12301188, 7167249, 13214182, 8068356]"
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[905072, 913863, 12692119, 1059969, 1005899]"


### Задание 2. Расчет метрик
Рассчитайте Precision@5 для каждого алгоритма с помощью функции из вебинара 1. Какой алгоритм показывает лучшее качество?

In [9]:
import json

In [10]:
# Load predictions_basic.csv, overwriting the `result` frame built in the earlier cells,
# and preview its first two rows.
result = pd.read_csv('predictions_basic.csv')
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases
0,1,[ 821867 834484 856942 865456 889248 ...,"[1788537, 8181325, 1802197, 12301586, 976199]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1098066, 995242, 981760, 1127831, 1082185]","[961554, 1098066, 1127831, 981760, 1082185]","[961554, 1098066, 1127831, 981760, 1082185]","[1081177, 995785, 1004906, 1082185, 1029743, 9..."
1,3,[ 835476 851057 872021 878302 879948 ...,"[839421, 12132508, 15574274, 9221006, 5577593]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1098066, 995242, 981760, 1127831, 1082185]","[961554, 1098066, 1127831, 981760, 1082185]","[961554, 1098066, 1127831, 981760, 1082185]","[1081177, 995785, 1004906, 1082185, 1029743, 9..."


In [11]:
# (number of rows, number of columns) of the loaded predictions.
result.shape

(2042, 8)

In [12]:
# Inspect the row labelled 0 to see what each column holds.
result.loc[0]

user_id                                                                   1
actual                    [  821867   834484   856942   865456   889248 ...
random_recommendation         [1788537, 8181325, 1802197, 12301586, 976199]
popular_recommendation        [6534178, 6533889, 1029743, 6534166, 1082185]
itemitem                        [1098066, 995242, 981760, 1127831, 1082185]
cosine                          [961554, 1098066, 1127831, 981760, 1082185]
tfidf                           [961554, 1098066, 1127831, 981760, 1082185]
own_purchases             [1081177, 995785, 1004906, 1082185, 1029743, 9...
Name: 0, dtype: object

In [13]:
# After the CSV round-trip `actual` is a string (whitespace-separated numbers in
# square brackets), not a list — it has to be parsed before computing metrics.
result.loc[0, 'actual']

'[  821867   834484   856942   865456   889248   907957   914190   943316\n   951954   954486   958046   962568   969231   971585   979707   986947\n   990656   995242  1004906  1005186  1042083  1050310  1060819  1062002\n  1064441  1069733  1074612  1082185  1131115  1132771  6534544 13876341\n 15971874 17178953   883616   917704   931860   961554  1002032  1031190\n  8090541  8293439  9297615  9527329 15926712  1049998   861272   869465\n   877373   908213   933913   940947   945809   959316   978974  1031697\n  1041796  1048918  1081189  1101422  1115576  1122428  1132231  1132814\n  5577022  8091601  9296986  9677939 10356149 13417048 15741823 15830875]'

In [14]:
def precision_at_k(recommended_list, bought_list, k=5):
    """Share of the top-``k`` recommendations that the user actually bought.

    Note: this definition shadows ``precision_at_k`` imported from
    ``implicit.evaluation`` at the top of the notebook.

    Parameters
    ----------
    recommended_list : sequence
        Recommended item ids, best first. Only the first ``k`` are scored.
    bought_list : sequence
        Item ids the user bought.
    k : int, default 5
        Cut-off for the recommendation list.

    Returns
    -------
    float
        Hits among the scored recommendations divided by the number of scored
        recommendations (which is smaller than ``k`` when fewer items were
        recommended). Returns 0.0 when there is nothing to score.
    """
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)[:k]

    # Empty recommendations (or k == 0) would otherwise produce 0 / 0 -> nan
    # and poison any mean taken over users.
    if len(recommended_list) == 0:
        return 0.0

    flags = np.isin(recommended_list, bought_list)

    return flags.sum() / len(recommended_list)

In [15]:
# List the columns currently present in `result`.
result.columns

Index(['user_id', 'actual', 'random_recommendation', 'popular_recommendation',
       'itemitem', 'cosine', 'tfidf', 'own_purchases'],
      dtype='object')

In [16]:
# Keep the ground-truth column aside and leave only the recommendation columns
# in `result`, so that the frame can be iterated column by column.
actual = result['actual']
result = result.drop(columns=['user_id', 'actual'])
result.columns

Index(['random_recommendation', 'popular_recommendation', 'itemitem', 'cosine',
       'tfidf', 'own_purchases'],
      dtype='object')

In [17]:
%%time

# Parse every user's purchases once: `actual` holds strings of whitespace-separated
# numbers in square brackets, and the parsed arrays are the same for every algorithm.
actual_items = [
    np.fromstring(raw.replace('[', '').replace(']', ''), dtype=int, sep=' ')
    for raw in actual
]

# Mean Precision@5 over all users, reported separately for each recommendation column.
for column in result.columns:
    # Recommendations are stored as JSON list strings; iterate the column directly
    # instead of looking each cell up through `result.loc[i][column]`.
    precision = [
        precision_at_k(json.loads(recs), bought, k=5)
        for recs, bought in zip(result[column], actual_items)
    ]

    print('{}: {}'.format(column, np.array(precision).mean()))

random_recommendation: 0.00019588638589618023
popular_recommendation: 0.15523996082272282
itemitem: 0.14573947110675808
cosine: 0.13545543584720862
tfidf: 0.13545543584720862
own_purchases: 0.16229187071498533
CPU times: user 1.74 s, sys: 9.16 ms, total: 1.75 s
Wall time: 1.75 s


### Задание 3*. Улучшение бейзлайнов и ItemItem

- Попробуйте улучшить бейзлайны, считая их на топ-5000 товаров
- Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей $K$.

In [18]:
# your_code