In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

In [4]:
data = pd.read_csv('D:/КУРСЫ/GeekBrains/Рекомендательные системы/webinar_2/retail_train.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [5]:
users, items, interactions = data.user_id.nunique(), data.item_id.nunique(), data.shape[0]

print('# users: ', users)
print('# items: ', items)
print('# interactions: ', interactions)

# users:  2499
# items:  89051
# interactions:  2396804


In [6]:
popularity = data.groupby('item_id')['sales_value'].sum().reset_index()
popularity.describe()

Unnamed: 0,item_id,sales_value
count,89051.0,89051.0
mean,5115772.0,83.458481
std,5178973.0,1628.715079
min,25671.0,0.0
25%,966583.0,3.5
50%,1448516.0,10.78
75%,9553042.0,46.105
max,18024560.0,467993.62


In [7]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

In [8]:
data_train.shape[0], data_test.shape[0]

(2278490, 118314)

In [9]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [10]:
test_users = result.shape[0]
new_test_users = len(set(data_test['user_id']) - set(data_train['user_id']))

print('В тестовом дата сете {} юзеров'.format(test_users))
print('В тестовом дата сете {} новых юзеров'.format(new_test_users))

В тестовом дата сете 2042 юзеров
В тестовом дата сете 0 новых юзеров


### 1.1 Random recommendation

In [48]:
popular = data.groupby('item_id')['sales_value'].sum().reset_index()
popular

Unnamed: 0,item_id,sales_value
0,25671,20.94
1,26081,0.99
2,26093,1.59
3,26190,1.54
4,26355,1.98
...,...,...
89046,17991689,2.49
89047,17991691,2.49
89048,18000012,19.96
89049,18024155,3.99


In [49]:
popular['sales_value'] = popular['sales_value'] / popular['sales_value'].sum()
popular.rename(columns = {'sales_value':'weight'}, inplace = True )
popular

Unnamed: 0,item_id,weight
0,25671,2.817523e-06
1,26081,1.332067e-07
2,26093,2.139380e-07
3,26190,2.072103e-07
4,26355,2.664133e-07
...,...,...
89046,17991689,3.350349e-07
89047,17991691,3.350349e-07
89048,18000012,2.685661e-06
89049,18024155,5.368632e-07


In [51]:
#def random_recommendation(items, n=5):
#    """Случайные рекоммендации"""
    
#    items = np.array(items['item_id'])
#    weights = np.array(items['weight'])
#    recs = np.random.choice(items, size=n, replace=False, p= weights)
    
#    return recs.tolist()

In [56]:
#random_recommendation(popular, 5) -- Почему то при вызове функции, программа "падает" на строчке weights = np.array(items['weight']),
# а если просто код писать то всё нормально

In [57]:
weights = np.array(popular['weight'])
items = np.array(popular['item_id'])

In [58]:
np.random.choice(items, size=5, replace=False, p= weights).tolist()

[6534178, 12262778, 10204488, 1092311, 1095964]

### Задание 2. Расчет метрик

In [4]:
result = pd.read_csv('D:/КУРСЫ/GeekBrains/Рекомендательные системы/webinar_2/predictions_basic.csv')
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases
0,1,[ 821867 834484 856942 865456 889248 ...,"[5586238, 1015228, 866118, 2416733, 2603573]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 1127831, 1098066, 826249, 878996]","[981760, 1127831, 1098066, 878996, 826249]","[981760, 1127831, 1098066, 826249, 878996]","[999999, 1082185, 1029743, 995785, 1004906]"
1,3,[ 835476 851057 872021 878302 879948 ...,"[161354, 63027, 1027802, 12263694, 307395]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 995242, 1029743, 840361, 961554]","[981760, 1004906, 961554, 1096036, 1080414]","[981760, 1004906, 859075, 1096036, 961554]","[999999, 1082185, 1098066, 6534178, 1127831]"


In [76]:
result.count()

user_id                   2042
actual                    2042
random_recommendation     2042
popular_recommendation    2042
itemitem                  2042
cosine                    2042
tfidf                     2042
own_purchases             2042
dtype: int64

In [24]:
def precision_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    bought_list = bought_list
    recommended_list = recommended_list[:k]
    
    flags = np.isin(bought_list, recommended_list)
    
    precision = flags.sum() / len(recommended_list)
    
    
    return precision

In [14]:
def precision(recommended_list, bought_list):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    flags = np.isin(bought_list, recommended_list)
    
    precision = flags.sum() / len(recommended_list)
    
    return precision

In [13]:
rs = result['random_recommendation'].values[0]
rs2 = list(map(int, rs.replace('[', '').replace(']', '').replace(' ', '').split(',')))
rs2

[5586238, 1015228, 866118, 2416733, 2603573]

In [74]:
pr_ran_rec_fin = 0
or i in range(2042):
    pr_ran_rec = precision(result['random_recommendation'], result['own_purchases'])
    pr_ran_rec_fin = pr_ran_rec_fin + pr_ran_rec

0         [5586238, 1015228, 866118, 2416733, 2603573]
1           [161354, 63027, 1027802, 12263694, 307395]
2         [13416054, 936084, 7410040, 9527114, 377218]
3            [5574336, 990072, 868548, 995880, 842226]
4           [1277401, 94446, 3133282, 1925252, 855699]
                             ...                      
2037       [932962, 845876, 1578206, 13159156, 175031]
2038      [5726138, 15717067, 908287, 915356, 5558852]
2039    [9836300, 12326063, 7414863, 1730240, 7441652]
2040       [849152, 952122, 906965, 16124540, 1008041]
2041        [862358, 81611, 7466803, 1102999, 9855981]
Name: random_recommendation, Length: 2042, dtype: object

In [75]:
result['own_purchases']

0        [999999, 1082185, 1029743, 995785, 1004906]
1       [999999, 1082185, 1098066, 6534178, 1127831]
2       [999999, 1082185, 1029743, 6534178, 1127831]
3        [999999, 1082185, 1029743, 1127831, 995785]
4       [999999, 1082185, 1029743, 1098066, 6534178]
                            ...                     
2037     [999999, 1082185, 1098066, 6534178, 995785]
2038    [999999, 1082185, 1029743, 1098066, 6534178]
2039    [999999, 1082185, 1029743, 1068719, 1106523]
2040    [999999, 1082185, 1098066, 6534178, 1004906]
2041    [999999, 1082185, 1029743, 1098066, 6534178]
Name: own_purchases, Length: 2042, dtype: object

Лучший результат у popular_recommendation: 792

In [25]:
pr_fin = 0
for i in range(2042):
    rs = result['popular_recommendation'].values[i]
    recommended_list = list(map(int, rs.replace('[', '').replace(']', '').replace(' ', '').split(',')))
    rs = result['own_purchases'].values[i]
    bought_list = list(map(int, rs.replace('[', '').replace(']', '').replace(' ', '').split(',')))
    pr_fin = pr_fin + precision_at_k(recommended_list, bought_list, k=5)
pr_fin

792.000000000002

In [20]:
pr_fin = 0
for i in range(2042):
    rs = result['random_recommendation'].values[i]
    recommended_list = list(map(int, rs.replace('[', '').replace(']', '').replace(' ', '').split(',')))
    rs = result['own_purchases'].values[i]
    bought_list = list(map(int, rs.replace('[', '').replace(']', '').replace(' ', '').split(',')))
    pr_fin = pr_fin + precision_at_k(recommended_list, bought_list, k=5)
pr_fin

0.4

In [26]:
pr_fin = 0
for i in range(2042):
    rs = result['itemitem'].values[i]
    recommended_list = list(map(int, rs.replace('[', '').replace(']', '').replace(' ', '').split(',')))
    rs = result['own_purchases'].values[i]
    bought_list = list(map(int, rs.replace('[', '').replace(']', '').replace(' ', '').split(',')))
    pr_fin = pr_fin + precision_at_k(recommended_list, bought_list, k=5)
pr_fin

1.2

In [27]:
pr_fin = 0
for i in range(2042):
    rs = result['cosine'].values[i]
    recommended_list = list(map(int, rs.replace('[', '').replace(']', '').replace(' ', '').split(',')))
    rs = result['own_purchases'].values[i]
    bought_list = list(map(int, rs.replace('[', '').replace(']', '').replace(' ', '').split(',')))
    pr_fin = pr_fin + precision_at_k(recommended_list, bought_list, k=5)
pr_fin

1.0

In [28]:
pr_fin = 0
for i in range(2042):
    rs = result['tfidf'].values[i]
    recommended_list = list(map(int, rs.replace('[', '').replace(']', '').replace(' ', '').split(',')))
    rs = result['own_purchases'].values[i]
    bought_list = list(map(int, rs.replace('[', '').replace(']', '').replace(' ', '').split(',')))
    pr_fin = pr_fin + precision_at_k(recommended_list, bought_list, k=5)
pr_fin

1.0