In [59]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

In [60]:
# Load the retail transactions. The path is relative, so the CSV is looked up
# in the kernel's current working directory.
data = pd.read_csv('retail_train.csv')
# Preview the first two rows to check the available columns.
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [186]:
# Load the precomputed recommendations (relative path, resolved against the
# kernel's working directory). NOTE(review): presumably produced by an earlier
# notebook of the same course - confirm the source of this file.
predict = pd.read_csv('predictions_basic.csv')

In [179]:
# Probe the first character of one stored recommendation. It is '[', which
# shows the lists were read from the CSV as plain strings and still need to be
# parsed into arrays.
predict['tfidf'][2][0]

'['

Преобразование колонок df

In [187]:
def _parse_int_array(value, sep):
    """Parse a bracketed string such as '[1, 2, 3]' into a 1-D integer array.

    Values that are already NumPy arrays are returned unchanged, so this cell
    can be re-run without failing on the columns it has already converted.
    """
    if isinstance(value, np.ndarray):
        return value
    # Drop the surrounding brackets, then let NumPy split on the separator.
    return np.fromstring(value[1:-1], sep=sep, dtype=int)


# `actual` is parsed with a whitespace separator, the recommendation columns
# with a comma separator.
predict['actual'] = predict['actual'].apply(lambda x: _parse_int_array(x, sep=' '))
for name_col in predict.columns[2:7]:
    print(name_col)
    predict[name_col] = predict[name_col].apply(lambda x: _parse_int_array(x, sep=','))

random_recommendation
popular_recommendation
itemitem
cosine
tfidf


In [188]:
# Inspect column names, non-null counts and dtypes after the conversion above.
predict.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2042 entries, 0 to 2041
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   user_id                 2042 non-null   int64 
 1   actual                  2042 non-null   object
 2   random_recommendation   2042 non-null   object
 3   popular_recommendation  2042 non-null   object
 4   itemitem                2042 non-null   object
 5   cosine                  2042 non-null   object
 6   tfidf                   2042 non-null   object
dtypes: int64(1), object(6)
memory usage: 111.8+ KB


In [189]:
# Time-based hold-out: the most recent weeks form the test part.
# NOTE: the boundary week (max week - test_size_weeks) goes to the test part,
# so the test window covers up to test_size_weeks + 1 distinct week numbers.
test_size_weeks = 3

split_week = data['week_no'].max() - test_size_weeks
in_train = data['week_no'] < split_week
in_test = data['week_no'] >= split_week

data_train = data[in_train]
data_test = data[in_test]

data_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2278490 entries, 0 to 2282324
Data columns (total 12 columns):
 #   Column             Dtype  
---  ------             -----  
 0   user_id            int64  
 1   basket_id          int64  
 2   day                int64  
 3   item_id            int64  
 4   quantity           int64  
 5   sales_value        float64
 6   store_id           int64  
 7   retail_disc        float64
 8   trans_time         int64  
 9   week_no            int64  
 10  coupon_disc        float64
 11  coupon_match_disc  float64
dtypes: float64(4), int64(8)
memory usage: 226.0 MB


### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекомендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямо пропорционально популярности. Вес = log(sales_sum товара)

In [190]:
def weighted_random_recommendation(items_weght, n=5, random_state=None):
    """Sample recommendations at random, proportionally to per-item weights.

    Parameters
    ----------
    items_weght : pd.DataFrame
        Frame with columns ``item_id`` and ``weight``. Weights must be finite
        and non-negative. They are re-normalised here, so they do not have to
        sum to exactly 1.
    n : int, default 5
        Number of distinct items to draw. If fewer items have a non-zero
        weight, only that many are returned.
    random_state : int, np.random.Generator or None, default None
        Seed (or generator) for a reproducible draw. With ``None`` the global
        ``np.random`` state is used, so ``np.random.seed`` keeps working.

    Returns
    -------
    list
        Sampled item ids, without repetition. Empty when ``n <= 0`` or when
        there are no items.

    Raises
    ------
    ValueError
        If a weight is negative or not finite, or if all weights are zero.
    """
    item_ids = items_weght['item_id'].to_numpy()
    weights = items_weght['weight'].to_numpy(dtype=float)

    if n <= 0 or item_ids.size == 0:
        return []

    if not np.all(np.isfinite(weights)) or np.any(weights < 0):
        raise ValueError("weights must be finite and non-negative")

    total = weights.sum()
    if total <= 0:
        raise ValueError("at least one weight must be positive")

    # Normalise defensively: the sampler rejects probabilities that do not
    # sum to 1.
    proba = weights / total

    # Sampling without replacement cannot return more items than have a
    # non-zero probability, so cap the request instead of raising.
    size = min(int(n), int(np.count_nonzero(proba)))

    rng = np.random if random_state is None else np.random.default_rng(random_state)
    recs = rng.choice(item_ids, size=size, replace=False, p=proba)

    return recs.tolist()

In [191]:
# Preview the first five rows of the training part.
data_train.head(5)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


In [192]:
def get_items_weght(df):
    """Compute per-item sampling weights proportional to log(total sales).

    Parameters
    ----------
    df : pd.DataFrame
        Frame with at least the columns ``item_id`` and ``sales_value``.
        It is not modified.

    Returns
    -------
    pd.DataFrame
        Columns ``item_id`` and ``weight``. Weights are positive and sum to 1.
        Items whose summed ``sales_value`` does not exceed 1 are left out:
        their logarithm would be zero or negative and could not be used as a
        sampling probability.
    """
    sales_by_item = df.groupby('item_id')['sales_value'].sum().reset_index()

    # Only sums above 1 are kept, so the logarithm is always defined and
    # strictly positive - no epsilon is needed.
    kept = sales_by_item[sales_by_item['sales_value'] > 1]
    log_sales = np.log(kept['sales_value'])

    # `assign` / `drop` build new frames, so nothing is written into a
    # filtered slice and no in-place mutation takes place.
    return kept.assign(weight=log_sales / log_sales.sum()).drop(columns='sales_value')

In [193]:
%%time

# your_code
items_weght = get_items_weght(data_train)
predict['weighted_random_recommendation'] = predict['user_id'].apply(lambda x: weighted_random_recommendation(items_weght, n=5))

CPU times: user 6.76 s, sys: 0 ns, total: 6.76 s
Wall time: 6.75 s


In [196]:
# Preview the first five rows, including the newly added
# `weighted_random_recommendation` column.
predict.head(5)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,weighted_random_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1073639, 12427738, 12757258, 63622, 476598]","[6534178, 6533889, 1029743, 6534166, 1082185]","[6666, 1082185, 981760, 1127831, 995242]","[1082185, 6666, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 6666, 1098066]","[902260, 1033168, 1021225, 855335, 7465730]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[869577, 12949920, 1033602, 7407147, 12647938]","[6534178, 6533889, 1029743, 6534166, 1082185]","[6666, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 6666, 826249]","[1082185, 981760, 1098066, 826249, 6666]","[851711, 12352258, 1103351, 1062002, 1085138]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[920881, 13671443, 968478, 5575712, 1133826]","[6534178, 6533889, 1029743, 6534166, 1082185]","[6666, 1082185, 981760, 1127831, 995242]","[1082185, 6666, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 6666, 878996]","[947983, 9297231, 975505, 1017446, 1013571]"
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[13986525, 15452833, 5650354, 993223, 9673490]","[6534178, 6533889, 1029743, 6534166, 1082185]","[6666, 1082185, 981760, 1127831, 995242]","[1082185, 981760, 6666, 1127831, 961554]","[1082185, 981760, 1127831, 6666, 961554]","[1007597, 10149728, 888543, 1008882, 12132582]"
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[12263614, 9446529, 1023082, 967144, 1071674]","[6534178, 6533889, 1029743, 6534166, 1082185]","[6666, 1082185, 981760, 1127831, 1098066]","[1082185, 981760, 6666, 1098066, 826249]","[1082185, 981760, 6666, 1098066, 826249]","[15801333, 205735, 2594056, 6981942, 8118757]"


### Задание 2. Расчет метрик
Рассчитайте Precision@5 для каждого алгоритма с помощью функции из вебинара 1. Какой алгоритм показывает лучшее качество?

In [197]:
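# your_code
# Reference implementation of Precision@k, kept commented out because
# `precision_at_k` is imported from the local `metrics` module in the next cell.
# def precision_at_k(recommended_list, bought_list, k=5):
# #   homework: implement at home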
#     bought_list = np.array(bought_list)
#     recommended_list = np.array(recommended_list[:k])
    
#     flags = np.isin(bought_list, recommended_list)
#     return flags.sum() / (recommended_list.size)

In [198]:
# NOTE: this rebinds `precision_at_k`. The function of the same name imported
# from implicit.evaluation in the first cell is no longer reachable under that
# name; from here on it refers to the local `metrics` module's version.
from metrics import precision_at_k

In [199]:
df_t = predict  # alias kept in case a later cell refers to df_t
k = 5  # cut-off for Precision@k

# Mean Precision@k for every column after `user_id`. The `actual` column is
# scored against itself as well.
# NOTE(review): assumes metrics.precision_at_k accepts `k` as a keyword, as in
# the commented-out reference implementation - confirm against metrics.py.
for name_col in df_t.columns[1:]:
    mean_precision = df_t.apply(
        lambda row: precision_at_k(row[name_col], row['actual'], k=k), axis=1
    ).mean()
    print(f"{round(mean_precision, 4)}:{name_col}")

1.0:actual
0.0004:random_recommendation
0.1552:popular_recommendation
0.1368:itemitem
0.1329:cosine
0.139:tfidf
0.0009:weighted_random_recommendation


### Задание 3*. Улучшение бейзлайнов и ItemItem

- Попробуйте улучшить бейзлайны, считая их на топ-5000 товаров
- Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей $K$.

In [74]:
# your_code