In [223]:
from collections import Counter, OrderedDict

class OrderedCounter(Counter, OrderedDict):
    """Counter whose keys keep the order in which they were first counted."""

    def __repr__(self):
        # Show the contents as an OrderedDict so the key order is visible.
        ordered_view = OrderedDict(self)
        return '{}({!r})'.format(type(self).__name__, ordered_view)

    def __reduce__(self):
        # Pickle as "rebuild this class from an ordered copy of the items".
        ordered_view = OrderedDict(self)
        return type(self), (ordered_view,)

In [224]:
import numpy as np  # needed for np.nan below; no numpy import is visible elsewhere in this notebook
import pandas as pd

# NOTE(review): `tqdm` is used by later cells but no import of it is visible
# in this part of the notebook — confirm it is imported in an earlier cell.

# The files are ';'-separated and have no header row, so pandas labels the
# columns 0 and 1. Later cells split column 0 into viewed item ids and
# column 1 into purchased item ids (both comma-separated).
train = pd.read_csv("train.txt", sep=';', header=None)
test = pd.read_csv("test.txt", sep=';', header=None)

test_case = pd.read_csv("test_case.txt", sep=';', header=None)
# The saved output of `test_case` shows column 1 as floats (2.0, 1.0, ...)
# with missing values. Convert it to integer strings ('2', '1', ...) so it
# can be handled like the purchase column of train/test; -1 is only a
# temporary placeholder that lets the NaNs survive the int conversion.
test_case[1] = test_case[1].fillna(-1).astype(int).astype(str).replace('-1', np.nan)

    Сессии, в которых пользователь ничего не купил, исключаем из оценки качества.
    Если товар не встречался в обучающей выборке, его популярность равна 0.
    Рекомендуем разные товары. И их число должно быть не больше, чем количество различных просмотренных пользователем товаров.
    Рекомендаций всегда не больше, чем минимум из двух чисел: количество просмотренных пользователем товаров и k в recall@k / precision@k.

In [225]:
# Sessions in which the user bought nothing are excluded from the quality
# evaluation. dropna() removes every row that contains a missing value; the
# row count is printed before and after to show how many sessions remain.
print(len(test))
test.dropna(inplace=True)
print(len(test))

50000
3665


In [226]:
# Global popularity counters read by the recommend_* functions:
# c_view counts item views, c_buy counts item purchases.
c_view, c_buy = OrderedCounter(), OrderedCounter()

In [227]:
# Popularity by views: count every occurrence of an item in the viewed
# column (column 0) of the training sessions.
for viewed_items in train[0].str.split(','):
    for item_id in viewed_items:
        c_view[item_id] += 1

# An item that never occurred in the training views has popularity 0;
# register it explicitly so it is present as a key in c_view.
for viewed_items in test[0].str.split(','):
    for item_id in viewed_items:
        if item_id in c_view:
            continue
        c_view[item_id] = 0

# Popularity by purchases: count items in the purchased column (column 1),
# ignoring sessions where that column is missing.
for bought_items in train[1].dropna().str.split(','):
    if not bought_items:
        continue
    for item_id in bought_items:
        c_buy[item_id] += 1

In [228]:
def _top_k_by_weight(items, weights, k):
    """Return up to ``k`` distinct entries of ``items``, heaviest first.

    Parameters
    ----------
    items : list
        Item ids of one session; may contain repeats.
    weights : mapping
        Gives the weight of an item via ``weights[item]``.
    k : int
        Maximum number of items to return.

    Returns
    -------
    list
        Distinct items ordered by descending weight. Items with equal weight
        keep the order in which they first appear in ``items``.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # stable sort below breaks ties by position in the session.
    distinct = list(dict.fromkeys(items))
    ranked = sorted(distinct, key=lambda item: weights[item], reverse=True)
    # Never recommend more items than there are distinct items in the session.
    return ranked[:min(k, len(distinct))]


def recommend_most_viewed(items, k):
    """Recommend up to ``k`` of the session's items, most viewed first.

    Popularity is read from the global ``c_view`` counter at call time, so
    rebinding ``c_view`` changes the ranking.

    Parameters
    ----------
    items : list
        Item ids viewed in the session.
    k : int
        Maximum number of recommendations.

    Returns
    -------
    list
        At most ``min(k, number of distinct items)`` item ids.
    """
    assert isinstance(items, list)
    return _top_k_by_weight(items, c_view, k)


def recommend_most_buy(items, k):
    """Recommend up to ``k`` of the session's items, most purchased first.

    Popularity is read from the global ``c_buy`` counter at call time, so
    rebinding ``c_buy`` changes the ranking.

    Parameters
    ----------
    items : list
        Item ids viewed in the session.
    k : int
        Maximum number of recommendations.

    Returns
    -------
    list
        At most ``min(k, number of distinct items)`` item ids.
    """
    assert isinstance(items, list)
    return _top_k_by_weight(items, c_buy, k)

In [229]:
def precision_at_k(buy, recommend, k):
    """Fraction of the ``k`` recommendation slots that hit a purchased item.

    Parameters
    ----------
    buy : iterable
        Purchased item ids.
    recommend : iterable
        Recommended item ids; every entry found in ``buy`` counts as a hit.
    k : int
        Denominator of the metric (the requested list length, not
        ``len(recommend)``).

    Returns
    -------
    float
        Number of hits divided by ``k``.
    """
    purchased = set(buy)
    hits = sum(1 for item in recommend if item in purchased)
    return hits / k

def recall_at_k(buy, recommend, k):
    """Number of recommendation hits relative to the distinct purchased items.

    Parameters
    ----------
    buy : iterable
        Purchased item ids; duplicates are collapsed.
    recommend : iterable
        Recommended item ids; every entry found in ``buy`` counts as a hit.
    k : int
        Accepted for a signature parallel to ``precision_at_k``; the value
        is not used, the denominator is the number of distinct purchases.

    Returns
    -------
    float
        Number of hits divided by the number of distinct purchased items.
    """
    purchased = set(buy)
    hits = len([item for item in recommend if item in purchased])
    return hits / len(purchased)

def predict(recommender, data, k):
    """Run ``recommender`` on every session of ``data``.

    Parameters
    ----------
    recommender : callable
        Called as ``recommender(viewed_items, k)`` with the list obtained by
        splitting column 0 of a row on ','.
    data : pandas.DataFrame
        Frame whose column 0 holds comma-separated item ids.
    k : int
        Passed through to ``recommender``.

    Returns
    -------
    list
        One recommender result per row, in row order.
    """
    sessions = data.loc[:, 0].str.split(',').values
    return [recommender(viewed, k) for viewed in tqdm(sessions)]


def calc_metric(metric, data, preds, k, return_seq=False):
    """Evaluate ``metric`` for every session and optionally average it.

    Parameters
    ----------
    metric : callable
        Called as ``metric(purchased_items, recommended_items, k)``.
    data : pandas.DataFrame
        Frame whose column 1 holds comma-separated purchased item ids.
    preds : list
        Recommendations, paired with the rows of ``data`` in order.
    k : int
        Passed through to ``metric``.
    return_seq : bool, default False
        If true, return the per-session values instead of their average.

    Returns
    -------
    list or float
        Per-session metric values, or their sum divided by the number of
        rows in ``data``, rounded to 2 decimals.
    """
    purchases = data.loc[:, 1].str.split(',').values
    scores = [metric(bought, recommended, k)
              for recommended, bought in zip(preds, purchases)]
    if return_seq:
        return scores

    # Accumulate with plain += to keep the same left-to-right float summation.
    total = 0
    for score in scores:
        total += score
    return round(total / data.shape[0], 2)


def calc_all_metrics(target, preds):
    """Placeholder that is not implemented yet; does nothing and returns None."""
    return None

In [230]:
# Keep only the training rows without missing values, so the metrics below
# are computed on sessions that have a purchase. Going by the execution
# counts, the popularity counters were filled from the unfiltered frame
# before this cell ran.
train.dropna(inplace=True)
# Display the remaining (rows, columns).
train.shape

(3608, 2)

In [231]:
# Evaluate every recommender on every dataset and write one metrics file per
# (recommender, dataset) pair. file_names is listed in the same order in
# which the nested loops below visit the pairs.
# NOTE(review): 'recommend_most_viewed_train_test.txt' breaks the naming
# pattern of the other three files — confirm the name is intended before
# changing it, since it determines the produced file.
_recommender = [recommend_most_viewed, recommend_most_buy]
_data = [train, test]
file_names = ['recommend_most_viewed_train.txt', 
              'recommend_most_viewed_train_test.txt',
              'recommend_most_buy_train.txt',
              'recommend_most_buy_test.txt']
i = 0  # index of the output file for the current (recommender, dataset) pair
for recommender in tqdm(_recommender):
    for data in _data:
        preds = predict(recommender, data, 1)
        AveragePrecision_at1 = calc_metric(precision_at_k, data, preds, 1)
        AverageRecall_at1 = calc_metric(recall_at_k, data, preds, 1)

        preds = predict(recommender, data, 5)
        AveragePrecision_at5 = calc_metric(precision_at_k, data, preds, 5)
        AverageRecall_at5 = calc_metric(recall_at_k, data, preds, 5)

        # File content: "recall@1 precision@1 recall@5 precision@5", no
        # trailing newline. The context manager closes (and flushes) the
        # file immediately instead of leaving that to garbage collection.
        with open(file_names[i], 'w') as out_file:
            print(f"{AverageRecall_at1} {AveragePrecision_at1} {AverageRecall_at5} {AveragePrecision_at5}", end='', file=out_file)
        i += 1

  0%|          | 0/2 [00:00<?, ?it/s]
100%|██████████| 3608/3608 [00:00<00:00, 63607.49it/s]

100%|██████████| 3608/3608 [00:00<00:00, 66829.40it/s]

100%|██████████| 3665/3665 [00:00<00:00, 64653.14it/s]

100%|██████████| 3665/3665 [00:00<00:00, 66778.42it/s]
 50%|█████     | 1/2 [00:00<00:00,  2.83it/s]
100%|██████████| 3608/3608 [00:00<00:00, 69576.60it/s]

100%|██████████| 3608/3608 [00:00<00:00, 66458.13it/s]

100%|██████████| 3665/3665 [00:00<00:00, 66410.58it/s]

100%|██████████| 3665/3665 [00:00<00:00, 70384.86it/s]
100%|██████████| 2/2 [00:00<00:00,  3.21it/s]


In [232]:
# Print the metrics file written for recommend_most_viewed on the training
# data; the values are recall@1, precision@1, recall@5, precision@5.
!cat recommend_most_viewed_train.txt

0.44 0.51 0.82 0.21

# Check algorithm on the test case

https://www.coursera.org/learn/data-analysis-applications/discussions/weeks/4/threads/DNfz8-c5EeaLPwr1rOSl0g

In [219]:
# Rebuild the popularity counters from the small hand-checked test case.
# NOTE: this rebinds the global c_view / c_buy that the recommend_*
# functions read, so the counts built from train/test are replaced once this
# cell has run.
c_view = OrderedCounter()
c_buy = OrderedCounter()

# Views: count every item id in the comma-separated column 0.
for viewed_items in test_case[0].str.split(',').values:
    for item_id in viewed_items:
        c_view[item_id] += 1

# Purchases: split column 1 on ',' as is done for the training data.
# Iterating over the raw string instead would count single characters
# (including the commas) for any multi-item purchase.
for bought_items in test_case[1].dropna().str.split(',').values:
    for item_id in bought_items:
        c_buy[item_id] += 1

In [220]:
# Display the test-case sessions (column 0 is split into viewed items,
# column 1 into purchased items by the cells around this one).
test_case

Unnamed: 0,0,1
0,423,
1,324,
2,423,2.0
3,324,2.0
4,444,1.0
5,23235,5.0


In [221]:
# Display the view counts collected from the test case.
c_view

OrderedCounter(OrderedDict([('4', 7), ('2', 6), ('3', 6), ('5', 1)]))

In [222]:
# Evaluate both recommenders on the hand-checked test case for k = 1 and
# k = 2 and print the averaged metrics (nothing is written to disk here).
_recommender = [recommend_most_viewed, recommend_most_buy]
# The metrics split column 1 into purchased items, so rows without a
# purchase are removed. This is done in place; the cells that follow appear
# to use the filtered frame.
test_case.dropna(inplace=True)
_data = [test_case]

for recommender in tqdm(_recommender):
    for data in _data:
        preds = predict(recommender, data, 1)
        AveragePrecision_at1 = calc_metric(precision_at_k, data, preds, 1)
        AverageRecall_at1 = calc_metric(recall_at_k, data, preds, 1)

        preds = predict(recommender, data, 2)
        AveragePrecision_at2 = calc_metric(precision_at_k, data, preds, 2)
        AverageRecall_at2 = calc_metric(recall_at_k, data, preds, 2)

        # Printed order: precision@1 precision@2 recall@1 recall@2.
        print(f"{AveragePrecision_at1} {AveragePrecision_at2} {AverageRecall_at1} {AverageRecall_at2} ")

  0%|          | 0/2 [00:00<?, ?it/s]
100%|██████████| 4/4 [00:00<00:00, 19152.07it/s]

100%|██████████| 4/4 [00:00<00:00, 24600.02it/s]

100%|██████████| 4/4 [00:00<00:00, 20262.34it/s]

100%|██████████| 4/4 [00:00<00:00, 16810.84it/s]
100%|██████████| 2/2 [00:00<00:00, 126.28it/s]

0.0 0.12 0.0 0.25 
0.5 0.38 0.5 0.75 





In [162]:
# Display the test-case frame; the saved output below contains only the
# sessions that have a purchase.
test_case

Unnamed: 0,0,1
2,423,2
3,324,2
4,444,1
5,23235,5


In [190]:
# Top-1 recommendations by view count for every test-case session ...
preds = predict(recommend_most_viewed, test_case, 1)
# ... and the per-session precision@1 values (return_seq=True skips the averaging).
calc_metric(precision_at_k, test_case, preds, 1, return_seq=True)

100%|██████████| 4/4 [00:00<00:00, 11732.32it/s]


[0.0, 0.0, 0.0, 0.0]

In [191]:
# Display the recommendations produced by the most recent predict() call.
preds

[['4'], ['4'], ['4'], ['2']]

In [188]:
# Top-2 recommendations by view count for every test-case session ...
preds = predict(recommend_most_viewed, test_case, 2)
# ... and the per-session precision@2 values (return_seq=True skips the averaging).
calc_metric(precision_at_k, test_case, preds, 2, return_seq=True)

100%|██████████| 4/4 [00:00<00:00, 11096.04it/s]


[0.5, 0.0, 0.0, 0.0]

In [189]:
# Display the recommendations produced by the most recent predict() call.
preds

[['4', '2'], ['4', '3'], ['4'], ['2', '3']]