In [3]:
# Switch the working directory to the course folder under /content/drive and list its files,
# so the relative data file names used by later cells resolve.
%cd '/content/drive/MyDrive/Colab Notebooks/courses/stepik/m5. tasks/w4. recommender'
!ls

/content/drive/MyDrive/Colab Notebooks/courses/stepik/m5. tasks/w4. recommender
'4.3 recommender.ipynb'       coursera_sessions_train.txt
 coursera_sessions_test.txt   github.ipynb


In [4]:
def merge(left_list, left_length, right_list, right_length, key, descending):
    """Merge two already ordered lists into one ordered list.

    Only the first ``left_length`` items of ``left_list`` and the first
    ``right_length`` items of ``right_list`` take part in the merge.

    ``key`` maps an item to the value it is ordered by, and ``descending``
    selects largest-first order instead of smallest-first. When two keys
    compare equal, the item from ``left_list`` is emitted first, which keeps
    the merge stable.
    """
    merged = []
    i = 0
    j = 0
    while i < left_length and j < right_length:
        left_item = left_list[i]
        right_item = right_list[j]
        right_key = key(right_item)
        left_key = key(left_item)
        # The right item wins only on a strict comparison; ties go to the left.
        if descending:
            take_right = right_key > left_key
        else:
            take_right = right_key < left_key
        if take_right:
            merged.append(right_item)
            j += 1
        else:
            merged.append(left_item)
            i += 1
    # The loop stops once one side is exhausted; append what remains of the other.
    if i == left_length:
        remainder = right_list[j:right_length]
    else:
        remainder = left_list[i:left_length]
    return merged + remainder

def merge_sort(data, key, descending=False):
    """Return the items of ``data`` ordered by ``key`` using a recursive merge sort.

    ``key`` maps an item to the value it is ordered by; ``descending=True``
    puts the largest keys first. Inputs with fewer than two items are
    returned as-is (the very same object, not a copy).
    """
    size = len(data)
    if size <= 1:
        return data

    # Sort each half independently, then merge the two ordered halves.
    middle = size // 2
    left_sorted = merge_sort(data[:middle], key, descending)
    right_sorted = merge_sort(data[middle:], key, descending)
    return merge(left_sorted, middle, right_sorted, size - middle, key, descending)

In [14]:
import os
import sys

import numpy as np
import pandas as pd
# Each session file is ';'-separated: viewed ids first, purchased ids second.
# The separator is passed by keyword because pandas 2.x accepts only the path
# positionally in read_csv.
# NOTE(review): header=0 together with `names` makes pandas consume the first
# line of the file as a header and replace it, so that line is not loaded as a
# session. The train frame has 49999 rows - confirm the files really start
# with a header line.
data_train = pd.read_csv(
    "coursera_sessions_train.txt",
    sep=";",
    header=0,
    names=["viewed", "bought"])
data_train.head()
#%%
# Rows with a missing value in either column are dropped from the test set.
data_test = pd.read_csv(
    "coursera_sessions_test.txt",
    sep=";",
    header=0,
    names=["viewed", "bought"]).dropna(axis=0, how="any")
data_test.head()
# Only this last expression is rendered when everything runs as a single cell.
data_train.shape

(49999, 2)

In [None]:
def parse_session_column(column):
    """Turn a comma-separated string of ids into a list of ints, keeping order."""
    return list(map(int, column.split(",")))


def parse_session(views, buys):
    """Parse one session row into a ``(viewed_ids, bought_ids)`` pair of int lists.

    ``views`` must be a comma-separated string. ``buys`` is parsed the same
    way when it is a string; any other value yields an empty purchase list
    (presumably the NaN of a session without purchases - verify against the data).
    """
    viewed_ids = parse_session_column(views)
    if isinstance(buys, str):
        bought_ids = parse_session_column(buys)
    else:
        bought_ids = []
    return (viewed_ids, bought_ids)
def update_frequencies_count(keys, frequencies):
    """Count every item of ``keys`` into the ``frequencies`` dictionary, in place.

    An item seen for the first time starts at 1; repeated items are counted
    once per occurrence. Nothing is returned.
    """
    for item in keys:
        frequencies[item] = frequencies.get(item, 0) + 1


def build_data_frequencies(data):
    """Count how often each product id occurs in views and in purchases.

    ``data`` must expose ``.values`` yielding ``(viewed, bought)`` rows.
    Returns a ``(view_counts, buy_counts)`` pair of dictionaries mapping a
    product id to its number of occurrences across all rows.
    """
    viewed_counts, bought_counts = {}, {}
    for viewed_cell, bought_cell in data.values:
        viewed_ids, bought_ids = parse_session(viewed_cell, bought_cell)
        update_frequencies_count(viewed_ids, viewed_counts)
        update_frequencies_count(bought_ids, bought_counts)
    return viewed_counts, bought_counts
#%%
# Frequencies are learned from the training sessions only.
view_frequencies, buy_frequencies = build_data_frequencies(data_train)
# Preview a handful of entries; echoing the whole dictionary floods the cell output.
dict(list(view_frequencies.items())[:10])


In [17]:
# 2. Реализуйте два алгоритма рекомендаций:
#    сортировка просмотренных id по популярности (частота появления в просмотренных),
#    сортировка просмотренных id по покупаемости (частота появления в покупках).
#%%
def popularity(view, frequencies):
    """Return the frequency recorded for ``view`` as a float, or 0.0 when it is unknown."""
    count = frequencies.get(view, 0)
    return float(count)

def unique_values(data, key):
    """Return the items of ``data`` whose ``key`` value has not been seen before.

    The first item for every distinct key is kept and the original order is
    preserved. Key values must be hashable.
    """
    seen = set()
    kept = []
    for item in data:
        marker = key(item)
        if marker in seen:
            continue
        seen.add(marker)
        kept.append(item)
    return kept

def build_recommendations(views, frequencies, k):
    """Recommend up to ``k`` distinct viewed items, most frequent first.

    Parameters:
        views: item ids viewed in the session, in viewing order (may repeat).
        frequencies: dictionary mapping an item id to its frequency; ids that
            are missing count as 0.
        k: maximum number of recommendations to return.

    Returns:
        A list of at most ``k`` distinct item ids taken from ``views``,
        ordered by descending frequency.
    """
    views_popularity = [(view, popularity(view, frequencies)) for view in views]
    sorted_views = merge_sort(views_popularity, lambda v: v[1], descending=True)
    # Repeated views carry the same popularity, so keep only the first of each id.
    unique_views = unique_values(sorted_views, lambda v: v[0])
    return [view for view, _ in unique_views[:k]]

def precision(buys, recommendations, k):
    """Return precision@k: the number of recommended items that were bought, divided by ``k``.

    The denominator is always ``k``, even when fewer than ``k`` items were recommended.
    """
    hits = 0
    for item in recommendations:
        if item in buys:
            hits += 1
    return float(hits) / float(k)

def recall(buys, recommendations):
    """Return recall: the share of bought items that appear among the recommendations.

    Parameters:
        buys: item ids purchased in the session.
        recommendations: recommended item ids.

    Returns:
        The number of recommended items found in ``buys`` divided by
        ``len(buys)``, as a float. Returns 0.0 when ``buys`` is empty, since
        there is nothing to recall and the ratio would divide by zero.
    """
    # Guard on the denominator itself; an empty recommendation list simply gives 0 hits.
    if len(buys) == 0:
        return 0.
    hits = sum(1 for recommendation in recommendations if recommendation in buys)
    return float(hits) / float(len(buys))

def estimate_model(data, model, k):
    """Average recall and precision of ``model`` over every session in ``data``.

    ``data`` must expose ``.values`` yielding ``(viewed, bought)`` rows and
    support ``len()``. ``model`` is called as ``model(views, k)`` and must
    return the recommended item ids.

    Returns a ``(mean_recall, mean_precision)`` tuple - recall comes first.
    """
    total_recall = 0.
    total_precision = 0.
    for viewed_cell, bought_cell in data.values:
        viewed_ids, bought_ids = parse_session(viewed_cell, bought_cell)
        recommended = model(viewed_ids, k)
        total_precision += precision(bought_ids, recommended, k)
        total_recall += recall(bought_ids, recommended)
    session_count = float(len(data))
    return (total_recall / session_count, total_precision / session_count)

def save_answer_array(fname, array):
    """Write the items of ``array`` to the file ``fname`` as one space-separated line.

    Every item is converted with ``str``. An existing file is overwritten and
    no trailing newline is written.
    """
    with open(fname, "w") as fout:
        text = " ".join(map(str, array))
        fout.write(text)
#%%
# Evaluate only on training sessions that have no missing value in either column.
data_train_clear = data_train.dropna(axis=0, how="any")
# Each model takes (views, k) and ranks the viewed ids by one of the frequency tables.
models = [
    ("View frequency model", lambda views, k: build_recommendations(views, view_frequencies, k)),
    ("Purchases frequency model", lambda views, k: build_recommendations(views, buy_frequencies, k))
]
datas = [
    ("Train", data_train_clear),
    ("Test", data_test)
]
ks = [1,5]
# Build the results directory with os.path so the separator is right on every OS,
# and make sure it exists before anything is written into it.
results_dir = os.path.join("DataAnalisysMipt", "Results")
os.makedirs(results_dir, exist_ok=True)
for data_name, data in datas:
    print(data_name)
    for model_name, model in models:
        # Collected as recall@k, precision@k for every k, in the order of `ks`.
        results = []
        for k in ks:
            average_recall, average_precision = estimate_model(data, model, k)
            results.append(np.round(average_recall, 2))
            results.append(np.round(average_precision, 2))
            print("%s@%i: recall=%.2f\tprecision=%.2f" % (model_name, k, average_recall, average_precision))
        fileName = (data_name + "_" + model_name + ".txt")
        save_answer_array(os.path.join(results_dir, fileName), results)
    print()

# Дополнительные вопросы
# 1. Обратите внимание, что при сортировке по покупаемости возникает много товаров с одинаковым
#    рангом - это означает, что значение метрик будет зависеть от того, как мы будем сортировать
#    товары с одинаковым рангом. Попробуйте убедиться, что при изменении сортировки таких товаров
#    recall@k меняется. Подумайте, как оценить минимальное и максимальное значение recall@k в
#    зависимости от правила сортировки.
# 2. Мы обучаемся и тестируемся на полных сессиях (в которых есть все просмотренные за сессию
#    товары).
#    Подумайте, почему полученная нами оценка качества рекомендаций в этом случае несколько
#    завышена.
0.4426, 0.5122, 0.8247, 0.2125

Train
View frequency model@1: recall=0.44	precision=0.51
View frequency model@5: recall=0.82	precision=0.21
Purchases frequency model@1: recall=0.69	precision=0.80
Purchases frequency model@5: recall=0.93	precision=0.25

Test
View frequency model@1: recall=0.42	precision=0.48
View frequency model@5: recall=0.80	precision=0.20
Purchases frequency model@1: recall=0.46	precision=0.53
Purchases frequency model@5: recall=0.82	precision=0.21

