# Домашнее задание: рекомендательные системы - 1

В этом домашнем задании будем работать с данными о пользователях и оценках, которые они поставили различным фильмам.

На основе этих данных будем рекомендовать пользователям к просмотру новые для них фильмы.

In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from typing import List, Dict
from tqdm import tqdm, tqdm_notebook

## Загрузка и обработка данных

Загрузим данные.

In [None]:
# Load the ratings table from the GitHub URL; the preview printed further down
# shows the columns userId, movieId, rating, timestamp and title.
ratings = pd.read_csv('https://raw.githubusercontent.com/aiedu-courses/stepik_applied_tasks/main/datasets/movies_ratings.csv')

In [None]:
# Replace the raw user and movie identifiers with integer codes; the fitted
# encoders stay available as `user_encoder` / `item_encoder`.
user_encoder = LabelEncoder().fit(ratings['userId'])
item_encoder = LabelEncoder().fit(ratings['movieId'])

ratings['userId'] = user_encoder.transform(ratings['userId'])
ratings['movieId'] = item_encoder.transform(ratings['movieId'])

# Number of distinct users and movies after encoding (displayed by the cell).
num_users = ratings['userId'].nunique()
num_movies = ratings['movieId'].nunique()
num_users, num_movies

(671, 9025)

Поделим выборку на train и test так, чтобы у каждого пользователя последние 10 фильмов оказались в тесте: на них будем считать метрику качества рекомендаций при k=10.

In [None]:
# Hold out each user's last `num_test_samples` rows (in the table's existing
# row order) for evaluation; all earlier rows of that user go to training.
num_test_samples = 10
train_parts, test_parts = [], []

for _, user_rows in ratings.groupby('userId'):
    train_parts.append(user_rows.iloc[:-num_test_samples])
    test_parts.append(user_rows.iloc[-num_test_samples:])

train = pd.concat(train_parts)
test = pd.concat(test_parts)
print(train.shape, test.shape)

(93140, 5) (6710, 5)


In [None]:
train.head()

Unnamed: 0,userId,movieId,rating,timestamp,title
0,0,30,2.5,1260759144,Dangerous Minds
42,0,830,3.0,1260759179,Dumbo
84,0,856,3.0,1260759182,Sleepers
117,0,903,2.0,1260759185,Escape from New York
165,0,927,4.0,1260759205,Cinema Paradiso


## Quiz

Запишите данные в формате, где строка соответствует пользователю, а столбцы будут истинными метками и предсказаниями в виде списков.

Назовите полученную таблицу `interactions`, действуйте по аналогии или воспользуйтесь кодом из урока.

В ответ запишите максимальное значение `movieId` из тестовых фильмов для пользователя `userId=2`.

In [None]:
# One row per user: the list of movieIds that user has in the training split.
train_df = (
    train.groupby('userId')['movieId']
    .agg(list)
    .rename('true_train')
    .to_frame()
)

In [None]:
train_df

Unnamed: 0_level_0,true_train
userId,Unnamed: 1_level_1
0,"[30, 830, 856, 903, 927, 1013, 1037, 1043, 107..."
1,"[9, 16, 37, 45, 48, 49, 58, 100, 123, 129, 132..."
2,"[100, 266, 321, 341, 472, 521, 524, 525, 56, 2..."
3,"[1107, 1511, 1661, 1739, 2375, 9, 132, 163, 26..."
4,"[1811, 37, 129, 321, 328, 331, 341, 447, 519, ..."
...,...
666,"[16, 100, 123, 129, 140, 144, 163, 198, 237, 2..."
667,"[266, 525, 284, 877, 965, 2437, 973, 2402, 1098]"
668,"[1511, 196, 427, 962, 2158, 2369, 232, 731, 17..."
669,"[45, 48, 100, 129, 406, 472, 523, 525, 284, 22..."


In [None]:
# Collect each user's held-out movieIds into a list and attach them as a new
# column; the assignment aligns on the userId index.
test_items_per_user = test.groupby('userId')['movieId'].agg(list)
train_df['true_test'] = test_items_per_user

Unnamed: 0_level_0,true_train,true_test
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"[30, 830, 856, 903, 927, 1013, 1037, 1043, 107...","[1107, 1136, 1511, 1661, 1704, 1739, 1811, 195..."
1,"[9, 16, 37, 45, 48, 49, 58, 100, 123, 129, 132...","[518, 519, 520, 521, 522, 523, 524, 525, 543, ..."
2,"[100, 266, 321, 341, 472, 521, 524, 525, 56, 2...","[5008, 5107, 5456, 5461, 5874, 6345, 6518, 656..."
3,"[1107, 1511, 1661, 1739, 2375, 9, 132, 163, 26...","[2491, 2495, 2543, 2575, 2576, 2602, 2606, 261..."
4,"[1811, 37, 129, 321, 328, 331, 341, 447, 519, ...","[5955, 5957, 6098, 6118, 6144, 6172, 6260, 627..."
...,...,...
666,"[16, 100, 123, 129, 140, 144, 163, 198, 237, 2...","[54, 97, 448, 496, 208, 39, 287, 401, 395, 367]"
667,"[266, 525, 284, 877, 965, 2437, 973, 2402, 1098]","[1835, 535, 1999, 2329, 984, 2404, 2578, 3209,..."
668,"[1511, 196, 427, 962, 2158, 2369, 232, 731, 17...","[3054, 3211, 782, 907, 2146, 345, 2172, 2216, ..."
669,"[45, 48, 100, 129, 406, 472, 523, 525, 284, 22...","[34, 535, 1253, 1809, 24, 937, 1791, 2206, 233..."


In [None]:
# Rows whose `true_test` is missing are assigned one [''] placeholder each.
missing_test = pd.isnull(train_df['true_test'])
train_df.loc[missing_test, 'true_test'] = [[''] for _ in range(missing_test.sum())]

In [None]:
# Working copy under the name the task asks for; the prediction columns are
# added to this table by later cells.
interactions = train_df.copy()

In [None]:
# Quiz answer: the largest held-out movieId for the user with userId=2.
ans = interactions.loc[2]
max(ans['true_test'])

7681

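Для оценки качества модели будем использовать метрику precision@10 для каждого пользователя (доля угаданных рекомендаций среди десяти предложенных). Усредним её по всем пользователям; в этом задании полученное среднее называется MAP@10 (строго говоря, это средняя precision@10, а не классическая mean average precision).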

In [None]:
def calc_precision(column, data=None, k=10):
    """Return precision@k averaged over all users.

    For every row, counts how many of the first ``k`` items of ``row[column]``
    also occur in ``row['true_test']`` and divides that by
    ``min(len(true_test) + 0.001, k)``.  The 0.001 term keeps the denominator
    non-zero when a user's test list is empty.

    Args:
        column: Name of the column holding each user's ranked recommendations.
        data: Table with a ``true_test`` column and ``column``.  Defaults to
            the notebook-level ``interactions`` table.
        k: Cut-off; only the first ``k`` recommendations are scored.

    Returns:
        The mean of the per-user precision values.
    """
    if data is None:
        data = interactions

    def user_precision(row):
        relevant = set(row['true_test'])
        # Score only the top-k items so that a longer list cannot inflate
        # the number of hits.
        recommended = set(list(row[column])[:k])
        denominator = min(len(row['true_test']) + 0.001, float(k))
        return len(relevant & recommended) / denominator

    return data.apply(user_precision, axis=1).mean()

## Коллаборативная фильтрация

## Quiz

Составьте матрицу "оценок" пользователей - `ratings`. Нули будут обозначать отсутствие взаимодействия.

Действуйте по аналогии или воспользуйтесь кодом из урока.

В ответ запишите число столбцов в матрице `ratings`.

In [None]:
# User x movie table of training ratings; user/movie pairs without a rating
# are filled with 0.  Note that this rebinds `ratings`, which until this cell
# held the long-format table.
ratings = (
    train
    .pivot_table(index='userId', columns='movieId', values='rating')
    .fillna(0)
)

In [None]:
ratings.shape[1]

8044

In [None]:
# NumPy array of the pivoted rating table (rows = users, columns = movies),
# indexed positionally by the cells below.
ratings_m = ratings.values

In [None]:
ratings_m

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [5., 0., 0., ..., 0., 0., 0.]])

## Quiz

Посчитайте схожести пользователей (запишите их в np.array `similarity_users`) с помощью корреляции Пирсона. Для каждой пары учитывайте только ненулевые значения.

Действуйте по аналогии или воспользуйтесь кодом из урока.

В ответ запишите значение `similarity_users[0,6]` без округления.

In [None]:
# Pairwise Pearson correlation between users, computed only over the movies
# that both users of a pair have rated (non-zero entries).  Pairs with no
# common movies, or where either user's common ratings are all equal, keep a
# similarity of 0; the diagonal is left at 0 as well.
n_users = len(ratings_m)
similarity_users = np.zeros((n_users, n_users))
rated = ratings_m != 0

for first in tqdm_notebook(range(n_users - 1)):
    for second in range(first + 1, n_users):
        common = rated[first] & rated[second]
        if not common.any():
            continue

        first_scores = ratings_m[first, common]
        second_scores = ratings_m[second, common]

        # The correlation is undefined when one of the vectors is constant.
        if np.unique(first_scores).size < 2 or np.unique(second_scores).size < 2:
            continue

        # Fill both triangles so the matrix is symmetric.
        similarity_users[first, second] = np.corrcoef(first_scores, second_scores)[1, 0]
        similarity_users[second, first] = similarity_users[first, second]



  0%|          | 0/670 [00:00<?, ?it/s]

In [None]:
similarity_users[0,6]

-0.5

## Quiz

Сделайте user-based прогнозы по тому же правилу, что и в уроке:

Для каждого пользователя:

1. Найдём пользователей с похожестью больше $\alpha$ на нашего пользователя.
2. Посчитаем для каждого фильма долю пользователей (среди выделенных на первом шаге), которые взаимодействовали с этим фильмом.
3. Порекомендуем фильмы с наибольшими долями со второго шага (среди тех, которые пользователь ещё не видел).

В нашем примере данных не очень много, поэтому возьмём $\alpha = 0$.

Сделайте предсказания и запишите их в столбец
`prediction_user_based` таблицы `interactions`.

В ответ запишите минимальный предсказанный `movieId` для пользователя `userId=4`.

In [None]:
# User-based recommendations, following the three steps of the task:
#   1. take the users whose similarity to the current user exceeds `alpha`;
#   2. for every movie compute the share of those users who interacted with it;
#   3. recommend the unseen movies with the largest shares.
#
# NOTE(review): the previous version filtered against the whole `interactions`
# row instead of its `true_train` list (so watched movies were not removed)
# and ranked by the sum of ratings instead of the share of users.  Outputs
# recorded under the following cells predate this fix; re-run to refresh them.
alpha = 0
n_recommendations = 10

watched_mask = ratings_m != 0
movie_ids = np.asarray(ratings.columns)

prediction_user_based = []

for i in tqdm_notebook(range(len(similarity_users))):
    users_sim = similarity_users[i] > alpha

    if not users_sim.any():
        # No sufficiently similar users: nothing to recommend.
        prediction_user_based.append([])
        continue

    # Share of the similar users who have a rating for each movie.
    share = watched_mask[users_sim].mean(axis=0)
    # Stable sort on the negated share: highest share first, ties keep
    # column order.
    ranked = movie_ids[np.argsort(-share, kind='stable')]

    # Row i of `interactions` is assumed to be the same user as row i of the
    # matrices (both are built from `train` keyed by userId).
    seen = interactions.iloc[i]['true_train']
    recommend = ranked[~np.isin(ranked, seen)][:n_recommendations]
    prediction_user_based.append(list(recommend))

interactions['prediction_user_based'] = prediction_user_based

  0%|          | 0/671 [00:00<?, ?it/s]

In [None]:
min(prediction_user_based[4])

100

## Quiz

Посчитайте значение метрики MAP@10 для user-based подхода.

Ответ округлите до тысячных.

In [None]:
calc_precision('prediction_user_based')

0.005365126676602086

## SVD-разложение

Для выполнения заданий при необходимости заглядывайте в ноутбук из урока.

## Quiz

Сделайте сингулярное разложение (`svd` из `scipy.linalg`): на выходе вы получите матрицу `U`, вектор сингулярных чисел `sigma` и матрицу `V` (функция возвращает её уже транспонированной).

В ответ запишите число элементов матрицы `U`.

In [None]:
from scipy.linalg import svd

In [None]:
# scipy.linalg.svd returns (U, s, Vh): `sigma` is the 1-D array of singular
# values and `v` is V already transposed, so components lie along its rows.
U, sigma, v = svd(ratings)

In [None]:
ratings.shape, U.shape, sigma.shape, v.shape

((671, 8044), (671, 671), (671,), (8044, 8044))

Значения у матрицы с сингулярными числами отсортированы по убыванию.

Оставьте только первые 150 компонент, чтобы получить скрытые представления размерности 150. Для этого необходимо оставить первые 150 столбцов в матрице U, оставить из sigma только первые 150 значений (и сделать из них диагональную матрицу) и первые 150 столбцов в матрице V (то есть первые 150 строк матрицы $V^T$, которую возвращает `svd`). Перемножим преобразованные матрицы ($\hat{U}$, $\hat{\Sigma}$, $\hat{V}^T$), чтобы получить восстановленную матрицу оценок.

In [None]:

# Keep only the leading singular values and lay them out as the rectangular
# diagonal matrix that sits between U and v in the product U @ Sigma @ v.
# The shape is taken from U and v instead of being hard-coded.
n_components = 150

# In place: `sigma` itself stays truncated for the cells that follow.
sigma[n_components:] = 0

Sigma = np.zeros((U.shape[1], v.shape[0]))
Sigma[:len(sigma), :len(sigma)] = np.diag(sigma)


In [None]:
# Only components with a non-zero singular value contribute to the
# reconstruction, so multiply just those columns of U and rows of v instead
# of going through a dense diagonal matrix and the full square v.
kept = np.flatnonzero(sigma)
new_ratings = (U[:, kept] * sigma[kept]) @ v[kept, :]

In [None]:
ratings

movieId,0,1,2,3,4,5,6,7,8,9,...,8990,8992,8993,8995,8996,8998,9000,9004,9008,9010
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
666,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
668,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
669,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Quiz

Посчитайте качество аппроксимации матрицы по норме Фробениуса (среднеквадратичную ошибку между соответствующими элементами двух матриц). Сравните его с простым бейзлайном — константным предсказанием, равным среднему значению исходной матрицы. У аппроксимации ошибка должна получиться ниже.

Во сколько раз ошибка аппроксимации меньше, чем ошибка бейзлайна? Ответ округлите до целого числа.

In [None]:
# Sum of squared errors of the truncated reconstruction (a) and of the
# constant baseline that predicts the global mean everywhere (b).  Both sums
# run over the same number of cells, so b / a equals the ratio of the MSEs.
# np.asarray keeps the cell working even if `new_ratings` has already been
# wrapped in a DataFrame by a later cell.
ratings_values = ratings.values

a = np.sum((np.asarray(new_ratings) - ratings_values) ** 2)

b = np.sum((ratings_values.mean() - ratings_values) ** 2)

In [None]:
print(round(b / a, 1))

5.1


## Quiz

Теперь сделайте предсказания по матрице (не забывайте про то, что уже было просмотрено пользователем).

Для этого необходимо для каждого пользователя найти предметы с наибольшими оценками в восстановленной матрице.

В ответ запишите максимальный предсказанный `movieId` для пользователя `userId=4`.

In [None]:
top_k = 10

# Label the reconstructed matrix with the same user and movie ids as `ratings`.
new_ratings = pd.DataFrame(new_ratings, index=ratings.index, columns=ratings.columns)

predictions = []

for user_id in tqdm_notebook(interactions.index):
    # Movie ids ordered from the highest to the lowest reconstructed score.
    ranked_ids = new_ratings.loc[user_id].sort_values(ascending=False).index.values

    # Drop what the user already has in the training split, keep the top ones.
    already_seen = interactions.loc[user_id, 'true_train']
    unseen_ids = ranked_ids[~np.isin(ranked_ids, already_seen)]
    predictions.append(list(unseen_ids[:top_k]))

interactions['prediction_svd'] = predictions

  0%|          | 0/671 [00:00<?, ?it/s]

In [None]:
max(interactions['prediction_svd'].loc[4])

3373

## Quiz

Посчитайте значение метрики MAP@10 для SVD-подхода.

Ответ округлите до тысячных.

In [None]:
np.round(calc_precision('prediction_svd'), 3)

0.023