## Задача - сделать гибридную рекомендательную систему ##

Датасет - Movies.

Система должна выводить рекомендации фильмов для пользователя на основе рейтингов, которые он поставил.

Решение будем строить в два этапа:
- выберем список топ фильмов (несколько сотен или тысяч) из общего множества на основе оценок всех пользователей (подход с самого первого занятия)
- из этого списка подготовим рекомендации фильмов для конкретного пользователя (при помощи SVD++)

### Первый этап - готовим список топ фильмов ###

Выберем, к примеру, 500 лучших фильмов

In [1]:
# Number of top-ranked films kept as the candidate pool for per-user recommendations.
top_films_num = 500

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook

In [3]:
# Load the film catalogue and the user ratings from CSV files (paths relative to the working directory).
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [4]:
# Size of the film table as (rows, columns).
movies.shape

(9742, 3)

In [5]:
# Size of the ratings table as (rows, columns).
ratings.shape

(100836, 4)

In [6]:
# Preview the first rows of the film table.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [8]:
# Attach title and genres to every rating row by joining on the shared movieId column.
ratings_movies = ratings.merge(movies, on='movieId')

In [9]:
# Preview the merged ratings-with-film-details table.
ratings_movies.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [10]:
# Size of the merged table as (rows, columns).
ratings_movies.shape

(100836, 6)

In [11]:
# Check whether the merged table contains any missing value (True if it does).
ratings_movies.isnull().any().any()

False

---
Метрика для выбора топ фильмов: количество отзывов $n$, нормированное как $(n - \bar{n}) / (n_{\max} - n_{\min})$, умноженное на среднюю оценку фильма

---

In [12]:
# Count how many distinct users rated each film, as a dict of title -> number of raters.
# One vectorised groupby does the counting for all titles at once, so no Python-level
# loop over the groups (and no progress bar) is needed.
title_num_ratings = ratings_movies.groupby('title')['userId'].nunique().to_dict()

HBox(children=(IntProgress(value=0, max=9719), HTML(value='')))




In [13]:
# Show the ten films with the most ratings.
sorted(title_num_ratings.items(), key=lambda x: x[1], reverse=True)[:10]

[('Forrest Gump (1994)', 329),
 ('Shawshank Redemption, The (1994)', 317),
 ('Pulp Fiction (1994)', 307),
 ('Silence of the Lambs, The (1991)', 279),
 ('Matrix, The (1999)', 278),
 ('Star Wars: Episode IV - A New Hope (1977)', 251),
 ('Jurassic Park (1993)', 238),
 ('Braveheart (1995)', 237),
 ('Terminator 2: Judgment Day (1991)', 224),
 ("Schindler's List (1993)", 220)]

In [14]:
# Mean rating of each film, as a Series indexed by title.
title_mean_rating = ratings_movies.groupby('title')['rating'].mean()

In [15]:
# Show the ten films with the highest mean rating.
title_mean_rating.sort_values(ascending=False)[:10]

title
Karlson Returns (1970)                           5.0
Winter in Prostokvashino (1984)                  5.0
My Love (2006)                                   5.0
Sorority House Massacre II (1990)                5.0
Winnie the Pooh and the Day of Concern (1972)    5.0
Sorority House Massacre (1986)                   5.0
Bill Hicks: Revelations (1993)                   5.0
My Man Godfrey (1957)                            5.0
Hellbenders (2012)                               5.0
In the blue sea, in the white foam. (1984)       5.0
Name: rating, dtype: float64

In [16]:
# Summary statistics of the per-film rating counts, needed by the ranking metric.
title_num_ratings_values = list(title_num_ratings.values())

# Smallest, largest and average number of ratings received by a single film.
min_num_ratings, max_num_ratings, mean_num_ratings = (
    reducer(title_num_ratings_values) for reducer in (np.min, np.max, np.mean)
)

# Show the three statistics as the cell output.
min_num_ratings, max_num_ratings, mean_num_ratings

(1, 329, 10.374729910484618)

In [17]:
# Score every film: its mean rating multiplied by its mean-centred, range-scaled rating count.
# NOTE(review): the count term is negative for films rated less often than average, so
# among those a higher mean rating produces a lower score — confirm this is intended.
films_marks = {
    film: title_mean_rating[film] * (votes - mean_num_ratings) / (max_num_ratings - min_num_ratings)
    for film, votes in title_num_ratings.items()
}

In [18]:
# Rank films by metric value, best first, and keep the top_films_num highest as (title, score) pairs.
top_films = sorted(films_marks.items(), key = lambda x: x[1], reverse=True)[:top_films_num]
# Preview the first 20 entries.
top_films[:20]

[('Shawshank Redemption, The (1994)', 4.140396622352077),
 ('Forrest Gump (1994)', 4.04511657667948),
 ('Pulp Fiction (1994)', 3.795599234431761),
 ('Matrix, The (1999)', 3.4207454409691413),
 ('Silence of the Lambs, The (1991)', 3.4080113927564395),
 ('Star Wars: Episode IV - A New Hope (1977)', 3.103974793934813),
 ('Braveheart (1995)', 2.7855877015865484),
 ('Fight Club (1999)', 2.7047848943888955),
 ("Schindler's List (1993)", 2.7002035552689096),
 ('Jurassic Park (1993)', 2.6024230574258618),
 ('Terminator 2: Judgment Day (1991)', 2.5862869902088406),
 ('Star Wars: Episode V - The Empire Strikes Back (1980)', 2.5785484011187134),
 ('Usual Suspects, The (1995)', 2.5016296926169597),
 ('Toy Story (1995)', 2.4461018531687673),
 ('Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)',
  2.4324644021391335),
 ('American Beauty (1999)', 2.394561678011029),
 ('Godfather, The (1972)', 2.375006509125951),
 ('Lord of the Rings: The Fellowship of the Ring, The (2001

### Второй этап: готовим список рекомендаций для конкретного пользователя ###

In [19]:
# Ratings supplied by the new user, keyed by film title.
# NOTE(review): these titles are later used verbatim as item ids; titles displayed
# elsewhere in this notebook put a leading article last (e.g. 'Matrix, The (1999)'), so
# 'The Butterfly Effect (2004)' may not match any film in the data — confirm.
new_user_ratings = {
    'Wolf of Wall Street, The (2013)': 5.0,
    'Skyfall (2012)': 5.0,
    'The Butterfly Effect (2004)': 5.0,
    'Big Short, The (2015)': 5.0,
    'Spy Game (2001)': 4.0,
    'Forrest Gump (1994)': 5.0
}

In [20]:
# Preview the merged table again before preparing the model input.
ratings_movies.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [21]:
# Independent copy of the merged table, so later trimming leaves ratings_movies untouched.
ratings_movies_filtered = ratings_movies.copy()
ratings_movies_filtered.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [22]:
# Discard the timestamp and genres columns, which are not used from here on.
ratings_movies_filtered = ratings_movies_filtered.drop(columns=['timestamp', 'genres'])

In [23]:
# Preview the trimmed table.
ratings_movies_filtered.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)


In [24]:
# Size of the trimmed table as (rows, columns).
ratings_movies_filtered.shape

(100836, 4)

In [25]:
# Restrict the ratings to the films that made it into the top list.
top_films_titles = [title for title, _score in top_films]
is_top_film = ratings_movies_filtered.title.isin(top_films_titles)
ratings_movies_filtered_top = ratings_movies_filtered[is_top_film]

In [26]:
# Preview the ratings restricted to the top films.
ratings_movies_filtered_top.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)


In [27]:
# Size of the top-films ratings table as (rows, columns).
ratings_movies_filtered_top.shape

(43601, 4)

In [28]:
# Missing-value check (True if any cell is missing).
# NOTE(review): this inspects ratings_movies_filtered, not the ratings_movies_filtered_top
# subset — confirm which frame was meant.
ratings_movies_filtered.isnull().any().any()

False

In [29]:
# Table handed to the recommender: one row per rating, with columns ordered
# user id (uid), item id (iid, the film title), rating.
dataset = ratings_movies_filtered_top[['userId', 'title', 'rating']].rename(
    columns={'userId': 'uid', 'title': 'iid'}
)
dataset.tail()

Unnamed: 0,uid,iid,rating
79246,603,"Exorcist, The (1973)",4.0
79247,606,"Exorcist, The (1973)",3.0
79248,607,"Exorcist, The (1973)",5.0
79249,608,"Exorcist, The (1973)",4.5
79250,610,"Exorcist, The (1973)",4.0


In [30]:
# Register the new user under the first unused id and add their ratings to the dataset.
print(np.max(ratings.userId))
print(ratings.userId.nunique())
new_user_id = np.max(ratings.userId) + 1
print(new_user_id)

# Build all of the new user's rows at once and concatenate them in a single step.
# pd.concat is used because DataFrame.append was removed in pandas 2.0. No exception is
# suppressed here, so a failure to add the rows is reported instead of silently leaving
# the new user without any ratings.
new_user_rows = pd.DataFrame({
    'uid': [new_user_id] * len(new_user_ratings),
    'iid': list(new_user_ratings.keys()),
    'rating': list(new_user_ratings.values()),
})
dataset = pd.concat([dataset, new_user_rows], ignore_index=True)

dataset.tail(10)

610
610
611


Unnamed: 0,uid,iid,rating
43597,606,"Exorcist, The (1973)",3.0
43598,607,"Exorcist, The (1973)",5.0
43599,608,"Exorcist, The (1973)",4.5
43600,610,"Exorcist, The (1973)",4.0
43601,611,"Wolf of Wall Street, The (2013)",5.0
43602,611,Skyfall (2012),5.0
43603,611,The Butterfly Effect (2004),5.0
43604,611,"Big Short, The (2015)",5.0
43605,611,Spy Game (2001),4.0
43606,611,Forrest Gump (1994),5.0


In [31]:
# Size of the dataset, including the new user's rows, as (rows, columns).
dataset.shape

(43607, 3)

---
Применяем SVD++

---

In [32]:
from surprise import SVDpp
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate

In [33]:
# Rating scale for surprise, taken from the lowest and highest rating present in the data.
reader = Reader(rating_scale=(ratings.rating.min(), ratings.rating.max()))

In [34]:
# Wrap the (uid, iid, rating) table as a surprise Dataset using the scale defined by reader.
data = Dataset.load_from_df(dataset, reader)

In [35]:
%%time
# Train SVD++ with its default settings on a training set built from all rows of the dataset.
algo = SVDpp()
trainset = data.build_full_trainset()
algo.fit(trainset)

Wall time: 2min 27s


---
Выводим список рекомендаций (например, 40 штук) для нового пользователя

---

In [36]:
# How many recommendations to show to the user.
recommendations_count = 40

In [37]:
# Predict the new user's rating for every top film they have not rated themselves,
# stored as title -> prediction object returned by algo.predict.
new_user_predicted_ratings = {
    title: algo.predict(uid=new_user_id, iid=title)
    for title in tqdm_notebook(top_films_titles)
    if title not in new_user_ratings
}

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))




In [38]:
# Sort the predictions by estimated rating, highest first, and keep the best recommendations_count.
recommendations = sorted(new_user_predicted_ratings.items(), key=lambda x: x[1].est, reverse=True)[:recommendations_count]

In [39]:
# Print the recommendations as a numbered table: position, film title, estimated rating.
print(f'{len(recommendations)} лучших фильмов для пользователя с ID={new_user_id}:\n')
print(f"{'#': <4} {'Название': <90} Вероятная оценка")
for position, (_title, prediction) in enumerate(recommendations, start=1):
    print(f'{position: <4} {prediction.iid: <90} {prediction.est}')

40 лучших фильмов для пользователя с ID=611:

#    Название                                                                                   Вероятная оценка
1    Shawshank Redemption, The (1994)                                                           4.9162451178190425
2    Life Is Beautiful (La Vita è bella) (1997)                                                 4.909017935118776
3    Eternal Sunshine of the Spotless Mind (2004)                                               4.839669666864801
4    Departed, The (2006)                                                                       4.831984651844077
5    Psycho (1960)                                                                              4.82406343968285
6    Fight Club (1999)                                                                          4.770230548253857
7    Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)                4.759938977222605
8    Boondock Saints, The (2000)           