In [2]:
from surprise import KNNWithMeans, KNNBasic
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook

### Коллаборативная фильтрация (ДЗ) ###

Используем датасет Movies Full

Создаем рекомендательную систему для пользователя (например, с ID = 2)

Целевое значение метрики RMSE на тестовом сете - не более 0.87

---
Загружаем и готовим данные

---

In [14]:
movies = pd.read_csv('movies.dat', sep='::', header=None)
ratings = pd.read_csv('ratings.dat', sep='::', header=None)

  """Entry point for launching an IPython kernel.
  


In [15]:
movies.head(4)

Unnamed: 0,0,1,2
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama


In [16]:
movies.rename(columns={0: 'movieId', 1: 'title', 2: 'genres'}, inplace=True)
movies.head(4)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama


In [17]:
ratings.head(4)

Unnamed: 0,0,1,2,3
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275


In [18]:
ratings.rename(columns={0: 'userId', 1: 'movieId', 2: 'rating', 3: 'timestamp'}, inplace=True)
ratings.head(4)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275


In [19]:
movies.shape, ratings.shape

((3883, 3), (1000209, 4))

In [20]:
movies_with_ratings = pd.merge(movies, ratings, on='movieId', how='outer')
movies_with_ratings.head(4)

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Animation|Children's|Comedy,1.0,5.0,978824268.0
1,1,Toy Story (1995),Animation|Children's|Comedy,6.0,4.0,978237008.0
2,1,Toy Story (1995),Animation|Children's|Comedy,8.0,4.0,978233496.0
3,1,Toy Story (1995),Animation|Children's|Comedy,9.0,5.0,978225952.0


In [21]:
movies_with_ratings.isnull().any()

movieId      False
title        False
genres       False
userId        True
rating        True
timestamp     True
dtype: bool

In [22]:
movies_with_ratings.drop(columns=['timestamp'], inplace=True)
movies_with_ratings.head(2)

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Animation|Children's|Comedy,1.0,5.0
1,1,Toy Story (1995),Animation|Children's|Comedy,6.0,4.0


In [23]:
movies_with_ratings.shape

(1000386, 5)

In [24]:
movies_with_ratings.dropna(inplace=True)
movies_with_ratings.shape

(1000209, 5)

In [25]:
movies_with_ratings.isnull().any().any()

False

In [26]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})
dataset.head(4)

Unnamed: 0,uid,iid,rating
0,1.0,Toy Story (1995),5.0
1,6.0,Toy Story (1995),4.0
2,8.0,Toy Story (1995),4.0
3,9.0,Toy Story (1995),5.0


In [27]:
dataset.shape

(1000209, 3)

In [28]:
reader = Reader(rating_scale=(ratings.rating.min(), ratings.rating.max()))
data = Dataset.load_from_df(dataset, reader)

---
Отделяем тестовую выборку, обучаем и оцениваем модель

---

In [29]:
trainset, testset = train_test_split(data, test_size=0.2)

In [30]:
# используем item_based подход, т.к. на практике в ходе занятия он позволил получить более точную модель
algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': False})
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x18c374df710>

In [31]:
test_pred = algo.test(testset)
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8605


0.860543258404344

---
Попробуем user-based подход с этим же алгоритмом

---

In [33]:
algo_ubased = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
algo_ubased.fit(trainset)
test_pred_ubased = algo_ubased.test(testset)
accuracy.rmse(test_pred_ubased, verbose=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8896


0.8896279678334132

---
Возьмем другой алгоритм, SVD

---

In [36]:
from surprise import SVD
algo_svd = SVD()
algo_svd.fit(trainset)
test_pred_svd = algo_svd.test(testset)
accuracy.rmse(test_pred_svd, verbose=True)

RMSE: 0.8744


0.8744428046154787

---
Лучший результат из рассмотренных показал алгоритм KNNWithMeans в режиме item-based

Используя эту модель, выводим рекомендации 10 фильмов для пользователя с ID = 2

---

In [38]:
movies_ratings_uid2 = {}

for movie in tqdm_notebook(movies_with_ratings.title.unique()):
    movies_ratings_uid2[movie] = algo.predict(uid=2, iid=movie).est

HBox(children=(IntProgress(value=0, max=3706), HTML(value='')))




In [39]:
sorted(movies_ratings_uid2.items(), key=lambda x: x[1], reverse=True)[:10]

[('Lamerica (1994)', 5),
 ('All Things Fair (1996)', 5),
 ('Tigrero: A Film That Was Never Made (1994)', 5),
 ('Gate of Heavenly Peace, The (1995)', 5),
 ('Schlafes Bruder (Brother of Sleep) (1995)', 5),
 ('Follow the Bitch (1998)', 5),
 ('Savior (1998)', 5),
 ('Ulysses (Ulisse) (1954)', 5),
 ('Smashing Time (1967)', 5),
 ('Song of Freedom (1936)', 5)]