Домашнее задание " Гибридные рекомендательные системы"

In [1]:
!pip install surprise

Collecting surprise
  Downloading https://files.pythonhosted.org/packages/61/de/e5cba8682201fcf9c3719a6fdda95693468ed061945493dea2dd37c5618b/surprise-0.1-py2.py3-none-any.whl
Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/97/37/5d334adaf5ddd65da99fc65f6507e0e4599d092ba048f4302fe8775619e8/scikit-surprise-1.1.1.tar.gz (11.8MB)
[K     |████████████████████████████████| 11.8MB 356kB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp36-cp36m-linux_x86_64.whl size=1618291 sha256=ed47ca7dcea853f60729a7ab308eab0d23f1826a452fc9ff1293134c2530da4e
  Stored in directory: /root/.cache/pip/wheels/78/9c/3d/41b419c9d2aff5b6e2b4c0fc8d25c538202834058f9ed110d0
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.1 surprise-0.1


In [2]:

from surprise import SVD, SVDpp, Dataset, accuracy, Reader
from surprise.model_selection import train_test_split

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

import pandas as pd
import numpy as np

In [6]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv(r'ratings.csv')

In [13]:
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)

In [14]:
movies_with_ratings.head(10)

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982700.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,847435000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1106636000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1510578000.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1305696000.0
5,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,18.0,3.5,1455210000.0
6,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,19.0,4.0,965705600.0
7,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,21.0,3.5,1407619000.0
8,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,27.0,3.0,962685300.0
9,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,31.0,5.0,850466600.0


In [15]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})

In [16]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

In [17]:
trainset, testset = train_test_split(data, test_size=.15, random_state=42)

In [18]:
%%time
algo = SVD(n_factors=20, n_epochs=20)
algo.fit(trainset)

CPU times: user 2.33 s, sys: 0 ns, total: 2.33 s
Wall time: 2.34 s


In [19]:
test_pred = algo.test(testset)
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8676


0.8675985703615122

In [20]:
algo.predict(uid=2.0, iid='Terminator (1995)').est

3.4917552850516946

In [21]:
current_user_id = 2.0
user_movies = movies_with_ratings[movies_with_ratings.userId == current_user_id].title.unique()

scores = []
titles = []

for movie in movies_with_ratings.title.unique():
    if movie in user_movies:
        continue
        
    scores.append(algo.predict(uid=current_user_id, iid=movie).est)
    titles.append(movie)

In [22]:
scores[:20]

[3.869105027271043,
 3.4420501622912996,
 3.3451788952525874,
 3.0179575498497755,
 3.0129852627747904,
 3.9645024863459932,
 3.1541468451751853,
 3.1173360243675337,
 3.0950898315955437,
 3.590916405294185,
 3.8062610291119263,
 2.8143246739849737,
 3.4613396455819676,
 3.7437269679135916,
 3.1968075622101813,
 3.992604535050109,
 3.9547527707781382,
 3.6710272980368344,
 2.7307772172806364,
 2.803078065799876]

In [23]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [24]:
movie_genres = [change_string(g) for g in movies.genres.values]

In [26]:
movie_genres[321]

'Documentary'

In [27]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

neigh = NearestNeighbors(n_neighbors=20, n_jobs=-1, metric='euclidean') 
neigh.fit(X_train_tfidf)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='euclidean',
                 metric_params=None, n_jobs=-1, n_neighbors=20, p=2,
                 radius=1.0)

In [28]:
test = change_string("Documentary")

predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [29]:
movies_with_ratings.sort_values('timestamp', inplace=True)

In [30]:
movies.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [31]:
title_genres = {}

for index, row in tqdm_notebook(movies.iterrows()):
    title_genres[row.title] = row.genres

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [32]:
def recommend_for_user(user_id):
    current_user_id = user_id
    df_user = movies_with_ratings.pivot_table(index='userId', columns='title', values='rating', aggfunc='count')
    
    #### создаём фичи для knn
    df_user.fillna(0, inplace=True)
    neigh = NearestNeighbors(n_neighbors=2)
    neigh.fit(df_user) 
    NearestNeighbors(n_neighbors=2) #### топ 2 максимально похожим
    num = neigh.kneighbors(df_user[df_user.index==user_id])[1][0][1] ####  берём второго
    film = df_user[df_user.index==num].T     
    film = film[film[num]!=0]
    movies_to_score = list(film.reset_index()['title']) ####  оцениваем только его фильмы

    scores = []
    titles = []

    for movie in movies_to_score:
        if movie in user_movies:
            continue

        scores.append(algo.predict(uid=current_user_id, iid=movie).est)
        titles.append(movie)
        
    
    best_indexes = np.argsort(scores)[-10:]
    for i in reversed(best_indexes):
        print(titles[i], scores[i])

In [34]:
recommend_for_user(543) #### рекомендации для пользователя.

Streetcar Named Desire, A (1951) 5.0
Usual Suspects, The (1995) 5.0
Pulp Fiction (1994) 4.983943955634603
Forrest Gump (1994) 4.96181783057776
Silence of the Lambs, The (1991) 4.7711619473003966
Jurassic Park (1993) 4.689305630799513
Elephant Man, The (1980) 4.612045251294037
Cider House Rules, The (1999) 4.611203568474344
Pink Floyd: The Wall (1982) 4.602176205198506
American Graffiti (1973) 4.578736425259906
