
Домашнее задание 
«Гибридные рекомендатльные системы»


In [112]:
import pandas as pd
import numpy as np

from surprise import Dataset, Reader, KNNWithMeans, SVD
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import cross_val_score

In [113]:
links = pd.read_csv('../lecture-1/links.csv')
movies = pd.read_csv('../lecture-1/movies.csv')
ratings = pd.read_csv('../lecture-1/ratings.csv')
tags = pd.read_csv('../lecture-1/tags.csv')

In [114]:
movies['genres_red'] = movies.apply(lambda x: ' '.join(x.genres.split('|')), axis=1)
movies['genres_red'] = movies.apply(lambda x: x.genres_red.replace('(no genres listed)', 'NoGenres'), axis=1)

1.Используем knn для жанров

In [115]:
genres_corpus = movies.genres_red.tolist()

In [116]:
cv = CountVectorizer()
cv_genres_corpus = cv.fit_transform(genres_corpus)

In [117]:
tfidf = TfidfTransformer()
tfidf_cv_genres_corpus = tfidf.fit_transform(cv_genres_corpus)

In [118]:
genre_list = cv.get_feature_names()

In [119]:
df_tfidf = pd.DataFrame(tfidf_cv_genres_corpus.toarray(), index=movies.movieId, columns=genre_list)

In [120]:
movies = movies.merge(df_tfidf, on='movieId')

In [121]:
df = ratings.merge(movies.set_index('movieId'), on='movieId')

In [122]:
knn_on_genres = NearestNeighbors(n_neighbors=20, n_jobs=-1)

In [123]:
knn_on_genres.fit(movies.iloc[:,-len(genre_list):])

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=-1, n_neighbors=20, p=2,
                 radius=1.0)

2. Используем KNNWithMeans и SVD для предсказания рейтинга

In [124]:
surp_df = pd.DataFrame({
    'uid': df.userId,
    'iid': df.movieId,
    'rating': df.rating
})

In [125]:
surp_reader = Reader(rating_scale=(df.rating.min(), df.rating.max()))
surp_dataset = Dataset(surp_reader)
surp_df = surp_dataset.load_from_df(surp_df, surp_reader)
surp_df = surp_df.build_full_trainset()

In [126]:
surp_SVD = SVD(n_epochs=10, lr_all=0.005, reg_all=0.4)
surp_SVD.fit(surp_df)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1a244e57d0>

In [127]:
surp_KNN = KNNWithMeans()
surp_KNN.fit(surp_df)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x1a24d75410>

3.Наша функция рекомендации

In [128]:
# Пишем функцию рекомендации
def recomend_for_user(userId=1):
    user_ratings = df[df.userId == userId]

    def knn_rec_rating(userId, movieId):
        rating_knn = 0
        try:
            rating_knn = surp_KNN.estimate(userId, movieId)[0]
        except:
            rating_knn = user_ratings.rating.mean()
        return rating_knn
    
    user_ratings['rating_svd'] = user_ratings.apply(lambda x: surp_SVD.estimate(userId, x.movieId), axis=1)
    user_ratings['rating_knn'] = user_ratings.apply(lambda x: knn_rec_rating(userId, x.movieId), axis=1)
    
    meta_algo = LinearRegression()
    X = user_ratings[['rating_svd', 'rating_knn']]
    y = user_ratings[['rating']]
    print('Meta algorithm RMSE: ', cross_val_score(meta_algo, X, y, scoring='neg_mean_squared_error', cv=5).mean()*-1)
    meta_algo.fit(X, y)
    
    # фильмы которые пользоатель не видел
    user_not_viewed_films = movies[~movies.movieId.isin(user_ratings.movieId.tolist())]
    
    user_not_viewed_films['rating_svd'] = user_not_viewed_films.apply(lambda x: surp_SVD.estimate(userId, x.movieId), axis=1)
    user_not_viewed_films['rating_knn'] = user_not_viewed_films.apply(lambda x: knn_rec_rating(userId, x.movieId), axis=1)
    user_not_viewed_films['rating'] = user_not_viewed_films.apply(lambda x: meta_algo.predict(np.array([[x[['rating_svd']][0]], \
                                                                                                        [x[['rating_knn']][0]]]).reshape(1,-1))[0][0], axis=1)
    
    # самый похожий фильм на последний посмотренный
    knn_for_last_film = knn_on_genres.kneighbors(user_ratings.sort_values('timestamp', ascending=False).iloc[:1,-len(genre_list):], n_neighbors=20)[1][0]
    last_relevant_movies = []
    for each in knn_for_last_film:
        last_relevant_movies.append(movies.ix[each].movieId)

    rec_1 = user_not_viewed_films[user_not_viewed_films.movieId.isin(last_relevant_movies)].sort_values('rating', ascending=False).head(1)[['title', 'movieId', 'genres', 'rating']]

    # самый похожий фильм на последний посмотренный + макс рейт
    knn_for_last_film = knn_on_genres.kneighbors(user_ratings.sort_values('timestamp', ascending=False).sort_values('rating', ascending=False).iloc[:1,-len(genre_list):], n_neighbors=20)[1][0]
    last_relevant_movies = []
    for each in knn_for_last_film:
        last_relevant_movies.append(movies.ix[each].movieId)
    last_relevant_movies.remove(rec_1.movieId.tolist()[0])
    rec_2 = user_not_viewed_films[user_not_viewed_films.movieId.isin(last_relevant_movies)].sort_values('rating', ascending=False).head(1)[['title', 'movieId', 'genres', 'rating']]
    
    # остальные 8 лучших
    rec_3 = user_not_viewed_films.drop(rec_1.index[0], axis=0).drop(rec_2.index[0], axis=0).sort_values('rating', ascending=False).head(8)[['title', 'movieId', 'genres', 'rating']]
    
    return pd.concat([rec_1, rec_2, rec_3])

In [129]:
recomend_for_user(userId=25)

Meta algorithm RMSE:  0.15961542317359362


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas

Unnamed: 0,title,movieId,genres,rating
1084,"Last of the Mohicans, The (1992)",1408,Action|Romance|War|Western,5.125643
4029,Tom Horn (1980),5699,Western,5.077013
1139,B*A*P*S (1997),1490,Comedy,5.711099
2294,Meatballs Part II (1984),3041,Comedy,5.646464
592,"Rock, The (1996)",733,Action|Adventure|Thriller,5.601988
94,Nobody Loves Me (Keiner liebt mich) (1994),106,Comedy|Drama,5.565838
1235,I Know What You Did Last Summer (1997),1644,Horror|Mystery|Thriller,5.553667
619,"Cable Guy, The (1996)",784,Comedy|Thriller,5.545207
121,"Awfully Big Adventure, An (1995)",148,Drama,5.522735
537,Theodore Rex (1995),634,Comedy,5.505938
