### Задание
##### Преподаватель: Даниил Корбут, Наталья Баданина

#### Что делать?

   * Датасет ml-latest
   * Вспомнить подходы, которые мы разбирали
   * Выбрать понравившийся подход к гибридным системам
   * Написать свою




In [1]:
from surprise import SVD, SVDpp
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split

from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
movies = pd.read_csv('D:/DATA_SCIENCE/DZ/rsml-2/movies.csv')
ratings = pd.read_csv('D:/DATA_SCIENCE/DZ/rsml-2/ratings.csv')
tags = pd.read_csv('D:/DATA_SCIENCE/DZ/rsml-2/tags.csv')

##### Гибридная система на SVD и тегах для 610 пользователя

In [3]:
movies_with_ratings = movies.drop(['genres'], axis=1).join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
mov_tag = movies_with_ratings.join(tags.drop(['userId', 'timestamp'], axis=1).set_index('movieId'), on='movieId').reset_index(drop=True)
mov_tag.dropna(inplace=True)
mov_tag.tag = mov_tag.tag.fillna('other')
mov_tag

Unnamed: 0,movieId,title,userId,rating,timestamp,tag
0,1,Toy Story (1995),1.0,4.0,9.649827e+08,pixar
1,1,Toy Story (1995),1.0,4.0,9.649827e+08,pixar
2,1,Toy Story (1995),1.0,4.0,9.649827e+08,fun
3,1,Toy Story (1995),5.0,4.0,8.474350e+08,pixar
4,1,Toy Story (1995),5.0,4.0,8.474350e+08,pixar
...,...,...,...,...,...,...
285745,187595,Solo: A Star Wars Story (2018),586.0,5.0,1.529900e+09,star wars
285770,193565,Gintama: The Movie (2010),184.0,3.5,1.537099e+09,anime
285771,193565,Gintama: The Movie (2010),184.0,3.5,1.537099e+09,comedy
285772,193565,Gintama: The Movie (2010),184.0,3.5,1.537099e+09,gintama


In [4]:
dataset = pd.DataFrame({
    'uid': mov_tag.userId,
    'iid': mov_tag.title,
    'rating': mov_tag.rating
})

In [5]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

In [6]:
trainset, testset = train_test_split(data, test_size=.15, random_state=42)

In [7]:
%%time
algo = SVD(n_factors=20, n_epochs=20)
algo.fit(trainset)

Wall time: 9.35 s


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x25fc8bf6190>

In [8]:
test_pred = algo.test(testset)

In [9]:
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.4686


0.46864680691492244

In [10]:
algo.predict(uid=610.0, iid='Mortal Kombat (1995)').est

4.169278551880152

In [11]:
current_user_id = 610.0
user_movies = mov_tag[mov_tag.userId == current_user_id].title.unique()
scores = []
titles = []

for movie in mov_tag.title.unique():
    if movie in user_movies:
        continue
        
    scores.append(algo.predict(uid=current_user_id, iid=movie).est)
    titles.append(movie)

In [12]:
sorted(scores)[-10:]

[4.76051048271256,
 4.764146581769927,
 4.766677583082098,
 4.772922147651449,
 4.782544351770793,
 4.810886735123502,
 4.827559909841226,
 4.906845654025222,
 4.922311971930765,
 5.0]

In [13]:
dict_tag = []
for g in mov_tag.tag.values:
    dict_tag.append(g)
dict_tag = set(dict_tag)


In [14]:
countVec = CountVectorizer(stop_words='english')
#fit transform
cv = countVec.fit_transform(tags.tag)
#feature names
cv_feature_names = countVec.get_feature_names()
#feature counts
feature_count = cv.toarray().sum(axis = 0)
#feature name to count
dict_tag = sorted(list(zip(cv_feature_names, feature_count)), key=lambda x: x[1], reverse=True)[:100]
dict_tag = [x[0] for x in dict_tag]
dict_tag[0:10]

['netflix',
 'queue',
 'comedy',
 'dark',
 'atmospheric',
 'ending',
 'space',
 'bad',
 'funny',
 'fi']

In [15]:
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(dict_tag)

In [16]:
neigh = NearestNeighbors(n_neighbors=20, n_jobs=-1, metric='euclidean') 
neigh.fit(X_train_tfidf)

NearestNeighbors(metric='euclidean', n_jobs=-1, n_neighbors=20)

In [17]:
test = ['netflix', 'comedy', 'dark']
X_tfidf2 = vectorizer.transform([test][0])

In [18]:
res = neigh.kneighbors(X_tfidf2, return_distance=True)
res

(array([[0.        , 1.41421356, 1.41421356, 1.41421356, 1.41421356,
         1.41421356, 1.41421356, 1.41421356, 1.41421356, 1.41421356,
         1.41421356, 1.41421356, 1.41421356, 1.41421356, 1.41421356,
         1.41421356, 1.41421356, 1.41421356, 1.41421356, 1.41421356],
        [0.        , 1.41421356, 1.41421356, 1.41421356, 1.41421356,
         1.41421356, 1.41421356, 1.41421356, 1.41421356, 1.41421356,
         1.41421356, 1.41421356, 1.41421356, 1.41421356, 1.41421356,
         1.41421356, 1.41421356, 1.41421356, 1.41421356, 1.41421356],
        [0.        , 1.41421356, 1.41421356, 1.41421356, 1.41421356,
         1.41421356, 1.41421356, 1.41421356, 1.41421356, 1.41421356,
         1.41421356, 1.41421356, 1.41421356, 1.41421356, 1.41421356,
         1.41421356, 1.41421356, 1.41421356, 1.41421356, 1.41421356]]),
 array([[ 0, 64, 55, 54, 53, 57, 56, 63, 62, 65, 66, 67, 68, 69, 70, 71,
         72, 73, 76, 60],
        [ 2, 64, 54, 53, 57, 56, 63, 75, 65, 66, 67, 68, 69, 70, 71,

In [19]:
mov_tag.sort_values('timestamp', inplace=True)

In [20]:
title_tag = {}

for index, row in tqdm(mov_tag.iterrows()):
    title_tag[row.title] = row.tag

0it [00:00, ?it/s]

In [45]:
def recommend_for_user(user_id):
    current_user_id = user_id
    user_movies = mov_tag[mov_tag.userId == current_user_id].title.unique()
    
    last_user_movie = user_movies[-1]
    
    mov_tags = title_tag[last_user_movie]  

    X_tfidf2 = vectorizer.transform([mov_tags])

    res = neigh.kneighbors(X_tfidf2, return_distance=True)
    
    movies_to_score = movies.iloc[res[1][0]].title.values

    scores = []
    titles = []

    for movie in movies_to_score:
        if movie in user_movies:
            continue

        scores.append(algo.predict(uid=current_user_id, iid=movie).est)
        titles.append(movie)
        
    
    best_indexes = np.argsort(scores)[-10:]
    for i in reversed(best_indexes):
        print(titles[i], scores[i])

In [46]:
recommend_for_user(610.0)

French Twist (Gazon maudit) (1995) 4.169278551880152
Angels and Insects (1995) 4.169278551880152
Eye for an Eye (1996) 4.169278551880152
Indian in the Cupboard, The (1995) 4.169278551880152
Two if by Sea (1996) 4.169278551880152
Don't Be a Menace to South Central While Drinking Your Juice in the Hood (1996) 4.169278551880152
Fair Game (1995) 4.169278551880152
Once Upon a Time... When We Were Colored (1995) 4.169278551880152
Misérables, Les (1995) 4.169278551880152
Bed of Roses (1996) 4.169278551880152


In [23]:
#### Сравним с рекомендациями при использовании биб-ки LightFM

In [24]:
%time
import numpy as np
from lightfm.datasets import fetch_movielens
movielens = fetch_movielens()

Wall time: 0 ns


In [25]:
for key, value in movielens.items():
    print(key, type(value), value.shape)

train <class 'scipy.sparse.coo.coo_matrix'> (943, 1682)
test <class 'scipy.sparse.coo.coo_matrix'> (943, 1682)
item_features <class 'scipy.sparse.csr.csr_matrix'> (1682, 1682)
item_feature_labels <class 'numpy.ndarray'> (1682,)
item_labels <class 'numpy.ndarray'> (1682,)


In [26]:
train = movielens['train']
test = movielens['test']

In [27]:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

model = LightFM()
model.fit(train, epochs=10)

train_precision = precision_at_k(model, train, k=10).mean()
test_precision = precision_at_k(model, test, k=10, train_interactions=train).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))

Precision: train 0.44, test 0.13.


In [28]:
def sample_recommendation(model, data, user_ids):
    n_users, n_items = data['train'].shape
    for user_id in user_ids:
        known_positives = data['item_labels'][data['train'].tocsr()                                    
                          [user_id].indices]
        
        scores = model.predict(user_id, np.arange(n_items))

        top_items = data['item_labels'][np.argsort(-scores)]

        print("User %s" % user_id)
#         print("     Known positives:")
        
#         for x in known_positives[:10]:
#             print("        %s" % x)
        
        print("     Recommended:")
        
        for x in top_items[:10]:
            print("        %s" % x)


In [49]:
sample_recommendation(model, movielens, [610])
print()
recommend_for_user(610.0)

User 610
     Recommended:
        Star Wars (1977)
        English Patient, The (1996)
        Contact (1997)
        Fargo (1996)
        Liar Liar (1997)
        Scream (1996)
        Return of the Jedi (1983)
        Air Force One (1997)
        Toy Story (1995)
        Independence Day (ID4) (1996)

French Twist (Gazon maudit) (1995) 4.169278551880152
Angels and Insects (1995) 4.169278551880152
Eye for an Eye (1996) 4.169278551880152
Indian in the Cupboard, The (1995) 4.169278551880152
Two if by Sea (1996) 4.169278551880152
Don't Be a Menace to South Central While Drinking Your Juice in the Hood (1996) 4.169278551880152
Fair Game (1995) 4.169278551880152
Once Upon a Time... When We Were Colored (1995) 4.169278551880152
Misérables, Les (1995) 4.169278551880152
Bed of Roses (1996) 4.169278551880152
