In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from surprise import KNNWithZScore, SVD, SVDpp
from surprise import Dataset, accuracy, Reader, AlgoBase
from surprise.model_selection import train_test_split
from scipy.spatial.distance import euclidean, jaccard

In [2]:
movies = pd.read_csv('C:/Users/User/Desktop/Data Scientist/AML/Recsys/data/ml-1m/movies.dat',
                     sep='::', names=['movieId', 'title', 'genres'], engine='python')
ratings = pd.read_csv('C:/Users/User/Desktop/Data Scientist/AML/Recsys/data/ml-1m/ratings.dat', 
                     sep='::', names=['userId', 'movieId', 'rating', 'timestamp'], engine='python')
users = pd.read_csv('C:/Users/User/Desktop/Data Scientist/AML/Recsys/data/ml-1m/users.dat', 
                   sep='::', names=['userId', 'gender', 'age', 'occupation', 'zipcode'], engine='python')

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [5]:
users.head()

Unnamed: 0,userId,gender,age,occupation,zipcode
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [6]:
movies_with_ratings = pd.merge(ratings, movies, how='left', on='movieId')
movies_with_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance
3,1,3408,4,978300275,Erin Brockovich (2000),Drama
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy


In [7]:
movies_with_ratings.shape

(1000209, 6)

In [8]:
dataset = pd.DataFrame({'uid': movies_with_ratings.userId,
                        'iid': movies_with_ratings.title, 
                        'rating': movies_with_ratings.rating})

reader = Reader(rating_scale=(ratings.rating.min(), ratings.rating.max()))
data = Dataset.load_from_df(dataset, reader)

In [9]:
trainset = data.build_full_trainset()
testset = trainset.build_testset()

Применим три разных алгоритма к датасету для предсказания рейтинга и сравним метрики оценок предсказанных значений с помощью RMSE. На основе двух лучших алгоритмов построим гибридный с помощью взвешивания.

In [10]:
algo1 = KNNWithZScore(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})

In [11]:
%%time
algo1.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Wall time: 5min 38s


<surprise.prediction_algorithms.knns.KNNWithZScore at 0x2733ed11e88>

In [12]:
%%time
accuracy.rmse(algo1.test(testset), verbose=True)

RMSE: 0.5868
Wall time: 23min 59s


0.5867598932537077

In [13]:
algo2 = KNNWithZScore(k=50, sim_options={'name': 'pearson_baseline', 'user_based': False})

In [14]:
%%time
algo2.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Wall time: 2min 2s


<surprise.prediction_algorithms.knns.KNNWithZScore at 0x27320ee1ec8>

In [15]:
%%time
accuracy.rmse(algo2.test(testset), verbose=True)

RMSE: 0.5941
Wall time: 12min 5s


0.5941407156491711

In [16]:
algo3 = SVD(n_factors=20, n_epochs=30)

In [17]:
%%time
algo3.fit(trainset)

Wall time: 1min 7s


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x27320eecf88>

In [18]:
%%time
accuracy.rmse(algo3.test(testset), verbose=True)

RMSE: 0.7562
Wall time: 22.9 s


0.7562139223399588

In [19]:
class HybridAlgorithm(AlgoBase):
    
    def __init__(self, alg1, alg2):
        
        AlgoBase.__init__(self)
        self.alg1 = alg1
        self.alg2 = alg2
    
    
    def fit(self, trainset):
        
        AlgoBase.fit(self, trainset)
        
        return self
    
    
    def estimate(self, u, i):
        ru = self.trainset.to_raw_uid(u)
        ri = self.trainset.to_raw_iid(i)
        a1 = self.alg1.predict(uid=ru, iid=ri).est
        a2 = self.alg2.predict(uid=ru, iid=ri).est

        return 0.7 * a1 + 0.3 * a2

In [20]:
# Гибридный алгоритм на основе первого и второго алгоритмов
hybrid_alg = HybridAlgorithm(algo1, algo2)
hybrid_alg.fit(trainset)

<__main__.HybridAlgorithm at 0x2733ed11788>

In [21]:
%%time
accuracy.rmse(hybrid_alg.test(testset), verbose=True)

RMSE: 0.5799
Wall time: 38min 6s


0.5799258106265032

Оценка *RMSE* 1-го, 2-го и 3-го алгоритма, соответственно, равна 0.5868, 0.5941 и 0.7562. Оценка гибридного на основе 1-го и 2-го немного улучшилась и равна 0.5799.

---

Построим гибридную рекомендательную систему на основе близости векторов пользователей с оценками фильмов в качестве компонентов вектора и применения гибридного алгоритма для предсказания оценки фильма.  
Для рекомендации фильма пользователю найдём похожих к нему 10 других пользователей по поставленным фильмам рейтингам.  

1-й подход: применим наш алгоритм для предсказания рейтингов к фильмам, которые не оценивал пользователь и которым похожие на него пользователи поставили рейтинг 5, и выберем 10 фильмов с наибольшими предсказанными значениями  

2-й подход: применим алгоритм для предсказания рейтингов к фильмам, которые пользователь не оценивал и которым наиболее похожий к нему пользователь по признакам пол и возраст поставил рейтинг 5, и также выберем 10 фильмов с наибольшими предсказанными значениями.

In [29]:
movies_with_ratings['movieId_new'] = movies_with_ratings.movieId.astype('category').cat.codes

In [30]:
num_movies = movies_with_ratings.movieId_new.unique().shape[0]
user_vector = {}

for user, group in tqdm(movies_with_ratings.groupby('userId')):
    
    user_vector[user] = np.zeros(num_movies)
    for i in range(len(group.movieId_new.values)):
        m = group.movieId_new.values[i]
        r = group.rating.values[i]
        user_vector[user][int(m)] = r

HBox(children=(FloatProgress(value=0.0, max=6040.0), HTML(value='')))




In [34]:
def recommendation(cur_user, alg, user_description=True):
    """
    1-й подход: user_description=False
    2-й подход: user_description=True
    """
    cur_user_list = movies_with_ratings.loc[movies_with_ratings.userId == cur_user, 'title'].values.tolist()

    sim_users = []
    user_dist = []

    for key in tqdm(user_vector.keys()):
        if key == cur_user:
            continue

        sim_users.append(key)
        user_dist.append(jaccard(user_vector[cur_user], user_vector[key]))

    best_ind = np.argsort(user_dist)[:10]
    most_sim_users = [sim_users[i] for i in best_ind]
    
    
    if not user_description:

        filter_str1 = 'userId in @most_sim_users and rating == 5 and title not in @cur_user_list'
        temp1 = movies_with_ratings.query(filter_str1).drop_duplicates(subset='movieId')

        temp1['prediction'] = temp1.apply(lambda x: alg.predict(uid=x.userId, iid=x.title).est, axis=1)

        return temp1.sort_values(by='prediction', ascending=False)[['title', 'prediction']].head(10)
    
    else:

        users['gender_new'] = users.gender.astype('category').cat.codes
        users['age_new'] = users.age.astype('category').cat.codes

        cur_user_descr = users.loc[users.userId == 1, ['gender_new', 'age_new']].values
        user_descr_dist = []

        for usr in most_sim_users:
            sim_user_descr = users.loc[users.userId == usr, ['gender_new', 'age_new']].values
            user_descr_dist.append(euclidean(cur_user_descr, sim_user_descr))

        most_sim_user = most_sim_users[np.argsort(user_descr_dist)[0]]


        most_sim_user_list = movies_with_ratings.loc[movies_with_ratings.userId == most_sim_user, 'title'].values.tolist()

        filter_str2 = 'title not in @cur_user_list and title in @most_sim_user_list and userId == @most_sim_user'
        temp2 = movies_with_ratings.query(filter_str2).sort_values(by='rating', ascending = False)

        temp2['prediction'] = temp2.apply(lambda x: alg.predict(uid=x.userId, iid=x.title).est, axis=1)

        return temp2.sort_values(by='prediction', ascending=False)[['title', 'prediction']].head(10)

In [35]:
# 1-й подход
recommendation(1, hybrid_alg, False)

HBox(children=(FloatProgress(value=0.0, max=6040.0), HTML(value='')))




Unnamed: 0,title,prediction
245276,"Wrong Trousers, The (1993)",5.0
884702,Winnie the Pooh and the Blustery Day (1968),4.980765
245320,"Grand Day Out, A (1992)",4.980733
245259,Gone with the Wind (1939),4.977521
245304,"Shawshank Redemption, The (1994)",4.969353
884692,Gladiator (2000),4.962702
26007,Babe (1995),4.953871
245323,"Sting, The (1973)",4.95256
26026,Amadeus (1984),4.950202
884694,Annie Hall (1977),4.942871


In [32]:
# 2-й подход
recommendation(1, hybrid_alg)

HBox(children=(FloatProgress(value=0.0, max=6040.0), HTML(value='')))




Unnamed: 0,title,prediction
94465,Star Wars: Episode V - The Empire Strikes Back...,4.900287
94495,Star Wars: Episode VI - Return of the Jedi (1983),4.801998
94493,"Lion King, The (1994)",4.719842
94496,Willy Wonka and the Chocolate Factory (1971),4.612494
94460,Star Wars: Episode I - The Phantom Menace (1999),4.60183
94462,Little Nemo: Adventures in Slumberland (1992),4.594471
94483,"Parent Trap, The (1961)",4.581795
94452,Robin Hood (1973),4.547415
94477,"Prince of Egypt, The (1998)",4.490733
94448,Newsies (1992),4.469394
