#### Домашнее задание 11 november

    - выбрать item-based/user-based подход для работы с датасетом
    - реализовать модель подсчета рейтинга с использованием кластеризации (количество кластеров обосновать)
    - посчитать рейтинги с помощью матричной факторизации SVD/NMF
    - сравнить три способа (имеющийся с корреляцией Пирсона, кластеризация, матричная факторизация)

In [114]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd

import surprise
from surprise import KNNBasic
from surprise import Dataset
from surprise import Reader

from sklearn.cluster import KMeans
from sklearn import datasets
from sklearn import  metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.base import BaseEstimator

In [117]:
#Util for time benchmark

def timeit(method):
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        if 'log_time' in kw:
            name = kw.get('log_name', method.__name__.upper())
            kw['log_time'][name] = int((te - ts) * 1000)
        else:
            print(f'{method.__name__} works {(te - ts) * 1000 } ms')
        return result
    return timed

In [118]:
ratings = pd.read_csv('./ml-latest-small/ratings.csv', parse_dates=['timestamp'])
ratings = ratings.sample(frac=1).reset_index(drop=True)

In [119]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,367,1,5.0,997811550
1,274,7347,3.0,1171785570
2,274,56012,3.5,1284685853
3,189,2762,4.0,1439465088
4,610,91500,4.0,1493845427


In [120]:
RATIO = 0.2

In [121]:
test = ratings[-int(ratings.shape[0]*RATIO):]
test = test.drop(["timestamp"], axis =1)
ratings.loc[test.index,"rating"] = 0
train = ratings.drop(["timestamp"], axis =1)

In [158]:
class Recommender:
    
    def __init__(self, model, train_set, test_set):
        self._model = model
        self._train_set = train_set
        self._test_set = test_set
    
    @property
    def model(self):
        return self._model
    
    @property
    def test_set(self):
        return self._test_set
    
    def rmse(self):
        etalon_values = self._test_set["rating"].tolist()
        predicted_values = self._test_set["predict"].tolist()
        rmse_result = self.__calculate_rmse__(etalon_values, predicted_values)
        return rmse_result
    
    @timeit
    def recommend(self):
        lower_rating = self._train_set["rating"].min()
        upper_rating = self._train_set["rating"].max()
        data = Dataset.load_from_df(self._train_set, Reader(rating_scale=(lower_rating,upper_rating)))
        trainset = data.build_full_trainset()
        self._model.fit(trainset)
        predict = lambda row: self._model.predict(uid=row.movieId,iid=row.userId).est
        self._test_set["predict"] = self._test_set.apply(predict, axis = 1)
    
    def __calculate_rmse__(self, y_true, y_pred):
        return np.sqrt(mean_squared_error(y_true, y_pred))

In [163]:
class ClusterizationRecommender(Recommender):
    
    def __init__(self, model, train_set, test_set):
        super().__init__(model, train_set, test_set)
        self.__cluster_df = pd.pivot_table(train_set, values='rating', index='userId',columns='movieId', fill_value=0).T
    
    @property
    def data_frame(self):
        return self.__cluster_df
    
    @timeit
    def recommend(self):
        self._model.fit(self.__cluster_df)
        self.__cluster_df["cluster"] = self._model.labels_
        self.__cluster_df = self.__cluster_df\
            .groupby("cluster")\
            .transform(lambda x: x.replace(0,x.mean()))
        map_function = lambda row: self.__cluster_df.loc[row.movieId, row.userId]
        self._test_set["predict"] = self._test_set.apply(map_function, axis = 1)
        

In [160]:
sim_options = {
            'name': 'MSD',
            'user_based': True
}
knn_model = KNNBasic(k=150, sim_options = sim_options)
pirson_recomender = Recommender(knn_model, train.copy(), test.copy())
pirson_recomender.recommend()
rmse_result = pirson_recomender.rmse()

Computing the msd similarity matrix...
Done computing similarity matrix.
recommend works 2920.1743602752686 ms


In [161]:
rmse_result

1.3242309987496674

In [162]:
pirson_recomender.test_set

Unnamed: 0,userId,movieId,rating,predict
80669,104,1688,3.0,2.802531
80670,105,92259,5.0,2.802531
80671,475,3578,5.0,2.802531
80672,573,67087,4.0,2.802531
80673,22,858,3.0,2.802531
...,...,...,...,...
100831,387,2470,2.5,2.802531
100832,199,5500,4.5,2.802531
100833,448,165,4.0,2.295896
100834,593,7153,4.5,2.802531


## Clusterization (user based)

Выбрал такое количество кластеров из-за того, что количество фильмов достаточно большое - 9724

In [164]:
COUNT_CLUSTERS = 200
kmeans_model = KMeans(COUNT_CLUSTERS,n_jobs=-1)
clusterization_recommender = ClusterizationRecommender(kmeans_model, train.copy(), test.copy())

In [165]:
clusterization_recommender.recommend()

recommend works 206450.26540756226 ms


In [166]:
clusterization_recommender.data_frame

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.000000,0.000000,0.000000,0.000000,4.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,4.00000,0.000000,4.000000,3.000000,4.000000,2.500000,4.000000,0.000000,0.000000,5.000000
2,0.000000,0.000000,0.000000,0.000000,1.500000,4.000000,0.000000,4.000000,0.000000,0.000000,...,0.00000,4.000000,0.000000,5.000000,3.500000,0.000000,0.000000,2.000000,0.000000,0.000000
3,4.000000,0.000000,0.000000,0.159091,0.181818,5.000000,0.000000,0.681818,0.000000,0.000000,...,0.00000,1.363636,0.431818,1.159091,0.079545,0.761364,0.227273,2.000000,0.136364,0.000000
4,0.011536,0.001802,0.013879,0.014780,0.000000,3.000000,0.002884,0.000000,0.004686,0.009913,...,0.00757,0.003605,0.013338,0.005768,0.028479,0.003064,0.004326,0.021089,0.002523,0.003965
5,0.500000,0.000000,0.000000,0.159091,0.181818,5.000000,0.000000,0.681818,0.000000,0.000000,...,0.00000,1.363636,0.431818,3.000000,0.079545,0.761364,0.227273,1.011364,0.136364,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.011536,0.001802,0.013879,0.014780,0.000000,0.047945,0.002884,0.000000,0.004686,0.009913,...,0.00757,0.003605,0.013338,0.005768,0.028479,0.003064,0.004326,0.021089,0.002523,0.003965
193583,0.011536,0.001802,0.013879,0.014780,0.000000,0.047945,0.002884,0.000000,0.004686,0.009913,...,0.00757,0.003605,0.013338,0.005768,0.028479,0.003064,0.004326,0.021089,0.002523,0.003965
193585,0.011536,0.001802,0.013879,0.014780,0.000000,0.047945,0.002884,0.000000,0.004686,0.009913,...,0.00757,0.003605,0.013338,0.005768,0.028479,0.003064,0.004326,0.021089,0.002523,0.003965
193587,0.011536,0.001802,0.013879,0.014780,0.000000,0.047945,0.002884,0.000000,0.004686,0.009913,...,0.00757,0.003605,0.013338,0.005768,0.028479,0.003064,0.004326,0.021089,0.002523,0.003965


In [170]:
rmse_result = clusterization_recommender.rmse()

In [171]:
rmse_result

3.275985875036119

In [169]:
clusterization_recommender.test_set

Unnamed: 0,userId,movieId,rating,predict
80669,104,1688,3.0,0.909091
80670,105,92259,5.0,0.186957
80671,475,3578,5.0,0.000000
80672,573,67087,4.0,0.483333
80673,22,858,3.0,0.000000
...,...,...,...,...
100831,387,2470,2.5,2.518519
100832,199,5500,4.5,0.025048
100833,448,165,4.0,0.000000
100834,593,7153,4.5,0.000000


## SVD approach

In [172]:
svd_model = SVDpp(n_factors=2, n_epochs=30, lr_all = 0.01, reg_all=0.1 )
svd_recommender = Recommender(svd_model, train.copy(), test.copy())
svd_recommender.recommend()

recommend works 1380747.231245041 ms


In [174]:
svd_recommender.test_set

Unnamed: 0,userId,movieId,rating,predict
80669,104,1688,3.0,2.680493
80670,105,92259,5.0,2.127163
80671,475,3578,5.0,3.589948
80672,573,67087,4.0,2.797115
80673,22,858,3.0,2.812688
...,...,...,...,...
100831,387,2470,2.5,2.741093
100832,199,5500,4.5,3.438904
100833,448,165,4.0,2.246129
100834,593,7153,4.5,3.365725


In [175]:
rmse_result = svd_recommender.rmse()
rmse_result

1.511530023452685

## Итоги

|     Метод     | Точность |Время работы|
|:-------------:|:--------:|:---------:|
|     Пирсон    |    1.3242309987496674      | 2920.1743602752686 ms|
|      SVD      |    1.511530023452685    |1380747.231245041 ms|
| Кластеризация |    3.275985875036119       |206450.26540756226 ms|


### Корреляция Пирсона: 

Точность лучшая, превосходит кластеризацию почти в два раза, время выполнения в 100 раз превосходит Кластеризации

### SVD: 

Работает безумно медленно(23 минуты), при этом точность не превосходит метод Пирсона, что делает модель непригодной для использования

### Кластеризация:
Худший метод из всех, что свидетельствует о его недостатке - низкой точности при малом количестве данных
