In [1]:
from pyspark.sql import *
import pandas as pd
from pyspark.sql.types import *
from pyspark.sql.functions import *
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise import NormalPredictor
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise import BaselineOnly
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise.accuracy import rmse
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV


spark = SparkSession.builder.appName("recommendation_system").getOrCreate()

In [1]:
import pandas as pd
ratings_df = pd.read_csv('data/ml-latest-small/ratings.csv')

print('Размерность датасета с рейтингами:', ratings_df.shape)

Размерность датасета с фильмами: (9742, 3) 
Размерность датасета с рейтингами: (100836, 4)


In [2]:
n_users = len(ratings_df.userId.unique())
n_items = len(ratings_df.movieId.unique())
print("Число уникальных юзеров:", n_users)
print("Число уникальных фильмов:", n_items)
print("Матрица предпочтений будет иметь", n_users*n_items, 'элементов.')
print("Количество оценок:", len(ratings_df))
print("Заполнено", len(ratings_df) / (n_users*n_items) * 100, '% от матрицы предпочтений')

Число уникальных юзеров: 610
Число уникальных фильмов: 9724
Матрица предпочтений будет иметь 5931640 элементов.
----------
Количество оценок: 100836
Заполнено  1.6999683055613624 % от матрицы предпочтений


In [10]:
algo =SVD()
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)
cross_validate(algo, data, measures=['RMSE','MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8716  0.8803  0.8645  0.8745  0.8785  0.8739  0.0056  
MAE (testset)     0.6685  0.6727  0.6636  0.6749  0.6739  0.6707  0.0042  
Fit time          5.16    4.64    4.09    3.84    3.88    4.32    0.50    
Test time         0.20    0.12    0.11    0.25    0.12    0.16    0.06    


{'test_rmse': array([0.87161019, 0.88030694, 0.86447473, 0.87450537, 0.87849763]),
 'test_mae': array([0.66849484, 0.67273369, 0.663605  , 0.67493185, 0.67387611]),
 'fit_time': (5.157941579818726,
  4.635645151138306,
  4.089778184890747,
  3.843646287918091,
  3.8838751316070557),
 'test_time': (0.20278716087341309,
  0.12413430213928223,
  0.10616517066955566,
  0.2458183765411377,
  0.1151888370513916)}

In [12]:
benchmark = []

algorithms = [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]

print ("Attempting: ", str(algorithms), '\n\n\n')

for algorithm in algorithms:
    print("Starting: " ,str(algorithm))
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    print("Done: " ,str(algorithm), "\n\n")

print ('\n\tDONE\n')

Attempting:  [<surprise.prediction_algorithms.matrix_factorization.SVD object at 0x000001983EFCDAF0>, <surprise.prediction_algorithms.matrix_factorization.SVDpp object at 0x000001983F0FFB80>, <surprise.prediction_algorithms.slope_one.SlopeOne object at 0x000001983F0FF400>, <surprise.prediction_algorithms.matrix_factorization.NMF object at 0x000001983F0FFE50>, <surprise.prediction_algorithms.random_pred.NormalPredictor object at 0x000001983F0FF3D0>, <surprise.prediction_algorithms.knns.KNNBaseline object at 0x000001983F0FFA00>, <surprise.prediction_algorithms.knns.KNNBasic object at 0x000001983F0FFF10>, <surprise.prediction_algorithms.knns.KNNWithMeans object at 0x000001983F0FFA60>, <surprise.prediction_algorithms.knns.KNNWithZScore object at 0x000001983F0FF070>, <surprise.prediction_algorithms.baseline_only.BaselineOnly object at 0x000001983F0FF280>, <surprise.prediction_algorithms.co_clustering.CoClustering object at 0x000001983F0FFB20>] 



Starting:  <surprise.prediction_algorithms.

  tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))


Done:  <surprise.prediction_algorithms.matrix_factorization.SVD object at 0x000001983EFCDAF0> 


Starting:  <surprise.prediction_algorithms.matrix_factorization.SVDpp object at 0x000001983F0FFB80>


  tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))


Done:  <surprise.prediction_algorithms.matrix_factorization.SVDpp object at 0x000001983F0FFB80> 


Starting:  <surprise.prediction_algorithms.slope_one.SlopeOne object at 0x000001983F0FF400>


  tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))


Done:  <surprise.prediction_algorithms.slope_one.SlopeOne object at 0x000001983F0FF400> 


Starting:  <surprise.prediction_algorithms.matrix_factorization.NMF object at 0x000001983F0FFE50>


  tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))


Done:  <surprise.prediction_algorithms.matrix_factorization.NMF object at 0x000001983F0FFE50> 


Starting:  <surprise.prediction_algorithms.random_pred.NormalPredictor object at 0x000001983F0FF3D0>


  tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))


Done:  <surprise.prediction_algorithms.random_pred.NormalPredictor object at 0x000001983F0FF3D0> 


Starting:  <surprise.prediction_algorithms.knns.KNNBaseline object at 0x000001983F0FFA00>
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


  tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))


Done:  <surprise.prediction_algorithms.knns.KNNBaseline object at 0x000001983F0FFA00> 


Starting:  <surprise.prediction_algorithms.knns.KNNBasic object at 0x000001983F0FFF10>
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


  tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))


Done:  <surprise.prediction_algorithms.knns.KNNBasic object at 0x000001983F0FFF10> 


Starting:  <surprise.prediction_algorithms.knns.KNNWithMeans object at 0x000001983F0FFA60>
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


  tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))


Done:  <surprise.prediction_algorithms.knns.KNNWithMeans object at 0x000001983F0FFA60> 


Starting:  <surprise.prediction_algorithms.knns.KNNWithZScore object at 0x000001983F0FF070>
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


  tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))


Done:  <surprise.prediction_algorithms.knns.KNNWithZScore object at 0x000001983F0FF070> 


Starting:  <surprise.prediction_algorithms.baseline_only.BaselineOnly object at 0x000001983F0FF280>
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


  tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))


Done:  <surprise.prediction_algorithms.baseline_only.BaselineOnly object at 0x000001983F0FF280> 


Starting:  <surprise.prediction_algorithms.co_clustering.CoClustering object at 0x000001983F0FFB20>
Done:  <surprise.prediction_algorithms.co_clustering.CoClustering object at 0x000001983F0FFB20> 



	DONE



  tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))


In [13]:
surprise_results = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
surprise_results

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVDpp,0.86918,301.514152,12.570909
BaselineOnly,0.876601,0.21892,0.211266
SVD,0.88172,3.91952,0.268098
KNNBaseline,0.882396,0.37258,2.25987
KNNWithMeans,0.903512,0.169325,1.910221
KNNWithZScore,0.903972,0.228817,2.106744
SlopeOne,0.90964,3.421339,7.316484
NMF,0.932192,4.528354,0.234962
CoClustering,0.949766,2.358594,0.258727
KNNBasic,0.955322,0.141917,1.782113


In [14]:
param_grid = {
    "n_epochs": [10, 20],
    "lr_all": [0.002, 0.005],
    "reg_all": [0.02]
}
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], refit=True, cv=5)

gs.fit(data)

training_parameters = gs.best_params["rmse"]

print("BEST RMSE: \t", gs.best_score["rmse"])
print("BEST MAE: \t", gs.best_score["mae"])
print("BEST params: \t", gs.best_params["rmse"])

BEST RMSE: 	 0.8723533210880243
BEST MAE: 	 0.6707469407405551
BEST params: 	 {'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.02}
