In [1]:
from surprise import SVD
from surprise import SVDpp
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise import KNNBasic
from surprise.prediction_algorithms.knns import KNNWithZScore
from surprise.prediction_algorithms.slope_one import SlopeOne

import pandas as pd

In [2]:
# Fetch the built-in 'ml-1m' (MovieLens 1M) dataset through Surprise.
movie_data = Dataset.load_builtin(name='ml-1m')
# Wrap the raw rating tuples in a DataFrame; the columns keep their default
# integer labels (0..3), which later cells rely on.
df = pd.DataFrame(data=movie_data.raw_ratings)
# Preview the first five rows.
df.head(n=5)

   0     1    2          3
0  1  1193  5.0  978300760
1  1   661  3.0  978302109
2  1   914  3.0  978301968
3  1  3408  4.0  978300275
4  1  2355  5.0  978824291
Users= 6040
Items= 3706
Ratings= 1000209


In [None]:
# Summarise the ratings frame: distinct values in column 0 (reported as users),
# distinct values in column 1 (reported as items), and the total number of rows.
# dropna=False keeps the count identical to len(Series.unique()).
print("Users=", df[0].nunique(dropna=False))
print("Items=", df[1].nunique(dropna=False))
print("Ratings=", df.shape[0])

In [39]:
%%time
temp = cross_validate(KNNBasic(min_k=1),
                      movie_data, measures=['RMSE', 'MAE'], verbose=True)

Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9226  0.9224  0.9218  0.9252  0.9236  0.9231  0.0012  
MAE (testset)     0.7277  0.7274  0.7268  0.7285  0.7282  0.7277  0.0006  
Fit time          28.89   28.21   28.66   29.15   28.87   28.75   0.31    
Test time         88.21   86.08   86.36   85.01   85.42   86.22   1.11    
Wall time: 2min 8s


In [3]:
%%time
temp = cross_validate(SlopeOne(), movie_data, measures=['RMSE', 'MAE'], verbose=True)

Evaluating RMSE, MAE of algorithm SlopeOne on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9059  0.9052  0.9067  0.9083  0.9063  0.9065  0.0010  
MAE (testset)     0.7139  0.7136  0.7146  0.7156  0.7144  0.7144  0.0007  
Fit time          12.55   12.95   13.38   14.00   13.12   13.20   0.48    
Test time         44.55   43.40   42.91   41.90   42.07   42.97   0.96    
Wall time: 1min 8s


In [32]:
%%time
temp = cross_validate(SVD(n_epochs=60, n_factors=100, reg_all=0.04, lr_all=0.003),
                      movie_data, measures=['RMSE', 'MAE'], verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8584  0.8590  0.8549  0.8570  0.8583  0.8575  0.0015  
MAE (testset)     0.6743  0.6752  0.6716  0.6737  0.6741  0.6738  0.0012  
Fit time          164.07  164.10  164.29  164.17  163.63  164.05  0.22    
Test time         1.76    1.61    1.57    1.51    1.41    1.57    0.11    
Wall time: 2min 58s


In [11]:
%%time
temp = cross_validate(SVDpp(n_epochs=60, n_factors=100, reg_all=0.04, lr_all=0.003),
                      movie_data, measures=['RMSE', 'MAE'], verbose=True)

RMSE: 0.8667
MAE:  0.6768
CPU times: user 51min 1s, sys: 1.44 s, total: 51min 2s
Wall time: 50min 59s


Заключение:
 - Използваният kNN алгоритъм за user-user CF (collaborative filtering) не можа да се подобри нито с промяна на мярката за разлика, нито с промяна на броя съседи, които трябва да се проверят
 - SlopeOne алгоритъмът дава по-малка грешка от kNN алгоритъма
 - SVD алгоритъмът дава по-добри резултати от SlopeOne и kNN, като при намаляване на learning rate-а и увеличаване на регуларизацията и броя на епохи се получават най-добри резултати
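 - SVD++ алгоритъмът (RMSE 0.8667, MAE 0.6768 при показаното изпълнение) не дава по-добър резултат от SVD (RMSE 0.8575), а изпълнението му е значително по-бавно (около 51 минути)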

In [4]:
# Load the built-in Jester dataset and print a quick summary of it.
# The frame gets its own name (`jester_df`) instead of rebinding `df`, so the
# MovieLens frame created earlier is not silently overwritten and cells that
# read `df` keep reporting MovieLens statistics regardless of execution order.
jester_data = Dataset.load_builtin('jester')
jester_df = pd.DataFrame(jester_data.raw_ratings)
print(jester_df.head())
# Column 0 is reported as users and column 1 as items; every row is one rating.
print("Users=", jester_df[0].nunique())
print("Items=", jester_df[1].nunique())
print("Ratings=", len(jester_df))

   0   1       2     3
0  1   5  11.219  None
1  1   7   1.719  None
2  1   8   1.719  None
3  1  13   4.219  None
4  1  15  11.875  None
Users= 59132
Items= 140
Ratings= 1761439


In [None]:
%%time
temp = cross_validate(KNNBasic(min_k=1),
                      jester_data, measures=['RMSE', 'MAE'], verbose=True)

In [None]:
%%time
temp = cross_validate(KNNBasic(min_k=3),
                      jester_data, measures=['RMSE', 'MAE'], verbose=True)

In [5]:
%%time
temp = cross_validate(SlopeOne(), jester_data, measures=['RMSE', 'MAE'], verbose=True)

Evaluating RMSE, MAE of algorithm SlopeOne on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    4.2622  4.2432  4.2520  4.2445  4.2544  4.2512  0.0069  
MAE (testset)     3.2358  3.2222  3.2282  3.2247  3.2321  3.2286  0.0049  
Fit time          2.56    2.52    2.68    2.76    2.53    2.61    0.10    
Test time         12.43   12.92   12.42   12.17   11.72   12.33   0.39    
Wall time: 40 s


In [7]:
%%time
temp = cross_validate(SVD(), jester_data, measures=['RMSE', 'MAE'], verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    4.4950  4.4941  4.5046  4.5069  4.4947  4.4991  0.0055  
MAE (testset)     3.3679  3.3664  3.3750  3.3786  3.3694  3.3715  0.0046  
Fit time          82.90   85.27   87.52   85.44   88.12   85.85   1.85    
Test time         3.05    2.97    2.64    2.56    2.51    2.75    0.22    
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    4.3896  4.3681  4.3851  4.3808  4.3813  4.3810  0.0072  
MAE (testset)     3.3077  3.2899  3.3012  3.3013  3.3039  3.3008  0.0060  
Fit time          256.13  262.28  258.06  262.65  258.38  259.50  2.54    
Test time         2.63    3.00    3.02    2.54    2.56    2.75    0.21    
Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (tests

In [None]:
%%time
temp = cross_validate(SVD(n_epochs=60, n_factors=100, reg_all=0.04, lr_all=0.003),
                      jester_data, measures=['RMSE', 'MAE'], verbose=True)

In [None]:
%%time
temp = cross_validate(SVDpp(n_epochs=60, n_factors=100, reg_all=0.04, lr_all=0.003),
                      jester_data, measures=['RMSE', 'MAE'], verbose=True)

Conclusions:
 - 