In [75]:
import pandas as pd

# Load the raw ratings file, then discard the timestamp column so only the
# remaining columns are kept in `ratings_df`.
ratings_df = pd.read_csv("ratings.csv")
ratings_df = ratings_df.drop("timestamp", axis=1)

# Plain-text preview of the resulting frame.
print(ratings_df)

        userId  movieId  rating
0            1        1     4.0
1            1        3     4.0
2            1        6     4.0
3            1       47     5.0
4            1       50     5.0
...        ...      ...     ...
100831     610   166534     4.0
100832     610   168248     5.0
100833     610   168250     5.0
100834     610   168252     5.0
100835     610   170875     3.0

[100836 rows x 3 columns]


In [76]:
from surprise import Dataset, Reader, SVD, NMF, KNNWithMeans, CoClustering
from surprise.model_selection import KFold, cross_validate

# Seed shared by the fold splitter and the stochastic models so that a
# Restart & Run All reproduces the same scores.
SEED = 42
# Mean RMSE below which a model is reported as "appropriate".
RMSE_THRESHOLD = 0.87

# Reader() defaults to rating_scale=(1, 5), and Surprise clips every prediction
# into the reader's scale. Take the bounds from the data instead, so ratings
# outside 1-5 (if the file contains any) are not silently clipped.
rating_min = float(ratings_df["rating"].min())
rating_max = float(ratings_df["rating"].max())
reader = Reader(rating_scale=(rating_min, rating_max))
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

# Candidate algorithms. random_state is set on the ones that are randomly
# initialised; KNNWithMeans has no such parameter.
models = {
    "SVD": SVD(n_factors=10, n_epochs=30, lr_all=0.005, reg_all=0.02, random_state=SEED),
    "NMF": NMF(n_factors=15, n_epochs=50, reg_pu=0.06, reg_qi=0.06, random_state=SEED),
    "KNNWithMeans": KNNWithMeans(k=50, sim_options={"name": "pearson_baseline", "user_based": False}),
    "CoClustering": CoClustering(n_cltr_u=5, n_cltr_i=5, n_epochs=20, random_state=SEED)
}

# One seeded splitter reused for every model: with an integer random_state each
# call to split() yields the same five folds, so all algorithms are scored on
# identical train/test partitions.
folds = KFold(n_splits=5, random_state=SEED, shuffle=True)

# Map of model name -> mean test RMSE across the folds.
results = {}
for name, model in models.items():
    cv_res = cross_validate(model, data, measures=["RMSE"], cv=folds, verbose=True)
    results[name] = cv_res["test_rmse"].mean()


print("\nModels compare (avg rmse):")
for name, rmse in results.items():

    # Flag models that beat the threshold; the flag is printed just before the
    # model's own score line.
    if rmse < RMSE_THRESHOLD:
        print(f"Appropriate model: {name}")

    print(f"{name}:    {rmse.round(4)}")



Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8728  0.8622  0.8596  0.8721  0.8700  0.8673  0.0054  
Fit time          0.16    0.16    0.17    0.15    0.14    0.16    0.01    
Test time         0.03    0.03    0.03    0.09    0.03    0.04    0.02    
Evaluating RMSE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9235  0.9152  0.9221  0.9232  0.9223  0.9213  0.0031  
Fit time          0.52    0.52    0.52    0.53    0.54    0.53    0.01    
Test time         0.03    0.03    0.03    0.09    0.03    0.04    0.03    
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline simila