# 1. Importing Libraries

In [62]:
import pandas as pd
import numpy as np
import surprise

# 2. Importing Dataset

In [79]:
dataset = pd.read_csv('../data/interim/user_item_matrix.csv')

In [80]:
dataset.head()

Unnamed: 0,user_id,item_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


# 3. Training

In [65]:
reader = surprise.Reader(rating_scale=(1, 5))
data = surprise.Dataset.load_from_df(dataset[['user_id', 'item_id', 'rating']], reader)

In [18]:
from surprise.model_selection import cross_validate
from surprise import SVD, SVDpp, SlopeOne, NMF, CoClustering, NormalPredictor, KNNBaseline, BaselineOnly, KNNWithZScore, \
    KNNWithMeans, KNNBasic

In [43]:
np.random.seed(42)
scores = []

# algorithms = [SVD(), SVDpp(), SlopeOne(), NMF(), CoClustering()]
# I wasnt satisfied with the results and decided to add some more algorithms to try
algorithms = [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]

algorithms_names = ['SVD', 'SVDpp', 'SlopeOne', 'NMF', 'NormalPredictor', 'KNNBaseline', 'KNNBasic', 'KNNWithMeans', 'KNNWithZScore', 'BaselineOnly', 'CoClustering']
for i in range(len(algorithms)):
    print("Training: ", algorithms_names[i])
    results = cross_validate(algorithms[i], data, measures=['RMSE'], cv=3, verbose=False)

    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    scores.append(tmp)
    print("Done: ", algorithms_names[i])


Training:  SVD
Done:  SVD
Training:  SVDpp
Done:  SVDpp
Training:  SlopeOne
Done:  SlopeOne
Training:  NMF
Done:  NMF
Training:  NormalPredictor
Done:  NormalPredictor
Training:  KNNBaseline
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Done:  KNNBaseline
Training:  KNNBasic
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Done:  KNNBasic
Training:  KNNWithMeans
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing

In [46]:
benchmark = pd.DataFrame(scores, index=algorithms_names).sort_values('test_rmse')

In [47]:
benchmark

Unnamed: 0,test_rmse,fit_time,test_time
SVDpp,0.929427,9.195778,4.501023
KNNBaseline,0.936297,0.38269,3.55458
SVD,0.945918,0.68001,0.173367
BaselineOnly,0.947652,0.138998,0.13834
SlopeOne,0.949663,0.349993,2.432777
KNNWithZScore,0.956647,0.334333,3.344607
KNNWithMeans,0.95745,0.280987,3.173004
CoClustering,0.968217,1.532267,0.166334
NMF,0.974666,1.168703,0.160672
KNNBasic,0.987384,0.254972,2.974845


After the expanding the algorithms list, I got the same result. SVDpp is the best algorithm(among these) for this dataset.

# 4. Fine Tuning

In [19]:
from surprise.model_selection import GridSearchCV
param_grid = {
    "n_epochs": [40, 50, 100],
    "lr_all": [0.004, 0.006, 0.005],
    "reg_all": [0.08, 0.09, 0.07],
    "n_factors": [15,20,25]
}
gs = GridSearchCV(SVDpp, param_grid, measures=['rmse'], cv=5)
gs.fit(data)

In [20]:
best_params = gs.best_params['rmse']
print("Best rmse: ", gs.best_score['rmse'])

Best rmse:  0.9087929784329034


In [21]:
best_params

{'n_epochs': 100, 'lr_all': 0.004, 'reg_all': 0.09, 'n_factors': 25}

# 5. Final Training and Evaluation

In [66]:
algo = SVDpp(n_epochs=best_params['n_epochs'], lr_all=best_params['lr_all'], reg_all=best_params['reg_all'], n_factors=best_params['n_factors'])

In [67]:
# train the final model
from surprise.model_selection import train_test_split

trainset, testset =  train_test_split(data, test_size=0.25)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x1118286e7a0>

In [25]:
from surprise import accuracy
predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

RMSE: 0.9137


0.9136910619732397

# 6. Recommendations

In [68]:
# Now, after the evaluation stage, let's train the model on the whole dataset
algo = SVDpp(n_epochs=best_params['n_epochs'], lr_all=best_params['lr_all'], reg_all=best_params['reg_all'], n_factors=best_params['n_factors'])
trainset = data.build_full_trainset()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x1118286c9d0>

In [69]:
df_item = pd.read_csv('../data/raw/ml-100k/u.item', sep='|', header=None, names=['movie_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'], encoding='latin-1')

In [70]:
df_item.head()

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [71]:
def recommend_movies(algo, user_id, top_k):
    # First, the algorithm will predict ratings for each unseen movie, second, it will output the top k films with the highest (predicted) rating
    seen_movies = set(dataset[dataset['user_id']==user_id]['item_id'].tolist())
    all_movies = set(dataset['item_id'].tolist())
    unseen_movies = all_movies - seen_movies
    results = []
    for movie in unseen_movies:
        results.append((movie,algo.predict(user_id, movie).est))
    results.sort(key=lambda x: x[-1], reverse=True)
    # Names of movies
    results = [df_item[df_item['movie_id']==r[0]]['movie_title'].item() for r in results[:top_k]]
    return results

In [72]:
user_id = np.random.choice(dataset['user_id'].tolist())
print(f"Movies recommendations for user {user_id}")
for idx, movie_title in enumerate(recommend_movies(algo, user_id, top_k=5)):
    print(f"{idx+1}. {movie_title}")

Movies recommendations for user 12
1. Shawshank Redemption, The (1994)
2. Pather Panchali (1955)
3. Braveheart (1995)
4. Titanic (1997)
5. Saint of Fort Washington, The (1993)


In [74]:
# Saving the model
from surprise import dump
dump.dump("../models/model.pickle", algo=algo)
