## 5. collaborative filtering
`ratings_small.csv`: The subset of 100,000 ratings from 700 users on 9,000 movies. Can be used for **Collaborative Filtering**

In [1]:
import pandas as pd 
import numpy as np
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise import accuracy
from surprise import dump
from surprise.model_selection import GridSearchCV

In [2]:
rating = pd.read_csv('data/ratings_small.csv')

In [3]:
reader = Reader()
data = Dataset.load_from_df(rating[['userId', 'movieId', 'rating']], reader)

In [20]:
trainset, testset = train_test_split(data, test_size=0.2)
alg = SVD()

In [22]:
alg.fit(trainset)
predictions = alg.test(testset)

In [23]:
accuracy.rmse(predictions)

RMSE: 0.8971


0.8970901599273817

## SVD algorithm  
1. train on a whole trainset using SVD   
2. use gridsearchcv to find optimal parameters  
3. get the *best_estimator* and used the predict () method to predict the rating of a specific user-item pair 

In [4]:
from surprise import SVD

param_grid = {
    "n_epochs": [5, 10],
    "lr_all": [0.002, 0.005],
    "reg_all": [0.4, 0.6]
}
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)

gs.fit(data)

print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

0.9146270401244182
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


In [5]:
alg = gs.best_estimator['rmse']
alg.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1ae44531280>

#### predict the rating of a user-movie pair and compare with the true value

In [6]:
pred = alg.predict(uid=1,iid=31,r_ui =2.5)

In [17]:
pred.est

2.778933473559862

#### save predictions and model for future use

In [54]:
dump.dump('try',algo=alg,verbose=1)

The dump has been saved as file try


In [57]:
pred, alg = dump.load('try')

In [59]:
alg.predict(uid=1,iid=31,r_ui =2.5)

Prediction(uid=1, iid=31, r_ui=2.5, est=2.7816008162786963, details={'was_impossible': False})

### recommend movies for a specific user  
when give a user ID we can get the ratings predictions of all movies for that user, and recommend top rating movies that the user has not rated.

In [44]:
combined = pd.read_csv('data/combined_info.csv',index_col=0)

In [16]:
user_id = 1

In [9]:
rating_mat = rating.pivot_table('rating',index='userId',columns='movieId')

In [11]:
movies = list(rating_mat.columns)

In [55]:
def get_predictions(user_id):
    """given a user ID, use the model to predict the user's rating for all the not rated movies
    parameter user_id: integer ID of user
    return predictions dictionary with movie ID as key and predicted rating as values"""
    predictions = {}
    for i in movies:
        if rating_mat.loc[user_id, i]>=0:
            #print("Already rated movie ID:", i)
            pass
        else:
            pred = alg.predict(user_id, i).est
            predictions[i] = pred
    return predictions

In [56]:
def get_recommend_collab(user_id):
    """given a user ID, get the movie rating predictions, sort the predictions from high to low,
    consider ratings >= 3 for recommendation and output names of the top 10 movies if there are more than 10
    parameter user_id: integer ID of user
    return the list of movie names with top rating values
    """
    predictions = get_predictions(user_id)
    predictions_df = pd.DataFrame.from_dict(predictions,orient='index',columns=['prediction'])
    predictions_df = predictions_df.sort_values(by='prediction',ascending=False)
    predictions_scores = predictions_df[predictions_df['prediction']>=3]
    if predictions_scores.shape[0] >= 10:
        movie_ids = list(predictions_scores.index[:10])
    else:
        movie_ids = list(predictions_scores.index)
    recom_list = []
    for i in movie_ids:
        try:        
            a = combined.loc[i,'title']
            recom_list.append(a)
        except:
            pass
    return recom_list

In [57]:
get_recommend_collab(1)

['Sleepless in Seattle',
 'The Million Dollar Hotel',
 'The Thomas Crown Affair',
 'While You Were Sleeping',
 'Broken Blossoms',
 'Galaxy Quest']