# Collaborative Filtering RecSys

Stanford CS246 Mining Massive Data Sets http://web.stanford.edu/class/cs246/

Lecture videos can be found on YouTube: https://www.youtube.com/playlist?list=PLLssT5z_DsK9JDLcT8T62VtzwyW9LNepV

In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation
from sklearn import model_selection as model

In [2]:
ratings = pd.read_csv('data/ratings_small.csv')
movies = pd.read_csv('data/movies_small.csv')

In [3]:
ratings = ratings.drop('timestamp',axis=1)
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
ratings.shape

(100836, 3)

In [6]:
movies.shape

(9742, 3)

In [7]:
ratings['rating'].describe()

count    100836.000000
mean          3.501557
std           1.042529
min           0.500000
25%           3.000000
50%           3.500000
75%           4.000000
max           5.000000
Name: rating, dtype: float64

In [8]:
n_users = ratings.userId.unique().shape[0]
n_movies = ratings.movieId.unique().shape[0]
print('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_movies))

Number of users = 610 | Number of movies = 9724


In [9]:
train, test = model.train_test_split(ratings, test_size=0.2)

In [10]:
train.count()

userId     80668
movieId    80668
rating     80668
dtype: int64

In [11]:
train.movieId.unique().shape[0]

8954

In [12]:
train.userId.unique().shape[0]

610

In [13]:
temp = pd.DataFrame(train['movieId'].value_counts())
temp.columns = ['number']
temp['number'].quantile(0.8)

11.0

In [14]:
# keep the movies with the number of ratings larger than 80% of the  movies
train = train[train.movieId.isin(temp[temp.number > temp['number'].quantile(0.80)].index)]

In [15]:
train.count()

userId     59557
movieId    59557
rating     59557
dtype: int64

In [16]:
temp2 = pd.DataFrame(train['userId'].value_counts())
temp2.columns = ['number']
temp2['number'].quantile(0.25)

25.0

In [17]:
# keep users who rated movies more than 25% of the users
train = train[train.userId.isin(temp2[temp2.number > temp2['number'].quantile(0.25)].index)]

In [18]:
train.count()

userId     56731
movieId    56731
rating     56731
dtype: int64

In [19]:
train_movies = movies[movies.movieId.isin(train['movieId'])]

In [20]:
temp = pd.DataFrame(test['movieId'].value_counts())
temp.columns = ['number']
test = test[test.movieId.isin(temp[temp.number > temp['number'].quantile(0.80)].index)]
temp2 = pd.DataFrame(test['userId'].value_counts())
temp2.columns = ['number']
test = test[test.userId.isin(temp2[temp2.number > temp2['number'].quantile(0.25)].index)]
test_movies = movies[movies.movieId.isin(test['movieId'])]

In [21]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11754 entries, 32038 to 73523
Data columns (total 3 columns):
userId     11754 non-null int64
movieId    11754 non-null int64
rating     11754 non-null float64
dtypes: float64(1), int64(2)
memory usage: 367.3 KB


In [22]:
# format the ratings matrix to be one row per user and one column per movie
train_matrix = train.pivot(index = 'userId', 
                            columns ='movieId', 
                            values = 'rating').fillna(0)
train_matrix.head()

movieId,1,2,3,5,6,7,9,10,11,12,...,139385,140110,142488,148626,152081,158238,164179,166528,168252,176371
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,4.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,5.0,4.0,4.0,0.0,3.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
test_matrix = test.pivot(index = 'userId', 
                            columns ='movieId', 
                            values = 'rating').fillna(0)
test_matrix.head()

movieId,1,2,3,5,6,7,10,11,15,16,...,122918,122920,134130,139385,148626,152081,157296,164179,166528,168250
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,4.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# Pearson correlation 
# User Similarity Matrix
user_similarity = 1 - pairwise_distances(train_matrix, metric='correlation')
user_similarity[np.isnan(user_similarity)] = 0

In [25]:
user_similarity = pd.DataFrame(user_similarity, columns=train_matrix.index, index = train_matrix.index)
user_similarity.head()

userId,1,4,5,6,7,8,10,11,14,15,...,600,601,602,603,604,605,606,607,608,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.128369,0.076588,0.044792,0.025698,0.055736,-0.043886,0.066843,0.00735,0.021578,...,0.142296,-0.005428,0.082217,0.128698,0.001013,0.083058,0.054576,0.150068,0.140995,-0.022112
4,0.128369,1.0,0.050531,0.020438,0.017474,0.030828,-0.038349,-0.007966,0.011089,0.005796,...,0.108385,0.004102,0.073545,0.21427,-0.009008,0.013905,0.182559,0.101518,0.037233,-0.016715
5,0.076588,0.050531,1.0,0.325126,0.076939,0.331215,0.020853,0.170392,0.116798,0.085062,...,0.053514,0.041321,0.329272,0.067873,0.193261,0.141172,0.096276,0.127122,0.041129,0.005442
6,0.044792,0.020438,0.325126,1.0,-0.011184,0.309098,-0.048388,0.201741,0.286169,0.029432,...,0.079567,-0.031646,0.422395,-0.014964,0.33535,0.060494,-0.009667,0.087103,0.017081,-0.107846
7,0.025698,0.017474,0.076939,-0.011184,1.0,0.119983,0.042415,0.132826,0.062089,0.161562,...,0.114444,0.124484,0.028609,0.007422,0.034315,0.169301,0.17199,0.101466,0.224036,0.138518


In [26]:
# Item Similarity Matrix
item_similarity = 1 - pairwise_distances(train_matrix.T, metric='correlation')
item_similarity[np.isnan(item_similarity)] = 0

In [27]:
item_similarity = pd.DataFrame(item_similarity, columns = train_matrix.columns, index = train_matrix.columns)
item_similarity.head()

movieId,1,2,3,5,6,7,9,10,11,12,...,139385,140110,142488,148626,152081,158238,164179,166528,168252,176371
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.141093,0.148871,0.102827,0.12983,0.158809,0.131753,0.104637,0.128254,0.116806,...,0.057149,-0.060276,0.068138,0.075039,0.042169,0.080323,0.088976,-0.010938,0.037608,-0.001185
2,0.141093,1.0,0.123454,0.206623,0.075082,0.101952,0.005475,0.18282,0.128416,0.13641,...,0.041698,-0.005942,0.012633,0.063239,0.063811,0.099431,0.075042,0.101568,0.106722,0.053011
3,0.148871,0.123454,1.0,0.314775,0.172749,0.36871,0.221261,0.083321,0.148878,0.194074,...,0.029567,-0.014202,0.029503,-0.004482,-0.060788,0.011782,0.016958,-0.008876,0.00725,-0.021452
5,0.102827,0.206623,0.314775,1.0,0.141883,0.305946,0.146897,0.109611,0.144529,0.012151,...,0.055838,0.070098,0.016599,0.10257,-0.054932,-0.007297,0.046197,-0.022085,-0.000502,-0.01202
6,0.12983,0.075082,0.172749,0.141883,1.0,0.136189,0.118881,0.169629,0.148771,0.138152,...,0.064161,-0.025343,0.020905,0.032699,0.028622,0.085798,0.126698,0.052009,0.050874,0.011194


## User-User Collaborative Filtering

In [28]:
def user_collaborative_filtering(userId, similarity, ratings, movies, num_of_neighbors):
    user_data = ratings.loc[ratings.userId == userId].copy()
    # movies the user has rated
    user_full = (user_data.merge(movies, how = 'left', left_on = 'movieId', 
                                 right_on = 'movieId').sort_values(['rating'], ascending=False))
    print('User {0} has already rated {1} movies.'.format(userId, user_full.shape[0]))
    # movies that haven't been rated by useer
    movies_NoRate = movies.loc[~movies['movieId'].isin(user_full['movieId'])].copy()
    preditions = []
    for movieId in movies_NoRate['movieId']:
        users_rated_this_movie = ratings[ratings.movieId == movieId]
        topK_similar_users = similarity.loc[users_rated_this_movie['userId']][userId].sort_values(ascending=False)[:num_of_neighbors]
        topK_similar_users = topK_similar_users[topK_similar_users > 0]
        if topK_similar_users.shape[0] == 0:
            ratings_preds = ratings[ratings.movieId == movieId]['rating'].mean()
        else:
            topK_users_ratings = users_rated_this_movie[users_rated_this_movie.userId.isin(topK_similar_users.index)]
            ratings_preds = np.dot(topK_similar_users.values, np.array(topK_users_ratings.rating))/sum(topK_similar_users.values)
        preditions.append(ratings_preds)
    movies_NoRate['pred_ratings'] = preditions
    return movies_NoRate.sort_values(by='pred_ratings', ascending=False)
#     return movies_NoRate

In [29]:
user_collaborative_filtering(6, user_similarity, train, train_movies, 10).head(10)

User 6 has already rated 152 movies.


Unnamed: 0,movieId,title,genres,pred_ratings
8395,110102,Captain America: The Winter Soldier (2014),Action|Adventure|Sci-Fi|IMAX,5.0
8312,106918,"Secret Life of Walter Mitty, The (2013)",Adventure|Comedy|Drama,4.953666
4927,7387,Dawn of the Dead (1978),Action|Drama|Horror,4.851421
148,176,Living in Oblivion (1995),Comedy,4.827328
8695,122918,Guardians of the Galaxy 2 (2017),Action|Adventure|Sci-Fi,4.823959
224,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,4.758188
891,1188,Strictly Ballroom (1992),Comedy|Romance,4.748524
841,1104,"Streetcar Named Desire, A (1951)",Drama,4.722366
7180,72226,Fantastic Mr. Fox (2009),Adventure|Animation|Children|Comedy|Crime,4.706998
266,306,Three Colors: Red (Trois couleurs: Rouge) (1994),Drama,4.676658


## Item-Item Collaborative Filtering

In [30]:
def item_collaborative_filtering(userId, similarity, ratings, movies, num_of_neighbors):
    user_data = ratings.loc[ratings.userId == userId].copy()
    # movies the user has rated
    user_rated_movies = (user_data.merge(movies, how = 'left', left_on = 'movieId', 
                                 right_on = 'movieId').sort_values(['rating'], ascending=False))
    print('User {0} has already rated {1} movies.'.format(userId, user_rated_movies.shape[0]))
    # movies that haven't been rated by useer
    movies_NoRate = movies.loc[~movies['movieId'].isin(user_rated_movies['movieId'])].copy()
    preditions = []
    for movieId in movies_NoRate['movieId']:
        topK_similar_items = similarity.loc[user_rated_movies['movieId']][movieId].sort_values(ascending=False)[:num_of_neighbors]
        topK_similar_items = topK_similar_items[topK_similar_items > 0]
        
        if topK_similar_items.shape[0] == 0:
            ratings_preds = ratings[ratings.movieId == movieId]['rating'].mean()
        else:
            topK_movies_ratings = user_rated_movies[user_rated_movies.movieId.isin(topK_similar_items.index)]
            ratings_preds = np.dot(topK_similar_items.values, np.array(topK_movies_ratings.rating))/sum(topK_similar_items.values)
            
        preditions.append(ratings_preds)
    movies_NoRate['pred_ratings'] = preditions
    return movies_NoRate.sort_values(by='pred_ratings', ascending=False)

In [31]:
item_collaborative_filtering(6, item_similarity, train, train_movies, 10).head(10)

User 6 has already rated 152 movies.


Unnamed: 0,movieId,title,genres,pred_ratings
6943,65230,Marley & Me (2008),Comedy|Drama,4.558307
4807,7162,Cold Mountain (2003),Drama|Romance|War,4.47544
106,122,Boomerang (1992),Comedy|Romance,4.469666
780,1022,Cinderella (1950),Animation|Children|Fantasy|Musical|Romance,4.469388
785,1028,Mary Poppins (1964),Children|Comedy|Fantasy|Musical,4.454474
1550,2087,Peter Pan (1953),Animation|Children|Fantasy|Musical,4.424843
7665,88810,"Help, The (2011)",Drama,4.423997
7988,96821,"Perks of Being a Wallflower, The (2012)",Drama|Romance,4.413091
7655,88405,Friends with Benefits (2011),Comedy|Romance,4.408461
792,1035,"Sound of Music, The (1965)",Musical|Romance,4.387796


The above is the implemenation without global baseline. Next we'll try to add global baseline

### With Global Baseline 

In [32]:
# Item-Item CF with Global Baseline 
def item_collaborative_filtering_baseline(userId, similarity, ratings, movies, num_of_neighbors):
    user_data = ratings.loc[ratings.userId == userId].copy()
    # movies the user has rated
    user_rated_movies = (user_data.merge(movies, how = 'left', left_on = 'movieId', 
                                 right_on = 'movieId').sort_values(['rating'], ascending=False))
    print('User {0} has already rated {1} movies.'.format(userId, user_rated_movies.shape[0]))
    # movies that haven't been rated by useer
    movies_NoRate = movies.loc[~movies['movieId'].isin(user_rated_movies['movieId'])].copy()
    
    # Global baseline estimate
    avg_rating = ratings['rating'].mean()
    user_avg = user_data['rating'].mean()
    bx = user_avg - avg_rating
    
    preditions = []
    for movieId in movies_NoRate['movieId']:
        movie_avg = ratings[ratings.movieId == movieId]['rating'].mean()
        bi = movie_avg - avg_rating
        bxi = avg_rating + bx + bi
        
        topK_similar_items = similarity.loc[user_rated_movies['movieId']][movieId].sort_values(ascending=False)[:num_of_neighbors]
        topK_similar_items = topK_similar_items[topK_similar_items > 0]
        ttt = pd.DataFrame(topK_similar_items)
        ttt.columns = ['similarity']
        
        if topK_similar_items.shape[0] == 0:
            ratings_preds = bxi
        else:
            topK_movies_ratings = user_rated_movies[user_rated_movies.movieId.isin(topK_similar_items.index)]
            tt = topK_movies_ratings.groupby('movieId').mean().merge(ttt, how = 'left', 
                                                                     left_on = 'movieId', 
                                                                     right_on = 'movieId')
            tt['rating'] = tt['rating'] + bx
            
            ratings_preds = bxi + np.dot(np.array(tt.sort_values(by='movieId').similarity), 
                                   np.array(topK_movies_ratings.sort_values(by='movieId').rating) 
                                   - np.array(tt.sort_values(by='movieId').rating))/sum(topK_similar_items.values)
            
        preditions.append(ratings_preds)
    movies_NoRate['pred_ratings'] = preditions
    return movies_NoRate.sort_values(by='pred_ratings', ascending=False)

In [33]:
item_collaborative_filtering_baseline(6, item_similarity, train, train_movies, 10).head(10)

User 6 has already rated 152 movies.


Unnamed: 0,movieId,title,genres,pred_ratings
2568,3435,Double Indemnity (1944),Crime|Drama|Film-Noir,4.541667
906,1204,Lawrence of Arabia (1962),Adventure|Drama|War,4.5
9071,142488,Spotlight (2015),Thriller,4.433333
947,1248,Touch of Evil (1958),Crime|Film-Noir|Thriller,4.423077
841,1104,"Streetcar Named Desire, A (1951)",Drama,4.423077
2593,3468,"Hustler, The (1961)",Drama,4.40625
918,1217,Ran (1985),Drama|War,4.375
971,1272,Patton (1970),Drama|War,4.375
585,720,Wallace & Gromit: The Best of Aardman Animatio...,Adventure|Animation|Comedy,4.368421
659,858,"Godfather, The (1972)",Crime|Drama,4.350694


## Model Evaluation

### Root-Mean-Square-Error (RMSE)

In [34]:
test.head()

Unnamed: 0,userId,movieId,rating
32038,219,59369,3.5
5897,42,165,2.0
78768,489,2716,4.0
22803,156,1213,4.0
89571,580,4447,1.0


In [35]:
n_users = test.userId.unique().shape[0]
n_movies = test.movieId.unique().shape[0]
print('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_movies))

Number of users = 453 | Number of movies = 942


In [36]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11754 entries, 32038 to 73523
Data columns (total 3 columns):
userId     11754 non-null int64
movieId    11754 non-null int64
rating     11754 non-null float64
dtypes: float64(1), int64(2)
memory usage: 367.3 KB


In [37]:
size = test.shape[0]
size

11754

In [38]:
# Test Item Similarity Matrix
test_item_similarity = 1 - pairwise_distances(test_matrix.T, metric='correlation')
test_item_similarity[np.isnan(test_item_similarity)] = 0
test_item_similarity = pd.DataFrame(test_item_similarity, columns = test_matrix.columns, index = test_matrix.columns)
test_item_similarity.values[[np.arange(test.movieId.unique().shape[0])] * 2] = 0
test_item_similarity.head()

movieId,1,2,3,5,6,7,10,11,15,16,...,122918,122920,134130,139385,148626,152081,157296,164179,166528,168250
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.066209,0.005977,0.036343,0.041088,-0.037751,-0.036562,-0.041628,-0.029196,-0.055684,...,-0.022158,-0.033386,0.02732,-0.034262,0.039731,-0.041149,-0.034808,-0.033477,-0.040145,-0.033274
2,0.066209,0.0,0.115513,-0.035454,0.010023,0.032486,0.057814,0.038439,-0.023389,-0.026673,...,0.061208,-0.026746,-0.02917,-0.027448,0.070028,0.036653,-0.027885,0.055338,0.135325,-0.026656
3,0.005977,0.115513,0.0,-0.02137,0.007574,-0.018228,-0.030371,-0.0201,-0.014098,-0.026888,...,-0.016877,-0.016121,-0.017582,-0.016544,-0.017254,-0.019869,-0.016807,-0.016165,-0.019385,-0.016066
5,0.036343,-0.035454,-0.02137,0.0,0.02985,-0.019428,-0.032369,-0.021423,-0.015025,0.209808,...,-0.017988,-0.017182,-0.018739,-0.017632,-0.018389,-0.021176,-0.017913,-0.017228,-0.02066,-0.017124
6,0.041088,0.010023,0.007574,0.02985,0.0,-0.016445,-0.054439,-0.03603,-0.02527,0.025464,...,0.133238,-0.028897,0.042393,0.031447,0.054155,-0.035615,-0.030127,0.050736,0.119258,-0.028799


In [39]:
def RMSE_item_collaborative_filtering(userId, similarity, ratings, movies, num_of_neighbors):
    user_data = ratings.loc[ratings.userId == userId].copy()
    # movies the user has rated
    user_rated_movies = (user_data.merge(movies, how = 'left', left_on = 'movieId', 
                                 right_on = 'movieId').sort_values(['rating'], ascending=False))
#     print('User {0} has already rated {1} movies.'.format(userId, user_rated_movies.shape[0]))
    # movies that haven't been rated by useer
#     movies_NoRate = movies.loc[~movies['movieId'].isin(user_rated_movies['movieId'])].copy()
    preditions = []
    for movieId in user_rated_movies['movieId']:
        topK_similar_items = similarity.loc[user_rated_movies['movieId']][movieId].sort_values(ascending=False)[:num_of_neighbors]
        topK_similar_items = topK_similar_items[topK_similar_items > 0]
        
        if topK_similar_items.shape[0] == 0:
            ratings_preds = ratings[ratings.movieId == movieId]['rating'].mean()
        else:
            topK_movies_ratings = user_rated_movies[user_rated_movies.movieId.isin(topK_similar_items.index)]
            ratings_preds = np.dot(topK_similar_items.values, np.array(topK_movies_ratings.rating))/sum(topK_similar_items.values)
            
        preditions.append(ratings_preds)
    user_rated_movies['pred_ratings'] = preditions
    return user_rated_movies.sort_values(by='pred_ratings', ascending=False)

In [40]:
def RMSE(similarity, ratings, movies, num_of_neighbors, baseline=False):
    sizeN = ratings.shape[0]
    err = 0
    if baseline:
        for userId in ratings.userId.unique():
            preds = RMSE_item_collaborative_filtering_baseline(userId, similarity, ratings, movies, num_of_neighbors)
            err += np.dot(np.array(preds['rating']) - np.array(preds['pred_ratings']), 
                          np.array(preds['rating']) - np.array(preds['pred_ratings']))
    else:
        for userId in ratings.userId.unique():
            preds = RMSE_item_collaborative_filtering(userId, similarity, ratings, movies, num_of_neighbors)
            err += np.dot(np.array(preds['rating']) - np.array(preds['pred_ratings']), 
                          np.array(preds['rating']) - np.array(preds['pred_ratings']))
    rmse = np.sqrt(err/sizeN)
    return rmse     

In [41]:
RMSE(test_item_similarity, test, test_movies, 10, False)

0.9832655528231398

### With global baseline

In [42]:
# Item-Item CF with Global Baseline 
def RMSE_item_collaborative_filtering_baseline(userId, similarity, ratings, movies, num_of_neighbors):
    user_data = ratings.loc[ratings.userId == userId].copy()
    # movies the user has rated
    user_rated_movies = (user_data.merge(movies, how = 'left', left_on = 'movieId', 
                                 right_on = 'movieId').sort_values(['rating'], ascending=False))
#     print('User {0} has already rated {1} movies.'.format(userId, user_rated_movies.shape[0]))
#     # movies that haven't been rated by useer
#     movies_NoRate = movies.loc[~movies['movieId'].isin(user_rated_movies['movieId'])].copy()
    
    # Global baseline estimate
    avg_rating = ratings['rating'].mean()
    user_avg = user_data['rating'].mean()
    bx = user_avg - avg_rating
    
    preditions = []
    for movieId in user_rated_movies['movieId']:
        movie_avg = ratings[ratings.movieId == movieId]['rating'].mean()
        bi = movie_avg - avg_rating
        bxi = avg_rating + bx + bi
        
        topK_similar_items = similarity.loc[user_rated_movies['movieId']][movieId].sort_values(ascending=False)[:num_of_neighbors]
        topK_similar_items = topK_similar_items[topK_similar_items > 0]
        ttt = pd.DataFrame(topK_similar_items)
        ttt.columns = ['similarity']
        
        if topK_similar_items.shape[0] == 0:
            ratings_preds = bxi
        else:
            topK_movies_ratings = user_rated_movies[user_rated_movies.movieId.isin(topK_similar_items.index)]
            tt = topK_movies_ratings.groupby('movieId').mean().merge(ttt, how = 'left', 
                                                                     left_on = 'movieId', 
                                                                     right_on = 'movieId')
            tt['rating'] = tt['rating'] + bx
            
            ratings_preds = bxi + np.dot(np.array(tt.sort_values(by='movieId').similarity), 
                                   np.array(topK_movies_ratings.sort_values(by='movieId').rating) 
                                   - np.array(tt.sort_values(by='movieId').rating))/sum(topK_similar_items.values)
            
        preditions.append(ratings_preds)
    user_rated_movies['pred_ratings'] = preditions
    return user_rated_movies.sort_values(by='pred_ratings', ascending=False)

In [43]:
RMSE(test_item_similarity, test, test_movies, 10, True)

0.8744799395615191