**Examples of Collaborative Filtering based Recommendation Systems**

In [2]:
#make necesarry imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
import numpy as np
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import correlation, cosine
import ipywidgets as widgets
from IPython.display import display, clear_output
from sklearn.metrics import pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt
import sys, os
from contextlib import contextmanager

In [3]:
#M is user-item ratings matrix where ratings are integers from 1-10
M = np.asarray([[3,7,4,9,9,7], 
                [7,0,5,3,8,8],
               [7,5,5,0,8,4],
               [5,6,8,5,9,8],
               [5,8,8,8,10,9],
               [7,7,0,4,7,8]])
M=pd.DataFrame(M)

#declaring k,metric as global which can be changed by the user later
global k,metric
k=4
metric='cosine' #can be changed to 'correlation' for Pearson correlation similaries

In [4]:
M

Unnamed: 0,0,1,2,3,4,5
0,3,7,4,9,9,7
1,7,0,5,3,8,8
2,7,5,5,0,8,4
3,5,6,8,5,9,8
4,5,8,8,8,10,9
5,7,7,0,4,7,8


**User-based Recommendation Systems**

In [5]:
#get cosine similarities for ratings matrix M; pairwise_distances returns the distances between ratings and hence
#similarities are obtained by subtracting distances from 1
cosine_sim = 1-pairwise_distances(M, metric="cosine")

In [6]:
#Cosine similarity matrix
pd.DataFrame(cosine_sim)

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.799268,0.779227,0.934622,0.97389,0.8846
1,0.799268,1.0,0.874744,0.90585,0.866146,0.827036
2,0.779227,0.874744,1.0,0.909513,0.865454,0.853275
3,0.934622,0.90585,0.909513,1.0,0.989344,0.865614
4,0.97389,0.866146,0.865454,0.989344,1.0,0.88164
5,0.8846,0.827036,0.853275,0.865614,0.88164,1.0


In [7]:
#get pearson similarities for ratings matrix M
pearson_sim = 1-pairwise_distances(M, metric="correlation")

In [8]:
#Pearson correlation similarity matrix
pd.DataFrame(pearson_sim)

Unnamed: 0,0,1,2,3,4,5
0,1.0,-0.137446,-0.357398,0.208179,0.761905,0.27735
1,-0.137446,1.0,0.453897,0.51591,0.112456,0.218328
2,-0.357398,0.453897,1.0,0.451378,-0.042888,0.297373
3,0.208179,0.51591,0.451378,1.0,0.763325,-0.057739
4,0.761905,0.112456,-0.042888,0.763325,1.0,0.039621
5,0.27735,0.218328,0.297373,-0.057739,0.039621,1.0


In [9]:
#This function finds k similar users given the user_id and ratings matrix M
#Note that the similarities are same as obtained via using pairwise_distances
def findksimilarusers(user_id, ratings, metric = metric, k=k):
    similarities=[]
    indices=[]
    model_knn = NearestNeighbors(metric = metric, algorithm = 'brute') 
    model_knn.fit(ratings)

    distances, indices = model_knn.kneighbors(ratings.iloc[user_id-1, :].values.reshape(1, -1), n_neighbors = k+1)
    similarities = 1-distances.flatten()
    print '{0} most similar users for User {1}:\n'.format(k,user_id)
    for i in range(0, len(indices.flatten())):
        if indices.flatten()[i]+1 == user_id:
            continue;

        else:
            print '{0}: User {1}, with similarity of {2}'.format(i, indices.flatten()[i]+1, similarities.flatten()[i])
            
    return similarities,indices

In [10]:
similarities,indices = findksimilarusers(1,M, metric='cosine')

4 most similar users for User 1:

1: User 5, with similarity of 0.973889935402
2: User 4, with similarity of 0.934621684178
3: User 6, with similarity of 0.88460045723
4: User 2, with similarity of 0.799267978052


In [11]:
similarities,indices = findksimilarusers(1,M, metric='correlation')

4 most similar users for User 1:

1: User 5, with similarity of 0.761904761905
2: User 6, with similarity of 0.277350098113
3: User 4, with similarity of 0.208179450927
4: User 2, with similarity of -0.137446320513


In [12]:
#This function predicts rating for specified user-item combination based on user-based approach
def predict_userbased(user_id, item_id, ratings, metric = metric, k=k):
    prediction=0
    similarities, indices=findksimilarusers(user_id, ratings,metric, k) #similar users based on cosine similarity
    mean_rating = ratings.loc[user_id-1,:].mean() #to adjust for zero based indexing
    sum_wt = np.sum(similarities)-1
    product=1
    wtd_sum = 0 
    
    for i in range(0, len(indices.flatten())):
        if indices.flatten()[i]+1 == user_id:
            continue;
        else: 
            ratings_diff = ratings.iloc[indices.flatten()[i],item_id-1]-np.mean(ratings.iloc[indices.flatten()[i],:])
            product = ratings_diff * (similarities[i])
            wtd_sum = wtd_sum + product
    
    prediction = int(round(mean_rating + (wtd_sum/sum_wt)))
    print '\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id,item_id,prediction)

    return prediction

In [13]:
predict_userbased(3,4,M);

4 most similar users for User 3:

1: User 4, with similarity of 0.90951268934
2: User 2, with similarity of 0.874744414849
3: User 5, with similarity of 0.86545387815
4: User 6, with similarity of 0.853274963344

Predicted rating for user 3 -> item 4: 3


**Item-based Recommendation Systems**

In [14]:
#This function finds k similar items given the item_id and ratings matrix M

def findksimilaritems(item_id, ratings, metric=metric, k=k):
    similarities=[]
    indices=[]    
    ratings=ratings.T
    model_knn = NearestNeighbors(metric = metric, algorithm = 'brute')
    model_knn.fit(ratings)

    distances, indices = model_knn.kneighbors(ratings.iloc[item_id-1, :].values.reshape(1, -1), n_neighbors = k+1)
    similarities = 1-distances.flatten()
    print '{0} most similar items for item {1}:\n'.format(k,item_id)
    for i in range(0, len(indices.flatten())):
        if indices.flatten()[i]+1 == item_id:
            continue;

        else:
            print '{0}: Item {1} :, with similarity of {2}'.format(i,indices.flatten()[i]+1, similarities.flatten()[i])


    return similarities,indices

In [15]:
similarities,indices=findksimilaritems(3,M)

4 most similar items for item 3:

1: Item 5 :, with similarity of 0.918336125535
2: Item 6 :, with similarity of 0.874759773038
3: Item 1 :, with similarity of 0.810364746222
4: Item 4 :, with similarity of 0.796917800302


In [16]:
#This function predicts the rating for specified user-item combination based on item-based approach
def predict_itembased(user_id, item_id, ratings, metric = metric, k=k):
    prediction= wtd_sum =0
    similarities, indices=findksimilaritems(item_id, ratings) #similar users based on correlation coefficients
    sum_wt = np.sum(similarities)-1
    product=1
    
    for i in range(0, len(indices.flatten())):
        if indices.flatten()[i]+1 == item_id:
            continue;
        else:
            product = ratings.iloc[user_id-1,indices.flatten()[i]] * (similarities[i])
            wtd_sum = wtd_sum + product                              
    prediction = int(round(wtd_sum/sum_wt))
    print '\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id,item_id,prediction)      

    return prediction

In [17]:
prediction = predict_itembased(1,3,M)

4 most similar items for item 3:

1: Item 5 :, with similarity of 0.918336125535
2: Item 6 :, with similarity of 0.874759773038
3: Item 1 :, with similarity of 0.810364746222
4: Item 4 :, with similarity of 0.796917800302

Predicted rating for user 1 -> item 3: 7


In [18]:
#This function is used to compute adjusted cosine similarity matrix for items
def computeAdjCosSim(M):
    sim_matrix = np.zeros((M.shape[1], M.shape[1]))
    M_u = M.mean(axis=1) #means
          
    for i in range(M.shape[1]):
        for j in range(M.shape[1]):
            if i == j:
                
                sim_matrix[i][j] = 1
            else:                
                if i<j:
                    
                    sum_num = sum_den1 = sum_den2 = 0
                    for k,row in M.loc[:,[i,j]].iterrows(): 

                        if ((M.loc[k,i] != 0) & (M.loc[k,j] != 0)):
                            num = (M[i][k]-M_u[k])*(M[j][k]-M_u[k])
                            den1= (M[i][k]-M_u[k])**2
                            den2= (M[j][k]-M_u[k])**2
                            
                            sum_num = sum_num + num
                            sum_den1 = sum_den1 + den1
                            sum_den2 = sum_den2 + den2
                        
                        else:
                            continue                          
                                       
                    den=(sum_den1**0.5)*(sum_den2**0.5)
                    if den!=0:
                        sim_matrix[i][j] = sum_num/den
                    else:
                        sim_matrix[i][j] = 0


                else:
                    sim_matrix[i][j] = sim_matrix[j][i]           
            
    return pd.DataFrame(sim_matrix)

In [19]:
adjcos_sim = computeAdjCosSim(M)

In [20]:
adjcos_sim

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.236908,0.421263,-0.519085,-0.125892,0.01009
1,0.236908,1.0,-0.805243,0.085741,0.237273,0.520625
2,0.421263,-0.805243,1.0,-0.767941,-0.230521,-0.05364
3,-0.519085,0.085741,-0.767941,1.0,-0.299059,-0.64455
4,-0.125892,0.237273,-0.230521,-0.299059,1.0,0.599158
5,0.01009,0.520625,-0.05364,-0.64455,0.599158,1.0


In [26]:
#This function finds k similar items given the item_id and ratings matrix M

def findksimilaritems_adjcos(item_id, ratings, k=k):
    
    sim_matrix = computeAdjCosSim(ratings)
    similarities = sim_matrix[item_id-1].sort_values(ascending=False)[:k+1].values
    indices = sim_matrix[item_id-1].sort_values(ascending=False)[:k+1].index
    
    print '{0} most similar items for item {1}:\n'.format(k,item_id)
    for i in range(0, len(indices)):
            if indices[i]+1 == item_id:
                continue;

            else:
                print '{0}: Item {1} :, with similarity of {2}'.format(i,indices[i]+1, similarities[i])
        
    return similarities ,indices    

In [27]:
similarities, indices = findksimilaritems_adjcos(3,M)

4 most similar items for item 3:

1: Item 1 :, with similarity of 0.421262731871
2: Item 6 :, with similarity of -0.0536398904889
3: Item 5 :, with similarity of -0.230521358269
4: Item 4 :, with similarity of -0.767941046575


In [28]:
#This function predicts the rating for specified user-item combination for adjusted cosine item-based approach
#As the adjusted cosine similarities range from -1,+1, sometimes the predicted rating can be negative or greater than max value
#Hack to deal with this: Rating is set to min if prediction is negative, Rating is set to max if prediction is above max
def predict_itembased_adjcos(user_id, item_id, ratings):
    prediction=0

    similarities, indices=findksimilaritems_adjcos(item_id, ratings) #similar users based on correlation coefficients
    sum_wt = np.sum(similarities)-1

    product=1
    wtd_sum = 0 
    for i in range(0, len(indices)):
        if indices[i]+1 == item_id:
            continue;
        else:
            product = ratings.iloc[user_id-1,indices[i]] * (similarities[i])
            wtd_sum = wtd_sum + product                              
    prediction = int(round(wtd_sum/sum_wt))
    if prediction < 0:
        prediction = 1
    elif prediction >10:
        prediction = 10
    print '\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id,item_id,prediction)      
        
    return prediction

In [29]:
prediction=predict_itembased_adjcos(3,4,M)

4 most similar items for item 4:

1: Item 2 :, with similarity of 0.0857414341149
2: Item 5 :, with similarity of -0.29905882779
3: Item 1 :, with similarity of -0.519085268895
4: Item 6 :, with similarity of -0.644550286954

Predicted rating for user 3 -> item 4: 6


In [30]:
adjcos_sim

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.236908,0.421263,-0.519085,-0.125892,0.01009
1,0.236908,1.0,-0.805243,0.085741,0.237273,0.520625
2,0.421263,-0.805243,1.0,-0.767941,-0.230521,-0.05364
3,-0.519085,0.085741,-0.767941,1.0,-0.299059,-0.64455
4,-0.125892,0.237273,-0.230521,-0.299059,1.0,0.599158
5,0.01009,0.520625,-0.05364,-0.64455,0.599158,1.0


In [31]:
#This function utilizes above function to recommend items for selected approach. Recommendations are made if the predicted
#rating for an item is greater than or equal to 6, and the items has not been rated already
def recommendItem(user_id, item_id, ratings):
    
    if user_id<1 or user_id>6 or type(user_id) is not int:
        print 'Userid does not exist. Enter numbers from 1-6'
    else:    
        ids = ['User-based CF (cosine)','User-based CF (correlation)','Item-based CF (cosine)',
               'Item-based CF (adjusted cosine)']

        approach = widgets.Dropdown(options=ids, value=ids[0],
                               description='Select Approach', width='500px')
        
        def on_change(change):
            prediction = 0
            clear_output(wait=True)
            if change['type'] == 'change' and change['name'] == 'value':            
                if (approach.value == 'User-based CF (cosine)'):
                    metric = 'cosine'
                    prediction = predict_userbased(user_id, item_id, ratings, metric)
                elif (approach.value == 'User-based CF (correlation)')  :                       
                    metric = 'correlation'               
                    prediction = predict_userbased(user_id, item_id, ratings, metric)
                elif (approach.value == 'Item-based CF (cosine)'):
                    prediction = predict_itembased(user_id, item_id, ratings)
                else:
                    prediction = predict_itembased_adjcos(user_id,item_id,ratings)

                if ratings[item_id-1][user_id-1] != 0: 
                    print 'Item already rated'
                else:
                    if prediction>=6:
                        print '\nItem recommended'
                    else:
                        print 'Item not recommended'

        approach.observe(on_change)
        display(approach)

In [32]:
#check for incorrect entries
recommendItem(-1,3,M)

Userid does not exist. Enter numbers from 1-6


In [33]:
recommendItem(3,4,M)

4 most similar users for User 3:

1: User 4, with similarity of 0.90951268934
2: User 2, with similarity of 0.874744414849
3: User 5, with similarity of 0.86545387815
4: User 6, with similarity of 0.853274963344

Predicted rating for user 3 -> item 4: 3
Item not recommended


In [34]:
recommendItem(3,4,M)

4 most similar users for User 3:

1: User 2, with similarity of 0.453897185842
2: User 4, with similarity of 0.451378005098
3: User 6, with similarity of 0.297373304825
4: User 5, with similarity of -0.04288778794

Predicted rating for user 3 -> item 4: 3
Item not recommended


In [35]:
recommendItem(3,4,M)

4 most similar items for item 4:

1: Item 6 :, with similarity of 0.89977997614
2: Item 2 :, with similarity of 0.887160079571
3: Item 5 :, with similarity of 0.88180009273
4: Item 3 :, with similarity of 0.796917800302

Predicted rating for user 3 -> item 4: 6

Item recommended


In [36]:
recommendItem(3,4,M)

4 most similar items for item 4:

1: Item 2 :, with similarity of 0.0857414341149
2: Item 5 :, with similarity of -0.29905882779
3: Item 1 :, with similarity of -0.519085268895
4: Item 6 :, with similarity of -0.644550286954

Predicted rating for user 3 -> item 4: 6

Item recommended


In [37]:
#if the item is already rated, it is not recommended
recommendItem(2,1,M)

4 most similar users for User 2:

1: User 4, with similarity of 0.515910067398
2: User 3, with similarity of 0.453897185842
3: User 6, with similarity of 0.218327934565
4: User 5, with similarity of 0.11245608042

Predicted rating for user 2 -> item 1: 5
Item already rated


In [38]:
#This is a quick way to temporarily suppress stdout in particular code section
@contextmanager
def suppress_stdout():
    with open(os.devnull, "w") as devnull:
        old_stdout = sys.stdout
        sys.stdout = devnull
        try:  
            yield
        finally:
            sys.stdout = old_stdout

In [39]:
#This is final function to evaluate the performance of selected recommendation approach and the metric used here is RMSE
#suppress_stdout function is used to suppress the print outputs of all the functions inside this function. It will only print 
#RMSE values
def evaluateRS(ratings):
    ids = ['User-based CF (cosine)','User-based CF (correlation)','Item-based CF (cosine)','Item-based CF (adjusted cosine)']
    approach = widgets.Dropdown(options=ids, value=ids[0],description='Select Approach', width='500px')
    n_users = ratings.shape[0]
    n_items = ratings.shape[1]
    prediction = np.zeros((n_users, n_items))
    prediction= pd.DataFrame(prediction)
    def on_change(change):
        clear_output(wait=True)
        with suppress_stdout():
            if change['type'] == 'change' and change['name'] == 'value':            
                if (approach.value == 'User-based CF (cosine)'):
                    metric = 'cosine'
                    for i in range(n_users):
                        for j in range(n_items):
                            prediction[i][j] = predict_userbased(i+1, j+1, ratings, metric)
                elif (approach.value == 'User-based CF (correlation)')  :                       
                    metric = 'correlation'               
                    for i in range(n_users):
                        for j in range(n_items):
                            prediction[i][j] = predict_userbased(i+1, j+1, ratings, metric)
                elif (approach.value == 'Item-based CF (cosine)'):
                    for i in range(n_users):
                        for j in range(n_items):
                            prediction[i][j] = predict_userbased(i+1, j+1, ratings)
                else:
                    for i in range(n_users):
                        for j in range(n_items):
                            prediction[i][j] = predict_userbased(i+1, j+1, ratings)
              
        MSE = mean_squared_error(prediction, ratings)
        RMSE = round(sqrt(MSE),3)
        print "RMSE using {0} approach is: {1}".format(approach.value,RMSE)
              
    approach.observe(on_change)
    display(approach)

In [40]:
evaluateRS(M)

RMSE using Item-based CF (cosine) approach is: 2.804


In [41]:
evaluateRS(M)

RMSE using Item-based CF (cosine) approach is: 2.804


**Thanks for reading this notebook**