In [1]:
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv('Data/movie_rating.csv')
df.head(5)

Unnamed: 0,userID,itemID,rating
0,1,1,2.5
1,1,2,3.5
2,1,3,3.0
3,1,4,3.5
4,1,5,2.5


In [5]:
n_users = df.userID.unique()
n_users.shape

(7,)

In [6]:
n_users = df.userID.unique().shape[0]
n_items = df.itemID.unique().shape[0]
print ('\nNumber of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items))


Number of users = 7 | Number of movies = 6


In [7]:
# Create user-item matrices
df_matrix = np.zeros((n_users, n_items))
df_matrix

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [8]:
for line in df.itertuples():
    df_matrix[line[1]-1, line[2]-1] = line[3]  

In [9]:
df_matrix

array([[2.5, 3.5, 3. , 3.5, 2.5, 3. ],
       [3. , 3.5, 1.5, 5. , 3.5, 3. ],
       [2.5, 3. , 0. , 3.5, 0. , 4. ],
       [0. , 3.5, 3. , 4. , 2.5, 4.5],
       [3. , 4. , 2. , 3. , 2. , 3. ],
       [3. , 4. , 0. , 5. , 3.5, 3. ],
       [4.5, 0. , 0. , 4. , 1. , 0. ]])

In [10]:
from sklearn.metrics.pairwise import pairwise_distances

In [11]:
user_similarity = pairwise_distances(df_matrix, metric='euclidean')
item_similarity = pairwise_distances(df_matrix.T, metric='euclidean')

In [12]:
user_similarity

array([[0.        , 2.39791576, 4.0620192 , 2.95803989, 1.41421356,
        3.57071421, 6.06217783],
       [2.39791576, 0.        , 4.27200187, 3.93700394, 2.59807621,
        1.58113883, 5.74456265],
       [4.0620192 , 4.27200187, 0.        , 4.71699057, 3.24037035,
        4.09267639, 5.5       ],
       [2.95803989, 3.93700394, 4.71699057, 0.        , 3.70809924,
        4.74341649, 8.        ],
       [1.41421356, 2.59807621, 3.24037035, 3.70809924, 0.        ,
        3.20156212, 5.7662813 ],
       [3.57071421, 1.58113883, 4.09267639, 4.74341649, 3.20156212,
        0.        , 5.87367006],
       [6.06217783, 5.74456265, 5.5       , 8.        , 5.7662813 ,
        5.87367006, 0.        ]])

In [13]:
# Top 3 similar users for user id 7
print ("Similar users for user id 7: \n", pd.DataFrame(user_similarity).loc[6,pd.DataFrame(user_similarity).loc[6,:] > 0].sort_values(ascending=False)[0:3])

Similar users for user id 7: 
 3    8.000000
0    6.062178
5    5.873670
Name: 6, dtype: float64


In [14]:
# Top 3 similar items for item id 6
print ("Similar items for item id 6: \n", pd.DataFrame(item_similarity).loc[5,pd.DataFrame(item_similarity).loc[5,:] > 0].sort_values(ascending=False)[0:3])

Similar items for item id 6: 
 0    6.557439
2    5.522681
3    4.974937
Name: 5, dtype: float64


In [15]:
# Function for item based rating prediction
def item_based_prediction(rating_matrix, similarity_matrix):
    return rating_matrix.dot(similarity_matrix) / np.array([np.abs(similarity_matrix).sum(axis=1)])     

item_based_prediction = item_based_prediction(df_matrix, item_similarity)
item_based_prediction

array([[3.10089937, 2.87828918, 3.04689383, 2.89342242, 3.09284254,
        2.91165823],
       [3.18216991, 3.10864889, 3.69949591, 2.74154275, 3.28162252,
        3.21345303],
       [2.09069955, 1.75272353, 2.78929231, 1.63508431, 2.67424164,
        1.67751256],
       [3.51787934, 2.45418768, 2.88651247, 2.7049201 , 2.96937945,
        2.33093136],
       [2.79646314, 2.56351224, 3.0513264 , 2.67727657, 2.99229519,
        2.6587726 ],
       [2.93354359, 2.72271192, 3.79381417, 2.38652591, 3.09936037,
        2.91180187],
       [0.86159244, 2.18940726, 2.22645543, 1.0060282 , 1.90409916,
        2.26249703]])

In [16]:
# Function for user based rating prediction
def user_based_prediction(rating_matrix, similarity_matrix):
    mean_user_rating = rating_matrix.mean(axis=1)
    ratings_diff = (rating_matrix - mean_user_rating[:, np.newaxis]) 
    return mean_user_rating[:, np.newaxis] + similarity_matrix.dot(ratings_diff) / np.array([np.abs(similarity_matrix).sum(axis=1)]).T

user_based_prediction = user_based_prediction(df_matrix, user_similarity)
user_based_prediction

array([[3.47624566, 3.05055639, 1.31236664, 4.68808306, 2.38133332,
        3.09141493],
       [3.53242292, 3.36886851, 2.0292077 , 4.63847048, 2.42427894,
        3.50675144],
       [2.14285946, 2.34725298, 0.96474355, 3.5688081 , 1.8911387 ,
        2.08519721],
       [3.72430235, 3.00224662, 1.22435809, 4.47388218, 2.32888497,
        2.74632579],
       [3.10729794, 2.83457839, 1.31492   , 4.52250588, 2.29876945,
        2.92192835],
       [3.21400701, 3.20445179, 2.07709214, 4.37874238, 2.28863638,
        3.3370703 ],
       [0.89326735, 2.28241501, 0.38650895, 2.70124095, 1.06358937,
        2.17297838]])

In [17]:
# Calculate the RMSE
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, actual):
    prediction = prediction[actual.nonzero()].flatten() 
    actual = actual[actual.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, actual))

print ('User-based CF RMSE: ' + str(rmse(user_based_prediction, df_matrix)))
print ('Item-based CF RMSE: ' + str(rmse(item_based_prediction, df_matrix))) 

User-based CF RMSE: 1.0705767848964718
Item-based CF RMSE: 1.3739228897064588


In [18]:
y_user_based = pd.DataFrame(user_based_prediction)

# Predictions for movies that the user 6 hasn't rated yet
predictions = y_user_based.loc[6,pd.DataFrame(df_matrix).loc[6,:] == 0] 
top = predictions.sort_values(ascending=False).head(n=1)
recommendations = pd.DataFrame(data=top)
recommendations.columns = ['Predicted Rating']
print (recommendations)

   Predicted Rating
1          2.282415


In [20]:
y_item_based = pd.DataFrame(item_based_prediction)

# Predictions for movies that the user 6 hasn't rated yet
predictions = y_item_based.loc[6,pd.DataFrame(df_matrix).loc[6,:] == 0] 
top = predictions.sort_values(ascending=False).head(n=1)
recommendations = pd.DataFrame(data=top)
recommendations.columns = ['Predicted Rating']
print (recommendations)

   Predicted Rating
5          2.262497


**Model based collaborative filtering (user based similarity)**

In [22]:
# calculate sparsity level
sparsity=round(1.0-len(df)/float(n_users*n_items),3)
print ('The sparsity level of is ' +  str(sparsity*100) + '%')


The sparsity level of is 0.0%


In [23]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds

# Get SVD components from train matrix. Choose k.
u, s, vt = svds(df_matrix, k = 5)
s_diag_matrix=np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)
print ('User-based CF MSE: ' + str(rmse(X_pred, df_matrix)))

User-based CF MSE: 0.01574289899495078
