In [38]:
import numpy as np
import pandas as pd
import os

In [39]:
path = os.path.expanduser('~/Desktop/ml-100k/u.data')

In [40]:
path

'/Users/dominikpeter/Desktop/ml-100k/u.data'

In [41]:
header = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv(path, sep='\t', names=header)

In [47]:
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [49]:
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items))

Number of users = 943 | Number of movies = 1682


In [50]:
from sklearn.model_selection import train_test_split
train_data, test_data = cv.train_test_split(df, test_size=0.25)

In [85]:
#Create two user-item matrices, one for training and another for testing
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]  

test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]

In [87]:
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

In [88]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis]) 
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])     
    return pred

In [139]:
item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')

array([[ 0.34889958,  0.35454619,  0.36968891, ...,  0.41890835,
         0.41048672,  0.40751981],
       [ 0.09593339,  0.10950834,  0.1039772 , ...,  0.11065815,
         0.11081342,  0.11209646],
       [ 0.062099  ,  0.06531899,  0.0633507 , ...,  0.06207841,
         0.06466105,  0.06532092],
       ..., 
       [ 0.03035856,  0.03813111,  0.0366507 , ...,  0.04238172,
         0.04137622,  0.04179156],
       [ 0.12548781,  0.13489842,  0.13946134, ...,  0.14542697,
         0.14428684,  0.1459508 ],
       [ 0.20265871,  0.20079401,  0.21893161, ...,  0.25354706,
         0.24439529,  0.2450672 ]])

In [90]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [91]:
print('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))

User-based CF RMSE: 3.115963906045146
Item-based CF RMSE: 3.44461119320136


In [92]:
sparsity=round(1.0-len(df)/float(n_users*n_items),3)
print('The sparsity level of MovieLens100K is ' +  str(sparsity*100) + '%')

The sparsity level of MovieLens100K is 93.7%


In [93]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds

#get SVD components from train matrix. Choose k.
u, s, vt = svds(train_data_matrix, k = 20)
s_diag_matrix=np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)
print('User-based CF MSE: ' + str(rmse(X_pred, test_data_matrix)))

User-based CF MSE: 2.7049047537296795


In [99]:
from scipy.sparse import coo_matrix

In [100]:
from scipy.sparse import coo_matrix

In [114]:
j = coo_matrix((3, 4), dtype=np.int8)
i = coo_matrix((3, 4), dtype=np.int8)

In [116]:
j.transpose().dot(i).toarray()

array([[0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0]], dtype=int8)

In [117]:
row  = np.array([0, 3, 1, 0])
col  = np.array([0, 3, 1, 2])
data = np.array([4, 5, 7, 9])
coo_matrix((data, (row, col)), shape=(4, 4)).toarray()

array([[4, 0, 9, 0],
       [0, 7, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 5]])

In [130]:
col = train_data.user_id.as_matrix()
row = train_data.item_id.as_matrix()
data = train_data.rating.as_matrix()

In [141]:
x = coo_matrix((data, (row, col)))

In [138]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds

#get SVD components from train matrix. Choose k.
u, s, vt = svds(x, k = 20)
s_diag_matrix=np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)
print('User-based CF MSE: ' + str(rmse(X_pred, test_data_matrix)))

ValueError: matrix type must be 'f', 'd', 'F', or 'D'

In [142]:
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(x, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

In [146]:
user_similarity

array([[ 0.        ,  1.        ,  1.        , ...,  1.        ,
         1.        ,  1.        ],
       [ 1.        ,  0.        ,  0.67258077, ...,  1.        ,
         1.        ,  1.        ],
       [ 1.        ,  0.67258077,  0.        , ...,  1.        ,
         0.90950545,  0.90950545],
       ..., 
       [ 1.        ,  1.        ,  1.        , ...,  0.        ,
         1.        ,  1.        ],
       [ 1.        ,  1.        ,  0.90950545, ...,  1.        ,
         0.        ,  1.        ],
       [ 1.        ,  1.        ,  0.90950545, ...,  1.        ,
         1.        ,  0.        ]])