In [1]:
import pandas as pd
import numpy as np
import math
import warnings
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("./ml-latest-small/ratings.csv")
# df = pd.read_csv("/home/tianyou/ml-latest/ratings.csv")
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
n_users = df.userId.unique().shape[0]
n_items = df.movieId.unique().shape[0]
n_users, n_items

(610, 9724)

In [4]:
movie_movieId = df.movieId.unique().tolist()
movie_movieId.sort()
d = dict()
for i in range(0, len(movie_movieId)):
    d[movie_movieId[i]] = i

In [5]:
def user_item_matrix(data):
    ratings = np.zeros((n_users, n_items))
    for row in data.itertuples():
        ratings[row[1]-1, d[row[2]]] = row[3]
    return ratings

In [6]:
train_set, test_set = train_test_split(df)

In [7]:
ratings = user_item_matrix(df)
train = user_item_matrix(train_set)
test = user_item_matrix(test_set)
train

array([[4. , 0. , 4. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [2.5, 0. , 2. , ..., 0. , 0. , 0. ],
       [3. , 0. , 0. , ..., 0. , 0. , 0. ],
       [5. , 0. , 0. , ..., 0. , 0. , 0. ]])

In [8]:
sparsity = float(len(ratings.nonzero()[0]))
sparsity /= (ratings.shape[0] * ratings.shape[1])
sparsity *= 100
print ('Sparsity: {:4.2f}%'.format(sparsity))

Sparsity: 1.70%


In [23]:
def normalize(data, kind='user'):
    data2 = data.copy()
    data2[data2 == 0] = np.nan
    # print(np.nanmean(data[0]))
    if kind == 'user':
        for i in range(0, data2.shape[0]):
            data2[i] = data2[i] - np.nanmean(data2[i])
            # data[i][data[i] == 0] = np.nanmean(data[i])
            
    if kind == 'item':
        for j in range(0, data.shape[1]):
            data2.T[j] = data2.T[j] - np.nanmean(data2.T[j])

    data2[np.isnan(data2)] = 0
    return (data2)

In [24]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=RuntimeWarning)
    user_norm_matrix = normalize(train, kind = 'user')
    item_norm_matrix = normalize(train, kind = 'item')

In [25]:
def similarity(ratings, kind='user', epsilon=1e-12):
    # epsilon -> small number for handling dived-by-zero errors
    if kind == 'user':
        sim = ratings.dot(ratings.T) + epsilon
    elif kind == 'item':
        sim = ratings.T.dot(ratings) + epsilon
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return (sim / norms / norms.T)

In [26]:
print(user_norm_matrix)

[[-0.36813187  0.         -0.36813187 ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 ...
 [-0.62403101  0.         -1.12403101 ...  0.          0.
   0.        ]
 [-0.18518519  0.          0.         ...  0.          0.
   0.        ]
 [ 1.29735234  0.          0.         ...  0.          0.
   0.        ]]


In [27]:
user_norm_sim_matrix = similarity(user_norm_matrix, kind = 'user')
item_norm_sim_matrix = similarity(item_norm_matrix, kind = 'item')
user_sim_matrix = similarity(train, kind = 'user')
item_sim_matrix = similarity(train, kind = 'item')
print(user_sim_matrix)
print(user_norm_sim_matrix)

[[1.00000000e+00 9.73523635e-16 2.23961408e-02 ... 2.23492489e-01
  7.10209543e-02 9.40615343e-02]
 [9.73523635e-16 1.00000000e+00 3.48366243e-15 ... 4.13504075e-02
  3.50085037e-15 8.01547369e-02]
 [2.23961408e-02 3.48366243e-15 1.00000000e+00 ... 2.75345958e-02
  3.57946170e-15 1.42856652e-02]
 ...
 [2.23492489e-01 4.13504075e-02 2.75345958e-02 ... 1.00000000e+00
  9.67573083e-02 2.48864897e-01]
 [7.10209543e-02 3.50085037e-15 3.57946170e-15 ... 9.67573083e-02
  1.00000000e+00 5.23873500e-02]
 [9.40615343e-02 8.01547369e-02 1.42856652e-02 ... 2.48864897e-01
  5.23873500e-02 1.00000000e+00]]
[[ 1.00000000e+00  2.58318970e-14 -1.35467460e-02 ...  5.29321172e-02
   2.69815219e-02 -3.22884166e-03]
 [ 2.58318970e-14  1.00000000e+00  2.61970796e-14 ... -8.42807066e-03
   1.41552484e-13  2.40638330e-02]
 [-1.35467460e-02  2.61970796e-14  1.00000000e+00 ... -1.80442948e-02
   4.54262058e-14  1.43363029e-02]
 ...
 [ 5.29321172e-02 -8.42807066e-03 -1.80442948e-02 ...  1.00000000e+00
   3.67086

In [57]:
def predict_fast_simple(ratings, similarity, kind='user'):
    if kind == 'user':
        result = similarity.dot(ratings) / np.array([similarity.sum(axis=1)]).T
        # result[result < 0.5] = 1
        return result
    elif kind == 'item':
        result = ratings.dot(similarity) / np.array([similarity.sum(axis=1)])
        # result[result < 0.5] = 1
        return result

In [58]:
print(ratings)

[[4.  0.  4.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 ...
 [2.5 2.  2.  ... 0.  0.  0. ]
 [3.  0.  0.  ... 0.  0.  0. ]
 [5.  0.  0.  ... 0.  0.  0. ]]


In [60]:
user_norm_prediction = predict_fast_simple(train, user_norm_sim_matrix, kind = 'user')
item_norm_prediction = predict_fast_simple(train, item_norm_sim_matrix, kind = 'item')
user_prediction = predict_fast_simple(train, user_sim_matrix, kind = 'user')
item_prediction = predict_fast_simple(train, item_sim_matrix, kind = 'item')
print(user_prediction)
print(item_prediction)

[[1.53840037e+00 6.62612943e-01 3.77643133e-01 ... 4.50186946e-04
  4.50186946e-04 2.40600715e-03]
 [9.00244626e-01 5.14582788e-01 9.81201127e-02 ... 1.18769717e-02
  1.18769717e-02 1.80442682e-02]
 [1.26972342e+00 6.36552166e-01 3.13999565e-01 ... 1.08854053e-15
  1.08854053e-15 1.17021973e-15]
 ...
 [1.41364434e+00 6.81094650e-01 3.00603794e-01 ... 8.92471144e-04
  8.92471144e-04 8.37865531e-03]
 [1.43783364e+00 6.73445540e-01 2.62948950e-01 ... 1.02058024e-16
  1.02058024e-16 1.09716000e-16]
 [1.37306437e+00 6.63001487e-01 2.15049631e-01 ... 6.12491971e-03
  6.12491971e-03 1.56887269e-02]]
[[2.60599539e-01 2.30864163e-01 3.04623207e-01 ... 8.59549420e-03
  8.59549420e-03 9.79420769e-02]
 [1.71443495e-02 2.11305580e-02 8.97077526e-03 ... 3.21273405e-02
  3.21273405e-02 9.66876008e-02]
 [8.01283804e-03 8.56790279e-03 8.46249866e-03 ... 4.80597390e-14
  4.80597390e-14 1.16092434e-13]
 ...
 [5.77161621e-01 5.62176524e-01 5.77366624e-01 ... 4.93990365e-02
  4.93990365e-02 1.10089403e+00]

In [61]:
from sklearn.metrics import mean_squared_error

def get_mse(pred, actual):
    # Ignore nonzero terms.
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(actual, pred)

In [62]:
print ('User_norm-based CF MSE: ' + str(get_mse(user_norm_prediction, test)))
print ('Item_norm-based CF MSE: ' + str(get_mse(item_norm_prediction, test)))
print ('User-based CF MSE: ' + str(get_mse(user_prediction, test)))
print ('Item-based CF MSE: ' + str(get_mse(item_prediction, test)))

User_norm-based CF MSE: 1492.5744449833173
Item_norm-based CF MSE: 86.80892470693588
User-based CF MSE: 10.601387449707303
Item-based CF MSE: 11.498314706888587


In [32]:
def predict_topk(ratings, similarity, kind='user', k=40):
    pred = np.zeros(ratings.shape)
    if kind == 'user':
        for i in range(ratings.shape[0]):
            top_k_users = [np.argsort(similarity[:,i])[:-k-1:-1]]
            for j in range(ratings.shape[1]):
                pred[i, j] = similarity[i, :][top_k_users].dot(ratings[:, j][top_k_users]) 
                pred[i, j] /= np.sum(np.abs(similarity[i, :][top_k_users]))
    if kind == 'item':
        for j in range(ratings.shape[1]):
            top_k_items = [np.argsort(similarity[:,j])[:-k-1:-1]]
            for i in range(ratings.shape[0]):
                pred[i, j] = similarity[j, :][top_k_items].dot(ratings[i, :][top_k_items].T) 
                pred[i, j] /= np.sum(np.abs(similarity[j, :][top_k_items]))        
    
    return pred

In [33]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=FutureWarning)
    pred = predict_topk(train, user_sim_matrix, kind='user', k=15)
    print ('Top-k User-based CF MSE: ' + str(get_mse(pred, test)))

    pred = predict_topk(train, item_sim_matrix, kind='item', k=15)
    print ('Top-k Item-based CF MSE: ' + str(get_mse(pred, test)))

Top-k User-based CF MSE: 8.853793645641968
Top-k Item-based CF MSE: 9.598315521069186


In [34]:
k_array = [5, 15, 30, 50, 100, 200]
user_train_mse = []
user_test_mse = []
item_test_mse = []
item_train_mse = []

for k in k_array:
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=FutureWarning)
        user_pred = predict_topk(train, user_sim_matrix, kind='user', k=k)
        item_pred = predict_topk(train, item_sim_matrix, kind='item', k=k)

    user_train_mse += [get_mse(user_pred, train)]
    user_test_mse += [get_mse(user_pred, test)]
    
    item_train_mse += [get_mse(item_pred, train)]
    item_test_mse += [get_mse(item_pred, test)]  

KeyboardInterrupt: 

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

pal = sns.color_palette("Set2", 2)

plt.figure(figsize=(8, 8))
plt.plot(k_array, user_train_mse, c=pal[0], label='User-based train', alpha=0.5, linewidth=5)
plt.plot(k_array, user_test_mse, c=pal[0], label='User-based test', linewidth=5)
plt.plot(k_array, item_train_mse, c=pal[1], label='Item-based train', alpha=0.5, linewidth=5)
plt.plot(k_array, item_test_mse, c=pal[1], label='Item-based test', linewidth=5)
plt.legend(loc='best', fontsize=20)
plt.xticks(fontsize=16);
plt.yticks(fontsize=16);
plt.xlabel('k', fontsize=30);
plt.ylabel('MSE', fontsize=30);