In [1]:
import pandas as pd
import numpy as np
import math
import warnings
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("./ml-latest-small/ratings.csv")
df.head()
# df = pd.read_csv("/home/tianyou/ml-latest/ratings.csv")

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
n_users = df.userId.unique().shape[0]
n_items = df.movieId.unique().shape[0]
n_users, n_items

(610, 9724)

In [4]:
movie_movieId = df.movieId.unique().tolist()
movie_movieId.sort()
d = dict()
for i in range(0, len(movie_movieId)):
    d[movie_movieId[i]] = i

In [5]:
ratings = np.zeros((n_users, n_items))
for row in df.itertuples():
    ratings[row[1]-1, d[row[2]]] = row[3]
ratings

array([[4. , 0. , 4. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [2.5, 2. , 2. , ..., 0. , 0. , 0. ],
       [3. , 0. , 0. , ..., 0. , 0. , 0. ],
       [5. , 0. , 0. , ..., 0. , 0. , 0. ]])

In [9]:
sparsity = float(len(ratings.nonzero()[0]))
sparsity /= (ratings.shape[0] * ratings.shape[1])
sparsity *= 100
print ('Sparsity: {:4.2f}%'.format(sparsity))

Sparsity: 1.70%


In [6]:
def user_item_matrix(data):
    ratings = np.zeros((n_users, n_items))
    for row in data.itertuples():
        ratings[row[1]-1, d[row[2]]] = row[3]
    return ratings

In [7]:
train_set, test_set = train_test_split(df)

In [8]:
train = user_item_matrix(train_set)
test = user_item_matrix(test_set)
train

array([[0. , 0. , 4. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [2.5, 2. , 2. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [5. , 0. , 0. , ..., 0. , 0. , 0. ]])

In [26]:
def normalize(data, kind='user'):
    data[data == 0] = np.nan
    # print(np.nanmean(data[0]))
    if kind == 'user':
        for i in range(0, data.shape[0]):
            # print(np.nanmean(data[i]))
            data[i] = data[i] - np.nanmean(data[i])
            
    if kind == 'item':
        for j in range(0, data.shape[1]):
            data.T[j] = data.T[j] - np.nanmean(data.T[j])

    data[np.isnan(data)] = 0
    return (data)

In [27]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=RuntimeWarning)
    train_norm_matrix = normalize(train, kind = 'user')

In [20]:
def similarity(ratings, kind='user', epsilon=1e-9):
    # epsilon -> small number for handling dived-by-zero errors
    if kind == 'user':
        sim = ratings.dot(ratings.T) + epsilon
        print(sim)
    elif kind == 'item':
        sim = ratings.T.dot(ratings) + epsilon
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return (sim / norms / norms.T)

In [21]:
user_sim_matrix = similarity(train_norm_matrix, kind = 'user')
# item_sim_matrix = similarity(train, kind = 'item')

[[ 1.20000000e+02  4.54545465e-02  2.05376344e+00 ...  2.51596887e+01
  -4.82758620e-01  1.05846154e+00]
 [ 4.54545465e-02  1.06477273e+01  1.00000000e-09 ...  4.58168644e-01
  -6.42633228e-01  1.03972028e+00]
 [ 2.05376344e+00  1.00000000e-09  1.33338710e+02 ...  1.82587919e-01
   1.00000000e-09 -5.99131513e-01]
 ...
 [ 2.51596887e+01  4.58168644e-01  1.82587919e-01 ...  6.84471820e+02
   6.08556834e-01  2.21545312e+01]
 [-4.82758620e-01 -6.42633228e-01  1.00000000e-09 ...  6.08556834e-01
   6.20689655e+00  5.04615386e-01]
 [ 1.05846154e+00  1.03972028e+00 -5.99131513e-01 ...  2.21545312e+01
   5.04615386e-01  6.86509231e+02]]


In [None]:
def find_top_k_neighbor(sim_matrix):
    user_sim_list = sim_matrix[0].tolist()
    print(user_sim_list)