In [1]:
import pandas as pd
import numpy as np
import math
import warnings
from sklearn.model_selection import train_test_split

In [2]:
# Load the "ml-latest-small" ratings table (presumably the small MovieLens
# dataset); the path is relative to the notebook's working directory.
# To run on the larger "ml-latest" dataset, read its ratings.csv here instead.
df = pd.read_csv("./ml-latest-small/ratings.csv")
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
# Number of distinct users and movies; these size the rating matrices built below.
n_users = len(df["userId"].unique())
n_items = len(df["movieId"].unique())
(n_users, n_items)

(610, 9724)

In [4]:
# movieId values are not contiguous, so map each one to a dense 0-based column
# index, assigned in ascending movieId order.
movie_movieId = sorted(df.movieId.unique().tolist())
d = {movie_id: column for column, movie_id in enumerate(movie_movieId)}

In [5]:
# Dense user x item matrix over the whole dataset; cells left at 0 have no rating.
# Rows are userId - 1, columns come from the movieId -> index map `d`.
ratings = np.zeros((n_users, n_items))
for user_id, movie_id, rating in zip(df.userId, df.movieId, df.rating):
    ratings[user_id - 1, d[movie_id]] = rating
ratings

array([[4. , 0. , 4. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [2.5, 2. , 2. , ..., 0. , 0. , 0. ],
       [3. , 0. , 0. , ..., 0. , 0. , 0. ],
       [5. , 0. , 0. , ..., 0. , 0. , 0. ]])

In [6]:
# Share of matrix cells that hold a non-zero rating, expressed as a percentage
# (i.e. the filled fraction of the matrix).
sparsity = np.count_nonzero(ratings) / ratings.size
sparsity = sparsity * 100
print('Sparsity: {:4.2f}%'.format(sparsity))

Sparsity: 1.70%


In [7]:
def user_item_matrix(data):
    """Build a dense user x item rating matrix from a ratings frame.

    Reads the first three columns of ``data`` positionally as
    (user id, movie id, rating). The user id minus one selects the row and the
    module-level map ``d`` turns the movie id into a column. The result has
    shape ``(n_users, n_items)`` (both module-level) and keeps 0 wherever
    ``data`` has no entry.
    """
    matrix = np.zeros((n_users, n_items))
    # itertuples() yields (index, col0, col1, col2, ...); only the first three
    # data columns are used.
    for _index, user_id, movie_id, rating, *_unused in data.itertuples():
        matrix[user_id - 1, d[movie_id]] = rating
    return matrix

In [8]:
# Pin the shuffle seed so the train/test partition -- and every matrix derived
# from it -- is identical on each Restart & Run All. The split proportions are
# left at train_test_split's defaults.
train_set, test_set = train_test_split(df, random_state=42)

In [9]:
# Turn each split into its own dense user x item matrix. Both have the same
# shape because user_item_matrix sizes its output from n_users and n_items,
# not from the rows it is given.
train = user_item_matrix(train_set)
test = user_item_matrix(test_set)
train

array([[4. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [2.5, 0. , 2. , ..., 0. , 0. , 0. ],
       [3. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ]])

In [10]:
def normalize(data, kind='user'):
    """Mean-centre the observed ratings of a rating matrix, in place.

    Entries that are 0 (or NaN) are treated as "not rated": they are left out
    of the mean and are 0 in the result. A row/column with no observed entries
    is left as all zeros (no warning is emitted for it).

    Parameters
    ----------
    data : numpy.ndarray
        2-D floating-point rating matrix. It is overwritten with the
        centred values.
    kind : {'user', 'item'}
        'user' subtracts each row's mean from that row's observed entries;
        'item' does the same per column.

    Returns
    -------
    numpy.ndarray
        The same array object that was passed in as ``data``.

    Raises
    ------
    ValueError
        If ``kind`` is not 'user' or 'item', or if ``data`` is not a
        floating-point array (centred values cannot be stored in it).
    """
    if kind == 'user':
        axis = 1
    elif kind == 'item':
        axis = 0
    else:
        raise ValueError("kind must be 'user' or 'item', got {!r}".format(kind))
    if not np.issubdtype(data.dtype, np.floating):
        raise ValueError("normalize() needs a floating-point array, got dtype {}".format(data.dtype))

    observed = (data != 0) & ~np.isnan(data)
    counts = observed.sum(axis=axis, keepdims=True)
    totals = np.where(observed, data, 0.0).sum(axis=axis, keepdims=True)
    # Divide only where something was observed; empty rows/columns keep mean 0
    # instead of producing NaN and a RuntimeWarning.
    means = np.divide(totals, counts, out=np.zeros_like(totals), where=counts > 0)

    # Write back into the caller's array so the in-place contract is preserved.
    data[...] = np.where(observed, data - means, 0.0)
    return data

In [11]:
# Silence any RuntimeWarning raised while normalizing (e.g. from averaging a
# user row that has no ratings in the training split).
# NOTE: normalize() overwrites its argument and returns it, so after this cell
# `train` itself holds the centred values and train_norm_matrix is the same
# array object -- re-running this cell alone will centre the data a second time.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=RuntimeWarning)
    train_norm_matrix = normalize(train, kind = 'user')

In [12]:
def norm(ratings):
    """Return the Euclidean (L2) norm of each row of ``ratings``.

    Parameters
    ----------
    ratings : iterable of 1-D array-likes
        Typically a 2-D array; it is iterated row by row.

    Returns
    -------
    list
        One norm per row, in row order. Empty input gives an empty list.
    """
    return [np.linalg.norm(row, ord=2) for row in ratings]

In [13]:
def similarity(ratings, kind='user', epsilon=1e-9):
    """Cosine similarity between the rows or the columns of ``ratings``.

    Parameters
    ----------
    ratings : numpy.ndarray
        2-D matrix (rows are compared for 'user', columns for 'item').
    kind : {'user', 'item'}
        'user' compares rows with each other, 'item' compares columns.
    epsilon : float
        Small constant added to every pairwise dot product so that all-zero
        vectors do not cause a division by zero.

    Returns
    -------
    numpy.ndarray
        Square matrix of pairwise similarities with ones on the diagonal.

    Raises
    ------
    ValueError
        If ``kind`` is neither 'user' nor 'item'.
    """
    if kind == 'user':
        sim = ratings.dot(ratings.T) + epsilon
    elif kind == 'item':
        sim = ratings.T.dot(ratings) + epsilon
    else:
        raise ValueError("kind must be 'user' or 'item', got {!r}".format(kind))
    # The diagonal holds each vector's squared length (plus epsilon); dividing
    # by the row and column norms turns dot products into cosines.
    norms = np.sqrt(np.diagonal(sim))[np.newaxis, :]
    return sim / norms / norms.T

In [14]:
# Pairwise user-user similarity computed from the mean-centred training matrix.
# (Item-item variant, currently disabled: similarity(train, kind='item').)
user_sim_matrix = similarity(train_norm_matrix, kind = 'user')
user_sim_matrix

array([[ 1.00000000e+00, -1.91727150e-03, -4.33935419e-03, ...,
         7.10489809e-02,  1.33719035e-02,  3.12462885e-03],
       [-1.91727150e-03,  1.00000000e+00,  2.41818559e-11, ...,
        -7.53594342e-03,  1.21828981e-10,  1.49925387e-02],
       [-4.33935419e-03,  2.41818559e-11,  1.00000000e+00, ...,
        -5.18756211e-03,  4.25336093e-11,  2.10347388e-03],
       ...,
       [ 7.10489809e-02, -7.53594342e-03, -5.18756211e-03, ...,
         1.00000000e+00,  2.09700029e-03,  4.35641387e-02],
       [ 1.33719035e-02,  1.21828981e-10,  4.25336093e-11, ...,
         2.09700029e-03,  1.00000000e+00, -6.01326678e-03],
       [ 3.12462885e-03,  1.49925387e-02,  2.10347388e-03, ...,
         4.35641387e-02, -6.01326678e-03,  1.00000000e+00]])

In [15]:
def find_top_k_neighbor(sim_matrix):
    user_sim_list = sim_matrix[0].tolist()
    print(user_sim_list)