In [84]:
import pandas as pd
import numpy as np
import math
import warnings
from sklearn.model_selection import train_test_split

In [55]:
df = pd.read_csv("./ml-latest-small/ratings.csv")
df.head()
# df = pd.read_csv("/home/tianyou/ml-latest/ratings.csv")

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [52]:
n_users = df.userId.unique().shape[0]
n_items = df.movieId.unique().shape[0]
n_users, n_items

(610, 9724)

In [32]:
movie_movieId = df.movieId.unique().tolist()
movie_movieId.sort()
d = dict()
for i in range(0, len(movie_movieId)):
    d[movie_movieId[i]] = i

In [33]:
ratings = np.zeros((n_users, n_items))
for row in df.itertuples():
    ratings[row[1]-1, d[row[2]]] = row[3]
ratings

array([[4. , 0. , 4. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [2.5, 2. , 2. , ..., 0. , 0. , 0. ],
       [3. , 0. , 0. , ..., 0. , 0. , 0. ],
       [5. , 0. , 0. , ..., 0. , 0. , 0. ]])

In [34]:
def user_item_matrix(data):
    ratings = np.zeros((n_users, n_items))
    for row in data.itertuples():
        ratings[row[1]-1, d[row[2]]] = row[3]
    return ratings

In [53]:
train_set, test_set = train_test_split(df)

In [54]:
train = user_item_matrix(train_set)
test = user_item_matrix(test_set)
train

array([[4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 2., ..., 0., 0., 0.],
       [3., 0., 0., ..., 0., 0., 0.],
       [5., 0., 0., ..., 0., 0., 0.]])

In [50]:
sparsity = float(len(ratings.nonzero()[0]))
sparsity /= (ratings.shape[0] * ratings.shape[1])
sparsity *= 100
print ('Sparsity: {:4.2f}%'.format(sparsity))

Sparsity: 1.70%


In [93]:
def normalize(data, kind='user'):
    data[data == 0] = np.nan
    print(np.nanmean(data[0]))
    if kind == 'user':
        for i in range(0, data.shape[0]):
            # print(np.nanmean(data[i]))
            data[i] = data[i] - np.nanmean(data[i])
            
    if kind == 'item':
        for j in range(0, data.shape[1]):
            data.T[j] = data.T[j] - np.nanmean(data.T[j])

    data[np.isnan(data)] = 0
    print(data)

In [94]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=RuntimeWarning)
    normalize(train, kind = 'user')
    # normalize(train, kind = 'item')


0.0
[[-0.70278355  0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 ...
 [ 0.          0.         -1.0367462  ...  0.          0.
   0.        ]
 [-0.58685967  0.          0.         ...  0.          0.
   0.        ]
 [ 0.90375277  0.          0.         ...  0.          0.
   0.        ]]


In [80]:
def fast_similarity(ratings, kind='user', epsilon=1e-9):
    # epsilon -> small number for handling dived-by-zero errors
    if kind == 'user':
        sim = ratings.dot(ratings.T) + epsilon
    elif kind == 'item':
        sim = ratings.T.dot(ratings) + epsilon
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return (sim / norms / norms.T)

In [82]:
sim_matrix = fast_similarity(train)

In [86]:
def predict_fast_simple(ratings, similarity, kind='user'):
    if kind == 'user':
        return similarity.dot(ratings) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif kind == 'item':
        return ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])

In [88]:
predict_fast_simple(train, sim_matrix, kind='user')

array([[-0.12219921, -0.01101529, -0.01166388, ...,  0.        ,
         0.        ,  0.        ],
       [-0.01197271,  0.01334992, -0.01218662, ...,  0.        ,
         0.        ,  0.        ],
       [-0.00545199,  0.00743032,  0.00306055, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.01175141, -0.01074755, -0.08858105, ...,  0.        ,
         0.        ,  0.        ],
       [-0.21039363, -0.00515308, -0.0098433 , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.12949154, -0.01268769, -0.00334989, ...,  0.        ,
         0.        ,  0.        ]])