In [1]:
%config IPCompleter.greedy=True
%matplotlib inline

In [2]:
import pandas as pd, numpy as np, os, sys, cv2
import matplotlib
import matplotlib.pyplot as plt
from IPython.display import display, HTML

# Global matplotlib font settings applied to every figure in this notebook.
font = dict(size=18)
matplotlib.rc('font', **font)

In [7]:
# Tab-separated ratings file without a header row; column names are supplied here.
data = pd.read_csv(
    'movie_reviews.data',
    sep='\t',
    header=None,
    names=['user', 'movie', 'rating', 'timestamp'],
)
print(data.head(10))
print(data.shape)

# Distinct id counts; these later size the user x movie rating matrix.
n_users = len(data['user'].unique())
n_items = len(data['movie'].unique())
print('users:', n_users)
print('movies:', n_items)

   user  movie  rating  timestamp
0   196    242       3  881250949
1   186    302       3  891717742
2    22    377       1  878887116
3   244     51       2  880606923
4   166    346       1  886397596
5   298    474       4  884182806
6   115    265       2  881171488
7   253    465       5  891628467
8   305    451       3  886324817
9     6     86       3  883603013
(100000, 4)
users: 943
movies: 1682


In [9]:
# Dense (n_users, n_items) matrix; cells with no rating stay 0.
# The "- 1" shifts ids to 0-based indices (assumes ids start at 1 and are contiguous).
ratings = np.zeros((n_users, n_items))
for user_id, movie_id, score in zip(data['user'], data['movie'], data['rating']):
    ratings[user_id - 1, movie_id - 1] = score
print(ratings[:4, :])

[[5. 3. 4. ... 0. 0. 0.]
 [4. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [12]:
# Fraction of cells in the rating matrix that hold a non-zero rating.
# NOTE(review): this is the *filled* fraction (density); the label below is kept as-is.
n_rated = np.count_nonzero(ratings)
sparsity = n_rated / ratings.size
print('Sparsity:', sparsity)

Sparsity: 0.06304669364224531


In [13]:
def fast_similarity(ratings, kind='user', epsilon=1e-9):
    """Compute a pairwise cosine-similarity matrix from a rating matrix.

    Parameters
    ----------
    ratings : 2-D numpy array
        Rating matrix; rows are compared when ``kind='user'`` and columns
        when ``kind='item'``.
    kind : {'user', 'item'}
        Which axis to compare.
    epsilon : float
        Small constant added to every dot product so that all-zero
        rows/columns do not cause a divide-by-zero.

    Returns
    -------
    numpy array
        Square matrix of shape ``(n, n)`` where ``n`` is ``ratings.shape[0]``
        for ``kind='user'`` and ``ratings.shape[1]`` for ``kind='item'``.

    Raises
    ------
    ValueError
        If ``kind`` is neither ``'user'`` nor ``'item'``.
    """
    if kind == 'user':
        sim = ratings.dot(ratings.T) + epsilon
    elif kind == 'item':
        sim = ratings.T.dot(ratings) + epsilon
    else:
        # Previously an unknown kind fell through and crashed with an
        # UnboundLocalError on `sim`; fail early with a clear message instead.
        raise ValueError("kind must be 'user' or 'item', got %r" % (kind,))
    # The diagonal holds each vector's squared norm (plus epsilon); the (1, n)
    # row and its (n, 1) transpose broadcast to divide entry (i, j) by
    # norm_j and norm_i respectively.
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return (sim / norms / norms.T)

In [24]:
# Step-by-step walk-through of the similarity computation.
# Row-by-row dot products: entry (i, j) is the dot product of rows i and j of `ratings`.
mat = ratings @ ratings.T
print(mat.shape)
# Square root of the diagonal gives each row's norm; keep it as a (1, n) row vector.
diag = np.sqrt(np.diagonal(mat))[np.newaxis, :]
print(diag.shape)

(943, 943)
(1, 943)


In [29]:
# Transposing the (1, n) row of norms yields an (n, 1) column with the same values.
print(diag.T.shape)
print(diag[:, :10], diag.T[:10])

(943, 1)
[[63.07138812 30.29851482 22.38302929 21.67948339 42.05948169 54.90901565
  82.41359111 30.65941943 20.49390153 57.6020833 ]] [[63.07138812]
 [30.29851482]
 [22.38302929]
 [21.67948339]
 [42.05948169]
 [54.90901565]
 [82.41359111]
 [30.65941943]
 [20.49390153]
 [57.6020833 ]]


In [28]:
print(mat[:10, :10])
# Broadcasting the (1, n) row: every entry in column j is divided by diag[0, j].
a = np.divide(mat, diag)
print(a.shape)
print(a[:10, :10])

[[3978.  319.   67.   88. 1004. 1490. 2289.  617.  101. 1368.]
 [ 319.  918.   75.  117.   93.  409.  268.   96.  100.  279.]
 [  67.   75.  501.  167.   20.   89.  122.   57.   28.   84.]
 [  88.  117.  167.  470.   29.   81.  163.  125.   45.   76.]
 [1004.   93.   20.   29. 1769.  548. 1295.  321.   49.  488.]
 [1490.  409.   89.   81.  548. 3015. 2214.  339.  207. 1745.]
 [2289.  268.  122.  163. 1295. 2214. 6792.  720.  246. 2312.]
 [ 617.   96.   57.  125.  321.  339.  720.  940.   54.  412.]
 [ 101.  100.   28.   45.   49.  207.  246.   54.  420.  234.]
 [1368.  279.   84.   76.  488. 1745. 2312.  412.  234. 3318.]]
(943, 943)
[[63.07138812 10.52856887  2.99333925  4.05913732 23.87095513 27.13579878
  27.77454506 20.12432105  4.92829537 23.74914103]
 [ 5.05776089 30.29851482  3.35075289  5.39680757  2.21115421  7.44868571
   3.25189082  3.13117475  4.87950036  4.84357482]
 [ 1.06228834  2.47536886 22.38302929  7.70313559  0.47551703  1.62086315
   1.48033836  1.85913501  1.36626

In [31]:
# Broadcasting the (n, 1) column: every entry in row i is divided by diag[0, i],
# which completes the normalisation by both norms.
sim = np.divide(a, diag.T)
print(sim.shape)
print(sim[:10, :10])

(943, 943)
[[1.         0.16693098 0.04745954 0.06435782 0.37847518 0.43023944
  0.4403668  0.31907211 0.07813837 0.37654381]
 [0.16693098 1.         0.11059132 0.17812119 0.07297896 0.24584326
  0.10732839 0.10334417 0.16104751 0.15986179]
 [0.04745954 0.11059132 1.         0.34415072 0.02124453 0.07241482
  0.06613664 0.08306003 0.06104    0.06515117]
 [0.06435782 0.17812119 0.34415072 1.         0.03180425 0.06804441
  0.09123045 0.18806031 0.10128356 0.06085923]
 [0.37847518 0.07297896 0.02124453 0.03180425 1.         0.23728647
  0.37360013 0.24892997 0.056847   0.20142701]
 [0.43023944 0.24584326 0.07241482 0.06804441 0.23728647 1.
  0.48925483 0.20136877 0.18395095 0.55171346]
 [0.4403668  0.10732839 0.06613664 0.09123045 0.37360013 0.48925483
  1.         0.2849507  0.14565038 0.48702449]
 [0.31907211 0.10334417 0.08306003 0.18806031 0.24892997 0.20136877
  0.2849507  1.         0.08594195 0.23328945]
 [0.07813837 0.16104751 0.06104    0.10128356 0.056847   0.18395095
  0.14565