In [7]:
import os
import pandas as pd
import numpy as np
data_path = "data/ml-100k"

df = pd.read_csv(os.path.join(data_path, "u.data"), sep="\t", header=None, names=["user_id", "item_id", "rating", "timestamp"])
df = df[['user_id', 'item_id', 'rating']]
df.head()

# Load data



Unnamed: 0,user_id,item_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [8]:
# Convert records into user-book review score matrix
reviewmatrix = df.pivot(index="user_id", columns="item_id", values="rating").fillna(0)
matrix = reviewmatrix.values
display(matrix)



array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [10]:
# Singular value decomposition
u, s, vh = np.linalg.svd(matrix, full_matrices=False)
 
# Find the highest similarity
def cosine_similarity(v,u):
    return (v @ u)/ (np.linalg.norm(v) * np.linalg.norm(u))
 
highest_similarity = -np.inf
highest_sim_col = -1
for col in range(1,vh.shape[1]):
    similarity = cosine_similarity(vh[:,0], vh[:,col])
    if similarity > highest_similarity:
        highest_similarity = similarity
        highest_sim_col = col
 
print("Column %d (movie id %s) is most similar to column 0 (movie id %s)" %
        (highest_sim_col, reviewmatrix.columns[col], reviewmatrix.columns[0])
)

newt = u @ np.diag(s) @ vh
display(newt)


Column 1615 (book id 1682) is most similar to column 0 (book id 1)


array([[ 5.00000000e+00,  3.00000000e+00,  4.00000000e+00, ...,
        -3.76434994e-16, -4.68375339e-16,  2.52510686e-16],
       [ 4.00000000e+00,  3.30568906e-13, -1.82145965e-16, ...,
        -2.11636264e-16,  1.83880688e-16, -2.46330734e-16],
       [-1.67053871e-14,  1.38500322e-14, -1.46965773e-14, ...,
        -1.87350135e-16,  6.93889390e-18, -2.56739074e-16],
       ...,
       [ 5.00000000e+00,  1.40859546e-15, -2.14411822e-15, ...,
        -1.09791733e-15,  2.67147415e-16, -2.61726404e-16],
       [ 2.84494650e-15,  7.77156117e-15, -2.84494650e-15, ...,
        -1.20389809e-15, -7.88431820e-16,  1.48318857e-16],
       [ 2.03448369e-14,  5.00000000e+00, -6.30051566e-15, ...,
         4.68375339e-17,  5.55111512e-16,  3.85108612e-16]])

In [11]:
display(s)

array([640.63362257, 244.83634567, 217.84622472, 159.15359872,
       158.21191449, 145.87261327, 126.57977314, 121.90769976,
       106.8291837 ,  99.74793974,  93.79885965,  93.25844284,
        89.91150168,  84.34178722,  83.81220836,  81.81204105,
        79.07796788,  77.88652669,  76.387996  ,  75.3415951 ,
        73.68235502,  72.80837191,  72.51350545,  71.52749477,
        69.77179735,  69.10881715,  68.8735702 ,  67.94277928,
        67.40829434,  67.06352378,  66.85757418,  65.59270059,
        65.27526042,  64.79965625,  64.44727664,  64.09819141,
        63.91638042,  63.08261122,  62.67586971,  62.23742793,
        62.03574728,  61.77291401,  61.33544177,  61.0632462 ,
        60.56817026,  60.30813928,  59.77166759,  59.51420996,
        59.40675   ,  59.10683763,  58.83667955,  58.53445585,
        58.33802154,  58.1323194 ,  57.41759146,  57.36384311,
        57.30977341,  56.99448748,  56.72636608,  56.239748  ,
        56.17894513,  55.8734678 ,  55.65459359,  55.52