In [1]:

import numpy as np
import pandas as pd

In [2]:
movie_ratings_data = pd.read_csv('../ml-1m/ratings.dat', names=['user_id', 'movie_id', 'rating', 'time'],
    engine='python', delimiter='::')

In [3]:
movie_ratings_data

Unnamed: 0,user_id,movie_id,rating,time
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [4]:
movie_data = pd.read_csv('../ml-1m/movies.dat', names=['movie_id', 'title', 'genre'],
    engine='python', delimiter='::')

In [5]:
ratings_mat = np.ndarray(
    shape=(np.max(movie_ratings_data.user_id.values), np.max(movie_ratings_data.movie_id.values)),
    dtype=np.uint8)
ratings_mat[movie_ratings_data.user_id.values-1, movie_ratings_data.movie_id.values-1] = movie_ratings_data.rating.values

In [6]:
ratings_mat.shape

(6040, 3952)

In [7]:
ratings_mat

array([[5, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [3, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [18]:
normalised_mat = ratings_mat - np.asarray([(np.mean(ratings_mat, 1))]).T

In [19]:
A = normalised_mat.T / np.sqrt(ratings_mat.shape[0] - 1)
U, S, V = np.linalg.svd(A, full_matrices = False)

In [20]:
def top_cosine_similarity(data, movie_id, top_n=10):
    index = movie_id - 1 # Movie id starts from 1
    movie_row = data[index, :]
    magnitude = np.sqrt(np.einsum('ij, ij -> i', data, data))
    similarity = np.dot(movie_row, data.T) / (magnitude[index] * magnitude)
    sort_indexes = np.argsort(-similarity)
    return sort_indexes[:top_n]

# Helper function to print top N similar movies
def print_similar_movies(movie_data, movie_id, top_indexes):
    print('Recommendations for {0}: \n'.format(
    movie_data[movie_data.movie_id == movie_id].title.values[0]))
    for id in top_indexes + 1:
        print(movie_data[movie_data.movie_id == id].title.values[0])

In [42]:
U.shape

(3952, 3952)

In [43]:
V.shape

(3952, 6040)

In [45]:
S

array([2.01680812e+01, 8.62790194e+00, 7.39264526e+00, ...,
       1.46912634e-15, 1.46912634e-15, 1.11368253e-15])

In [24]:
k = 10
user_id = 0 # Grab an id from users.dat
top_n = 10

sliced = U[:, :k] # representative data
indexes = top_cosine_similarity(sliced, movie_id, top_n)
indexes
# print_similar_movies(movie_data, movie_id, indexes)

array([3951, 2685, 3175, 2332, 2907, 2335, 3892, 3565, 3184, 3077])

In [27]:
print(ratings_mat[0][0:100])

[5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [28]:
ratings_mat[0:].shape

(6040, 3952)

In [29]:
ratings_mat

array([[5, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [3, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [50]:
index = 0
user_row = ratings_mat[index, :]
top_indexes = np.argsort(user_row)[::-1]

In [52]:
top_indexes

array([   0,  526, 2803, ..., 2623, 2622, 1975])

In [53]:
for id in top_indexes[:20] + 1:
        print(movie_data[movie_data.movie_id == id].title.values[0])

Toy Story (1995)
Schindler's List (1993)
Christmas Story, A (1983)
Last Days of Disco, The (1998)
Bug's Life, A (1998)
Awakenings (1990)
Beauty and the Beast (1991)
One Flew Over the Cuckoo's Nest (1975)
Ben-Hur (1959)
Saving Private Ryan (1998)
Pocahontas (1995)
Back to the Future (1985)
Sound of Music, The (1965)
Dumbo (1941)
Mary Poppins (1964)
Cinderella (1950)
Apollo 13 (1995)
Rain Man (1988)
Antz (1998)
Run Lola Run (Lola rennt) (1998)


In [54]:
user_row_reshaped = np.zeros(3952).reshape(1,3952)
user_row_reshaped[0,0] = 5

In [55]:
user_row_reshaped

array([[5., 0., 0., ..., 0., 0., 0.]])

In [56]:
def query(q,V):
    #find q*v, w
    prd=np.dot(q,V)
    Vt=np.transpose(V)
    other=np.dot(prd,Vt)
    return other

In [57]:
predict = np.dot(user_row_reshaped,U)

In [58]:
predict

array([[-3.78466803e-01,  8.99797086e-02, -1.63666270e-01, ...,
         6.06669213e-08, -5.74946854e-08, -2.19201923e-03]])

In [60]:
top_indexes = np.argsort(predict[0])[::-1]

In [61]:
top_indexes

array([145,  87,  33, ..., 164, 126,  12])

In [63]:
for id in top_indexes[:10] + 1:
        print(movie_data[movie_data.movie_id == id].title.values[0])

Amazing Panda Adventure, The (1995)
Black Sheep (1996)
Babe (1995)
Blue in the Face (1995)
How to Make an American Quilt (1995)
Margaret's Museum (1995)
Ace Ventura: When Nature Calls (1995)
Unzipped (1995)
Postino, Il (The Postman) (1994)
Apollo 13 (1995)


In [31]:
V.shape

(6040, 6040)

In [67]:
U.shape

(3952, 3952)

In [37]:
S

array([2.01680812e+01, 8.62790194e+00, 7.39264526e+00, ...,
       1.46912634e-15, 1.46912634e-15, 1.11368253e-15])

In [69]:
S[0]

20.168081247879016