In [1]:
import pandas as pd
import numpy as np

movies = pd.read_csv('datasets/ml-latest-small/movies.csv', index_col=0)
ratings_df = pd.read_csv('datasets/ml-latest-small/ratings.csv')
ratings_df.columns = ['user_id', 'movie_id', 'rating', 'timestamp']

user_ratings = ratings_df.pivot(index='movie_id', columns='user_id',
                                values='rating')


In [2]:
ui_matrix = np.copy(user_ratings.values)
popularity = np.isfinite(ui_matrix).astype(int).sum(axis=1)
means = np.nanmean(ui_matrix, axis=1)
ui_matrix = ui_matrix - means.reshape(-1, 1)
ui_matrix = np.nan_to_num(ui_matrix)

In [3]:
from scipy.sparse.linalg import svds

u, s, vt = svds(ui_matrix, k=500)

In [4]:
from sklearn.cluster import KMeans

n_clusters = 10
kmeans = KMeans(n_clusters)
clusters = kmeans.fit_predict(u)

cluster_mask = np.asarray([clusters == i for i in range(n_clusters)])

ratings_mask = np.isfinite(user_ratings.values)

In [5]:
np.sum(ratings_mask, axis=1).size
relevance = ((np.sum(ui_matrix, axis=1) / user_ratings.shape[1]) * 
             (user_ratings.shape[0] / np.sum(ratings_mask, axis=1)))

relevance_df = pd.DataFrame(relevance, index=user_ratings.index)
relevance_df['title'] = [movies.title[id] for id in relevance_df.index]
relevance_df.columns = ['relevance', 'title']

relevance_df.sort_values(by='relevance', ascending=False)


masked_array = np.tile(relevance, (cluster_mask.shape[0], 1))
masked_array[~cluster_mask] = -np.inf
sorted_array = np.argsort(masked_array, axis=1)

relevance_df.iloc[sorted_array[:, -1]]

Unnamed: 0_level_0,relevance,title
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1304,5.920166e-15,Butch Cassidy and the Sundance Kid (1969)
4447,-2.04261e-15,Legally Blonde (2001)
3354,8.000224e-16,Mission to Mars (2000)
344,-4.114401e-16,Ace Ventura: Pet Detective (1994)
778,-5.806614e-15,Trainspotting (1996)
3730,5.739291e-15,"Conversation, The (1974)"
780,-5.504741e-16,Independence Day (a.k.a. ID4) (1996)
25,-2.613935e-15,Leaving Las Vegas (1995)
7153,-1.159123e-15,"Lord of the Rings: The Return of the King, The..."
1196,-8.205358e-16,Star Wars: Episode V - The Empire Strikes Back...
