In [171]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.sparse import csr_matrix
from mpl_toolkits.axes_grid1 import make_axes_locatable

# Load the movies and ratings tables from the CSV files in the working directory.
movies = pd.read_csv('movies.csv')
movies.head()
ratings = pd.read_csv('ratings.csv')
ratings.head()

# Attach the movie columns to every rating (joined on movieId), then pivot to
# one row per userId and one column per title, holding the rating value.
ratings = ratings.merge(movies, on='movieId')
ratings_matrix = pd.pivot_table(ratings, index='userId', columns='title', values='rating')

print('dataset dimensions: ', ratings_matrix.shape, '\n\nSubset example:')
ratings_matrix.iloc[:6, :10]

# Size of the sub-table to display.
n_movies = 30
n_users = 18
# NOTE(review): sort_by_rating_density is not defined in the cells visible
# here; the helper defined in a later cell with the same argument list is
# named rating_density -- confirm which one is intended.
most_rated_movies_users_selection = sort_by_rating_density(ratings_matrix, n_movies, n_users)
most_rated_movies_users_selection

dataset dimensions:  (630, 9719) 

Subset example:


title,Forrest Gump (1994),"Shawshank Redemption, The (1994)",Pulp Fiction (1994),"Silence of the Lambs, The (1991)","Matrix, The (1999)",Star Wars: Episode IV - A New Hope (1977),Jurassic Park (1993),Braveheart (1995),Terminator 2: Judgment Day (1991),Schindler's List (1993),...,Star Wars: Episode VI - Return of the Jedi (1983),"Godfather, The (1972)","Fugitive, The (1993)",Batman (1989),Saving Private Ryan (1998),"Lord of the Rings: The Two Towers, The (2002)","Lord of the Rings: The Return of the King, The (2003)",Aladdin (1992),Fargo (1996),"Sixth Sense, The (1999)"
589,5.0,4.5,4.5,3.5,4.0,5.0,4.0,4.0,4.5,5.0,...,4.5,5.0,4.0,3.5,4.0,5.0,4.5,4.0,4.0,3.5
479,5.0,5.0,4.0,4.5,5.0,4.5,5.0,5.0,4.5,5.0,...,3.5,5.0,3.5,4.5,4.5,4.5,4.0,4.0,4.0,4.0
67,3.5,3.0,2.0,3.5,4.5,5.0,3.5,2.5,3.5,4.0,...,5.0,4.0,4.5,4.0,4.0,4.0,4.5,3.5,2.5,2.5
473,3.0,5.0,4.0,4.5,4.5,4.0,4.5,3.0,4.0,5.0,...,4.0,5.0,5.0,4.0,3.0,5.0,5.0,4.0,4.0,5.0
413,5.0,5.0,5.0,4.0,5.0,5.0,4.0,5.0,5.0,4.0,...,5.0,5.0,5.0,4.0,5.0,5.0,4.0,4.0,5.0,3.0
248,4.5,4.5,4.0,4.0,5.0,5.0,4.0,5.0,4.0,4.5,...,4.5,4.5,4.5,,4.5,4.5,5.0,4.0,4.5,4.0
424,5.0,5.0,4.0,4.0,3.5,3.0,3.5,4.0,3.5,4.0,...,3.0,3.5,3.5,3.5,4.0,4.5,4.0,3.0,2.5,3.5
181,5.0,4.5,5.0,4.5,5.0,3.5,3.5,3.5,2.0,4.0,...,2.5,4.5,3.5,3.5,3.0,3.0,1.0,,5.0,4.0
579,4.0,5.0,5.0,4.5,5.0,4.0,4.0,4.5,4.0,3.0,...,4.0,4.5,3.0,3.0,5.0,4.5,4.0,2.0,4.0,4.5
273,4.5,4.5,5.0,4.0,4.0,3.0,3.5,4.5,4.5,4.0,...,4.0,3.5,3.5,3.0,4.0,3.5,3.0,4.0,3.0,4.0


In [172]:
def users_top_rating(top_movies, max):
    """Return the rows of ``top_movies`` that hold the most non-null values.

    Parameters
    ----------
    top_movies : pandas.DataFrame
        Table whose rows are ranked by their number of non-null cells
        (callers in this notebook pass a ratings table with users as rows).
    max : int
        Maximum number of rows to keep. The name shadows the ``max`` builtin
        inside this function; it is kept so keyword callers keep working.

    Returns
    -------
    pandas.DataFrame
        At most ``max`` rows of ``top_movies``, ordered by descending count of
        non-null values, with the same columns as the input.

    Notes
    -----
    The input frame is not modified. The previous implementation wrote a
    temporary ``counts`` column into the caller's frame, which leaked into the
    caller's data and overwrote any real column with that name.
    """
    # Rank on a separate Series, addressed by position so duplicate row labels
    # cannot multiply rows in the selection.
    non_null_per_row = top_movies.count(axis=1).reset_index(drop=True)
    ranked_positions = non_null_per_row.sort_values(ascending=False).index[:max].values

    return top_movies.iloc[ranked_positions, :]

def highest_rated_movie(user_ratings, max_ratings):
    """Keep the ``max_ratings`` columns that have the most non-null values.

    Despite the name, columns are ranked by how many ratings they hold, not by
    how high those ratings are.

    Parameters
    ----------
    user_ratings : pandas.DataFrame
        Table whose columns are ranked by their number of non-null cells
        (callers in this notebook pass a ratings table with movies as columns).
    max_ratings : int
        Maximum number of columns to keep.

    Returns
    -------
    pandas.DataFrame
        The selected columns, ordered by descending non-null count. The row
        index is renumbered 0..n-1, as the earlier ``append(...,
        ignore_index=True)`` implementation did; later cells in this notebook
        call ``reset_index()`` on the result and drop the resulting ``'index'``
        column, so that renumbering is preserved here.

    Notes
    -----
    ``DataFrame.append`` was removed in pandas 2.0. The counts are now ranked
    directly instead of being appended as a temporary last row, sorted on, and
    dropped again.
    """
    # Rank columns by position so duplicate column labels cannot multiply
    # columns in the selection.
    non_null_per_column = user_ratings.count().reset_index(drop=True)
    ranked_positions = non_null_per_column.sort_values(ascending=False).index[:max_ratings].values

    renumbered = user_ratings.reset_index(drop=True)
    top_movies = renumbered.iloc[:, ranked_positions]
    return top_movies

def rating_density(user_movie_ratings, n_movies, n_users):
    """Reduce a ratings table to its most densely filled block.

    The table is first cut down to ``n_movies`` columns by
    ``highest_rated_movie`` and the result is then cut down to ``n_users``
    rows by ``users_top_rating``.
    """
    densest_columns = highest_rated_movie(user_movie_ratings, n_movies)
    return users_top_rating(densest_columns, n_users)

In [173]:
# Pivot to one row per userId and one column per title, holding the rating.
# NOTE(review): `ratings_title` is not defined in the cells visible here; an
# earlier cell stores the merged ratings/movies frame as `ratings` -- confirm
# which frame is intended.
user_movie_ratings =  pd.pivot_table(ratings_title, index='userId', columns= 'title', values='rating')

# Keep the 2000 columns with the most ratings (the variable name says "1k";
# it is left unchanged because later cells refer to it).
most_rated_movies_1k = highest_rated_movie(user_movie_ratings, 2000)

# pd.SparseDataFrame was removed in pandas 1.0. Filling the missing ratings
# with 0 and letting csr_matrix drop the zeros gives a sparse matrix of the
# same shape that stores the non-missing ratings.
sparse_ratings = csr_matrix(most_rated_movies_1k.fillna(0).values)

# NOTE(review): KMeans is not imported in the cells visible here. Recent
# scikit-learn releases name this algorithm 'lloyd' instead of 'full' --
# confirm against the installed version. No random_state is set, so the
# group labels can differ between runs.
predictions = KMeans(n_clusters=2, algorithm='full').fit_predict(sparse_ratings)

# Put the predicted group next to each row; reset_index() moves the row index
# into a regular column so both frames line up by position.
clustered = pd.concat([most_rated_movies_1k.reset_index(), pd.DataFrame({'group':predictions})], axis=1)
cluster_number = 1

In [174]:
# Upper bounds on the rows and columns kept for the selected cluster.
num_users = 600
num_movies = 1000

# Rows assigned to the selected group, minus the two bookkeeping columns.
cluster = clustered.loc[clustered['group'] == cluster_number].drop(columns=['index', 'group'])

# Cut down to the most densely rated columns and rows.
cluster = rating_density(cluster, num_movies, num_users)

In [176]:
user_id = 401

# Get all this user's ratings.
# NOTE(review): this is a row-label lookup on `cluster`. highest_rated_movie
# renumbers rows from 0, so the label is presumably a row position in the
# ratings table rather than a userId -- confirm. It raises KeyError when that
# row is not part of the selected cluster.
user_2_ratings  = cluster.loc[user_id, :]

# Which movies did they not rate? (We don't want to recommend movies they've already rated)
user_2_unrated_movies =  user_2_ratings[user_2_ratings.isnull()]

# Cluster-wide mean rating of each movie this user has not rated.
# The means are filtered directly instead of being concatenated with the
# user's row and selected with `.loc[:, 0]`: that selection relied on
# pd.concat naming the unnamed means column 0, which collides with the
# user's own column when user_id is 0.
cluster_means = cluster.mean()
avg_ratings = cluster_means[cluster_means.index.isin(user_2_unrated_movies.index)]

# Let's sort by rating so the highest rated movies are presented first
avg_ratings.sort_values(ascending=False).head(20)

Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)     4.500000
Emma (1996)                                       4.472222
Raging Bull (1980)                                4.468750
Harold and Maude (1971)                           4.454545
Cool Hand Luke (1967)                             4.446429
Shawshank Redemption, The (1994)                  4.426230
Philadelphia Story, The (1940)                    4.388889
Manchurian Candidate, The (1962)                  4.384615
Hoop Dreams (1994)                                4.375000
Lawrence of Arabia (1962)                         4.357143
North by Northwest (1959)                         4.344828
Cinema Paradiso (Nuovo cinema Paradiso) (1989)    4.333333
All About Eve (1950)                              4.333333
To Catch a Thief (1955)                           4.318182
Watchmen (2009)                                   4.312500
Gods Must Be Crazy, The (1980)                    4.307692
Rear Window (1954)                                4.2954