In [1]:
import IPython
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly as py
from plotly.offline import download_plotlyjs
import pylab
import scipy
import seaborn as sns
import sklearn
from sklearn import *
import statsmodels as sm

np.random.seed(1337)

%matplotlib inline

IPython.core.display.display(IPython.core.display.HTML(
    "<style>.container { width:90% !important; }</style>"))

sns.set(font_scale=1.3)
sns.set_style('whitegrid')
sns.set_palette(sns.color_palette('muted'))

pd.options.display.max_colwidth = 30
pd.options.display.float_format = '{:,.3f}'.format

py.offline.init_notebook_mode(connected=True)

## Load the data

In [2]:
movies = pd.read_csv('data/movies_clean.csv')
ratings = pd.read_csv('data/ratings_train.csv')
my_ratings = pd.read_csv('data/movielens-ratings.csv') # https://movielens.org/profile/settings/import-export

## Find a set of popular movies for clustering

In [3]:
# find a set of popular movies that the current user rated to use as a basis for the clustering
top_movies_count = 10
popular_movie_ids = ratings.movie_id.value_counts().head(top_movies_count).index.tolist()
my_movie_ids = my_ratings.movie_id.tolist()
clustering_movie_ids = list(set(popular_movie_ids).intersection(set(my_movie_ids)))
print(len(clustering_movie_ids))
movies[movies.movie_id.isin(clustering_movie_ids)][['movie_id', 'title']]

7


Unnamed: 0,movie_id,title
257,260,Star Wars: Episode IV - A ...
293,296,Pulp Fiction (1994)
315,318,"Shawshank Redemption, The ..."
476,480,Jurassic Park (1993)
583,589,Terminator 2: Judgment Day...
587,593,"Silence of the Lambs, The ..."
2486,2571,"Matrix, The (1999)"


## Find users who rated the same movies

In [4]:
clustering_movie_ratings = ratings[ratings.movie_id.isin(clustering_movie_ids)]
clustering_movie_rating_counts = clustering_movie_ratings.groupby(['user_id']).movie_id.count()
clustering_user_ids = clustering_movie_rating_counts[clustering_movie_rating_counts == len(clustering_movie_ids)].index.tolist()
clustering_ratings = clustering_movie_ratings[clustering_movie_ratings.user_id.isin(clustering_user_ids)]
print('number of users: ', len(clustering_user_ids))
print('number of ratings: ', len(clustering_ratings))

number of users:  2355
number of ratings:  16485


## Prepare the data for clustering

In [5]:
# append the current user's ratings to the ratings data set
my_ratings['user_id'] = -1
my_ratings = my_ratings[my_ratings.movie_id.isin(clustering_movie_ids)][['user_id', 'movie_id', 'rating']]
clustering_ratings = clustering_ratings[['user_id', 'movie_id', 'rating']]
clustering_ratings = pd.concat([clustering_ratings, my_ratings])

In [6]:
# create rectangular ratings matrix
clustering_ratings = clustering_ratings.sort_values(by=['user_id', 'movie_id'])
r = clustering_ratings.pivot(index='user_id', columns='movie_id')
r.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2356 entries, -1 to 138387
Data columns (total 7 columns):
(rating, 260)     2356 non-null float64
(rating, 296)     2356 non-null float64
(rating, 318)     2356 non-null float64
(rating, 480)     2356 non-null float64
(rating, 589)     2356 non-null float64
(rating, 593)     2356 non-null float64
(rating, 2571)    2356 non-null float64
dtypes: float64(7)
memory usage: 147.2 KB


In [7]:
r.head(10)

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating
movie_id,260,296,318,480,589,593,2571
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
-1,4.5,5.0,3.0,4.0,4.5,5.0,5.0
90,3.5,3.5,4.0,3.5,3.0,3.5,2.0
91,4.5,3.5,4.0,3.5,2.5,3.0,3.5
124,3.0,5.0,5.0,2.0,4.0,5.0,5.0
208,4.5,5.0,4.5,4.5,2.5,4.5,3.0
237,5.0,5.0,5.0,4.5,4.5,4.0,5.0
359,5.0,5.0,5.0,4.0,5.0,5.0,3.0
546,3.0,5.0,4.0,3.0,3.5,3.5,4.0
600,5.0,1.0,4.0,4.0,5.0,5.0,4.0
609,4.5,5.0,3.5,4.0,2.5,3.0,5.0


In [8]:
# normalize ratings (note: this is incorrect, the average of all movie ratings should be used here, but this is good enough for now)
rv = r.values
rv -= (rv.sum(axis=1)[:, np.newaxis] / 12.0)
rv.shape

(2356, 7)

## Clustering and visualization

In [9]:
# run the K-means clustering
n_clusters = 8
kmeans = sklearn.cluster.KMeans(n_clusters=n_clusters).fit(rv)
print('cluster label for the current user: ', kmeans.labels_[0])

cluster label for the current user:  7


In [10]:
# PCA only for visualization to project the data to 3D
pca = sklearn.decomposition.PCA(n_components=3)
rv_pca = pca.fit_transform(rv)
print('explained variane ratio with 3 dimensions: ', sum(pca.explained_variance_ratio_))

explained variane ratio with 3 dimensions:  0.5442367733905356


In [11]:
data = [
    py.graph_objs.Scatter3d(
    x=rv_pca[:, 0],
    y=rv_pca[:, 1],
    z=rv_pca[:, 2],
    mode='markers',
    marker={
        'size': 3,
        'color': kmeans.labels_,
        'colorscale': 'Rainbow',
        'line': {'width': 0.5}}),
    py.graph_objs.Scatter3d(
    x=[rv_pca[0, 0]],
    y=[rv_pca[0, 1]],
    z=[rv_pca[0, 2]],
    mode='markers',
    marker={
        'size': 10,
        'color': 'black'})]
layout = py.graph_objs.Layout(
    autosize=False,
    width=1200,
    height=900,
    margin={'l': 0, 'r': 0, 't': 0, 'b': 0},
    showlegend=False)
fig = py.graph_objs.Figure(data=data, layout=layout)
py.offline.iplot(fig, {'xaxis': {'range': [-1, 1]}}, config={'displayModeBar': False})

## Nearest neighbors

In [12]:
neigh = sklearn.neighbors.NearestNeighbors(n_neighbors=11)
neigh.fit(rv)
neighbors = neigh.kneighbors(rv[0:,:])[1][0]
neighbor_ids = []
for neighbor in neighbors:
    neighbor_ids.append(list(r.index)[neighbor])
r[r.index.isin(neighbor_ids)]

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating
movie_id,260,296,318,480,589,593,2571
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
-1,1.917,2.417,0.417,1.417,1.917,2.417,2.417
15814,1.5,2.5,0.5,1.5,1.5,2.5,2.5
16118,1.917,1.917,0.917,1.917,1.917,1.917,2.417
28933,2.292,2.292,0.792,1.792,2.292,2.292,1.792
49865,2.333,2.333,0.833,1.333,2.333,1.833,2.333
68932,2.417,2.417,0.417,1.417,1.417,2.417,2.417
88785,1.5,2.0,1.0,1.5,2.0,2.0,2.5
98412,1.958,2.458,0.458,0.958,2.458,1.958,2.458
106476,1.458,2.458,0.958,1.458,1.458,2.458,2.458
117716,2.417,2.417,0.417,1.417,1.417,2.417,2.417
