In [1]:
import IPython
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly as py
from plotly.offline import download_plotlyjs
import pylab
import scipy
import seaborn as sns
import sklearn
from sklearn import *
import statsmodels as sm

# Notebook-wide configuration: RNG seed, inline plotting, page width, plot
# theme, pandas display options and plotly notebook mode.

# Seed numpy's global random generator so runs are repeatable.
np.random.seed(1337)

# Render matplotlib figures inside the notebook output cells.
%matplotlib inline

# Inject a CSS rule that widens the notebook container to 90% of the page.
# NOTE(review): newer IPython releases expose display/HTML through
# `IPython.display`; confirm `IPython.core.display.display` still exists in
# the IPython version this notebook targets.
IPython.core.display.display(IPython.core.display.HTML(
    "<style>.container { width:90% !important; }</style>"))

# Seaborn theme. Order matters: sns.set() applies the defaults (with a larger
# font scale), then the grid style and the colour palette are overridden.
sns.set(font_scale=1.3)
sns.set_style('whitegrid')
sns.set_palette(sns.color_palette('muted'))

# pandas display: cap the column width at 30 and print floats with a
# thousands separator and three decimals.
pd.options.display.max_colwidth = 30
pd.options.display.float_format = '{:,.3f}'.format

# `py` is plotly; `py.offline` is reachable because plotly.offline is imported
# above. NOTE(review): connected=True presumably loads plotly.js from a CDN
# rather than embedding it in the notebook -- confirm against the plotly docs.
py.offline.init_notebook_mode(connected=True)

## Load the data

In [2]:
# Read the three input tables in one pass:
#   movies     - movie catalogue
#   ratings    - training ratings
#   my_ratings - the notebook owner's own ratings, exported from
#                https://movielens.org/profile/settings/import-export
movies, ratings, my_ratings = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/movies_clean.csv',
        'data/ratings_train.csv',
        'data/movielens-ratings.csv',
    )
)

## Some data processing

In [3]:
# normalize ratings: centre every rating on the mean rating of the user who
# gave it, so generous and harsh raters become comparable.
# Only the `rating` column is transformed, and the built-in 'mean' reducer is
# used: this is vectorised and does not touch (or choke on) the other columns.
ratings['rating'] = ratings['rating'] - ratings.groupby('user_id')['rating'].transform('mean')

In [4]:
# Filter out users and movies with too few ratings.
# The comparisons are strict: a user is kept only with MORE than
# `min_user_ratings` ratings, a movie only with MORE than `min_movie_ratings`.
min_user_ratings = 20
min_movie_ratings = 30

# Pass 1 - users, counted over the full rating table.
user_rating_counts = ratings.groupby('user_id')['movie_id'].count()
user_ids = user_rating_counts.index[(user_rating_counts > min_user_ratings).values].tolist()
ratings = ratings[ratings['user_id'].isin(user_ids)]

# Pass 2 - movies, counted over the ratings that survived pass 1.
movie_rating_counts = ratings.groupby('movie_id')['user_id'].count()
movie_ids = movie_rating_counts.index[(movie_rating_counts > min_movie_ratings).values].tolist()
ratings = ratings[ratings['movie_id'].isin(movie_ids)].copy()

# The final id lists are whatever is still present after both passes.
user_ids, movie_ids = ratings['user_id'].unique(), ratings['movie_id'].unique()
print('number of users selected:', len(user_ids))
print('number of movies selected:', len(movie_ids))

# Restrict the movie catalogue to the movies that are still rated.
movies = movies[movies['movie_id'].isin(movie_ids)].copy()

number of users selected: 120669
number of movies selected: 11308


## Compute similarity matrix

In [5]:
# The sparse matrix needs dense, zero-based row/column indices, so every
# selected user and movie gets a sequential id ("sid") in order of appearance.
# Both directions of each mapping are kept.
user_sid_to_id = dict(enumerate(user_ids))
user_id_to_sid = {user_id: sid for sid, user_id in user_sid_to_id.items()}
ratings['user_sid'] = ratings['user_id'].map(user_id_to_sid)

movie_sid_to_id = dict(enumerate(movie_ids))
movie_id_to_sid = {movie_id: sid for sid, movie_id in movie_sid_to_id.items()}
ratings['movie_sid'] = ratings['movie_id'].map(movie_id_to_sid)

# The catalogue carries the same movie sid so it can be joined on it.
movies['movie_sid'] = movies['movie_id'].map(movie_id_to_sid)

In [6]:
# Raw rating matrix in CSR form: one row per user sid, one column per movie
# sid, values are the ratings as float32. The shape is inferred from the
# largest sid on each axis.
r = scipy.sparse.csr_matrix(
    (
        ratings['rating'].astype(np.float32).values,
        (
            ratings['user_sid'].astype(np.int32).values,
            ratings['movie_sid'].astype(np.int32).values,
        ),
    )
)
r.shape

(120669, 11308)

In [7]:
# Item-item cosine similarity. Transposing the user x movie matrix puts the
# movies on the rows, so `s[i, j]` is the similarity between movie sids i and j.
s = sklearn.metrics.pairwise.cosine_similarity(r.transpose())
s.shape

(11308, 11308)

## Candidate selection

In [8]:
# finding similar movies based on collaborative filtering
def similar_movies(source_movie_id, n_results=11, min_similarity=0.0):
    """Return the movies most similar to ``source_movie_id``.

    Similarities are read from the module-level item-item matrix ``s``; ids
    are translated through ``movie_id_to_sid`` / ``movie_sid_to_id`` and
    titles come from the ``movies`` frame.

    Parameters
    ----------
    source_movie_id : movie id (not sid) of the movie to find neighbours for.
    n_results : maximum number of rows to return. Values larger than the
        number of known movies are clamped instead of raising. The source
        movie is not excluded, so it normally occupies one of the rows
        (presumably why the default is 11 rather than 10).
    min_similarity : candidates whose similarity is below this threshold are
        dropped.

    Returns
    -------
    A DataFrame with the columns ``movie_id``, ``title``, ``similar_to`` and
    ``similarity``, sorted by descending similarity, or ``None`` when no
    candidate is left. A candidate that is missing from ``movies`` gets a
    missing title instead of aborting the whole call.

    Raises
    ------
    IndexError : ``source_movie_id`` is not present in ``movies``.
    KeyError : ``source_movie_id`` is not present in ``movie_id_to_sid``.
    """
    # look the source title up once; it is reused for every result row
    source_title = movies.loc[movies.movie_id == source_movie_id, 'title'].tolist()[0]
    print('finding similar movies to: ', source_title)

    # find most similar movies
    source_movie_sid = movie_id_to_sid[source_movie_id]
    similarities = s[source_movie_sid, :]
    n_movies = similarities.shape[0]
    n_top = max(0, min(int(n_results), n_movies))
    if n_top == 0:
        return None
    if n_top < n_movies:
        # partial sort: the first n_top positions hold the n_top largest
        # similarities (in arbitrary order); they are sorted further below
        top_sids = np.argpartition(-similarities, n_top - 1)[:n_top]
    else:
        # argpartition rejects kth >= size, and there is nothing to cut anyway
        top_sids = np.arange(n_movies)

    # build results
    candidates = pd.DataFrame({
        'movie_id': [movie_sid_to_id[movie_sid] for movie_sid in top_sids],
        'similarity': similarities[top_sids]})
    # resolve titles by column name rather than by column position
    title_by_id = movies.drop_duplicates(subset='movie_id').set_index('movie_id').title
    candidates['title'] = candidates.movie_id.map(title_by_id)
    candidates['similar_to'] = source_title

    # honour the similarity threshold
    candidates = candidates[candidates.similarity >= min_similarity]

    # return results
    if len(candidates):
        return candidates.sort_values(by='similarity', ascending=False)[['movie_id', 'title', 'similar_to', 'similarity']]
    else:
        return None

## Recommendation

In [9]:
# Pick the user's top-rated movies as seeds for the recommendations.
# The number of seeds is 20% of ALL rows in `my_ratings`; the seeds themselves
# are drawn only from the rated movies that survived the filtering above.
source_movie_ids = (
    my_ratings[my_ratings['movie_id'].isin(movie_ids)]
    .sort_values(by='rating', ascending=False)
    .head(len(my_ratings) // 5)
    .movie_id
    .tolist()
)
print('number of source movies: ', len(source_movie_ids))

number of source movies:  42


In [10]:
# Collect the neighbours of every source movie, then drop the movies the user
# has already rated and keep each remaining movie once (its best similarity).
# The per-movie frames are gathered in a list and concatenated a single time;
# concatenating inside the loop would re-copy the accumulated frame each turn.
candidate_frames = []
for movie_id in source_movie_ids:
    candidates = similar_movies(movie_id, n_results=16)
    if candidates is not None:
        candidate_frames.append(candidates)

if candidate_frames:
    recs = pd.concat(candidate_frames)
else:
    # no source movie produced candidates: keep `recs` a (schema-compatible)
    # empty frame so the cells below do not fail on None
    recs = pd.DataFrame(columns=['movie_id', 'title', 'similar_to', 'similarity'])

recs = (
    recs[~recs.movie_id.isin(my_ratings.movie_id.tolist())]
    .sort_values(by='similarity', ascending=False)
    .drop_duplicates(subset='movie_id', keep='first')
)

finding similar movies to:  Heat (1995)
finding similar movies to:  Lord of the Rings: The Return of the King, The (2003)
finding similar movies to:  Godfather: Part II, The (1974)
finding similar movies to:  Once Upon a Time in the West (C'era una volta il West) (1968)
finding similar movies to:  Lord of the Rings: The Fellowship of the Ring, The (2001)
finding similar movies to:  Good, the Bad and the Ugly, The (Buono, il brutto, il cattivo, Il) (1966)
finding similar movies to:  Aliens (1986)
finding similar movies to:  Star Wars: Episode V - The Empire Strikes Back (1980)
finding similar movies to:  Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)
finding similar movies to:  Die Hard (1988)
finding similar movies to:  Lord of the Rings: The Two Towers, The (2002)
finding similar movies to:  Godfather, The (1972)
finding similar movies to:  Kill Bill: Vol. 1 (2003)
finding similar movies to:  Fargo (1996)
finding similar movies to:  Psycho (1960)
finding similar movies to:  Sile

In [11]:
# Extra per-movie signals for ranking: how many ratings each recommended movie
# received and its average rating. `ratings.rating` has been centred per user
# above, so the average is an average of centred ratings.
rating_count = (
    ratings[ratings['movie_id'].isin(recs['movie_id'].tolist())]
    .groupby('movie_id')['rating']
    .count()
    .to_frame(name='rating_count')
)
recs = recs.merge(rating_count, left_on='movie_id', right_index=True)

rating_average = (
    ratings[ratings['movie_id'].isin(recs['movie_id'].tolist())]
    .groupby('movie_id')['rating']
    .mean()
    .to_frame(name='rating_avg')
)
recs = recs.merge(rating_average, left_on='movie_id', right_index=True)

# Turn each signal into a rank where 1 is the best (largest) value.
for signal in ('similarity', 'rating_count', 'rating_avg'):
    recs[signal + '_rank'] = recs[signal].rank(ascending=False)

In [12]:
# Very simple ranking: a weighted sum of the three ranks, where a LOWER score
# is better. Similarity weighs most (3.0), then the average rating (1.0), then
# the rating count (0.5).
recs['score'] = (
    recs['similarity_rank'].mul(3.0)
    .add(recs['rating_avg_rank'].mul(1.0))
    .add(recs['rating_count_rank'].mul(0.5))
)
recs.sort_values(by='score', ascending=True).iloc[:50]

Unnamed: 0,movie_id,title,similar_to,similarity,rating_count,rating_avg,similarity_rank,rating_count_rank,rating_avg_rank,score
1,1198,Raiders of the Lost Ark (I...,Star Wars: Episode V - The...,0.399,34093,0.575,1.0,6.0,11.0,17.0
9,527,Schindler's List (1993),"Silence of the Lambs, The ...",0.27,38334,0.673,8.0,3.0,1.0,26.5
10,2959,Fight Club (1999),"Matrix, The (1999)",0.28,31128,0.599,6.0,8.0,6.0,28.0
2,1213,Goodfellas (1990),"Godfather: Part II, The (1...",0.332,20854,0.541,2.0,17.0,16.0,30.5
3,912,Casablanca (1942),"Godfather, The (1972)",0.245,18957,0.604,10.0,18.0,5.0,44.0
5,1089,Reservoir Dogs (1992),Pulp Fiction (1994),0.307,22008,0.472,4.0,14.0,31.0,50.0
6,908,North by Northwest (1959),Psycho (1960),0.246,12197,0.584,9.0,33.0,9.0,52.5
12,750,Dr. Strangelove or: How I ...,"Godfather, The (1972)",0.237,18232,0.607,13.0,21.0,3.0,52.5
8,1197,"Princess Bride, The (1987)",Star Wars: Episode V - The...,0.239,25735,0.539,12.0,12.0,17.0,59.0
12,903,Vertigo (1958),Psycho (1960),0.275,11013,0.513,7.0,38.0,22.0,62.0
