In [1]:
# general purpose python
import collections
import datetime
import glob
import importlib
import itertools
import json
import math
import os
import pickle
import random
import re
import shutil
import sys
import time
import warnings

# general purpose data science
import IPython
import ipywidgets as ipw
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import download_plotlyjs
import pylab
import scipy
import seaborn as sns
import sklearn
from sklearn import *
import statsmodels as sm

# computer vision
import cv2
import imageio
import PIL
from PIL import *

# deep learning
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision

# geospatial
import rasterio as rio
import rasterio.features

warnings.filterwarnings('ignore')

np.random.seed(1337)

mpl.rcParams['figure.dpi'] = 400

IPython.core.display.display(IPython.core.display.HTML("<style>.container { width:100% !important; }</style>"))

pd.options.display.max_colwidth = 32
pd.options.display.float_format = '{:,.6f}'.format
pd.options.display.expand_frame_repr = False

%matplotlib inline

sns.set(font_scale=1.3)
sns.set_style('whitegrid')
sns.set_palette(sns.color_palette('muted'))

plotly.offline.init_notebook_mode(connected=True)
plotly.io.templates.default = 'plotly_white'

In [2]:
# Directory containing the MovieLens CSV files read by the loading cell below.
# The path is relative, so it is resolved against the kernel's working directory.
data_dir = '../../data/movielens/'

## Load the data

In [3]:
# `data_dir` already ends with '/', so the paths built here contain a doubled separator ('//').
# Later cells read `movie_id` and `title` from `movies`, `user_id`, `movie_id` and `rating`
# from `ratings`, and `movie_id` and `rating` from `my_ratings`.
movies = pd.read_csv(f'{data_dir}/movies_clean.csv')
ratings = pd.read_csv(f'{data_dir}/ratings_train.csv')
my_ratings = pd.read_csv(f'{data_dir}/movielens-ratings.csv') # https://movielens.org/profile/settings/import-export

## Parameters

In [4]:
# A user is kept only if they have strictly more than this many ratings.
min_user_ratings = 20
# A movie is kept only if it has strictly more than this many ratings (counted after the user filter).
min_movie_ratings = 30

# Two movies are compared only if strictly more than this many users rated both.
min_overlap = 20

# Fraction of the length of `my_ratings` used as the number of top-rated movies to seed recommendations.
prop_top_ratings_used = 0.3

# Number of neighbours requested per seed movie (the recommendation loop asks for one extra).
n_similar_movies = 15

## Some data processing

In [5]:
# normalize ratings: subtract each user's mean rating, so a value expresses how much the user
# liked a movie relative to their own average.
# Only the `rating` column is transformed. A frame-wide `groupby(...).transform(lambda ...)` would
# mean-center every other column per user in Python before discarding the result, which is slow
# and fails as soon as the frame holds a non-numeric column.
ratings['rating'] = ratings['rating'] - ratings.groupby('user_id')['rating'].transform('mean')

In [6]:
# Drop sparse users first, then sparse movies; the movie counts are taken after the user filter.
# Both thresholds are strict: an entity with exactly `min_*_ratings` ratings is removed.
user_rating_counts = ratings.groupby('user_id')['movie_id'].count()
user_ids = user_rating_counts.loc[user_rating_counts.gt(min_user_ratings)].index.tolist()
ratings = ratings.loc[ratings['user_id'].isin(user_ids)]

movie_rating_counts = ratings.groupby('movie_id')['user_id'].count()
movie_ids = movie_rating_counts.loc[movie_rating_counts.gt(min_movie_ratings)].index.tolist()
ratings = ratings.loc[ratings['movie_id'].isin(movie_ids)].copy()

# Re-derive the id arrays from the surviving ratings (removing movies may have removed users' last ratings).
user_ids = ratings['user_id'].unique()
movie_ids = ratings['movie_id'].unique()
print('number of users selected:', len(user_ids))
print('number of movies selected:', len(movie_ids))

# Keep the movie metadata in sync with the filtered ratings.
movies = movies.loc[movies['movie_id'].isin(movie_ids)].copy()

number of users selected: 120669
number of movies selected: 11308


## Compute similarity matrix

In [7]:
# Matrix rows/columns need dense zero-based indices, so every user and movie gets a
# surrogate id ("sid") equal to its position in `user_ids` / `movie_ids`.
user_sid_to_id = dict(enumerate(user_ids))
user_id_to_sid = {user_id: sid for sid, user_id in user_sid_to_id.items()}
ratings['user_sid'] = ratings['user_id'].map(user_id_to_sid)

movie_sid_to_id = dict(enumerate(movie_ids))
movie_id_to_sid = {movie_id: sid for sid, movie_id in movie_sid_to_id.items()}
ratings['movie_sid'] = ratings['movie_id'].map(movie_id_to_sid)

# Constant indicator column, used to build the "has rated" matrix.
ratings['rating_binary'] = 1

movies['movie_sid'] = movies['movie_id'].map(movie_id_to_sid)

In [8]:
# Sparse users x movies matrix holding the (mean-centred) rating values.
r = scipy.sparse.csr_matrix(
    (
        ratings['rating'].astype(np.float32).to_numpy(),
        (
            ratings['user_sid'].astype(np.int32).to_numpy(),
            ratings['movie_sid'].astype(np.int32).to_numpy(),
        ),
    )
)
r.shape

(120669, 11308)

In [9]:
# Sparse users x movies indicator matrix (1 where a rating exists); used to count rating overlaps.
r_bin = scipy.sparse.csr_matrix(
    (
        ratings['rating_binary'].astype(np.int32).to_numpy(),
        (
            ratings['user_sid'].astype(np.int32).to_numpy(),
            ratings['movie_sid'].astype(np.int32).to_numpy(),
        ),
    )
)
r_bin.shape

(120669, 11308)

In [10]:
# pre-computed cosine similarity matrix to speed things up
# `r.T` has one row per movie, so `s[i, j]` is the cosine similarity between the rating
# columns of movies with sids i and j. The result is a movies x movies matrix.
s = sklearn.metrics.pairwise.cosine_similarity(r.T)
s.shape

(11308, 11308)

In [11]:
# Overlap mask: entry (i, j) is 1 when strictly more than `min_overlap` users rated both movies,
# so similarities are only trusted between movies with enough votes in common.
# `r_bin.T @ r_bin` counts, for every movie pair, the users who rated both.
s_bin = ((r_bin.T @ r_bin).tocsr() > min_overlap).astype(int)
s_bin.shape

(11308, 11308)

## Candidate selection

In [12]:
# finding similar movies based on collaborative filtering
def similar_movies(source_movie_id, n_results=11, min_similarity=0.0):
    """Return the movies most similar to ``source_movie_id``.

    Similarities are read from the precomputed matrix ``s``. A movie is eligible only when
    ``s_bin`` marks it as having enough rating overlap with the source movie AND its
    similarity is at least ``min_similarity``. Movies that fail either check are never
    returned, even when fewer than ``n_results`` movies qualify.

    The source movie itself is not removed: it is returned whenever its own entry passes
    the same checks, which is why callers may ask for one result more than they need.

    Parameters
    ----------
    source_movie_id
        Movie id; must be a key of ``movie_id_to_sid`` and present in ``movies``.
    n_results : int
        Maximum number of rows to return. Values larger than the number of eligible
        movies are allowed and simply return every eligible movie.
    min_similarity : float
        Lowest similarity a movie may have to be returned.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``movie_id``, ``title``, ``similar_to``, ``similarity``, sorted by
        decreasing similarity; ``None`` when no movie qualifies or ``n_results <= 0``.

    Raises
    ------
    IndexError
        If a movie id has no row in ``movies``.
    KeyError
        If ``source_movie_id`` is not in ``movie_id_to_sid``.
    """
    source_title = movies.loc[movies.movie_id == source_movie_id, 'title'].tolist()[0]
    print('finding similar movies to: ', source_title)

    source_movie_sid = movie_id_to_sid[source_movie_id]

    # Similarities of the source movie to every movie, and the matching overlap mask.
    row = np.asarray(s[source_movie_sid, :]).ravel()
    has_overlap = np.asarray(s_bin[source_movie_sid, :].todense()).ravel() != 0

    # Select on the mask itself rather than on zeroed-out similarities, so that a movie
    # without enough overlap can never be picked and reported with its raw similarity.
    eligible_sids = np.flatnonzero(has_overlap & (row >= min_similarity))
    if n_results <= 0 or eligible_sids.size == 0:
        return None

    # Slicing after a full sort tolerates n_results larger than the number of eligible movies.
    best_first = np.argsort(-row[eligible_sids], kind='stable')[:n_results]
    top_sids = eligible_sids[best_first]

    # Titles are looked up by column name instead of by positional column index.
    top_movie_ids = [movie_sid_to_id[movie_sid] for movie_sid in top_sids]
    candidates = pd.DataFrame({
        'movie_id': top_movie_ids,
        'title': [movies.loc[movies.movie_id == movie_id, 'title'].tolist()[0]
                  for movie_id in top_movie_ids],
        'similar_to': source_title,
        'similarity': row[top_sids],
    })
    return candidates[['movie_id', 'title', 'similar_to', 'similarity']]

## Recommendation

In [13]:
# Seed movies: the user's best-rated movies that survived the catalogue filter.
# The number kept is a share of ALL of the user's ratings (`len(my_ratings)`),
# not of the ratings left after filtering by `movie_ids`.
source_movie_ids = (
    my_ratings
    .loc[my_ratings['movie_id'].isin(movie_ids)]
    .sort_values('rating', ascending=False)
    .iloc[:int(len(my_ratings) * prop_top_ratings_used)]
    .loc[:, 'movie_id']
    .tolist()
)
print('number of source movies: ', len(source_movie_ids))

number of source movies:  64


In [14]:
# collect all similar movies to the source movies and filter out duplicates
# The per-movie frames are gathered in a list and concatenated once; concatenating inside the
# loop re-copies the accumulated frame on every iteration.
candidate_frames = []
for movie_id in source_movie_ids:
    candidates = similar_movies(movie_id, n_results=n_similar_movies+1)
    if candidates is not None:
        candidate_frames.append(candidates)

if candidate_frames:
    recs = pd.concat(candidate_frames)
else:
    # No source movie produced candidates: fall back to an empty, correctly typed frame so the
    # following steps run instead of failing on `None`.
    recs = pd.DataFrame({
        'movie_id': pd.Series(dtype='int64'),
        'title': pd.Series(dtype='object'),
        'similar_to': pd.Series(dtype='object'),
        'similarity': pd.Series(dtype='float64'),
    })

# Drop movies the user has already rated, then keep each remaining movie once, attributed to the
# source movie it is most similar to.
recs = (
    recs[~recs.movie_id.isin(my_ratings.movie_id.tolist())]
    .sort_values(by='similarity', ascending=False)
    .drop_duplicates(subset='movie_id', keep='first')
)

finding similar movies to:  Heat (1995)
finding similar movies to:  Lord of the Rings: The Return of the King, The (2003)
finding similar movies to:  Godfather: Part II, The (1974)
finding similar movies to:  Once Upon a Time in the West (C'era una volta il West) (1968)
finding similar movies to:  Lord of the Rings: The Fellowship of the Ring, The (2001)
finding similar movies to:  Good, the Bad and the Ugly, The (Buono, il brutto, il cattivo, Il) (1966)
finding similar movies to:  Aliens (1986)
finding similar movies to:  Star Wars: Episode V - The Empire Strikes Back (1980)
finding similar movies to:  Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)
finding similar movies to:  Die Hard (1988)
finding similar movies to:  Lord of the Rings: The Two Towers, The (2002)
finding similar movies to:  Godfather, The (1972)
finding similar movies to:  Kill Bill: Vol. 1 (2003)
finding similar movies to:  Fargo (1996)
finding similar movies to:  Psycho (1960)
finding similar movies to:  Sile

In [15]:
# Ranking features per recommended movie: how many (filtered) ratings it has and its mean
# rating. `ratings.rating` is the per-user mean-centred value at this point.
rating_count = (
    ratings.loc[ratings['movie_id'].isin(recs['movie_id'].tolist())]
    .groupby('movie_id')['rating']
    .count()
    .rename('rating_count')
    .to_frame()
)
recs = pd.merge(recs, rating_count, left_on='movie_id', right_index=True)

rating_average = (
    ratings.loc[ratings['movie_id'].isin(recs['movie_id'].tolist())]
    .groupby('movie_id')['rating']
    .mean()
    .rename('rating_avg')
    .to_frame()
)
recs = pd.merge(recs, rating_average, left_on='movie_id', right_index=True)

# Rank 1 is the largest value of each feature.
recs = recs.assign(
    similarity_rank=recs['similarity'].rank(ascending=False),
    rating_count_rank=recs['rating_count'].rank(ascending=False),
    rating_avg_rank=recs['rating_avg'].rank(ascending=False),
)

In [16]:
# Weighted sum of the three ranks; rank 1 is best for every feature, so a LOWER score is better.
# Similarity weighs most, then average rating, then popularity.
recs['score'] = (
    recs['similarity_rank'].mul(3.0)
    .add(recs['rating_avg_rank'].mul(1.0))
    .add(recs['rating_count_rank'].mul(0.5))
)
recs.sort_values('score').iloc[:50]

Unnamed: 0,movie_id,title,similar_to,similarity,rating_count,rating_avg,similarity_rank,rating_count_rank,rating_avg_rank,score
3,1198,Raiders of the Lost Ark (Ind...,Star Wars: Episode V - The E...,0.399178,34093,0.574961,1.0,6.0,13.0,19.0
8,527,Schindler's List (1993),"Silence of the Lambs, The (1...",0.269636,38334,0.672684,10.0,3.0,1.0,32.5
5,2959,Fight Club (1999),"Matrix, The (1999)",0.280084,31128,0.598617,9.0,8.0,7.0,38.0
5,912,Casablanca (1942),Rear Window (1954),0.29212,18957,0.603931,8.0,19.0,5.0,38.5
3,908,North by Northwest (1959),Rear Window (1954),0.372289,12197,0.584421,3.0,42.0,10.0,40.0
2,1213,Goodfellas (1990),"Godfather: Part II, The (1974)",0.332327,20854,0.540926,4.0,17.0,22.0,42.5
7,750,Dr. Strangelove or: How I Le...,Apocalypse Now (1979),0.25782,18232,0.607073,11.0,23.0,3.0,47.5
6,903,Vertigo (1958),Rear Window (1954),0.38113,11013,0.513425,2.0,49.0,30.0,60.5
4,1089,Reservoir Dogs (1992),Pulp Fiction (1994),0.306786,22008,0.471912,6.0,14.0,42.0,67.0
4,913,"Maltese Falcon, The (1941)",Rear Window (1954),0.255989,9415,0.549021,12.0,55.0,19.0,82.5
