In [None]:
pip install scikit-surprise 

In [None]:
pip install lightfm

In [None]:
#pip install --upgrade numpy

In [1]:
# Libraries Used
import os
import warnings
import numpy as np
import pandas as pd
from math import sqrt
from sklearn.feature_extraction.text import TfidfVectorizer
from itertools import combinations
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import pairwise_distances,cosine_similarity
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from scipy.sparse.linalg import svds
#from surprise import Dataset, KNNBasic, KNNWithMeans, KNNWithZScore, Reader, accuracy
#from surprise.model_selection import train_test_split,GridSearchCV
#from surprise.model_selection.validation import cross_validate
#from lightfm import LightFM
#from lightfm.evaluation import precision_at_k, auc_score
#from lightfm.datasets import fetch_movielens
warnings.filterwarnings('ignore')
%matplotlib inline

## Data

In [2]:
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")
#tags = pd.read_csv("tags.csv")

In [3]:
display(movies.head(5))
display(ratings.head(5))
#display(tags.head(5))

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
display(movies.info())
display(ratings.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


None

No null values in any of the columns in both datasets.

In [5]:
display(ratings['rating'].describe())

count    100836.000000
mean          3.501557
std           1.042529
min           0.500000
25%           3.000000
50%           3.500000
75%           4.000000
max           5.000000
Name: rating, dtype: float64

On average, most users have rated 3.5 to movies.

## Content Based Filtering

### Motivation

We can try to recommend movies based on the similarity of genres, since we can assume that if a person likes a particular movie of a genre, say "Thriller", it is only reasonable to recommend them another movie of that genre. 

This concepts works even better when we take combinations of genres. For example, if a perosn likes a movie that has "Adventure", "Children" and "Fantasy" in its genre, then another movie having these 3 genres (a perfect match in genres) would seem to be a perfect recommendation.

### Aim

We would like to obtain a vector representation for each movie (something numeric) so that we can compare if 2 movies are similar or not. We would also like that vector representation to capture the importance of having a rare genre, i.e, if a person likes a genre that is not very common, finding a another movie with that genre would make a much better match than to find movies with other genre matches. 

*A way to explain this is I Google "The Queen and the King of the Southern Islands", Google should put more emphasis on words like "Queen", "King", and "Southern", "Islands", rather that the word with highest frequency, which is "the".*

We make use of TF-IDF for this vector representation. TF-IDF will put emphasis on the rare genres, the ones that lesser movies contain. Further, we use the cosine similarity to give us the recommendations based on a movie we have watched. 

In [None]:
tf = TfidfVectorizer(analyzer=lambda s: (c for i in range(1,4)
                                             for c in combinations(s.split('|'), r=i)))
tfidf_matrix = tf.fit_transform(movies['genres'])

pd.DataFrame(tfidf_matrix.todense(), columns=tf.get_feature_names(), index=movies.title).head()

Unnamed: 0_level_0,"((no genres listed),)","(Action,)","(Action, Adventure)","(Action, Adventure, Animation)","(Action, Adventure, Children)","(Action, Adventure, Comedy)","(Action, Adventure, Crime)","(Action, Adventure, Documentary)","(Action, Adventure, Drama)","(Action, Adventure, Fantasy)","(Action, Adventure, Film-Noir)","(Action, Adventure, Horror)","(Action, Adventure, IMAX)","(Action, Adventure, Mystery)","(Action, Adventure, Romance)","(Action, Adventure, Sci-Fi)","(Action, Adventure, Thriller)","(Action, Adventure, War)","(Action, Adventure, Western)","(Action, Animation)","(Action, Animation, Children)","(Action, Animation, Comedy)","(Action, Animation, Crime)","(Action, Animation, Drama)","(Action, Animation, Fantasy)","(Action, Animation, Film-Noir)","(Action, Animation, Horror)","(Action, Animation, IMAX)","(Action, Animation, Musical)","(Action, Animation, Mystery)","(Action, Animation, Romance)","(Action, Animation, Sci-Fi)","(Action, Animation, Thriller)","(Action, Animation, Western)","(Action, Children)","(Action, Children, Comedy)","(Action, Children, Crime)","(Action, Children, Drama)","(Action, Children, Fantasy)","(Action, Children, IMAX)",...,"(Mystery, Sci-Fi, IMAX)","(Mystery, Sci-Fi, Thriller)","(Mystery, Thriller)","(Mystery, Thriller, IMAX)","(Mystery, Thriller, War)","(Mystery, Thriller, Western)","(Mystery, War)","(Mystery, Western)","(Romance,)","(Romance, IMAX)","(Romance, Sci-Fi)","(Romance, Sci-Fi, IMAX)","(Romance, Sci-Fi, Thriller)","(Romance, Thriller)","(Romance, Thriller, IMAX)","(Romance, Thriller, War)","(Romance, Thriller, Western)","(Romance, War)","(Romance, War, Western)","(Romance, Western)","(Sci-Fi,)","(Sci-Fi, IMAX)","(Sci-Fi, Thriller)","(Sci-Fi, Thriller, IMAX)","(Sci-Fi, Thriller, War)","(Sci-Fi, Thriller, Western)","(Sci-Fi, War)","(Sci-Fi, War, IMAX)","(Sci-Fi, Western)","(Sci-Fi, Western, IMAX)","(Thriller,)","(Thriller, IMAX)","(Thriller, War)","(Thriller, Western)","(Thriller, Western, IMAX)","(War,)","(War, IMAX)","(War, Western)","(Western,)","(Western, IMAX)"
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
Toy Story (1995),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Jumanji (1995),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Grumpier Old Men (1995),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.582409,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Waiting to Exhale (1995),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.342532,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Father of the Bride Part II (1995),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


What we have done is -- taken combinations of genres upto 4, i.e if we have "Adventure|Comedy|Action", we are taking combinations like "Adventure", "Comedy", "Action", "Adventure, Comedy", "Comedy, Action" and so on, but in such a way that "Comedy, Action" and "Action, Comedy" are treated the same, since order doesn't matter. 

Then we have caculated the TF-IDF weights for each movies using these combinations. 

We calculate the similarity between the movies by using the Cosine Similarity

In [None]:
cos_sim = cosine_similarity(tfidf_matrix)

cos_sim_df = pd.DataFrame(cos_sim, index=movies['title'], columns=movies['title'])
display(cos_sim_df.head())

title,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),"American President, The (1995)",Dracula: Dead and Loving It (1995),Balto (1995),Nixon (1995),Cutthroat Island (1995),Casino (1995),Sense and Sensibility (1995),Four Rooms (1995),Ace Ventura: When Nature Calls (1995),Money Train (1995),Get Shorty (1995),Copycat (1995),Assassins (1995),Powder (1995),Leaving Las Vegas (1995),Othello (1995),Now and Then (1995),Persuasion (1995),"City of Lost Children, The (Cité des enfants perdus, La) (1995)",Shanghai Triad (Yao a yao yao dao waipo qiao) (1995),Dangerous Minds (1995),Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Babe (1995),Dead Man Walking (1995),It Takes Two (1995),Clueless (1995),"Cry, the Beloved Country (1995)",Richard III (1995),Dead Presidents (1995),Restoration (1995),...,Sherlock - A Study in Pink (2010),"Game Over, Man! (2018)",Blockers (2018),Pacific Rim: Uprising (2018),Rampage (2018),Jurassic World: Fallen Kingdom (2018),Incredibles 2 (2018),Deadpool 2 (2018),Solo: A Star Wars Story (2018),Won't You Be My Neighbor? (2018),Sorry to Bother You (2018),Ant-Man and the Wasp (2018),Dogman (2018),Mamma Mia: Here We Go Again! (2018),Tag (2018),The Man Who Killed Don Quixote (2018),Boundaries (2018),Spiral (2018),Mission: Impossible - Fallout (2018),SuperFly (2018),Iron Soldier (2010),BlacKkKlansman (2018),The Darkest Minds (2018),Tilt (2011),Jeff Ross Roasts the Border (2017),John From (2015),Liquid Truth (2017),Bunny (1998),Hommage à Zgougou (et salut à Sabine Mamou) (2002),Gintama (2017),Gintama: The Movie (2010),anohana: The Flower We Saw That Day - The Movie (2013),Silver Spoon (2014),Love Live! The School Idol Movie (2015),Jon Stewart Has Left the Building (2015),Black Butler: Book of the Atlantic (2017),No Game No Life: Zero (2017),Flint (2017),Bungo Stray Dogs: Dead Apple (2018),Andrew Dice Clay: Dice Rules (1991)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
Toy Story (1995),1.0,0.474735,0.033432,0.019663,0.08255,0.0,0.033432,0.275655,0.0,0.038862,0.019663,0.025759,0.471888,0.0,0.032796,0.0,0.0,0.08255,0.08255,0.007296,0.015231,0.0,0.0,0.0,0.0,0.0,0.083056,0.0,0.060033,0.0,0.0,0.0,0.083056,0.0,0.252888,0.033432,0.0,0.0,0.0,0.0,...,0.0,0.030501,0.08255,0.044621,0.038788,0.016934,0.29358,0.014804,0.097617,0.0,0.127136,0.196111,0.0,0.033432,0.08255,0.449574,0.038306,0.0,0.038862,0.0,0.0,0.016847,0.0,0.0,0.08255,0.0,0.0,0.159254,0.0,0.078835,0.09002,0.084617,0.038306,0.159254,0.0,0.306924,0.487104,0.0,0.086065,0.08255
Jumanji (1995),0.474735,1.0,0.0,0.0,0.0,0.0,0.0,0.580651,0.0,0.081861,0.0,0.0,0.33919,0.0,0.069083,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.174953,0.0,0.126456,0.0,0.0,0.0,0.174953,0.0,0.202042,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.093992,0.081705,0.03567,0.211023,0.0,0.205623,0.0,0.091178,0.148919,0.0,0.0,0.0,0.341389,0.0,0.0,0.081861,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.050147,0.0,0.0,0.0,0.0,0.0,0.060495,0.096008,0.0,0.0,0.0
Grumpier Old Men (1995),0.033432,0.0,1.0,0.588129,0.404997,0.0,1.0,0.0,0.0,0.0,0.588129,0.126374,0.0,0.0,0.137116,0.0,0.346205,0.404997,0.404997,0.035795,0.074726,0.0,0.0,0.0,0.346205,0.0,0.0,0.346205,0.0,0.0,0.0,0.0,0.0,0.0,0.132203,1.0,0.0,0.0,0.0,0.0,...,0.0,0.149642,0.404997,0.0,0.0,0.0,0.0,0.072628,0.0,0.0,0.065182,0.032439,0.0,1.0,0.404997,0.074365,0.187935,0.0,0.0,0.0,0.0,0.082654,0.0,0.346205,0.404997,0.0,0.0,0.0,0.0,0.048129,0.043147,0.0,0.187935,0.0,0.0,0.043247,0.068635,0.0,0.0,0.404997
Waiting to Exhale (1995),0.019663,0.0,0.588129,1.0,0.238191,0.0,0.588129,0.0,0.0,0.0,1.0,0.074324,0.0,0.21998,0.080642,0.07677,0.576229,0.238191,0.238191,0.097765,0.043949,0.015917,0.0,0.064739,0.576229,0.21998,0.057426,0.576229,0.013818,0.07677,0.21998,0.0,0.057426,0.07677,0.077752,0.588129,0.21998,0.061735,0.041335,0.21998,...,0.0,0.088009,0.238191,0.0,0.0,0.017174,0.0,0.042715,0.0,0.0,0.038336,0.019078,0.07677,0.588129,0.238191,0.043736,0.513299,0.0,0.0,0.0,0.0,0.225749,0.0,0.576229,0.238191,0.21998,0.21998,0.0,0.0,0.028306,0.025376,0.055954,0.513299,0.0,0.0,0.025435,0.040366,0.21998,0.0,0.238191
Father of the Bride Part II (1995),0.08255,0.0,0.404997,0.238191,1.0,0.0,0.404997,0.0,0.0,0.0,0.238191,0.312037,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.088383,0.184511,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.326429,0.404997,0.0,0.0,0.0,0.0,...,0.0,0.369489,1.0,0.0,0.0,0.0,0.0,0.179329,0.0,0.0,0.160945,0.080097,0.0,0.404997,1.0,0.183618,0.464039,0.0,0.0,0.0,0.0,0.204084,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.118837,0.106537,0.0,0.464039,0.0,0.0,0.106784,0.169471,0.0,0.0,1.0


We observe that of the genres are a perfect match -- for example, the genres for Toy Story (1995) obviously perfectly match with itself, the similarity score is 1, and if the genres don't match at all, like genres of Toy Story (1995) dont match with Sudden Death (1995), then the similarity score is 0

In [None]:
def genre_recommend(movie,n):
    score = pd.DataFrame(cos_sim_df[movie])
    score = score.sort_values(by=movie, ascending = False).head(n+1)
    values = list(score.index.values)
    values.remove(movie)
    return values

In [None]:
movies[movies.title.eq('Aladdin (1992)')]

Unnamed: 0,movieId,title,genres
506,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical


In [None]:
rec = genre_recommend('Aladdin (1992)',10)
mov = []
gen = []
for r in rec:
    mov.append(movies[movies['title']==r]['title'].values[0])
    gen.append(movies[movies['title']==r]['genres'].values[0])

df = pd.DataFrame()
df['Movies'] = mov
df["Genres"] = gen
display(df)


Unnamed: 0,Movies,Genres
0,Oliver & Company (1988),Adventure|Animation|Children|Comedy|Musical
1,Hercules (1997),Adventure|Animation|Children|Comedy|Musical
2,Robin Hood (1973),Adventure|Animation|Children|Comedy|Musical
3,Pete's Dragon (1977),Adventure|Animation|Children|Musical
4,Song of the South (1946),Adventure|Animation|Children|Musical
5,Rock-A-Doodle (1991),Adventure|Animation|Children|Musical
6,Land Before Time III: The Time of the Great Gi...,Adventure|Animation|Children|Musical
7,"Muppet Movie, The (1979)",Adventure|Children|Comedy|Musical
8,Muppet Treasure Island (1996),Adventure|Children|Comedy|Musical
9,Shrek 2 (2004),Adventure|Animation|Children|Comedy|Musical|Ro...


The recommendation system works well

In [None]:
genre_recommend('Stalker (1979)',10)

['Atlas Shrugged: Part 1 (2011)',
 'Sound of My Voice (2011)',
 "I'll Follow You Down (2013)",
 'Solaris (Solyaris) (1972)',
 'Fire in the Sky (1993)',
 'Quiet Earth, The (1985)',
 'Prestige, The (2006)',
 'Moon (2009)',
 'Soylent Green (1973)',
 'Forgotten, The (2004)']

Again, the recommendations make sense.

## Item Based Collaborative Filtering (KNN Approach)

### Motivation

We can try to recommend movies based how close one movie is to another. What we want is to find a way to measure the closeness between different movies and then based on one movie, we recommend the closest 5 or 10 movies.  

Basically, if the movies were treated like points on a graph, we would assume that the points closest to a movie are similar to that movie, and hence make good recommendations. 

### Implementation

*References "Prototyping a Recommender System Step by Step Part 1: KNN Item-Based Collaborative Filtering" by Kevin Liao, URL: https://towardsdatascience.com/prototyping-a-recommender-system-step-by-step-part-1-knn-item-based-collaborative-filtering-637969614ea#:~:text=When%20KNN%20makes%20inference%20about,the%20most%20similar%20movie%20recommendations.*

To implement this closeness criteria, we can use the K-Nearest Neighbor Algorithm. K-Nearest Neighbor will treat each movie as a point on a graph and when asked for recommendations based on that movie, the algorithm will return the top K nearest neighbors as the most similar movie recommendations.

In [None]:
# We merge the movies and ratings dataframes

df_ratings = pd.merge(ratings,movies,on="movieId")
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


To fit the K-Nearest Neighbor algorithm, we need a m x n matrix where m is the number of movies and n is the number of users. We can use the pivot_table() command to achieve this. We fill any missing values with 0.

The matrix thus formed will be a very sparse matrix. We don't want to fit the KNN model on a matrix with mostly just zero values. So, for more efficient calculation and less memory footprint, we need to transform the values of the dataframe into a scipy sparse matrix.

In [None]:
from scipy.sparse import csr_matrix

# pivot ratings into movie features
movie_features_df = df_ratings.pivot_table(index='title',columns='userId',values='rating').fillna(0)
movie_features_df

# convert dataframe of movie features to scipy sparse matrix
mat_features = csr_matrix(movie_features_df.values)

In [None]:
movie_features_df.head()

userId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,...,571,572,573,574,575,576,577,578,579,580,581,582,583,584,585,586,587,588,589,590,591,592,593,594,595,596,597,598,599,600,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# storing the movie_titles from the pivot table in a variable test which can help us getting the movie index for getting 
# the recommendations.

test = movie_features_df.index
test


Index([''71 (2014)', ''Hellboy': The Seeds of Creation (2004)',
       ''Round Midnight (1986)', ''Salem's Lot (2004)',
       ''Til There Was You (1997)', ''Tis the Season for Love (2015)',
       ''burbs, The (1989)', ''night Mother (1986)',
       '(500) Days of Summer (2009)', '*batteries not included (1987)',
       ...
       'Zulu (2013)', '[REC] (2007)', '[REC]² (2009)',
       '[REC]³ 3 Génesis (2012)',
       'anohana: The Flower We Saw That Day - The Movie (2013)',
       'eXistenZ (1999)', 'xXx (2002)', 'xXx: State of the Union (2005)',
       '¡Three Amigos! (1986)', 'À nous la liberté (Freedom for Us) (1931)'],
      dtype='object', name='title', length=9719)

The matrix has too many features, and if this is fit directly to the KNN Algorithm, then it will suffer from Curse of Dimensionality. This is because, by default, KNN uses **Euclidean Distance** to measure the distance between points. With so many features, the resulting vactors corresponding to movies would almost be equidistant to the target movie's vector, which is unhelpful for us. So instead of using Euclidean Distance, we use **Cosine Similarity** for the search of the nearest neighbors. 

In [None]:
from sklearn.neighbors import NearestNeighbors
model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute', n_jobs=-1)
model_knn.fit(mat_features)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1)

In [None]:
unique_index = pd.Index(test)
j = unique_index.get_loc('Aladdin (1992)')
print(j)

298


In [None]:
# We then use the nearest neighbours model to find the 10 neighbors for the movie title. 
# These 10 neighbors are the recommendations.

distances, indices = model_knn.kneighbors(movie_features_df.iloc[j,:].values.reshape(1, -1), n_neighbors = 11)
for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(movie_features_df.index[j]))
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, movie_features_df.index[indices.flatten()[i]], distances.flatten()[i]))

Recommendations for Aladdin (1992):

1: Beauty and the Beast (1991), with distance of 0.2529439728150389:
2: Lion King, The (1994), with distance of 0.28209064327932476:
3: Jurassic Park (1993), with distance of 0.3865152329768784:
4: True Lies (1994), with distance of 0.4000935259988143:
5: Batman (1989), with distance of 0.4032788709453009:
6: Ace Ventura: Pet Detective (1994), with distance of 0.4161857691893087:
7: Mrs. Doubtfire (1993), with distance of 0.42457691053382474:
8: Die Hard: With a Vengeance (1995), with distance of 0.4315038141425057:
9: Batman Forever (1995), with distance of 0.4336164363530862:
10: Apollo 13 (1995), with distance of 0.4338500822834891:


We can see that the recomendations are very different to what we got when we used Content Based Filtering. However, intuitively, the recommendations are still relevant and good.

In [None]:
j = unique_index.get_loc('Stalker (1979)')
print(j)

7971


In [None]:
distances, indices = model_knn.kneighbors(movie_features_df.iloc[j,:].values.reshape(1, -1), n_neighbors = 11)
for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(movie_features_df.index[j]))
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, movie_features_df.index[indices.flatten()[i]], distances.flatten()[i]))

Recommendations for Stalker (1979):

1: Bob le Flambeur (1955), with distance of 0.3399961522566557:
2: Cercle Rouge, Le (Red Circle, The) (1970), with distance of 0.3853309121263231:
3: Samouraï, Le (Godson, The) (1967), with distance of 0.4338569483962659:
4: That Obscure Object of Desire (Cet obscur objet du désir) (1977), with distance of 0.45125281016524865:
5: Ghost in the Shell: Stand Alone Complex - The Laughing Man (2005), with distance of 0.4705853560298272:
6: Pierrot le fou (1965), with distance of 0.48427583865920343:
7: Serbian Film, A (Srpski film) (2010), with distance of 0.48773100932499924:
8: Leaves of Grass (2009), with distance of 0.4965441452114183:
9: Ghost in the Shell 2.0 (2008), with distance of 0.5011025371741235:
10: Outlander (2008), with distance of 0.5090215278982438:


Again, the recommendations are different to what we got in Content Based Filtering. But the recommendations make sense.

## Matrix Factorization via Singular Value Decomposition

### Motivation

We can think that if User 1 likes Movie 'A','B','C' and User 2 and 3 also like Movie 'A','B' and 'C', then User 1 has similar preferences to User 2 and 3. So, if User 2 and 3 like a particular Movie 'D', we can recommend that movie to User 1 too in hope that they will like it. 

What we would like to do is -- find some "latent features" in the data so that for any User and Movie, we can figgure out if the user will like the movie based on how other similar users have reacted to it and also how the various "latent" features of the movie match with the user's preferences. We need a method that can derive tastes and preference vectors from the raw data.

Matrix Factorization helps us do this.

### Aim

We use Matrix Factorization to break down one matrix into a product of multiple matrices. SVD is an algorithm that decomposes a matrix R into the best lower rank (i.e. smaller/simpler) approximation of the original matrix R. Mathematically, it decomposes R into two unitary matrices and a diagonal matrix. 

For our Movie Recommendation example, R is the user-movie matrix that has what ratings each user has given to each movie. It is a sparse matrix as not all users have rated all movies. R is decomposed into U.Sigma.transpose(V)
Where U refelects the association between the Users and the features, Sigma is a diagonal matrix of weights, and V is the association between the Movies and the Features. 

U represents how much users “like” each feature and transpose(V) represents how relevant each feature is to each movie.

### Implementation

*References "Matrix Factorization for Movie Recommendations in Python" by Nick Becker, URL : https://beckernick.github.io/matrix-factorization-recommender/*

To implement Matrix Factorization, we first need to create our Matrix R -- which is the sparse matrix showing the association between the users and the movies.

In [6]:
R_df = ratings.pivot(index = 'userId', columns ='movieId', values = 'rating').fillna(0)
display(R_df.head())

# Converting R to a matrix
R = R_df.values

# Normalize the data
user_ratings_mean = np.mean(R, axis = 1)
R_demeaned = R - user_ratings_mean.reshape(-1, 1)

movieId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,34,36,38,39,40,41,42,43,...,185135,185435,185473,185585,186587,187031,187541,187593,187595,187717,188189,188301,188675,188751,188797,188833,189043,189111,189333,189381,189547,189713,190183,190207,190209,190213,190215,190219,190221,191005,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Performing Singular Value Decomposition. We choose the value of k as 50, however, we could make our model better by optimizing this value further by training - testing - validation techniques.

In [7]:
from scipy.sparse.linalg import svds

U, sigma, Vt = svds(R_demeaned, k = 50)

# Converting Sigma to diagonal Matrix
sigma = np.diag(sigma)

### Making predictions

In [8]:
# Taking the product of U, Sigma and transpose(V)
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)

# Converting to DataFrame
preds_df = pd.DataFrame(all_user_predicted_ratings, columns = R_df.columns)

In [9]:
# Function to make recommendations
def recommend_movies(predictions_df, userID, movies_df, original_ratings_df, num_recommendations=5):
    
    # Get and sort the user's predictions
    user_row_number = userID - 1 # UserID starts at 1, not 0
    sorted_user_predictions = predictions_df.iloc[user_row_number].sort_values(ascending=False)
    
    # Get the user's data and merge in the movie information.
    user_data = original_ratings_df[original_ratings_df.userId == (userID)]
    user_full = (user_data.merge(movies_df, how = 'left', left_on = 'movieId', right_on = 'movieId').
                     sort_values(['rating'], ascending=False)
                 )

    print('User {0} has already rated {1} movies.'.format(userID, user_full.shape[0]))
    print( 'Recommending the highest {0} predicted ratings movies not already rated.'.format(num_recommendations))
    
    # Recommend the highest predicted rating movies that the user hasn't seen yet.
    recommendations = (movies_df[~movies_df['movieId'].isin(user_full['movieId'])].
         merge(pd.DataFrame(sorted_user_predictions).reset_index(), how = 'left',
               left_on = 'movieId',
               right_on = 'movieId').
         rename(columns = {user_row_number: 'Predictions'}).
         sort_values('Predictions', ascending = False).
                       iloc[:num_recommendations, :-1]
                      )

    return user_full, recommendations

already_rated, predictions = recommend_movies(preds_df, 10, movies, ratings, 10)

User 10 has already rated 140 movies.
Recommending the highest 10 predicted ratings movies not already rated.


In [10]:
already_rated.head(10)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
139,10,140110,5.0,1455356776,The Intern (2015),Comedy
48,10,8869,5.0,1455303064,First Daughter (2004),Comedy|Romance
117,10,96079,5.0,1455302172,Skyfall (2012),Action|Adventure|Thriller|IMAX
110,10,91529,5.0,1455302120,"Dark Knight Rises, The (2012)",Action|Adventure|Crime|IMAX
42,10,7458,5.0,1455356656,Troy (2004),Action|Adventure|Drama|War
100,10,81845,5.0,1455302591,"King's Speech, The (2010)",Drama
44,10,8533,5.0,1455301847,"Notebook, The (2004)",Drama|Romance
97,10,79091,5.0,1455306124,Despicable Me (2010),Animation|Children|Comedy|Crime
86,10,71579,5.0,1455301869,"Education, An (2009)",Drama|Romance
57,10,33794,5.0,1455302031,Batman Begins (2005),Action|Crime|IMAX


In [11]:
predictions

Unnamed: 0,movieId,title,genres
4394,6539,Pirates of the Caribbean: The Curse of the Bla...,Action|Adventure|Comedy|Fantasy
7274,79132,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX
5123,8368,Harry Potter and the Prisoner of Azkaban (2004),Adventure|Fantasy|IMAX
3553,4896,Harry Potter and the Sorcerer's Stone (a.k.a. ...,Adventure|Children|Fantasy
3547,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy
3593,4963,Ocean's Eleven (2001),Crime|Thriller
4050,5816,Harry Potter and the Chamber of Secrets (2002),Adventure|Fantasy
7207,76093,How to Train Your Dragon (2010),Adventure|Animation|Children|Fantasy|IMAX
3601,4973,"Amelie (Fabuleux destin d'Amélie Poulain, Le) ...",Comedy|Romance
6003,40815,Harry Potter and the Goblet of Fire (2005),Adventure|Fantasy|Thriller|IMAX


The recommendations make sense intuitively

## Collaborative Filtering

### Motivation

The motivation behind collaborative filtering is that if user *1* likes movies *A,B and C* and is user *2* likes movies *A,C and D*, then, user *1* might like movie *D* and user *2* might like movie *B*. This kind of system recommends movies based on similarities between users and what other users have liked.

Collaborative filtering can also be done by considering movies. For a movie *m* and a user *u*, a set of similar movies based on rating is found by a similarity measure and the rating for that movie is calculated by picking out movies rated by *u* from that similar movies list.

This technique recommends movies to a particular user based on other users whom they are most similar to. This 'similarity' among users is calculated based on ratings and not other factors like age, gender, etc. Once the set of similar users are identified, then ratings are predicted for previously unseen movies and the top rated movies are recommended. 

### Calculating Similarity

The similarity between users can be calculated in 3 ways -

i) Euclidean Distance

The euclidean distance between 2 points, where each point is the vector of the ratings of a particular user, is calculated and users having the shortest distance among others are considered as most similar. 

ii) Cosine Similarity

The cosine similarity between pairs of ratings of users or movies is computed. 
The value ranged between -1 to 1. Higher the cosine similarity value, more similar are those users/movies.

iii) Pearson Correlation Coefficient

The pearson correlation coefficient is computed for pairs of ratings of different users or movies, depending on the method.

### Calculating Ratings

i) User-User Collaborative Filtering

For a user *u* and movie *m*, the rating of that movie by the user is calculated as -

$r_{u,m}$ = $\frac{\Sigma _{v \in N(u)} sim(u,v) × r_{v,m}}{\Sigma _{v \in N(u)}sim(u,v)}$ 

Here, $N(u)$ is the set of similar users to *u*. The KNN clustering algorithm is considered for finding the set of similar movies based on a given similarity measure.

$sim(u,v)$ is the similarity score between user *u* and *v*.

$r_{v,m}$ is the rating given by user $v$ to movie $m$.

ii) Movie - User Collaborative Filtering

For a user *u* and movie *m*, the rating of that movie by the user is calculated as -

$r_{u,m}$ = $\frac{\Sigma _{n \in N(m)} sim(m,n) × r_{u,n}}{\Sigma _{n \in N(m)}sim(m,n)}$ 

Here, $N(m)$ is the set of similar movies to *m*.

$sim(m,n)$ is the similarity score between movie *m* and *n*.



In [None]:
np.random.seed(100)
reader = Reader(rating_scale=(0, 5))
#ratings = ratings.sample(frac = 0.2)
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

In [None]:
np.random.seed(100)
# User-User collaborative Filtering
benchmark = []
sim_options = {
    "name": "cosine",
    "user_based": True,  # Compute  similarities between users
}
# Iterate over all algorithms
for algorithm in [KNNBasic(sim_options=sim_options,verbose = False), KNNWithMeans(sim_options=sim_options,verbose = False), KNNWithZScore(sim_options=sim_options,verbose = False)]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=10, verbose=False)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')    

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
KNNWithZScore,0.894454,0.576814,1.222893
KNNWithMeans,0.895411,0.435901,1.059357
KNNBasic,0.968676,0.396112,1.011052


In [None]:
np.random.seed(100)
sim_options_grid = {
    "name": ["msd", "cosine", "pearson"],
    "min_support": [3, 4, 5,6,7],
    "user_based": [True],
}

param_grid = {"k" : range(20,100,10),"sim_options": sim_options_grid}

gs = GridSearchCV(KNNWithZScore, param_grid, measures=["rmse", "mae"], cv=5,joblib_verbose = 0,n_jobs = -1)
gs.fit(data)

print(gs.best_score["rmse"])
print(gs.best_params["rmse"])  

1.006769740956616
{'k': 20, 'sim_options': {'name': 'pearson', 'min_support': 7, 'user_based': True}}


In [None]:
np.random.seed(100)
sim_options = {
    "name": "pearson",
    "user_based": True,
    "min_support" : 7

}
trainset, testset = train_test_split(data, test_size=0.25)
algo = KNNWithZScore(k = 20, sim_options=sim_options,verbose = False)
predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)

RMSE: 0.9004


0.9003759131621382

In [None]:
# User ID 10
unique_ids = movies['movieId'].unique()
iids10 = ratings.loc[ratings['userId']==10, 'movieId']
movies_to_predict = np.setdiff1d(unique_ids,iids10)
titles = []
for i in range(len(movies_to_predict)):
    titles.append(movies['title'][movies['movieId'] == movies_to_predict[i]])

In [None]:
algo = KNNWithZScore(k = 20, sim_options=sim_options,verbose = False)
algo.fit(data.build_full_trainset())
my_recs = []
for iid in movies_to_predict:
   my_recs.append((iid, algo.predict(uid=10,iid=iid).est))
df = pd.DataFrame(my_recs, columns=['iid', 'predictions'])
df['Title'] = titles
df.sort_values('predictions', ascending=False).head(10) 

Unnamed: 0,iid,predictions,Title
132,159,5.0,"132 Clockers (1995) Name: title, dtype: object"
5469,26528,5.0,5521 Anne of Green Gables (1985) Name: titl...
3501,4813,5.0,3522 When Worlds Collide (1951) Name: title...
8625,128520,5.0,"8761 The Wedding Ringer (2015) Name: title,..."
8412,115664,5.0,"8547 The Book of Life (2014) Name: title, d..."
912,1218,5.0,"919 Killer, The (Die xue shuang xiong) (198..."
6483,54881,5.0,"6551 King of Kong, The (2007) Name: title, ..."
258,299,5.0,"259 Priest (1994) Name: title, dtype: object"
683,905,5.0,687 It Happened One Night (1934) Name: titl...
8803,136469,5.0,8941 Larry David: Curb Your Enthusiasm (199...


In [None]:
# User ID 250
unique_ids = movies['movieId'].unique()
iids10 = ratings.loc[ratings['userId']==250, 'movieId']
movies_to_predict = np.setdiff1d(unique_ids,iids10) 
titles = []
for i in range(len(movies_to_predict)):
    titles.append(movies['title'][movies['movieId'] == movies_to_predict[i]])

In [None]:
algo = KNNWithZScore(k = 20, sim_options=sim_options,verbose = False)
algo.fit(data.build_full_trainset())
my_recs = []
for iid in movies_to_predict:
   my_recs.append((iid, algo.predict(uid=250,iid=iid).est))
df = pd.DataFrame(my_recs, columns=['iid', 'predictions'])
df['Title'] = titles
df.sort_values('predictions', ascending=False).head(10)

Unnamed: 0,iid,predictions,Title
8424,112175,5.0,8451 How to Train Your Dragon 2 (2014) Name...
793,1046,5.0,"799 Beautiful Thing (1996) Name: title, dty..."
796,1050,5.0,"802 Looking for Richard (1996) Name: title,..."
4375,6466,5.0,"4400 Mississippi Masala (1991) Name: title,..."
1691,2295,5.0,"1707 Impostors, The (1998) Name: title, dty..."
4371,6460,5.0,"4396 Trial, The (Procès, Le) (1962) Name: t..."
4365,6442,5.0,"4390 Belle époque (1992) Name: title, dtype..."
4364,6440,5.0,"4389 Barton Fink (1991) Name: title, dtype:..."
6619,56757,5.0,6646 Sweeney Todd: The Demon Barber of Flee...
2560,3451,5.0,2582 Guess Who's Coming to Dinner (1967) Na...


## Implicit Recommender System

All the methods discussed above rely on explicit ratings given by users to movies. This treats ratings not given to movies as lack of information, instead of considering them as a conscious choice made by the user that can help understand the user's preferences better. This kind of a system is called an implicit recommender system. It works on the concept of implicit data, which is data collected from users in the form of websites visited, clicks on hyperlinks, views, purchases, etc. 

When working with explicit data, we impute missing data with 0 as we assume that the user has given it 0 rating. But this might not always be true since we don't know whether the user has heard about that particular movie or just forgot about giving it a rating. In implicit data, we can't assign it a value of 0 as other information as mentioned above has to be kept in mind. 

1) Matrix Factorization Using Alternating Least Squares

In this approach, each movie is characterized by a preference value and it's confidence. Initially, missing values is considered to have negative preference with a low confidence value and rated movies to have a positive preference with high confidence. The confidence can be calculated by using factors like number of times the user has watched that movie or any other form of interaction.

The preference is a binary value, 1 for positive and 0 for negative. The confidence is calculated as a linear function of the ratings given by the user.
$c_{u,i} = 1 + \alpha r_{u,i}$ where $r_{u,i}$ is the rating given by user $u$ to movie $i$ and $c_{u,i}$ is the corresponding confidence value. Here, $\alpha$ is a linear scaling parameter.

The aim of this matrix factorization is to decompose the ratings matrix $R$ into 2 matrices $U$ and $M$ such that $R = U × V$. Here, $U$ and $V$ are the user and movie matrices with hidden features (latent features). Least squares method is used to find the best approximation of $R$. It is called alternating least squares because in each iteration, one of $U$ or $V$ is kept constant and the other is optimized using regulariztion. 

2) Matrix Factorization Using Bayesian Personalized Ranking

This method aims to come up with more personalized rankings for users rather than predicting whether the user will watch a particular movie or not. For this, the training data is considered in terms of pairs of items for each user, $(u,i,j)$, where $u$ is the user, $i$ is considered to be a 'positive' movie and $j$ to be a 'negative' movie. Here, 'positive' and 'negative' is defined in the sense whether the user has interacted with that particular movie. So if $u$ has rated movie 1 but not movie 2, we say that movie 1 is a 'positive' movie and movie 2 as a  'negative' movie for user $u$. But we can't say anything between pairs of movies that have both been rated or not rated.

The method optimizes the following equation :
$\Sigma _{(u,i,j) ∈ D_{s}} ln(\sigma(\hat{x} _{uij})) - λ _{Θ} ||Θ|| ^{2} $ 

Here, Θ is the matrix factorization model parameter like the user and movie matrices.
$D_{s}$ is our dataset containing all the interactions between users and movies.
$\hat{x} _{uij}$ is a function that represents the relation between user $u$, movie $i$ and movie $j$ . 
This is calculated using matrix factorization and then passed into the sigmoid function $\sigma (x) = \frac{1}{1 + e^{x}}$ that returns the probability of user $u$ preferring movie $i$ to movie $j$ . This is optimized using the regularization parameter $\lambda _{\Theta}$.

The matrix factorization method in this model aims to decompose $\hat{x} _{uij}$ as  $\hat{x} _{ui} - \hat{x} _{uj}$.

Optimising the Bayesian Personalized Ranking criterion is the similar to optimizing the AUC (Area Under the Curve) metric, that is a rank based criterion. The optimization process is implemented through gradient descent using bootstrapping where in the movie $j$ is chosen randomly for faster convergence.

In [None]:
movielens = fetch_movielens()
train = movielens['train']
test = movielens['test']

In [None]:
model = LightFM(no_components = 15,learning_rate=0.05, loss='bpr')
model.fit(train, epochs=10)

<lightfm.lightfm.LightFM at 0x7f99fb565290>

In [None]:
train_precision = precision_at_k(model, train, k=10).mean()
test_precision = precision_at_k(model, test, k=10).mean()

train_auc = auc_score(model, train).mean()
test_auc = auc_score(model, test).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))

Precision: train 0.60, test 0.09.
AUC: train 0.90, test 0.86.


In [None]:
def sample_recommendation(model, data, user_ids):
       n_users, n_items = train.shape
       for user_id in user_ids:
            known_positives = movielens['item_labels'][train.tocsr()[user_id].indices]
            scores = model.predict(user_id, np.arange(n_items))
            top_items = movielens['item_labels'][np.argsort(-scores)]
            print("User %s" % user_id)
            print("     Known positives:")
        
            for x in known_positives[:3]:
                print("        %s" % x)
        
            print("     Recommended:")
        
            for x in top_items[:10]:
                print("        %s" % x)


In [None]:
sample_recommendation(model, data, [10,250])

User 10
     Known positives:
        Babe (1995)
        Dead Man Walking (1995)
        Seven (Se7en) (1995)
     Recommended:
        Empire Strikes Back, The (1980)
        Monty Python and the Holy Grail (1974)
        Back to the Future (1985)
        Raiders of the Lost Ark (1981)
        Star Wars (1977)
        Return of the Jedi (1983)
        Indiana Jones and the Last Crusade (1989)
        Princess Bride, The (1987)
        Fugitive, The (1993)
        E.T. the Extra-Terrestrial (1982)
User 250
     Known positives:
        Toy Story (1995)
        Twelve Monkeys (1995)
        Usual Suspects, The (1995)
     Recommended:
        Independence Day (ID4) (1996)
        Jerry Maguire (1996)
        Mission: Impossible (1996)
        Rock, The (1996)
        Phenomenon (1996)
        Time to Kill, A (1996)
        Mr. Holland's Opus (1995)
        Broken Arrow (1996)
        Toy Story (1995)
        Ransom (1996)
