In [1]:
#https://www.kaggle.com/kanncaa1/recommendation-systems-tutorial/data

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import scipy as scipy
import math as math
import random as random
import sklearn

from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds

%matplotlib inline

In [3]:
movies_df = pd.read_csv('movie.csv')

In [4]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
movies_df['title_1'] = movies_df['title'].str.split("\(\d", expand=True)[0]
movies_df['year'] = movies_df['title'].str.extract('.*\((.*\d{4})\).*', expand=True)
movies_df.head(30)

Unnamed: 0,movieId,title,genres,title_1,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale,1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,1995
5,6,Heat (1995),Action|Crime|Thriller,Heat,1995
6,7,Sabrina (1995),Comedy|Romance,Sabrina,1995
7,8,Tom and Huck (1995),Adventure|Children,Tom and Huck,1995
8,9,Sudden Death (1995),Action,Sudden Death,1995
9,10,GoldenEye (1995),Action|Adventure|Thriller,GoldenEye,1995


In [6]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 5 columns):
movieId    27278 non-null int64
title      27278 non-null object
genres     27278 non-null object
title_1    27278 non-null object
year       27257 non-null object
dtypes: int64(1), object(4)
memory usage: 1.0+ MB


In [7]:
interactions_df = pd.read_csv('rating_1.csv')
interactions_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,4/2/2005 23:53
1,1,29,3.5,4/2/2005 23:31
2,1,32,3.5,4/2/2005 23:33
3,1,47,3.5,4/2/2005 23:32
4,1,50,3.5,4/2/2005 23:29


In [8]:
interactions_df.drop('timestamp', axis=1, inplace=True)

In [9]:
interactions_df.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


In [10]:
interactions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 3 columns):
userId     20000 non-null int64
movieId    20000 non-null int64
rating     20000 non-null float64
dtypes: float64(1), int64(2)
memory usage: 468.8 KB


In [11]:
interactions_df = interactions_df.iloc[:20000,:]

In [12]:
interactions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 3 columns):
userId     20000 non-null int64
movieId    20000 non-null int64
rating     20000 non-null float64
dtypes: float64(1), int64(2)
memory usage: 468.8 KB


In [13]:
print(interactions_df['userId'].nunique())
print(interactions_df['movieId'].nunique())

156
4192


In [14]:
## Merge movies_df (with only movieId, title columns) with interactions_df

interactions_movies_df = interactions_df.merge(movies_df[['movieId', 'title_1', 'year', 'genres']], how = 'left', left_on = 'movieId',
               right_on = 'movieId')

In [15]:
interactions_movies_df.head(30)

Unnamed: 0,userId,movieId,rating,title_1,year,genres
0,1,2,3.5,Jumanji,1995,Adventure|Children|Fantasy
1,1,29,3.5,"City of Lost Children, The (Cité des enfants p...",1995,Adventure|Drama|Fantasy|Mystery|Sci-Fi
2,1,32,3.5,Twelve Monkeys (a.k.a. 12 Monkeys),1995,Mystery|Sci-Fi|Thriller
3,1,47,3.5,Seven (a.k.a. Se7en),1995,Mystery|Thriller
4,1,50,3.5,"Usual Suspects, The",1995,Crime|Mystery|Thriller
5,1,112,3.5,Rumble in the Bronx (Hont faan kui),1995,Action|Adventure|Comedy|Crime
6,1,151,4.0,Rob Roy,1995,Action|Drama|Romance|War
7,1,223,4.0,Clerks,1994,Comedy
8,1,253,4.0,Interview with the Vampire: The Vampire Chroni...,1994,Drama|Horror
9,1,260,4.0,Star Wars: Episode IV - A New Hope,1977,Action|Adventure|Sci-Fi


In [16]:
interactions_movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20000 entries, 0 to 19999
Data columns (total 6 columns):
userId     20000 non-null int64
movieId    20000 non-null int64
rating     20000 non-null float64
title_1    20000 non-null object
year       20000 non-null object
genres     20000 non-null object
dtypes: float64(1), int64(2), object(3)
memory usage: 1.1+ MB


## Popularity Model

In [17]:
#interactions_movies_df[interactions_movies_df['movieId']==118696]['rating'].avg()

interactions_movies_grp = interactions_movies_df.groupby(['movieId', 'title_1']).agg({'rating': 'sum'}).reset_index()
interactions_movies_grp.rename(columns = {'rating': 'TotalRating'},inplace=True)
interactions_movies_grp.head()

Unnamed: 0,movieId,title_1,TotalRating
0,1,Toy Story,221.5
1,2,Jumanji,60.0
2,3,Grumpier Old Men,59.5
3,4,Waiting to Exhale,8.0
4,5,Father of the Bride Part II,35.5


In [40]:
#Sort movie_Id based upon rating

popular_ratings_df = interactions_movies_grp.sort_values(['TotalRating', 'title_1', 'movieId'], ascending=[0,1,1])
popular_ratings_df.head(10)

Unnamed: 0,movieId,title_1,TotalRating
241,296,Pulp Fiction,339.5
289,356,Forrest Gump,324.0
258,318,"Shawshank Redemption, The",312.5
463,593,"Silence of the Lambs, The",273.0
379,480,Jurassic Park,266.5
209,260,Star Wars: Episode IV - A New Hope,231.0
472,608,Fargo,228.5
45,50,"Usual Suspects, The",228.5
1859,2858,American Beauty,223.0
0,1,Toy Story,221.5


In [47]:
#Generate a recommendation rank based upon score
popular_ratings_df['Rank'] = popular_ratings_df['TotalRating'].rank(ascending=0, method='first')
popular_ratings_df.head(20)

Unnamed: 0,movieId,title_1,TotalRating,Rank
241,296,Pulp Fiction,339.5,1.0
289,356,Forrest Gump,324.0,2.0
258,318,"Shawshank Redemption, The",312.5,3.0
463,593,"Silence of the Lambs, The",273.0,4.0
379,480,Jurassic Park,266.5,5.0
209,260,Star Wars: Episode IV - A New Hope,231.0,6.0
472,608,Fargo,228.5,7.0
45,50,"Usual Suspects, The",228.5,8.0
1859,2858,American Beauty,223.0,9.0
0,1,Toy Story,221.5,10.0


In [48]:
test_user_df = interactions_movies_df[interactions_movies_df['userId'] == 4][['movieId', 'title_1']]
test_user_df.head()

Unnamed: 0,movieId,title_1
423,6,Heat
424,10,GoldenEye
425,19,Ace Ventura: When Nature Calls
426,32,Twelve Monkeys (a.k.a. 12 Monkeys)
427,165,Die Hard: With a Vengeance


In [65]:
test_df=test_user_df.merge(popular_ratings_df.head(30), left_on='movieId', right_on='movieId', how='inner').sort_values(by='Rank', ascending=True)
test_df.head()

Unnamed: 0,movieId,title_1_x,title_1_y,TotalRating,Rank
1,356,Forrest Gump,Forrest Gump,324.0,2.0
3,480,Jurassic Park,Jurassic Park,266.5,5.0
4,589,Terminator 2: Judgment Day,Terminator 2: Judgment Day,216.5,12.0
0,32,Twelve Monkeys (a.k.a. 12 Monkeys),Twelve Monkeys (a.k.a. 12 Monkeys),184.0,24.0
2,380,True Lies,True Lies,181.5,25.0


#### Define a Popularity Recommender class with the above code

In [56]:
class popularity_recommender_py():
    def __init__(self):
        self.data = None
        self.rating = None
        self.movieId = None
        self.userId = None        
        self.title_1 = None
        self.popularity_recommendations = None
        
    #Create the popularity based recommender system model
    def create(self, data, movieId, title_1, rating):
        self.data = data        
        self.movieId = movieId
        self.title_1 = title_1
        self.rating = rating

        df_grp = data.groupby(['movieId', 'title_1']).agg({'rating': 'sum'}).reset_index()
        df_grp.rename(columns = {'rating': 'TotalRating'},inplace=True)

        #Sort the movieId's based upon recommendation score
        popular_movies_df = df_grp.sort_values(['TotalRating', 'title_1', 'movieId'], ascending=[0,1,1])
        
        #Generate a recommendation rank based upon score
        popular_movies_df['Rank'] = popular_movies_df['TotalRating'].rank(ascending=0, method='first')
        
        #Get the top 10 recommendations
        self.popularity_recommendations = popular_movies_df#.head(10)

        
    #Use the popularity based recommender system model to make recommendations of contents that the user has not interacted with:
    def recommend(self, userId, items_to_ignore=[], topn=10):    
        user_recommendations = self.popularity_recommendations[~self.popularity_recommendations['movieId'].isin(items_to_ignore)].head(topn)
        
        #Add userId column for which the recommendations are being generated
        user_recommendations['userId'] = userId
    
        #Bring userId column to the front
        cols = user_recommendations.columns.tolist()
        cols = cols[-1:] + cols[:-1]
        user_recommendations = user_recommendations[cols]
        
        return user_recommendations
       

In [57]:
## Before we call the class and its functions, lets first create the unique personIds

In [58]:
## Get unique personIds
users = interactions_movies_df['userId'].unique()
users

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
       105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
       118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
       131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
       144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156], dtype=int64)

In [59]:
#interactions_movies_df.head()

In [60]:
## Create an object for the class and instantiate the functions within the class
pm = popularity_recommender_py()
pm.create(interactions_movies_df, 'movieId', 'title_1', 'rating')

In [61]:
user_id = users[3]

## get list of already interacted content ids (in order to avoid coming up in the popularity recommendation list)
items_to_ignore = interactions_movies_df[interactions_movies_df['userId'] == user_id]['movieId'].tolist()
#items_to_ignore

pm.recommend(user_id,items_to_ignore)

Unnamed: 0,userId,movieId,title_1,TotalRating,Rank
241,4,296,Pulp Fiction,339.5,1.0
258,4,318,"Shawshank Redemption, The",312.5,3.0
463,4,593,"Silence of the Lambs, The",273.0,4.0
209,4,260,Star Wars: Episode IV - A New Hope,231.0,6.0
472,4,608,Fargo,228.5,7.0
45,4,50,"Usual Suspects, The",228.5,8.0
1859,4,2858,American Beauty,223.0,9.0
0,4,1,Toy Story,221.5,10.0
43,4,47,Seven (a.k.a. Se7en),217.0,11.0
93,4,110,Braveheart,211.0,13.0


In [66]:
user_id = users[30]

## get list of already interacted content ids (in order to avoid coming up in the popularity recommendation list)
items_to_ignore = interactions_movies_df[interactions_movies_df['userId'] == user_id]['movieId'].tolist()
#items_to_ignore

pm.recommend(user_id,items_to_ignore)

Unnamed: 0,userId,movieId,title_1,TotalRating,Rank
241,31,296,Pulp Fiction,339.5,1.0
289,31,356,Forrest Gump,324.0,2.0
258,31,318,"Shawshank Redemption, The",312.5,3.0
463,31,593,"Silence of the Lambs, The",273.0,4.0
379,31,480,Jurassic Park,266.5,5.0
472,31,608,Fargo,228.5,7.0
45,31,50,"Usual Suspects, The",228.5,8.0
1859,31,2858,American Beauty,223.0,9.0
43,31,47,Seven (a.k.a. Se7en),217.0,11.0
460,31,589,Terminator 2: Judgment Day,216.5,12.0


## Content based Filtering model

In [67]:
## Content-based filtering approaches leverage description or attributes from items the user has interacted 
## to recommend similar items.
## It depends only on the user's previous choices, making this method robust to avoid the cold-start problem. 

In [68]:
#Ignoring stopwords (words with no semantics) from English
stopwords_list = stopwords.words('english')

#Trains a model whose vectors size is 5000, composed by the main unigrams and bigrams found in the corpus, ignoring stopwords
vectorizer = TfidfVectorizer(analyzer='word',
                     ngram_range= (1,2),
                     min_df=0.003,
                     max_df=0.5,
                     max_features=5000,
                     stop_words=stopwords_list)

In [69]:
#title_genre = pd.concat([movies_df['Title'], movies_df['Genre']])
#title_genre = movies_df['Title'] + " " + movies_df['Genre']
genre = movies_df['genres']

In [70]:
genre

0        Adventure|Animation|Children|Comedy|Fantasy
1                         Adventure|Children|Fantasy
2                                     Comedy|Romance
3                               Comedy|Drama|Romance
4                                             Comedy
5                              Action|Crime|Thriller
6                                     Comedy|Romance
7                                 Adventure|Children
8                                             Action
9                          Action|Adventure|Thriller
10                              Comedy|Drama|Romance
11                                     Comedy|Horror
12                      Adventure|Animation|Children
13                                             Drama
14                          Action|Adventure|Romance
15                                       Crime|Drama
16                                     Drama|Romance
17                                            Comedy
18                                            

In [71]:
tfidf_matrix = vectorizer.fit_transform(genre)

In [72]:
tfidf_feature_names = vectorizer.get_feature_names()

In [73]:
print(len(tfidf_feature_names))
print(tfidf_feature_names)

90
['action', 'action adventure', 'action animation', 'action comedy', 'action crime', 'action drama', 'action horror', 'action sci', 'action thriller', 'adventure', 'adventure animation', 'adventure children', 'adventure comedy', 'adventure crime', 'adventure drama', 'adventure fantasy', 'adventure sci', 'animation', 'animation children', 'animation comedy', 'children', 'children comedy', 'children drama', 'children fantasy', 'comedy', 'comedy crime', 'comedy documentary', 'comedy drama', 'comedy fantasy', 'comedy horror', 'comedy musical', 'comedy romance', 'comedy sci', 'comedy western', 'crime', 'crime drama', 'crime horror', 'crime mystery', 'crime thriller', 'documentary', 'documentary drama', 'documentary musical', 'drama', 'drama fantasy', 'drama film', 'drama horror', 'drama musical', 'drama mystery', 'drama romance', 'drama sci', 'drama thriller', 'drama war', 'drama western', 'fantasy', 'fantasy horror', 'fantasy musical', 'fantasy mystery', 'fantasy romance', 'fantasy sci',

In [74]:
item_ids = movies_df['movieId'].tolist()
#item_ids.index(-8949113594875411859)
#item_ids

In [75]:
interactions_movies_df.head()

Unnamed: 0,userId,movieId,rating,title_1,year,genres
0,1,2,3.5,Jumanji,1995,Adventure|Children|Fantasy
1,1,29,3.5,"City of Lost Children, The (Cité des enfants p...",1995,Adventure|Drama|Fantasy|Mystery|Sci-Fi
2,1,32,3.5,Twelve Monkeys (a.k.a. 12 Monkeys),1995,Mystery|Sci-Fi|Thriller
3,1,47,3.5,Seven (a.k.a. Se7en),1995,Mystery|Thriller
4,1,50,3.5,"Usual Suspects, The",1995,Crime|Mystery|Thriller


In [76]:
users_movies_df = interactions_movies_df.set_index('userId')

In [77]:
users_movies_df.head()

Unnamed: 0_level_0,movieId,rating,title_1,year,genres
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2,3.5,Jumanji,1995,Adventure|Children|Fantasy
1,29,3.5,"City of Lost Children, The (Cité des enfants p...",1995,Adventure|Drama|Fantasy|Mystery|Sci-Fi
1,32,3.5,Twelve Monkeys (a.k.a. 12 Monkeys),1995,Mystery|Sci-Fi|Thriller
1,47,3.5,Seven (a.k.a. Se7en),1995,Mystery|Thriller
1,50,3.5,"Usual Suspects, The",1995,Crime|Mystery|Thriller


In [78]:
users_movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20000 entries, 1 to 156
Data columns (total 5 columns):
movieId    20000 non-null int64
rating     20000 non-null float64
title_1    20000 non-null object
year       20000 non-null object
genres     20000 non-null object
dtypes: float64(1), int64(1), object(3)
memory usage: 937.5+ KB


In [79]:
users_movies_df.dropna(inplace=True)

In [80]:
users_movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20000 entries, 1 to 156
Data columns (total 5 columns):
movieId    20000 non-null int64
rating     20000 non-null float64
title_1    20000 non-null object
year       20000 non-null object
genres     20000 non-null object
dtypes: float64(1), int64(1), object(3)
memory usage: 937.5+ KB


#### To model the user profile, we take all the item profiles the user has interacted with and average them.

#### The average is weighted by the rating

In [81]:
users_movies_df.groupby([users_movies_df.index])[['movieId']].count().sort_values(by='movieId', ascending=True).head(5)


Unnamed: 0_level_0,movieId
userId,Unnamed: 1_level_1
36,20
52,20
39,20
37,20
123,22


In [82]:
users_movies_df.loc[36].sort_values(by='rating', ascending=False)

Unnamed: 0_level_0,movieId,rating,title_1,year,genres
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
36,2353,4.0,Enemy of the State,1998,Action|Thriller
36,145,3.5,Bad Boys,1995,Action|Comedy|Crime|Drama|Thriller
36,1358,3.5,Sling Blade,1996,Drama
36,2023,3.5,"Godfather: Part III, The",1990,Crime|Drama|Mystery|Thriller
36,1408,3.5,"Last of the Mohicans, The",1992,Action|Romance|War|Western
36,1597,3.5,Conspiracy Theory,1997,Drama|Mystery|Romance|Thriller
36,58293,3.5,"10,000 BC",2008,Adventure|Romance|Thriller
36,1088,3.0,Dirty Dancing,1987,Drama|Musical|Romance
36,163,3.0,Desperado,1995,Action|Romance|Western
36,2302,3.0,My Cousin Vinny,1992,Comedy


In [83]:
def get_user_profiles():
    user_profiles = {}
    
    ## Create an index_df by joinning the interactions_full_df with the articles_df, and get the columns from the interactions_full_df
    #interactions_indexed_df = interactions_full_df[interactions_articles_df['contentId'].isin(articles_df['contentId'])].set_index('personId')

    ## For every user in the index_df, get all of their contentIds, and get their corresponding tfdifmatrix 

    for userid in users_movies_df.index.unique():
        interactions_person_df = users_movies_df.loc[userid]    
        user_item_strengths = np.array(interactions_person_df['rating']).reshape(-1,1) ## n rows x 1 col
    
        itemprofilelist = [tfidf_matrix[item_ids.index(c)] for c in pd.Series(interactions_person_df['movieId'])]
        item_profiles = scipy.sparse.vstack(itemprofilelist)
    
        #Weighted average of item profiles by the interactions strength
        user_item_strengths_weighted_avg = np.sum(item_profiles.multiply(user_item_strengths), axis=0) / np.sum(user_item_strengths)
        user_profile_norm = sklearn.preprocessing.normalize(user_item_strengths_weighted_avg)    
        
        user_profiles[userid] = user_profile_norm
    return user_profiles

In [84]:
user_profiles = get_user_profiles()

In [85]:
len(user_profiles)

156

In [86]:
## Let's take a look in the profile. It is a unit vector of length 90 (length of tf-idf matrix). 
#The value in each position represents how relevant is a token (unigram or bigram).

In [87]:
myprofile = user_profiles[36]
print(myprofile.shape)

(1, 90)


In [88]:
myprofile.flatten().tolist()

[0.26329893543006716,
 0.027555627256750648,
 0.0,
 0.07462009815710449,
 0.0,
 0.0,
 0.056146109767782414,
 0.0,
 0.1509466724348748,
 0.09268116740707781,
 0.0,
 0.0,
 0.03341814887706501,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.3737383884535064,
 0.13995271933289602,
 0.0,
 0.0,
 0.08358623875789668,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.19138174846036363,
 0.11733353216192162,
 0.0,
 0.0,
 0.07135729712672709,
 0.0,
 0.0,
 0.0,
 0.4082476502008344,
 0.0,
 0.0,
 0.0,
 0.07944279230360352,
 0.24197265230443296,
 0.0,
 0.0,
 0.06273771358665198,
 0.0,
 0.0,
 0.0627395786586514,
 0.0,
 0.0,
 0.0,
 0.05693933159937356,
 0.0,
 0.09063894076818212,
 0.0453409145933734,
 0.0,
 0.0,
 0.0,
 0.0,
 0.08087551236166499,
 0.0,
 0.12616941338044776,
 0.0,
 0.0,
 0.0,
 0.06196542448391175,
 0.07889335612806661,
 0.19024550168531013,
 0.08651119051499069,
 0.0,
 0.16399717686689624,
 0.0,
 0.0,
 0.2602556606661072,
 0.0,
 0.19663651734251553,
 0.09970130483052396,
 0.0985651

In [89]:
token_relevance = pd.DataFrame(sorted(zip(tfidf_feature_names,user_profiles[36].flatten().tolist())), columns=['token', 'relevance']) 

In [90]:
token_relevance.head(20)

Unnamed: 0,token,relevance
0,action,0.263299
1,action adventure,0.027556
2,action animation,0.0
3,action comedy,0.07462
4,action crime,0.0
5,action drama,0.0
6,action horror,0.056146
7,action sci,0.0
8,action thriller,0.150947
9,adventure,0.092681


In [91]:
## Sort by highest order of relevance

token_relevance = token_relevance.sort_values(by='relevance', ascending=False)

In [92]:
token_relevance.head(30)

Unnamed: 0,token,relevance
42,drama,0.408248
86,thriller,0.376396
24,comedy,0.373738
0,action,0.263299
79,romance,0.260256
47,drama mystery,0.241973
81,romance thriller,0.196637
34,crime,0.191382
73,mystery,0.190246
76,mystery thriller,0.163997


In [93]:
#Compute the cosine similarity between the user profile and all item profiles
person_id = 36
cosine_similarities = cosine_similarity(user_profiles[person_id], tfidf_matrix)
cosine_similarities 

array([[ 0.13811655,  0.05289015,  0.27349082, ...,  0.09268117,
         0.        ,  0.07990969]])

In [94]:
cosine_similarities.shape

(1, 27278)

In [95]:
#Gets the top similar items
topn = 15
similar_indices = cosine_similarities.argsort().flatten()[-topn:]
similar_indices

array([14157,  1712,  6841,  2717,  9038, 24334,  9791, 16419,  1726,
       24335, 24336,  8611, 12003, 24664,  9422], dtype=int64)

In [96]:
#Sort the similar items by similarity
similar_items = sorted([(item_ids[i], cosine_similarities[0,i]) for i in similar_indices], key=lambda x: -x[1])
similar_items

[(82915, 0.63002040523487368),
 (1799, 0.63002040523487368),
 (115335, 0.63002040523487368),
 (115337, 0.63002040523487368),
 (26104, 0.63002040523487368),
 (54262, 0.63002040523487368),
 (116698, 0.63002040523487368),
 (27674, 0.63002040523487368),
 (115333, 0.58738668892917878),
 (31921, 0.58738668892917878),
 (71033, 0.57836218157594188),
 (1783, 0.57836218157594188),
 (6953, 0.57836218157594188),
 (2803, 0.57836218157594188),
 (26761, 0.57836218157594188)]

In [98]:
items_to_ignore = interactions_movies_df[interactions_movies_df['userId'] == person_id]['movieId'].tolist()
#items_to_ignore

In [99]:
similar_items_filtered = list(filter(lambda x: x[0] not in items_to_ignore, similar_items)) ##x[0] is the contentId, x[1] is cosine similarity
len(similar_items_filtered)

15

In [100]:
similar_items_filtered

[(82915, 0.63002040523487368),
 (1799, 0.63002040523487368),
 (115335, 0.63002040523487368),
 (115337, 0.63002040523487368),
 (26104, 0.63002040523487368),
 (54262, 0.63002040523487368),
 (116698, 0.63002040523487368),
 (27674, 0.63002040523487368),
 (115333, 0.58738668892917878),
 (31921, 0.58738668892917878),
 (71033, 0.57836218157594188),
 (1783, 0.57836218157594188),
 (6953, 0.57836218157594188),
 (2803, 0.57836218157594188),
 (26761, 0.57836218157594188)]

In [101]:
content_rec_df = pd.DataFrame(similar_items_filtered, columns=['movieId', 'recStrength']).head(10)

In [102]:
content_rec_df

Unnamed: 0,movieId,recStrength
0,82915,0.63002
1,1799,0.63002
2,115335,0.63002
3,115337,0.63002
4,26104,0.63002
5,54262,0.63002
6,116698,0.63002
7,27674,0.63002
8,115333,0.587387
9,31921,0.587387


In [103]:
## Merge movies_df with content_rec_df

content_rec_df_completed = content_rec_df.merge(movies_df[['movieId', 'title', 'genres']], how = 'left', left_on = 'movieId',
               right_on = 'movieId')

In [104]:
## Recommended content for personId: 36
content_rec_df_completed

Unnamed: 0,movieId,recStrength,title,genres
0,82915,0.63002,"Tiger's Tail, The (2006)",Comedy|Crime|Drama|Mystery|Thriller
1,1799,0.63002,Suicide Kings (1997),Comedy|Crime|Drama|Mystery|Thriller
2,115335,0.63002,Charlie Chan in Paris (1935),Comedy|Crime|Drama|Mystery|Thriller
3,115337,0.63002,Charlie Chan in Reno (1939),Comedy|Crime|Drama|Mystery|Thriller
4,26104,0.63002,Murder at the Gallop (1963),Comedy|Crime|Drama|Mystery|Thriller
5,54262,0.63002,Murder Most Foul (1964),Comedy|Crime|Drama|Mystery|Thriller
6,116698,0.63002,Dead Men Tell (1941),Comedy|Crime|Drama|Mystery|Thriller
7,27674,0.63002,11:14 (2003),Comedy|Crime|Drama|Mystery|Thriller
8,115333,0.587387,Charlie Chan in Panama (1940),Adventure|Comedy|Crime|Drama|Mystery|Thriller
9,31921,0.587387,"Seven-Per-Cent Solution, The (1976)",Adventure|Comedy|Crime|Drama|Mystery|Thriller


In [114]:
interactions_movies_df[interactions_movies_df['userId']==36].groupby(['genres'])[['movieId']].count()

Unnamed: 0_level_0,movieId
genres,Unnamed: 1_level_1
Action|Adventure|Comedy|Fantasy,1
Action|Comedy|Crime|Drama|Thriller,1
Action|Horror|Sci-Fi|Thriller,1
Action|Romance|War|Western,1
Action|Romance|Western,1
Action|Thriller,1
Adventure|Romance|Thriller,1
Comedy,2
Comedy|Crime,1
Comedy|Fantasy|Romance,1


### Define the above in a class

In [None]:
class ContentBasedRecommender:
    
    MODEL_NAME = 'Content-Based'
    
    def __init__(self, items_df=None):
        self.item_ids = item_ids
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def get_similar_items_to_user_profile(self, user_id, topn=1000):
        
        #Computes the cosine similarity between the user profile and all item profiles
        cosine_similarities = cosine_similarity(user_profiles[user_id], tfidf_matrix)
        
        #Gets the top similar items
        similar_indices = cosine_similarities.argsort().flatten()[-topn:]
        
        #Sort the similar items by similarity
        similar_items = sorted([(item_ids[i], cosine_similarities[0,i]) for i in similar_indices], key=lambda x: -x[1])
        
        return similar_items
        
    def content_recommend_items(self, user_id, articles_df, items_to_ignore=[], topn=10, verbose=False):
        similar_items = self.get_similar_items_to_user_profile(user_id)
        
        #Ignores items the user has already interacted
        similar_items_filtered = list(filter(lambda x: x[0] not in items_to_ignore, similar_items))
        
        rec_df = pd.DataFrame(similar_items_filtered, columns=['movieId', 'recStrength']).head(topn)

       
       # if verbose:
       #     if self.items_df is None:
       #         raise Exception('"items_df" is required in verbose mode')

       #     recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
       #                                                   left_on = 'contentId', 
       #                                                   right_on = 'contentId')[['recStrength', 'contentId', 'title', 'url', 'lang']]
       

        content_recommendation_df = rec_df.merge(movies_df[['movieId', 'title', 'genres']], how = 'left', left_on = 'movieId',
               right_on = 'movieId')
    
        return content_recommendation_df

In [None]:
cont_rec = ContentBasedRecommender()

In [None]:
user_id = 36

## get list of already interacted content ids (in order to avoid coming up in the recommendation list)
ignore_items = interactions_movies_df[interactions_movies_df['userId'] == user_id]['movieId'].tolist()

recommendation_df = cont_rec.content_recommend_items(user_id, movies_df, ignore_items)

In [None]:
recommendation_df

In [None]:
## Movies watched by user_id = 36
interactions_movies_df[interactions_movies_df['userId'] == 36][['rating', 'title_1', 'genres']].sort_values(by='rating', ascending=False)

In [None]:
user_id = 1

## get list of already interacted content ids (in order to avoid coming up in the recommendation list)
ignore_items = interactions_movies_df[interactions_movies_df['userId'] == user_id]['movieId'].tolist()

recommendation_df = cont_rec.content_recommend_items(user_id, movies_df, ignore_items)
recommendation_df

### Collaborative Filtering - Matrix Factorization

In [115]:
interactions_movies_df.head()

Unnamed: 0,userId,movieId,rating,title_1,year,genres
0,1,2,3.5,Jumanji,1995,Adventure|Children|Fantasy
1,1,29,3.5,"City of Lost Children, The (Cité des enfants p...",1995,Adventure|Drama|Fantasy|Mystery|Sci-Fi
2,1,32,3.5,Twelve Monkeys (a.k.a. 12 Monkeys),1995,Mystery|Sci-Fi|Thriller
3,1,47,3.5,Seven (a.k.a. Se7en),1995,Mystery|Thriller
4,1,50,3.5,"Usual Suspects, The",1995,Crime|Mystery|Thriller


In [116]:
#Creating a sparse pivot table with users in rows and items in columns
users_items_pivot_matrix_df = interactions_movies_df.pivot(index='userId', 
                                                          columns='movieId', 
                                                          values='rating').fillna(0)

users_items_pivot_matrix_df.head(10)

movieId,1,2,3,4,5,6,7,8,9,10,...,112623,112852,113453,114180,115617,116797,117511,117590,118696,125916
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,5.0,0.0,3.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,4.0,0.0,5.0,0.0,0.0,3.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [117]:
users_items_pivot_matrix_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 156 entries, 1 to 156
Columns: 4192 entries, 1 to 125916
dtypes: float64(4192)
memory usage: 5.0 MB


In [118]:
users_items_pivot_matrix = users_items_pivot_matrix_df.as_matrix()
users_items_pivot_matrix#[:10]

array([[ 0. ,  3.5,  0. , ...,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  4. , ...,  0. ,  0. ,  0. ],
       [ 4. ,  0. ,  0. , ...,  0. ,  0. ,  0. ],
       ..., 
       [ 0. ,  0. ,  0. , ...,  0. ,  0. ,  0. ],
       [ 2.5,  0. ,  0. , ...,  0. ,  0. ,  0. ],
       [ 5. ,  5. ,  2. , ...,  0. ,  0. ,  0. ]])

In [119]:
type(users_items_pivot_matrix)

numpy.ndarray

In [120]:
users_ids = list(users_items_pivot_matrix_df.index)
users_ids[:10]

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [121]:
## An important decision is the number of factors to factor the user-item matrix. 
## The higher the number of factors, the more precise is the factorization in the original matrix reconstructions.
## Therefore, if the model is allowed to memorize too much details of the original matrix,
## it may not generalize well for data it was not trained on.

## Reducing the number of factors increases the model generalization.

In [122]:
users_items_pivot_matrix.shape

(156, 4192)

In [123]:
#The number of factors to factor the user-item matrix.
NUMBER_OF_FACTORS_MF = 50

In [124]:
#Performs matrix factorization of the original user item matrix

U, sigma, Vt = svds(users_items_pivot_matrix, k = NUMBER_OF_FACTORS_MF)

In [125]:
U.shape

(156, 50)

In [126]:
Vt.shape

(50, 4192)

In [127]:
sigma.shape

(50,)

In [128]:
#U

In [129]:
#Vt

In [130]:
#sigma

In [131]:
sigma = np.diag(sigma)
sigma.shape

(50, 50)

In [132]:
#sigma

In [133]:
## After the factorization, we try to to reconstruct the original matrix by multiplying its factors. 
## The resulting matrix is not sparse any more.

## It generated predictions for items the user has not yet interacted, which we will exploit for recommendations.

In [134]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt)

#all_user_predicted_ratings

In [135]:
all_user_predicted_ratings.shape

(156, 4192)

In [138]:
#Converting the reconstructed matrix back to a Pandas dataframe
cf_preds_df = pd.DataFrame(all_user_predicted_ratings, columns = users_items_pivot_matrix_df.columns, index=users_ids).transpose()
cf_preds_df.head(10)

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,147,148,149,150,151,152,153,154,155,156
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.599611,0.698747,4.453231,0.679727,2.168952,3.296606,-0.310101,1.74257,0.005556,0.681066,...,4.760476,0.853245,1.762139,0.025119,0.41881,0.320853,-0.181354,0.414455,0.74848,4.754336
2,3.262585,0.380625,0.003297,0.289157,0.499934,0.287561,0.495997,1.193663,0.083512,-0.206406,...,-0.029974,-0.491637,0.645078,0.114302,0.16109,1.055905,0.059698,-0.191357,0.265448,4.135406
3,-0.101087,1.030881,-0.312974,0.161806,1.585538,2.117141,3.17205,0.717718,0.023801,0.235253,...,0.076916,-0.110658,0.397496,0.028003,0.148002,-0.176604,-0.015627,0.21632,0.396518,2.150614
4,-0.186174,0.220043,0.069618,-0.216058,-0.381269,0.139844,0.061206,-0.017122,0.007479,-0.082773,...,-0.084542,-0.025495,-0.147291,0.072158,-0.083188,0.058978,0.150028,0.026279,0.242424,2.1798
5,-0.06195,0.364409,-0.320773,0.240251,0.462498,0.827814,-0.022522,0.578909,0.008623,-0.138768,...,0.168842,-0.090138,0.223516,0.031387,0.14575,-0.461182,-0.073557,0.158827,-0.063693,2.002719
6,-0.430448,0.850223,-0.120608,0.441755,0.972962,1.726309,-0.313508,0.959346,-0.059669,0.280324,...,3.839314,0.666961,0.052517,-0.069954,0.146876,-0.227345,0.272619,5.293265,0.844549,4.311804
7,0.117004,0.613558,-0.177138,-0.137557,1.051998,2.391056,3.57482,0.12949,-0.015731,-0.037278,...,-0.005258,3.25443,0.166188,-0.121475,0.11966,-0.671689,-0.351043,-0.253093,0.201336,4.190984
8,-0.003157,0.03525,-0.024831,0.038769,0.09414,0.030632,0.059857,0.146352,0.034943,0.019137,...,-0.005716,0.118346,0.037616,-0.007123,0.02499,-0.020638,0.033221,-0.04576,-0.033029,0.125666
9,0.117493,0.231167,0.032385,0.133503,-0.143718,0.11062,-0.050339,0.663361,-0.050998,0.138656,...,0.029639,-0.242202,0.065592,-0.021912,0.135259,-0.449867,-0.477942,-0.071198,0.077894,3.374176
10,-0.278241,0.485076,0.164935,0.8979,0.855678,0.305443,-0.183951,2.429605,-0.024207,-0.068663,...,0.145947,0.394779,1.022208,-0.088922,0.264486,0.539356,0.249012,4.561524,0.285001,3.131134


In [139]:
cf_preds_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4192 entries, 1 to 125916
Columns: 156 entries, 1 to 156
dtypes: float64(156)
memory usage: 5.2 MB


In [140]:
user_id = 1

cf_preds_df[user_id].sort_values(ascending=False).head()

movieId
5952    5.312800
7153    5.247765
4993    4.976275
260     4.707425
4306    4.699117
Name: 1, dtype: float64

In [143]:
sorted_user_predictions = cf_preds_df[user_id].sort_values(ascending=False).reset_index().rename(columns={user_id: 'recStrength'})
sorted_user_predictions.head(5)

Unnamed: 0,movieId,recStrength
0,5952,5.3128
1,7153,5.247765
2,4993,4.976275
3,260,4.707425
4,4306,4.699117


In [144]:
user_predictions = sorted_user_predictions.merge(movies_df[['movieId', 'title', 'genres']], how = 'left', left_on = 'movieId',
               right_on = 'movieId')
user_predictions.head()

Unnamed: 0,movieId,recStrength,title,genres
0,5952,5.3128,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy
1,7153,5.247765,"Lord of the Rings: The Return of the King, The...",Action|Adventure|Drama|Fantasy
2,4993,4.976275,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy
3,260,4.707425,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
4,4306,4.699117,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Ro...


In [145]:
# Recommend the highest predicted rating movies that the user hasn't seen yet.

## get list of already interacted content ids (in order to avoid coming up in the recommendation list)
collab_ignore_items = interactions_movies_df[interactions_movies_df['userId'] == user_id]['movieId'].tolist()
#collab_ignore_items

recommendations_df = user_predictions[~user_predictions['movieId'].isin(collab_ignore_items)] \
                               .sort_values('recStrength', ascending = False) \
                               .head(10).reset_index(drop=True)

In [146]:
recommendations_df

Unnamed: 0,movieId,recStrength,title,genres
0,2329,0.840369,American History X (1998),Crime|Drama
1,1732,0.825409,"Big Lebowski, The (1998)",Comedy|Crime
2,141,0.735987,"Birdcage, The (1996)",Comedy
3,7361,0.731191,Eternal Sunshine of the Spotless Mind (2004),Drama|Romance|Sci-Fi
4,1207,0.654387,To Kill a Mockingbird (1962),Drama
5,2161,0.652717,"NeverEnding Story, The (1984)",Adventure|Children|Fantasy
6,6377,0.629295,Finding Nemo (2003),Adventure|Animation|Children|Comedy
7,2987,0.601606,Who Framed Roger Rabbit? (1988),Adventure|Animation|Children|Comedy|Crime|Fant...
8,1,0.599611,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
9,1517,0.546456,Austin Powers: International Man of Mystery (1...,Action|Adventure|Comedy


### Use correlation

In [None]:
#Creating a sparse pivot table with users in rows and items in columns
users_items_pivot_matrix_df_1 = interactions_movies_df.pivot(index='userId', 
                                                          columns='movieId', 
                                                          values='rating').fillna(0)

users_items_pivot_matrix_df_1.head(10)

In [None]:
movie_id = 1

movie_watched = users_items_pivot_matrix_df_1[movie_id]

In [None]:
similarity_with_other_movies = users_items_pivot_matrix_df_1.corrwith(movie_watched)  # find correlation between 1 and other movies
similarity_with_other_movies.head(10)

In [None]:
similarity_df = pd.DataFrame(data=similarity_with_other_movies, index=users_items_pivot_matrix_df_1.columns,columns=['Correlation'])

In [None]:
similarity_df.head(10)

In [None]:
similarity_df.reset_index(inplace=True)
similarity_df.head()
#similarity_with_other_movies.sort_values(ascending=False)#.reset_index().rename(columns={: 'recStrength'})
#similarity_with_other_movies.head()

In [None]:
similarity_df.sort_values(by='Correlation', inplace=True, ascending=False)
similarity_df.head()

In [None]:
sim_df = similarity_df.merge(movies_df[['movieId', 'title', 'genres']], how = 'left', left_on = 'movieId',
               right_on = 'movieId')
sim_df.head()

In [None]:
# Recommend the highest predicted rating movies that the user hasn't seen yet.
user_id = 36
## get list of already interacted content ids (in order to avoid coming up in the recommendation list)
collab_ignore_items_1 = interactions_movies_df[interactions_movies_df['userId'] == user_id]['movieId'].tolist()
#collab_ignore_items


In [None]:
collab_recommendations_df_1 = sim_df[~sim_df['movieId'].isin(collab_ignore_items_1)] \
                               .sort_values('Correlation', ascending = False) \
                               .head(10)

In [None]:
collab_recommendations_df_1