In [1]:
## https://www.kaggle.com/kanncaa1/recommendation-systems-tutorial/data
## https://www.kaggle.com/gspmoreira/recommender-systems-in-python-101

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import scipy as scipy
import math as math
import random as random
import sklearn

from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds

%matplotlib inline

In [3]:
movies_df = pd.read_csv('movie.csv')

In [4]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Derive a clean title and a release year from strings such as "Toy Story (1995)".
# Raw strings are used so that the regex escapes (\( and \d) are not parsed as
# (invalid) Python string escapes, which newer interpreters warn about.
# The title is everything before the first "(<digit>"; .str.strip() removes the
# space that precedes the parenthesis.
movies_df['title_1'] = movies_df['title'].str.split(r"\(\d", expand=True)[0].str.strip()
# The year is the text inside the last parenthesised group that ends in four digits;
# titles without such a group get NaN.
movies_df['year'] = movies_df['title'].str.extract(r'.*\((.*\d{4})\).*', expand=True)
movies_df.head(30)

Unnamed: 0,movieId,title,genres,title_1,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale,1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,1995
5,6,Heat (1995),Action|Crime|Thriller,Heat,1995
6,7,Sabrina (1995),Comedy|Romance,Sabrina,1995
7,8,Tom and Huck (1995),Adventure|Children,Tom and Huck,1995
8,9,Sudden Death (1995),Action,Sudden Death,1995
9,10,GoldenEye (1995),Action|Adventure|Thriller,GoldenEye,1995


In [6]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 5 columns):
movieId    27278 non-null int64
title      27278 non-null object
genres     27278 non-null object
title_1    27278 non-null object
year       27257 non-null object
dtypes: int64(1), object(4)
memory usage: 1.0+ MB


In [7]:
interactions_df = pd.read_csv('rating_1.csv')
interactions_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,4/2/2005 23:53
1,1,29,3.5,4/2/2005 23:31
2,1,32,3.5,4/2/2005 23:33
3,1,47,3.5,4/2/2005 23:32
4,1,50,3.5,4/2/2005 23:29


In [8]:
# The timestamp is not used by any model below. Re-binding the name (instead of
# inplace=True) together with errors='ignore' keeps this cell idempotent: running
# it a second time no longer raises KeyError because the column is already gone.
interactions_df = interactions_df.drop('timestamp', axis=1, errors='ignore')

In [9]:
interactions_df.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


In [10]:
interactions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 3 columns):
userId     20000 non-null int64
movieId    20000 non-null int64
rating     20000 non-null float64
dtypes: float64(1), int64(2)
memory usage: 468.8 KB


In [11]:
# Keep at most the first 20000 ratings (all columns) to keep the notebook fast.
interactions_df = interactions_df.head(20000)

In [12]:
interactions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 3 columns):
userId     20000 non-null int64
movieId    20000 non-null int64
rating     20000 non-null float64
dtypes: float64(1), int64(2)
memory usage: 468.8 KB


In [13]:
print(interactions_df['userId'].nunique())
print(interactions_df['movieId'].nunique())

156
4192


In [14]:
## Attach movie metadata (clean title, year, genres) to every rating.
## A left join keeps every rating row even if its movieId is missing from movies_df.

movie_metadata = movies_df[['movieId', 'title_1', 'year', 'genres']]
interactions_movies_df = interactions_df.merge(movie_metadata, how='left', on='movieId')

In [15]:
interactions_movies_df.head()

Unnamed: 0,userId,movieId,rating,title_1,year,genres
0,1,2,3.5,Jumanji,1995,Adventure|Children|Fantasy
1,1,29,3.5,"City of Lost Children, The (Cité des enfants p...",1995,Adventure|Drama|Fantasy|Mystery|Sci-Fi
2,1,32,3.5,Twelve Monkeys (a.k.a. 12 Monkeys),1995,Mystery|Sci-Fi|Thriller
3,1,47,3.5,Seven (a.k.a. Se7en),1995,Mystery|Thriller
4,1,50,3.5,"Usual Suspects, The",1995,Crime|Mystery|Thriller


In [16]:
interactions_movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20000 entries, 0 to 19999
Data columns (total 6 columns):
userId     20000 non-null int64
movieId    20000 non-null int64
rating     20000 non-null float64
title_1    20000 non-null object
year       20000 non-null object
genres     20000 non-null object
dtypes: float64(1), int64(2), object(3)
memory usage: 1.1+ MB


### Split interactions_movies_df into train and test set

In [17]:
# Hold out 20% of the ratings for evaluation. Stratifying on userId keeps each
# user's share of ratings roughly equal in both sets; random_state makes the
# split reproducible.
train_df, test_df = train_test_split(interactions_movies_df, stratify=interactions_movies_df['userId'], test_size=0.2, random_state=42)

In [18]:
print(train_df.shape, test_df.shape)

(16000, 6) (4000, 6)


In [19]:
print(train_df['userId'].nunique(), test_df['userId'].nunique())

156 156


In [20]:
print(train_df['movieId'].nunique(), test_df['movieId'].nunique())

3826 1906


## Popularity Model

In [21]:
# Popularity score of a movie = sum of all ratings it received.
# NOTE(review): the aggregation runs over the full interactions frame, i.e. it
# includes the rows that were held out in test_df.
interactions_movies_grp_rating = (
    interactions_movies_df
    .groupby(['movieId', 'title_1'])[['rating']]
    .sum()
    .reset_index()
    .rename(columns={'rating': 'TotalRating'})
)
interactions_movies_grp_rating.head()

Unnamed: 0,movieId,title_1,TotalRating
0,1,Toy Story,221.5
1,2,Jumanji,60.0
2,3,Grumpier Old Men,59.5
3,4,Waiting to Exhale,8.0
4,5,Father of the Bride Part II,35.5


In [22]:
# Sort by TotalRating in descending order so the most popular movie is row 0.
# PopularityRecommender.recommend_items relies on this ordering when it takes
# the first `topn` unseen rows.

popular_movies_df = interactions_movies_grp_rating.sort_values(by='TotalRating', ascending=False).reset_index(drop=True)
popular_movies_df.head(5)

Unnamed: 0,movieId,title_1,TotalRating
0,296,Pulp Fiction,339.5
1,356,Forrest Gump,324.0
2,318,"Shawshank Redemption, The",312.5
3,593,"Silence of the Lambs, The",273.0
4,480,Jurassic Park,266.5


In [23]:
class PopularityRecommender:
    """Non-personalised baseline: recommends the most popular unseen movies.

    Attributes:
        popularity_df: frame with 'movieId', 'title_1' and 'TotalRating'
            columns, expected to be sorted most-popular-first (the first
            `topn` unseen rows are returned as-is).
        movies_df: catalogue frame providing 'movieId', 'year' and 'genres'.
    """

    MODEL_NAME = 'Popularity'

    def __init__(self, popularity_df, movies_df):
        self.popularity_df = popularity_df
        self.movies_df = movies_df

    def get_model_name(self):
        """Return the display name of this model."""
        return self.MODEL_NAME

    def recommend_items(self, userid, already_watched=None, topn=10):
        """Return the `topn` most popular movies the user has not watched.

        Args:
            userid: accepted for interface parity with the other recommenders;
                the popularity ranking does not depend on it.
            already_watched: iterable of movieIds to exclude (list, set, ...).
                ``None`` (the default) excludes nothing.
            topn: maximum number of rows to return.

        Returns:
            DataFrame with columns
            ['movieId', 'title_1', 'year', 'genres', 'TotalRating', 'Rank'].
        """
        # A None sentinel avoids sharing one mutable default list across calls.
        watched = [] if already_watched is None else already_watched

        unseen = self.popularity_df[~self.popularity_df['movieId'].isin(watched)]
        # .copy() gives us an independent frame, so adding 'Rank' neither
        # triggers SettingWithCopyWarning nor touches self.popularity_df.
        top = unseen.head(topn).reset_index(drop=True).copy()
        top['Rank'] = top['TotalRating'].rank(ascending=False, method='first')

        # Use the catalogue handed to the constructor rather than a notebook global.
        catalogue = self.movies_df[['movieId', 'year', 'genres']]
        enriched = top.merge(catalogue, how='left', on='movieId')
        return enriched[['movieId', 'title_1', 'year', 'genres', 'TotalRating', 'Rank']]
    

### Instantiate the PopularityRecommender class

In [24]:
popular_rec_model = PopularityRecommender(popular_movies_df, movies_df)

### Call recommend_items for user in train df

In [25]:
users_train = train_df['userId'].tolist()
train_userid = users_train[1]

## movies already watched by train_userid
already_watched_train = train_df[train_df['userId']==train_userid]['movieId'].tolist()


print("Popular movie recommendations for user %d in train_df: " %train_userid)
popular_rec_model.recommend_items(train_userid, already_watched_train)

Popular movie recommendations for user 131 in train_df: 


Unnamed: 0,movieId,title_1,year,genres,TotalRating,Rank
0,318,"Shawshank Redemption, The",1994,Crime|Drama,312.5,1.0
1,593,"Silence of the Lambs, The",1991,Crime|Horror|Thriller,273.0,2.0
2,260,Star Wars: Episode IV - A New Hope,1977,Action|Adventure|Sci-Fi,231.0,3.0
3,50,"Usual Suspects, The",1995,Crime|Mystery|Thriller,228.5,4.0
4,608,Fargo,1996,Comedy|Crime|Drama|Thriller,228.5,5.0
5,47,Seven (a.k.a. Se7en),1995,Mystery|Thriller,217.0,6.0
6,589,Terminator 2: Judgment Day,1991,Action|Sci-Fi,216.5,7.0
7,527,Schindler's List,1993,Drama|War,210.5,8.0
8,590,Dances with Wolves,1990,Adventure|Drama|Western,200.5,9.0
9,457,"Fugitive, The",1993,Thriller,199.5,10.0


In [26]:
print("Movies already watched by user %d in train_df " %train_userid)
train_df[train_df['userId']==train_userid][['movieId', 'year', 'rating', 'title_1', 'genres']].sort_values(by='year', ascending=False).head(5)

Movies already watched by user 131 in train_df 


Unnamed: 0,movieId,year,rating,title_1,genres
16125,67788,2009,3.0,Confessions of a Shopaholic,Comedy|Romance
16130,69644,2009,2.5,Ice Age: Dawn of the Dinosaurs,Action|Adventure|Animation|Children|Comedy|Rom...
16131,71264,2009,3.0,Cloudy with a Chance of Meatballs,Animation|Children|Fantasy|IMAX
16128,69406,2009,1.0,"Proposal, The",Comedy|Romance
16122,65585,2009,1.5,Bride Wars,Comedy|Romance


### Call recommend_items for SAME user in test df

In [27]:
#users_test = test_df['userId'].tolist()
#test_userid = 131 #users_test[1]

## movies already watched by train_userid
already_watched_test = test_df[test_df['userId']==train_userid]['movieId'].tolist()


print("Popular movie recommendations for user in test_df: ", train_userid)
popular_rec_model.recommend_items(train_userid, already_watched_test)

Popular movie recommendations for user in test_df:  131


Unnamed: 0,movieId,title_1,year,genres,TotalRating,Rank
0,296,Pulp Fiction,1994,Comedy|Crime|Drama|Thriller,339.5,1.0
1,356,Forrest Gump,1994,Comedy|Drama|Romance|War,324.0,2.0
2,318,"Shawshank Redemption, The",1994,Crime|Drama,312.5,3.0
3,480,Jurassic Park,1993,Action|Adventure|Sci-Fi|Thriller,266.5,4.0
4,260,Star Wars: Episode IV - A New Hope,1977,Action|Adventure|Sci-Fi,231.0,5.0
5,50,"Usual Suspects, The",1995,Crime|Mystery|Thriller,228.5,6.0
6,608,Fargo,1996,Comedy|Crime|Drama|Thriller,228.5,7.0
7,2858,American Beauty,1999,Comedy|Drama,223.0,8.0
8,1,Toy Story,1995,Adventure|Animation|Children|Comedy|Fantasy,221.5,9.0
9,589,Terminator 2: Judgment Day,1991,Action|Sci-Fi,216.5,10.0


In [28]:
print("Movies already watched by user in test_df: ", train_userid)
test_df[test_df['userId']==train_userid][['movieId', 'year', 'rating', 'title_1', 'genres']].sort_values(by='year', ascending=False).head(5)

Movies already watched by user in test_df:  131


Unnamed: 0,movieId,year,rating,title_1,genres
16112,62265,2009,0.5,"Accidental Husband, The",Comedy|Romance
16127,69122,2009,2.0,"Hangover, The",Comedy|Crime
16129,69436,2009,2.5,Year One,Adventure|Comedy
16126,68954,2009,2.5,Up,Adventure|Animation|Children|Drama
16120,64032,2008,0.5,Four Christmases,Comedy


In [29]:
print("Total number of movies watched by user %d in test_df is %d " %(train_userid, test_df[test_df['userId']==train_userid].shape[0]))

Total number of movies watched by user 131 in test_df is 86 


In [30]:
#TP = Predicted movies that user likes

#TP + FN = Movies that are actually watched by user
# recall = TP / (TP+FN)

In [31]:
## 1. Get the list of items that a user has interacted with in the full interactions data set (train + test)
## 2. Get the recommended items from the model (popularity, content-based, collaborative filtering)
## 3. Get the items interacted with in the test set
## 3.1A For every item in the interacted_test_set
## 3.1B Look it up among the items that were recommended (#2)
## 3.1C If the item in the interacted_test_set is in the top 5 or top 10 of the recommended_list, count it as hits_5 or hits_10 respectively.
## 4. Calculate recall_5 = hits_5 / total count of interacted_test_set and recall_10 = hits_10 / total count of interacted_test_set

In [32]:
class ModelEvaluator:
    """Computes Recall@10 of a recommender against held-out interactions.

    For every user, the model is asked for 20 recommendations with the user's
    training movies excluded; a "hit" is a held-out (test) movie that appears
    among the first 10 of those recommendations.

    Args:
        train_interactions: optional frame with 'userId' and 'movieId' columns
            used to exclude already-seen movies. Defaults to the notebook-level
            ``train_df``, looked up when an evaluation runs.
        test_interactions: optional frame with the same columns holding the
            held-out interactions. Defaults to the notebook-level ``test_df``.
    """

    def __init__(self, train_interactions=None, test_interactions=None):
        self.train_interactions = train_interactions
        self.test_interactions = test_interactions

    def _train_frame(self):
        """Return the explicit training frame, or fall back to the global train_df."""
        return train_df if self.train_interactions is None else self.train_interactions

    def _test_frame(self):
        """Return the explicit test frame, or fall back to the global test_df."""
        return test_df if self.test_interactions is None else self.test_interactions

    @staticmethod
    def _hits_at_k(rec_items, relevant_items, k):
        """Count how many of the first `k` recommendations are in `relevant_items`."""
        return sum(1 for movie_id in rec_items[:k] if movie_id in relevant_items)

    @staticmethod
    def _recall_pct(hits, total):
        """Format hits/total as a percentage string; '0.00' when total is 0."""
        ratio = (hits / total) if total else 0.0
        return "{:.2f}".format(ratio * 100)

    def already_watched(self, df, userid):
        """Return the set of movieIds that `userid` has interacted with in `df`."""
        return set(df.loc[df['userId'] == userid, 'movieId'])

    def evaluate_model_for_user(self, model, userid):
        """Evaluate `model` for a single user.

        Returns:
            dict with 'hits@10_count' (int), 'test_interacted_count' (int) and
            'recall@10' (percentage formatted as a two-decimal string).
            A user with no held-out interactions gets a recall of '0.00'
            instead of raising ZeroDivisionError.
        """
        seen_in_train = self.already_watched(self._train_frame(), userid)
        rec_df = model.recommend_items(userid, seen_in_train, topn=20)
        rec_items = rec_df['movieId'].values

        held_out = self.already_watched(self._test_frame(), userid)
        # Single pass over the top-10 with O(1) set membership, instead of
        # comparing every test movie against every recommendation.
        hits_10 = self._hits_at_k(rec_items, held_out, 10)

        return {'hits@10_count': hits_10,
                'test_interacted_count': len(held_out),
                'recall@10': self._recall_pct(hits_10, len(held_out))}

    def evaluate_model(self, model):
        """Evaluate `model` for every user present in the test interactions.

        Returns:
            (global_metrics, detailed_results_df) where global_metrics is a
            dict with 'modelName' and the pooled 'recall@10' (two-decimal
            string), and detailed_results_df holds one row per user sorted by
            'test_interacted_count' in descending order.
        """
        print('Running evaluation for users')
        metrics = []
        for user_id in self._test_frame()['userId'].unique():
            user_metrics = self.evaluate_model_for_user(model, user_id)
            user_metrics['_person_id'] = user_id
            metrics.append(user_metrics)

        # len(metrics) is well-defined even when there are no users at all.
        print('Number of users processed: ', len(metrics))

        detailed_results_df = pd.DataFrame(metrics)
        total_hits = 0
        total_interacted = 0
        if not detailed_results_df.empty:
            detailed_results_df = (detailed_results_df
                                   .sort_values('test_interacted_count', ascending=False)
                                   .reset_index(drop=True))
            total_hits = detailed_results_df['hits@10_count'].sum()
            total_interacted = detailed_results_df['test_interacted_count'].sum()

        # Pooled recall: all hits over all held-out interactions (not a mean of per-user recalls).
        global_metrics = {'modelName': model.get_model_name(),
                          'recall@10': self._recall_pct(total_hits, total_interacted)}

        return global_metrics, detailed_results_df
    

In [33]:
model_eval = ModelEvaluator()

In [34]:
print('Evaluating Popularity recommendation model...')
pop_global_metrics, pop_detailed_results_df = model_eval.evaluate_model(popular_rec_model)

Evaluating Popularity recommendation model...
Running evaluation for users
Number of users processed:  156


In [35]:
pop_global_metrics

{'modelName': 'Popularity', 'recall@10': '5.88'}

In [36]:
pop_detailed_results_df.head()

Unnamed: 0,_person_id,hits@10_count,recall@10,test_interacted_count
0,116,7,3.15,222
1,104,4,2.0,200
2,54,7,4.93,142
3,91,10,7.52,133
4,58,3,2.29,131


In [37]:
#pop_detailed_results_df[pop_detailed_results_df['_person_id'] == 131]

In [38]:
#already_watched_131 = train_df[train_df['userId']==131]['movieId'].tolist()
#rec_df_131 = popular_rec_model.recommend_items(131, already_watched_131)
#test_interacted_movies_131 = set(test_df[test_df['userId']==131]['movieId'])
#len(test_interacted_movies_131)

#hits = 0
#for item_id in test_interacted_movies_131: ## 41 movies
#    valid_recs = rec_df_131['movieId'].values
    
#    for i, m in enumerate(valid_recs):
#        try:
#            if(m == item_id):
#                index = i
##                print(i, m, item_id)
#                if(index < 10):
#                    hits += 1
#        except:
#            index = -1
#print(hits, len(test_interacted_movies_131), hits/len(test_interacted_movies_131))        

## Content based Filtering model

#### The content-based filtering approach leverages descriptions or attributes of the items a user has interacted with to recommend similar items.
#### It depends only on the user's own previous choices, which makes this method robust against the cold-start problem.

In [39]:
# English stopwords (words that carry no semantics) are excluded from the vocabulary.
# NOTE(review): this presumably requires the NLTK 'stopwords' corpus to be available locally — confirm.
stopwords_list = stopwords.words('english')

# Configure (not yet fit) a TF-IDF vectorizer over unigrams and bigrams, ignoring stopwords.
# The vocabulary is capped at 5000 features; min_df / max_df are passed as floats.
# NOTE(review): no custom tokenizer is set, so multi-part genre labels such as "Sci-Fi"
# appear to be split into separate tokens (the printed feature names contain 'action sci') — confirm this is intended.
vectorizer = TfidfVectorizer(analyzer='word',
                     ngram_range= (1,2),
                     min_df=0.003,
                     max_df=0.5,
                     max_features=5000,
                     stop_words=stopwords_list)

In [40]:
#title_genre = pd.concat([movies_df['Title'], movies_df['Genre']])
#title_genre = movies_df['Title'] + " " + movies_df['Genre']
genres = movies_df['genres'].tolist()

In [41]:
#genres

In [42]:
tfidf_matrix = vectorizer.fit_transform(genres)

In [43]:
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); prefer the new method when it exists and fall back
# to the old one on older installations. The result is a plain list either way.
if hasattr(vectorizer, 'get_feature_names_out'):
    tfidf_feature_names = list(vectorizer.get_feature_names_out())
else:
    tfidf_feature_names = vectorizer.get_feature_names()

In [44]:
print(len(tfidf_feature_names))
print(tfidf_feature_names)

90
['action', 'action adventure', 'action animation', 'action comedy', 'action crime', 'action drama', 'action horror', 'action sci', 'action thriller', 'adventure', 'adventure animation', 'adventure children', 'adventure comedy', 'adventure crime', 'adventure drama', 'adventure fantasy', 'adventure sci', 'animation', 'animation children', 'animation comedy', 'children', 'children comedy', 'children drama', 'children fantasy', 'comedy', 'comedy crime', 'comedy documentary', 'comedy drama', 'comedy fantasy', 'comedy horror', 'comedy musical', 'comedy romance', 'comedy sci', 'comedy western', 'crime', 'crime drama', 'crime horror', 'crime mystery', 'crime thriller', 'documentary', 'documentary drama', 'documentary musical', 'drama', 'drama fantasy', 'drama film', 'drama horror', 'drama musical', 'drama mystery', 'drama romance', 'drama sci', 'drama thriller', 'drama war', 'drama western', 'fantasy', 'fantasy horror', 'fantasy musical', 'fantasy mystery', 'fantasy romance', 'fantasy sci',

In [45]:
item_ids = movies_df['movieId'].tolist()

In [46]:
interactions_movies_df.head()

Unnamed: 0,userId,movieId,rating,title_1,year,genres
0,1,2,3.5,Jumanji,1995,Adventure|Children|Fantasy
1,1,29,3.5,"City of Lost Children, The (Cité des enfants p...",1995,Adventure|Drama|Fantasy|Mystery|Sci-Fi
2,1,32,3.5,Twelve Monkeys (a.k.a. 12 Monkeys),1995,Mystery|Sci-Fi|Thriller
3,1,47,3.5,Seven (a.k.a. Se7en),1995,Mystery|Thriller
4,1,50,3.5,"Usual Suspects, The",1995,Crime|Mystery|Thriller


In [47]:
class ContentBasedRecommender:
    """Recommends movies whose TF-IDF genre profile is closest to the user's profile.

    Relies on the notebook-level objects ``tfidf_matrix`` (one row per movie),
    ``item_ids`` (movieId for each row of ``tfidf_matrix``) and
    ``interactions_movies_df`` (userId / movieId / rating rows).

    NOTE(review): user profiles are built from ``interactions_movies_df``, which
    is the full interaction set and therefore includes the rows held out in
    ``test_df`` — confirm this is intended before trusting the recall numbers.

    Attributes:
        movies_df: catalogue frame used to decorate recommendations with
            'title_1', 'year' and 'genres'. When None, the notebook-level
            ``movies_df`` is used instead.
    """

    MODEL_NAME = 'Content-Based'

    def __init__(self, movies_df=None):
        self.movies_df = movies_df
        # User profiles are expensive to build, so they are computed once on
        # first use and reused. Create a new instance to refresh them after
        # the underlying interactions or TF-IDF matrix change.
        self._user_profiles = None

    def get_model_name(self):
        """Return the display name of this model."""
        return self.MODEL_NAME

    def get_user_profiles(self):
        """Build a normalised, rating-weighted TF-IDF profile for every user.

        A user's profile is the average of the TF-IDF rows of the movies they
        rated, weighted by the rating, then L2-normalised.

        Returns:
            dict mapping userId -> array of shape (1, n_features).

        Raises:
            KeyError: if an interaction references a movieId absent from ``item_ids``.
        """
        # Imported here so the method does not depend on sklearn.preprocessing
        # having been loaded as a side effect of another import.
        from sklearn.preprocessing import normalize

        # movieId -> row of tfidf_matrix. setdefault keeps the FIRST position of
        # a repeated id, matching what list.index() returned, but in O(1).
        row_of_movie = {}
        for row, movie_id in enumerate(item_ids):
            row_of_movie.setdefault(movie_id, row)

        user_profiles = {}
        # groupby always yields a DataFrame per user, including users with a single rating.
        for userid, user_interactions in interactions_movies_df.groupby('userId', sort=False):
            strengths = np.asarray(user_interactions['rating']).reshape(-1, 1)  # n rows x 1 col

            item_profiles = scipy.sparse.vstack(
                [tfidf_matrix[row_of_movie[movie_id]] for movie_id in user_interactions['movieId']]
            )

            # Weighted average of item profiles by interaction strength.
            # np.asarray turns the np.matrix produced by the sparse sum into a
            # plain ndarray, which current scikit-learn versions require.
            weighted_sum = np.asarray(item_profiles.multiply(strengths).sum(axis=0))
            weighted_avg = weighted_sum / np.sum(strengths)

            user_profiles[userid] = normalize(weighted_avg)

        return user_profiles

    def get_similar_items_to_user_profile(self, user_id, topn=1000):
        """Return the `topn` movies most similar to the user's profile.

        Returns:
            list of (movieId, cosine_similarity) tuples, most similar first.

        Raises:
            KeyError: if `user_id` has no interactions (and thus no profile).
        """
        # Build all profiles once instead of on every call; the evaluator calls
        # this method once per user, which previously rebuilt every profile each time.
        if self._user_profiles is None:
            self._user_profiles = self.get_user_profiles()

        # Cosine similarity between the user profile and all item profiles.
        cosine_similarities = cosine_similarity(self._user_profiles[user_id], tfidf_matrix)

        # Row positions of the `topn` highest similarities (ascending order).
        similar_indices = cosine_similarities.argsort().flatten()[-topn:]

        # Sort the similar items by descending similarity.
        similar_items = sorted(
            [(item_ids[i], cosine_similarities[0, i]) for i in similar_indices],
            key=lambda pair: -pair[1],
        )

        return similar_items

    def recommend_items(self, user_id, items_to_ignore=None, topn=10, verbose=False):
        """Recommend up to `topn` movies for `user_id`.

        Args:
            user_id: user whose profile drives the recommendation.
            items_to_ignore: iterable of movieIds to exclude (e.g. already
                watched). ``None`` (the default) excludes nothing.
            topn: maximum number of rows to return.
            verbose: currently unused; kept for interface compatibility.

        Returns:
            DataFrame with columns
            ['movieId', 'recStrength', 'title_1', 'year', 'genres'].
        """
        # None sentinel instead of a shared mutable default; a set gives O(1) lookups.
        ignored = set() if items_to_ignore is None else set(items_to_ignore)

        similar_items = self.get_similar_items_to_user_profile(user_id)

        # Drop items the user has already interacted with.
        similar_items_filtered = [pair for pair in similar_items if pair[0] not in ignored]

        rec_df = pd.DataFrame(similar_items_filtered, columns=['movieId', 'recStrength']).head(topn)

        # Prefer the catalogue given to the constructor; fall back to the
        # notebook-level movies_df when none was supplied.
        catalogue = self.movies_df if self.movies_df is not None else movies_df

        content_recommendation_df = rec_df.merge(catalogue, how='left', on='movieId')[
            ['movieId', 'recStrength', 'title_1', 'year', 'genres']
        ]

        return content_recommendation_df

### Instantiate the ContentBasedRecommender class

In [48]:
cont_rec_model = ContentBasedRecommender(movies_df)

### Call recommend_items for user in train df

In [49]:
users_train = train_df['userId'].tolist()
train_userid = users_train[1]

## movies already watched by train_userid
already_watched_train = train_df[train_df['userId']==train_userid]['movieId'].tolist()

print("Content recommendations for user %d in train_df: " %train_userid)
cont_rec_model.recommend_items(train_userid, already_watched_train)

Content recommendations for user 131 in train_df: 


Unnamed: 0,movieId,recStrength,title_1,year,genres
0,84847,0.621798,Emma,2009,Comedy|Drama|Romance
1,100562,0.621798,"Goddess of 1967, The",2000,Comedy|Drama|Romance
2,84890,0.621798,Incantato (Il cuore altrove),2003,Comedy|Drama|Romance
3,6087,0.621798,If You Could See What I Hear,1982,Comedy|Drama|Romance
4,100203,0.621798,Sundome,2007,Comedy|Drama|Romance
5,70661,0.621798,Tyler Perry's Meet the Browns,2008,Comedy|Drama|Romance
6,4191,0.621798,Alfie,1966,Comedy|Drama|Romance
7,55147,0.621798,27 Missing Kisses,2000,Comedy|Drama|Romance
8,50279,0.621798,My Bollywood Bride,2006,Comedy|Drama|Romance
9,5902,0.621798,Adaptation,2002,Comedy|Drama|Romance


In [50]:
print("Movies already watched by user %d in train_df: " %train_userid)
train_df[train_df['userId']==train_userid][['movieId', 'year', 'rating', 'title_1', 'genres']].sort_values(by='rating', ascending=False).reset_index(drop=True).head(10)

Movies already watched by user 131 in train_df: 


Unnamed: 0,movieId,year,rating,title_1,genres
0,49286,2006,5.0,"Holiday, The",Comedy|Romance
1,2411,1985,5.0,Rocky IV,Action|Drama
2,2409,1979,4.5,Rocky II,Action|Drama
3,6942,2003,4.5,Love Actually,Comedy|Drama|Romance
4,2410,1982,4.5,Rocky III,Action|Drama
5,58998,2008,4.5,Forgetting Sarah Marshall,Comedy|Romance
6,3578,2000,4.5,Gladiator,Action|Adventure|Drama
7,35836,2005,4.0,"40-Year-Old Virgin, The",Comedy|Romance
8,8132,1992,4.0,Gladiator,Action|Drama
9,30825,2004,4.0,Meet the Fockers,Comedy


### Call recommend_items for SAME user in test df

In [51]:
#users_test = test_df['userId'].tolist()
#test_userid = 131 #users_test[1]

## movies already watched by train_userid  in test_df
already_watched_test = test_df[test_df['userId']==train_userid]['movieId'].tolist()

print("Content Recommendation for user %d in test_df: " %train_userid)
cont_rec_model.recommend_items(train_userid, already_watched_test)

Content Recommendation for user 131 in test_df: 


Unnamed: 0,movieId,recStrength,title_1,year,genres
0,84847,0.621798,Emma,2009,Comedy|Drama|Romance
1,100562,0.621798,"Goddess of 1967, The",2000,Comedy|Drama|Romance
2,84890,0.621798,Incantato (Il cuore altrove),2003,Comedy|Drama|Romance
3,6087,0.621798,If You Could See What I Hear,1982,Comedy|Drama|Romance
4,100203,0.621798,Sundome,2007,Comedy|Drama|Romance
5,70661,0.621798,Tyler Perry's Meet the Browns,2008,Comedy|Drama|Romance
6,4191,0.621798,Alfie,1966,Comedy|Drama|Romance
7,55147,0.621798,27 Missing Kisses,2000,Comedy|Drama|Romance
8,50279,0.621798,My Bollywood Bride,2006,Comedy|Drama|Romance
9,5902,0.621798,Adaptation,2002,Comedy|Drama|Romance


In [52]:
print("Movies already watched by user %d in test_df: " %train_userid)
test_df[test_df['userId']==train_userid][['movieId', 'year', 'rating', 'title_1', 'genres']].sort_values(by='rating', ascending=False).reset_index(drop=True).head(10)

Movies already watched by user 131 in test_df: 


Unnamed: 0,movieId,year,rating,title_1,genres
0,58047,2008,4.5,"Definitely, Maybe",Comedy|Drama|Romance
1,66934,2008,4.0,Dr. Horrible's Sing-Along Blog,Comedy|Drama|Musical|Sci-Fi
2,47,1995,3.5,Seven (a.k.a. Se7en),Mystery|Thriller
3,6753,2003,3.5,Secondhand Lions,Children|Comedy|Drama
4,4054,2001,3.5,Save the Last Dance,Drama|Romance
5,442,1993,3.0,Demolition Man,Action|Adventure|Sci-Fi
6,4091,1987,3.0,Can't Buy Me Love,Comedy|Romance
7,2058,1998,3.0,"Negotiator, The",Action|Crime|Drama|Mystery|Thriller
8,1073,1971,3.0,Willy Wonka & the Chocolate Factory,Children|Comedy|Fantasy|Musical
9,2706,1999,3.0,American Pie,Comedy|Romance


In [53]:
print('Evaluating Content based recommendation model...')
# NOTE(review): the content-based results are stored under the `pop_*` names, which
# overwrites the popularity model's metrics computed earlier. Consider dedicated
# names for these results (later cells that read `pop_detailed_results_df` would need the same rename).
pop_global_metrics, pop_detailed_results_df = model_eval.evaluate_model(cont_rec_model)
pop_global_metrics

Evaluating Content based recommendation model...
Running evaluation for users
Number of users processed:  156


{'modelName': 'Content-Based', 'recall@10': '0.45'}

In [54]:
pop_detailed_results_df.head()

Unnamed: 0,_person_id,hits@10_count,recall@10,test_interacted_count
0,116,0,0.0,222
1,104,1,0.5,200
2,54,0,0.0,142
3,91,1,0.75,133
4,58,0,0.0,131


### Collaborative Filtering - Matrix Factorization

In [55]:
interactions_movies_df.head()

Unnamed: 0,userId,movieId,rating,title_1,year,genres
0,1,2,3.5,Jumanji,1995,Adventure|Children|Fantasy
1,1,29,3.5,"City of Lost Children, The (Cité des enfants p...",1995,Adventure|Drama|Fantasy|Mystery|Sci-Fi
2,1,32,3.5,Twelve Monkeys (a.k.a. 12 Monkeys),1995,Mystery|Sci-Fi|Thriller
3,1,47,3.5,Seven (a.k.a. Se7en),1995,Mystery|Thriller
4,1,50,3.5,"Usual Suspects, The",1995,Crime|Mystery|Thriller


In [56]:
# Create a user x movie rating table: users in rows, movies in columns.
# Missing (user, movie) pairs are filled with 0, so the result is a dense frame
# that is mostly zeros rather than a sparse data structure.
# NOTE(review): the table is built from the full interactions frame, which includes
# the rows held out in test_df — confirm this is intended.
users_movies_pivot_matrix_df = interactions_movies_df.pivot(index='userId', 
                                                          columns='movieId', 
                                                          values='rating').fillna(0)

users_movies_pivot_matrix_df.head(10)

movieId,1,2,3,4,5,6,7,8,9,10,...,112623,112852,113453,114180,115617,116797,117511,117590,118696,125916
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,5.0,0.0,3.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,4.0,0.0,5.0,0.0,0.0,3.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
users_movies_pivot_matrix_df.shape ## users, movies

(156, 4192)

In [58]:
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values returns the same NumPy array and is available in every pandas version.
users_movies_pivot_matrix = users_movies_pivot_matrix_df.values
users_movies_pivot_matrix#[:10]

array([[ 0. ,  3.5,  0. , ...,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  4. , ...,  0. ,  0. ,  0. ],
       [ 4. ,  0. ,  0. , ...,  0. ,  0. ,  0. ],
       ..., 
       [ 0. ,  0. ,  0. , ...,  0. ,  0. ,  0. ],
       [ 2.5,  0. ,  0. , ...,  0. ,  0. ,  0. ],
       [ 5. ,  5. ,  2. , ...,  0. ,  0. ,  0. ]])

In [59]:
type(users_movies_pivot_matrix)

numpy.ndarray

In [60]:
users_ids = list(users_movies_pivot_matrix_df.index) ## users are in index
users_ids[:10]

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

### An important decision is the number of factors to factor the user-movies matrix. 
#### The higher the number of factors, the more precisely the factorization reconstructs the original matrix.
#### Therefore, if the model is allowed to memorize too many details of the original matrix, it may not generalize well to data it was not trained on.

### Reducing the number of factors increases the model generalization.

In [61]:
users_movies_pivot_matrix.shape ## users, movies

(156, 4192)

In [62]:
#The number of factors to factor the user-movies matrix.
NUMBER_OF_FACTORS_MF = 50

In [63]:
# Factorize the user x movie matrix with a truncated SVD that keeps
# NUMBER_OF_FACTORS_MF singular values. The three factors are multiplied back
# together further down to obtain predicted ratings.

U, sigma, Vt = svds(users_movies_pivot_matrix, k = NUMBER_OF_FACTORS_MF)

In [64]:
print(U.shape, sigma.shape, Vt.shape)

(156, 50) (50,) (50, 4192)


In [65]:
# svds returns the singular values as a 1-D vector; the reconstruction needs
# them as a diagonal matrix. np.diag() on a 2-D array does the opposite
# (extracts the diagonal), so only convert while sigma is still 1-D. This keeps
# the cell safe to re-run.
sigma = np.diag(sigma) if np.ndim(sigma) == 1 else sigma
sigma.shape

(50, 50)

#### After the factorization, we try to reconstruct the original matrix by multiplying its factors.
#### The resulting matrix is not sparse any more.

### It generates predictions for movies the user has not yet interacted, which we will exploit for recommendations.

In [66]:
# Low-rank reconstruction of the rating matrix: (U . sigma) . Vt
scaled_user_factors = U @ sigma
all_user_predicted_ratings = scaled_user_factors @ Vt
all_user_predicted_ratings

array([[  5.99611204e-01,   3.26258524e+00,  -1.01087277e-01, ...,
         -6.07326396e-02,  -1.88358305e-02,  -1.88358305e-03],
       [  6.98746992e-01,   3.80624901e-01,   1.03088057e+00, ...,
         -3.43261582e-02,   1.05431321e-01,   1.05431321e-02],
       [  4.45323051e+00,   3.29654909e-03,  -3.12973702e-01, ...,
          9.15177228e-02,   7.73604333e-03,   7.73604333e-04],
       ..., 
       [  4.14455070e-01,  -1.91356543e-01,   2.16320194e-01, ...,
         -7.49147730e-02,  -3.12321445e-02,  -3.12321445e-03],
       [  7.48480431e-01,   2.65448178e-01,   3.96517707e-01, ...,
         -3.09586190e-02,   7.45271486e-02,   7.45271486e-03],
       [  4.75433562e+00,   4.13540551e+00,   2.15061424e+00, ...,
          3.28790885e-02,  -3.72129769e-02,  -3.72129769e-03]])

In [67]:
all_user_predicted_ratings.shape

(156, 4192)

In [68]:
# Label the reconstructed scores (rows = user ids, columns = movie ids) and flip the frame,
# so that each column holds one user's predicted scores indexed by movieId.
cf_preds_df = pd.DataFrame(
    all_user_predicted_ratings,
    index=users_ids,
    columns=users_movies_pivot_matrix_df.columns,
).T
cf_preds_df.head(10)

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,147,148,149,150,151,152,153,154,155,156
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.599611,0.698747,4.453231,0.679727,2.168952,3.296606,-0.310101,1.74257,0.005556,0.681066,...,4.760476,0.853245,1.762139,0.025119,0.41881,0.320853,-0.181354,0.414455,0.74848,4.754336
2,3.262585,0.380625,0.003297,0.289157,0.499934,0.287561,0.495997,1.193663,0.083512,-0.206406,...,-0.029974,-0.491637,0.645078,0.114302,0.16109,1.055905,0.059698,-0.191357,0.265448,4.135406
3,-0.101087,1.030881,-0.312974,0.161806,1.585538,2.117141,3.17205,0.717718,0.023801,0.235253,...,0.076916,-0.110658,0.397496,0.028003,0.148002,-0.176604,-0.015627,0.21632,0.396518,2.150614
4,-0.186174,0.220043,0.069618,-0.216058,-0.381269,0.139844,0.061206,-0.017122,0.007479,-0.082773,...,-0.084542,-0.025495,-0.147291,0.072158,-0.083188,0.058978,0.150028,0.026279,0.242424,2.1798
5,-0.06195,0.364409,-0.320773,0.240251,0.462498,0.827814,-0.022522,0.578909,0.008623,-0.138768,...,0.168842,-0.090138,0.223516,0.031387,0.14575,-0.461182,-0.073557,0.158827,-0.063693,2.002719
6,-0.430448,0.850223,-0.120608,0.441755,0.972962,1.726309,-0.313508,0.959346,-0.059669,0.280324,...,3.839314,0.666961,0.052517,-0.069954,0.146876,-0.227345,0.272619,5.293265,0.844549,4.311804
7,0.117004,0.613558,-0.177138,-0.137557,1.051998,2.391056,3.57482,0.12949,-0.015731,-0.037278,...,-0.005258,3.25443,0.166188,-0.121475,0.11966,-0.671689,-0.351043,-0.253093,0.201336,4.190984
8,-0.003157,0.03525,-0.024831,0.038769,0.09414,0.030632,0.059857,0.146352,0.034943,0.019137,...,-0.005716,0.118346,0.037616,-0.007123,0.02499,-0.020638,0.033221,-0.04576,-0.033029,0.125666
9,0.117493,0.231167,0.032385,0.133503,-0.143718,0.11062,-0.050339,0.663361,-0.050998,0.138656,...,0.029639,-0.242202,0.065592,-0.021912,0.135259,-0.449867,-0.477942,-0.071198,0.077894,3.374176
10,-0.278241,0.485076,0.164935,0.8979,0.855678,0.305443,-0.183951,2.429605,-0.024207,-0.068663,...,0.145947,0.394779,1.022208,-0.088922,0.264486,0.539356,0.249012,4.561524,0.285001,3.131134


In [69]:
cf_preds_df.shape  # (number of movies, number of users) after the transpose

(4192, 156)

In [70]:
class CollabFilterRecommender:
    """Recommender that ranks movies by precomputed collaborative-filtering scores.

    Parameters
    ----------
    cf_predictions_df : pandas.DataFrame
        Predicted scores with movie ids on the index and one column per user id.
    items_df : pandas.DataFrame, optional
        Movie metadata containing at least the columns 'movieId', 'title_1',
        'year' and 'genres'. When omitted, recommendations are returned with
        only the 'movieId' and 'recStrength' columns.
    """

    MODEL_NAME = 'Collaborative Filtering'

    def __init__(self, cf_predictions_df, items_df=None):
        self.cf_predictions_df = cf_predictions_df
        self.items_df = items_df

    def get_model_name(self):
        """Return the display name of this model."""
        return self.MODEL_NAME

    def recommend_items(self, user_id, items_to_ignore=None, topn=10, verbose=False):
        """Return the ``topn`` highest-scored movies for ``user_id``.

        Parameters
        ----------
        user_id
            Column label of the user in ``cf_predictions_df``.
        items_to_ignore : iterable, optional
            Movie ids to leave out of the result (e.g. movies already seen).
            ``None`` (the default) excludes nothing.
        topn : int
            Maximum number of rows to return.
        verbose : bool
            Unused; kept so existing callers that pass it keep working.

        Returns
        -------
        pandas.DataFrame
            Rows ordered by descending 'recStrength'. Columns are 'movieId',
            'recStrength', 'title_1', 'year', 'genres' when ``items_df`` was
            supplied, otherwise just 'movieId' and 'recStrength'.

        Raises
        ------
        KeyError
            If ``user_id`` is not a column of ``cf_predictions_df``.
        """
        # Avoid a shared mutable default argument.
        if items_to_ignore is None:
            items_to_ignore = []

        # One user's scores as a two-column frame. rename_axis guarantees the
        # id column is called 'movieId' even when the index carries no name.
        user_predictions = (
            self.cf_predictions_df[user_id]
            .rename_axis('movieId')
            .reset_index()
            .rename(columns={user_id: 'recStrength'})
        )

        # Drop the ignored movies first, then sort once and keep the best topn.
        keep_mask = ~user_predictions['movieId'].isin(items_to_ignore)
        recommendations_df = (
            user_predictions[keep_mask]
            .sort_values('recStrength', ascending=False)
            .reset_index(drop=True)
            .head(topn)
        )

        # Without metadata there is nothing to join; return the bare ranking
        # instead of failing inside merge().
        if self.items_df is None:
            return recommendations_df

        # Left join keeps every recommendation even if its metadata is missing.
        output_columns = ['movieId', 'recStrength', 'title_1', 'year', 'genres']
        return recommendations_df.merge(self.items_df, how='left', on='movieId')[output_columns]

### Instantiate the CollabFilterRecommender model

In [71]:
# Build the recommender from the predicted-score table and the movie metadata used to label results
collab_filter_model = CollabFilterRecommender(cf_preds_df, movies_df)

### Call recommend_items for user in train df

In [72]:
# Demo user: the userId found on the second row of the training interactions
users_train = train_df['userId'].tolist()
train_userid = users_train[1]

# Movies this user has in the training split; they are excluded from the recommendations
already_watched_train = train_df.loc[train_df['userId'] == train_userid, 'movieId'].tolist()

print("Movie recommendations for user %d in train_df using Collaborative filtering: " %train_userid)
collab_filter_model.recommend_items(train_userid, already_watched_train)

Movie recommendations for user 131 in train_df using Collaborative filtering: 


Unnamed: 0,movieId,recStrength,title_1,year,genres
0,58047,4.060635,"Definitely, Maybe",2008,Comedy|Drama|Romance
1,66934,3.963107,Dr. Horrible's Sing-Along Blog,2008,Comedy|Drama|Musical|Sci-Fi
2,2706,3.797807,American Pie,1999,Comedy|Romance
3,733,3.485601,"Rock, The",1996,Action|Adventure|Thriller
4,442,3.387927,Demolition Man,1993,Action|Adventure|Sci-Fi
5,4054,3.322113,Save the Last Dance,2001,Drama|Romance
6,6753,3.274944,Secondhand Lions,2003,Children|Comedy|Drama
7,4034,3.129634,Traffic,2000,Crime|Drama|Thriller
8,1517,3.073546,Austin Powers: International Man of Mystery,1997,Action|Adventure|Comedy
9,593,3.009496,"Silence of the Lambs, The",1991,Crime|Horror|Thriller


In [73]:
# Ten best-rated training-split movies of the demo user, for comparison with the recommendations
print("Movies already watched by user in train_df: ", train_userid)
(
    train_df.loc[train_df['userId'] == train_userid, ['movieId', 'year', 'rating', 'title_1', 'genres']]
    .sort_values(by='rating', ascending=False)
    .head(10)
)

Movies already watched by user in train_df:  131


Unnamed: 0,movieId,year,rating,title_1,genres
16082,49286,2006,5.0,"Holiday, The",Comedy|Romance
15873,2411,1985,5.0,Rocky IV,Action|Drama
15871,2409,1979,4.5,Rocky II,Action|Drama
16036,6942,2003,4.5,Love Actually,Comedy|Drama|Romance
15872,2410,1982,4.5,Rocky III,Action|Drama
16100,58998,2008,4.5,Forgetting Sarah Marshall,Comedy|Romance
15938,3578,2000,4.5,Gladiator,Action|Adventure|Drama
16065,35836,2005,4.0,"40-Year-Old Virgin, The",Comedy|Romance
16050,8132,1992,4.0,Gladiator,Action|Drama
16057,30825,2004,4.0,Meet the Fockers,Comedy


In [74]:
#users_test = test_df['userId'].tolist()
#test_userid = 131 #users_test[1]

# Movies the same demo user (train_userid) has in the test split.
# NOTE(review): only these test-split movies are excluded below, so movies the user
# rated in train_df stay eligible and come back as recommendations (e.g. Rocky IV,
# Gladiator in the recorded output) — confirm this is the intended comparison.
already_watched_test = test_df[test_df['userId']==train_userid]['movieId'].tolist()


print("Movie recommendations for user %d in test_df using Collaborative filtering: " %train_userid)
collab_filter_model.recommend_items(train_userid, already_watched_test)

Movie recommendations for user 131 in test_df using Collaborative filtering: 


Unnamed: 0,movieId,recStrength,title_1,year,genres
0,2411,4.970993,Rocky IV,1985,Action|Drama
1,3578,4.9087,Gladiator,2000,Action|Adventure|Drama
2,2410,4.529725,Rocky III,1982,Action|Drama
3,49286,4.512483,"Holiday, The",2006,Comedy|Romance
4,2409,4.504337,Rocky II,1979,Action|Drama
5,58998,4.244268,Forgetting Sarah Marshall,2008,Comedy|Romance
6,35836,4.052606,"40-Year-Old Virgin, The",2005,Comedy|Romance
7,2959,4.030157,Fight Club,1999,Action|Crime|Drama|Thriller
8,261,3.977819,Little Women,1994,Drama
9,5952,3.972308,"Lord of the Rings: The Two Towers, The",2002,Adventure|Fantasy


In [75]:
# Five best-rated test-split movies of the demo user
print("Movies already watched by user in test_df: ", train_userid)
(
    test_df.loc[test_df['userId'] == train_userid, ['movieId', 'year', 'rating', 'title_1', 'genres']]
    .sort_values(by='rating', ascending=False)
    .head(5)
)

Movies already watched by user in test_df:  131


Unnamed: 0,movieId,year,rating,title_1,genres
16098,58047,2008,4.5,"Definitely, Maybe",Comedy|Drama|Romance
16123,66934,2008,4.0,Dr. Horrible's Sing-Along Blog,Comedy|Drama|Musical|Sci-Fi
15710,47,1995,3.5,Seven (a.k.a. Se7en),Mystery|Thriller
16034,6753,2003,3.5,Secondhand Lions,Children|Comedy|Drama
15963,4054,2001,3.5,Save the Last Dance,Drama|Romance


In [76]:
# Score the collaborative-filtering recommender with the shared evaluator (model_eval).
# NOTE(review): the results are stored under `pop_`-prefixed names although the model is
# collaborative filtering; if an earlier cell used these names for another model, its
# results are overwritten here — consider dedicated names.
print('Evaluating Collaborative filtering based recommendation model...')
pop_global_metrics, pop_detailed_results_df = model_eval.evaluate_model(collab_filter_model)
pop_global_metrics

Evaluating Collaborative filtering based recommendation model...
Running evaluation for users
Number of users processed:  156


{'modelName': 'Collaborative Filtering', 'recall@10': '23.20'}

In [77]:
# First rows of the per-user evaluation results for the collaborative-filtering model
pop_detailed_results_df.head()

Unnamed: 0,_person_id,hits@10_count,recall@10,test_interacted_count
0,116,10,4.5,222
1,104,10,5.0,200
2,54,10,7.04,142
3,91,10,7.52,133
4,58,10,7.63,131


### Correlation between movies

In [78]:
# User whose history is used in this section.
user_id = 36
# Every movieId this user has already interacted with, so those movies can be kept
# out of a recommendation list.
collab_ignore_items_1 = interactions_movies_df.loc[interactions_movies_df['userId'] == user_id, 'movieId'].tolist()
#collab_ignore_items


In [80]:
#collab_recommendations_df_1 = sim_df[~sim_df['movieId'].isin(collab_ignore_items_1)] \
#                               .sort_values('Correlation', ascending = False) \
#                               .head(10)
#collab_recommendations_df_1        

In [81]:
#interactions_movies_df_new = interactions_movies_df[:500]

In [82]:
#interactions_movies_df_new.shape

In [83]:
#Creating a sparse pivot table with users in rows and items in columns
#users_movies_pivot_df = interactions_movies_df.pivot(index='userId', 
#                                                          columns='movieId', 
#                                                          values='rating').fillna(0)

#users_movies_pivot_df.head(10)

In [None]:
'''
class ItemsCorrelation:
    
    MODEL_NAME = 'Correlation'
    
    def __init__(self, pivot_df, items_df=None):
        self.pivot_df = pivot_df
        self.items_df = items_df
        
        
    def get_model_name(self):
        return self.MODEL_NAME
    
    
    def corr_with_movies(self, mov_id):
        movies_watched = self.pivot_df[mov_id]
        correlation_with_other_movies = self.pivot_df.corrwith(movies_watched)            
        correlation_df = pd.DataFrame(data=correlation_with_other_movies, columns=['Correlation'], index=self.pivot_df.columns)#.reset_index()    
        correlation_df['Correlation_with_Movie'] = mov_id
        correlation_df = correlation_df.sort_values(by='Correlation', ascending=False)
        correlation_df = correlation_df.reset_index()         
        return correlation_df
        
    
    def create_correlation_df(self):
        corr_df = pd.DataFrame()
        for movieid in self.pivot_df.columns:
            data = self.corr_with_movies(movieid)    
            corr_df = corr_df.append(data)        
        return corr_df  
 

    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        corr_df = self.create_correlation_df()
        recommendations_df = corr_df.merge(self.items_df, how = 'left', 
                                                          left_on = 'movieId', 
                                                          right_on = 'movieId')[['movieId', 'Correlation', 'Correlation_with_Movie', 'title_1', 'year', 'genres']]


        return recommendations_df
''' 