In [1]:
## https://www.kaggle.com/kanncaa1/recommendation-systems-tutorial/data
## https://www.kaggle.com/gspmoreira/recommender-systems-in-python-101

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import scipy as scipy
import math as math
import random as random
import sklearn

from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds

%matplotlib inline

In [3]:
movies_df = pd.read_csv('movie.csv')

In [4]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Split "Title (YYYY)" into a clean title and a release year.
# Raw strings keep the regex backslashes from being read as (invalid) Python string escapes.
# title_1: everything before the first "(" that is followed by a digit; strip() removes the
# space that is left in front of that parenthesis.
movies_df['title_1'] = movies_df['title'].str.split(r"\(\d", expand=True)[0].str.strip()
# year: contents of a parenthesised group that ends in four digits; expand=False yields a Series
# so a single column is assigned. Titles without such a group get NaN.
movies_df['year'] = movies_df['title'].str.extract(r'.*\((.*\d{4})\).*', expand=False)
movies_df.head(30)

Unnamed: 0,movieId,title,genres,title_1,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale,1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,1995
5,6,Heat (1995),Action|Crime|Thriller,Heat,1995
6,7,Sabrina (1995),Comedy|Romance,Sabrina,1995
7,8,Tom and Huck (1995),Adventure|Children,Tom and Huck,1995
8,9,Sudden Death (1995),Action,Sudden Death,1995
9,10,GoldenEye (1995),Action|Adventure|Thriller,GoldenEye,1995


In [None]:
movies_df.info()

In [None]:
interactions_df = pd.read_csv('rating_1.csv')
interactions_df.head()

In [None]:
# Drop the timestamp column by reassignment instead of inplace mutation;
# errors='ignore' keeps the cell re-runnable once the column is already gone.
interactions_df = interactions_df.drop(columns='timestamp', errors='ignore')

In [None]:
interactions_df.head()

In [None]:
interactions_df.info()

In [None]:
interactions_df = interactions_df.iloc[:20000,:]

In [None]:
interactions_df.info()

In [None]:
print(interactions_df['userId'].nunique())
print(interactions_df['movieId'].nunique())

In [None]:
## Left-join the movie attributes (title_1, year, genres) onto every rating row, keyed on movieId.
## Ratings whose movieId is missing from movies_df are kept, with NaN attributes.

interactions_movies_df = pd.merge(
    interactions_df,
    movies_df[['movieId', 'title_1', 'year', 'genres']],
    how='left',
    on='movieId',
)

In [None]:
interactions_movies_df.head(30)

In [None]:
interactions_movies_df.info()

## Popularity Model

In [None]:
# Popularity score of a movie = sum of all the ratings it received.
interactions_movies_grp = (
    interactions_movies_df
    .groupby(['movieId', 'title_1'], as_index=False)['rating']
    .sum()
    .rename(columns={'rating': 'TotalRating'})
)
interactions_movies_grp.head()

In [None]:
#Sort movie_Id based upon rating

popular_ratings_df = interactions_movies_grp.sort_values(['TotalRating', 'title_1', 'movieId'], ascending=[0,1,1])
popular_ratings_df.head(10)

In [None]:
#Generate a recommendation rank based upon score
popular_ratings_df['Rank'] = popular_ratings_df['TotalRating'].rank(ascending=0, method='first')
popular_ratings_df.head(20)

In [None]:
test_user_df = interactions_movies_df[interactions_movies_df['userId'] == 4][['movieId', 'title_1']]
test_user_df.head()

In [None]:
test_df=test_user_df.merge(popular_ratings_df.head(30), left_on='movieId', right_on='movieId', how='inner').sort_values(by='Rank', ascending=True)
test_df.head()

#### Define a Popularity Recommender class with the above code

In [None]:
class popularity_recommender_py():
    """Non-personalised recommender that ranks items by the sum of their ratings.

    Call ``create`` once to build the ranked popularity table, then ``recommend``
    to get the top entries of that table for a user, skipping the items that
    user should not be shown.
    """

    def __init__(self):
        self.data = None
        self.rating = None
        self.movieId = None
        self.userId = None  # never assigned by create(); kept for backward compatibility
        self.title_1 = None
        self.popularity_recommendations = None

    def create(self, data, movieId, title_1, rating):
        """Build the popularity table.

        Parameters
        ----------
        data : pandas.DataFrame
            Interaction data, one row per rating.
        movieId : str
            Name of the item-id column in ``data``.
        title_1 : str
            Name of the item-title column in ``data``.
        rating : str
            Name of the rating column in ``data``.

        The result is stored in ``self.popularity_recommendations`` with the
        columns ``[movieId, title_1, 'TotalRating', 'Rank']``, sorted by
        descending total rating (ties broken by title, then item id).
        """
        self.data = data
        self.movieId = movieId
        self.title_1 = title_1
        self.rating = rating

        # Use the column names passed in rather than hard-coded literals.
        df_grp = data.groupby([movieId, title_1]).agg({rating: 'sum'}).reset_index()
        df_grp = df_grp.rename(columns={rating: 'TotalRating'})

        # Sort the items by recommendation score.
        popular_movies_df = df_grp.sort_values(['TotalRating', title_1, movieId],
                                               ascending=[False, True, True])

        # method='first' gives tied totals distinct ranks in their sorted order.
        popular_movies_df['Rank'] = popular_movies_df['TotalRating'].rank(ascending=False, method='first')

        # Keep the full ranking; recommend() trims it to topn per request.
        self.popularity_recommendations = popular_movies_df

    def recommend(self, userId, items_to_ignore=None, topn=10):
        """Return the ``topn`` most popular items that are not in ``items_to_ignore``.

        Parameters
        ----------
        userId
            Identifier written into the ``userId`` column of the result.
        items_to_ignore : iterable, optional
            Item ids to exclude (typically what the user already interacted with).
        topn : int
            Maximum number of rows to return.

        Returns
        -------
        pandas.DataFrame
            A new frame with ``userId`` as first column followed by the columns
            of the popularity table. The stored table is not modified.

        Raises
        ------
        RuntimeError
            If ``create`` has not been called yet.
        """
        if self.popularity_recommendations is None:
            raise RuntimeError("create() must be called before recommend()")

        # None instead of a mutable [] default, which would be shared across calls.
        ignored = [] if items_to_ignore is None else list(items_to_ignore)

        ranked = self.popularity_recommendations
        # .copy() so that adding a column does not write into a slice of the stored table.
        user_recommendations = ranked[~ranked[self.movieId].isin(ignored)].head(topn).copy()

        # Put the userId column at the front.
        user_recommendations.insert(0, 'userId', userId)

        return user_recommendations
       

In [None]:
## Before we call the class and its methods, let's first collect the unique userIds

In [None]:
## Get unique userIds
users = interactions_movies_df['userId'].unique()
users

In [None]:
#interactions_movies_df.head()

In [None]:
## Instantiate the recommender and build its popularity table from the interaction data
pm = popularity_recommender_py()
pm.create(interactions_movies_df, 'movieId', 'title_1', 'rating')

In [None]:
user_id = users[3]

## get list of already interacted content ids (in order to avoid coming up in the popularity recommendation list)
items_to_ignore = interactions_movies_df[interactions_movies_df['userId'] == user_id]['movieId'].tolist()
#items_to_ignore

pm.recommend(user_id,items_to_ignore)

In [None]:
user_id = users[30]

## get list of already interacted content ids (in order to avoid coming up in the popularity recommendation list)
items_to_ignore = interactions_movies_df[interactions_movies_df['userId'] == user_id]['movieId'].tolist()
#items_to_ignore

pm.recommend(user_id,items_to_ignore)

## Content based Filtering model

In [None]:
## Content-based filtering approaches leverage descriptions or attributes of the items the user has interacted with
## to recommend similar items.
## It depends only on the user's own previous choices, which makes the method robust to the item cold-start problem
## (a new item can be recommended from its attributes alone, before anyone has rated it).

In [None]:
#Ignoring stopwords (words with no semantics) from English
stopwords_list = stopwords.words('english')

#Configures a TF-IDF vectorizer over the unigrams and bigrams of the corpus, keeping at most 5000 features and ignoring stopwords (it is fitted in a later cell)
vectorizer = TfidfVectorizer(analyzer='word',
                     ngram_range= (1,2),
                     min_df=0.003,
                     max_df=0.5,
                     max_features=5000,
                     stop_words=stopwords_list)

In [None]:
#title_genre = pd.concat([movies_df['Title'], movies_df['Genre']])
#title_genre = movies_df['Title'] + " " + movies_df['Genre']
genre = movies_df['genres']

In [None]:
genre

In [None]:
tfidf_matrix = vectorizer.fit_transform(genre)

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# and fall back to the old method on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    # list() keeps the same plain-list type the old method returned.
    tfidf_feature_names = list(vectorizer.get_feature_names_out())
else:
    tfidf_feature_names = vectorizer.get_feature_names()

In [None]:
print(len(tfidf_feature_names))
print(tfidf_feature_names)

In [None]:
item_ids = movies_df['movieId'].tolist()
#item_ids.index(-8949113594875411859)
#item_ids

In [None]:
interactions_movies_df.head()

In [None]:
users_movies_df = interactions_movies_df.set_index('userId')

In [None]:
users_movies_df.head()

In [None]:
users_movies_df.info()

In [None]:
users_movies_df.dropna(inplace=True)

In [None]:
users_movies_df.info()

#### To model the user profile, we take all the item profiles the user has interacted with and average them.

#### The average is weighted by the rating

In [None]:
users_movies_df.groupby([users_movies_df.index])[['movieId']].count().sort_values(by='movieId', ascending=True).head(5)


In [None]:
users_movies_df.loc[36].sort_values(by='rating', ascending=False)

In [None]:
def get_user_profiles():
    """Build one L2-normalised TF-IDF profile per user.

    Reads three notebook-level names: ``users_movies_df`` (indexed by userId, with
    ``movieId`` and ``rating`` columns), ``tfidf_matrix`` and ``item_ids``. Row ``i``
    of ``tfidf_matrix`` is used as the profile of ``item_ids[i]``.

    A user's profile is the rating-weighted average of the TF-IDF rows of the
    movies that user rated, scaled to unit L2 norm.

    Returns
    -------
    dict
        userId -> 2-D numpy array with a single row (one value per TF-IDF feature).

    Raises
    ------
    KeyError
        If a rated movieId is not present in ``item_ids``.
    """
    # Imported here so the function does not rely on ``sklearn.preprocessing``
    # having been loaded as a side effect of other sklearn imports.
    from sklearn.preprocessing import normalize

    # movieId -> row position, built once instead of an O(n) list.index() per rating.
    # setdefault keeps the first occurrence, matching list.index().
    item_row = {}
    for row, movie_id in enumerate(item_ids):
        item_row.setdefault(movie_id, row)

    user_profiles = {}

    for userid in users_movies_df.index.unique():
        # .loc[[userid]] always yields a DataFrame, even when the user has a single rating.
        interactions_person_df = users_movies_df.loc[[userid]]
        user_item_strengths = interactions_person_df['rating'].to_numpy().reshape(-1, 1)  # n rows x 1 col

        itemprofilelist = [tfidf_matrix[item_row[c]] for c in interactions_person_df['movieId']]
        item_profiles = scipy.sparse.vstack(itemprofilelist)

        # Weighted average of the item profiles by interaction strength.
        weighted_sum = item_profiles.multiply(user_item_strengths).sum(axis=0)
        # The sparse sum is an np.matrix, which current scikit-learn refuses as input;
        # convert it to a plain ndarray before normalising.
        user_item_strengths_weighted_avg = np.asarray(weighted_sum) / np.sum(user_item_strengths)
        user_profile_norm = normalize(user_item_strengths_weighted_avg)

        user_profiles[userid] = user_profile_norm
    return user_profiles

In [None]:
user_profiles = get_user_profiles()

In [None]:
len(user_profiles)

In [None]:
## Let's take a look at the profile. It is a unit vector with one entry per TF-IDF feature (90 here).
#The value in each position represents how relevant a token (unigram or bigram) is to this user.

In [None]:
myprofile = user_profiles[36]
print(myprofile.shape)

In [None]:
myprofile.flatten().tolist()

In [None]:
token_relevance = pd.DataFrame(sorted(zip(tfidf_feature_names,user_profiles[36].flatten().tolist())), columns=['token', 'relevance']) 

In [None]:
token_relevance.head(20)

In [None]:
## Sort by highest order of relevance

token_relevance = token_relevance.sort_values(by='relevance', ascending=False)

In [None]:
token_relevance.head(30)

In [None]:
#Compute the cosine similarity between the user profile and all item profiles
person_id = 36
cosine_similarities = cosine_similarity(user_profiles[person_id], tfidf_matrix)
cosine_similarities 

In [None]:
cosine_similarities.shape

In [None]:
#Gets the top similar items
topn = 15
similar_indices = cosine_similarities.argsort().flatten()[-topn:]
similar_indices

In [None]:
#Sort the similar items by similarity
similar_items = sorted([(item_ids[i], cosine_similarities[0,i]) for i in similar_indices], key=lambda x: -x[1])
similar_items

In [None]:
items_to_ignore = interactions_movies_df[interactions_movies_df['userId'] == person_id]['movieId'].tolist()
#items_to_ignore

In [None]:
similar_items_filtered = list(filter(lambda x: x[0] not in items_to_ignore, similar_items)) ##x[0] is the contentId, x[1] is cosine similarity
len(similar_items_filtered)

In [None]:
similar_items_filtered

In [None]:
content_rec_df = pd.DataFrame(similar_items_filtered, columns=['movieId', 'recStrength']).head(10)

In [None]:
content_rec_df

In [None]:
## Merge movies_df with content_rec_df

content_rec_df_completed = content_rec_df.merge(movies_df[['movieId', 'title', 'genres']], how = 'left', left_on = 'movieId',
               right_on = 'movieId')

In [None]:
## Recommended content for userId: 36
content_rec_df_completed

In [None]:
interactions_movies_df[interactions_movies_df['userId']==36].groupby(['genres'])[['movieId']].count()

### Define the above in a class

In [None]:
class ContentBasedRecommender:
    """Recommend the items whose TF-IDF profile is closest to a user's profile.

    Relies on three notebook-level names: ``user_profiles`` (userId -> profile row
    vector), ``tfidf_matrix`` (one row per item) and ``item_ids`` (the item id of
    each ``tfidf_matrix`` row).
    """

    MODEL_NAME = 'Content-Based'

    def __init__(self, items_df=None):
        """Store the item-id list and an optional default item catalogue.

        ``items_df`` is used by ``content_recommend_items`` only when that method
        is called with ``articles_df=None``.
        """
        self.item_ids = item_ids
        self.items_df = items_df

    def get_model_name(self):
        """Return the display name of this model."""
        return self.MODEL_NAME

    def get_similar_items_to_user_profile(self, user_id, topn=1000):
        """Return up to ``topn`` ``(item_id, similarity)`` pairs, most similar first."""
        # Cosine similarity between the user profile and every item profile.
        cosine_similarities = cosine_similarity(user_profiles[user_id], tfidf_matrix)

        # Positions of the topn highest similarities (argsort is ascending).
        similar_indices = cosine_similarities.argsort().flatten()[-topn:]

        # Pair each position with its item id and order by descending similarity.
        similar_items = sorted([(self.item_ids[i], cosine_similarities[0, i]) for i in similar_indices],
                               key=lambda x: -x[1])

        return similar_items

    def content_recommend_items(self, user_id, articles_df, items_to_ignore=None, topn=10, verbose=False):
        """Return the ``topn`` best content-based recommendations for ``user_id``.

        Parameters
        ----------
        user_id
            Key into ``user_profiles``.
        articles_df : pandas.DataFrame or None
            Item catalogue with ``movieId``, ``title`` and ``genres`` columns, joined
            onto the result. When None, ``self.items_df`` is used, and failing that
            the notebook-level ``movies_df``.
        items_to_ignore : iterable, optional
            Item ids to exclude (typically what the user already interacted with).
        topn : int
            Maximum number of rows to return.
        verbose : bool
            Currently unused; accepted for interface compatibility.

        Returns
        -------
        pandas.DataFrame
            Columns ``movieId``, ``recStrength``, ``title``, ``genres``.
        """
        similar_items = self.get_similar_items_to_user_profile(user_id)

        # A set gives constant-time membership checks; None replaces the mutable [] default.
        ignored = set(items_to_ignore) if items_to_ignore is not None else set()

        # Drop the items the user has already interacted with.
        similar_items_filtered = [item for item in similar_items if item[0] not in ignored]

        rec_df = pd.DataFrame(similar_items_filtered, columns=['movieId', 'recStrength']).head(topn)

        # Honour the catalogue that was passed in before falling back to the defaults.
        if articles_df is not None:
            catalogue = articles_df
        elif self.items_df is not None:
            catalogue = self.items_df
        else:
            catalogue = movies_df

        content_recommendation_df = rec_df.merge(catalogue[['movieId', 'title', 'genres']],
                                                 how='left', on='movieId')

        return content_recommendation_df

In [None]:
cont_rec = ContentBasedRecommender()

In [None]:
user_id = 36

## get list of already interacted content ids (in order to avoid coming up in the recommendation list)
ignore_items = interactions_movies_df[interactions_movies_df['userId'] == user_id]['movieId'].tolist()

recommendation_df = cont_rec.content_recommend_items(user_id, movies_df, ignore_items)

In [None]:
recommendation_df

In [None]:
## Movies watched by user_id = 36
interactions_movies_df[interactions_movies_df['userId'] == 36][['rating', 'title_1', 'genres']].sort_values(by='rating', ascending=False)

In [None]:
user_id = 1

## get list of already interacted content ids (in order to avoid coming up in the recommendation list)
ignore_items = interactions_movies_df[interactions_movies_df['userId'] == user_id]['movieId'].tolist()

recommendation_df = cont_rec.content_recommend_items(user_id, movies_df, ignore_items)
recommendation_df

### Collaborative Filtering - Matrix Factorization

In [None]:
interactions_movies_df.head()

In [None]:
#Creating a sparse pivot table with users in rows and items in columns
users_items_pivot_matrix_df = interactions_movies_df.pivot(index='userId', 
                                                          columns='movieId', 
                                                          values='rating').fillna(0)

users_items_pivot_matrix_df.head(10)

In [None]:
users_items_pivot_matrix_df.info()

In [None]:
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() returns the same dense ndarray
# (rows = users, columns = movies, as laid out by the pivot above).
users_items_pivot_matrix = users_items_pivot_matrix_df.to_numpy()
users_items_pivot_matrix#[:10]

In [None]:
type(users_items_pivot_matrix)

In [None]:
users_ids = list(users_items_pivot_matrix_df.index)
users_ids[:10]

In [None]:
## An important decision is the number of factors used to factorize the user-item matrix.
## The higher the number of factors, the more precisely the factorization reconstructs the original matrix.
## However, if the model is allowed to memorize too many details of the original matrix,
## it may not generalize well to data it was not trained on.

## Reducing the number of factors improves the model's generalization.

In [None]:
users_items_pivot_matrix.shape

In [None]:
#The number of factors to factor the user-item matrix.
NUMBER_OF_FACTORS_MF = 50

In [None]:
#Performs matrix factorization of the original user item matrix

U, sigma, Vt = svds(users_items_pivot_matrix, k = NUMBER_OF_FACTORS_MF)

In [None]:
U.shape

In [None]:
Vt.shape

In [None]:
sigma.shape

In [None]:
#U

In [None]:
#Vt

In [None]:
#sigma

In [None]:
sigma = np.diag(sigma)
sigma.shape

In [None]:
#sigma

In [None]:
## After the factorization, we try to reconstruct the original matrix by multiplying its factors.
## The resulting matrix is no longer sparse.

## It contains predictions for items the user has not yet interacted with, which we will exploit for recommendations.

In [None]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt)

#all_user_predicted_ratings

In [None]:
all_user_predicted_ratings.shape

In [None]:
#Converting the reconstructed matrix back to a Pandas dataframe
cf_preds_df = pd.DataFrame(all_user_predicted_ratings, columns = users_items_pivot_matrix_df.columns, index=users_ids).transpose()
cf_preds_df.head(10)

In [None]:
cf_preds_df.info()

In [None]:
user_id = 1

cf_preds_df[user_id].sort_values(ascending=False).head()

In [None]:
sorted_user_predictions = cf_preds_df[user_id].sort_values(ascending=False).reset_index().rename(columns={user_id: 'recStrength'})
sorted_user_predictions.head(5)

In [None]:
user_predictions = sorted_user_predictions.merge(movies_df[['movieId', 'title', 'genres']], how = 'left', left_on = 'movieId',
               right_on = 'movieId')
user_predictions.head()

In [None]:
# Recommend the highest predicted rating movies that the user hasn't seen yet.

## get list of already interacted content ids (in order to avoid coming up in the recommendation list)
collab_ignore_items = interactions_movies_df[interactions_movies_df['userId'] == user_id]['movieId'].tolist()
#collab_ignore_items

recommendations_df = user_predictions[~user_predictions['movieId'].isin(collab_ignore_items)] \
                               .sort_values('recStrength', ascending = False) \
                               .head(10).reset_index(drop=True)

In [None]:
recommendations_df

### Use correlation

In [None]:
#Creating a sparse pivot table with users in rows and items in columns
users_items_pivot_matrix_df_1 = interactions_movies_df.pivot(index='userId', 
                                                          columns='movieId', 
                                                          values='rating').fillna(0)

users_items_pivot_matrix_df_1.head(10)

In [None]:
movie_id = 1

movie_watched = users_items_pivot_matrix_df_1[movie_id]

In [None]:
similarity_with_other_movies = users_items_pivot_matrix_df_1.corrwith(movie_watched)  # find correlation between 1 and other movies
similarity_with_other_movies.head(10)

In [None]:
similarity_df = pd.DataFrame(data=similarity_with_other_movies, index=users_items_pivot_matrix_df_1.columns,columns=['Correlation'])

In [None]:
similarity_df.head(10)

In [None]:
similarity_df.reset_index(inplace=True)
similarity_df.head()
#similarity_with_other_movies.sort_values(ascending=False)#.reset_index().rename(columns={: 'recStrength'})
#similarity_with_other_movies.head()

In [None]:
similarity_df.sort_values(by='Correlation', inplace=True, ascending=False)
similarity_df.head()

In [None]:
sim_df = similarity_df.merge(movies_df[['movieId', 'title', 'genres']], how = 'left', left_on = 'movieId',
               right_on = 'movieId')
sim_df.head()

In [None]:
# Recommend the movies most correlated with movie_id that the user hasn't seen yet.
user_id = 36
## get list of already interacted content ids (in order to avoid coming up in the recommendation list)
collab_ignore_items_1 = interactions_movies_df[interactions_movies_df['userId'] == user_id]['movieId'].tolist()
#collab_ignore_items


In [None]:
collab_recommendations_df_1 = sim_df[~sim_df['movieId'].isin(collab_ignore_items_1)] \
                               .sort_values('Correlation', ascending = False) \
                               .head(10)

In [None]:
collab_recommendations_df_1