In [86]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds 
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import zscore

In [345]:
data = pd.read_csv('movie_db_clean.csv', iterator=True, chunksize=250000)

In [346]:
df = pd.concat(data, ignore_index=True)

In [347]:
df.columns

Index(['movieId', 'title', 'genres', 'year', 'userId', 'rating', 'Synopsis'], dtype='object')

In [348]:
df = df.sample(50000, random_state = 21)

## 1- Data Pre-Processing

Inspired by the following repo : https://github.com/topspinj/tmls-2020-recommender-workshop

In [6]:
def create_matrix(df):
    
    """
    This function creates a sparse user-movie matrix from a dataframe

    Args:
    - a Dataframe that contains at least the columns movieId, userId, and rating

    Returns:
    - matrix : a sparse user-movie matrix of size NxM with N the number of unique users and M the number of unique movies
    - map_user : a dictionary that maps user_ids to their respective indices
    - map_user_inv : a dictionary that maps indices to the user_id
    - map_movie : a dictionary that maps movie_ids to their respective indices
    - map_movie_inv : a dictionary that maps indices to the movie_id

    """

    N = df['userId'].nunique()
    M = df['movieId'].nunique()

    map_user = dict(zip(np.unique(df['userId']), list(range(df['userId'].nunique()))))
    map_movie = dict(zip(np.unique(df['movieId']), list(range(df['movieId'].nunique()))))

    map_user_inv = {v: k for k, v in map_user.items()}
    map_movie_inv = {v: k for k, v in map_movie.items()}

    user_idx = [map_user[i] for i in df['userId']]
    movie_idx = [map_movie[i] for i in df['movieId']]

    matrix = csr_matrix((df["rating"], (user_idx, movie_idx)), shape=(N,M))

    df_matrix = df.pivot(index='userId', columns='movieId', values='rating').fillna(0)

    return matrix, df_matrix, map_user, map_user_inv, map_movie, map_movie_inv, user_idx, movie_idx

In [7]:
matrix, df_matrix, map_user, map_user_inv, map_movie, map_movie_inv, user_idx, movie_idx = create_matrix(df)

## 2- Create SVD algorithm

Inspired by the following repo : https://github.com/vivdalal/movie-recommender-system/blob/master/movie_recommendation_system.ipynb

In [8]:
def svd(matrix, n_factors=50):
    """
    This function returns a dataframe with the predicted ratings for all users within the dataframe

    Args:
    - matrix : the sparse user-movie matrix created during step 1
    - n_factors : the number of factors / rank of the latent matrix for factorization

    Returns:
    - predictions : a DataFrame containing the predicted ratings for all users in the original dataset
    """
    # The following code creates :
    # U : user matrix of dimension (n_users, n_factors)
    # sigma : the diagonal matrix of singular values
    # V_t : the transposed movie matrix of dimension (n_factors, n_movies)
    
    U, sigma, V_t = svds(matrix, k = n_factors)

    sigma = np.diag(sigma)

    pred_ratings = np.dot((U @ sigma), V_t)

    predictions = pd.DataFrame(pred_ratings)

    predictions.rename(columns=dict(zip(predictions.columns, list(map_movie.keys()))))
    predictions.index = list(map_user.keys())
    
    return predictions

In [9]:
df_pred = svd(matrix, n_factors=50)

## 3- Recommand movies to user

Inspired by the following repo : https://github.com/vivdalal/movie-recommender-system/blob/master/movie_recommendation_system.ipynb

In [10]:
def recommend_movies(df_pred, user_id, df, df_matrix, n_recommendations):

    """

    Args:

    Returns:
    
    """

    # Sort user's predictions
    sort_pred = df_pred.iloc[user_id].sort_values(ascending=False)
    
    # User data
    user_data = df_matrix.iloc[user_id]

    # Get the index of movies already seen by user
    # We filled by 0 the missing values and there was no 0 rating in the original database
    seen_movies = list(df_matrix.iloc[user_id][df_matrix.iloc[user_id] != 0.0].index)

    print('User {0} has already rated {1} movies.'.format(user_id, len(seen_movies)))
    print('Recommending highest {0} predicted ratings movies not already rated.'.format(n_recommendations))
    
    # Recommend the highest predicted rating movies that the user hasn't seen yet
    reco_movies = sort_pred[~sort_pred.index.isin(seen_movies)][:n_recommendations].index

    # Return the recommanded movies with their respective titles
    recommandations = df.set_index('movieId').iloc[reco_movies][['title']]

    return recommandations

In [377]:
recommandations = recommend_movies(df_pred, 26, df, df_matrix, 10)

recommandations

User 26 has already rated 1 movies.
Recommending highest 10 predicted ratings movies not already rated.


Unnamed: 0_level_0,title
movieId,Unnamed: 1_level_1
3813,Interiors
986,Fly Away Home
31437,Nobody Knows (Dare mo shiranai)
4306,Shrek
1276,Cool Hand Luke
8531,White Chicks
1196,Star Wars: Episode V - The Empire Strikes Back
5247,Smokey and the Bandit
18,Four Rooms
2922,Hang 'Em High


## User based collaborative filtering 
Based on the following video : https://www.youtube.com/watch?v=cxcFi3RDrEw&list=PLVppujud2yJqshyM80nNDZgye-AFufyqF&index=2

In [371]:
class user_collaborative_filtering:
    def __init__(self, matrix, df_matrix, df, user):
        self.matrix = matrix
        self.df_matrix = df_matrix
        self.df = df
        self.user = user

    def compute_cosine_similarity(self):

        """
        This function returns a similarity vector based on the cosine similarity for a specific user of the database

        Args:
        - self : instance of the class

        Returns:
        - similarity_vector : a similarity vector of a user with the other users in the database
        """

        similarity_vector = cosine_similarity(self.matrix[self.user], self.matrix) #we compute the similarity vector of the user
        return similarity_vector[0]
    
    def get_neighborhood(self, similarity_vector, k):

        """
        This function returns the indices of the k most similar users excluding the user itself

        Args:
        - self : instance of the class
        - similarity_vector : a similarity vector of a user with the other users in the database
        - k : the number of closest users to get

        Returns:
        - ordered_similar_users_indices : indices of the k most similar users excluding the user itself
        """

        similar_users_indices = np.argsort(similarity_vector) #we get the indexes of the k nearest neighbors of the traget user excluding the target itself
        ordered_similar_users_indices = similar_users_indices[::-1]
        ordered_similar_users_indices = ordered_similar_users_indices[ordered_similar_users_indices != self.user][:k]
        return ordered_similar_users_indices
    
    def get_weights_min_max(self, similarity_vector, indices):
    
        """
        This function returns the weights vector of the similar users, based on min-max weighting

        Args:
        - similarity_vector : a similarity vector of a user with the other users in the database
        - indices : indices of the k most similar users excluding the user itself

        Returns:
        - min_max_weights : the weights vector of the similar users, based on min-max weighting
        """

        new_similarity_vector = similarity_vector[indices]
        min = np.min(new_similarity_vector)
        max = np.max(new_similarity_vector)
        min_max_weights = (new_similarity_vector-min)/(max-min)
        min_max_weights = min_max_weights/sum(min_max_weights)
        return min_max_weights
    
    def get_predictions(self, indices, weights):

        """
        This function returns the movies recommendations (10 movies)

        Args:
        - self : instance of the class
        - indices : indices of the k most similar users excluding the user itself
        - weights : the weights vector of the similar users, based on min-max weighting

        Returns:
        - recommandations : a two columns dataframe with the movieIds and the movies titles
        """

        weighted_ratings = self.df_matrix.iloc[indices].multiply(weights, axis=0)
        movies_ranked = weighted_ratings.sum().sort_values(ascending = False)
        movies_ranked_index = movies_ranked[:10].index.values
        recommandations = self.df.set_index('movieId').loc[movies_ranked_index][['title']]
        recommandations = recommandations.drop_duplicates()
        return recommandations
    




In [372]:
filtering =  user_collaborative_filtering(matrix, df_matrix, df, 26)
similarity_vector = filtering.compute_cosine_similarity()
indices = filtering.get_neighborhood(similarity_vector, 50)
weights = filtering.get_weights_min_max(similarity_vector, indices)
predictions = filtering.get_predictions(indices, weights)


In [373]:
predictions

Unnamed: 0_level_0,title
movieId,Unnamed: 1_level_1
52458,Disturbia
2160,Rosemary's Baby
96857,Red Lights
2804,"Christmas Story, A"
2067,Doctor Zhivago
4573,Blaze
2702,Summer of Sam
2369,Desperately Seeking Susan
73854,"Rudolph, the Red-Nosed Reindeer"
1078,Bananas


## Item based collaborative filtering

In [374]:
class item_collaborative_filtering:
    def __init__(self, matrix, df_matrix, df, user):
        self.matrix = matrix
        self.df_matrix = df_matrix
        self.df = df
        self.user = user
        self.map_movie_inv = map_movie_inv
        self.similarity_matrix = self.compute_cosine_similarity()
        self.movies_rated = self.get_user_watched_movies()
        


    def compute_cosine_similarity(self):

        """
        This function returns a similarity vector based on the cosine similarity matrix of movies

        Args:
        - self : instance of the class

        Returns:
        - similarity_matrix : a similarity matrix of movies with the other movies in the database
        """
        similarity_matrix = cosine_similarity(self.matrix.T) #we compute the similarity vector of the user
        return similarity_matrix
    
    def get_user_watched_movies(self):

        """
        This function returns the indices of the rated movies for a user

        Args:
        - self : instance of the class

        Returns:
        - movies_rated : indices of the rated movies
        """

        movies_user_ratings = self.matrix[self.user].toarray()[0] #we get the ratings for a specific user 
        movies_rated = np.nonzero(movies_user_ratings)[0]
        return movies_rated



    def get_movie_recommendations(self, num_recommendations=10):
        """
        This function returns movie recommendations for the user based on item-based collaborative filtering

        Args:
        - num_recommendations: number of recommendations to generate

        Returns:
        - recommendations: a DataFrame with the recommended movies
        """
        # Find movies similar to the ones the user has rated
        similar_movies = np.zeros(self.matrix.shape[1])
        for movie in self.movies_rated:
            similar_movies += self.similarity_matrix[movie]

        # Exclude movies the user has already rated
        similar_movies[self.movies_rated] = 0

        # Get indices of the top N similar movies
        recommended_movie_indices = np.argsort(similar_movies)[::-1][:num_recommendations]
        recommended_movie_ids = [self.map_movie_inv[index] for index in recommended_movie_indices]
        #print(recommended_movie_ids)

        # Get movie titles from DataFrame
        recommendations = self.df.set_index('movieId').loc[recommended_movie_ids][['title']]
        unique_recos = recommendations.drop_duplicates()

        return unique_recos




In [376]:
item_collaborative_filter = item_collaborative_filtering(matrix, df_matrix, df, 26)
recommendation = item_collaborative_filter.get_movie_recommendations()
recommendation

Unnamed: 0_level_0,title
movieId,Unnamed: 1_level_1
136239,Smart House
4573,Blaze
96857,Red Lights
2160,Rosemary's Baby
2702,Summer of Sam
2369,Desperately Seeking Susan
2804,"Christmas Story, A"
2067,Doctor Zhivago
73854,"Rudolph, the Red-Nosed Reindeer"
1078,Bananas


In [379]:
recommandations

Unnamed: 0_level_0,title
movieId,Unnamed: 1_level_1
3813,Interiors
986,Fly Away Home
31437,Nobody Knows (Dare mo shiranai)
4306,Shrek
1276,Cool Hand Luke
8531,White Chicks
1196,Star Wars: Episode V - The Empire Strikes Back
5247,Smokey and the Bandit
18,Four Rooms
2922,Hang 'Em High


In [380]:
predictions

Unnamed: 0_level_0,title
movieId,Unnamed: 1_level_1
52458,Disturbia
2160,Rosemary's Baby
96857,Red Lights
2804,"Christmas Story, A"
2067,Doctor Zhivago
4573,Blaze
2702,Summer of Sam
2369,Desperately Seeking Susan
73854,"Rudolph, the Red-Nosed Reindeer"
1078,Bananas


In [381]:
recommendation

Unnamed: 0_level_0,title
movieId,Unnamed: 1_level_1
136239,Smart House
4573,Blaze
96857,Red Lights
2160,Rosemary's Baby
2702,Summer of Sam
2369,Desperately Seeking Susan
2804,"Christmas Story, A"
2067,Doctor Zhivago
73854,"Rudolph, the Red-Nosed Reindeer"
1078,Bananas
