<a href="https://colab.research.google.com/github/dhrumil96/BIDS/blob/master/BIDS_Assignment_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import time
import gc

# data science imports
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# utils import
!pip install -q fuzzywuzzy
from fuzzywuzzy import fuzz



In [0]:
url = 'https://raw.githubusercontent.com/dhrumil96/BIDS/master/movies.csv'
movies = pd.read_csv(url)

url = 'https://raw.githubusercontent.com/dhrumil96/BIDS/master/ratings.csv'
ratings = pd.read_csv(url)

In [0]:
class KnnRecommender:
    """
    This is an item-based collaborative filtering recommender with
    KNN implmented by sklearn
    """
    def __init__(self):
        """
        Recommender requires path to data: movies data and ratings data
        Parameters
        ----------
        path_movies: str, movies data file path
        path_ratings: str, ratings data file path
        """
        
        self.movie_rating_thres = 0
        self.user_rating_thres = 0
        self.model = NearestNeighbors(5,'brut', metric='cosine', n_jobs=-1)

    def set_filter_params(self, movie_rating_thres, user_rating_thres):
        """
        set rating frequency threshold to filter less-known movies and
        less active users
        Parameters
        ----------
        movie_rating_thres: int, minimum number of ratings received by users
        user_rating_thres: int, minimum number of ratings a user gives
        """
        self.movie_rating_thres = movie_rating_thres
        self.user_rating_thres = user_rating_thres
    
  
    def _prep_data(self):
        """
        prepare data for recommender
        1. movie-user scipy sparse matrix
        2. hashmap of movie to row index in movie-user scipy sparse matrix
        """
                
        df_ratings = ratings
        df_movies = movies
        
        # filter data
        df_movies_cnt = pd.DataFrame(
            df_ratings.groupby('movieId').size(),
            columns=['count'])
        popular_movies = list(set(df_movies_cnt.query('count >= @self.movie_rating_thres').index))  # noqa
        movies_filter = df_ratings.movieId.isin(popular_movies).values

        df_users_cnt = pd.DataFrame(
            df_ratings.groupby('userId').size(),
            columns=['count'])
        active_users = list(set(df_users_cnt.query('count >= @self.user_rating_thres').index))  # noqa
        users_filter = df_ratings.userId.isin(active_users).values

        df_ratings_filtered = df_ratings[movies_filter & users_filter]

        # pivot and create movie-user matrix
        movie_user_mat = df_ratings_filtered.pivot(
            index='movieId', columns='userId', values='rating').fillna(0)
        # create mapper from movie title to index
        hashmap = {
            movie: i for i, movie in
            enumerate(list(df_movies.set_index('movieId').loc[movie_user_mat.index].title)) # noqa
        }
        # transform matrix to scipy sparse matrix
        movie_user_mat_sparse = csr_matrix(movie_user_mat.values)

        # clean up
        del df_movies, df_movies_cnt, df_users_cnt
        del df_ratings, df_ratings_filtered, movie_user_mat
        gc.collect()
        return movie_user_mat_sparse, hashmap

        
    def _inference(self, model, data, hashmap,
                   fav_movie, n_recommendations):
        """
        return top n similar movie recommendations based on user's input movie
        Parameters
        ----------
        model: sklearn model, knn model
        data: movie-user matrix
        hashmap: dict, map movie title name to index of the movie in data
        fav_movie: str, name of user input movie
        n_recommendations: int, top n recommendations
        Return
        ------
        list of top n similar movie recommendations
        """
        # fit
        model.fit(data)
        
        # get input movie index
        print('You have input movie:', fav_movie)
        
        match_found = []
        # get match
        for title, idx in hashmap.items():
            ratio = fuzz.token_set_ratio(title.lower(), fav_movie.lower())
            if ratio >= 50:
                match_found.append((title, idx, ratio))
        # sort
        match_found = sorted(match_found, key=lambda x: x[2])[::-1]
        if not match_found:
            print('Oops! No match is found')
        else:
            print('Found possible matches in our database: '
                  '{0}\n'.format([x[0] for x in match_found]))
            idx = match_found[0][1]
        
        #idx = self._fuzzy_matching(hashmap, fav_movie)
        
        # inference
        print('Recommendation system start to make inference')
        print('......\n')
        t0 = time.time()
        distances, indices = model.kneighbors(
            data[idx],
            n_neighbors=n_recommendations+1)
        
        # get list of raw idx of recommendations
        raw_recommends = \
            sorted(
                list(
                    zip(
                        indices.squeeze().tolist(),
                        distances.squeeze().tolist()
                    )
                ),
                key=lambda x: x[1]
            )[:0:-1]
        print('It took my system {:.2f}s to make inference \n\
              '.format(time.time() - t0))
        # return recommendation (movieId, distance)
        return raw_recommends

    def make_recommendations(self, fav_movie, n_recommendations):
        """
        make top n movie recommendations
        Parameters
        ----------
        fav_movie: str, name of user input movie
        n_recommendations: int, top n recommendations
        """
        # get data
        movie_user_mat_sparse, hashmap = self._prep_data()
        # get recommendations
        raw_recommends = self._inference(
            self.model, movie_user_mat_sparse, hashmap,
            fav_movie, n_recommendations)
        # print results
        reverse_hashmap = {v: k for k, v in hashmap.items()}
        print('Recommendations for {}:'.format(fav_movie))
        for i, (idx, dist) in enumerate(raw_recommends):
            print('{0}: {1}, with distance '
                  'of {2}'.format(i+1, reverse_hashmap[idx], dist))



In [4]:
recommend = KnnRecommender()
recommend.set_filter_params(20, 20)
recommend.make_recommendations('Captain America', 10)

You have input movie: Captain America
Found possible matches in our database: ['Captain America: Civil War (2016)', 'Captain America: The Winter Soldier (2014)', 'Captain America: The First Avenger (2011)', 'Captain Phillips (2013)', 'Borat: Cultural Learnings of America for Make Benefit Glorious Nation of Kazakhstan (2006)', 'Team America: World Police (2004)', 'Coming to America (1988)', 'Beavis and Butt-Head Do America (1996)', 'American Pie (1999)']

Recommendation system start to make inference
......

It took my system 0.11s to make inference 
              
Recommendations for Captain America:
1: Mad Max: Fury Road (2015), with distance of 0.45130580730238046
2: Star Wars: Episode VII - The Force Awakens (2015), with distance of 0.4271566242244059
3: Avengers: Age of Ultron (2015), with distance of 0.41587221510196537
4: Iron Man 3 (2013), with distance of 0.40842913996731944
5: Rogue One: A Star Wars Story (2016), with distance of 0.40769655565900265
6: Doctor Strange (2016), w