In [13]:

import pandas as pd
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

from fuzzywuzzy import fuzz



class DataFrameReader:
    """
    Class reads the movie and ratings files and organizes 
    them into Pandas DataFrames.
    """
    def __init__(self, movies_path, ratings_path):
        self.movies_path = movies_path
        self.ratings_path = ratings_path

    
    def load_data(self):
        movies_df = pd.read_csv(
            self.movies_path, usecols=['movieId', 'title'], 
            dtype={'movieId': 'int', 'title': 'str'}
        )

        ratings_df = pd.read_csv(
            self.ratings_path, usecols=['userId', 'movieId', 
            'rating'],
            dtype={'userId': 'int', 'movieId': 'int', 'rating': 
            'float'}
        )

        return movies_df, ratings_df



class MovieMatrixBuilder:
    """
    Builds a Scipy Sparse Matrix holding ratings by most 
    active users for most rated movies
    """
    def __init__(self, user_ratings_threshold, 
    movie_ratings_threshold):
        self.user_ratings_threshold = user_ratings_threshold
        self.movie_ratings_threshold = movie_ratings_threshold


    def build_knn_matrix(self, movies_df, ratings_df):
        filtered_movies = ratings_df[
            self.get_filtered_movies(ratings_df) & 
            self.get_filtered_users(ratings_df)
        ]
        movies_matrix = filtered_movies.pivot(
            index='movieId', columns='userId', values='rating'
        ).fillna(0)
        sparse_movies_matrix = csr_matrix(movies_matrix.values)
        return sparse_movies_matrix

    
    def get_filtered_movies(self, ratings_df):
        """
        filters movies by the supplied threshold which is 
        applied to the count of ratings made on each movie
        """
        movies_df_count = pd.DataFrame(
            ratings_df.groupby('movieId').size(), 
            columns=['count']
        )
        most_rated_movies = movies_df_count.query(
            'count >= @self.movie_ratings_threshold'
        ).index
        return ratings_df.movieId.isin(most_rated_movies).values


    def get_filtered_users(self, ratings_df):
        """
        filters users by the supplied threshold which 
        is applied to the count of ratings made by each user. 
        most active users with ratings count above the threshold 
        are selected. 
        """
        users_df_count = pd.DataFrame(
            ratings_df.groupby('userId').size(), columns=['count']
        )
        most_active_users = users_df_count.query(
            'count >= @self.user_ratings_threshold'
        ).index
        return ratings_df.userId.isin(most_active_users).values



class FuzzyWuzzyMatcher:

    def __init__(self, ratio_threshold):
        self.ratio_threshold = ratio_threshold

    def fuzzy_search(self, movie, movie_map):
        matching_items = []

        for index, title in movie_map.items():
            fuzz_ratio = fuzz.ratio(title.lower(), movie.lower())
            if (fuzz_ratio >= self.ratio_threshold):
                matching_items.append((title, index))
        
        return matching_items



class MovieRecommender:
    
    def __init__(self, n_neighbors, movies_path, 
    ratings_path, user_ratings_threshold, movie_ratings_threshold):
        self.n_neighbors = n_neighbors

        data_reader = DataFrameReader(movies_path, ratings_path)
        self.movies_df, self.ratings_df = data_reader.load_data()

        self.matrix_builder = MovieMatrixBuilder(
            user_ratings_threshold, movie_ratings_threshold
        )


    def prepare_model(self):
        self.sparse_matrix = self.matrix_builder.build_knn_matrix(
            self.movies_df, self.ratings_df)
        self.movie_map = {
            index: row.title for index, row in 
            self.movies_df.set_index('movieId').iterrows() 
        }
        self.model = NearestNeighbors(
            n_neighbors=self.n_neighbors, 
            metric='cosine', 
            algorithm='brute', 
            n_jobs=-1
        )
        self.model.fit(self.sparse_matrix)

    
    def recommend(self, movie_query, search_threshold):
        matcher = FuzzyWuzzyMatcher(search_threshold)
        search_results = matcher.fuzzy_search(
            movie_query, self.movie_map
        )

        print(f'Search Results: {search_results}')

        if not search_results:
            print('No Results matching Search Query.')
        else:
            movieIndex = search_results[0][1]
            distances, indices = self.model.kneighbors(
                self.sparse_matrix[movieIndex]
            )
            return [
                [self.movie_map.get(key) for key in indexRow] 
                for indexRow in indices
            ]


# read relevant parameters from user input
movies_path = input('Path to Movies CSV: ')
ratings_path = input('Path to User Ratings CSV: ')
search_threshold = input('Fuzzy Search Threshold: ')
neighbors_count = input('Number of Neighbors: ')
user_ratings_threshold = input('User Ratings Threshold: ')
movie_ratings_threshold = input('Movie Ratings Threshold: ')

# apply unpacking operator to initialize movie 
# recommender with named arguments
recommender = MovieRecommender(
    **{
        'n_neighbors': int(neighbors_count),
        'movies_path': movies_path,
        'ratings_path': ratings_path,
        'user_ratings_threshold': int(user_ratings_threshold),
        'movie_ratings_threshold': int(movie_ratings_threshold)
    }
)
recommender.prepare_model()

movie_query = input('Movie Name: ')
recommender.recommend(movie_query, int(search_threshold))




Search Results: [('Young Frankenstein (1974)', 1278), ('Frankenstein (1931)', 2648), ('Curse of Frankenstein, The (1957)', 2652), ('Frankenstein Unbound (1990)', 8092), ('Frankenstein 90 (1984)', 37444), ('Frankenweenie (2012)', 97172), ('I, Frankenstein (2014)', 108689), ('Victor Frankenstein (2015)', 141004), ('Frankenstein (2015)', 151559)]


[['Young Frankenstein (1974)',
  "Bug's Life, A (1998)",
  'Henry: Portrait of a Serial Killer (1986)',
  None,
  'Babe: Pig in the City (1998)']]