In [11]:

import pandas as pd
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

from fuzzywuzzy import fuzz

import os
import argparse



class DataFrameReader:
    """
    A class that reads the movie files and organizes into data-frames.
    """
    def __init__(self, movies_path, ratings_path):
        self.movies_path = movies_path
        self.ratings_path = ratings_path

    
    def load_data(self):
        movies_df = pd.read_csv(
            self.movies_path, usecols=['movieId', 'title'], 
            dtype={'movieId': 'int32', 'title': 'str'}
        )

        ratings_df = pd.read_csv(
            self.ratings_path, usecols=['userId', 'movieId', 'rating'],
            dtype={'userId': 'int', 'movieId': 'int', 'rating': 'float'}
        )

        return movies_df, ratings_df



class MovieMatrixBuilder:
    
    def __init__(self, user_ratings_threshold, movie_ratings_threshold):
        self.user_ratings_threshold = user_ratings_threshold
        self.movie_ratings_threshold = movie_ratings_threshold


    def build_knn_matrix(self, movies_df, ratings_df):
        filtered_movies = ratings_df[
            self.get_filtered_movies(ratings_df) & self.get_filtered_users(ratings_df)
        ]
        movies_matrix = filtered_movies.pivot(
            index='movieId', columns='userId', values='rating'
        ).fillna(0)
        sparse_movies_matrix = csr_matrix(movies_matrix.values)
        return sparse_movies_matrix

    
    def get_filtered_movies(self, ratings_df):
        movies_df_count = pd.DataFrame(ratings_df.groupby('movieId').size(), columns=['count'])
        highest_rated_movies = movies_df_count.query('count >= @self.movie_ratings_threshold').index
        return ratings_df.movieId.isin(highest_rated_movies).values


    def get_filtered_users(self, ratings_df):
        users_df_count = pd.DataFrame(ratings_df.groupby('userId').size(), columns=['count'])
        most_active_users = users_df_count.query('count >= @self.user_ratings_threshold').index
        return ratings_df.userId.isin(most_active_users).values



class FuzzyWuzzyMatcher:

    def __init__(self, ratio_threshold):
        self.ratio_threshold = ratio_threshold

    def fuzzy_search(self, movie, movie_map):
        matching_items = []

        for index, title in movie_map.items():
            fuzz_ratio = fuzz.ratio(title.lower(), movie.lower())
            if (fuzz_ratio >= self.ratio_threshold):
                matching_items.append((title, index))
        
        return matching_items



class MovieRecommender:
    
    def __init__(self, n_neighbors, movies_path, ratings_path, user_ratings_threshold, movie_ratings_threshold):
        self.n_neighbors = n_neighbors

        data_reader = DataFrameReader(movies_path, ratings_path)
        self.movies_df, self.ratings_df = data_reader.load_data()

        self.matrix_builder = MovieMatrixBuilder(user_ratings_threshold, movie_ratings_threshold)


    def prepare_model(self):
        self.sparse_matrix = self.matrix_builder.build_knn_matrix(
            self.movies_df, self.ratings_df)
        self.movie_map = { 
            index: row.title for index, row in self.movies_df.set_index('movieId').iterrows() 
        }
        self.model = NearestNeighbors(n_neighbors=self.n_neighbors, metric='cosine', algorithm='brute', n_jobs=-1)
        self.model.fit(self.sparse_matrix)

        
    
    def recommend(self, movie_query, search_threshold):
        matcher = FuzzyWuzzyMatcher(search_threshold)
        search_results = matcher.fuzzy_search(movie_query, self.movie_map)

        print(f'Search Results: {search_results}')

        if not search_results:
            print('No Results matching Search Query.')
        else:
            movieIndex = search_results[0][1]
            distances, indices = self.model.kneighbors(self.sparse_matrix[movieIndex])
            return [[self.movie_map.get(key) for key in indexRow] for indexRow in indices]



# 'movies_path': '/Users/dduru/PythonProjects/data/KNN/movielens/ml-latest-small/movies.csv',
# 'ratings_path': '/Users/dduru/PythonProjects/data/KNN/movielens/ml-latest-small/ratings.csv',


movies_path = input('Path to Movies CSV: ')
ratings_path = input('Path to User Ratings CSV: ')
search_threshold = input('Fuzzy Search Threshold: ')
neighbors_count = input('Number of Neighbors: ')
user_ratings_threshold = input('User Ratings Threshold: ')
movie_ratings_threshold = input('Movie Ratings Threshold: ')

recommender = MovieRecommender(
    **{
        'n_neighbors': int(neighbors_count),
        'movies_path': movies_path,
        'ratings_path': ratings_path,
        'user_ratings_threshold': int(user_ratings_threshold),
        'movie_ratings_threshold': int(movie_ratings_threshold)
    }
)
recommender.prepare_model()

movie_query = input('Movie Name: ')
recommender.recommend(movie_query, int(search_threshold))




Search Results: [('Babe (1995)', 34), ('Barbarella (1968)', 674), ('Beach, The (2000)', 3285), ('Hamlet (2000)', 3598), ('Bait (2000)', 3898), ('Beautiful (2000)', 3912), ('Bamboozled (2000)', 3943), ('Bedazzled (2000)', 3968), ('Bounce (2000)', 3987), ('Malèna (2000)', 3992), ('Unbreakable (2000)', 3994), ('Double Take (2001)', 4053), ('Hannibal (2001)', 4148), ('Vatel (2000)', 4152), ('Blow (2001)', 4239), ('Angel Eyes (2001)', 4305), ('Baby Boy (2001)', 4371), ('Lumumba (2000)', 4384), ('Bully (2001)', 4450), ('Made (2001)', 4452), ('Brother (2000)', 4640), ('Bubble Boy (2001)', 4732), ('Maybe Baby (2000)', 4740), ('O (2001)', 4745), ('Hardball (2001)', 4771), ('Big Trouble (2002)', 4774), ('Liam (2000)', 4780), ('Bandits (2001)', 4844), ('Bones (2001)', 4852), ('Tape (2001)', 4888), ('Baran (2001)', 4964), ('Ali (2001)', 5009), ('Maelström (2000)', 5071), ('Rollerball (2002)', 5094), ('Ice Age (2002)', 5218), ('Blade II (2002)', 5254), ('Frailty (2001)', 5294), ('Rain (2001)', 5328

[['Babe (1995)',
  'Leaving Las Vegas (1995)',
  'Big Bully (1996)',
  'Kingpin (1996)',
  'House of the Spirits, The (1993)']]