In [82]:
# KNN
# Needed libraries 
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [83]:
# Takes the csv file and converts it into a dataframe
df = pd.read_csv('/Users/dimitrishort/Documents/DataMiningFinal/ProcessedMovieData.csv')

# Displays the first few rows of the dataframe 
df.head()

Unnamed: 0,userId,movieId,rating,title,genres,datetime,year,month,day,dayofweek,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,release_year
0,1,1,0.777778,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,2000-07-30 18:45:03,2000,7,30,6,...,0,0,0,0,0,0,0,0,0,1995.0
1,5,1,0.777778,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1996-11-08 06:36:02,1996,11,8,4,...,0,0,0,0,0,0,0,0,0,1995.0
2,7,1,0.888889,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,2005-01-25 06:52:26,2005,1,25,1,...,0,0,0,0,0,0,0,0,0,1995.0
3,15,1,0.444444,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,2017-11-13 12:59:30,2017,11,13,0,...,0,0,0,0,0,0,0,0,0,1995.0
4,17,1,0.888889,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,2011-05-18 05:28:03,2011,5,18,2,...,0,0,0,0,0,0,0,0,0,1995.0


In [84]:
# Creates a user-item matrix, each row represents a user and each column represents a movie
user_item_matrix = df.pivot_table(index='userId', columns='movieId', values='rating').fillna(0)

In [85]:
# Converts the user-item matrix to a sparse matrix format for more efficient data handling
sparse_user_item = csr_matrix(user_item_matrix.values)

In [86]:
# Sets up the KNN model to find nearest neighbors with euclidean similarity
model_knn = NearestNeighbors(metric='euclidean', algorithm='brute', n_neighbors=20, n_jobs=-1)

# Fits the model on the data
model_knn.fit(sparse_user_item)

In [87]:
def recommend_movies(user_id, data, model, n_recommendations):
    user_index = user_id - 1 # Gets the user index since it's off by 1 
    
    # Uses the model to find the nearest neighbors (users) based on the given user preferences 
    distances, indices = model.kneighbors(data[user_index], n_neighbors=n_recommendations+1)  

    # Gets a set of movieID that the user has already watched and rated
    target_user_rated_movies = set(df[df['userId'] == user_id]['movieId'])

    recommended_movies = set() # Initializes a set to store the recommended movies 
    
    # Iterates over each index in list of indices skipping the user
    for i in range(1, len(indices.flatten())):
        similar_user_id = indices.flatten()[i] + 1 # Gets the userID of a similar user adjusting the index 
        
        # Gets the movieID rated by the similar user
        similar_user_movies = set(df[df['userId'] == similar_user_id]['movieId']) - target_user_rated_movies 
        recommended_movies.update(similar_user_movies) # Adds the new movieID to the set of recommended movies 

        # Loop breaks if the number of recommendations is higher than the amount requested 
        if len(recommended_movies) >= n_recommendations:
            break

    # Gets the titles and movieID of the recommended movies, ensures no duplicates are produced
    movie_titles = df[df['movieId'].isin(recommended_movies)][['movieId', 'title']].drop_duplicates().head(n_recommendations)
    
    # Prints the recommendations for the user with Movie ID and the Title 
    print(f"Recommendations for User {user_id}:")
    for movie_id, title in zip(movie_titles['movieId'], movie_titles['title']):
        print(f"Movie ID {movie_id}: {title}")

# Calling the function to give recommendations 
recommend_movies(1, sparse_user_item, model_knn, 5)

Recommendations for User 1:
Movie ID 2019: Seven Samurai (Shichinin no samurai)
Movie ID 3996: Crouching Tiger, Hidden Dragon (Wo hu cang long)
Movie ID 589: Terminator 2: Judgment Day
Movie ID 332: Village of the Damned
Movie ID 493: Menace II Society


In [88]:
def user_genre_profile(user_id, rating_threshold=0.8):
    # Filters the movies that the user has rated higher than a 0.8 
    liked_movies = df[(df['userId'] == user_id) & (df['rating'] >= rating_threshold)]
    
    # Converts the genre column into a binary matrix to get the count of occurances 
    liked_genres = liked_movies['genres'].str.get_dummies(sep='|').sum().sort_values(ascending=False)
    return liked_genres # Returns the sorted list of genres liked by the user


def recommended_movie_genres(recommended_movie_ids):
    # Filters the dataframe to get the movies that are in the recommended list
    recommended_movies = df[df['movieId'].isin(recommended_movie_ids)]
    
    # Converts the genre column into a binary matrix to get the count of occurances 
    recommended_genres = recommended_movies['genres'].str.get_dummies(sep='|').sum().sort_values(ascending=False)
    return recommended_genres # Returns the sorted list of genres recommended for the user 

user_genres = user_genre_profile(1, 0.8) # Specify the user to give recommendations with movies they rated above a 0.8
recommended_genres = recommended_movie_genres([2019, 3996, 589, 332, 493]) # list of movieID recommended to the user

print("User's Preferred Genres:\n", user_genres) # Prints the user's preferred genres 
print("Genres of Recommended Movies:\n", recommended_genres) # Prints the genres of movies recommended 


User's Preferred Genres:
 Adventure    45
Action       44
Drama        42
Comedy       38
Children     28
Thriller     25
Crime        24
Animation    22
Fantasy      22
Musical      17
Sci-Fi       17
War          13
Mystery      11
Romance      10
Western       3
Horror        3
Film-Noir     1
dtype: int64
Genres of Recommended Movies:
 Action       394
Sci-Fi       236
Drama        170
Romance      110
Adventure     48
Crime         12
Horror        12
dtype: int64


In [None]:
'''
The movie recommendations using KNN modeling produces somewhat accurate results. 
The recommended movie genres line up with their preferred but tends to have emphasize on
Action, Sci-Fi, and Drama which are mostly in the user's top preferences. It also seems to 
lack in recommendations for other top genres such as Adventure and Comedy. While it does give
somewhat accurate recommendations, it could use some improvement in the modeling to gear more
towards the users top preferences.
'''