Steps
 
1. Popularity-Based Recommendation
• Recommend top movies based on the average rating.
def  popurality_based(full_movies, n=number of movies to return) -> list of n movie names
2. Collaborative Filtering
• User-Based: Find similar users and recommend movies they liked.
• Item-Based: Recommend movies similar to those a user has already rated highly.
def colobrative_filtering(full_movies, n=number of movies to return, type=’user’)
3. Content-Based Filtering
• Recommend movies based on the similarity of their genres.
Def content_based_filtering(full_movies, n: number of movies to recommend) -> list of n movie names
 
4. Hybrid Recommendation
• Combine the above methods (e.g., weighted average of scores from collaborative filtering and content-based methods).
Def hybrid_filtering(full_movies, n: number of movies to recommend) -> list of n movie names

In [33]:
import pandas as pd
pd.set_option('future.no_silent_downcasting', True)

df_movies = pd.read_excel('movies.xlsx')
df_ratings = pd.read_excel('ratings.xlsx')
df_tags = pd.read_excel('tags.xlsx')

print(df_movies.head(3))
df_movies = df_movies.drop_duplicates()
print(df_ratings.head(3))
df_ratings = df_ratings.drop_duplicates()
print()
df_tags = df_tags.drop_duplicates()
movie_ratings = pd.merge(df_ratings, df_movies, on='movieId', how='left')
print(movie_ratings.head(5))

   movieId                    title  \
0        1         Toy Story (1995)   
1        2           Jumanji (1995)   
2        3  Grumpier Old Men (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224

   userId  movieId  rating  timestamp                        title  \
0       1        1     4.0  964982703             Toy Story (1995)   
1       1        3     4.0  964981247      Grumpier Old Men (1995)   
2       1        6     4.0  964982224                  Heat (1995)   
3       1       47     5.0  964983815  Seven (a.k.a. Se7en) (1995)   
4       1       50     5.0  964982931   Usual Suspects, The (1995)   

                                        genres  
0  Adventure|Animation|C

In [34]:
def popularity_based(movie_ratings, n_movies=5):
    top_movies = movie_ratings.sort_values(by='rating', axis=0, ascending=False)['title'].head(n_movies)
    return top_movies

top_movies = popularity_based(movie_ratings, n_movies=5)
print(top_movies)

34031                 Green Mile, The (1999)
50717    Monty Python's Life of Brian (1979)
18166                  Lightning Jack (1994)
18165                   Jurassic Park (1993)
76157               Flamingo Kid, The (1984)
Name: title, dtype: object


In [35]:
from sklearn.metrics.pairwise import pairwise_distances
import numpy as np

def collaborative_user_based(movie_ratings, n_movies=5):
    user_ratings = movie_ratings.pivot_table(index='userId', columns='movieId', values='rating', fill_value=0)
    distances = pairwise_distances(user_ratings, metric='cosine')
    closest_user_indices = np.argsort(distances, axis=1)[:,-3:-2]
    closest_users = user_ratings.iloc[closest_user_indices.ravel()]
    recommended_movie_indices = closest_users.values.argsort(axis=1)[:,-n_movies:]
    closest_user_movies = pd.DataFrame(user_ratings.columns.to_numpy()[recommended_movie_indices], index=user_ratings.index)
    return closest_user_movies

rec_collab_user_movies = collaborative_user_based(movie_ratings, n_movies=5)
print(rec_collab_user_movies.head(3))

            0     1      2      3      4
userId                                  
1       73017  8533  49772  45668  81845
2         339    50    590    150     62
3        3421  1663   1194   1210    420


In [36]:
def collaborative_item_based(movie_ratings, n_movies=5):
    movie_ratings_user = movie_ratings.pivot_table(index='movieId', columns='userId', values='rating', fill_value=0)
    distances = pairwise_distances(movie_ratings_user, metric='cosine')
    closest_movie_indices = np.argsort(distances, axis=1)[:,(-n_movies-2):-2]
    closest_movies = pd.DataFrame(movie_ratings_user.index.to_numpy()[closest_movie_indices], index=movie_ratings_user.index)
    return closest_movies

rec_collab_item_movies = collaborative_item_based(movie_ratings, n_movies=5)
print(rec_collab_item_movies.head(3))

             0      1      2      3      4
movieId                                   
1        26554   3544  92637  92730  92760
2         5979   5975  98604   5974   5969
3        26171  26169  26158  26151  26150


In [41]:
from sklearn.feature_extraction.text import CountVectorizer

def content_based(movies, n_movies=5):
    genres_vectors = CountVectorizer().fit_transform(movies.genres)
    distances = pairwise_distances(genres_vectors, metric='cosine')
    closest_movie_indices = np.argsort(distances, axis=1)[:,(-n_movies-2):-2]
    closest_movies = pd.DataFrame(movies.movieId.to_numpy()[closest_movie_indices], index=movies.movieId)
    return closest_movies

rec_content_movies = content_based(df_movies, n_movies=5)
print(rec_content_movies.head(5))

             0      1      2      3     4
movieId                                  
1         2076  52967  52952   5390  5391
2         4936   4939   4942   4945  4946
3        44397  44301   3736   3737  3738
4         1999   1998  55156  55167  1997
5        27722  27721   3173  27708  3176


In [None]:
movie_dates = pd.to_datetime(movie_ratings.timestamp, unit='D')
print(movie_dates.head(5))


def popularity_based_weighted_by_time(movies, n_movies=5):
    top_movies = movie_ratings.sort_values(by='rating', axis=0, ascending=False)['title'].head(n_movies)
    return top_movies


AttributeError: Can only use .dt accessor with datetimelike values