In [2]:
# data from: https://grouplens.org/datasets/movielens/
# small dataset -- Small: 100,000 ratings and 3,600 tag applications applied to 9,000 movies by 600 users. Last updated 9/2018.

# import libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split

# load data downloaded from above link
links = pd.read_csv('links.csv')     # movie id links to IMDB and TMDB
movies = pd.read_csv('movies.csv')   # maps of movie id, title, and genre
ratings = pd.read_csv('ratings.csv') # ratings /5
tags = pd.read_csv('tags.csv')       # comments from users

# prepare data for collaborative filtering within Surprise library
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# train a collaborative filtering model (SVD)
trainset, testset = train_test_split(data, test_size=0.2)
best_svd = SVD()
best_svd.fit(trainset)

# preprocess genres and tags for content similarity
movies['genres'] = movies['genres'].str.replace('|', ' ')
movies['genres'] = movies['genres'].str.lower()
tags['tag'] = tags['tag'].str.lower()
tags_agg = tags.groupby('movieId')['tag'].apply(lambda x: ' '.join(x)).reset_index()
movies = movies.merge(tags_agg, on='movieId', how='left')
movies['tag'] = movies['tag'].fillna('')
movies['content'] = movies['genres'] + ' ' + movies['tag']

# create content similarity matrix
count_vectorizer = CountVectorizer(stop_words='english')
count_matrix = count_vectorizer.fit_transform(movies['content'])
cosine_sim = cosine_similarity(count_matrix, count_matrix)

# mapping of movie titles to indices
title_indices = pd.Series(movies.index, index=movies['title']).drop_duplicates()

# helper function to get content-based recommendations
def get_content_recommendations(title, num_recommendations=10):
    # get the index of the input title
    idx = title_indices[title]
    
    # calculate similarity scores and sort them
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:num_recommendations+1]
    
    # get movie indices and return movie IDs
    movie_indices = [i[0] for i in sim_scores]
    return movies['movieId'].iloc[movie_indices].tolist()

# helper function to get collaborative filtering predictions for a list of movies
def get_collaborative_predictions(user_id, movie_ids):
    predictions = [best_svd.predict(user_id, movie_id) for movie_id in movie_ids]
    return {pred.iid: pred.est for pred in predictions}

# main function: generates hybrid recommendations with explanations based on content and collaborative filters
def get_hybrid_recommendations_with_explanations(user_id, title, num_recommendations=10, alpha=0.5):
    # get a large list of content-based recommendations for the movie title
    content_movie_ids = get_content_recommendations(title, num_recommendations * 5)
    
    # get collaborative predictions for the given user and the list of content-recommended movies
    collab_preds = get_collaborative_predictions(user_id, content_movie_ids)
    
    # find the index of the movie title in the cosine similarity matrix
    idx = title_indices[title]
    
    # get similarity scores from the content-based filter for each recommended movie
    content_sim_scores = cosine_sim[idx][[movies.index[movies['movieId'] == mid][0] for mid in content_movie_ids]]
    
    # initialize
    hybrid_scores = []
    explanations = []
    
    # calculate hybrid score for each recommended movie (only when collaborative score exists)
    for movie_id, content_score in zip(content_movie_ids, content_sim_scores):
        collab_score = collab_preds.get(movie_id, None)  # retrieve collaborative score
        if collab_score is not None:
            # blend content and collaborative scores based on the alpha weight
            hybrid_score = alpha * collab_score + (1 - alpha) * content_score
            hybrid_scores.append((movie_id, hybrid_score, collab_score, content_score))
        else:
            continue  # skip if there's no collaborative score for this movie
    
    # sort movies by hybrid score in descending order and keep top recommendations
    hybrid_scores.sort(key=lambda x: x[1], reverse=True)
    top_hybrid_scores = hybrid_scores[:num_recommendations]
    
    # filter the movies dataset for details on recommended movies
    recommended_movies = movies[movies['movieId'].isin([x[0] for x in top_hybrid_scores])]
    
    # construct explanations for each recommended movie in the top hybrid recommendations
    recommendations_with_explanations = []
    for movie_id, hybrid_score, collab_score, content_score in top_hybrid_scores:
        movie = movies[movies['movieId'] == movie_id].iloc[0]  # get movie details
        title = movie['title']  # extract movie title
        
        # generate individual explanations from content and collaborative filters
        content_explanation = generate_content_explanation(title, movie_id, idx)
        collab_explanation = generate_collab_explanation(user_id, movie_id)
        
        # combine explanations and add to the final recommendations list
        explanation = f"{content_explanation} {collab_explanation}"
        recommendations_with_explanations.append((title, explanation.strip()))
    
    # return final list of recommended movies with explanations
    return recommendations_with_explanations

# function to generate content-based explanation
def generate_content_explanation(recommended_title, recommended_movie_id, original_movie_idx):
    # retrieve the recommended movie's genres and tags
    recommended_movie = movies[movies['movieId'] == recommended_movie_id].iloc[0]
    recommended_genres = set(recommended_movie['genres'].split())
    recommended_tags = set(recommended_movie['tag'].split()) if pd.notna(recommended_movie['tag']) else set()
    
    # retrieve the original movie's genres and tags
    original_movie = movies.iloc[original_movie_idx]
    original_genres = set(original_movie['genres'].split())
    original_tags = set(original_movie['tag'].split()) if pd.notna(original_movie['tag']) else set()
    
    # find common genres and tags
    common_genres = recommended_genres.intersection(original_genres)
    common_tags = recommended_tags.intersection(original_tags)
    
    # build explanation based on shared features
    explanation_parts = []
    if common_genres:
        explanation_parts.append(f"shares genres {', '.join(common_genres)}")
    if common_tags:
        explanation_parts.append(f"has similar themes like {', '.join(common_tags)}")
    
    # format the explanation
    if explanation_parts:
        content_explanation = f"Because it {', and '.join(explanation_parts)},"
    else:
        content_explanation = "Based on content similarity,"
    
    return content_explanation

# function to generate collaborative filtering explanation
def generate_collab_explanation(user_id, movie_id):
    return "and users with similar preferences also liked it."

# example usage
user_id = 1
favorite_movie = 'Toy Story (1995)'
recommendations = get_hybrid_recommendations_with_explanations(user_id, favorite_movie, 3)

# output recommendations with explanations
print(f"Hybrid Recommendations for User {user_id} based on '{favorite_movie}':\n")
for idx, (title, explanation) in enumerate(recommendations, start=1):
    print(f"{idx}. {title}")
    print(f"   {explanation}\n")

Hybrid Recommendations for User 1 based on 'Toy Story (1995)':

1. Monsters, Inc. (2001)
   Because it shares genres animation, fantasy, children, adventure, comedy, and users with similar preferences also liked it.

2. Toy Story 2 (1999)
   Because it shares genres animation, fantasy, children, adventure, comedy, and has similar themes like pixar, and users with similar preferences also liked it.

3. Emperor's New Groove, The (2000)
   Because it shares genres animation, fantasy, children, adventure, comedy, and users with similar preferences also liked it.

