# Content based recommendation filtering

Code adaptation from https://medium.com/mlearning-ai/basic-content-based-recommendation-system-with-python-code-be920b412067

## Overview:

1. Read the movie data from a CSV file using pandas.

2. Preprocess the movie data by removing rows with missing Plot values.
3. Create a TfidfVectorizer object to generate a term frequency-inverse document frequency (TF-IDF) matrix based on the Plot column of the movie data.
4. Use the fit_transform method of the TfidfVectorizer object to generate the TF-IDF matrix.
5. Use the linear_kernel function from scikit-learn to calculate the cosine similarity between each pair of movies based on their TF-IDF matrices.
6. Define a get_recommendations function that takes a movie title and number of recommendations as input, and returns a list of recommended movies based on the cosine similarity scores of the input movie.
7. In the get_recommendations function, use the indices variable to get the index of the input movie, and the cosine similarity scores to find the most similar movies.
8. Return the titles of the recommended movies.

In [99]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

df = pd.read_csv('../Data wrangling/merged_movie_data_success.csv')
df = df[df['Plot'].notna()]

# Create a TfidfVectorizer
tfidf = TfidfVectorizer()
# Fit and transform the data to a tfidf matrix
tfidf_matrix = tfidf.fit_transform(df['Plot'])

#Create matrix for movie similarities based on tfidf interactions between movies
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
indices = pd.Series(df.index, index=df['Title']).drop_duplicates()

def get_recommendations(title, cosine_sim=cosine_sim, num_recommend=10):
    idx = indices[title]
    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))
    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Get the scores of the 10 most similar movies
    top_similar = sim_scores[1:num_recommend+1]
    # Get the movie indices
    movie_indices = [i[0] for i in top_similar]
    # Return the top 10 most similar movies
    final_recommendation = list(df['Title'].iloc[movie_indices])
    return final_recommendation

get_recommendations('Spectre', num_recommend=20)


['Die Another Day',
 'Quantum of Solace',
 'Skyfall',
 'Casino Royale',
 'GoldenEye',
 'The Prince of Egypt',
 'Tomorrow Never Dies',
 'V for Vendetta',
 'Hard Rain',
 'National Treasure',
 'Resident Evil: Apocalypse',
 'Hercules',
 'Hercules',
 'Sky Captain and the World of Tomorrow',
 'Cinderella Man',
 'Superman Returns',
 'Poseidon',
 'Real Steel',
 'X-Men: Apocalypse',
 'End of Days']

Let's refactor the code so we can make a recommendation for a combined number of movies, like a "profile based" recommendation.

In [104]:
def get_combined_recommendations(movie_list, num_recommend=10):
    df_movies = pd.read_csv('../Data wrangling/merged_movie_data_success.csv')
    df_movies = df_movies[df_movies['Plot'].notna()]

    # Create a TfidfVectorizer and remove stopwords
    tfidf = TfidfVectorizer()
    # Fit and transform the data to a tfidf matrix
    tfidf_matrix = tfidf.fit_transform(df_movies['Plot'])

    # Calculate the cosine similarity between all movies
    cosine_sim = cosine_similarity(tfidf_matrix)

    # Initialize an empty array to store the combined similarity scores
    combined_sim_scores = np.zeros(cosine_sim.shape[0])

    # Calculate the combined similarity scores for all input movies
    for movie_title in movie_list:
        idx = df_movies[df_movies['Title'] == movie_title].index[0]
        sim_scores = cosine_sim[idx]
        combined_sim_scores += sim_scores

    # Sort the movies based on the combined similarity scores
    sim_scores = list(enumerate(combined_sim_scores))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the indices of the top `num_recommend` recommended movies
    movie_indices = [i[0] for i in sim_scores[1:num_recommend+len(movie_list)]]

    # Remove the input movies from the recommended movies
    recommended_movies = []
    for i in movie_indices:
        if df_movies['Title'].iloc[i] not in movie_list:
            recommended_movies.append(df_movies['Title'].iloc[i])

    # Return the top `num_recommend` most similar movies
    return recommended_movies


test_movie_list = ['Puss in Boots', 'Alice Through the Looking Glass', 'Spectre']
get_combined_recommendations(test_movie_list, num_recommend = 10)

['Salt',
 'Shrek the Third',
 'Shrek 2',
 'V for Vendetta',
 'Skyfall',
 'Quantum of Solace',
 'Mirror Mirror',
 'RED 2',
 'Shrek',
 'Hercules',
 'Hercules',
 'Die Another Day']