# A Simple Content-Based Recommender System




# Data and Library imports


In [None]:
import os

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

## Titles imports

In [None]:
# Load each streaming platform's title catalogue into its own DataFrame.
# Paths are relative to the notebook's working directory.
amazon_titles = pd.read_csv("data/amazon/titles.csv")
appletv_titles = pd.read_csv("data/appletv/titles.csv")
disney_titles = pd.read_csv("data/disney/titles.csv")
hbo_titles = pd.read_csv("data/hbo/titles.csv")
netflix_titles = pd.read_csv("data/netflix/titles.csv")
paramount_titles = pd.read_csv("data/paramount/titles.csv")

## Concatenation

In [None]:
# Stack the per-platform catalogues row-wise; ignore_index discards the
# per-file row labels and gives the combined frame a fresh 0..n-1 index.
titles = pd.concat(
    [
        amazon_titles,
        appletv_titles,
        disney_titles,
        hbo_titles,
        netflix_titles,
        paramount_titles,
    ],
    axis=0,
    ignore_index=True,
)

In [None]:
# Preview the first few rows that are exact repeats of an earlier row.
titles.loc[titles.duplicated()].head(5)

In [None]:
# Keep only the first occurrence of each fully identical row.
titles = titles.drop_duplicates()

In [None]:
titles.head()

# Cleaning the data

## Seeing how many null values

First, let's see how many non-null values we have for each column

In [None]:
titles.info()

Some columns have gaps. Let's count the null values in each column.

In [None]:
titles.isna().sum()

## Handling the 'seasons' column

Now, we are going to handle the 'seasons' column. At first glance, it looks like all the null values belong to titles of type 'MOVIE'. Let's check whether this is true.

In [None]:
# True when every row with a missing 'seasons' value is of type 'MOVIE'.
titles.loc[titles['seasons'].isna(), 'type'].eq('MOVIE').all()

So, all the null values in 'seasons' come from movies. Let's replace these null values with 0.

In [None]:
# Movies have no seasons, so store 0 instead of NaN.
# Assign the result back instead of calling fillna(inplace=True) on the
# column selection: that form is chained assignment, which pandas deprecates
# and which no longer updates `titles` when Copy-on-Write is enabled.
titles['seasons'] = titles['seasons'].fillna(0)

In [None]:
titles.head()

## Handling the 'genres' and 'production_countries' columns

The values in these two columns are lists, so we need to reduce each one to a single value. We keep only the first entry of each list.

In [None]:
# Both columns appear to hold stringified lists (e.g. "['drama', 'comedy']").
# Strip the brackets and quotes in one pass, then keep only the first entry.
# The character class is passed with regex=True explicitly so the result does
# not depend on the pandas version's default for `regex` (a bare '[' is not a
# valid regular expression on its own).

# For genres
titles['genres'] = titles['genres'].str.replace(r"[\[\]']", '', regex=True)
titles['genre'] = titles['genres'].str.split(',').str[0]

# For countries
titles['production_countries'] = titles['production_countries'].str.replace(r"[\[\]']", '', regex=True)
titles['production_countrie'] = titles['production_countries'].str.split(',').str[0]


In [None]:
# The list-valued originals are no longer needed once the first entry
# has been extracted into 'genre' / 'production_countrie'.
titles = titles.drop(columns=['genres', 'production_countries'])

In [None]:
titles.head()

In [None]:
titles['genre'].unique()

In [None]:
titles['production_countrie'].unique()

We can see that both columns contain an empty string. Let's replace these empty strings with NaN to make them easier to deal with later.

In [None]:
# Turn empty strings into NaN in the two derived columns only.
titles = titles.replace(
    {
        'genre': {'': np.nan},
        'production_countrie': {'': np.nan},
    }
)

## Handling the rest of the null values

Let's see how many null values are left

In [None]:
titles.isna().sum()

Now, let's drop the 'id', 'imdb_id' and 'age_certification' columns.

In [None]:
# Remove the identifier columns and the age certification column.
titles = titles.drop(columns=['id', 'imdb_id', 'age_certification'])

Let's get rid of the NaN values that are still left in our dataset.

In [None]:
# Discard every row that still has a missing value in any column.
titles = titles.dropna(axis=0, how='any')

In [None]:
titles.info()

We can see that we still have 18601 rows remaining.

In [None]:
# Saving the data for future use.
# makedirs(exist_ok=True) creates the folder (and any missing parent) and is
# a no-op when it already exists, so no separate existence check is needed.
os.makedirs('data/clean', exist_ok=True)

# NOTE(review): the DataFrame index is written as the first CSV column;
# pass index=False here if downstream readers do not expect it.
titles.to_csv('data/clean/title.csv')

# Content Based Recommender



## Plot description based Recommender



We will compute pairwise similarity scores for all movies/shows based on their plot descriptions and recommend movies based on that similarity score. 

The plot description is given in the 'description' feature of our dataset. Let's take a look at the data.

In [None]:
titles['description'].head()

## Separating the data in Movies and TV Shows

I am separating the data into Movies and TV Shows in order to build a separate recommendation system for each.

In [None]:
# Split the catalogue by title type. Each subset gets a fresh 0..n-1 index,
# with the old row labels discarded.
movies = titles.loc[titles['type'] == 'MOVIE'].reset_index(drop=True)

shows = titles.loc[titles['type'] == 'SHOW'].reset_index(drop=True)

In [None]:
movies.head()

In [None]:
shows.head()

Now we'll compute Term Frequency-Inverse Document Frequency (TF-IDF) vectors for each description.

In [None]:
# Define a TF-IDF vectorizer object.
# This removes all English stop words such as 'the', 'a'.
tfidf = TfidfVectorizer(stop_words='english')

# Construct the required TF-IDF matrices by fitting and transforming the data.
# NOTE: the same vectorizer is fitted twice. Each fit_transform call learns a
# vocabulary from the descriptions it is given, so the two matrices have
# different column spaces, and after this cell `tfidf` reflects the shows only.
tfidf_matrix_movies = tfidf.fit_transform(movies['description'])
tfidf_matrix_shows = tfidf.fit_transform(shows['description'])

# Output the shape (number of titles, number of distinct terms) of each matrix.
print(f'Shape for Movies: {tfidf_matrix_movies.shape}')
print(f'Shape for Shows: {tfidf_matrix_shows.shape}')

We see that over **35k** different words were used to describe the 14027 titles in our movies dataset, and **19k** different words to describe the 4574 titles in our shows dataset.

Now, we need to calculate the similarity score. We will use cosine similarity to calculate a numeric quantity that denotes the similarity between two titles. We use the cosine similarity score because it is independent of magnitude and is relatively easy and fast to calculate.

In [None]:
# Compute the pairwise similarity matrices (one row/column per title).
# linear_kernel is a plain dot product; it equals the cosine similarity here
# provided the TF-IDF rows are unit length (TfidfVectorizer's default norm is 'l2').
cosine_sim_movies = linear_kernel(tfidf_matrix_movies, tfidf_matrix_movies)
cosine_sim_shows = linear_kernel(tfidf_matrix_shows, tfidf_matrix_shows)

Now, we need a mechanism to identify the index of a movie/show in our metadata DataFrame, given its title.

In [None]:
# Reverse lookups: title -> positional row number in `movies` / `shows`.
# Titles are not guaranteed to be unique; looking up a title that occurs
# more than once returns a Series of row numbers instead of a single value.
indices_movies = pd.Series(movies.index, index=movies['title'])
indices_shows = pd.Series(shows.index, index=shows['title'])

Now, let's define a function that takes a movie title as input and prints the 10 most similar movies.

In [None]:

def get_recommendations_movie(title, cosine_sim=cosine_sim_movies, top_n=10):
    """
    Print the movies whose descriptions are most similar to the given movie.

    Args:
        title: Exact movie title, as stored in ``movies['title']``.
        cosine_sim: Pairwise similarity matrix whose rows and columns follow
            the row order of ``movies``.
        top_n: How many recommendations to print (10 by default).

    Returns:
        None. The result is printed on the screen.

    Raises:
        KeyError: If ``title`` is not present in ``indices_movies``.
    """
    try:
        idx = indices_movies[title]
    except KeyError:
        raise KeyError(f"Movie title not found: {title!r}") from None

    # Several movies can share a title, in which case the lookup yields a
    # Series of row numbers instead of a scalar; use the first match.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    print(f"Title: {movies['title'].loc[idx]} |  Year: {movies['release_year'].loc[idx]}")

    print('**' * 40)

    # Similarity of every movie to the selected one.
    scores = np.asarray(cosine_sim[idx]).ravel()

    # Rank from most to least similar; the stable sort keeps ties in row order.
    ranked = np.argsort(-scores, kind='stable')

    # Drop the queried movie by its row number rather than by assuming it is
    # ranked first: another row with an identical description ties with it.
    movie_indices = ranked[ranked != idx][:max(top_n, 0)]

    print(movies[['title', 'release_year']].iloc[movie_indices])

    print('**' * 40)

In [None]:
get_recommendations_movie('Rocky')

In [None]:
def get_recommendations_show(title, cosine_sim=cosine_sim_shows, top_n=10):
    """
    Print the shows whose descriptions are most similar to the given show.

    Args:
        title: Exact show title, as stored in ``shows['title']``.
        cosine_sim: Pairwise similarity matrix whose rows and columns follow
            the row order of ``shows``.
        top_n: How many recommendations to print (10 by default).

    Returns:
        None. The result is printed on the screen.

    Raises:
        KeyError: If ``title`` is not present in ``indices_shows``.
    """
    try:
        idx = indices_shows[title]
    except KeyError:
        raise KeyError(f"Show title not found: {title!r}") from None

    # Several shows can share a title, in which case the lookup yields a
    # Series of row numbers instead of a scalar; use the first match.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    print(f"Title: {shows['title'].loc[idx]} | Year: {shows['release_year'].loc[idx]}")

    print('**' * 40)

    # Similarity of every show to the selected one.
    scores = np.asarray(cosine_sim[idx]).ravel()

    # Rank from most to least similar; the stable sort keeps ties in row order.
    ranked = np.argsort(-scores, kind='stable')

    # Drop the queried show by its row number rather than by assuming it is
    # ranked first: another row with an identical description ties with it.
    show_indices = ranked[ranked != idx][:max(top_n, 0)]

    print(shows[['title', 'release_year']].iloc[show_indices])

    print('**' * 40)

In [None]:
get_recommendations_show('The Silent Sea')