In [1]:
## import the libraries I'll be using for the rest of the project
import numpy as np
import pandas as pd

In [2]:
# Read the zipped movie catalogue into a DataFrame and preview its first five rows
movies_org = pd.read_csv(filepath_or_buffer='movies.csv.zip')
movies_org.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [13]:
# Read the zipped user-ratings table into a DataFrame and preview its first five rows
user_ratings_org = pd.read_csv(filepath_or_buffer='user_ratings.csv.zip')
user_ratings_org.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [3]:
## Build one row per (movie name, genre) pair from the raw movie table:
##   1. remove the "(YYYY)" year from each title
##   2. drop rows with missing values, then trim whitespace around the title
##   3. split the pipe-separated genres and give each genre its own row
##   4. keep only the two columns used downstream, under their new names
movie_genre_df = (
    movies_org
    .assign(title=lambda frame: frame['title'].str.replace(r'\(\d{4}\)', '', regex=True))
    .dropna()
    .assign(
        title=lambda frame: frame['title'].str.strip(),
        genres=lambda frame: frame['genres'].str.split('[|]'),
    )
    .explode('genres')
    .reset_index(drop=True)
    .rename(columns={'title': 'name', 'genres': 'genre_list'})
    .loc[:, ['name', 'genre_list']]
)

movie_genre_df.head()

Unnamed: 0,name,genre_list
0,Toy Story,Adventure
1,Toy Story,Animation
2,Toy Story,Children
3,Toy Story,Comedy
4,Toy Story,Fantasy


In [4]:
# Load the user-ratings table (same file as the earlier ratings cell) and preview it
user_ratings_org = pd.read_csv(filepath_or_buffer='user_ratings.csv.zip')
user_ratings_org.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


## Creating content-based data

As much as you might want to jump right to finding similar items and making recommendations, you first need to get your data in a usable format. In the next few exercises, you will explore your base data and work through how to format that data to be used for content-based recommendations.

As a reminder, the desired outcome is a row per movie with each column indicating whether a genre applies to the movie. You will be looking at movie_genre_df, which contains these columns:

    - name - Name of movie
    - genre_list - Genre that the movie has been labeled as

A movie may have multiple genres, and therefore multiple rows. In this exercise, you will particularly focus on one movie (Toy Story in this case) to be able to clearly see what is happening with the data.

### Instructions 2/3
    - Get the rows in movie_genre_df which have a name equal to Toy Story and save this as toy_story_genres.

In [5]:
# Keep every genre row whose name is exactly 'Toy Story'
toy_story_genres = movie_genre_df.loc[movie_genre_df['name'].eq('Toy Story')]

# Show the resulting subset
print(toy_story_genres)

        name genre_list
0  Toy Story  Adventure
1  Toy Story  Animation
2  Toy Story   Children
3  Toy Story     Comedy
4  Toy Story    Fantasy


### Instructions 3/3
    - Transform movie_genre_df to a table called movie_cross_table.
    - Assign the subset of movie_cross_table that contains Toy Story to the variable toy_story_genres_ct and inspect the results.

In [6]:
# Select only the rows with values in the name column equal to Toy Story
toy_story_genres = movie_genre_df[movie_genre_df['name'] == 'Toy Story']

# Create cross-tabulated DataFrame from name and genre_list columns.
# The year was stripped from the titles, so two different releases can end up with the
# same name; crosstab would then count a shared genre more than once. Clip the counts
# to 0/1 so every cell is a "has this genre" indicator, which is what the Jaccard
# comparisons that use this table expect.
movie_cross_table = pd.crosstab(movie_genre_df['name'], movie_genre_df['genre_list']).clip(upper=1)

# Select only the rows with Toy Story as the index
toy_story_genres_ct = movie_cross_table[movie_cross_table.index == 'Toy Story']
print(toy_story_genres_ct)

genre_list  (no genres listed)  Action  Adventure  Animation  Children  \
name                                                                     
Toy Story                    0       0          1          1         1   

genre_list  Comedy  Crime  Documentary  Drama  Fantasy  Film-Noir  Horror  \
name                                                                        
Toy Story        1      0            0      0        1          0       0   

genre_list  IMAX  Musical  Mystery  Romance  Sci-Fi  Thriller  War  Western  
name                                                                         
Toy Story      0        0        0        0       0         0    0        0  


## Comparing individual movies with Jaccard similarity

In the last lesson, you built a DataFrame of movies, where each column represents a different genre. You can now use this DataFrame to compare movies by measuring the Jaccard similarity between rows. The higher the Jaccard similarity score, the more similar the two items are.

In this exercise, you will compare the movie GoldenEye with the movie Toy Story, then GoldenEye with Skyfall, and see how the two similarity scores differ.

The DataFrame movie_cross_table containing all the movies as rows and the genres as Boolean columns that you created in the last lesson has been loaded.

### Instructions 1/3
    - Import the Jaccard similarity score function from sklearn.metrics.

In [7]:
# Import numpy and the Jaccard similarity function
import numpy as np
from sklearn.metrics import jaccard_score

### Instructions 2/3
    - Convert the rows containing 'GoldenEye' and 'Toy Story' to numpy arrays and measure their similarity.

In [8]:
# Import numpy and the distance metric
import numpy as np
from sklearn.metrics import jaccard_score

# Pull the genre indicator rows for the two movies as numpy arrays
goldeneye_values = movie_cross_table.loc['GoldenEye'].to_numpy()
toy_story_values = movie_cross_table.loc['Toy Story'].to_numpy()

# Jaccard similarity: shared genres divided by genres present in either movie
print(jaccard_score(y_true=goldeneye_values, y_pred=toy_story_values))

0.14285714285714285


### Instructions 3/3
    - Convert the row containing Skyfall to a numpy array and measure its similarity to GoldenEye.

In [9]:
# Import numpy and the distance metric
import numpy as np
from sklearn.metrics import jaccard_score

# Pull the genre indicator rows for GoldenEye and Toy Story as numpy arrays
goldeneye_values = movie_cross_table.loc['GoldenEye'].to_numpy()
toy_story_values = movie_cross_table.loc['Toy Story'].to_numpy()

# Similarity between GoldenEye and Toy Story
print(jaccard_score(y_true=goldeneye_values, y_pred=toy_story_values))

# Same comparison, this time GoldenEye against Skyfall
skyfall_values = movie_cross_table.loc['Skyfall'].to_numpy()
print(jaccard_score(y_true=goldeneye_values, y_pred=skyfall_values))

0.14285714285714285
0.75


## Comparing all your movies at once

While finding the Jaccard similarity between any two individual movies in your dataset is great for small-scale analyses, it can prove slow on larger datasets to make recommendations.

In this exercise, you will find the similarities between all movies and store them in a DataFrame for quick and easy lookup.

When finding the similarities between the rows in a DataFrame, you could run through all pairs and calculate them individually, but it's more efficient to use the pdist() (pairwise distance) function from scipy.

This can be reshaped into the desired rectangular shape using squareform() from the same library. Since you want similarity values as opposed to distances, you should subtract the values from 1.

movie_cross_table has once again been loaded for you.

### Instructions
    - Find the Jaccard distances between all movies, convert them to a square matrix of similarities (1 - distance), and assign the result to jaccard_similarity_array.
    - Create a DataFrame from the jaccard_similarity_array with movie_cross_table.index as its rows and columns.
    - Print the top 5 rows of the DataFrame and examine the similarity scores.

In [10]:
# Import functions from scipy
from scipy.spatial.distance import pdist, squareform

# Calculate all pairwise distances.
# The Jaccard metric is defined on boolean vectors, so cast the genre table to
# True/False first. A count above 1 (two movies sharing a name) is then read as
# "has this genre" instead of being compared as a number against 1.
jaccard_distances = pdist(movie_cross_table.values.astype(bool), metric='jaccard')

# Expand the condensed distances into a square matrix and turn them into similarities
jaccard_similarity_array = 1 - squareform(jaccard_distances)

# Wrap the array in a pandas DataFrame, labelled by movie name on both axes
jaccard_similarity_df = pd.DataFrame(jaccard_similarity_array, index=movie_cross_table.index, columns=movie_cross_table.index)

# Print the top 5 rows of the DataFrame
print(jaccard_similarity_df.head())

name                                   '71  'Hellboy': The Seeds of Creation  \
name                                                                           
'71                               1.000000                             0.125   
'Hellboy': The Seeds of Creation  0.125000                             1.000   
'Round Midnight                   0.200000                             0.000   
'Salem's Lot                      0.333333                             0.000   
'Til There Was You                0.200000                             0.000   

name                              'Round Midnight  'Salem's Lot  \
name                                                              
'71                                      0.200000      0.333333   
'Hellboy': The Seeds of Creation         0.000000      0.000000   
'Round Midnight                          1.000000      0.200000   
'Salem's Lot                             0.200000      1.000000   
'Til There Was You                   

## Making recommendations based on movie genres

Now that you have your data in a usable format and know how to compare two movies, the next step is to use this to generate recommendations. In this exercise, you will learn how to generate recommendations for any movie in your dataset. The similarity scores between all movies in the dataset that you calculated in the last exercise have been pre-loaded for you as jaccard_similarity_array. movie_cross_table containing the movies and their attributes is also available.

For ease of use, you will need to wrap the similarity scores in a DataFrame. Then you will use this new DataFrame to suggest a movie recommendation.

### Instructions 1/2
    - Generate a DataFrame called jaccard_similarity_df from jaccard_similarity_array.
    - Store the similarity values between Thor and all other movies as a Series.
    - Sort these from largest to smallest in ordered_similarities.

In [11]:
# Label the similarity matrix with movie names on both axes
jaccard_similarity_df = pd.DataFrame(
    data=jaccard_similarity_array,
    index=movie_cross_table.index,
    columns=movie_cross_table.index,
)

# Row of similarities between Thor and every movie in the table
jaccard_similarity_series = jaccard_similarity_df.loc['Thor', :]

# Most similar movies first
ordered_similarities = jaccard_similarity_series.sort_values(ascending=False)

# Show the ranking
print(ordered_similarities)

name
Thor                                                    1.000000
Harry Potter and the Deathly Hallows: Part 2            0.833333
In the Name of the King III                             0.800000
Harry Potter and the Deathly Hallows: Part 1            0.800000
Harry Potter and the Order of the Phoenix               0.800000
                                                          ...   
Kevin Hart: I'm a Grown Little Man                      0.000000
Kentucky Fried Movie, The                               0.000000
Keeping Up with the Joneses                             0.000000
Keep the River on Your Right: A Modern Cannibal Tale    0.000000
À nous la liberté (Freedom for Us)                      0.000000
Name: Thor, Length: 9461, dtype: float64


## Instantiate the TF-IDF model

TF-IDF by default generates a column for every word in all of your documents (movie summaries in our case). This creates a huge and unintuitive dataset as it will contain both very common words that appear in every document, and words that appear so rarely they provide no value in finding similarities between items.

In this exercise, you will work with the df_plots DataFrame. It contains movies' names in the Title column and their plots in the Plot column.

Using this DataFrame, you will generate the default TF-IDF scores and see if non-valuable columns are present.

You will go on to rerun the TF-IDF calculations, this time limiting the number of columns using the min_df and max_df arguments and hopefully see the improvement.

### Instructions 1/2
    - Create a TfidfVectorizer and call it vectorizer.
    - Use vectorizer to transform the data in the Plot column of df_plots and assign the output to vectorized_data.
    - Inspect the features that have been generated by the transformation.

In [16]:
# Read the zipped movie plot summaries used for the TF-IDF exercises
df_plots = pd.read_csv(filepath_or_buffer='../../data/movies_plot.csv.zip')

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a TF-IDF vectorizer with its default settings (no document-frequency limits)
vectorizer = TfidfVectorizer()

# Learn the vocabulary from the plot texts and compute their TF-IDF scores in one step
vectorized_data = vectorizer.fit_transform(raw_documents=df_plots['Plot'])

# Show the generated feature (word) names.
# get_feature_names_out() is the current API; older scikit-learn used get_feature_names().
print(vectorizer.get_feature_names_out())

['00' '000' '007' ... 'émile' 'étoile' 'željko']


### Instructions 2/2
    - Repeat the creation of the TfidfVectorizer, but this time, set the minimum document frequency to 2 and the maximum document frequency to 0.7.
    - Inspect the features that have been generated by the transformation.

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a TF-IDF vectorizer that ignores words appearing in fewer than 2 documents
# or in more than 70% of the documents
vectorizer = TfidfVectorizer(max_df=0.7, min_df=2)

# Learn the vocabulary from the plot texts and compute their TF-IDF scores in one step
vectorized_data = vectorizer.fit_transform(raw_documents=df_plots['Plot'])

# Show the reduced set of feature (word) names.
# get_feature_names_out() is the current API; older scikit-learn used get_feature_names().
print(vectorizer.get_feature_names_out())

['00' '000' '04' ... 'zoological' 'zorro' 'zuckerman']


## Creating the TF-IDF DataFrame

Now that you have generated your TF-IDF features, you will need to get them into a format that you can use to make recommendations. You will once again leverage pandas for this and wrap the array in a DataFrame. As you will be using the movie titles to filter the data, you can assign the titles to the DataFrame's index.

The df_plots DataFrame has once again been loaded for you. It contains movies' names in the Title column and their plots in the Plot column.

### Instructions
    - Create a TfidfVectorizer and fit and transform it as you did in the previous exercise.
    - Wrap the generated vectorized_data in a DataFrame. Use the names of the features generated during the fit and transform phase as its column names and assign your new DataFrame to tfidf_df.
    - Assign the original movie titles to the index of the newly created tfidf_df DataFrame.

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize the plot texts, skipping words found in fewer than 2 documents
# or in more than 70% of them
vectorizer = TfidfVectorizer(min_df=2, max_df=0.7)
vectorized_data = vectorizer.fit_transform(raw_documents=df_plots['Plot'])

# Build the TF-IDF DataFrame: one row per movie (indexed by title),
# one column per feature name produced by the vectorizer
tfidf_df = pd.DataFrame(
    data=vectorized_data.toarray(),
    index=df_plots['Title'],
    columns=vectorizer.get_feature_names_out(),
)

# Inspect the first rows
print(tfidf_df.head())

                            00  000   04   10       100  1000  101st  10th  \
Title                                                                        
The Ballad of Cable Hogue  0.0  0.0  0.0  0.0  0.022794   0.0    0.0   0.0   
Monsters vs. Aliens        0.0  0.0  0.0  0.0  0.000000   0.0    0.0   0.0   
The Bandit Queen           0.0  0.0  0.0  0.0  0.000000   0.0    0.0   0.0   
Broken Arrow               0.0  0.0  0.0  0.0  0.000000   0.0    0.0   0.0   
Dolemite                   0.0  0.0  0.0  0.0  0.000000   0.0    0.0   0.0   

                            11   12  ...  zero  zoe  zola  zombie  zombies  \
Title                                ...                                     
The Ballad of Cable Hogue  0.0  0.0  ...   0.0  0.0   0.0     0.0      0.0   
Monsters vs. Aliens        0.0  0.0  ...   0.0  0.0   0.0     0.0      0.0   
The Bandit Queen           0.0  0.0  ...   0.0  0.0   0.0     0.0      0.0   
Broken Arrow               0.0  0.0  ...   0.0  0.0   0.0     0

## Comparing all your movies with TF-IDF

Now that you have put in the hard work of getting your TF-IDF data into a usable format, it's time to put it to work finding similarities and generating recommendations.

This time as you are using TF-IDF scores (which are floats as opposed to Booleans) you will use the cosine similarity metric to find the similarities between items. In this exercise, you will generate a matrix of all of the movie cosine similarities and store them in a DataFrame for ease of lookup. This will allow you to compare movies and find recommendations quickly and easily.

The tfidf_df DataFrame you created in the last exercise containing a row for each movie has been loaded for you.

### Instructions
    - Find the cosine similarity measures between all movies and assign the results to cosine_similarity_array.
    - Create a DataFrame from the cosine_similarity_array with tfidf_summary_df.index as its rows and columns.
    - Print the top five rows of the DataFrame and examine the similarity scores.

In [24]:
# tfidf_summary_df is a second name for the same tfidf_df object (no copy is made);
# it matches the variable name used in the exercise instructions.
tfidf_summary_df = tfidf_df

In [25]:
# Import cosine_similarity measure
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of rows (movies) in the TF-IDF table
cosine_similarity_array = cosine_similarity(X=tfidf_summary_df)

# Label the similarity matrix with the movie titles on both axes
cosine_similarity_df = pd.DataFrame(
    data=cosine_similarity_array,
    index=tfidf_summary_df.index,
    columns=tfidf_summary_df.index,
)

# Inspect the first five rows
print(cosine_similarity_df.head(n=5))

Title                      The Ballad of Cable Hogue  Monsters vs. Aliens  \
Title                                                                       
The Ballad of Cable Hogue                   1.000000             0.028441   
Monsters vs. Aliens                         0.028441             1.000000   
The Bandit Queen                            0.012453             0.017621   
Broken Arrow                                0.023429             0.038809   
Dolemite                                    0.012980             0.023125   

Title                      The Bandit Queen  Broken Arrow  Dolemite  \
Title                                                                 
The Ballad of Cable Hogue          0.012453      0.023429  0.012980   
Monsters vs. Aliens                0.017621      0.038809  0.023125   
The Bandit Queen                   1.000000      0.012407  0.004632   
Broken Arrow                       0.012407      1.000000  0.013701   
Dolemite                          

## Making recommendations with TF-IDF

In the last exercise you pre-calculated the similarity ratings between all movies in the dataset based on their plots transformed by TF-IDF. Now you will put these similarity ratings in a DataFrame for ease of use. Then you will use this new DataFrame to suggest a movie recommendation.

The cosine_similarity_array containing a matrix of the similarity values between all movies that you created in the last exercise has been loaded for you. The tfidf_summary_df DataFrame containing the movies and their TF-IDF features is also available.

### Instructions 1/2
    - Generate a DataFrame from cosine_similarity_array.
    - Store the cosine similarity values between the movie Rio and all other movies as a Series.
    - Sort these from largest to smallest in ordered_similarities and print the ordered results.

In [None]:
# Label the similarity matrix with the movie titles on both axes
cosine_similarity_df = pd.DataFrame(
    data=cosine_similarity_array,
    index=tfidf_summary_df.index,
    columns=tfidf_summary_df.index,
)

# Similarities between Rio and every movie in the table
cosine_similarity_series = cosine_similarity_df.loc['Rio']

# Most similar movies first
ordered_similarities = cosine_similarity_series.sort_values(ascending=False)

# Show the ranking
print(ordered_similarities)

## Build the user profiles

You are now able to generate suggestions for similar items based on their labeled features or based on their descriptions. But sometimes finding similar items might not be enough. In the next exercises, you will work through how one could create recommendations based on a user and all the items they liked as opposed to a singular item. You will first generate a profile for a user by aggregating all of the movies they have previously enjoyed.

The tfidf_summary_df you have been working on in the last few exercises has been loaded for you. This contains a row per movie with their titles as the index and a column for each feature containing their respective TF-IDF score.

### Instructions 1/2
    - Create a subset of the tfidf_summary_df that contains only rows corresponding to the supplied list_of_movies_enjoyed list.

In [None]:
# Titles the user has enjoyed
list_of_movies_enjoyed = [
    'Captain America: The First Avenger',
    'Green Lantern',
    'The Avengers',
]

# TF-IDF rows for those titles, in the order listed above
movies_enjoyed_df = tfidf_summary_df.reindex(index=list_of_movies_enjoyed)

# Show the subset
print(movies_enjoyed_df)

### Instructions 2/2
    - Generate the user profile by finding the average TF-IDF scores of each of the features of the movies contained in movies_enjoyed_df.
    - Inspect the results.

In [None]:
# Titles the user has enjoyed
list_of_movies_enjoyed = [
    'Captain America: The First Avenger',
    'Green Lantern',
    'The Avengers',
]

# TF-IDF rows for those titles, in the order listed above
movies_enjoyed_df = tfidf_summary_df.reindex(index=list_of_movies_enjoyed)

# User profile: the column-wise average TF-IDF score across the enjoyed movies
user_prof = movies_enjoyed_df.mean(axis=0)

# Show the profile
print(user_prof)

## User profile based recommendations

Now that you have built the user profile based on the aggregate of the individual movies they enjoyed, you can compare it to the larger tfidf_summary_df DataFrame that you have been working with to generate suggestions. As you would not want to suggest movies that the user has already watched, you will first find a subset of the tfidf_summary_df DataFrame that does not contain any of the previously watched movies.

The Series user_prof that you generated in the last exercise, which holds the user's average TF-IDF score for each feature, has been loaded for you. Similarly, the list_of_movies_enjoyed has been loaded so you can exclude those movies from the predictions.

### Instructions 1/3
    - Find the subset of tfidf_df that does not include movies in list_of_movies_enjoyed and assign it to tfidf_subset_df.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Candidate movies: every row of tfidf_df except the titles the user already enjoyed
tfidf_subset_df = tfidf_df.drop(index=list_of_movies_enjoyed)

### Instructions 2/3
    - Calculate the cosine_similarity between the user profile contained in user_prof and all the movie profiles in tfidf_subset_df.
    - Wrap the similarity_array in a DataFrame, assigning it the same index as tfidf_subset_df.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Candidate movies: every row of tfidf_df except the titles the user already enjoyed
tfidf_subset_df = tfidf_df.drop(index=list_of_movies_enjoyed)

# Compare the user profile (as a single-row 2-D array) with every candidate movie;
# the result has one row with one similarity per candidate
similarity_array = cosine_similarity(user_prof.to_numpy()[np.newaxis, :], tfidf_subset_df)

# One row per candidate movie, holding its similarity to the user profile
similarity_df = pd.DataFrame(
    {"similarity_score": similarity_array[0]},
    index=tfidf_subset_df.index,
)

### Instructions 3/3
    - Sort the results from high to low and take a look at the movies most similar to the user's likes.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Candidate movies: every row of tfidf_df except the titles the user already enjoyed
tfidf_subset_df = tfidf_df.drop(index=list_of_movies_enjoyed)

# Compare the user profile (as a single-row 2-D array) with every candidate movie;
# the result has one row with one similarity per candidate
similarity_array = cosine_similarity(user_prof.to_numpy()[np.newaxis, :], tfidf_subset_df)

# One row per candidate movie, holding its similarity to the user profile
similarity_df = pd.DataFrame(
    {"similarity_score": similarity_array[0]},
    index=tfidf_subset_df.index,
)

# Rank the candidates from most to least similar
sorted_similarity_df = similarity_df.sort_values("similarity_score", ascending=False)

# Show the five best matches for the user's preferences
print(sorted_similarity_df.head(n=5))