In [None]:
# Import Dependencies
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the cleaned movie dataset from disk and preview its first rows
merge_file = '02_Clean_Dataset/movies.csv'
merge_df = pd.read_csv(filepath_or_buffer=merge_file)
merge_df.head()

In [None]:
# Check for null values: number of missing entries per column.
# (count() only reports non-null counts, which have to be compared against
# the row count by eye; isna().sum() shows the nulls directly.)
merge_df.isna().sum()

In [None]:
# The dataframe had no nulls before it was written to CSV, but reading it back
# produces nulls in some text columns. Most likely cause: pandas.read_csv parses
# empty fields as NaN by default, so cells that were empty strings come back as NaN.
# CountVectorizer expects string documents, so every text column that is
# vectorized later in this notebook gets a string placeholder instead of NaN
# (production_companies included, since it is vectorized the same way).
text_columns = ['genres', 'keywords', 'cast', 'production_companies']
merge_df[text_columns] = merge_df[text_columns].fillna('Blank')
# Non-null counts should now match the row count for the text columns
merge_df.count()

In [None]:
# Some genres cells are floats rather than strings; coerce every value to str
# so the column can be vectorized as text
merge_df['genres'] = merge_df['genres'].map(str)

## Calculate the cosine similarity matrix for each column

In [None]:
# Token-count matrix for the genres column (one row per dataframe row)
cv = CountVectorizer()
count_matrix = cv.fit_transform(raw_documents=merge_df["genres"])
# Pairwise cosine similarity between the rows of the genres count matrix
genre_cos_sim = cosine_similarity(X=count_matrix)

In [None]:
# Keyword-based similarity: token counts per row, then pairwise cosine similarity
cv1 = CountVectorizer()
count_matrix1 = cv1.fit_transform(raw_documents=merge_df["keywords"])
keywords_cos_sim = cosine_similarity(X=count_matrix1)

In [None]:
# Cast-based similarity: token counts per row, then pairwise cosine similarity
cv2 = CountVectorizer()
count_matrix2 = cv2.fit_transform(raw_documents=merge_df["cast"])
cast_cos_sim = cosine_similarity(X=count_matrix2)

In [None]:
# Production-company-based similarity: token counts per row, then pairwise cosine similarity
cv3 = CountVectorizer()
count_matrix3 = cv3.fit_transform(raw_documents=merge_df["production_companies"])
prod_cos_sim = cosine_similarity(X=count_matrix3)

In [None]:
# Combine the per-column similarity matrices with weights so that some columns
# influence the recommendations more than others: keywords should affect the
# outcome the most and production companies the least.
# Note: despite its name, this is a weighted sum (it is not divided by the total weight).
average_cosine = (
    (4 * keywords_cos_sim)
    + (3 * genre_cos_sim)
    + (2 * cast_cos_sim)
    + 1.5 * prod_cos_sim
)

In [None]:
def get_index(title):
    """Return the "index" column value of the first row whose title equals `title`.

    Raises IndexError when no row in `merge_df` has that title.
    """
    title_matches = merge_df["title"] == title
    return merge_df.loc[title_matches, "index"].values[0]
def get_title(index):
    """Return the title of the first row whose "index" column equals `index`.

    Raises IndexError when no row in `merge_df` has that index value.
    """
    index_matches = merge_df["index"] == index
    return merge_df.loc[index_matches, "title"].values[0]

In [None]:
def get_similar_movies(user_movie, top_n=5):
    """Print the titles of the movies most similar to `user_movie`.

    Scores are read from the row of `average_cosine` selected by
    `get_index(user_movie)` and candidates are ranked from the highest score
    to the lowest. The queried movie itself is never printed.

    Parameters
    ----------
    user_movie : str
        Title to look up via `get_index`.
    top_n : int, optional
        Maximum number of titles to print. Defaults to 5, the limit that was
        previously hard-coded.

    Returns
    -------
    None
        Titles are printed one per line. If the title lookup raises
        IndexError (no matching row), a "not in the dataframe" message is
        printed instead. Any other error propagates to the caller.
    """
    try:
        similar_index = get_index(user_movie)
    except IndexError:
        # Only a failed title lookup means "movie not found"; a bare except here
        # would also hide unrelated bugs and KeyboardInterrupt behind this message.
        print("The movie you are looking for is not in the dataframe. Try another movie")
        return

    scores = average_cosine[similar_index]
    # Exclude the queried movie by its own position rather than by dropping
    # whichever entry happens to sort first: a tie with the self-score would
    # otherwise drop a different movie and recommend the query to itself.
    candidates = [pos for pos in range(len(scores)) if pos != similar_index]
    # sorted() is stable, so equally similar movies keep their original order
    ranked = sorted(candidates, key=lambda pos: scores[pos], reverse=True)
    for pos in ranked[:max(top_n, 0)]:
        print(get_title(pos))


In [None]:
# Print recommendations for a movie the user likes
get_similar_movies(user_movie="Ant-Man")