In [154]:
import numpy as np
import pandas as pd
import ast
import math
from datascience import *

# Content-based Movie Recommendation System Using the Vector Space Method

This is a personal project that builds a movie recommendation system from the "TMDB 5000 Movie Dataset" on Kaggle. The user passes their favorite movie to an algorithm that returns the 10 movies most similar to it.

## 1: The Datasets and Feature Manipulation

I'll be using two datasets: movies and credits.

I'll be focusing on recommending a movie based on these features: cast, crew, keywords, genres.

In [3]:
# Read the TMDB movie metadata CSV into a Table and preview the first three rows.
movies_csv = "tmdb_5000_movies.csv"
tmdb_movies = Table.read_table(movies_csv)
tmdb_movies.show(3)

budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adven ...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"": 2964, ""na ...",en,Avatar,"In the 22nd century, a paraplegic Marine is dispatched t ...",150.438,"[{""name"": ""Ingenious Film Partners"", ""id"": 289}, {""name"" ...","[{""iso_3166_1"": ""US"", ""name"": ""United States of America"" ...",2009-12-10,2787965087,162,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso_639_1"": "" ...",Released,Enter the World of Pandora.,Avatar,7.2,11800
300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""name"": ""Fa ...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""name"": ""drug ...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, has come bac ...",139.083,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""name"": ""Je ...","[{""iso_3166_1"": ""US"", ""name"": ""United States of America""}]",2007-05-19,961000000,169,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adven ...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name"": ""based ...",en,Spectre,A cryptic message from Bond’s past sends him on a trail ...,107.377,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""name"": ""Danja ...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""}, {""iso_3 ...",2015-10-26,880674609,148,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""}, {""iso_639 ...",Released,A Plan No One Escapes,Spectre,6.3,4466


In [4]:
# Read the cast and crew credits CSV into a Table and preview the first three rows.
credits_csv = "tmdb_5000_credits.csv"
credits = Table.read_table(credits_csv)
credits.show(3)

movie_id,title,cast,crew
19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""credit_id"" ...","[{""credit_id"": ""52fe48009251416c750aca23"", ""department"": ..."
285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Sparrow"", ""cr ...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""department"": ..."
206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""credit_id"": ...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""department"": ..."


Joining the credits and tmdb_movies tables to include all possible features.

In [5]:
# Join the movie metadata with the credits on their shared "title" column so that
# cast and crew sit in the same table as genres and keywords.
# NOTE(review): titles may not be unique in this dataset (e.g. remakes sharing a
# name); joining on "id" / "movie_id" looks safer — confirm before changing.
m = tmdb_movies.join("title", credits, "title")
m.show(3)

title,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,vote_average,vote_count,movie_id,cast,crew
#Horror,1500000,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 9648, ""name"": ""Myst ...",http://www.hashtaghorror.com/,301325,[],de,#Horror,"Inspired by actual events, a group of 12 year old girls ...",2.81523,"[{""name"": ""AST Studios"", ""id"": 75277}, {""name"": ""Lowland ...","[{""iso_3166_1"": ""US"", ""name"": ""United States of America""}]",2015-11-20,0,90,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso_639_1"": "" ...",Released,Death is trending.,3.3,52,301325,"[{""cast_id"": 0, ""character"": ""Alex's 12-Step Friend"", ""c ...","[{""credit_id"": ""545bbac70e0a261fb6002329"", ""department"": ..."
(500) Days of Summer,7500000,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 18, ""name"": ""Drama ...",http://500days.com,19913,"[{""id"": 248, ""name"": ""date""}, {""id"": 572, ""name"": ""sex""} ...",en,(500) Days of Summer,"Tom (Joseph Gordon-Levitt), greeting-card writer and hop ...",45.611,"[{""name"": ""Fox Searchlight Pictures"", ""id"": 43}, {""name"" ...","[{""iso_3166_1"": ""US"", ""name"": ""United States of America""}]",2009-07-17,60722734,95,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso_639_1"": "" ...",Released,It was almost like falling in love.,7.2,2904,19913,"[{""cast_id"": 4, ""character"": ""Tom Hansen"", ""credit_id"": ...","[{""credit_id"": ""52fe47f99251416c750abaa5"", ""department"": ..."
10 Cloverfield Lane,15000000,"[{""id"": 53, ""name"": ""Thriller""}, {""id"": 878, ""name"": ""Sc ...",http://www.10cloverfieldlane.com/,333371,"[{""id"": 1930, ""name"": ""kidnapping""}, {""id"": 2321, ""name"" ...",en,10 Cloverfield Lane,"After a car accident, Michelle awakens to find herself i ...",53.6987,"[{""name"": ""Paramount Pictures"", ""id"": 4}, {""name"": ""Bad ...","[{""iso_3166_1"": ""US"", ""name"": ""United States of America""}]",2016-03-10,108286421,103,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Monsters come in many forms.,6.8,2468,333371,"[{""cast_id"": 2, ""character"": ""Michelle"", ""credit_id"": ""5 ...","[{""credit_id"": ""57627624c3a3680682000872"", ""department"": ..."


Because the feature columns (genres, cast, crew, keywords) are stored as strings, I'll use ast.literal_eval to convert each cell into a list of dictionaries. I'll add the converted features to a new table called movies.

In [171]:
# Columns whose cells hold string-encoded lists of dictionaries.
features = ["genres", "cast", "crew", "keywords"]

# Parse each feature column from its string form, then attach the parsed
# columns to a table that starts with just the titles.
parsed_columns = {
    feature: m.apply(ast.literal_eval, feature) for feature in features
}
movies = m.select("title")
for label, values in parsed_columns.items():
    movies = movies.with_columns(label, values)

movies

title,genres,cast,crew,keywords
#Horror,"[{'id': 18, 'name': 'Drama'}, {'id': 9648, 'name': 'Myst ...","[{'cast_id': 0, 'character': ""Alex's 12-Step Friend"", 'c ...","[{'credit_id': '545bbac70e0a261fb6002329', 'department': ...",[]
(500) Days of Summer,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama ...","[{'cast_id': 4, 'character': 'Tom Hansen', 'credit_id': ...","[{'credit_id': '52fe47f99251416c750abaa5', 'department': ...","[{'id': 248, 'name': 'date'}, {'id': 572, 'name': 'sex'} ..."
10 Cloverfield Lane,"[{'id': 53, 'name': 'Thriller'}, {'id': 878, 'name': 'Sc ...","[{'cast_id': 2, 'character': 'Michelle', 'credit_id': '5 ...","[{'credit_id': '57627624c3a3680682000872', 'department': ...","[{'id': 1930, 'name': 'kidnapping'}, {'id': 2321, 'name' ..."
10 Days in a Madhouse,"[{'id': 18, 'name': 'Drama'}]","[{'cast_id': 2, 'character': 'Nellie Bly', 'credit_id': ...","[{'credit_id': '594efa1fc3a36832650455ff', 'department': ...","[{'id': 1568, 'name': 'undercover'}, {'id': 4924, 'name' ..."
10 Things I Hate About You,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, 'name': 'Ro ...","[{'cast_id': 2, 'character': 'Patrick Verona', 'credit_i ...","[{'credit_id': '52fe43e6c3a36847f807731d', 'department': ...","[{'id': 497, 'name': 'shakespeare'}, {'id': 5923, 'name' ..."
102 Dalmatians,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Fa ...","[{'cast_id': 1, 'character': 'Cruella de Vil', 'credit_i ...","[{'credit_id': '5539fe5e9251413f5a00359b', 'department': ...","[{'id': 212, 'name': 'london england'}, {'id': 378, 'nam ..."
10th & Wolf,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'name': 'Crime ...","[{'cast_id': 1, 'character': 'Tommy', 'credit_id': '52fe ...","[{'credit_id': '52fe454d9251416c75051e59', 'department': ...","[{'id': 1568, 'name': 'undercover'}, {'id': 10391, 'name ..."
11:14,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name': 'Drama' ...","[{'cast_id': 10, 'character': 'Jack', 'credit_id': '52fe ...","[{'credit_id': '52fe44e1c3a36847f80af88d', 'department': ...","[{'id': 567, 'name': 'alcohol'}, {'id': 572, 'name': 'se ..."
12 Angry Men,"[{'id': 18, 'name': 'Drama'}]","[{'cast_id': 29, 'character': 'Juror 8', 'credit_id': '5 ...","[{'credit_id': '52fe423dc3a36847f800ec55', 'department': ...","[{'id': 934, 'name': 'judge'}, {'id': 1417, 'name': 'jur ..."
12 Rounds,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'name': 'Adven ...","[{'cast_id': 1, 'character': 'Det. Danny Fisher', 'credi ...","[{'credit_id': '538ee0730e0a2644b9001141', 'department': ...","[{'id': 6149, 'name': 'police'}, {'id': 8233, 'name': 'c ..."


## 2: Creating the Recommendation System

### Method 1: The Vector Space Method

We can gauge the similarity of two texts by turning each text into a vector of word counts and taking the cosine of the angle between the two vectors. Because word counts are never negative, this cosine similarity always falls between 0 and 1.

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

To give an example of how this data analysis is going to work, I've created a list of 4 strings. Some strings share words with each other, some don't. CountVectorizer builds a matrix that records how often each word (feature) appears in each string.

In [8]:
# Toy corpus: some strings share words with each other, some do not.
text = ["happy go lucky", "happy happy lucky", "happy go lucky go lucky bother", "expert go"]

# Learn the vocabulary and build the count matrix
# (one row per string, one column per distinct word).
count_vector = CountVectorizer()
cm = count_vector.fit_transform(text)

# List the vocabulary in column order. `vocabulary_` maps each term to its column
# index, so sorting the terms by that index reproduces the matrix's column order
# without calling `get_feature_names()`, which newer scikit-learn releases removed.
vocabulary = count_vector.vocabulary_
print(sorted(vocabulary, key=vocabulary.get))
print(cm.toarray())

['bother', 'expert', 'go', 'happy', 'lucky']
[[0 0 1 1 1]
 [0 0 0 2 1]
 [1 0 2 1 2]
 [0 1 1 0 0]]


In the first string in "text", "bother" and "expert" appear 0 times, while "go", "happy", and "lucky" each appear once. The first row of the matrix represents these occurrences: [0 0 1 1 1]. Now, I want to see how similar the strings in "text" are to each other. That's where I'll use the cosine_similarity function.

In [9]:
# Pairwise cosine similarity between the rows of the count matrix:
# entry [i][j] compares text[i] with text[j].
similarity_scores = cosine_similarity(cm)
similarity_scores

array([[ 1.        ,  0.77459667,  0.91287093,  0.40824829],
       [ 0.77459667,  1.        ,  0.56568542,  0.        ],
       [ 0.91287093,  0.56568542,  1.        ,  0.4472136 ],
       [ 0.40824829,  0.        ,  0.4472136 ,  1.        ]])

The similarity score ranges from 0 to 1, with 0 being least similar and 1 being identical. Here's an easier way to visualize similarity_scores:

|             | text[0]    | text[1]    | text[2]    | text[3]    |
|-------------|------------|------------|------------|------------|
| **text[0]** | 1.00000000 | 0.77459667 | 0.91287093 | 0.40824829 |
| **text[1]** | 0.77459667 | 1.00000000 | 0.56568542 | 0.00000000 |
| **text[2]** | 0.91287093 | 0.56568542 | 1.00000000 | 0.4472136  |
| **text[3]** | 0.40824829 | 0.00000000 | 0.4472136  | 1.00000000 |



Text[0] is identical to itself, hence the similarity score of 1.

I want the data in my dataset to emulate the format of "text": a list of strings where each keyword is lowercase and separated by a space. This way, I can run the CountVectorizer and cosine_similarity functions at a larger scale and find how similar the movies are to one another based on their keywords. I will extract these keywords from the "genres", "cast", "crew", and "keywords" columns of the table.

I'll extract the director of each movie from the "crew" column to add another keyword for comparison. As of now, the data is formatted in lists of dictionaries. The "director" and "relevant_data" functions extract the data and format it similarly to the "text" list.

In [10]:
def director(array):
    """Return the first listed director's name as a single lowercase token.

    `array` is a list of crew dictionaries, each read through its "job" and
    "name" keys. The first entry whose job is "Director" is used; when no
    entry matches, the function returns None.
    """
    first_director = next(
        (member for member in array if member["job"] == "Director"), None
    )
    if first_director is None:
        return None
    # Lowercasing and dropping the spaces turns "First Last" into "firstlast",
    # so each person is represented by exactly one string.
    return str.lower(first_director["name"]).replace(" ", "")

def relevant_data(array, limit=5):
    """Return up to `limit` distinct names from a list of dictionaries.

    Each name is lowercased and has its spaces removed, so every entry becomes
    a single token. Duplicates (after that normalization) are skipped and the
    order of first appearance is kept.

    Parameters
    ----------
    array : list of dict
        Entries that each carry a "name" key.
    limit : int, optional
        Maximum number of names to return (default 5).

    Returns
    -------
    numpy.ndarray
        String array with at most `limit` names; an empty string array when
        `array` contains no entries.
    """
    names = []
    seen = set()
    for entry in array:
        # Stop as soon as enough names are collected; the rest of the list
        # cannot change the result.
        if len(names) >= limit:
            break
        lowercase_name = str.lower(entry["name"]).replace(" ", "")
        if lowercase_name not in seen:
            seen.add(lowercase_name)
            names.append(lowercase_name)
    # Build the array once at the end instead of re-allocating it for every
    # appended name, and always give it a string dtype (even when empty).
    return np.array(names, dtype=str)

The new_movies table contains the reformatted data.

In [14]:
# Start from the titles and add one "director" string per movie, taken from the crew list.
director_names = movies.apply(director, "crew")
new_movies = movies.select('title').with_column("director", director_names)

# Replace each list-of-dictionaries feature with its array of name tokens.
for feature in features:
    name_tokens = movies.apply(relevant_data, feature)
    new_movies = new_movies.with_columns(feature, name_tokens)

new_movies

title,director,genres,cast,crew,keywords
#Horror,tarasubkoff,['drama' 'mystery' 'horror' 'thriller'],['tarynmanning' 'natashalyonne' 'chloësevigny' 'balthaza ...,['tarasubkoff' 'jasonludman' 'orensegal' 'brendanwalsh'],[]
(500) Days of Summer,marcwebb,['comedy' 'drama' 'romance'],['josephgordon-levitt' 'zooeydeschanel' 'chloëgracemoret ...,['mychaeldanna' 'hopehanafin' 'stevenj.wolfe' 'masonnovi ...,['date' 'sex' 'jealousy' 'fight' 'architect']
10 Cloverfield Lane,dantrachtenberg,['thriller' 'sciencefiction' 'drama'],['maryelizabethwinstead' 'johngoodman' 'johngallagherjr. ...,['monikamikkelsen' 'j.j.abrams' 'mattheww.mungle' 'jeffc ...,['kidnapping' 'bunker' 'paranoia' 'basement' 'survivalist']
10 Days in a Madhouse,timothyhines,['drama'],['carolinebarry' 'christopherlambert' 'kellylebrock' 'ju ...,['martinwiley' 'janglaser' 'strathfordhamilton' 'marcyle ...,['undercover' 'insaneasylum' 'reporter']
10 Things I Hate About You,giljunger,['comedy' 'romance' 'drama'],['heathledger' 'juliastiles' 'josephgordon-levitt' 'lari ...,['charlesgraffeo' 'williamshakespeare' 'markirwin' 'o.ni ...,['shakespeare' 'sister' 'highschool' 'cannabis' 'deception']
102 Dalmatians,kevinlima,['comedy' 'family'],['glennclose' 'ioangruffudd' 'aliceevans' 'timmcinnerny' ...,['davidnewman' 'adrianbiddle' 'kevinlima' 'gregoryperler ...,"['londonengland' 'prison' 'releasefromprison' ""women'spr ..."
10th & Wolf,robertmoresco,['action' 'crime' 'drama' 'mystery' 'thriller'],['jamesmarsden' 'briandennehy' 'leorossi' 'dennishopper' ...,['robertmoresco' 'allansteele'],['undercover' 'mafia' 'mobster' 'crimefamily']
11:14,gregmarcks,['crime' 'drama' 'thriller'],['henrythomas' 'blakeheron' 'barbarahershey' 'hilaryswan ...,['hilaryswank' 'johnmorrissey' 'maryvernieu' 'clintmanse ...,['alcohol' 'sex' 'robbery' 'secret' 'gun']
12 Angry Men,sidneylumet,['drama'],['henryfonda' 'martinbalsam' 'johnfiedler' 'leej.cobb' ' ...,['henryfonda' 'sidneylumet' 'reginaldrose' 'georgejustin ...,['judge' 'jurors' 'sultriness' 'deathpenalty' 'fathermur ...
12 Rounds,rennyharlin,['action' 'adventure' 'drama' 'thriller'],['johncena' 'aidangillen' 'ashleyscott' 'steveharris' 'b ...,['johnpapsidera' 'trevorrabin' 'brianberdan' 'markgordon ...,['police' 'cops' 'catandmouse' 'family' 'revengedrama']


Now that I have the reformatted data through new_movies, I'll combine the data of each row into one string and compile those strings into an array/list. The array/list will be a much larger version of "text".

In [32]:
new_features = ["director", "genres", "cast", "crew", "keywords"]

# Build one space-separated string of tokens per movie; the title is left out
# on purpose so that only the features drive the comparison.
cd_column = []
for row in new_movies.rows:
    tokens = []
    for feature in new_features:
        data = row.item(feature)

        if isinstance(data, str):
            # A single token stored directly as a string (the director).
            tokens.append(data)
        elif isinstance(data, np.ndarray):
            # An array of string tokens; add each of them.
            tokens.extend(data)
        # Any other value (for example a missing director) adds nothing.
    cd_column.append(" ".join(tokens))


fmovies = new_movies.with_columns(
    "combined data", cd_column,
    # Row positions, used to traverse the similarity matrix. The length comes
    # from new_movies because fmovies does not exist until this statement ends.
    "index", np.arange(new_movies.num_rows)
)
fmovies.show(3)

title,director,genres,cast,crew,keywords,combined data,index
#Horror,tarasubkoff,['drama' 'mystery' 'horror' 'thriller'],['tarynmanning' 'natashalyonne' 'chloësevigny' 'balthaza ...,['tarasubkoff' 'jasonludman' 'orensegal' 'brendanwalsh'],[],tarasubkoff drama mystery horror thriller tarynmanning n ...,0
(500) Days of Summer,marcwebb,['comedy' 'drama' 'romance'],['josephgordon-levitt' 'zooeydeschanel' 'chloëgracemoret ...,['mychaeldanna' 'hopehanafin' 'stevenj.wolfe' 'masonnovi ...,['date' 'sex' 'jealousy' 'fight' 'architect'],marcwebb comedy drama romance josephgordon-levitt zooeyd ...,1
10 Cloverfield Lane,dantrachtenberg,['thriller' 'sciencefiction' 'drama'],['maryelizabethwinstead' 'johngoodman' 'johngallagherjr. ...,['monikamikkelsen' 'j.j.abrams' 'mattheww.mungle' 'jeffc ...,['kidnapping' 'bunker' 'paranoia' 'basement' 'survivalist'],dantrachtenberg thriller sciencefiction drama maryelizab ...,2


It's time to apply the functions to the combined data column.

Here is the movie recommender function:

In [169]:
def recommend(movie, table, n=10):
    """
    Recommend the `n` movies that are most similar to `movie`.

    Similarity is the cosine similarity between word-count vectors built from
    the table's "combined data" column.

    Parameters
    ----------
    movie : str
        Title to look up in the table's "title" column. If several rows share
        the title, the first one is used.
    table : Table
        Table with a "title" column and a "combined data" column.
    n : int, optional
        Number of recommendations to return (default 10).

    Returns
    -------
    list of tuple
        (rank, title) pairs, ranks starting at 1, most similar first.

    Raises
    ------
    IndexError
        If `movie` does not appear in the table's "title" column.
    """
    # Locate the movie by row position; similarity rows follow table row order.
    titles = table.column("title")
    matches = np.flatnonzero(titles == movie)
    if len(matches) == 0:
        raise IndexError("movie {!r} was not found in the table".format(movie))
    movie_row = matches[0]

    # Use a fresh vectorizer so this call does not refit a shared one.
    count_matrix = CountVectorizer().fit_transform(table.column("combined data"))

    # Only the query movie's scores are needed, so compare that one row with
    # every row instead of building the full movie-by-movie matrix.
    scores = cosine_similarity(count_matrix[movie_row], count_matrix)[0]

    # Highest score first; the sort is stable, so ties keep table order.
    ranked_rows = sorted(range(len(scores)), key=lambda row: scores[row], reverse=True)

    # Drop the query movie by position rather than assuming it sorts first:
    # another row with identical features would tie with it at 1.0.
    top_rows = [row for row in ranked_rows if row != movie_row][:max(n, 0)]

    recommended_movies = [titles[row] for row in top_rows]
    return list(enumerate(recommended_movies, 1))


recommend("Die Hard", fmovies)

[(1, 'Die Hard 2'),
 (2, 'Die Hard: With a Vengeance'),
 (3, 'Predator'),
 (4, 'Commando'),
 (5, 'Rollerball'),
 (6, 'The Hunt for Red October'),
 (7, 'Basic'),
 (8, 'Road House'),
 (9, 'The 13th Warrior'),
 (10, 'Last Action Hero')]

Here are some more examples of movie recommendations:

In [170]:
# Ranked recommendations for "Troy", computed from the combined feature strings.
recommend("Troy", fmovies)

[(1, 'Legends of the Fall'),
 (2, 'Alexander'),
 (3, 'The NeverEnding Story'),
 (4, 'Das Boot'),
 (5, 'Fury'),
 (6, 'Clash of the Titans'),
 (7, 'Entrapment'),
 (8, 'Kingdom of Heaven'),
 (9, 'Poseidon'),
 (10, 'End of the Spear')]

In [160]:
# Ranked recommendations for "Transformers", computed from the combined feature strings.
recommend("Transformers", fmovies)

[(1, 'Transformers: Revenge of the Fallen'),
 (2, 'Transformers: Age of Extinction'),
 (3, 'Transformers: Dark of the Moon'),
 (4, 'The Island'),
 (5, 'I Am Number Four'),
 (6, 'Teenage Mutant Ninja Turtles'),
 (7, 'The Helix... Loaded'),
 (8, 'Green Lantern'),
 (9, 'Indiana Jones and the Kingdom of the Crystal Skull'),
 (10, 'Teenage Mutant Ninja Turtles: Out of the Shadows')]

In [167]:
# Ranked recommendations for "Indiana Jones and the Temple of Doom", computed from the combined feature strings.
recommend("Indiana Jones and the Temple of Doom", fmovies)

[(1, 'Indiana Jones and the Kingdom of the Crystal Skull'),
 (2, 'Indiana Jones and the Last Crusade'),
 (3, 'Raiders of the Lost Ark'),
 (4, 'The Adventures of Tintin'),
 (5, 'Jurassic Park'),
 (6, 'The Lost World: Jurassic Park'),
 (7, '1941'),
 (8, 'The BFG'),
 (9, 'War of the Worlds'),
 (10, 'A.I. Artificial Intelligence')]

In [168]:
# Ranked recommendations for "American Psycho", computed from the combined feature strings.
recommend("American Psycho", fmovies)

[(1, "Amidst the Devil's Wings"),
 (2, 'Out of the Furnace'),
 (3, 'Cecil B. Demented'),
 (4, 'The House of Mirth'),
 (5, 'The Silence of the Lambs'),
 (6, 'The Spanish Prisoner'),
 (7, 'Trash'),
 (8, 'Fabled'),
 (9, 'A Walk on the Moon'),
 (10, 'Assault on Precinct 13')]