### Import libraries

In [31]:
import pandas as pd
import numpy as np
from ast import literal_eval

### Load & Preprocess data

In [32]:
# Read the two TMDB 5000 CSV files; the paths are relative to the notebook's working directory.
tmdb_5000_credits = pd.read_csv("datasets/movies-2/tmdb_5000_credits.csv")
tmdb_5000_movies = pd.read_csv("datasets/movies-2/tmdb_5000_movies.csv")

In [33]:
# Columns kept from each raw table.
columns_movies = [
    "budget", "genres", "id", "original_language", "original_title", "overview",
    "popularity", "production_countries", "release_date", "spoken_languages",
    "vote_average", "vote_count",
]
columns_credits = ["movie_id", "title", "cast", "crew"]

# Take an explicit copy of each column subset: the loops below assign into these
# frames, and assigning into a bare slice triggers pandas' SettingWithCopyWarning.
tmdb_5000_movies = tmdb_5000_movies[columns_movies].copy()
tmdb_5000_credits = tmdb_5000_credits[columns_credits].copy()

# These columns are stored as strings in the CSV; literal_eval turns each cell
# into the Python list of dicts it represents.
for column in ["genres", "production_countries", "spoken_languages"]:
    tmdb_5000_movies[column] = tmdb_5000_movies[column].apply(literal_eval)

for column in ["crew", "cast"]:
    tmdb_5000_credits[column] = tmdb_5000_credits[column].apply(literal_eval)

In [34]:
# Preview the first five rows of the credits table.
tmdb_5000_credits.head(5)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{'cast_id': 242, 'character': 'Jake Sully', '...","[{'credit_id': '52fe48009251416c750aca23', 'de..."
1,285,Pirates of the Caribbean: At World's End,"[{'cast_id': 4, 'character': 'Captain Jack Spa...","[{'credit_id': '52fe4232c3a36847f800b579', 'de..."
2,206647,Spectre,"[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{'credit_id': '54805967c3a36829b5002c41', 'de..."
3,49026,The Dark Knight Rises,"[{'cast_id': 2, 'character': 'Bruce Wayne / Ba...","[{'credit_id': '52fe4781c3a36847f81398c3', 'de..."
4,49529,John Carter,"[{'cast_id': 5, 'character': 'John Carter', 'c...","[{'credit_id': '52fe479ac3a36847f813eaa3', 'de..."


In [35]:
# Preview the first five rows of the movies table (list columns still hold dicts).
tmdb_5000_movies.head(5)

Unnamed: 0,budget,genres,id,original_language,original_title,overview,popularity,production_countries,release_date,spoken_languages,vote_average,vote_count
0,237000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",19995,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{'iso_3166_1': 'US', 'name': 'United States o...",2009-12-10,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",7.2,11800
1,300000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",285,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{'iso_3166_1': 'US', 'name': 'United States o...",2007-05-19,"[{'iso_639_1': 'en', 'name': 'English'}]",6.9,4500
2,245000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",206647,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...",2015-10-26,"[{'iso_639_1': 'fr', 'name': 'Français'}, {'is...",6.3,4466
3,250000000,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",49026,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{'iso_3166_1': 'US', 'name': 'United States o...",2012-07-16,"[{'iso_639_1': 'en', 'name': 'English'}]",7.6,9106
4,260000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",49529,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{'iso_3166_1': 'US', 'name': 'United States o...",2012-03-07,"[{'iso_639_1': 'en', 'name': 'English'}]",6.1,2124


In [36]:
def preprocess_dict(column: str, key: str = "name", frame=None) -> list:
    """Flatten a column of lists of dicts into a list of lists of values.

    For every cell in ``frame[column]`` (each cell being an iterable of dicts),
    collect ``dic[key]`` for each dict in that cell.

    Parameters
    ----------
    column : str
        Name of the column to read.
    key : str, default "name"
        Dictionary key whose value is extracted from every dict.
    frame : optional
        Object indexable by ``column`` (e.g. a DataFrame). When omitted, the
        module-level ``tmdb_5000_movies`` is used, as before.

    Returns
    -------
    list
        One inner list per cell, in the same order as the column.

    Raises
    ------
    KeyError
        If ``column`` is missing from the frame or a dict lacks ``key``.
    """
    # Fall back to the notebook-level frame so existing calls keep working.
    source = tmdb_5000_movies if frame is None else frame
    return [[dic[key] for dic in cell] for cell in source[column]]


In [37]:
# Replace each list-of-dicts column with the plain list of its "name" values.
for column in ["genres", "production_countries", "spoken_languages"]:
    tmdb_5000_movies[column] = preprocess_dict(column)

In [38]:
# Preview the movies table after the list columns were flattened to names.
tmdb_5000_movies.head(5)

Unnamed: 0,budget,genres,id,original_language,original_title,overview,popularity,production_countries,release_date,spoken_languages,vote_average,vote_count
0,237000000,"[Action, Adventure, Fantasy, Science Fiction]",19995,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[United States of America, United Kingdom]",2009-12-10,"[English, Español]",7.2,11800
1,300000000,"[Adventure, Fantasy, Action]",285,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,[United States of America],2007-05-19,[English],6.9,4500
2,245000000,"[Action, Adventure, Crime]",206647,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[United Kingdom, United States of America]",2015-10-26,"[Français, English, Español, Italiano, Deutsch]",6.3,4466
3,250000000,"[Action, Crime, Drama, Thriller]",49026,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,[United States of America],2012-07-16,[English],7.6,9106
4,260000000,"[Action, Adventure, Science Fiction]",49529,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,[United States of America],2012-03-07,[English],6.1,2124


In [39]:
# Attach the credits columns to each movie by matching movies.id to credits.movie_id.
# merge defaults to an inner join, so rows without a match on either side are dropped.
tmdb_5000_movies = tmdb_5000_movies.merge(tmdb_5000_credits, left_on='id', right_on='movie_id')

In [40]:
# Drop movies with a budget of 0 and report the shape before and after.
print(tmdb_5000_movies.shape)
# Reset the index after filtering: later cells build a title -> index lookup from
# this frame and use it to address the similarity matrix by position, so the index
# labels must stay equal to row positions (0..n-1, no gaps).
tmdb_5000_movies = tmdb_5000_movies.query("budget != 0").reset_index(drop=True)
print(tmdb_5000_movies.shape)

(4803, 16)
(3766, 16)


In [41]:
# Replace each movie's list of cast dicts with the list of their "character" values.
# NOTE(review): this keeps character names, not actor names — confirm that is intended.
tmdb_5000_movies["cast"] = preprocess_dict("cast", "character")

In [42]:
# Preview the merged table with the flattened cast column.
tmdb_5000_movies.head()

Unnamed: 0,budget,genres,id,original_language,original_title,overview,popularity,production_countries,release_date,spoken_languages,vote_average,vote_count,movie_id,title,cast,crew
0,237000000,"[Action, Adventure, Fantasy, Science Fiction]",19995,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[United States of America, United Kingdom]",2009-12-10,"[English, Español]",7.2,11800,19995,Avatar,"[Jake Sully, Neytiri, Dr. Grace Augustine, Col...","[{'credit_id': '52fe48009251416c750aca23', 'de..."
1,300000000,"[Adventure, Fantasy, Action]",285,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,[United States of America],2007-05-19,[English],6.9,4500,285,Pirates of the Caribbean: At World's End,"[Captain Jack Sparrow, Will Turner, Elizabeth ...","[{'credit_id': '52fe4232c3a36847f800b579', 'de..."
2,245000000,"[Action, Adventure, Crime]",206647,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[United Kingdom, United States of America]",2015-10-26,"[Français, English, Español, Italiano, Deutsch]",6.3,4466,206647,Spectre,"[James Bond, Blofeld, Madeleine, M, Lucia, Q, ...","[{'credit_id': '54805967c3a36829b5002c41', 'de..."
3,250000000,"[Action, Crime, Drama, Thriller]",49026,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,[United States of America],2012-07-16,[English],7.6,9106,49026,The Dark Knight Rises,"[Bruce Wayne / Batman, Alfred Pennyworth, Jame...","[{'credit_id': '52fe4781c3a36847f81398c3', 'de..."
4,260000000,"[Action, Adventure, Science Fiction]",49529,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,[United States of America],2012-03-07,[English],6.1,2124,49529,John Carter,"[John Carter, Dejah Thoris, Sola, Tars Tarkas,...","[{'credit_id': '52fe479ac3a36847f813eaa3', 'de..."


In [43]:
# Preview the overview text that the TF-IDF model is built from.
tmdb_5000_movies['overview'].head()

0    In the 22nd century, a paraplegic Marine is di...
1    Captain Barbossa, long believed to be dead, ha...
2    A cryptic message from Bond’s past sends him o...
3    Following the death of District Attorney Harve...
4    John Carter is a war-weary, former military ca...
Name: overview, dtype: object

### Recommendations based on the overview text

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Missing overviews become empty strings so every row is text for the vectorizer
tmdb_5000_movies['overview'] = tmdb_5000_movies['overview'].fillna('')

# TF-IDF vectorizer that discards English stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and build the TF-IDF matrix in a single pass
tfidf_matrix = tfidf.fit_transform(tmdb_5000_movies['overview'])

# Show the dimensions of the TF-IDF matrix
tfidf_matrix.shape

(3766, 18162)

In [45]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the pairwise similarity matrix between all overview vectors.
# linear_kernel is a plain dot product; it equals cosine similarity here only
# because TfidfVectorizer L2-normalises each row by default (norm='l2').
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [46]:
# Title -> row-position lookup. The similarity matrices and the final .iloc
# selection are addressed by position, so store positions rather than index
# labels (the two differ once rows have been filtered out).
indices = pd.Series(np.arange(len(tmdb_5000_movies)), index=tmdb_5000_movies['original_title'])
# Keep only the first row for each title so a lookup always yields one position;
# Series.drop_duplicates() would compare the (already unique) values, not titles.
indices = indices[~indices.index.duplicated(keep='first')]

In [63]:
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Value of ``original_title`` to look up in the module-level ``indices``.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix whose row ``i`` holds the scores of the movie
        at position ``i`` of ``tmdb_5000_movies``. The default is bound when
        this cell runs, so re-run the cell after rebuilding the matrix.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        ``original_title``, ``production_countries``, ``genres`` and
        ``vote_average`` of the selected movies, most similar first.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    if title not in indices.index:
        raise KeyError(f"Title not found: {title!r}")

    # Get the position of the movie that matches the title
    idx = indices[title]
    # A duplicated title makes the lookup return a Series; use the first match
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Rank all movies by their pairwise similarity to this one, highest first
    # (sorted() is stable, so ties keep their original row order)
    sim_scores = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)

    # Exclude the query movie explicitly instead of assuming it sorts first:
    # another movie with an identical score could otherwise take its place
    movie_indices = [i for i, _ in sim_scores if i != idx][:top_n]

    # Return the most similar movies
    return tmdb_5000_movies[['original_title', 'production_countries', 'genres', 'vote_average']].iloc[movie_indices]

In [64]:
# Recommendations for one title using the default similarity matrix of get_recommendations.
get_recommendations('The Dark Knight Rises')

Unnamed: 0,original_title,production_countries,genres,vote_average
65,The Dark Knight,"[United Kingdom, United States of America]","[drama, action, crime, thriller]",8.2
298,Batman Forever,"[United Kingdom, United States of America]","[action, crime, fantasy]",5.2
423,Batman Returns,"[United Kingdom, United States of America]","[action, fantasy]",6.6
1305,Batman,"[United Kingdom, United States of America]","[fantasy, action]",7.0
3279,"Batman: The Dark Knight Returns, Part 2",[United States of America],"[action, animation]",7.9
2293,Slow Burn,[United States of America],"[mystery, crime, drama, thriller]",5.5
119,Batman Begins,"[United Kingdom, United States of America]","[action, crime, drama]",7.5
1145,JFK,[United States of America],"[drama, thriller, history]",7.5
9,Batman v Superman: Dawn of Justice,[United States of America],"[action, adventure, fantasy]",5.7
210,Batman & Robin,"[United Kingdom, United States of America]","[action, crime, fantasy]",4.2


### Recommendations based on movie metadata (cast, director, genres)

In [49]:
def get_director(x):
    """Return the name of the first crew entry whose job is 'Director'.

    ``x`` is an iterable of dicts with 'job' and 'name' keys. ``np.nan`` is
    returned when no entry has the job 'Director'.
    """
    directors = (member['name'] for member in x if member['job'] == 'Director')
    return next(directors, np.nan)

In [50]:
def get_list(x, max_items=5):
    """Return ``x`` limited to its first ``max_items`` elements.

    Parameters
    ----------
    x : object
        Expected to be a list; anything else is treated as missing data.
    max_items : int, default 5
        Maximum number of leading elements to keep.

    Returns
    -------
    list
        ``x`` itself when it has at most ``max_items`` elements, its first
        ``max_items`` elements when it is longer, or ``[]`` when ``x`` is not
        a list.
    """
    # Return an empty list in case of missing/malformed data
    if not isinstance(x, list):
        return []

    # Keep only the leading max_items elements of longer lists
    if len(x) > max_items:
        return x[:max_items]
    return x

In [51]:
# Derive a director column from each movie's crew list.
tmdb_5000_movies['director'] = tmdb_5000_movies['crew'].apply(get_director)

# Pass the cast and genres columns through get_list (truncates long lists,
# replaces non-list values with an empty list).
features = ['cast', 'genres']
for feature in features:
    tmdb_5000_movies[feature] = tmdb_5000_movies[feature].apply(get_list)

In [52]:
# Preview the feature columns for the first three movies.
tmdb_5000_movies[['original_title', 'cast', 'director', 'genres']].head(3)

Unnamed: 0,original_title,cast,director,genres
0,Avatar,"[Jake Sully, Neytiri, Dr. Grace Augustine, Col...",James Cameron,"[Action, Adventure, Fantasy, Science Fiction]"
1,Pirates of the Caribbean: At World's End,"[Captain Jack Sparrow, Will Turner, Elizabeth ...",Gore Verbinski,"[Adventure, Fantasy, Action]"
2,Spectre,"[James Bond, Blofeld, Madeleine, M, Lucia]",Sam Mendes,"[Action, Adventure, Crime]"


In [53]:
def clean_data(x):
    """Lower-case ``x`` and strip the spaces out of it.

    A list is cleaned element by element and returned as a new list, a string
    is cleaned directly, and any other value (e.g. a missing director) yields
    an empty string.
    """
    if isinstance(x, list):
        return [item.replace(" ", "").lower() for item in x]
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    return ''

In [54]:
# Normalise the three metadata columns with clean_data (lower-case, spaces removed).
features = ['cast', 'director', 'genres']

for feature in features:
    tmdb_5000_movies[feature] = tmdb_5000_movies[feature].apply(clean_data)

In [55]:
# Preview the table after cleaning; cast, genres and director are now lower-case tokens.
tmdb_5000_movies.head()

Unnamed: 0,budget,genres,id,original_language,original_title,overview,popularity,production_countries,release_date,spoken_languages,vote_average,vote_count,movie_id,title,cast,crew,director
0,237000000,"[action, adventure, fantasy, sciencefiction]",19995,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[United States of America, United Kingdom]",2009-12-10,"[English, Español]",7.2,11800,19995,Avatar,"[jakesully, neytiri, dr.graceaugustine, col.qu...","[{'credit_id': '52fe48009251416c750aca23', 'de...",jamescameron
1,300000000,"[adventure, fantasy, action]",285,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,[United States of America],2007-05-19,[English],6.9,4500,285,Pirates of the Caribbean: At World's End,"[captainjacksparrow, willturner, elizabethswan...","[{'credit_id': '52fe4232c3a36847f800b579', 'de...",goreverbinski
2,245000000,"[action, adventure, crime]",206647,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[United Kingdom, United States of America]",2015-10-26,"[Français, English, Español, Italiano, Deutsch]",6.3,4466,206647,Spectre,"[jamesbond, blofeld, madeleine, m, lucia]","[{'credit_id': '54805967c3a36829b5002c41', 'de...",sammendes
3,250000000,"[action, crime, drama, thriller]",49026,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,[United States of America],2012-07-16,[English],7.6,9106,49026,The Dark Knight Rises,"[brucewayne/batman, alfredpennyworth, jamesgor...","[{'credit_id': '52fe4781c3a36847f81398c3', 'de...",christophernolan
4,260000000,"[action, adventure, sciencefiction]",49529,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,[United States of America],2012-03-07,[English],6.1,2124,49529,John Carter,"[johncarter, dejahthoris, sola, tarstarkas, ta...","[{'credit_id': '52fe479ac3a36847f813eaa3', 'de...",andrewstanton


In [56]:
def create_soup(x):
    """Build one space-separated string from a row's cast, director and genres.

    ``x`` must provide 'cast' and 'genres' as iterables of strings and
    'director' as a string.
    """
    parts = [' '.join(x['cast']), x['director'], ' '.join(x['genres'])]
    return ' '.join(parts)
# Build the soup string for every movie; axis=1 passes each row to create_soup.
tmdb_5000_movies['soup'] = tmdb_5000_movies.apply(create_soup, axis=1)

In [57]:
# Preview the generated soup strings.
tmdb_5000_movies['soup'].head()

0    jakesully neytiri dr.graceaugustine col.quarit...
1    captainjacksparrow willturner elizabethswann w...
2    jamesbond blofeld madeleine m lucia sammendes ...
3    brucewayne/batman alfredpennyworth jamesgordon...
4    johncarter dejahthoris sola tarstarkas talhaju...
Name: soup, dtype: object

In [58]:
from sklearn.feature_extraction.text import CountVectorizer

# Token-count matrix over the metadata soup, with English stop words removed.
# NOTE(review): with CountVectorizer's default token pattern, punctuation splits
# tokens and single-character tokens are dropped, so values such as
# "dr.graceaugustine", "brucewayne/batman" or "m" are presumably not kept as
# whole tokens — confirm whether that is intended.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(tmdb_5000_movies['soup'])

In [59]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the metadata count vectors of all movies.
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [60]:
# Renumber the rows 0..n-1 so index labels match the row positions used by the
# similarity matrices. drop=True discards the old index instead of inserting it
# as an extra column, which also keeps this cell safe to re-run.
tmdb_5000_movies = tmdb_5000_movies.reset_index(drop=True)

# Title -> row-position lookup; keep only the first row for each title so a
# lookup always yields a single position rather than a Series.
indices = pd.Series(tmdb_5000_movies.index, index=tmdb_5000_movies['original_title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [61]:
# Inspect the cleaned genre tokens of the query movie.
tmdb_5000_movies.query("original_title == 'The Dark Knight Rises'")["genres"]

3    [action, crime, drama, thriller]
Name: genres, dtype: object

In [65]:
# Same query, ranked with the metadata-based similarity matrix instead of the default.
get_recommendations('The Dark Knight Rises', cosine_sim2)

Unnamed: 0,original_title,production_countries,genres,vote_average
119,Batman Begins,"[United Kingdom, United States of America]","[action, crime, drama]",7.5
65,The Dark Knight,"[United Kingdom, United States of America]","[drama, action, crime, thriller]",8.2
3705,Amidst the Devil's Wings,[United States of America],"[drama, action, crime]",0.0
423,Batman Returns,"[United Kingdom, United States of America]","[action, fantasy]",6.6
402,The Fast and the Furious: Tokyo Drift,"[Japan, United States of America]","[action, crime, drama, thriller]",6.1
1439,Takers,[United States of America],"[action, crime, drama, thriller]",6.0
3051,ต้มยำกุ้ง,"[Thailand, Australia]","[action, crime, drama, thriller]",6.8
616,Need for Speed,"[Philippines, United States of America, United...","[action, crime, drama, thriller]",6.1
743,Righteous Kill,[United States of America],"[action, crime, drama, thriller]",5.9
825,The Hunted,[United States of America],"[drama, action, thriller, crime]",6.0
