### Import libraries

In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.neighbors import NearestNeighbors

### Load & Preprocess data

In [2]:
tmdb_5000_credits = pd.read_csv("datasets/movies-2/tmdb_5000_credits.csv")
tmdb_5000_movies = pd.read_csv("datasets/movies-2/tmdb_5000_movies.csv")

In [3]:
columns_movies = ["budget", "genres", "id", "original_language", "original_title", "overview", "popularity", "production_countries", "release_date", "spoken_languages", "vote_average", "vote_count"]
columns_credits = ["movie_id", "title", "cast", "crew"]

tmdb_5000_movies = tmdb_5000_movies[columns_movies]
tmdb_5000_credits = tmdb_5000_credits[columns_credits]

for column in ["genres", "production_countries", "spoken_languages"]:
    tmdb_5000_movies[column] = tmdb_5000_movies[column].apply(literal_eval)

for column in ["crew", "cast"]:
    tmdb_5000_credits[column] = tmdb_5000_credits[column].apply(literal_eval)    

In [4]:
tmdb_5000_credits.head(5)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{'cast_id': 242, 'character': 'Jake Sully', '...","[{'credit_id': '52fe48009251416c750aca23', 'de..."
1,285,Pirates of the Caribbean: At World's End,"[{'cast_id': 4, 'character': 'Captain Jack Spa...","[{'credit_id': '52fe4232c3a36847f800b579', 'de..."
2,206647,Spectre,"[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{'credit_id': '54805967c3a36829b5002c41', 'de..."
3,49026,The Dark Knight Rises,"[{'cast_id': 2, 'character': 'Bruce Wayne / Ba...","[{'credit_id': '52fe4781c3a36847f81398c3', 'de..."
4,49529,John Carter,"[{'cast_id': 5, 'character': 'John Carter', 'c...","[{'credit_id': '52fe479ac3a36847f813eaa3', 'de..."


In [5]:
tmdb_5000_movies.head(5)

Unnamed: 0,budget,genres,id,original_language,original_title,overview,popularity,production_countries,release_date,spoken_languages,vote_average,vote_count
0,237000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",19995,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{'iso_3166_1': 'US', 'name': 'United States o...",2009-12-10,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",7.2,11800
1,300000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",285,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{'iso_3166_1': 'US', 'name': 'United States o...",2007-05-19,"[{'iso_639_1': 'en', 'name': 'English'}]",6.9,4500
2,245000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",206647,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...",2015-10-26,"[{'iso_639_1': 'fr', 'name': 'Français'}, {'is...",6.3,4466
3,250000000,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",49026,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{'iso_3166_1': 'US', 'name': 'United States o...",2012-07-16,"[{'iso_639_1': 'en', 'name': 'English'}]",7.6,9106
4,260000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",49529,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{'iso_3166_1': 'US', 'name': 'United States o...",2012-03-07,"[{'iso_639_1': 'en', 'name': 'English'}]",6.1,2124


In [6]:
def preprocess_dict(column: str, key: str = "name"):
    genres_column = tmdb_5000_movies[column]
    genres_column_processed = []
    for item in genres_column:
        genres_column_processed.append([dic[key] for dic in item])
    return genres_column_processed


In [7]:
tmdb_5000_movies["genres"] = preprocess_dict("genres")
tmdb_5000_movies["production_countries"] = preprocess_dict("production_countries")
tmdb_5000_movies["spoken_languages"] = preprocess_dict("spoken_languages")

In [8]:
tmdb_5000_movies.head(5)

Unnamed: 0,budget,genres,id,original_language,original_title,overview,popularity,production_countries,release_date,spoken_languages,vote_average,vote_count
0,237000000,"[Action, Adventure, Fantasy, Science Fiction]",19995,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[United States of America, United Kingdom]",2009-12-10,"[English, Español]",7.2,11800
1,300000000,"[Adventure, Fantasy, Action]",285,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,[United States of America],2007-05-19,[English],6.9,4500
2,245000000,"[Action, Adventure, Crime]",206647,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[United Kingdom, United States of America]",2015-10-26,"[Français, English, Español, Italiano, Deutsch]",6.3,4466
3,250000000,"[Action, Crime, Drama, Thriller]",49026,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,[United States of America],2012-07-16,[English],7.6,9106
4,260000000,"[Action, Adventure, Science Fiction]",49529,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,[United States of America],2012-03-07,[English],6.1,2124


In [9]:
tmdb_5000_movies = tmdb_5000_movies.merge(tmdb_5000_credits, left_on='id', right_on='movie_id')

In [10]:
print(tmdb_5000_movies.shape)
tmdb_5000_movies = tmdb_5000_movies[tmdb_5000_movies['budget'].apply(lambda x: x > 0)]
tmdb_5000_movies = tmdb_5000_movies[tmdb_5000_movies['genres'].apply(lambda x: len(x) > 0)]
print(tmdb_5000_movies.shape)

(4803, 16)
(3762, 16)


In [11]:
tmdb_5000_movies = tmdb_5000_movies.reset_index(drop=True)

In [12]:
tmdb_5000_movies["cast"] = preprocess_dict("cast", "name")

In [None]:
tmdb_5000_movies.tail()

Unnamed: 0,budget,genres,id,original_language,original_title,overview,popularity,production_countries,release_date,spoken_languages,vote_average,vote_count,movie_id,title,cast,crew,director,soup
3757,13,[horror],157185,en,Tin Can Man,Recently dumped by his girlfirend for another ...,0.332679,[Ireland],2007-01-01,[English],2.0,1,157185,Tin Can Man,"[michaelparle, emmaelizaregan, patricko'donnel...","[{'credit_id': '54c7851b925141679100372a', 'de...",ivankavanagh,michaelparle emmaelizaregan patricko'donnell k...
3758,20000,"[crime, horror, mystery, thriller]",36095,ja,キュア,A wave of gruesome murders is sweeping Tokyo. ...,0.212443,[Japan],1997-11-06,[日本語],7.4,63,36095,Cure,"[kojiyakusho, masatohagiwara, tsuyoshiujiki, a...","[{'credit_id': '52fe45cc9251416c9103eb7b', 'de...",kiyoshikurosawa,kojiyakusho masatohagiwara tsuyoshiujiki annan...
3759,7000,"[sciencefiction, drama, thriller]",14337,en,Primer,Friends/fledgling entrepreneurs invent a devic...,23.307949,[United States of America],2004-10-08,[English],6.9,658,14337,Primer,"[shanecarruth, davidsullivan, caseygooden, ana...","[{'credit_id': '52fe45e79251416c75066791', 'de...",shanecarruth,shanecarruth davidsullivan caseygooden anandup...
3760,220000,"[action, crime, thriller]",9367,es,El Mariachi,El Mariachi just wants to play his guitar and ...,14.269792,"[Mexico, United States of America]",1992-09-04,[Español],6.6,238,9367,El Mariachi,"[carlosgallardo, jaimedehoyos, petermarquardt,...","[{'credit_id': '52fe44eec3a36847f80b280b', 'de...",robertrodriguez,carlosgallardo jaimedehoyos petermarquardt rei...
3761,9000,"[comedy, romance]",72766,en,Newlyweds,A newlywed couple's honeymoon is upended by th...,0.642552,[],2011-12-26,[],5.9,5,72766,Newlyweds,"[edwardburns, kerrybishé, marshadietlein, cait...","[{'credit_id': '52fe487dc3a368484e0fb013', 'de...",edwardburns,edwardburns kerrybishé marshadietlein caitlinf...


In [14]:
tmdb_5000_movies['overview'].head()

0    In the 22nd century, a paraplegic Marine is di...
1    Captain Barbossa, long believed to be dead, ha...
2    A cryptic message from Bond’s past sends him o...
3    Following the death of District Attorney Harve...
4    John Carter is a war-weary, former military ca...
Name: overview, dtype: object

In [40]:
titles = tmdb_5000_movies['original_title']
titles = pd.DataFrame(titles)
titles.head(50)

Unnamed: 0,original_title
0,Avatar
1,Pirates of the Caribbean: At World's End
2,Spectre
3,The Dark Knight Rises
4,John Carter
5,Spider-Man 3
6,Tangled
7,Avengers: Age of Ultron
8,Harry Potter and the Half-Blood Prince
9,Batman v Superman: Dawn of Justice


In [16]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [17]:
def get_list(x):
    if isinstance(x, list):
        if len(x) > 5:
            x = x[:5]
        return x

    return []

In [18]:
tmdb_5000_movies['director'] = tmdb_5000_movies['crew'].apply(get_director)

features = ['cast', 'genres']
for feature in features:
    tmdb_5000_movies[feature] = tmdb_5000_movies[feature].apply(get_list)

In [19]:
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [20]:
features = ['cast', 'director', 'genres']

for feature in features:
    tmdb_5000_movies[feature] = tmdb_5000_movies[feature].apply(clean_data)

In [21]:
def create_soup(x):
    return ' '.join(x['cast']) + ' ' + x['director'] + ' '.join(x['genres'])
tmdb_5000_movies['soup'] = tmdb_5000_movies.apply(create_soup, axis=1)

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

tfidf = TfidfVectorizer(stop_words='english')

tfidf_matrix = tfidf.fit_transform(tmdb_5000_movies['soup'])
tfidf_matrix_dense = tfidf_matrix.toarray()
scaler = StandardScaler()
tfidf_matrix_standardized = scaler.fit_transform(tfidf_matrix_dense)

print(tfidf_matrix_standardized.shape)

(3762, 10242)


In [23]:
knn = NearestNeighbors(n_neighbors=15, metric='cosine', algorithm='brute')
knn.fit(tfidf_matrix_standardized)

def get_recommendation(title, k=10):
    idx = tmdb_5000_movies[tmdb_5000_movies['original_title'] == title].index[0]
    distances, indices = knn.kneighbors(tfidf_matrix_standardized[idx].reshape(1, -1), n_neighbors=k+1)
    print(sorted(distances[0]))
    recommendations = []
    for i in range(1, len(indices[0])):
        recommendations.append(tmdb_5000_movies.iloc[indices[0][i]][['original_title', 'genres', 'popularity', 'cast', 'vote_average', 'director', 'overview']])
    
    return recommendations

In [None]:
peli = "Jason Bourne"
recommendations = get_recommendation(peli, k=10)
print(f"Recommendations if you saw '{peli}':")
actores = tmdb_5000_movies.query("original_title == @peli")['cast']
for i, recommendation in enumerate(recommendations):
    recommendations[i]['same_actors'] = len(set(actores.values[0]) & set(recommendation['cast']))
pd.DataFrame(recommendations)

[np.float64(0.0), np.float64(0.2931492520451858), np.float64(0.7274923913951588), np.float64(0.7833629794875951), np.float64(0.803241984371125), np.float64(0.8043289892865506), np.float64(0.8398903077785611), np.float64(0.8423712393571026), np.float64(0.8629239737289832), np.float64(0.878030754986121), np.float64(0.8935013773352429)]
Recommendations if you saw 'Shrek':


Unnamed: 0,original_title,genres,popularity,cast,vote_average,director,overview,same_actors
554,Shrek 2,"[adventure, animation, comedy, family, fantasy]",47.320801,"[mikemyers, eddiemurphy, camerondiaz, julieand...",6.7,andrewadamson,"Shrek, Fiona and Donkey set off to Far, Far Aw...",3
86,Shrek Forever After,"[comedy, adventure, fantasy, animation, family]",44.041186,"[mikemyers, eddiemurphy, camerondiaz, antoniob...",6.0,mikemitchell,A bored and domesticated Shrek pacts with deal...,3
63,"The Chronicles of Narnia: The Lion, the Witch ...","[adventure, family, fantasy]",67.391328,"[williammoseley, annapopplewell, skandarkeynes...",6.7,andrewadamson,"Siblings Lucy, Edmund, Susan and Peter step th...",0
106,Shrek the Third,"[fantasy, adventure, animation, comedy, family]",42.986467,"[mikemyers, eddiemurphy, camerondiaz, julieand...",6.0,chrismiller,The King of Far Far Away has died and Shrek an...,3
15,The Chronicles of Narnia: Prince Caspian,"[adventure, family, fantasy]",53.978602,"[benbarnes, williammoseley, annapopplewell, sk...",6.3,andrewadamson,One year after their incredible adventures in ...,0
218,Jason Bourne,"[action, thriller]",62.641286,"[mattdamon, aliciavikander, tommyleejones, vin...",5.9,paulgreengrass,The most dangerous former operative of the CIA...,1
2467,Black Swan,"[drama, thriller]",91.285683,"[natalieportman, milakunis, vincentcassel, bar...",7.3,darrenaronofsky,"A ballet dancer wins the lead in ""Swan Lake"" a...",1
960,Joan of Arc,"[adventure, drama, action, history, war]",21.084542,"[millajovovich, dustinhoffman, fayedunaway, jo...",6.2,lucbesson,In 1429 a teenage girl from a remote French vi...,1
2079,A Dangerous Method,"[drama, thriller]",27.133277,"[keiraknightley, viggomortensen, michaelfassbe...",6.2,davidcronenberg,Seduced by the challenge of an impossible case...,1
333,Rise of the Planet of the Apes,"[thriller, action, drama, sciencefiction]",138.433168,"[jamesfranco, freidapinto, johnlithgow, brianc...",7.0,rupertwyatt,Scientist Will Rodman is determined to find a ...,1


In [44]:
k = tmdb_5000_movies.query("original_title == @peli")
k

Unnamed: 0,budget,genres,id,original_language,original_title,overview,popularity,production_countries,release_date,spoken_languages,vote_average,vote_count,movie_id,title,cast,crew,director,soup
322,95000000,"[adventure, animation, comedy, family]",10501,en,The Road to El Dorado,"After a failed swindle, two con-men end up wit...",37.054554,[United States of America],2000-03-31,[English],7.0,858,10501,The Road to El Dorado,"[kennethbranagh, kevinkline, rosieperez, arman...","[{'credit_id': '52fe437c9251416c750122e9', 'de...",donmichaelpaul,kennethbranagh kevinkline rosieperez armandass...
