### Import libraries

In [None]:
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.neighbors import NearestNeighbors

### Load & Preprocess data

In [457]:
tmdb_5000_credits = pd.read_csv("datasets/movies-2/tmdb_5000_credits.csv")
tmdb_5000_movies = pd.read_csv("datasets/movies-2/tmdb_5000_movies.csv")

In [458]:
columns_movies = ["budget", "genres", "id", "original_language", "original_title", "overview", "popularity", "production_countries", "release_date", "spoken_languages", "vote_average", "vote_count"]
columns_credits = ["movie_id", "title", "cast", "crew"]

tmdb_5000_movies = tmdb_5000_movies[columns_movies]
tmdb_5000_credits = tmdb_5000_credits[columns_credits]

for column in ["genres", "production_countries", "spoken_languages"]:
    tmdb_5000_movies[column] = tmdb_5000_movies[column].apply(literal_eval)

for column in ["crew", "cast"]:
    tmdb_5000_credits[column] = tmdb_5000_credits[column].apply(literal_eval)    

In [459]:
tmdb_5000_credits.head(5)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{'cast_id': 242, 'character': 'Jake Sully', '...","[{'credit_id': '52fe48009251416c750aca23', 'de..."
1,285,Pirates of the Caribbean: At World's End,"[{'cast_id': 4, 'character': 'Captain Jack Spa...","[{'credit_id': '52fe4232c3a36847f800b579', 'de..."
2,206647,Spectre,"[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{'credit_id': '54805967c3a36829b5002c41', 'de..."
3,49026,The Dark Knight Rises,"[{'cast_id': 2, 'character': 'Bruce Wayne / Ba...","[{'credit_id': '52fe4781c3a36847f81398c3', 'de..."
4,49529,John Carter,"[{'cast_id': 5, 'character': 'John Carter', 'c...","[{'credit_id': '52fe479ac3a36847f813eaa3', 'de..."


In [460]:
tmdb_5000_movies.head(5)

Unnamed: 0,budget,genres,id,original_language,original_title,overview,popularity,production_countries,release_date,spoken_languages,vote_average,vote_count
0,237000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",19995,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{'iso_3166_1': 'US', 'name': 'United States o...",2009-12-10,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",7.2,11800
1,300000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",285,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{'iso_3166_1': 'US', 'name': 'United States o...",2007-05-19,"[{'iso_639_1': 'en', 'name': 'English'}]",6.9,4500
2,245000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",206647,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...",2015-10-26,"[{'iso_639_1': 'fr', 'name': 'Français'}, {'is...",6.3,4466
3,250000000,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",49026,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{'iso_3166_1': 'US', 'name': 'United States o...",2012-07-16,"[{'iso_639_1': 'en', 'name': 'English'}]",7.6,9106
4,260000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",49529,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{'iso_3166_1': 'US', 'name': 'United States o...",2012-03-07,"[{'iso_639_1': 'en', 'name': 'English'}]",6.1,2124


In [461]:
def preprocess_dict(column: str, key: str = "name"):
    genres_column = tmdb_5000_movies[column]
    genres_column_processed = []
    for item in genres_column:
        genres_column_processed.append([dic[key] for dic in item])
    return genres_column_processed


In [462]:
tmdb_5000_movies["genres"] = preprocess_dict("genres")
tmdb_5000_movies["production_countries"] = preprocess_dict("production_countries")
tmdb_5000_movies["spoken_languages"] = preprocess_dict("spoken_languages")

In [463]:
tmdb_5000_movies.head(5)

Unnamed: 0,budget,genres,id,original_language,original_title,overview,popularity,production_countries,release_date,spoken_languages,vote_average,vote_count
0,237000000,"[Action, Adventure, Fantasy, Science Fiction]",19995,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[United States of America, United Kingdom]",2009-12-10,"[English, Español]",7.2,11800
1,300000000,"[Adventure, Fantasy, Action]",285,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,[United States of America],2007-05-19,[English],6.9,4500
2,245000000,"[Action, Adventure, Crime]",206647,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[United Kingdom, United States of America]",2015-10-26,"[Français, English, Español, Italiano, Deutsch]",6.3,4466
3,250000000,"[Action, Crime, Drama, Thriller]",49026,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,[United States of America],2012-07-16,[English],7.6,9106
4,260000000,"[Action, Adventure, Science Fiction]",49529,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,[United States of America],2012-03-07,[English],6.1,2124


In [464]:
tmdb_5000_movies = tmdb_5000_movies.merge(tmdb_5000_credits, left_on='id', right_on='movie_id')

In [465]:
print(tmdb_5000_movies.shape)
tmdb_5000_movies = tmdb_5000_movies[tmdb_5000_movies['budget'].apply(lambda x: x > 0)]
tmdb_5000_movies = tmdb_5000_movies[tmdb_5000_movies['genres'].apply(lambda x: len(x) > 0)]
print(tmdb_5000_movies.shape)

(4803, 16)
(3762, 16)


In [466]:
tmdb_5000_movies = tmdb_5000_movies.reset_index(drop=True)

In [467]:
tmdb_5000_movies["cast"] = preprocess_dict("cast", "name")

In [468]:
tmdb_5000_movies.head()

Unnamed: 0,budget,genres,id,original_language,original_title,overview,popularity,production_countries,release_date,spoken_languages,vote_average,vote_count,movie_id,title,cast,crew
0,237000000,"[Action, Adventure, Fantasy, Science Fiction]",19995,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[United States of America, United Kingdom]",2009-12-10,"[English, Español]",7.2,11800,19995,Avatar,"[Sam Worthington, Zoe Saldana, Sigourney Weave...","[{'credit_id': '52fe48009251416c750aca23', 'de..."
1,300000000,"[Adventure, Fantasy, Action]",285,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,[United States of America],2007-05-19,[English],6.9,4500,285,Pirates of the Caribbean: At World's End,"[Johnny Depp, Orlando Bloom, Keira Knightley, ...","[{'credit_id': '52fe4232c3a36847f800b579', 'de..."
2,245000000,"[Action, Adventure, Crime]",206647,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[United Kingdom, United States of America]",2015-10-26,"[Français, English, Español, Italiano, Deutsch]",6.3,4466,206647,Spectre,"[Daniel Craig, Christoph Waltz, Léa Seydoux, R...","[{'credit_id': '54805967c3a36829b5002c41', 'de..."
3,250000000,"[Action, Crime, Drama, Thriller]",49026,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,[United States of America],2012-07-16,[English],7.6,9106,49026,The Dark Knight Rises,"[Christian Bale, Michael Caine, Gary Oldman, A...","[{'credit_id': '52fe4781c3a36847f81398c3', 'de..."
4,260000000,"[Action, Adventure, Science Fiction]",49529,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,[United States of America],2012-03-07,[English],6.1,2124,49529,John Carter,"[Taylor Kitsch, Lynn Collins, Samantha Morton,...","[{'credit_id': '52fe479ac3a36847f813eaa3', 'de..."


In [469]:
tmdb_5000_movies['overview'].head()

0    In the 22nd century, a paraplegic Marine is di...
1    Captain Barbossa, long believed to be dead, ha...
2    A cryptic message from Bond’s past sends him o...
3    Following the death of District Attorney Harve...
4    John Carter is a war-weary, former military ca...
Name: overview, dtype: object

In [None]:
titles = tmdb_5000_movies['original_title']

20                 The Amazing Spider-Man
21                             Robin Hood
22    The Hobbit: The Desolation of Smaug
23                     The Golden Compass
24                              King Kong
25                                Titanic
26             Captain America: Civil War
27                             Battleship
28                         Jurassic World
29                                Skyfall
30                           Spider-Man 2
31                             Iron Man 3
32                    Alice in Wonderland
33                  X-Men: The Last Stand
34                    Monsters University
35    Transformers: Revenge of the Fallen
36        Transformers: Age of Extinction
37             Oz: The Great and Powerful
38               The Amazing Spider-Man 2
39                           TRON: Legacy
Name: original_title, dtype: object


In [471]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [472]:
def get_list(x):
    if isinstance(x, list):
        if len(x) > 5:
            x = x[:5]
        return x

    return []

In [473]:
tmdb_5000_movies['director'] = tmdb_5000_movies['crew'].apply(get_director)

features = ['cast', 'genres']
for feature in features:
    tmdb_5000_movies[feature] = tmdb_5000_movies[feature].apply(get_list)

In [474]:
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [475]:
features = ['cast', 'director', 'genres']

for feature in features:
    tmdb_5000_movies[feature] = tmdb_5000_movies[feature].apply(clean_data)

In [478]:
def create_soup(x):
    return ' '.join(x['cast']) + ' ' + x['director'] + ' '.join(x['genres'])
tmdb_5000_movies['soup'] = tmdb_5000_movies.apply(create_soup, axis=1)

In [479]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

tfidf = TfidfVectorizer(stop_words='english')

tfidf_matrix = tfidf.fit_transform(tmdb_5000_movies['soup'])
tfidf_matrix_dense = tfidf_matrix.toarray()
scaler = StandardScaler()
tfidf_matrix_standardized = scaler.fit_transform(tfidf_matrix_dense)

print(tfidf_matrix_standardized.shape)

(3762, 10242)


In [480]:
knn = NearestNeighbors(n_neighbors=15, metric='cosine', algorithm='brute')
knn.fit(tfidf_matrix_standardized)

def get_recommendation(title, k=10):
    idx = tmdb_5000_movies[tmdb_5000_movies['original_title'] == title].index[0]
    distances, indices = knn.kneighbors(tfidf_matrix_standardized[idx].reshape(1, -1), n_neighbors=k+1)
    print(sorted(distances[0]))
    recommendations = []
    for i in range(1, len(indices[0])):
        recommendations.append(tmdb_5000_movies.iloc[indices[0][i]][['original_title', 'genres', 'popularity', 'cast', 'vote_average', 'director', 'overview']])
    
    return recommendations

In [513]:
peli = "Back to the Future"
recommendations = get_recommendation(peli, k=10)
print(f"Recommendations if you saw '{peli}':")
actores = tmdb_5000_movies.query("original_title == @peli")['cast']
for i, recommendation in enumerate(recommendations):
    recommendations[i]['same_actors'] = len(set(actores.values[0]) & set(recommendation['cast']))
pd.DataFrame(recommendations)

[np.float64(0.0), np.float64(0.09793959918198814), np.float64(0.6181006363380529), np.float64(0.7847530491331365), np.float64(0.8105847452779072), np.float64(0.8137424569100443), np.float64(0.8614063179007245), np.float64(0.8707759631487063), np.float64(0.8763739033340799), np.float64(0.8829060270402789), np.float64(0.9028804339192884)]
Recommendations if you saw 'Back to the Future':


Unnamed: 0,original_title,genres,popularity,cast,vote_average,director,overview,same_actors
1129,Back to the Future Part III,"[adventure, comedy, family, sciencefiction]",45.769562,"[michaelj.fox, christopherlloyd, marysteenburg...",7.1,robertzemeckis,"[the, final, installment, of, the, back, to, t...",4
1116,Back to the Future Part II,"[adventure, comedy, family, sciencefiction]",43.345252,"[michaelj.fox, christopherlloyd, leathompson, ...",7.4,robertzemeckis,"[marty, and, doc, are, at, it, again, in, this...",3
2933,Action Jackson,"[action, adventure, comedy, crime, drama]",6.872784,"[carlweathers, craigt.nelson, vanity, sharonst...",4.9,craigr.baxley,"[vengence, drives, a, tough, detroit, cop, to,...",1
151,Beowulf,"[adventure, action, animation]",35.601665,"[raywinstone, angelinajolie, anthonyhopkins, r...",5.5,robertzemeckis,"[6thcentury, scandinavian, warrior, beowulf, e...",0
3110,April Fool's Day,"[horror, mystery]",8.190888,"[deborahforeman, thomasf.wilson, lloydberry, t...",5.8,fredwalton,"[a, group, of, eight, college, friends, gather...",1
267,Stuart Little,"[animation, fantasy, family, comedy]",30.475297,"[michaelj.fox, geenadavis, hughlaurie, jonatha...",5.8,robminkoff,"[the, adventures, of, a, heroic, and, debonair...",1
668,The American President,"[comedy, drama, romance]",11.056763,"[michaeldouglas, annettebening, michaelj.fox, ...",6.5,robreiner,"[widowed, us, president, andrew, shepherd, one...",1
32,Alice in Wonderland,"[family, fantasy, adventure]",78.530105,"[miawasikowska, johnnydepp, annehathaway, hele...",6.4,timburton,"[alice, an, unpretentious, and, individual, 19...",1
1545,9,"[action, adventure, animation, sciencefiction,...",55.806402,"[christopherplummer, martinlandau, johnc.reill...",6.6,shaneacker,"[when, 9, first, comes, to, life, he, finds, h...",1
735,My Favorite Martian,"[comedy, drama, family, sciencefiction]",6.80692,"[jeffdaniels, elizabethhurley, darylhannah, ch...",5.1,donaldpetrie,"[news, producer, tim, ohara, gets, himself, fi...",1


In [511]:
k = tmdb_5000_movies.query("original_title == @peli")
k

Unnamed: 0,budget,genres,id,original_language,original_title,overview,popularity,production_countries,release_date,spoken_languages,vote_average,vote_count,movie_id,title,cast,crew,director,soup
32,200000000,"[family, fantasy, adventure]",12155,en,Alice in Wonderland,"[alice, an, unpretentious, and, individual, 19...",78.530105,[United States of America],2010-03-03,[English],6.4,4645,12155,Alice in Wonderland,"[miawasikowska, johnnydepp, annehathaway, hele...","[{'credit_id': '52fe44c09251416c7503fbc3', 'de...",timburton,miawasikowska johnnydepp annehathaway helenabo...
