In [38]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.feature_extraction.text import TfidfVectorizer

In [39]:
pd.set_option('display.max_columns',None) # display all columns

In [40]:
df = pd.read_csv('movie_metadata.csv')
df.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,actor_1_name,movie_title,num_voted_users,cast_total_facebook_likes,actor_3_name,facenumber_in_poster,plot_keywords,movie_imdb_link,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,886204,4834,Wes Studi,0.0,avatar|future|marine|native|paraplegic,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,471220,48350,Jack Davenport,0.0,goddess|marriage ceremony|marriage proposal|pi...,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,Christoph Waltz,Spectre,275868,11700,Stephanie Sigman,1.0,bomb|espionage|sequel|spy|terrorist,http://www.imdb.com/title/tt2379713/?ref_=fn_t...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,Tom Hardy,The Dark Knight Rises,1144337,106759,Joseph Gordon-Levitt,0.0,deception|imprisonment|lawlessness|police offi...,http://www.imdb.com/title/tt1345836/?ref_=fn_t...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,Doug Walker,Star Wars: Episode VII - The Force Awakens ...,8,143,,0.0,,http://www.imdb.com/title/tt5289954/?ref_=fn_t...,,,,,,,12.0,7.1,,0


In [41]:
df['movie_title']=df['movie_title'].apply(lambda n: n.strip())

In [42]:
print(df.columns.tolist())

['color', 'director_name', 'num_critic_for_reviews', 'duration', 'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name', 'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name', 'movie_title', 'num_voted_users', 'cast_total_facebook_likes', 'actor_3_name', 'facenumber_in_poster', 'plot_keywords', 'movie_imdb_link', 'num_user_for_reviews', 'language', 'country', 'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes', 'imdb_score', 'aspect_ratio', 'movie_facebook_likes']


In [43]:
subset = df.select_dtypes('object').drop(columns=['color','movie_imdb_link','language']).copy()

In [44]:
print(subset.columns.tolist())

['director_name', 'actor_2_name', 'genres', 'actor_1_name', 'movie_title', 'actor_3_name', 'plot_keywords', 'country', 'content_rating']


In [45]:
subset.dropna(inplace=True)

In [46]:
subset['text'] = subset['director_name'] + subset['actor_2_name'] + subset['genres'] + subset['actor_1_name'] + subset['movie_title']+subset['actor_3_name']+subset['plot_keywords']+subset['country'] + subset['content_rating']

In [47]:
subset['text'] # out final data

0       James CameronJoel David MooreAction|Adventure|...
1       Gore VerbinskiOrlando BloomAction|Adventure|Fa...
2       Sam MendesRory KinnearAction|Adventure|Thrille...
3       Christopher NolanChristian BaleAction|Thriller...
5       Andrew StantonSamantha MortonAction|Adventure|...
                              ...                        
5034    Neill Dela LlanaEdgar TancangcoThrillerIan Gam...
5035    Robert RodriguezPeter MarquardtAction|Crime|Dr...
5036    Anthony ValloneJohn ConsidineCrime|DramaRichar...
5037    Edward BurnsCaitlin FitzGeraldComedy|DramaKerr...
5042    Jon GunnBrian HerzlingerDocumentaryJohn August...
Name: text, Length: 4595, dtype: object

In [48]:
vec = TfidfVectorizer(stop_words='english', ngram_range=(1,3))
movie_mat = vec.fit_transform(subset['text']).toarray()
movie_mat.shape

(4595, 141967)

In [49]:
cs = cosine_similarity(movie_mat, movie_mat, dense_output=True)
cs

array([[1.        , 0.00555661, 0.00660392, ..., 0.00172615, 0.        ,
        0.        ],
       [0.00555661, 1.        , 0.00604687, ..., 0.00158054, 0.        ,
        0.        ],
       [0.00660392, 0.00604687, 1.        , ..., 0.00187844, 0.        ,
        0.        ],
       ...,
       [0.00172615, 0.00158054, 0.00187844, ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [52]:
def get_movie_loc(name):
    try:
        return subset[subset['movie_title'] == name ].index[0]
    except Exception as e:
        print(f'Error {name} not found, {e}')
        return None

In [60]:
get_movie_loc('The Dark Knight Rises')

3

In [80]:
def recommend(movie, k=5):
    if subset['movie_title'].str.contains(movie).any():
        try:
            idx = get_movie_loc(movie)
            print(idx)
            if idx != -1:
                sim_scores = list(enumerate(cs[idx]))
                sim_scores.sort(key=lambda i:i[1], reverse=True)
                movie_idxs = [i[0] for i in sim_scores]
                return subset.iloc[movie_idxs]['movie_title'].head(k).tolist()
            else:
                return None
        except Exception as e:
            print("Error+>",e)
            return None
    else:
        print('movie not found')
        return None

In [81]:
recommend('Avatar', k=10)

0


['Avatar',
 'Dragonball: Evolution',
 'X-Men: Days of Future Past',
 'X-Men 2',
 'Underworld: Evolution',
 'Underworld: Rise of the Lycans',
 'Reign of Fire',
 'Beastmaster 2: Through the Portal of Time',
 'Highlander: Endgame',
 'X-Men: The Last Stand']