In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from sklearn.externals import joblib



In [3]:
# Read the dataset
df = pd.read_csv('movie_dataset.csv')
df.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,actor_1_name,movie_title,num_voted_users,cast_total_facebook_likes,actor_3_name,facenumber_in_poster,plot_keywords,movie_imdb_link,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,886204,4834,Wes Studi,0.0,avatar|future|marine|native|paraplegic,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,471220,48350,Jack Davenport,0.0,goddess|marriage ceremony|marriage proposal|pi...,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,Christoph Waltz,Spectre,275868,11700,Stephanie Sigman,1.0,bomb|espionage|sequel|spy|terrorist,http://www.imdb.com/title/tt2379713/?ref_=fn_t...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,Tom Hardy,The Dark Knight Rises,1144337,106759,Joseph Gordon-Levitt,0.0,deception|imprisonment|lawlessness|police offi...,http://www.imdb.com/title/tt1345836/?ref_=fn_t...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,Doug Walker,Star Wars: Episode VII - The Force Awakens ...,8,143,,0.0,,http://www.imdb.com/title/tt5289954/?ref_=fn_t...,,,,,,,12.0,7.1,,0


In [4]:
print('The dataset contains {} samples and {} columns'.format(*df.shape))

The dataset contains 5043 samples and 28 columns


In [5]:
df.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

Consider the features 'movie_title', 'genres', 'plot_keywords' for our analysis

In [0]:
df_ = df[['movie_title', 'genres', 'plot_keywords']]

In [7]:
df_.head()

Unnamed: 0,movie_title,genres,plot_keywords
0,Avatar,Action|Adventure|Fantasy|Sci-Fi,avatar|future|marine|native|paraplegic
1,Pirates of the Caribbean: At World's End,Action|Adventure|Fantasy,goddess|marriage ceremony|marriage proposal|pi...
2,Spectre,Action|Adventure|Thriller,bomb|espionage|sequel|spy|terrorist
3,The Dark Knight Rises,Action|Thriller,deception|imprisonment|lawlessness|police offi...
4,Star Wars: Episode VII - The Force Awakens ...,Documentary,


Check for any missing values

In [8]:
df_.isnull().sum()

movie_title        0
genres             0
plot_keywords    153
dtype: int64

Fill the missing values from 'plot_keywords' with an empty space

In [0]:
df_['plot_keywords'].fillna(" " , inplace=True)

In [0]:
df_['index'] = range(0, len(df_))
df_.index = range(0, len(df_))

In [24]:
df_.isnull().sum()

movie_title          0
genres               0
plot_keywords        0
combined_features    0
index                0
dtype: int64

create a function for combining the values of these columns into a single string

In [0]:
def combine_features(row):
  movie_data = row['movie_title'] + row['genres'] + ' ' + row['plot_keywords']

  movie_data_list = [ele for ele in movie_data.split()]
  clean_words = [t for t in movie_data_list if re.sub(r'[^A-Za-z0-9]', ' ', t)]
  clean_movie_data = ' '.join(clean_words)

  # To remove " | " from genres and plot_keywords
  movie_data_list_ = [ele for ele in clean_movie_data.split('|')]
  clean_words_ = [t for t in movie_data_list_ if re.sub(r'[^A-Za-z0-9]', ' ', t)]
  clean_movie_data_ = ' '.join(clean_words_)
  return clean_movie_data_


In [0]:
df_['combined_features'] = df_.apply(combine_features,axis=1)

In [14]:
df_['combined_features'].head()

0    Avatar Action Adventure Fantasy Sci-Fi avatar ...
1    Pirates of the Caribbean: At World's End Actio...
2    Spectre Action Adventure Thriller bomb espiona...
3    The Dark Knight Rises Action Thriller deceptio...
4    Star Wars: Episode VII - The Force Awakens Doc...
Name: combined_features, dtype: object

Creating Bag of words using CountVectorizer

In [0]:
cv = CountVectorizer()
count_matrix = cv.fit_transform(df_['combined_features'])

Finding the similarity scores using cosine similarity

In [0]:
cosine_sim = cosine_similarity(count_matrix)

Save the model containing the Cosine similarity scores

In [25]:
joblib.dump(cosine_sim, 'Movie_Cosine_Scores.pkl')

['Movie_Cosine_Scores.pkl']

Functions to get movie title from movie index and vice-versa

In [0]:
def get_title_from_index(index):
    find_title = df_.loc[df_['index'] == index, 'movie_title']
    return find_title[index].replace(u'\xa0','')

def get_genres_from_index(index):
    find_genres = df_.loc[df_['index'] == index, 'genres']
    return find_genres[index].replace(u'\xa0','')

def get_index_from_title(title):
    movie = title + '\xa0'
    find_index = df_.loc[df_['movie_title'] == movie, 'index']
    return find_index.index[0]

Find out the movies similar to the movie 'Avatar'

In [0]:
movie_user_likes = 'Avatar'
movie_index = get_index_from_title(movie_user_likes)
similar_movies = list(enumerate(cosine_sim[movie_index])) 
#accessing the row corresponding to given movie to find all the similarity scores for that movie and then enumerating over it

Sort the similar movies in descending order

In [0]:
sorted_similar_movies = sorted(similar_movies,key=lambda x:x[1],reverse=True)[1:]

Output top 5 similar movies

In [23]:
i=0
print('Top 5 similar movies to ' + movie_user_likes + '[Genre:' + str(df_['genres'][movie_index]) + ']' + ' are:\n')
for element in sorted_similar_movies:
    print(get_title_from_index(element[0]) + '........ [Genres: ' + get_genres_from_index(element[0]) + ']')
    i=i+1
    if i>5:
        break

Top 5 similar movies to Avatar[Genre:Action|Adventure|Fantasy|Sci-Fi] are:

Destiny........ [Genres: Action|Adventure|Fantasy|Sci-Fi]
Waterworld........ [Genres: Action|Adventure|Sci-Fi|Thriller]
Serenity........ [Genres: Action|Adventure|Sci-Fi|Thriller]
Zathura: A Space Adventure........ [Genres: Action|Adventure|Comedy|Family|Fantasy|Sci-Fi]
Terminator Salvation........ [Genres: Action|Adventure|Sci-Fi]
The Last Airbender........ [Genres: Action|Adventure|Family|Fantasy]
