In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import ast
import pickle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
movies = pd.read_csv('data/tmdb_5000_movies.csv')
credits = pd.read_csv('data/tmdb_5000_credits.csv')

# Join on the numeric movie id instead of on 'title'. Titles are not unique
# (two different films can share one), and a title join cross-matches such
# films, producing extra rows that pair one film's overview/genres with the
# other film's cast/crew.
# NOTE(review): assumes the movies file stores the id in a column named `id`
# and the credits file in `movie_id` (standard TMDB 5000 layout) - confirm.
# `validate` makes the merge fail loudly if either side has duplicate ids.
movies = movies.merge(
    credits.drop(columns='title'),  # keep a single, unsuffixed 'title' column
    left_on='id',
    right_on='movie_id',
    validate='one_to_one',
)
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]
# Drop rows with any missing field, then renumber the index so that index
# labels equal row positions; later cells index a plain NumPy similarity
# matrix by row position.
movies = movies.dropna().reset_index(drop=True)


In [3]:
def _entry_names(raw, keep=None, limit=None):
    """Return the 'name' of each entry in a stringified list of dicts.

    raw   -- string holding a Python-literal list of dicts.
    keep  -- optional predicate; only entries for which it is true are used.
    limit -- optional cap; only the first `limit` entries are considered.
    """
    entries = ast.literal_eval(raw)
    if limit is not None:
        # zip against range() stops after `limit` entries for any iterable.
        entries = [entry for _, entry in zip(range(limit), entries)]
    return [entry['name'] for entry in entries if keep is None or keep(entry)]


def _is_director(entry):
    """True when the crew entry's job is 'director' (case-insensitive)."""
    return entry['job'].lower() == 'director'


# Replace each raw string column with a plain list of names.
movies['genres'] = movies['genres'].apply(lambda raw: _entry_names(raw))
movies['keywords'] = movies['keywords'].apply(lambda raw: _entry_names(raw))
# Cast: only the first three listed entries.
movies['cast'] = movies['cast'].apply(lambda raw: _entry_names(raw, limit=3))
# Crew: only the director(s).
movies['crew'] = movies['crew'].apply(lambda raw: _entry_names(raw, keep=_is_director))

# Overview becomes a list of whitespace-separated words so it can be
# concatenated with the other list columns.
movies['overview'] = movies['overview'].apply(lambda text: text.split())

In [4]:
# Remove the spaces inside every cast, crew and genre name
# (presumably so a multi-word name stays one token later on - confirm).
for column in ('cast', 'crew', 'genres'):
    movies[column] = movies[column].apply(
        lambda names: [name.replace(" ", "") for name in names]
    )

In [5]:
# Concatenate the per-movie word lists, in this fixed order, into one list.
source_columns = ['overview', 'genres', 'keywords', 'cast', 'crew']
combined = movies[source_columns[0]]
for column in source_columns[1:]:
    combined = combined + movies[column]
movies['tags'] = combined

# `new` keeps the remaining columns plus 'tags', with the tag list joined
# into a single space-separated string.
new = movies.drop(columns=source_columns)
new['tags'] = new['tags'].apply(" ".join)

In [6]:
# Preview the first five rows of the slimmed-down frame.
new.head(n=5)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [7]:
import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
# English stop words used by `my_preprocessor`. On a machine where the NLTK
# 'stopwords' corpus has never been downloaded, the lookup raises LookupError;
# fetch it once and retry so the notebook survives Restart & Run All.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    import nltk
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

# Single shared stemmer instance, reused for every document.
porter_stemmer = PorterStemmer()

def my_preprocessor(text):
    """Normalise one document before it is tokenised.

    Steps, in order: lower-case the text, replace every non-word character
    with a space, split on whitespace, drop tokens found in `stop_words`,
    stem the remaining tokens with `porter_stemmer`, and join them back
    into a single space-separated string.
    """
    cleaned = re.sub("\\W", " ", text.lower())  # special chars -> spaces
    stems = []
    for token in cleaned.split():
        if token in stop_words:
            continue  # skip stop words
        stems.append(porter_stemmer.stem(word=token))
    return ' '.join(stems)

In [9]:
# Bag-of-words counts over the tag strings, capped at 5000 features, with
# `my_preprocessor` applied to each document first.
cvectorizer = CountVectorizer(preprocessor=my_preprocessor, max_features=5000)

tag_counts = cvectorizer.fit_transform(new['tags'])
vector = tag_counts.toarray()  # dense array, one row per movie

In [10]:
# Pairwise cosine similarity between every pair of rows of `vector`.
similarity = cosine_similarity(X=vector)

In [11]:
# Store the matrix as half-precision floats (smaller in memory and on disk).
similarity = similarity.astype(np.float16)

In [12]:
def recommend(movie, top_n=5):
    """Print the titles of the movies most similar to ``movie``.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``new['title']``. If several rows share
        the title, the first one is used.
    top_n : int, default 5
        How many recommendations to print.

    Raises
    ------
    IndexError
        If no row of ``new`` has the given title.
    """
    # Use the row *position*, not the index label: `similarity` is a plain
    # array and the titles are read back with `.iloc`, so a label is only
    # correct while the index happens to be 0..n-1 without gaps.
    matches = np.flatnonzero((new['title'] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError(f"no movie titled {movie!r} found")
    position = int(matches[0])

    scores = np.asarray(similarity[position], dtype=np.float32)
    # Stable sort on the negated scores = descending order, ties kept in
    # row order.
    ranked = np.argsort(-scores, kind='stable')
    # Exclude the query explicitly instead of assuming it sorts first; another
    # row can tie with it once scores are rounded to half precision.
    ranked = ranked[ranked != position][:top_n]

    for row in ranked:
        print(new['title'].iloc[row])

In [14]:
# Example query: print the closest matches for one title.
recommend(movie='The Dark Knight Rises')

The Dark Knight
Batman Begins
Batman
Batman
Batman Returns


In [15]:
# Persist the similarity matrix. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
with open('similarity.pkl', 'wb') as pickle_file:
    pickle.dump(similarity, pickle_file)

In [16]:
# Export only the id/title columns, without the index column.
new.to_csv("movie_info.csv", columns=['movie_id', 'title'], index=False)