In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

In [2]:
#Loading our data
movies=pd.read_csv('dataset/tmdb_5000_movies.csv')
credits=pd.read_csv('dataset/tmdb_5000_credits.csv')

In [3]:
#mergeing the data of both the movies and credits dataset on title
movies=movies.merge(credits,on='title')

In [4]:
#exracting the columns which are important for the recommendation
movies=movies[['movie_id','title','overview','genres','keywords','cast','crew']]

In [5]:
#checking for the null cells in our dataset
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [6]:
#removing null values
movies.dropna(inplace=True)

In [7]:
#check duplicate values
movies.duplicated().sum()

np.int64(0)

In [8]:
#checking for generes column
movies.iloc[0]['genres']

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [9]:
#defining a function to just keep genres name in genres column
import ast
def convert(text):
    l=[]
    for i in ast.literal_eval(text): #using ast.literal_eval() to convert text to list.
        l.append(i['name'])
    return l

In [10]:
#applying our created fuction to seprate every movies genres within the cell and storing it in the data on same place
movies['genres']=movies['genres'].apply(convert)

In [11]:
#fixing our keywords column with same technique
movies['keywords']=movies['keywords'].apply(convert)

In [12]:
#defining a function to fetch only three cast from every movie
def convert_cast(text):
    l=[]
    count=0
    for i in ast.literal_eval(text): #using ast.literal_eval() to convert text to list.
        if count<3:
            l.append(i['name'])
        count+=1
    return l

In [13]:
movies['cast']=movies['cast'].apply(convert_cast)

In [14]:
#defining a function to take out the director name from the crew column
def fetch_director(text):
    l=[]
    for i in ast.literal_eval(text): #using ast.literal_eval() to convert text to list.
        if i['job']=='Director':
            l.append(i['name'])
            break
    return l


In [15]:
#applying our fetch director function
movies['crew']=movies['crew'].apply(fetch_director)


In [16]:
#making overview column as a list 
movies['overview']=movies['overview'].apply(lambda x:x.split())

In [17]:
#creating a function to remove space from the every column of our dataset
def remove_space(text):
    l=[]
    for i in text:
        l.append(i.replace(" ",""))
    return l             

In [18]:
movies['crew']=movies['crew'].apply(remove_space)
movies['cast']=movies['cast'].apply(remove_space)
movies['keywords']=movies['keywords'].apply(remove_space)
movies['genres']=movies['genres'].apply(remove_space)

In [19]:
#creating a new columns tags for concatinating our other columns in it 
movies['tags']=movies['overview']+movies['genres']+movies['keywords']+movies['cast']+movies['crew']

In [20]:
movies.head(2)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski],"[Captain, Barbossa,, long, believed, to, be, d..."


In [21]:
#creating a new data frame in which only imprtant columns are present
newer_df=movies[['movie_id','title','tags']]

In [22]:
#converting tags column into paragragh with all cell values
newer_df['tags']=newer_df['tags'].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newer_df['tags']=newer_df['tags'].apply(lambda x: " ".join(x))


In [25]:
#creating all the elements into lower case
newer_df['tags']=newer_df['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newer_df['tags']=newer_df['tags'].apply(lambda x:x.lower())


In [26]:
newer_df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


In [27]:
#creating a function to make unique and generalize word for every repeating word with same meaning
#example:love,loved,loving these all means same in recommedation thing 
import nltk
from nltk.stem import PorterStemmer
ps=PorterStemmer()
def stems(text):
    l=[]
    for i in text.split():
        l.append(ps.stem(i))
    return " ".join(l)   
        


In [28]:
#applying the function to tag column
newer_df['tags']=newer_df['tags'].apply(stems)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newer_df['tags']=newer_df['tags'].apply(stems)


In [31]:
#creating our tags column as a vector 
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=5000,stop_words='english')#stop words is use to get rid of the words such as is,an the (which are not useful in our recommendation)
vector=cv.fit_transform(newer_df['tags']).toarray()

In [32]:
vector.shape

(4806, 5000)

In [33]:
#for matching cosine similarity will use scikitLearn pacakage
from sklearn.metrics.pairwise import cosine_similarity

In [34]:
similar=cosine_similarity(vector)

In [35]:
similar

array([[1.        , 0.08346223, 0.0860309 , ..., 0.04499213, 0.        ,
        0.        ],
       [0.08346223, 1.        , 0.06063391, ..., 0.02378257, 0.        ,
        0.02615329],
       [0.0860309 , 0.06063391, 1.        , ..., 0.02451452, 0.        ,
        0.        ],
       ...,
       [0.04499213, 0.02378257, 0.02451452, ..., 1.        , 0.03962144,
        0.04229549],
       [0.        , 0.        , 0.        , ..., 0.03962144, 1.        ,
        0.08714204],
       [0.        , 0.02615329, 0.        , ..., 0.04229549, 0.08714204,
        1.        ]])

In [36]:
newer_df[newer_df['title'] == 'Spider-Man'].index[0]


np.int64(159)

In [37]:
#creating a function 
def recommend(movie):
    index=newer_df[newer_df['title'] == movie].index[0]
    distances=sorted(list(enumerate(similar[index])),reverse=True,key=lambda x:x[1])
    for i in distances[1:6]:
        print(newer_df.iloc[i[0]].title)


In [39]:
recommend('Avengers: Age of Ultron')

Iron Man 3
Iron Man 2
Iron Man
Thor
The Avengers


In [40]:
import pickle
pickle.dump(newer_df,open('artifacts/movie.list.pkl','wb'))
pickle.dump(similar,open('artifacts/similarity.list.pkl','wb'))