In [1]:
import numpy as np
import pandas as pd

In [2]:
movies=pd.read_csv('tmdb_5000_movies.csv')
credits=pd.read_csv('tmdb_5000_credits.csv')

In [3]:
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [4]:
movies=movies.merge(credits,on='title')

In [5]:
movies.sample(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
1099,45000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.bangkokdangerousmovie.net/,13184,"[{""id"": 782, ""name"": ""assassin""}, {""id"": 1293,...",en,Bangkok Dangerous,"When carrying out a hit, assassin Joe (Cage) a...",14.813057,"[{""name"": ""Virtual Studios"", ""id"": 449}, {""nam...",...,99.0,"[{""iso_639_1"": ""th"", ""name"": ""\u0e20\u0e32\u0e...",Released,There's only one way out.,Bangkok Dangerous,5.0,332,13184,"[{""cast_id"": 2, ""character"": ""Joe"", ""credit_id...","[{""credit_id"": ""52fe454b9251416c75051965"", ""de..."


In [6]:
#list of columns to keep inorder to create tags for ContentBased Recommender System
#not budget,homepage,original_language(highly imbalanced),original_title(similar to title)
#not popularity as its a numeric column similarly the release date,revenue,runtime,vote_avg,vote_id
#not prod_company or countries,spoken_languages,status,tagline(vague)
#def genre,id,keywords,title,overview,cast,crew

In [7]:
movies=movies[['movie_id','title','overview','genres','keywords','cast','crew']]

In [8]:
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [9]:
movies.dropna(inplace=True)

In [10]:
movies.duplicated().sum()

0

In [11]:
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [12]:
import ast
def convert(obj):
    L=[]
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

In [13]:
def convert3(obj):
    L=[]
    counter=0
    for i in ast.literal_eval(obj):
        if counter!=3:
            L.append(i['name'])
            counter+=1
        else:
            break
    return L

In [14]:
movies['genres']=movies['genres'].apply(convert)

In [15]:
movies['keywords']=movies['keywords'].apply(convert)

In [16]:
movies['cast']=movies['cast'].apply(convert3)

In [18]:
def fetch_director(obj):
    L=[]
    for i in ast.literal_eval(obj):
        if i['job']=='Director':
            L.append(i['name'])
            break
    return L

In [19]:
movies['crew']=movies['crew'].apply(fetch_director)

In [21]:
#converting overview which is a string into list
movies['overview']=movies['overview'].apply(lambda x:x.split())

In [22]:
#to ensure that bcoz of spaces tags wont consider things to be two seperate words
#lets remove spaces inbw for eg (Science Fiction) or (Sam Worthington )
movies['genres']=movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])
movies['keywords']=movies['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
movies['cast']=movies['cast'].apply(lambda x:[i.replace(" ","") for i in x])
movies['crew']=movies['crew'].apply(lambda x:[i.replace(" ","") for i in x])

In [24]:
#creating tags column which is concatenation of above 4 columns
movies['tags']=movies['overview']+movies['genres']+movies['cast']+movies['crew']

In [25]:
new_df=movies[['movie_id','title','tags']]

In [26]:
new_df['tags']=new_df['tags'].apply(lambda x:' '.join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(lambda x:' '.join(x))


In [30]:
new_df['tags']=new_df['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(lambda x:x.lower())


In [44]:
import nltk

In [45]:
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

In [46]:
def stem(text):
    y=[]
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [47]:
new_df['tags']=new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(stem)


In [48]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=5000,stop_words='english')

In [55]:
vectors=cv.fit_transform(new_df['tags']).toarray()

In [56]:
#to check those 5000 words
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      dtype=object)

In [57]:
#but the above has similar instances of words but considered different because of tenses
#hence using stemming before vectorization

In [58]:
from sklearn.metrics.pairwise import cosine_similarity

In [70]:
similarity=cosine_similarity(vectors)

In [74]:
def recommend(movie):
    movie_index=new_df[new_df['title']==movie].index[0]
    distances=similarity[movie_index]
    movies_list=sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:6]
    
    for i in movies_list:
        print(new_df.iloc[i[0]].title)

In [75]:
recommend('Avatar')

Mad Max Beyond Thunderdome
The Helix... Loaded
Jupiter Ascending
Krull
The Book of Life
