In [2]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

In [3]:
movies=pd.read_csv("./archive/tmdb_5000_movies.csv")
credits=pd.read_csv("./archive/tmdb_5000_credits.csv")

In [4]:
movies.shape

(4803, 20)

In [5]:
credits.shape

(4803, 4)

Merging both tables

In [6]:
movies=movies.merge(credits,on='title')

In [7]:
movies.shape

(4809, 23)

In [8]:
movies=movies[['movie_id','title','genres','keywords','vote_count','cast','crew']]
movies.shape

(4809, 7)

In [9]:
movies.dropna(inplace=True)

**Data Filteration**

Extract movie genres 

In [10]:
import ast ## converts the string to list 


def extract(text):
    l=[]
    for i in ast.literal_eval(text):
        l.append(i['name'])
    return l


In [11]:
movies['genres']=movies['genres'].apply(extract)
movies['keywords']=movies['keywords'].apply(extract)


In [12]:
def extract_cast(text):
    l=[]
    counter=0
    for i in ast.literal_eval(text):
        if counter<3:
            l.append(i['name'])
        else: break
        counter +=1
    return l

In [13]:
movies['cast']=movies['cast'].apply(extract_cast)

In [14]:
def extract_director(text):
    l=[]
    for i in ast.literal_eval(text):
        if i['job']=='Director':
            l.append(i['name'])
            break
    return l

In [15]:
movies['crew']=movies['crew'].apply(extract_director)

In [16]:
movies.head(2)

Unnamed: 0,movie_id,title,genres,keywords,vote_count,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...",11800,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...",4500,"[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]


Remove Spaces in Words to remove ambiguity

In [17]:
def remove_spaces(word):
    l=[]
    for i in word: 
        l.append(i.replace(" ",""))
    return l

In [18]:
movies['genres']=movies['genres'].apply(remove_spaces)
movies['keywords']=movies['keywords'].apply(remove_spaces)
movies['cast']=movies['cast'].apply(remove_spaces)
movies['crew']=movies['crew'].apply(remove_spaces)

In [19]:
movies.head(2)

Unnamed: 0,movie_id,title,genres,keywords,vote_count,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...",11800,"[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...",4500,"[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski]


Combining all the columns to vectorize

In [20]:
movies['tags']=movies['genres']+movies['keywords']+movies['cast']+movies['crew']

In [21]:
movies['tags'][0]

['Action',
 'Adventure',
 'Fantasy',
 'ScienceFiction',
 'cultureclash',
 'future',
 'spacewar',
 'spacecolony',
 'society',
 'spacetravel',
 'futuristic',
 'romance',
 'space',
 'alien',
 'tribe',
 'alienplanet',
 'cgi',
 'marine',
 'soldier',
 'battle',
 'loveaffair',
 'antiwar',
 'powerrelations',
 'mindandsoul',
 '3d',
 'SamWorthington',
 'ZoeSaldana',
 'SigourneyWeaver',
 'JamesCameron']

In [22]:
filtered_movies_data=movies[['movie_id','title','vote_count','tags']]

Convert Tags in Paragraph

In [23]:
filtered_movies_data = filtered_movies_data.copy()
filtered_movies_data['tags'] = filtered_movies_data['tags'].apply(lambda x: " ".join(x)).apply(lambda x: x.lower())


In [24]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=5000, stop_words='english')

vector=cv.fit_transform(filtered_movies_data['tags']).toarray()

In [25]:
from sklearn.metrics.pairwise import cosine_similarity

In [26]:
similarity=cosine_similarity(vector)
similarity

array([[1.        , 0.12309149, 0.11605177, ..., 0.06085806, 0.        ,
        0.        ],
       [0.12309149, 1.        , 0.12856487, ..., 0.        , 0.        ,
        0.        ],
       [0.11605177, 0.12856487, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.06085806, 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [27]:
title='Spider-Man'


In [28]:
def recommend(movie_title):
    index=filtered_movies_data[filtered_movies_data['title']==movie_title].index[0]
    distance=sorted(list(enumerate(similarity[index])),reverse=True,key=lambda x: x[1]) # sort in descending order 
    result=[]
    for movie in distance[1:6]:
        result.append(filtered_movies_data.iloc[movie[0]].title)
    return result

In [29]:
print(recommend(title))

['Spider-Man 3', 'Spider-Man 2', 'Highlander: Endgame', 'Thor: The Dark World', 'Ghost Rider: Spirit of Vengeance']


In [30]:
import pickle

pickle.dump(filtered_movies_data,open('./artifacts/movie_list.pkl','wb'))
pickle.dump(similarity,open('./artifacts/similarity.pkl','wb'))


In [31]:
filtered_movies_data

Unnamed: 0,movie_id,title,vote_count,tags
0,19995,Avatar,11800,action adventure fantasy sciencefiction cultur...
1,285,Pirates of the Caribbean: At World's End,4500,adventure fantasy action ocean drugabuse exoti...
2,206647,Spectre,4466,action adventure crime spy basedonnovel secret...
3,49026,The Dark Knight Rises,9106,action crime drama thriller dccomics crimefigh...
4,49529,John Carter,2124,action adventure sciencefiction basedonnovel m...
...,...,...,...,...
4804,9367,El Mariachi,238,action crime thriller unitedstates–mexicobarri...
4805,72766,Newlyweds,5,comedy romance edwardburns kerrybishé marshadi...
4806,231617,"Signed, Sealed, Delivered",6,comedy drama romance tvmovie date loveatfirsts...
4807,126186,Shanghai Calling,7,danielhenney elizacoupe billpaxton danielhsia
