# Data preprocessing

In [26]:
import pandas as pd
import numpy as np

In [27]:
# Dowlnoad data - from kaggle - https://www.kaggle.com/datasets/tmdb/tmdb-movie-metadata
movies_df = pd.read_csv('tmdb_5000_movies.csv')
credits_df = pd.read_csv('tmdb_5000_credits.csv')

In [28]:
movies_df.shape

(4803, 20)

In [29]:
credits_df.shape

(4803, 4)

In [30]:
# merge both csv's
new_movies_df = movies_df.merge(credits_df, on='title')

In [31]:
new_movies_df.shape

(4809, 23)

In [32]:
new_movies_df.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [33]:
# remove columns that are not textual and are not as importannt for comparison- use your intution for columns to keep .
columns_to_keep = ['id', 'title', 'genres', 'keywords', 'overview', 'cast', 'crew']
new_movies_df = new_movies_df[columns_to_keep]

In [34]:
new_movies_df.head(1)

Unnamed: 0,id,title,genres,keywords,overview,cast,crew
0,19995,Avatar,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [35]:
# check for null values. as null values are only 3 and not a big number, drop them instead of imputing them.
#Check for duplicates and drop them as they would be same movies.
new_movies_df.isnull().sum()

id          0
title       0
genres      0
keywords    0
overview    3
cast        0
crew        0
dtype: int64

In [36]:
new_movies_df.dropna(inplace=True)

new_movies_df = new_movies_df.drop_duplicates(subset=['title', 'crew'], keep='first')

In [37]:
import ast
# import json

##### Textual data in geners, keywords, cast and crew are list which are strings. So need to convert them into a list first and then process them accordingly. 
##### Use either json.loads or ast.literal.eval to convert string into list and then write a function to extract name from the list

In [38]:
# type(json.loads(new_movies_df['genres'][0]))
type((new_movies_df['genres'][0]))

str

In [39]:
def clean(series):
    '''
    function to convert string and 
    extract name from the list
    '''
    list = []
    for i in ast.literal_eval(series):
        list.append(i['name'])
    return list        

In [40]:
new_movies_df['genres'] = new_movies_df['genres'].apply(clean)

new_movies_df['keywords'] = new_movies_df['keywords'].apply(clean)

In [42]:
def clean_2(series):
    '''
    decided to take just the 3 top actors from the
    cast for our comparison
    '''
    list = []
    count = 0
    for i in ast.literal_eval(series):
        if count != 3:
            list.append(i['name'])
            count += 1
        else:
            break
    return list

In [43]:
new_movies_df['cast'] = new_movies_df['cast'].apply(clean_2)

In [44]:
new_movies_df['cast'][0]

['Sam Worthington', 'Zoe Saldana', 'Sigourney Weaver']

In [45]:
def find_director(series):
    '''
    decided to take just the director
    from the crew for our comparison.
    '''
    
    list = []
    for i in ast.literal_eval(series):
        if i['job'] == 'Director':
            list.append(i['name'])
            break
    return list

In [46]:
new_movies_df['crew'] = new_movies_df['crew'].apply(find_director)

In [47]:
# overview column has textual data in string, convert that into list
new_movies_df['overview'] = new_movies_df['overview'].apply(lambda x: x.split())

In [48]:
new_movies_df.head()

Unnamed: 0,id,title,genres,keywords,overview,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[In, the, 22nd, century,, a, paraplegic, Marin...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Captain, Barbossa,, long, believed, to, be, d...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[A, cryptic, message, from, Bond’s, past, send...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Following, the, death, of, District, Attorney...","[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,49529,John Carter,"[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[John, Carter, is, a, war-weary,, former, mili...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


In [49]:
new_movies_df.shape

(4800, 7)

##### Removed spaces from the names of actors and directors so when we embed these they can be embbeded as one person. for ex Sam Worthington and Sam Mendes could be distinguished as 2 different people rather than three entities Sam, Worthington and Mendes. Similarly joined words in geners and keywords.

In [50]:
new_movies_df['genres'] = new_movies_df['genres'].apply(lambda x: [i.replace(' ', '') for i in x])
new_movies_df['keywords'] = new_movies_df['keywords'].apply(lambda x: [i.replace(' ', '') for i in x])
new_movies_df['cast'] = new_movies_df['cast'].apply(lambda x: [i.replace(' ', '') for i in x])
new_movies_df['crew'] = new_movies_df['crew'].apply(lambda x: [i.replace(' ', '') for i in x])

In [51]:
new_movies_df.head()

Unnamed: 0,id,title,genres,keywords,overview,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[Captain, Barbossa,, long, believed, to, be, d...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski]
2,206647,Spectre,"[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[A, cryptic, message, from, Bond’s, past, send...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes]
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[Following, the, death, of, District, Attorney...","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan]
4,49529,John Carter,"[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[John, Carter, is, a, war-weary,, former, mili...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton]


In [54]:
# made a new column by concatenation all textual data
new_movies_df['tags'] = new_movies_df['genres'] + new_movies_df['keywords'] + new_movies_df['overview'] + new_movies_df['cast'] + new_movies_df['crew']

In [55]:
new_movies_df.head()

Unnamed: 0,id,title,genres,keywords,overview,cast,crew,tags
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[Action, Adventure, Fantasy, ScienceFiction, c..."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[Captain, Barbossa,, long, believed, to, be, d...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski],"[Adventure, Fantasy, Action, ocean, drugabuse,..."
2,206647,Spectre,"[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[A, cryptic, message, from, Bond’s, past, send...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes],"[Action, Adventure, Crime, spy, basedonnovel, ..."
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[Following, the, death, of, District, Attorney...","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan],"[Action, Crime, Drama, Thriller, dccomics, cri..."
4,49529,John Carter,"[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[John, Carter, is, a, war-weary,, former, mili...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton],"[Action, Adventure, ScienceFiction, basedonnov..."


In [56]:
# deleted the text columns as we only need tags column that has all the information needed
final_df = new_movies_df[['id', 'title', 'tags']]

In [57]:
final_df.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction, c..."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action, ocean, drugabuse,..."
2,206647,Spectre,"[Action, Adventure, Crime, spy, basedonnovel, ..."
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller, dccomics, cri..."
4,49529,John Carter,"[Action, Adventure, ScienceFiction, basedonnov..."


In [58]:
# converted tags column into string and also into lower font.
final_df['tags'] = final_df['tags'].apply(lambda x: ' '.join(x).lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['tags'] = final_df['tags'].apply(lambda x: ' '.join(x).lower())


In [59]:
final_df.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,action adventure fantasy sciencefiction cultur...
1,285,Pirates of the Caribbean: At World's End,adventure fantasy action ocean drugabuse exoti...
2,206647,Spectre,action adventure crime spy basedonnovel secret...
3,49026,The Dark Knight Rises,action crime drama thriller dccomics crimefigh...
4,49529,John Carter,action adventure sciencefiction basedonnovel m...


# Stemming & Vectorisation

In [60]:
import nltk

In [61]:
from nltk.stem.porter import PorterStemmer

In [62]:
ps = PorterStemmer()

In [63]:
def stemming(series):
    list = []
    for i in series.split():
        list.append(ps.stem(i))
    return ' '.join(list)
        

In [64]:
final_df['tags'] = final_df['tags'].apply(stemming)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['tags'] = final_df['tags'].apply(stemming)


In [66]:
# using Bag of Words to vectorise, maximum words to be used from corpous = 5000 and used stop words.
from sklearn.feature_extraction.text import CountVectorizer

In [67]:
cv = CountVectorizer(max_features=5000, stop_words='english')

In [68]:
vectors = cv.fit_transform(final_df['tags']).toarray()

In [69]:
feature_names = cv.get_feature_names_out()
for feature in feature_names:
    print(feature)

000
007
10
100
11
12
13
14
15
16
17
17th
18
18th
18thcenturi
19
1910
1920
1930
1940
1944
1950
1950s
1960
1960s
1970
1970s
1971
1974
1976
1980
1985
1990
1999
19th
19thcenturi
20
200
2003
2009
20th
21st
23
24
25
30
300
3d
40
50
500
60
70
80
aaron
aaroneckhart
abandon
abduct
abigailbreslin
abil
abl
aboard
abov
abus
academ
academi
accept
access
accid
accident
acclaim
accompani
accomplish
account
accus
ace
achiev
acquaint
act
action
actionhero
activ
activist
activities
actor
actress
actual
ad
adam
adamsandl
adamshankman
adapt
add
addict
adjust
admir
admit
adolesc
adopt
ador
adrienbrodi
adult
adultanim
adulteri
adulthood
advanc
adventur
adventure
adventures
advertis
advic
advis
affair
affect
afghanistan
africa
african
africanamerican
aftercreditssting
afterlif
aftermath
ag
age
agediffer
agenc
agency
agenda
agent
agents
aggress
ago
agre
ahead
aid
aidanquinn
ail
aim
air
airplan
airplanecrash
airport
aka
al
alabama
alan
alaska
albert
alcatraz
alcohol
alecbaldwin
alex
alexkendrick
alfredhitchcoc

In [70]:
vectors[0].shape

(5000,)

In [71]:
# used coine similarity to find distances between the vectors
from sklearn.metrics.pairwise import cosine_similarity

In [72]:
similarity = cosine_similarity(vectors)

In [73]:
similarity.shape

(4800, 4800)

In [75]:
# distance of film Avatar to other films in the list. values close to 1 being the most similar and 0 being least similar.
distance = (similarity[final_df[final_df['title'] == 'Avatar'].index[0]])
distance

array([1.        , 0.08346223, 0.0860309 , ..., 0.04499213, 0.        ,
       0.        ])

In [77]:
# need to sort the distance list above to find the 5 most similar films.
# need to retain the original index from the distance list(enumerate will help), then sort. example below. 
# key function helps to sort according to the 2nd value of the enumerate tuple. then slice top 5.

sorted(list(enumerate(similarity[0])), reverse=True, key=lambda x: x[1])[1:6]

[(1214, 0.28676966733820225),
 (2405, 0.26901379342448517),
 (3724, 0.2605130246476754),
 (507, 0.255608593705383),
 (539, 0.25038669783359574)]

In [78]:
new_movies_df.iloc[1214]

id                                                        440
title                             Aliens vs Predator: Requiem
genres      [Fantasy, Action, ScienceFiction, Thriller, Ho...
keywords    [predator, nationalguard, hybrid, alien, morgu...
overview    [A, sequel, to, 2004's, Alien, vs., Predator,,...
cast             [StevenPasquale, ReikoAylesworth, JohnOrtiz]
crew                                           [ColinStrause]
tags        [Fantasy, Action, ScienceFiction, Thriller, Ho...
Name: 1216, dtype: object

In [79]:
def recommend(movie):
    movie_index = final_df[final_df['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6] 
    for i in movies_list:
        print(new_movies_df.iloc[i[0]]['title'])

In [81]:
recommend('Batman Begins')

The Dark Knight
Batman
Batman
The Dark Knight Rises
10th & Wolf


# Import the functions as pickle file for streamlit

In [82]:
import pickle

In [83]:
pickle.dump(final_df.to_dict(), open('movies_dict.pkl', 'wb'))

In [84]:
pickle.dump(similarity, open('similarity.pkl', 'wb'))