## Movie Recommendation System

In [46]:
import numpy as np
import pandas as pd

In [47]:
credits_df = pd.read_csv("credits.csv")
movies_df = pd.read_csv("movies.csv")

In [48]:
# For displaying all the rows and columns
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [49]:
# merge both datasets
# we are merging based on the title (think of it as the primary Key) -- that's why the "on"
movies_df = movies_df.merge(credits_df, on="title")

In [50]:
#keep only the columns that we will use
movies_df = movies_df[["movie_id", "title", "overview", "genres", "keywords", "cast", "crew"]]

In [51]:
# check missing values
movies_df.isna().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [52]:
# eliminate the missing values
movies_df.dropna(inplace=True) # or we can also do movies_df = movies_df.dropna()

In [53]:
#Check if there is duplicates
movies_df.duplicated().sum()

0

### Now we will handle the text within a column as follows:

In [54]:
# index the data so we can see the text

movies_df.iloc[0].genres # displaying the 'Genres' text within the first (0 index) row, in this case Avatar Movie

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [55]:
import ast  # library that classifies text data into trees (with a clear writing pattern)

def convert(obj): # define the function
    L=[] # create a new list in the function to later populate it with data
    for i in ast.literal_eval(obj): # loop to find values in object _______
        L.append(i["name"]) # append the list with the value associated with the key "name".
    return L

In [56]:
# Now we apply the function on the column we selected (genres, keywords, )

movies_df["genres"] = movies_df["genres"].apply(convert)
movies_df["keywords"] = movies_df["keywords"].apply(convert)

In [57]:
movies_df.head(3)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."


In [58]:
# now we are going to the 'same' to the rest of variables

def convert3(obj):
    L = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 3: # each row has 3 keys associated with 3 values (actors), so once the counter gets to 3 it breaks and reiterates with the next "i" value
            L.append(i["name"])
            counter += 1
        else:
            break
    return L    

In [59]:
movies_df["cast"] = movies_df["cast"].apply(convert3)

In [60]:
movies_df.head(3)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux]","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."


In [None]:
def fetch_director(obj):
    L = []
    for i in ast.literal_eval(obj):
        if i['job'] == "Director":
            L.append(i["name"])
    return L        

In [64]:
movies_df["crew"] = movies_df["crew"].apply(fetch_director)

In [65]:
movies_df.head(3)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]


#### Dealing with the Overview Text

In [67]:
movies_df["overview"][0]

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.'

In [68]:
movies_df["overview"]= movies_df["overview"].apply(lambda x:x.split()) # to separate the text by commas
movies_df.head(3)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]


In [69]:
# to remove spaces in the value.
# example: Science Fiction to ScienceFiction

movies_df["genres"] = movies_df["genres"].apply(lambda x:[i.replace(" ", "") for i in x]) 
movies_df["keywords"] = movies_df["keywords"].apply(lambda x:[i.replace(" ", "") for i in x])
movies_df["cast"] = movies_df["cast"].apply(lambda x:[i.replace(" ", "") for i in x]) 
movies_df["crew"] = movies_df["crew"].apply(lambda x:[i.replace(" ", "") for i in x])

In [70]:
movies_df.head(3)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes]


#### We want to enter all the data into one variable labeled as "Tags"

In [71]:
movies_df["tags"] = movies_df["overview"] + movies_df["genres"] + movies_df["keywords"] + movies_df["cast"] + movies_df["crew"]
movies_df.head(3)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski],"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes],"[A, cryptic, message, from, Bond’s, past, send..."


### Create a new dataframe

In [72]:
new_df = movies_df[["movie_id", "title", "tags"]]
new_df.head(3)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."


In [73]:
# eliminate the commas
new_df["tags"] = new_df["tags"].apply(lambda x: " ".join(x))
new_df.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["tags"] = new_df["tags"].apply(lambda x: " ".join(x))


Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...


In [74]:
# to check how a whole tag record would look like we filter:
new_df["tags"][0]

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Action Adventure Fantasy ScienceFiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d SamWorthington ZoeSaldana SigourneyWeaver JamesCameron'

In [75]:
# lower the case of the data 
new_df["tags"] = new_df["tags"].apply(lambda X:X.lower())
new_df.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["tags"] = new_df["tags"].apply(lambda X:X.lower())


Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...


## Text Analysis

### Feature Extraction

In [76]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 5000, stop_words = "english")

# count vectorizer: used to transform a given text into a vector on the basis of the frequency count of each word occuring in the entire text

In [77]:
cv.fit_transform(new_df["tags"]).toarray().shape # transforming the text into vectors

(4805, 5000)

In [78]:
vectors = cv.fit_transform(new_df["tags"]).toarray() # storing the vectors in a different variable

In [79]:
vectors[0] # checking

array([0, 0, 0, ..., 0, 0, 0])

In [80]:
len(cv.get_feature_names())



5000

### Natural Language Toolkit

The Natural Language Toolkit (NLTK) is a library in Python that provides tools for working with human language data, such as text. NLTK is a widely used library in the field of natural language processing (NLP) and has a wide range of features.


###### * Tokenization: dividing text into words, sentences, and paragraphs
###### * Stemming and lemmatization: reducing words to their base form
###### * Part-of-speech tagging: identifying the role of each word in a sentence (noun, verb, adjective, etc.)
###### * Chunking and parsing: breaking sentences down into their grammatical components
###### * Named entity recognition: identifying entities such as people, organizations, and locations in text
###### * Sentiment analysis: determining the emotional tone of text
###### * Text classification: categorizing text into predefined classes
###### * Corpus readers: reading and accessing large text corpora

In [81]:
# NLP 

import nltk 
from nltk.stem.porter import PorterStemmer 
ps = PorterStemmer()

In [84]:
# stemming
# Stemming a text in Python means to reduce the words to their base or root form. This is typically done by removing common prefixes or suffixes 
# from the word. For example, the stem of the word "jumping" might be "jump" and the stem of the word "jumps" might also be "jump".

def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [85]:
new_df["tags"] = new_df["tags"].apply(stem) # apply the function to the tag variable

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["tags"] = new_df["tags"].apply(stem) # apply the function to the tag variable


In [87]:
# cosine similarities
# Cosine similarity is a measure of similarity between two non-zero vectors of an inner product space that measures 
# the cosine of the angle between them. 
# In other words, it is a measure of how similar two vectors are, based on the cosine of the angle between them.

from sklearn.metrics.pairwise import cosine_similarity

In [89]:
cosine_similarity(vectors).shape # vector was defined a few lines above

(4805, 4805)

In [90]:
similarity = cosine_similarity(vectors) 

In [92]:
similarity[0].shape

(4805,)

In [93]:
sorted(list(enumerate(similarity[0])), reverse = True, key = lambda x:x[1])[1:6]

[(539, 0.26089696604360174),
 (507, 0.25302403842552984),
 (1194, 0.25226248955475644),
 (1216, 0.2480694691784169),
 (582, 0.24397501823713333)]

###### the code is sorting a list of tuples containing index and similarity scores, in descending order based on the similarity score, and returning a slice of the sorted list starting from index 1 (the second element) and returning the next 5 elements

## Recommendation Function

In [96]:
def recommend(movie):
    # finding the index of the first row in the dataframe where the value of the "title" column is equal to the value of the 
    # "movie" variable and assigning the index to the "movie_index" variable.
    movie_index = new_df[new_df["title"]== movie].index[0]  
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse = True, key=lambda x:x[1])[1:6]
    
    for i in movies_list:
        print(new_df.iloc[i[0]].title)

In [97]:
# Checking the recommendations with the movie Avatar
recommend("Avatar")

Titan A.E.
Independence Day
Small Soldiers
Aliens vs Predator: Requiem
Battle: Los Angeles


In [106]:
# Checking with a random value
movies_df["title"][1018]

'K-PAX'

In [107]:
recommend(movies_df["title"][1018])

Veronika Decides to Die
Planet 51
Escape from Planet Earth
What Planet Are You From?
Gothika


In [108]:
recommend("Iron Man")

Iron Man 2
Iron Man 3
Avengers: Age of Ultron
Captain America: Civil War
The Avengers


In [110]:
recommend("King Kong")

20,000 Leagues Under the Sea
The Black Hole
Supporting Characters
Ice Age: Continental Drift
Serenity


In [111]:
recommend("World War Z")

Megiddo: The Omega Code 2
Teacher's Pet
The Omega Code
Left Behind
Brooklyn Rules
