# Content-based Recommendation System using Natural language processing

## Step 1 A : Import packages and load dataset

In [24]:
from rake_nltk import Rake
import nltk

import pandas as pd
import numpy as np
import matplotlib.pyplot as plot

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

import warnings
warnings.filterwarnings("ignore")

## Helper Functions 
#### We will use them to look up a movie's index from its title, and its title from its index, in the dataframe

In [25]:
def get_title_from_index(index):
    """Return the 'Title' of the row whose index label equals `index`.

    Reads the module-level dataframe `df`. Raises IndexError when no row
    carries that index label, because the selection is then empty.
    """
    title_matches = df.loc[df.index == index, 'Title']
    return title_matches.values[0]

def get_index_from_title(Title):
    """Return the 'index' column value of the first row whose 'Title' equals `Title`.

    Reads the module-level dataframe `df`, which must already have an 'index'
    column. Raises IndexError when no row has that title.
    """
    index_matches = df.loc[df['Title'] == Title, 'index']
    return index_matches.values[0]

In [26]:
# Load the movie dataset from the remote CSV into a dataframe
df = pd.read_csv(
    filepath_or_buffer='https://query.data.world/s/uikepcpffyo2nhig52xxeevdialfl7'
)
# Preview the first five rows
df.head(5)

Unnamed: 0.1,Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,...,tomatoConsensus,tomatoUserMeter,tomatoUserRating,tomatoUserReviews,tomatoURL,DVD,BoxOffice,Production,Website,Response
0,1,The Shawshank Redemption,1994,R,14 Oct 1994,142 min,"Crime, Drama",Frank Darabont,"Stephen King (short story ""Rita Hayworth and S...","Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",...,,,,,http://www.rottentomatoes.com/m/shawshank_rede...,27 Jan 1998,,Columbia Pictures,,True
1,2,The Godfather,1972,R,24 Mar 1972,175 min,"Crime, Drama",Francis Ford Coppola,"Mario Puzo (screenplay), Francis Ford Coppola ...","Marlon Brando, Al Pacino, James Caan, Richard ...",...,,,,,http://www.rottentomatoes.com/m/godfather/,09 Oct 2001,,Paramount Pictures,http://www.thegodfather.com,True
2,3,The Godfather: Part II,1974,R,20 Dec 1974,202 min,"Crime, Drama",Francis Ford Coppola,"Francis Ford Coppola (screenplay), Mario Puzo ...","Al Pacino, Robert Duvall, Diane Keaton, Robert...",...,,,,,http://www.rottentomatoes.com/m/godfather_part...,24 May 2005,,Paramount Pictures,http://www.thegodfather.com/,True
3,4,The Dark Knight,2008,PG-13,18 Jul 2008,152 min,"Action, Crime, Drama",Christopher Nolan,"Jonathan Nolan (screenplay), Christopher Nolan...","Christian Bale, Heath Ledger, Aaron Eckhart, M...",...,,,,,http://www.rottentomatoes.com/m/the_dark_knight/,09 Dec 2008,"$533,316,061",Warner Bros. Pictures/Legendary,http://thedarkknight.warnerbros.com/,True
4,5,12 Angry Men,1957,APPROVED,01 Apr 1957,96 min,"Crime, Drama",Sidney Lumet,"Reginald Rose (story), Reginald Rose (screenplay)","Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",...,,,,,http://www.rottentomatoes.com/m/1000013-12_ang...,06 Mar 2001,,Criterion Collection,http://www.criterion.com/films/27871-12-angry-men,True


In [27]:
# Show every column name as a plain Python list
print(list(df.columns))

['Unnamed: 0', 'Title', 'Year', 'Rated', 'Released', 'Runtime', 'Genre', 'Director', 'Writer', 'Actors', 'Plot', 'Language', 'Country', 'Awards', 'Poster', 'Ratings.Source', 'Ratings.Value', 'Metascore', 'imdbRating', 'imdbVotes', 'imdbID', 'Type', 'tomatoMeter', 'tomatoImage', 'tomatoRating', 'tomatoReviews', 'tomatoFresh', 'tomatoRotten', 'tomatoConsensus', 'tomatoUserMeter', 'tomatoUserRating', 'tomatoUserReviews', 'tomatoURL', 'DVD', 'BoxOffice', 'Production', 'Website', 'Response']


In [32]:
# df.rename(columns={"A": "a", "B": "c"})

#df.rename(columns={'Unnamed: 0' : 'indices'})

## Step 1 B : Select important Features, redefine the df, and check for missing values. 

In [33]:
# Columns kept for building the content-based recommender
features = [
    'Title',
    'Genre',
    'Director',
    'Actors',
    'Plot',
]

In [34]:
# Keep only the selected feature columns. The explicit .copy() makes `df` an
# independent frame, so the column assignments in later cells modify it
# directly instead of triggering SettingWithCopyWarning on a selection of the
# full dataset.
df = df[features].copy()
df.head()

Unnamed: 0,Title,Genre,Director,Actors,Plot
0,The Shawshank Redemption,"Crime, Drama",Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",Two imprisoned men bond over a number of years...
1,The Godfather,"Crime, Drama",Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Richard ...",The aging patriarch of an organized crime dyna...
2,The Godfather: Part II,"Crime, Drama",Francis Ford Coppola,"Al Pacino, Robert Duvall, Diane Keaton, Robert...",The early life and career of Vito Corleone in ...
3,The Dark Knight,"Action, Crime, Drama",Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",When the menace known as the Joker emerges fro...
4,12 Angry Men,"Crime, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",A jury holdout attempts to prevent a miscarria...


In [35]:
# Column dtypes and non-null counts
df.info()
# Number of missing values per column
print('Missing Values: ', df.isna().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Title     250 non-null    object
 1   Genre     250 non-null    object
 2   Director  250 non-null    object
 3   Actors    250 non-null    object
 4   Plot      250 non-null    object
dtypes: object(5)
memory usage: 9.9+ KB
Missing Values:  Title       0
Genre       0
Director    0
Actors      0
Plot        0
dtype: int64


In [36]:
# Summary statistics, transposed so that each feature is one row
df.describe().transpose()

Unnamed: 0,count,unique,top,freq
Title,250,250,Unforgiven,1
Genre,250,110,Drama,19
Director,250,155,Alfred Hitchcock,9
Actors,250,248,"Mark Hamill, Harrison Ford, Carrie Fisher, Bil...",2
Plot,250,250,A college professor's bond with the abandoned ...,1


In [37]:
# Rows whose Genre is exactly 'Drama' (no other genre listed)
df[df['Genre'] == 'Drama']

Unnamed: 0,Title,Genre,Director,Actors,Plot
8,Fight Club,Drama,David Fincher,"Edward Norton, Brad Pitt, Meat Loaf, Zach Grenier","An insomniac office worker, looking for a way ..."
14,One Flew Over the Cuckoo's Nest,Drama,Milos Forman,"Michael Berryman, Peter Brocco, Dean R. Brooks...",A criminal pleads insanity after getting into ...
63,Requiem for a Dream,Drama,Darren Aronofsky,"Ellen Burstyn, Jared Leto, Jennifer Connelly, ...",The drug-induced utopias of four Coney Island ...
81,All About Eve,Drama,Joseph L. Mankiewicz,"Bette Davis, Anne Baxter, George Sanders, Cele...",An ingenue insinuates herself into the company...
85,Good Will Hunting,Drama,Gus Van Sant,"Matt Damon, Ben Affleck, Stellan Skarsgård, Jo...","Will Hunting, a janitor at M.I.T., has a gift ..."
105,Room,Drama,Lenny Abrahamson,"Brie Larson, Jacob Tremblay, Sean Bridgers, We...",A young boy is raised within the confines of a...
121,Trainspotting,Drama,Danny Boyle,"Ewan McGregor, Ewen Bremner, Jonny Lee Miller,...","Renton, deeply immersed in the Edinburgh drug ..."
124,Gran Torino,Drama,Clint Eastwood,"Clint Eastwood, Christopher Carley, Bee Vang, ...",Disgruntled Korean War veteran Walt Kowalski s...
139,Network,Drama,Sidney Lumet,"Faye Dunaway, William Holden, Peter Finch, Rob...",A television network cynically exploits a dera...
170,"Paris, Texas",Drama,Wim Wenders,"Harry Dean Stanton, Sam Berry, Bernhard Wicki,...","Travis Henderson, an aimless drifter who has b..."


In [38]:
# How often each distinct Genre string occurs
df.Genre.value_counts()

Drama                           19
Crime, Drama                    14
Drama, War                       9
Animation, Adventure, Comedy     8
Crime, Drama, Thriller           8
                                ..
Drama, History, Romance          1
Action, Adventure, Drama         1
Crime, Film-Noir, Mystery        1
Horror, Mystery, Thriller        1
Drama, Mystery, Sci-Fi           1
Name: Genre, Length: 110, dtype: int64

## Step 2: Data Pre-processing

First, the data is pre-processed using NLP to obtain a single column that contains all the attributes (as words) for each row. This column is then vectorized, so that a score is allocated to each word. Finally, the cosine similarity metric is used to calculate the similarity between the movies.

In [46]:
# Remove all punctuation from the 'Plot' series: drop every character that is
# neither a word character nor whitespace. regex=True is passed explicitly
# because pandas >= 2.0 treats the pattern as a literal string by default,
# which would leave the text unchanged. The raw string avoids invalid-escape
# warnings for \w and \s.
df['Plot'] = df['Plot'].str.replace(r'[^\w\s]', '', regex=True)

# an alternative way to do so is:
# import re, string
# df['Plot'] = df['Plot'].str.replace('[{}]'.format(re.escape(string.punctuation)), '', regex=True)
print(df.Plot.head())
df['Plot'][249]

0    Two imprisoned men bond over a number of years...
1    The aging patriarch of an organized crime dyna...
2    The early life and career of Vito Corleone in ...
3    When the menace known as the Joker emerges fro...
4    A jury holdout attempts to prevent a miscarria...
Name: Plot, dtype: object


'A Mumbai teen reflects on his upbringing in the slums when he is accused of cheating on the Indian Version of Who Wants to be a Millionaire'

In [47]:
# Extract keywords from each Plot into a list stored in a new 'Keywords' column.

# use Rake to remove stop words (based on english stop words from nltk)
r = Rake()

keyword_lists = []
for plot in df['Plot']:
    r.extract_keywords_from_text(plot)             # extract keywords from Plot, default in lower case
    keywords_dict = r.get_word_degrees()           # get dictionary with keywords and their scores
    keyword_lists.append(list(keywords_dict.keys()))

# Assign the whole column in one step. Writing into the `row` yielded by
# df.iterrows() is not guaranteed to reach the dataframe (the row may be a
# copy), so the per-row assignment could silently leave the column empty.
df['Keywords'] = pd.Series(keyword_lists, index=df.index, dtype=object)

print(df.Keywords.head())
print(df['Keywords'][249])

0    [common, decency, acts, eventual, redemption, ...
1    [clandestine, empire, aging, patriarch, reluct...
2    [career, 1920s, new, york, family, crime, synd...
3    [wreaks, havoc, ability, chaos, joker, emerges...
4    [jury, holdout, attempts, forcing, reconsider,...
Name: Keywords, dtype: object
['cheating', 'indian', 'version', 'accused', 'upbringing', 'slums', 'mumbai', 'teen', 'reflects', 'millionaire', 'wants']


In [48]:
# Inspect the variable left over from the loop in the previous cell: the
# word -> degree dictionary Rake produced for the last plot that was processed.
keywords_dict

defaultdict(<function rake_nltk.rake.Rake._build_word_co_occurance_graph.<locals>.<lambda>()>,
            {'cheating': 1,
             'indian': 2,
             'version': 2,
             'accused': 1,
             'upbringing': 1,
             'slums': 1,
             'mumbai': 3,
             'teen': 3,
             'reflects': 3,
             'millionaire': 1,
             'wants': 1})

NEXT STEP - The names of actors and directors are converted into unique identity values. This is done by merging each person's first and last names into one word, so that Chris Evans and Chris Hemsworth appear as different tokens.
The recommender should detect a similarity only if the person associated with different movies is exactly the same. Every word is also converted to lowercase to avoid duplicates.

In [49]:
# Turn the comma-separated Genre, Actors and Director strings into lists of
# identity tokens: each entry is lower-cased and its spaces are removed, so a
# first and last name merge into a single word.
def _to_identity_tokens(text):
    """Split comma-separated `text`; lower-case each part and strip its spaces."""
    return [part.lower().replace(' ', '') for part in text.split(',')]

# Assign each column as a whole. Writing into the `row` yielded by
# df.iterrows() is not guaranteed to update the dataframe (the row may be a
# copy), so the transformation is applied with .map instead.
for col in ['Genre', 'Actors', 'Director']:
    df[col] = df[col].map(_to_identity_tokens)

df.head()

Unnamed: 0,Title,Genre,Director,Actors,Plot,Keywords
0,The Shawshank Redemption,"[crime, drama]",[frankdarabont],"[timrobbins, morganfreeman, bobgunton, william...",Two imprisoned men bond over a number of years...,"[common, decency, acts, eventual, redemption, ..."
1,The Godfather,"[crime, drama]",[francisfordcoppola],"[marlonbrando, alpacino, jamescaan, richards.c...",The aging patriarch of an organized crime dyna...,"[clandestine, empire, aging, patriarch, reluct..."
2,The Godfather: Part II,"[crime, drama]",[francisfordcoppola],"[alpacino, robertduvall, dianekeaton, robertde...",The early life and career of Vito Corleone in ...,"[career, 1920s, new, york, family, crime, synd..."
3,The Dark Knight,"[action, crime, drama]",[christophernolan],"[christianbale, heathledger, aaroneckhart, mic...",When the menace known as the Joker emerges fro...,"[wreaks, havoc, ability, chaos, joker, emerges..."
4,12 Angry Men,"[crime, drama]",[sidneylumet],"[martinbalsam, johnfiedler, leej.cobb, e.g.mar...",A jury holdout attempts to prevent a miscarria...,"[jury, holdout, attempts, forcing, reconsider,..."


## Step 3: To create word representations by combining column attributes to Bag_of_words

In [50]:
# Combine the 4 list columns into one space-separated string per movie,
# stored under 'Bag_of_words'.
columns = ['Genre', 'Actors', 'Director', 'Keywords']

# Build the whole column in one assignment. Writing into the `row` yielded by
# df.iterrows() is not guaranteed to reach the dataframe (the row may be a
# copy), which would leave 'Bag_of_words' empty.
df['Bag_of_words'] = [
    ' '.join(token for tokens in row_lists for token in tokens)
    for row_lists in zip(*(df[col] for col in columns))
]

# Keep only what the recommender needs; .copy() makes the result an
# independent frame so later column assignments do not act on a selection.
df = df[['Title', 'Bag_of_words']].copy()
df

Unnamed: 0,Title,Bag_of_words
0,The Shawshank Redemption,crime drama timrobbins morganfreeman bobgunton...
1,The Godfather,crime drama marlonbrando alpacino jamescaan ri...
2,The Godfather: Part II,crime drama alpacino robertduvall dianekeaton ...
3,The Dark Knight,action crime drama christianbale heathledger a...
4,12 Angry Men,crime drama martinbalsam johnfiedler leej.cobb...
...,...,...
245,The Lost Weekend,drama film-noir raymilland janewyman phillipte...
246,Short Term 12,drama brielarson johngallagherjr. stephaniebea...
247,His Girl Friday,comedy drama romance carygrant rosalindrussell...
248,The Straight Story,biography drama sissyspacek janegallowayheitz ...


In [51]:
# Example: the Bag_of_words string of the row labelled 249
df.loc[249, 'Bag_of_words']

'drama devpatel saurabhshukla anilkapoor rajzutshi dannyboyle loveleentandan cheating indian version accused upbringing slums mumbai teen reflects millionaire wants'

## Step 4: To create a vector representation for Bag_of_words and the similarity matrix

Recommendation algorithms can only read and compare a vector or matrix with another, so we need to convert the Bag_of_words column into vector representation using CountVectorizer, which is a simple frequency counter for each word in the column. 
On this frequency matrix of each word, we can apply the cosine similarity function to compute similarities amongst movies.

In [52]:
# Word-count vectorizer: one row per movie, one column per distinct token
cv = CountVectorizer()
count_matrix = cv.fit_transform(df.Bag_of_words)

# Pairwise cosine similarity between all movies (square matrix)
cosine_sim = cosine_similarity(count_matrix)

# Show the sparse count matrix, a blank separator, then the similarity matrix
print(count_matrix)
print('\n')
print(cosine_sim)

  (0, 616)	1
  (0, 817)	1
  (0, 2914)	1
  (0, 1991)	1
  (0, 323)	1
  (0, 3156)	1
  (0, 1084)	1
  (0, 549)	1
  (0, 701)	1
  (0, 58)	1
  (0, 951)	1
  (0, 2376)	1
  (0, 2086)	1
  (0, 3206)	1
  (0, 1036)	1
  (0, 2709)	1
  (0, 3003)	1
  (0, 1364)	1
  (0, 1905)	1
  (0, 328)	1
  (1, 616)	2
  (1, 817)	1
  (1, 1838)	1
  (1, 122)	1
  (1, 1453)	1
  :	:
  (248, 421)	1
  (248, 798)	1
  (248, 1757)	1
  (248, 1356)	1
  (248, 1687)	1
  (248, 2959)	1
  (248, 1907)	1
  (249, 817)	1
  (249, 2858)	1
  (249, 49)	1
  (249, 659)	1
  (249, 1957)	1
  (249, 1374)	1
  (249, 749)	1
  (249, 2557)	1
  (249, 153)	1
  (249, 2341)	1
  (249, 1774)	1
  (249, 473)	1
  (249, 3064)	1
  (249, 3040)	1
  (249, 2697)	1
  (249, 2008)	1
  (249, 2378)	1
  (249, 3110)	1


[[1.         0.14638501 0.1315587  ... 0.05129892 0.05129892 0.05270463]
 [0.14638501 1.         0.34236839 ... 0.05006262 0.05006262 0.05143445]
 [0.1315587  0.34236839 1.         ... 0.04499213 0.04499213 0.04622502]
 ...
 [0.05129892 0.05006262 0.04499213 ... 

In [53]:
# Add an 'index' column holding each row's index label, used by
# get_index_from_title. It is assigned as a whole column because writing into
# the `row` yielded by df.iterrows() is not guaranteed to update the dataframe,
# which would leave the column filled with empty strings.
df['index'] = df.index

In [54]:
def predict(movie_user_likes, top_n=15):
    """Print the titles of the movies most similar to `movie_user_likes`.

    Looks up the movie's row in the module-level `cosine_sim` matrix, ranks
    every movie by similarity score in descending order and prints the first
    `top_n` titles, one per line. The queried movie is part of the ranking
    and is not filtered out.

    Parameters
    ----------
    movie_user_likes : title of the movie to find recommendations for; it is
        resolved with get_index_from_title.
    top_n : maximum number of titles to print (default 15, the value that was
        previously hard-coded). Values below zero are treated as zero.
    """
    movie_index = get_index_from_title(movie_user_likes)
    # Pair every movie position with its similarity score to the chosen movie
    scored_movies = enumerate(cosine_sim[movie_index])
    # Stable sort, so movies with equal scores keep their original order
    ranked_movies = sorted(scored_movies, key=lambda pair: pair[1], reverse=True)
    for position, _score in ranked_movies[:max(top_n, 0)]:
        print(get_title_from_index(position))
            
# movie_user_likes = 'The Dark Knight'

In [55]:
# Recommendations for a sample title
predict(movie_user_likes='Inception')

Inception
Mad Max: Fury Road
Interstellar
The Dark Knight Rises
Aliens
Guardians of the Galaxy
Guardians of the Galaxy Vol. 2
Spider-Man: Homecoming
The Prestige
The Matrix
Batman Begins
The Revenant
Jurassic Park
Terminator 2: Judgment Day
2001: A Space Odyssey


In [56]:
# Recommendations for a second sample title
predict(movie_user_likes='Jurassic Park')

Jurassic Park
Interstellar
Aliens
Guardians of the Galaxy
Guardians of the Galaxy Vol. 2
Blade Runner
Inception
Terminator 2: Judgment Day
2001: A Space Odyssey
The Martian
Donnie Darko
Twelve Monkeys
Mad Max: Fury Road
Back to the Future
Jaws
