In [None]:
# Mount Google Drive inside the Colab runtime at /content/gdrive so that files
# stored on Drive can be read through that path.
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [None]:
!pip install rake-nltk

Collecting rake-nltk
  Downloading https://files.pythonhosted.org/packages/8e/c4/b4ff57e541ac5624ad4b20b89c2bafd4e98f29fd83139f3a81858bdb3815/rake_nltk-1.0.4.tar.gz
Building wheels for collected packages: rake-nltk
  Building wheel for rake-nltk (setup.py) ... [?25l[?25hdone
  Created wheel for rake-nltk: filename=rake_nltk-1.0.4-py2.py3-none-any.whl size=7829 sha256=9f8477c94a9f91332f68c71737c46f898cbff651b4a0608c46892509f513b210
  Stored in directory: /root/.cache/pip/wheels/ef/92/fc/271b3709e71a96ffe934b27818946b795ac6b9b8ff8682483f
Successfully built rake-nltk
Installing collected packages: rake-nltk
Successfully installed rake-nltk-1.0.4


In [None]:
import pandas as pd
from rake_nltk import Rake
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# Let pandas render up to 100 columns so the wide preview below is not truncated.
pd.options.display.max_columns = 100

# Read the movie metadata CSV from the mounted Drive folder and preview the first rows.
df = pd.read_csv(
    '/content/gdrive/My Drive/MovieDataset/IMBD_19kmovies.csv'
)
df.head()

Unnamed: 0,imdb_title_id,Title,year,date_published,Genre,duration,country,language,Director,writer,production_company,Actors,Plot,avg_vote,votes
0,tt0088751,The Naked Monster,2005,4/22/2005,"Comedy, Horror, Sci-Fi",100,USA,English,"Wayne Berwick, Ted Newsom",Ted Newsom,Heidelberg Films,"Kenneth Tobey, Brinke Stevens, R.G. Wilson, Jo...","A brain-dead sheriff, a stolid secret agent an...",5.4,264.0
1,tt0102267,Lifebreath,1997,11/7/1997,"Drama, Thriller, Romance",90,USA,English,P.J. Posner,"Joel Posner, P.J. Posner",Felder Pomus Entertainment,"Luke Perry, Francie Swift, Gia Carides, Gary B...",A man plans a perfect murder to get a lung tra...,5.5,345.0
2,tt0103655,Almost Blue,1996,8/4/1993,"Drama, Thriller",85,USA,English,Keoni Waxman,Keoni Waxman,Postcard Picture,"Michael Madsen, Lynette Walden, Garrett Morris...",The young jazz saxophonist Morris Poole is at ...,5.2,106.0
3,tt0105298,Running Wild,1995,3/3/1995,Drama,94,South Africa,English,Dee McLachlan,"John Varty, Andrea Buck",Londolozi Productions,"John Varty, Elmon Mhlongo, Brooke Shields, Mar...",A filmmaker determines to document the story o...,5.6,304.0
4,tt0106621,Cries of Silence,1996,5/1/1996,Drama,109,USA,English,Avery Crounse,Avery Crounse,Elysian Pictures,"Kathleen York, Karen Black, Ed Nelson, Ellen C...",After a hurricane reeks havoc on Sister Island...,6.2,168.0


In [None]:
# Show the (rows, columns) size of the frame that was just loaded.
df.shape

(19720, 15)

In [None]:
# Keep only the columns the recommender uses. The explicit .copy() makes the result an
# independent frame, so the column assignments in the following cells modify it
# directly instead of a selection of the loaded frame (which pandas reports with
# SettingWithCopyWarning).
df = df[['Title','Genre','Director','Actors','Plot']].copy()
df.head()

Unnamed: 0,Title,Genre,Director,Actors,Plot
0,The Naked Monster,"Comedy, Horror, Sci-Fi","Wayne Berwick, Ted Newsom","Kenneth Tobey, Brinke Stevens, R.G. Wilson, Jo...","A brain-dead sheriff, a stolid secret agent an..."
1,Lifebreath,"Drama, Thriller, Romance",P.J. Posner,"Luke Perry, Francie Swift, Gia Carides, Gary B...",A man plans a perfect murder to get a lung tra...
2,Almost Blue,"Drama, Thriller",Keoni Waxman,"Michael Madsen, Lynette Walden, Garrett Morris...",The young jazz saxophonist Morris Poole is at ...
3,Running Wild,Drama,Dee McLachlan,"John Varty, Elmon Mhlongo, Brooke Shields, Mar...",A filmmaker determines to document the story o...
4,Cries of Silence,Drama,Avery Crounse,"Kathleen York, Karen Black, Ed Nelson, Ellen C...",After a hurricane reeks havoc on Sister Island...


In [None]:
# Show the (rows, columns) size again after narrowing the frame to the five selected columns.
df.shape

(19720, 5)

In [None]:
# Every transformation below goes through Series.map and is assigned back to the column.
# Writing to the rows yielded by df.iterrows() is not guaranteed to reach the DataFrame
# (the rows may be copies), so the merged names could silently be lost.

# Actors: keep only the first three names of the comma-separated cast list and merge each
# full name into one lowercase token ("Luke Perry" -> "lukeperry"), so a person counts as
# a single word and there is no mix up between people sharing a first name.
# NOTE: str(x) turns a missing value into the literal string "nan".
df['Actors'] = df['Actors'].map(
    lambda x: [name.lower().replace(' ', '') for name in str(x).split(',')[:3]]
)

# Genre: lowercase the comma-separated genres and put them in a list of words.
df['Genre'] = df['Genre'].map(lambda x: str(x).lower().split(','))

# Director: drop the spaces and lowercase the whole field, giving one string per movie
# (several directors stay separated by their original commas).
df['Director'] = df['Director'].map(lambda x: str(x).replace(' ', '').lower())

In [None]:
# One RAKE extractor is reused for every plot instead of being rebuilt on each row.
# By default Rake uses the English stopwords from NLTK and discards all punctuation
# characters.
rake = Rake()


def _plot_keywords(plot):
    """Return the list of distinct key words RAKE finds in one plot summary.

    A plot that is not a string (for example a missing value) yields an empty
    list instead of being passed to the extractor.
    """
    if not isinstance(plot, str):
        return []

    # extracting the words by passing the text
    rake.extract_keywords_from_text(plot)

    # get_word_degrees() maps each key word to its score; only the words are kept
    return list(rake.get_word_degrees().keys())


# The new column is built with Series.map and assigned as a whole: writing to the rows
# yielded by df.iterrows() is not guaranteed to reach the DataFrame.
df['Key_words'] = df['Plot'].map(_plot_keywords)

# dropping the Plot column, its content now lives in Key_words
df = df.drop(columns = ['Plot'])

In [None]:
# Use the movie title as the row label from here on, then preview the result.
df = df.set_index('Title')
df.head()

Unnamed: 0_level_0,Genre,Director,Actors,Key_words
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
The Naked Monster,"[comedy, horror, sci-fi]","wayneberwick,tednewsom","[kennethtobey, brinkestevens, r.g.wilson]","[dead, sheriff, sexy, scientist, team, dinosau..."
Lifebreath,"[drama, thriller, romance]",p.j.posner,"[lukeperry, francieswift, giacarides]","[dying, wife, man, plans, get, perfect, murder..."
Almost Blue,"[drama, thriller]",keoniwaxman,"[michaelmadsen, lynettewalden, garrettmorris]","[grief, almost, overcome, career, young, jazz,..."
Running Wild,[drama],deemclachlan,"[johnvarty, elmonmhlongo, brookeshields]","[filmmaker, determines, africa, save, wild, le..."
Cries of Silence,[drama],averycrounse,"[kathleenyork, karenblack, ednelson]","[shore, soon, opens, eyes, hurricane, reeks, h..."


In [None]:
# Build one space-separated string per movie from its genres, director, actors and plot
# key words. Genre, Actors and Key_words hold lists of words and are joined with spaces;
# Director already holds a single string.
#
# The column is created in one assignment: writing to the rows yielded by df.iterrows()
# is not guaranteed to reach the DataFrame. Only the four feature columns are read, so
# the new column itself is never folded into the text.
df['bag_of_words'] = [
    ' '.join([' '.join(genres), director, ' '.join(actors), ' '.join(key_words)])
    for genres, director, actors, key_words
    in zip(df['Genre'], df['Director'], df['Actors'], df['Key_words'])
]

# keeping only the combined text column
df = df[['bag_of_words']]

In [None]:
# Preview the frame, which now holds only the combined bag_of_words text per title.
df.head()

Unnamed: 0_level_0,bag_of_words
Title,Unnamed: 1_level_1
The Naked Monster,"comedy horror sci-fi wayneberwick,tednewsom ..."
Lifebreath,drama thriller romance p.j.posner lukeperry ...
Almost Blue,drama thriller keoniwaxman michaelmadsen lyne...
Running Wild,drama deemclachlan johnvarty elmonmhlongo broo...
Cries of Silence,drama averycrounse kathleenyork karenblack edn...


In [None]:
# Turn every movie's bag of words into a row of token counts.
count = CountVectorizer()
count_matrix = count.fit_transform(raw_documents=df['bag_of_words'])

# Movie titles as a Series with a 0..n-1 numeric index; the position of a title here
# is the position of that movie's row in the count matrix.
indices = pd.Series(df.index)
indices.head(5)

0    The Naked Monster
1           Lifebreath
2          Almost Blue
3         Running Wild
4     Cries of Silence
Name: Title, dtype: object

In [None]:
# Pairwise cosine similarity between all movies; with a single argument the matrix is
# compared against itself, so entry [i, j] is the similarity of movie i and movie j.
cosine_sim = cosine_similarity(count_matrix)
cosine_sim

array([[1.        , 0.04564355, 0.        , ..., 0.03333333, 0.        ,
        0.06558258],
       [0.04564355, 1.        , 0.15638581, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.15638581, 1.        , ..., 0.03806935, 0.03686049,
        0.03745029],
       ...,
       [0.03333333, 0.        , 0.03806935, ..., 1.        , 0.        ,
        0.09837388],
       [0.        , 0.        , 0.03686049, ..., 0.        , 1.        ,
        0.        ],
       [0.06558258, 0.        , 0.03745029, ..., 0.09837388, 0.        ,
        1.        ]])

In [None]:
def recommendations(title, cosine_sim = cosine_sim, top_n = 10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact title to look up in ``indices``. When several movies share the
        title, the first one is used.
    cosine_sim : array-like
        Square similarity matrix whose row ``i`` holds the scores of movie ``i``
        against every movie. The default is the matrix that exists when this
        cell is run (default values are bound at definition time).
    top_n : int
        Number of titles to return (10 by default).

    Returns
    -------
    list
        Up to ``top_n`` titles, ordered from most to least similar. The queried
        movie itself is never part of the result.

    Raises
    ------
    IndexError
        If ``title`` is not present in the dataset.
    """
    # getting the index of the movie that matches the title
    matches = indices[indices == title].index
    if len(matches) == 0:
        raise IndexError(f"no movie titled {title!r} in the dataset")
    idx = matches[0]

    # Similarity of the queried movie to every other movie, best first. The movie's
    # own entry is removed by position rather than assumed to sort first: another
    # movie with an identical bag of words also scores 1.0 and could take that place.
    score_series = pd.Series(cosine_sim[idx]).drop(idx).sort_values(ascending = False)

    # positions of the top_n most similar movies
    top_indexes = score_series.iloc[:top_n].index

    # translating the positions back into titles
    return [df.index[i] for i in top_indexes]

In [None]:
recommendations('Lion of Oz')

['Oz the Great and Powerful',
 'Barbie and the Secret Door',
 'The Steam Engines of Oz',
 'The Trail to Oregon!',
 'Animal Crackers',
 'Enchanted Princess',
 'Barbie Mariposa and the Fairy Princess',
 'Homeward',
 'Dinosaur Island',
 "The Queen's Corgi"]