In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
# pd.set_option('display.max_rows', 1000)

In [2]:
# Load the movie metadata; the relative path is resolved against the current working directory.
df = pd.read_csv('movie_overviews.csv')
# Preview the first five rows.
df.head()

Unnamed: 0,id,title,overview,tagline
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...",
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9099 entries, 0 to 9098
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        9099 non-null   int64 
 1   title     9099 non-null   object
 2   overview  9087 non-null   object
 3   tagline   7033 non-null   object
dtypes: int64(1), object(3)
memory usage: 284.5+ KB


In [4]:
df.isnull().sum()

id             0
title          0
overview      12
tagline     2066
dtype: int64

In [5]:
df[df['tagline'].isnull()]

Unnamed: 0,id,title,overview,tagline
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...",
11,12110,Dracula: Dead and Loving It,When a lawyer shows up at the vampire's doorst...,
27,17015,Persuasion,This film adaptation of Jane Austen's last nov...,
34,687,Dead Man Walking,A justice drama based on a true story about a ...,
35,139405,Across the Sea of Time,"A young Russian boy, Thomas Minton, travels to...",
...,...,...,...,...
9088,401387,Sunspring,Sunspring is a short film about three people l...,
9089,373348,Author: The JT LeRoy Story,New York magazine’s October 2005 issue sent sh...,
9092,314420,Body,A night out turns deadly when three girls brea...,
9094,159550,The Last Brickmaker in America,A man must cope with the loss of his wife and ...,


In [6]:
# Row labels that have a missing value in at least one column.
# NOTE(review): rows that only lack a tagline are flagged as well, so the
# overview-based models built later lose those movies too - confirm this is intended.
has_missing = df.isna().any(axis = 'columns')
index_with_nan = df.index[has_missing]
index_with_nan

Int64Index([   0,   11,   27,   34,   35,   38,   41,   47,   49,   50,
            ...
            9073, 9075, 9083, 9085, 9087, 9088, 9089, 9092, 9094, 9096],
           dtype='int64', length=2066)

In [7]:
# Remove the incomplete rows and renumber the remainder from 0; the original
# row labels are preserved in a new 'index' column because drop is False.
abc = df.drop(index = index_with_nan).reset_index(drop = False)

In [8]:
abc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7033 entries, 0 to 7032
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   index     7033 non-null   int64 
 1   id        7033 non-null   int64 
 2   title     7033 non-null   object
 3   overview  7033 non-null   object
 4   tagline   7033 non-null   object
dtypes: int64(2), object(3)
memory usage: 274.9+ KB


In [9]:
abc.describe()

Unnamed: 0,index,id
count,7033.0,7033.0
mean,4434.291341,36209.068818
std,2636.700256,60599.313941
min,1.0,5.0
25%,2117.0,9062.0
50%,4351.0,13920.0
75%,6733.0,36669.0
max,9098.0,410921.0


In [10]:
abc.isnull().sum()

index       0
id          0
title       0
overview    0
tagline     0
dtype: int64

In [11]:
abc[abc['tagline'].isnull()]

Unnamed: 0,index,id,title,overview,tagline


In [12]:
abc.head()

Unnamed: 0,index,id,title,overview,tagline
0,1,8844,Jumanji,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!
1,2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...
2,3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...
3,4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...
4,5,949,Heat,"Obsessive master thief, Neil McCauley leads a ...",A Los Angeles Crime Saga


In [13]:
abc['title']

0                                                 Jumanji
1                                        Grumpier Old Men
2                                       Waiting to Exhale
3                             Father of the Bride Part II
4                                                    Heat
                              ...                        
7028                        Kingsglaive: Final Fantasy XV
7029                         Sharknado 4: The 4th Awakens
7030                                               Rustom
7031                                        Shin Godzilla
7032    The Beatles: Eight Days a Week - The Touring Y...
Name: title, Length: 7033, dtype: object

# Core: convert text to vectors and compute pairwise similarity

In [14]:
# Bag-of-words counts of every overview, with English stop words removed.
cv = CountVectorizer(stop_words='english')
cv_matrix = cv.fit_transform(abc['overview'])

# Called with a single matrix, both kernels compare it against itself.
# linear_kernel is a plain dot product, so on raw counts its scores are not
# length-normalised; cosine_similarity normalises each row first.
linear_sim = linear_kernel(cv_matrix)
cosine_sim = cosine_similarity(cv_matrix)

In [15]:
# Map each title to its row position in `abc` (and therefore in the similarity matrices).
# Series.drop_duplicates() compares the *values* - here the already-unique row
# positions - so it never removed repeated titles. Filter on the index instead,
# keeping the first occurrence, so that indices[title] always yields one integer.
indices = pd.Series(abc.index, index = abc['title'])
indices = indices[~indices.index.duplicated(keep = 'first')]
indices

title
Jumanji                                                  0
Grumpier Old Men                                         1
Waiting to Exhale                                        2
Father of the Bride Part II                              3
Heat                                                     4
                                                      ... 
Kingsglaive: Final Fantasy XV                         7028
Sharknado 4: The 4th Awakens                          7029
Rustom                                                7030
Shin Godzilla                                         7031
The Beatles: Eight Days a Week - The Touring Years    7032
Length: 7033, dtype: int64

In [16]:
indices['Heat']

4

In [17]:
def get(title, sim, index, n=10, titles=None):
    """Return the titles of the ``n`` movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up in ``index``.
    sim : 2-D array-like
        Pairwise similarity matrix; ``sim[i]`` holds the scores of movie ``i``
        against every movie.
    index : mapping (e.g. pandas.Series)
        Maps a title to its row position in ``sim``.
    n : int, default 10
        Number of recommendations to return.
    titles : pandas.Series, optional
        Titles aligned positionally with the rows of ``sim``. Defaults to the
        notebook-level ``abc['title']``.

    Returns
    -------
    pandas.Series
        Up to ``n`` titles, most similar first; ties keep ascending row order.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``index``.
    """
    if titles is None:
        titles = abc['title']

    idx = index[title]
    # A title that occurs more than once makes a pandas lookup return several
    # positions; fall back to the first one so sim[idx] is a single row.
    if hasattr(idx, 'iloc'):
        idx = idx.iloc[0]
    idx = int(idx)

    ranked = sorted(enumerate(sim[idx]), key=lambda pair: pair[1], reverse=True)
    # Drop the query movie by position rather than assuming it sorts first:
    # with unnormalised scores another movie can outrank the self-match.
    top = [pos for pos, _ in ranked if pos != idx][:max(int(n), 0)]
    return titles.iloc[top]

In [18]:
get('Heat', cosine_sim, indices)

5928                   The Town
6931                    Cop Car
4894        Kiss Kiss Bang Bang
3554                 Blue Steel
6738              Inherent Vice
5221           Perfect Stranger
3506                  Roadgames
6443    Star Trek Into Darkness
2743                  Manhunter
3897                      Radio
Name: title, dtype: object

In [19]:
# Pair every row position with its cosine score against row 4, the position that indices['Heat'] returned above.
scores = list(enumerate(cosine_sim[4]))
# NOTE: echoing the whole list prints one tuple per movie; slice it (e.g. scores[:10]) for a shorter preview.
scores

[(0, 0.08122955416108235),
 (1, 0.0),
 (2, 0.03592106040535498),
 (3, 0.0),
 (4, 0.9999999999999998),
 (5, 0.0),
 (6, 0.0),
 (7, 0.08712136837380642),
 (8, 0.0),
 (9, 0.0),
 (10, 0.0),
 (11, 0.06913011298202835),
 (12, 0.06253053994807226),
 (13, 0.0),
 (14, 0.0),
 (15, 0.0),
 (16, 0.0),
 (17, 0.0),
 (18, 0.0),
 (19, 0.049813548138671795),
 (20, 0.026198125853112306),
 (21, 0.032791291789197645),
 (22, 0.0),
 (23, 0.049813548138671795),
 (24, 0.0),
 (25, 0.0),
 (26, 0.0),
 (27, 0.0),
 (28, 0.0),
 (29, 0.0),
 (30, 0.0),
 (31, 0.0),
 (32, 0.0),
 (33, 0.0),
 (34, 0.0),
 (35, 0.0),
 (36, 0.0),
 (37, 0.0),
 (38, 0.0),
 (39, 0.0),
 (40, 0.0),
 (41, 0.0),
 (42, 0.0),
 (43, 0.0),
 (44, 0.07332355751067667),
 (45, 0.0),
 (46, 0.0),
 (47, 0.056099271345321444),
 (48, 0.0),
 (49, 0.0),
 (50, 0.10369516947304254),
 (51, 0.0),
 (52, 0.0),
 (53, 0.026198125853112306),
 (54, 0.037450294313656915),
 (55, 0.0),
 (56, 0.0),
 (57, 0.04490132550669373),
 (58, 0.0),
 (59, 0.03829197905337418),
 (60, 0.0),


In [20]:
# Order the (position, score) pairs from most to least similar; sorted() is stable, so equal scores keep ascending position order.
scores_sorted = sorted(scores, key = lambda x:x[1], reverse = True)
scores_sorted

[(4, 0.9999999999999998),
 (5928, 0.17960530202677488),
 (6931, 0.1624591083221647),
 (4894, 0.1567723603339241),
 (3554, 0.15554275420956382),
 (6738, 0.14400460822119582),
 (5221, 0.13068205256070964),
 (3506, 0.12361284651454937),
 (6443, 0.1233534278728621),
 (2743, 0.12048289933537483),
 (3897, 0.11973686801784994),
 (2614, 0.1175792702504431),
 (4585, 0.11757927025044308),
 (202, 0.11503946170861017),
 (557, 0.11503946170861017),
 (2158, 0.11487593716012254),
 (4881, 0.11487593716012254),
 (5660, 0.11487593716012254),
 (2246, 0.11359236684941297),
 (2983, 0.11359236684941297),
 (1638, 0.1131407055503551),
 (5227, 0.11235088294097074),
 (3242, 0.11053942207134546),
 (6815, 0.109985336266015),
 (6793, 0.10955820713295535),
 (485, 0.10830607221477648),
 (955, 0.10830607221477648),
 (5383, 0.10830607221477648),
 (3121, 0.10776318121606496),
 (4119, 0.10776318121606496),
 (1050, 0.10439346133241542),
 (50, 0.10369516947304254),
 (747, 0.10369516947304254),
 (3162, 0.10369516947304254)

In [21]:
# Skip the first entry (position 4, the query movie itself in the output above) and keep the next ten.
top_scores = scores_sorted[1:11]
top_scores

[(5928, 0.17960530202677488),
 (6931, 0.1624591083221647),
 (4894, 0.1567723603339241),
 (3554, 0.15554275420956382),
 (6738, 0.14400460822119582),
 (5221, 0.13068205256070964),
 (3506, 0.12361284651454937),
 (6443, 0.1233534278728621),
 (2743, 0.12048289933537483),
 (3897, 0.11973686801784994)]

In [22]:
scores_sorted[:11]

[(4, 0.9999999999999998),
 (5928, 0.17960530202677488),
 (6931, 0.1624591083221647),
 (4894, 0.1567723603339241),
 (3554, 0.15554275420956382),
 (6738, 0.14400460822119582),
 (5221, 0.13068205256070964),
 (3506, 0.12361284651454937),
 (6443, 0.1233534278728621),
 (2743, 0.12048289933537483),
 (3897, 0.11973686801784994)]

In [23]:
# Keep only the row positions, discarding the scores.
asd = [i[0] for i in top_scores]
asd

[5928, 6931, 4894, 3554, 6738, 5221, 3506, 6443, 2743, 3897]

In [24]:
# Translate the row positions back to titles with positional (iloc) indexing.
movie_index = abc['title'].iloc[asd]
movie_index

5928                   The Town
6931                    Cop Car
4894        Kiss Kiss Bang Bang
3554                 Blue Steel
6738              Inherent Vice
5221           Perfect Stranger
3506                  Roadgames
6443    Star Trek Into Darkness
2743                  Manhunter
3897                      Radio
Name: title, dtype: object

In [25]:
def get_recommendation(title, sim, indices, n=10, titles=None):
    """Return the titles of the ``n`` movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up in ``indices``.
    sim : 2-D array-like
        Pairwise similarity matrix; ``sim[i]`` holds the scores of movie ``i``
        against every movie.
    indices : mapping (e.g. pandas.Series)
        Maps a title to its row position in ``sim``.
    n : int, default 10
        Number of recommendations to return.
    titles : pandas.Series, optional
        Titles aligned positionally with the rows of ``sim``. Defaults to the
        notebook-level ``abc['title']``.

    Returns
    -------
    pandas.Series
        Up to ``n`` titles, most similar first; ties keep ascending row order.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    if titles is None:
        titles = abc['title']

    idx = indices[title]
    # A title that occurs more than once makes a pandas lookup return several
    # positions; fall back to the first one so sim[idx] is a single row.
    if hasattr(idx, 'iloc'):
        idx = idx.iloc[0]
    idx = int(idx)

    ranked = sorted(enumerate(sim[idx]), key=lambda pair: pair[1], reverse=True)
    # Drop the query movie by position rather than assuming it sorts first:
    # with unnormalised scores another movie can outrank the self-match.
    movie_index = [pos for pos, _ in ranked if pos != idx][:max(int(n), 0)]
    return titles.iloc[movie_index]

In [26]:
get_recommendation('Heat', cosine_sim, indices)

5928                   The Town
6931                    Cop Car
4894        Kiss Kiss Bang Bang
3554                 Blue Steel
6738              Inherent Vice
5221           Perfect Stranger
3506                  Roadgames
6443    Star Trek Into Darkness
2743                  Manhunter
3897                      Radio
Name: title, dtype: object

In [27]:
get_recommendation('Heat', linear_sim, indices)

5928                    The Town
6352              Wreck-It Ralph
1050    The Hunt for Red October
1607     Another Day in Paradise
1638          The Color of Money
2348               Human Traffic
3203                  Rollerball
3242                   Hopscotch
3445       Bowling for Columbine
5906                    Ip Man 2
Name: title, dtype: object

In [28]:
get_recommendation('Jumanji', cosine_sim, indices)

6352          Wreck-It Ralph
1730                eXistenZ
5008              Stay Alive
4960           Grandma's Boy
6890                  Pixels
4363    The Last Starfighter
5708                   Gamer
5140           Casino Royale
2623      Dungeons & Dragons
6296             Geri's Game
Name: title, dtype: object

In [29]:
get_recommendation('Jumanji', linear_sim, indices)

6352                         Wreck-It Ralph
5008                             Stay Alive
5140                          Casino Royale
7                              Sudden Death
2623                     Dungeons & Dragons
5708                                  Gamer
3140    Porn Star: The Legend of Ron Jeremy
2104                       Any Given Sunday
3242                              Hopscotch
3853                                 Avalon
Name: title, dtype: object

In [30]:
# TF-IDF weights of every overview, with English stop words removed.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(abc['overview'])

# Called with a single matrix, both kernels compare it against itself.
# The two kernels return the same rankings in the cells below.
tfidf_cosine_sim = cosine_similarity(tfidf_matrix)
tfidf_linear_sim = linear_kernel(tfidf_matrix)

In [31]:
# This cell re-declares get_recommendation, which an earlier cell already defines.
def get_recommendation(title, sim, indices, n=10, titles=None):
    """Return the titles of the ``n`` movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up in ``indices``.
    sim : 2-D array-like
        Pairwise similarity matrix; ``sim[i]`` holds the scores of movie ``i``
        against every movie.
    indices : mapping (e.g. pandas.Series)
        Maps a title to its row position in ``sim``.
    n : int, default 10
        Number of recommendations to return.
    titles : pandas.Series, optional
        Titles aligned positionally with the rows of ``sim``. Defaults to the
        notebook-level ``abc['title']``.

    Returns
    -------
    pandas.Series
        Up to ``n`` titles, most similar first; ties keep ascending row order.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    if titles is None:
        titles = abc['title']

    idx = indices[title]
    # A title that occurs more than once makes a pandas lookup return several
    # positions; fall back to the first one so sim[idx] is a single row.
    if hasattr(idx, 'iloc'):
        idx = idx.iloc[0]
    idx = int(idx)

    ranked = sorted(enumerate(sim[idx]), key=lambda pair: pair[1], reverse=True)
    # Drop the query movie by position rather than assuming it sorts first:
    # with unnormalised scores another movie can outrank the self-match.
    movie_index = [pos for pos, _ in ranked if pos != idx][:max(int(n), 0)]
    return titles.iloc[movie_index]

In [32]:
get_recommendation('Heat', tfidf_cosine_sim, indices)

6931                Cop Car
3554             Blue Steel
5928               The Town
4894    Kiss Kiss Bang Bang
1638     The Color of Money
3897                  Radio
5221       Perfect Stranger
5660        The Hurt Locker
3506              Roadgames
4786                Vincent
Name: title, dtype: object

In [33]:
get_recommendation('Heat', tfidf_linear_sim, indices)

6931                Cop Car
3554             Blue Steel
5928               The Town
4894    Kiss Kiss Bang Bang
1638     The Color of Money
3897                  Radio
5221       Perfect Stranger
5660        The Hurt Locker
3506              Roadgames
4786                Vincent
Name: title, dtype: object

In [34]:
get_recommendation('Jumanji', tfidf_cosine_sim, indices)

6352          Wreck-It Ralph
6890                  Pixels
1730                eXistenZ
5008              Stay Alive
5708                   Gamer
4960           Grandma's Boy
4363    The Last Starfighter
2623      Dungeons & Dragons
5140           Casino Royale
6296             Geri's Game
Name: title, dtype: object

In [35]:
get_recommendation('Jumanji', tfidf_linear_sim, indices)

6352          Wreck-It Ralph
6890                  Pixels
1730                eXistenZ
5008              Stay Alive
5708                   Gamer
4960           Grandma's Boy
4363    The Last Starfighter
2623      Dungeons & Dragons
5140           Casino Royale
6296             Geri's Game
Name: title, dtype: object

In [36]:
abc.head()

Unnamed: 0,index,id,title,overview,tagline
0,1,8844,Jumanji,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!
1,2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...
2,3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...
3,4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...
4,5,949,Heat,"Obsessive master thief, Neil McCauley leads a ...",A Los Angeles Crime Saga


In [37]:
df[df['title'] == 'Sleepy Hollow']

Unnamed: 0,id,title,overview,tagline
2474,2668,Sleepy Hollow,New York detective Ichabod Crane is sent to Sl...,Heads will roll.


In [38]:
# TF-IDF weights of every tagline, with English stop words removed.
# NOTE(review): this rebinds tfidf, tfidf_matrix, tfidf_cosine_sim and
# tfidf_linear_sim, which the earlier overview cell also assigns; re-running the
# overview recommendation cells after this one silently uses tagline scores.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(abc['tagline'])

# Called with a single matrix, both kernels compare it against itself.
tfidf_cosine_sim = cosine_similarity(tfidf_matrix)
tfidf_linear_sim = linear_kernel(tfidf_matrix)

In [39]:
# This cell re-declares get_recommendation, which an earlier cell already defines.
def get_recommendation(title, sim, indices, n=10, titles=None):
    """Return the titles of the ``n`` movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up in ``indices``.
    sim : 2-D array-like
        Pairwise similarity matrix; ``sim[i]`` holds the scores of movie ``i``
        against every movie.
    indices : mapping (e.g. pandas.Series)
        Maps a title to its row position in ``sim``.
    n : int, default 10
        Number of recommendations to return.
    titles : pandas.Series, optional
        Titles aligned positionally with the rows of ``sim``. Defaults to the
        notebook-level ``abc['title']``.

    Returns
    -------
    pandas.Series
        Up to ``n`` titles, most similar first; ties keep ascending row order.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    if titles is None:
        titles = abc['title']

    idx = indices[title]
    # A title that occurs more than once makes a pandas lookup return several
    # positions; fall back to the first one so sim[idx] is a single row.
    if hasattr(idx, 'iloc'):
        idx = idx.iloc[0]
    idx = int(idx)

    ranked = sorted(enumerate(sim[idx]), key=lambda pair: pair[1], reverse=True)
    # Drop the query movie by position rather than assuming it sorts first:
    # with unnormalised scores another movie can outrank the self-match.
    movie_index = [pos for pos, _ in ranked if pos != idx][:max(int(n), 0)]
    return titles.iloc[movie_index]

In [40]:
get_recommendation('Heat', tfidf_cosine_sim, indices)

485                                 Mulholland Falls
2402                                 Blazing Saddles
4803    Star Wars: Episode III - Revenge of the Sith
5913                                  Animal Kingdom
1250                          Take the Money and Run
496                                       Last Dance
5563                        A Chinese Ghost Story II
2717                         3000 Miles to Graceland
4357                                Lassie Come Home
798                      Once Upon a Time in America
Name: title, dtype: object

In [41]:
get_recommendation('Heat', tfidf_linear_sim, indices)

485                                 Mulholland Falls
2402                                 Blazing Saddles
4803    Star Wars: Episode III - Revenge of the Sith
5913                                  Animal Kingdom
1250                          Take the Money and Run
496                                       Last Dance
5563                        A Chinese Ghost Story II
2717                         3000 Miles to Graceland
4357                                Lassie Come Home
798                      Once Upon a Time in America
Name: title, dtype: object

In [42]:
get_recommendation('Jumanji', tfidf_cosine_sim, indices)

3763                                 Hulk
2868    Final Fantasy: The Spirits Within
2049                        Sleepy Hollow
4839                            Last Days
4209            The Plague of the Zombies
1629                       Trick or Treat
6437                           Iron Man 3
4772                 Panic in the Streets
968                        Vegas Vacation
2708                      Head Over Heels
Name: title, dtype: object

In [43]:
get_recommendation('Jumanji', tfidf_linear_sim, indices)

3763                                 Hulk
2868    Final Fantasy: The Spirits Within
2049                        Sleepy Hollow
4839                            Last Days
4209            The Plague of the Zombies
1629                       Trick or Treat
6437                           Iron Man 3
4772                 Panic in the Streets
968                        Vegas Vacation
2708                      Head Over Heels
Name: title, dtype: object